📦 Make camelot-py and tabula-py optional dependencies

Moves camelot-py[cv] and tabula-py from the core dependencies to a new
optional "tables" extra (install with: pip install "mcp-pdf[tables]").
Fixes the Python 3.14 install failure caused by pdftopng, which is pulled
in transitively and has no cp314 wheels.

- Lazy-import camelot/tabula in all extraction methods
- method="auto" in table extraction skips camelot/tabula when they are not installed
- pdfplumber (pure Python, always available) handles tables by default
- Also slims get_document_structure response (~12.5k → ~400 tokens)
This commit is contained in:
Ryan Malloy 2026-03-08 03:20:01 -06:00
parent 6af3104633
commit d413438fea
5 changed files with 48 additions and 22 deletions

View File

@ -1,6 +1,6 @@
[project] [project]
name = "mcp-pdf" name = "mcp-pdf"
version = "2.1.4" version = "2.1.5"
description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more" description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more"
authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}] authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}]
readme = "README.md" readme = "README.md"
@ -36,8 +36,6 @@ dependencies = [
"python-dotenv>=1.0.0", "python-dotenv>=1.0.0",
"PyMuPDF>=1.23.0", "PyMuPDF>=1.23.0",
"pdfplumber>=0.10.0", "pdfplumber>=0.10.0",
"camelot-py[cv]>=0.11.0", # includes opencv-python
"tabula-py>=2.8.0",
"pytesseract>=0.3.10", "pytesseract>=0.3.10",
"pdf2image>=1.16.0", "pdf2image>=1.16.0",
"pypdf>=6.0.0", "pypdf>=6.0.0",
@ -64,9 +62,17 @@ forms = [
"reportlab>=4.0.0", "reportlab>=4.0.0",
] ]
# Advanced table extraction (camelot needs Ghostscript, tabula needs Java)
tables = [
"camelot-py[cv]>=0.11.0",
"tabula-py>=2.8.0",
]
# All optional features # All optional features
all = [ all = [
"reportlab>=4.0.0", "reportlab>=4.0.0",
"camelot-py[cv]>=0.11.0",
"tabula-py>=2.8.0",
] ]
# Development dependencies # Development dependencies

View File

@ -7,12 +7,12 @@ import logging
from pathlib import Path from pathlib import Path
from typing import Dict, Any, List, Optional from typing import Dict, Any, List, Optional
# PDF processing libraries # Required
import camelot
import tabula
import pdfplumber import pdfplumber
import pandas as pd import pandas as pd
# Optional — camelot and tabula are imported lazily in their extraction methods
from .base import MCPMixin, mcp_tool from .base import MCPMixin, mcp_tool
from ..security import validate_pdf_path, parse_pages_parameter, sanitize_error_message from ..security import validate_pdf_path, parse_pages_parameter, sanitize_error_message
@ -144,6 +144,7 @@ class TableExtractionMixin(MCPMixin):
# Private helper methods (all synchronous for proper async pattern) # Private helper methods (all synchronous for proper async pattern)
def _extract_tables_camelot(self, pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]: def _extract_tables_camelot(self, pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]:
"""Extract tables using Camelot""" """Extract tables using Camelot"""
import camelot
page_str = ','.join(map(str, [p+1 for p in pages])) if pages else 'all' page_str = ','.join(map(str, [p+1 for p in pages])) if pages else 'all'
# Try lattice mode first (for bordered tables) # Try lattice mode first (for bordered tables)
@ -163,6 +164,7 @@ class TableExtractionMixin(MCPMixin):
def _extract_tables_tabula(self, pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]: def _extract_tables_tabula(self, pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]:
"""Extract tables using Tabula""" """Extract tables using Tabula"""
import tabula
page_list = [p+1 for p in pages] if pages else 'all' page_list = [p+1 for p in pages] if pages else 'all'
try: try:

View File

@ -11,12 +11,13 @@ from typing import Dict, Any, Optional, List
import logging import logging
import json import json
# Table extraction libraries # Required
import pandas as pd import pandas as pd
import camelot
import tabula
import pdfplumber import pdfplumber
# Optional — camelot and tabula are heavy deps with C/Java requirements.
# They're imported lazily in their extraction methods.
# Official FastMCP mixin # Official FastMCP mixin
from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool
@ -69,8 +70,19 @@ class TableExtractionMixin(MCPMixin):
parsed_pages = self._parse_pages_parameter(pages) parsed_pages = self._parse_pages_parameter(pages)
if method == "auto": if method == "auto":
# Try methods in order of reliability # Try methods in order of reliability, skip unavailable ones
methods_to_try = ["camelot", "pdfplumber", "tabula"] methods_to_try = []
try:
import camelot # noqa: F401
methods_to_try.append("camelot")
except ImportError:
pass
methods_to_try.append("pdfplumber") # always available
try:
import tabula # noqa: F401
methods_to_try.append("tabula")
except ImportError:
pass
else: else:
methods_to_try = [method] methods_to_try = [method]

View File

@ -23,8 +23,6 @@ import httpx
# PDF processing libraries # PDF processing libraries
import fitz # PyMuPDF import fitz # PyMuPDF
import pdfplumber import pdfplumber
import camelot
import tabula
import pytesseract import pytesseract
from pdf2image import convert_from_path from pdf2image import convert_from_path
import pypdf import pypdf
@ -714,6 +712,7 @@ async def extract_text(
# Table extraction methods # Table extraction methods
async def extract_tables_camelot(pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]: async def extract_tables_camelot(pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]:
"""Extract tables using Camelot""" """Extract tables using Camelot"""
import camelot
page_str = ','.join(map(str, [p+1 for p in pages])) if pages else 'all' page_str = ','.join(map(str, [p+1 for p in pages])) if pages else 'all'
# Try lattice mode first (for bordered tables) # Try lattice mode first (for bordered tables)
@ -733,6 +732,7 @@ async def extract_tables_camelot(pdf_path: Path, pages: Optional[List[int]] = No
async def extract_tables_tabula(pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]: async def extract_tables_tabula(pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]:
"""Extract tables using Tabula""" """Extract tables using Tabula"""
import tabula
page_list = [p+1 for p in pages] if pages else 'all' page_list = [p+1 for p in pages] if pages else 'all'
try: try:

18
uv.lock generated
View File

@ -1032,10 +1032,9 @@ wheels = [
[[package]] [[package]]
name = "mcp-pdf" name = "mcp-pdf"
version = "2.1.3" version = "2.1.4"
source = { editable = "." } source = { editable = "." }
dependencies = [ dependencies = [
{ name = "camelot-py", extra = ["cv"] },
{ name = "fastmcp" }, { name = "fastmcp" },
{ name = "httpx" }, { name = "httpx" },
{ name = "markdown" }, { name = "markdown" },
@ -1048,12 +1047,13 @@ dependencies = [
{ name = "pypdf" }, { name = "pypdf" },
{ name = "pytesseract" }, { name = "pytesseract" },
{ name = "python-dotenv" }, { name = "python-dotenv" },
{ name = "tabula-py" },
] ]
[package.optional-dependencies] [package.optional-dependencies]
all = [ all = [
{ name = "camelot-py", extra = ["cv"] },
{ name = "reportlab" }, { name = "reportlab" },
{ name = "tabula-py" },
] ]
dev = [ dev = [
{ name = "black" }, { name = "black" },
@ -1069,6 +1069,10 @@ dev = [
forms = [ forms = [
{ name = "reportlab" }, { name = "reportlab" },
] ]
tables = [
{ name = "camelot-py", extra = ["cv"] },
{ name = "tabula-py" },
]
[package.dev-dependencies] [package.dev-dependencies]
dev = [ dev = [
@ -1085,7 +1089,8 @@ dev = [
requires-dist = [ requires-dist = [
{ name = "black", marker = "extra == 'dev'", specifier = ">=23.0.0" }, { name = "black", marker = "extra == 'dev'", specifier = ">=23.0.0" },
{ name = "build", marker = "extra == 'dev'", specifier = ">=0.10.0" }, { name = "build", marker = "extra == 'dev'", specifier = ">=0.10.0" },
{ name = "camelot-py", extras = ["cv"], specifier = ">=0.11.0" }, { name = "camelot-py", extras = ["cv"], marker = "extra == 'all'", specifier = ">=0.11.0" },
{ name = "camelot-py", extras = ["cv"], marker = "extra == 'tables'", specifier = ">=0.11.0" },
{ name = "fastmcp", specifier = ">=0.1.0" }, { name = "fastmcp", specifier = ">=0.1.0" },
{ name = "httpx", specifier = ">=0.25.0" }, { name = "httpx", specifier = ">=0.25.0" },
{ name = "markdown", specifier = ">=3.5.0" }, { name = "markdown", specifier = ">=3.5.0" },
@ -1106,10 +1111,11 @@ requires-dist = [
{ name = "reportlab", marker = "extra == 'forms'", specifier = ">=4.0.0" }, { name = "reportlab", marker = "extra == 'forms'", specifier = ">=4.0.0" },
{ name = "ruff", marker = "extra == 'dev'", specifier = ">=0.1.0" }, { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.1.0" },
{ name = "safety", marker = "extra == 'dev'", specifier = ">=3.0.0" }, { name = "safety", marker = "extra == 'dev'", specifier = ">=3.0.0" },
{ name = "tabula-py", specifier = ">=2.8.0" }, { name = "tabula-py", marker = "extra == 'all'", specifier = ">=2.8.0" },
{ name = "tabula-py", marker = "extra == 'tables'", specifier = ">=2.8.0" },
{ name = "twine", marker = "extra == 'dev'", specifier = ">=4.0.0" }, { name = "twine", marker = "extra == 'dev'", specifier = ">=4.0.0" },
] ]
provides-extras = ["forms", "all", "dev"] provides-extras = ["forms", "tables", "all", "dev"]
[package.metadata.requires-dev] [package.metadata.requires-dev]
dev = [ dev = [