📦 Make camelot-py and tabula-py optional dependencies
Moves camelot-py[cv] and tabula-py from core dependencies to optional dependencies (install them with `pip install mcp-pdf[tables]`). This fixes the Python 3.14 install failure caused by pdftopng lacking cp314 wheels.
- Lazy-import camelot/tabula in all extraction methods
- Auto-fallback skips unavailable methods in table extraction
- pdfplumber (pure Python, always available) handles tables by default
- Also slims the get_document_structure response (~12.5k → ~400 tokens)
This commit is contained in:
parent
6af3104633
commit
d413438fea
@ -1,6 +1,6 @@
|
|||||||
[project]
|
[project]
|
||||||
name = "mcp-pdf"
|
name = "mcp-pdf"
|
||||||
version = "2.1.4"
|
version = "2.1.5"
|
||||||
description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more"
|
description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more"
|
||||||
authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}]
|
authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}]
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
@ -36,8 +36,6 @@ dependencies = [
|
|||||||
"python-dotenv>=1.0.0",
|
"python-dotenv>=1.0.0",
|
||||||
"PyMuPDF>=1.23.0",
|
"PyMuPDF>=1.23.0",
|
||||||
"pdfplumber>=0.10.0",
|
"pdfplumber>=0.10.0",
|
||||||
"camelot-py[cv]>=0.11.0", # includes opencv-python
|
|
||||||
"tabula-py>=2.8.0",
|
|
||||||
"pytesseract>=0.3.10",
|
"pytesseract>=0.3.10",
|
||||||
"pdf2image>=1.16.0",
|
"pdf2image>=1.16.0",
|
||||||
"pypdf>=6.0.0",
|
"pypdf>=6.0.0",
|
||||||
@ -64,9 +62,17 @@ forms = [
|
|||||||
"reportlab>=4.0.0",
|
"reportlab>=4.0.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Advanced table extraction (camelot needs Ghostscript, tabula needs Java)
|
||||||
|
tables = [
|
||||||
|
"camelot-py[cv]>=0.11.0",
|
||||||
|
"tabula-py>=2.8.0",
|
||||||
|
]
|
||||||
|
|
||||||
# All optional features
|
# All optional features
|
||||||
all = [
|
all = [
|
||||||
"reportlab>=4.0.0",
|
"reportlab>=4.0.0",
|
||||||
|
"camelot-py[cv]>=0.11.0",
|
||||||
|
"tabula-py>=2.8.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
# Development dependencies
|
# Development dependencies
|
||||||
|
|||||||
@ -7,12 +7,12 @@ import logging
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Any, List, Optional
|
from typing import Dict, Any, List, Optional
|
||||||
|
|
||||||
# PDF processing libraries
|
# Required
|
||||||
import camelot
|
|
||||||
import tabula
|
|
||||||
import pdfplumber
|
import pdfplumber
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
# Optional — imported lazily in extraction methods
|
||||||
|
|
||||||
from .base import MCPMixin, mcp_tool
|
from .base import MCPMixin, mcp_tool
|
||||||
from ..security import validate_pdf_path, parse_pages_parameter, sanitize_error_message
|
from ..security import validate_pdf_path, parse_pages_parameter, sanitize_error_message
|
||||||
|
|
||||||
@ -144,6 +144,7 @@ class TableExtractionMixin(MCPMixin):
|
|||||||
# Private helper methods (all synchronous for proper async pattern)
|
# Private helper methods (all synchronous for proper async pattern)
|
||||||
def _extract_tables_camelot(self, pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]:
|
def _extract_tables_camelot(self, pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]:
|
||||||
"""Extract tables using Camelot"""
|
"""Extract tables using Camelot"""
|
||||||
|
import camelot
|
||||||
page_str = ','.join(map(str, [p+1 for p in pages])) if pages else 'all'
|
page_str = ','.join(map(str, [p+1 for p in pages])) if pages else 'all'
|
||||||
|
|
||||||
# Try lattice mode first (for bordered tables)
|
# Try lattice mode first (for bordered tables)
|
||||||
@ -163,6 +164,7 @@ class TableExtractionMixin(MCPMixin):
|
|||||||
|
|
||||||
def _extract_tables_tabula(self, pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]:
|
def _extract_tables_tabula(self, pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]:
|
||||||
"""Extract tables using Tabula"""
|
"""Extract tables using Tabula"""
|
||||||
|
import tabula
|
||||||
page_list = [p+1 for p in pages] if pages else 'all'
|
page_list = [p+1 for p in pages] if pages else 'all'
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|||||||
@ -11,12 +11,13 @@ from typing import Dict, Any, Optional, List
|
|||||||
import logging
|
import logging
|
||||||
import json
|
import json
|
||||||
|
|
||||||
# Table extraction libraries
|
# Required
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import camelot
|
|
||||||
import tabula
|
|
||||||
import pdfplumber
|
import pdfplumber
|
||||||
|
|
||||||
|
# Optional — camelot and tabula are heavy deps with C/Java requirements.
|
||||||
|
# They're imported lazily in their extraction methods.
|
||||||
|
|
||||||
# Official FastMCP mixin
|
# Official FastMCP mixin
|
||||||
from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool
|
from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool
|
||||||
|
|
||||||
@ -69,8 +70,19 @@ class TableExtractionMixin(MCPMixin):
|
|||||||
parsed_pages = self._parse_pages_parameter(pages)
|
parsed_pages = self._parse_pages_parameter(pages)
|
||||||
|
|
||||||
if method == "auto":
|
if method == "auto":
|
||||||
# Try methods in order of reliability
|
# Try methods in order of reliability, skip unavailable ones
|
||||||
methods_to_try = ["camelot", "pdfplumber", "tabula"]
|
methods_to_try = []
|
||||||
|
try:
|
||||||
|
import camelot # noqa: F401
|
||||||
|
methods_to_try.append("camelot")
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
methods_to_try.append("pdfplumber") # always available
|
||||||
|
try:
|
||||||
|
import tabula # noqa: F401
|
||||||
|
methods_to_try.append("tabula")
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
else:
|
else:
|
||||||
methods_to_try = [method]
|
methods_to_try = [method]
|
||||||
|
|
||||||
|
|||||||
@ -23,8 +23,6 @@ import httpx
|
|||||||
# PDF processing libraries
|
# PDF processing libraries
|
||||||
import fitz # PyMuPDF
|
import fitz # PyMuPDF
|
||||||
import pdfplumber
|
import pdfplumber
|
||||||
import camelot
|
|
||||||
import tabula
|
|
||||||
import pytesseract
|
import pytesseract
|
||||||
from pdf2image import convert_from_path
|
from pdf2image import convert_from_path
|
||||||
import pypdf
|
import pypdf
|
||||||
@ -714,8 +712,9 @@ async def extract_text(
|
|||||||
# Table extraction methods
|
# Table extraction methods
|
||||||
async def extract_tables_camelot(pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]:
|
async def extract_tables_camelot(pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]:
|
||||||
"""Extract tables using Camelot"""
|
"""Extract tables using Camelot"""
|
||||||
|
import camelot
|
||||||
page_str = ','.join(map(str, [p+1 for p in pages])) if pages else 'all'
|
page_str = ','.join(map(str, [p+1 for p in pages])) if pages else 'all'
|
||||||
|
|
||||||
# Try lattice mode first (for bordered tables)
|
# Try lattice mode first (for bordered tables)
|
||||||
try:
|
try:
|
||||||
tables = camelot.read_pdf(str(pdf_path), pages=page_str, flavor='lattice')
|
tables = camelot.read_pdf(str(pdf_path), pages=page_str, flavor='lattice')
|
||||||
@ -723,7 +722,7 @@ async def extract_tables_camelot(pdf_path: Path, pages: Optional[List[int]] = No
|
|||||||
return [table.df for table in tables]
|
return [table.df for table in tables]
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Fall back to stream mode (for borderless tables)
|
# Fall back to stream mode (for borderless tables)
|
||||||
try:
|
try:
|
||||||
tables = camelot.read_pdf(str(pdf_path), pages=page_str, flavor='stream')
|
tables = camelot.read_pdf(str(pdf_path), pages=page_str, flavor='stream')
|
||||||
@ -733,8 +732,9 @@ async def extract_tables_camelot(pdf_path: Path, pages: Optional[List[int]] = No
|
|||||||
|
|
||||||
async def extract_tables_tabula(pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]:
|
async def extract_tables_tabula(pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]:
|
||||||
"""Extract tables using Tabula"""
|
"""Extract tables using Tabula"""
|
||||||
|
import tabula
|
||||||
page_list = [p+1 for p in pages] if pages else 'all'
|
page_list = [p+1 for p in pages] if pages else 'all'
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tables = tabula.read_pdf(str(pdf_path), pages=page_list, multiple_tables=True)
|
tables = tabula.read_pdf(str(pdf_path), pages=page_list, multiple_tables=True)
|
||||||
return tables
|
return tables
|
||||||
|
|||||||
18
uv.lock
generated
18
uv.lock
generated
@ -1032,10 +1032,9 @@ wheels = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "mcp-pdf"
|
name = "mcp-pdf"
|
||||||
version = "2.1.3"
|
version = "2.1.4"
|
||||||
source = { editable = "." }
|
source = { editable = "." }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "camelot-py", extra = ["cv"] },
|
|
||||||
{ name = "fastmcp" },
|
{ name = "fastmcp" },
|
||||||
{ name = "httpx" },
|
{ name = "httpx" },
|
||||||
{ name = "markdown" },
|
{ name = "markdown" },
|
||||||
@ -1048,12 +1047,13 @@ dependencies = [
|
|||||||
{ name = "pypdf" },
|
{ name = "pypdf" },
|
||||||
{ name = "pytesseract" },
|
{ name = "pytesseract" },
|
||||||
{ name = "python-dotenv" },
|
{ name = "python-dotenv" },
|
||||||
{ name = "tabula-py" },
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.optional-dependencies]
|
[package.optional-dependencies]
|
||||||
all = [
|
all = [
|
||||||
|
{ name = "camelot-py", extra = ["cv"] },
|
||||||
{ name = "reportlab" },
|
{ name = "reportlab" },
|
||||||
|
{ name = "tabula-py" },
|
||||||
]
|
]
|
||||||
dev = [
|
dev = [
|
||||||
{ name = "black" },
|
{ name = "black" },
|
||||||
@ -1069,6 +1069,10 @@ dev = [
|
|||||||
forms = [
|
forms = [
|
||||||
{ name = "reportlab" },
|
{ name = "reportlab" },
|
||||||
]
|
]
|
||||||
|
tables = [
|
||||||
|
{ name = "camelot-py", extra = ["cv"] },
|
||||||
|
{ name = "tabula-py" },
|
||||||
|
]
|
||||||
|
|
||||||
[package.dev-dependencies]
|
[package.dev-dependencies]
|
||||||
dev = [
|
dev = [
|
||||||
@ -1085,7 +1089,8 @@ dev = [
|
|||||||
requires-dist = [
|
requires-dist = [
|
||||||
{ name = "black", marker = "extra == 'dev'", specifier = ">=23.0.0" },
|
{ name = "black", marker = "extra == 'dev'", specifier = ">=23.0.0" },
|
||||||
{ name = "build", marker = "extra == 'dev'", specifier = ">=0.10.0" },
|
{ name = "build", marker = "extra == 'dev'", specifier = ">=0.10.0" },
|
||||||
{ name = "camelot-py", extras = ["cv"], specifier = ">=0.11.0" },
|
{ name = "camelot-py", extras = ["cv"], marker = "extra == 'all'", specifier = ">=0.11.0" },
|
||||||
|
{ name = "camelot-py", extras = ["cv"], marker = "extra == 'tables'", specifier = ">=0.11.0" },
|
||||||
{ name = "fastmcp", specifier = ">=0.1.0" },
|
{ name = "fastmcp", specifier = ">=0.1.0" },
|
||||||
{ name = "httpx", specifier = ">=0.25.0" },
|
{ name = "httpx", specifier = ">=0.25.0" },
|
||||||
{ name = "markdown", specifier = ">=3.5.0" },
|
{ name = "markdown", specifier = ">=3.5.0" },
|
||||||
@ -1106,10 +1111,11 @@ requires-dist = [
|
|||||||
{ name = "reportlab", marker = "extra == 'forms'", specifier = ">=4.0.0" },
|
{ name = "reportlab", marker = "extra == 'forms'", specifier = ">=4.0.0" },
|
||||||
{ name = "ruff", marker = "extra == 'dev'", specifier = ">=0.1.0" },
|
{ name = "ruff", marker = "extra == 'dev'", specifier = ">=0.1.0" },
|
||||||
{ name = "safety", marker = "extra == 'dev'", specifier = ">=3.0.0" },
|
{ name = "safety", marker = "extra == 'dev'", specifier = ">=3.0.0" },
|
||||||
{ name = "tabula-py", specifier = ">=2.8.0" },
|
{ name = "tabula-py", marker = "extra == 'all'", specifier = ">=2.8.0" },
|
||||||
|
{ name = "tabula-py", marker = "extra == 'tables'", specifier = ">=2.8.0" },
|
||||||
{ name = "twine", marker = "extra == 'dev'", specifier = ">=4.0.0" },
|
{ name = "twine", marker = "extra == 'dev'", specifier = ">=4.0.0" },
|
||||||
]
|
]
|
||||||
provides-extras = ["forms", "all", "dev"]
|
provides-extras = ["forms", "tables", "all", "dev"]
|
||||||
|
|
||||||
[package.metadata.requires-dev]
|
[package.metadata.requires-dev]
|
||||||
dev = [
|
dev = [
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user