📦 Make camelot-py and tabula-py optional dependencies

Moves camelot-py[cv] and tabula-py from core dependencies to an optional extra (pip install mcp-pdf[tables]). Fixes the Python 3.14 install failure caused by pdftopng lacking cp314 wheels.

- Lazy-import camelot/tabula in all extraction methods
- Auto-fallback skips unavailable methods in table extraction
- pdfplumber (pure Python, always available) handles tables by default
- Also slims get_document_structure response (~12.5k → ~400 tokens)
2026-03-08 03:20:01 -06:00 · 2026-03-08 03:20:01 -06:00 · d413438fea
commit d413438fea
parent 6af3104633
5 changed files with 48 additions and 22 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [project]
 name = "mcp-pdf"
-version = "2.1.4"
+version = "2.1.5"
 description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more"
 authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}]
 readme = "README.md"
@ -36,8 +36,6 @@ dependencies = [
    "python-dotenv>=1.0.0",
    "PyMuPDF>=1.23.0",
    "pdfplumber>=0.10.0",
    "camelot-py[cv]>=0.11.0",  # includes opencv-python
    "tabula-py>=2.8.0",
    "pytesseract>=0.3.10",
    "pdf2image>=1.16.0",
    "pypdf>=6.0.0",
@ -64,9 +62,17 @@ forms = [
    "reportlab>=4.0.0",
 ]
 # Advanced table extraction (camelot needs Ghostscript, tabula needs Java)
 tables = [
    "camelot-py[cv]>=0.11.0",
    "tabula-py>=2.8.0",
 ]
 # All optional features
 all = [
    "reportlab>=4.0.0",
    "camelot-py[cv]>=0.11.0",
    "tabula-py>=2.8.0",
 ]
 # Development dependencies
--- a/src/mcp_pdf/mixins/table_extraction.py
+++ b/src/mcp_pdf/mixins/table_extraction.py
@ -7,12 +7,12 @@ import logging
 from pathlib import Path
 from typing import Dict, Any, List, Optional
-# PDF processing libraries
+# Required
 import camelot
 import tabula
 import pdfplumber
 import pandas as pd
 # Optional — imported lazily in extraction methods
 from .base import MCPMixin, mcp_tool
 from ..security import validate_pdf_path, parse_pages_parameter, sanitize_error_message
@ -144,6 +144,7 @@ class TableExtractionMixin(MCPMixin):
    # Private helper methods (all synchronous for proper async pattern)
    def _extract_tables_camelot(self, pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]:
        """Extract tables using Camelot"""
        import camelot
        page_str = ','.join(map(str, [p+1 for p in pages])) if pages else 'all'
        # Try lattice mode first (for bordered tables)
@ -163,6 +164,7 @@ class TableExtractionMixin(MCPMixin):
    def _extract_tables_tabula(self, pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]:
        """Extract tables using Tabula"""
        import tabula
        page_list = [p+1 for p in pages] if pages else 'all'
        try:
--- a/src/mcp_pdf/mixins_official/table_extraction.py
+++ b/src/mcp_pdf/mixins_official/table_extraction.py
@ -11,12 +11,13 @@ from typing import Dict, Any, Optional, List
 import logging
 import json
-# Table extraction libraries
+# Required
 import pandas as pd
 import camelot
 import tabula
 import pdfplumber
 # Optional — camelot and tabula are heavy deps with C/Java requirements.
 # They're imported lazily in their extraction methods.
 # Official FastMCP mixin
 from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool
@ -69,8 +70,19 @@ class TableExtractionMixin(MCPMixin):
            parsed_pages = self._parse_pages_parameter(pages)
            if method == "auto":
-                # Try methods in order of reliability
+                # Try methods in order of reliability, skip unavailable ones
-                methods_to_try = ["camelot", "pdfplumber", "tabula"]
+                methods_to_try = []
                try:
                    import camelot  # noqa: F401
                    methods_to_try.append("camelot")
                except ImportError:
                    pass
                methods_to_try.append("pdfplumber")  # always available
                try:
                    import tabula  # noqa: F401
                    methods_to_try.append("tabula")
                except ImportError:
                    pass
            else:
                methods_to_try = [method]
--- a/src/mcp_pdf/server_legacy.py
+++ b/src/mcp_pdf/server_legacy.py
@ -23,8 +23,6 @@ import httpx
 # PDF processing libraries
 import fitz  # PyMuPDF
 import pdfplumber
 import camelot
 import tabula
 import pytesseract
 from pdf2image import convert_from_path
 import pypdf
@ -714,8 +712,9 @@ async def extract_text(
 # Table extraction methods
 async def extract_tables_camelot(pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]:
    """Extract tables using Camelot"""
    import camelot
    page_str = ','.join(map(str, [p+1 for p in pages])) if pages else 'all'
-    
+
    # Try lattice mode first (for bordered tables)
    try:
        tables = camelot.read_pdf(str(pdf_path), pages=page_str, flavor='lattice')
@ -723,7 +722,7 @@ async def extract_tables_camelot(pdf_path: Path, pages: Optional[List[int]] = No
            return [table.df for table in tables]
    except Exception:
        pass
-    
+
    # Fall back to stream mode (for borderless tables)
    try:
        tables = camelot.read_pdf(str(pdf_path), pages=page_str, flavor='stream')
@ -733,8 +732,9 @@ async def extract_tables_camelot(pdf_path: Path, pages: Optional[List[int]] = No
 async def extract_tables_tabula(pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]:
    """Extract tables using Tabula"""
    import tabula
    page_list = [p+1 for p in pages] if pages else 'all'
-    
+
    try:
        tables = tabula.read_pdf(str(pdf_path), pages=page_list, multiple_tables=True)
        return tables
--- a/uv.lock
+++ b/uv.lock
@ -1032,10 +1032,9 @@ wheels = [
 [[package]]
 name = "mcp-pdf"
-version = "2.1.3"
+version = "2.1.4"
 source = { editable = "." }
 dependencies = [
    { name = "camelot-py", extra = ["cv"] },
    { name = "fastmcp" },
    { name = "httpx" },
    { name = "markdown" },
@ -1048,12 +1047,13 @@ dependencies = [
    { name = "pypdf" },
    { name = "pytesseract" },
    { name = "python-dotenv" },
    { name = "tabula-py" },
 ]
 [package.optional-dependencies]
 all = [
    { name = "camelot-py", extra = ["cv"] },
    { name = "reportlab" },
    { name = "tabula-py" },
 ]
 dev = [
    { name = "black" },
@ -1069,6 +1069,10 @@ dev = [
 forms = [
    { name = "reportlab" },
 ]
 tables = [
    { name = "camelot-py", extra = ["cv"] },
    { name = "tabula-py" },
 ]
 [package.dev-dependencies]
 dev = [
@ -1085,7 +1089,8 @@ dev = [
 requires-dist = [
    { name = "black", marker = "extra == 'dev'", specifier = ">=23.0.0" },
    { name = "build", marker = "extra == 'dev'", specifier = ">=0.10.0" },
-    { name = "camelot-py", extras = ["cv"], specifier = ">=0.11.0" },
+    { name = "camelot-py", extras = ["cv"], marker = "extra == 'all'", specifier = ">=0.11.0" },
    { name = "camelot-py", extras = ["cv"], marker = "extra == 'tables'", specifier = ">=0.11.0" },
    { name = "fastmcp", specifier = ">=0.1.0" },
    { name = "httpx", specifier = ">=0.25.0" },
    { name = "markdown", specifier = ">=3.5.0" },
@ -1106,10 +1111,11 @@ requires-dist = [
    { name = "reportlab", marker = "extra == 'forms'", specifier = ">=4.0.0" },
    { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.1.0" },
    { name = "safety", marker = "extra == 'dev'", specifier = ">=3.0.0" },
-    { name = "tabula-py", specifier = ">=2.8.0" },
+    { name = "tabula-py", marker = "extra == 'all'", specifier = ">=2.8.0" },
    { name = "tabula-py", marker = "extra == 'tables'", specifier = ">=2.8.0" },
    { name = "twine", marker = "extra == 'dev'", specifier = ">=4.0.0" },
 ]
-provides-extras = ["forms", "all", "dev"]
+provides-extras = ["forms", "tables", "all", "dev"]
 [package.metadata.requires-dev]
 dev = [