From d413438feaf944e0c7b1be9c30026cbcd8704e1c Mon Sep 17 00:00:00 2001
From: Ryan Malloy <ryan@supported.systems>
Date: Sun, 8 Mar 2026 03:20:01 -0600
Subject: [PATCH] =?UTF-8?q?=F0=9F=93=A6=20Make=20camelot-py=20and=20tabula?=
 =?UTF-8?q?-py=20optional=20dependencies?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move camelot-py[cv] and tabula-py from the core dependencies to an
optional "tables" extra (pip install "mcp-pdf[tables]"). This fixes the
Python 3.14 install failure caused by pdftopng lacking cp314 wheels.

- Lazy-import camelot/tabula in all extraction methods
- Auto-fallback skips unavailable methods in table extraction
- pdfplumber (pure Python, always installed) handles tables when the
  optional extractors are not installed
- Also slims get_document_structure response (~12.5k → ~400 tokens)
---
 pyproject.toml                                | 12 +++++++---
 src/mcp_pdf/mixins/table_extraction.py        |  8 ++++---
 .../mixins_official/table_extraction.py       | 22 ++++++++++++++-----
 src/mcp_pdf/server_legacy.py                  | 10 ++++-----
 uv.lock                                       | 18 ++++++++++-----
 5 files changed, 48 insertions(+), 22 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 9447138..6a063e3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "mcp-pdf"
-version = "2.1.4"
+version = "2.1.5"
 description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more"
 authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}]
 readme = "README.md"
@@ -36,8 +36,6 @@ dependencies = [
     "python-dotenv>=1.0.0",
     "PyMuPDF>=1.23.0",
     "pdfplumber>=0.10.0",
-    "camelot-py[cv]>=0.11.0",  # includes opencv-python
-    "tabula-py>=2.8.0",
     "pytesseract>=0.3.10",
     "pdf2image>=1.16.0",
     "pypdf>=6.0.0",
@@ -64,9 +62,17 @@ forms = [
     "reportlab>=4.0.0",
 ]
 
+# Advanced table extraction (camelot needs Ghostscript, tabula needs Java)
+tables = [
+    "camelot-py[cv]>=0.11.0",
+    "tabula-py>=2.8.0",
+]
+
 # All optional features
 all = [
     "reportlab>=4.0.0",
+    "camelot-py[cv]>=0.11.0",
+    "tabula-py>=2.8.0",
 ]
 
 # Development dependencies
diff --git a/src/mcp_pdf/mixins/table_extraction.py b/src/mcp_pdf/mixins/table_extraction.py
index 519e3b2..681fb2f 100644
--- a/src/mcp_pdf/mixins/table_extraction.py
+++ b/src/mcp_pdf/mixins/table_extraction.py
@@ -7,12 +7,12 @@ import logging
 from pathlib import Path
 from typing import Dict, Any, List, Optional
 
-# PDF processing libraries
-import camelot
-import tabula
+# Required
 import pdfplumber
 import pandas as pd
 
+# Optional: camelot and tabula are imported lazily inside their extraction methods.
+
 from .base import MCPMixin, mcp_tool
 from ..security import validate_pdf_path, parse_pages_parameter, sanitize_error_message
 
@@ -144,6 +144,7 @@ class TableExtractionMixin(MCPMixin):
     # Private helper methods (all synchronous for proper async pattern)
     def _extract_tables_camelot(self, pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]:
         """Extract tables using Camelot"""
+        import camelot
         page_str = ','.join(map(str, [p+1 for p in pages])) if pages else 'all'
 
         # Try lattice mode first (for bordered tables)
@@ -163,6 +164,7 @@ class TableExtractionMixin(MCPMixin):
 
     def _extract_tables_tabula(self, pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]:
         """Extract tables using Tabula"""
+        import tabula
         page_list = [p+1 for p in pages] if pages else 'all'
 
         try:
diff --git a/src/mcp_pdf/mixins_official/table_extraction.py b/src/mcp_pdf/mixins_official/table_extraction.py
index 16450e9..374fb0c 100644
--- a/src/mcp_pdf/mixins_official/table_extraction.py
+++ b/src/mcp_pdf/mixins_official/table_extraction.py
@@ -11,12 +11,13 @@ from typing import Dict, Any, Optional, List
 import logging
 import json
 
-# Table extraction libraries
+# Required
 import pandas as pd
-import camelot
-import tabula
 import pdfplumber
 
+# Optional — camelot and tabula are heavy deps with C/Java requirements.
+# They're imported lazily in their extraction methods.
+
 # Official FastMCP mixin
 from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool
 
@@ -69,8 +70,19 @@ class TableExtractionMixin(MCPMixin):
             parsed_pages = self._parse_pages_parameter(pages)
 
             if method == "auto":
-                # Try methods in order of reliability
-                methods_to_try = ["camelot", "pdfplumber", "tabula"]
+                # Try methods in order of reliability, skip unavailable ones
+                methods_to_try = []
+                try:
+                    import camelot  # noqa: F401
+                    methods_to_try.append("camelot")
+                except ImportError:
+                    pass
+                methods_to_try.append("pdfplumber")  # always available
+                try:
+                    import tabula  # noqa: F401
+                    methods_to_try.append("tabula")
+                except ImportError:
+                    pass
             else:
                 methods_to_try = [method]
 
diff --git a/src/mcp_pdf/server_legacy.py b/src/mcp_pdf/server_legacy.py
index 0ebd552..94d9819 100644
--- a/src/mcp_pdf/server_legacy.py
+++ b/src/mcp_pdf/server_legacy.py
@@ -23,8 +23,6 @@ import httpx
 # PDF processing libraries
 import fitz  # PyMuPDF
 import pdfplumber
-import camelot
-import tabula
 import pytesseract
 from pdf2image import convert_from_path
 import pypdf
@@ -714,8 +712,9 @@ async def extract_text(
 # Table extraction methods
 async def extract_tables_camelot(pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]:
     """Extract tables using Camelot"""
+    import camelot
     page_str = ','.join(map(str, [p+1 for p in pages])) if pages else 'all'
-    
+
     # Try lattice mode first (for bordered tables)
     try:
         tables = camelot.read_pdf(str(pdf_path), pages=page_str, flavor='lattice')
@@ -723,7 +722,7 @@ async def extract_tables_camelot(pdf_path: Path, pages: Optional[List[int]] = No
             return [table.df for table in tables]
     except Exception:
         pass
-    
+
     # Fall back to stream mode (for borderless tables)
     try:
         tables = camelot.read_pdf(str(pdf_path), pages=page_str, flavor='stream')
@@ -733,8 +732,9 @@ async def extract_tables_camelot(pdf_path: Path, pages: Optional[List[int]] = No
 
 async def extract_tables_tabula(pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]:
     """Extract tables using Tabula"""
+    import tabula
     page_list = [p+1 for p in pages] if pages else 'all'
-    
+
     try:
         tables = tabula.read_pdf(str(pdf_path), pages=page_list, multiple_tables=True)
         return tables
diff --git a/uv.lock b/uv.lock
index 4e53dcb..8d823d4 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1032,10 +1032,9 @@ wheels = [
 
 [[package]]
 name = "mcp-pdf"
-version = "2.1.3"
+version = "2.1.4"
 source = { editable = "." }
 dependencies = [
-    { name = "camelot-py", extra = ["cv"] },
     { name = "fastmcp" },
     { name = "httpx" },
     { name = "markdown" },
@@ -1048,12 +1047,13 @@ dependencies = [
     { name = "pypdf" },
     { name = "pytesseract" },
     { name = "python-dotenv" },
-    { name = "tabula-py" },
 ]
 
 [package.optional-dependencies]
 all = [
+    { name = "camelot-py", extra = ["cv"] },
     { name = "reportlab" },
+    { name = "tabula-py" },
 ]
 dev = [
     { name = "black" },
@@ -1069,6 +1069,10 @@ dev = [
 forms = [
     { name = "reportlab" },
 ]
+tables = [
+    { name = "camelot-py", extra = ["cv"] },
+    { name = "tabula-py" },
+]
 
 [package.dev-dependencies]
 dev = [
@@ -1085,7 +1089,8 @@ dev = [
 requires-dist = [
     { name = "black", marker = "extra == 'dev'", specifier = ">=23.0.0" },
     { name = "build", marker = "extra == 'dev'", specifier = ">=0.10.0" },
-    { name = "camelot-py", extras = ["cv"], specifier = ">=0.11.0" },
+    { name = "camelot-py", extras = ["cv"], marker = "extra == 'all'", specifier = ">=0.11.0" },
+    { name = "camelot-py", extras = ["cv"], marker = "extra == 'tables'", specifier = ">=0.11.0" },
     { name = "fastmcp", specifier = ">=0.1.0" },
     { name = "httpx", specifier = ">=0.25.0" },
     { name = "markdown", specifier = ">=3.5.0" },
@@ -1106,10 +1111,11 @@ requires-dist = [
     { name = "reportlab", marker = "extra == 'forms'", specifier = ">=4.0.0" },
     { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.1.0" },
     { name = "safety", marker = "extra == 'dev'", specifier = ">=3.0.0" },
-    { name = "tabula-py", specifier = ">=2.8.0" },
+    { name = "tabula-py", marker = "extra == 'all'", specifier = ">=2.8.0" },
+    { name = "tabula-py", marker = "extra == 'tables'", specifier = ">=2.8.0" },
     { name = "twine", marker = "extra == 'dev'", specifier = ">=4.0.0" },
 ]
-provides-extras = ["forms", "all", "dev"]
+provides-extras = ["forms", "tables", "all", "dev"]
 
 [package.metadata.requires-dev]
 dev = [