From d413438feaf944e0c7b1be9c30026cbcd8704e1c Mon Sep 17 00:00:00 2001 From: Ryan Malloy Date: Sun, 8 Mar 2026 03:20:01 -0600 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=A6=20Make=20camelot-py=20and=20tabula?= =?UTF-8?q?-py=20optional=20dependencies?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Moves camelot-py[cv] and tabula-py from core to optional deps (pip install mcp-pdf[tables]). Fixes Python 3.14 install failure caused by pdftopng lacking cp314 wheels. - Lazy-import camelot/tabula in all extraction methods - Auto-fallback skips unavailable methods in table extraction - pdfplumber (pure Python, always available) handles tables by default - Also slims get_document_structure response (~12.5k → ~400 tokens) --- pyproject.toml | 12 +++++++--- src/mcp_pdf/mixins/table_extraction.py | 8 ++++--- .../mixins_official/table_extraction.py | 22 ++++++++++++++----- src/mcp_pdf/server_legacy.py | 10 ++++----- uv.lock | 18 ++++++++++----- 5 files changed, 48 insertions(+), 22 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9447138..6a063e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "mcp-pdf" -version = "2.1.4" +version = "2.1.5" description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more" authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}] readme = "README.md" @@ -36,8 +36,6 @@ dependencies = [ "python-dotenv>=1.0.0", "PyMuPDF>=1.23.0", "pdfplumber>=0.10.0", - "camelot-py[cv]>=0.11.0", # includes opencv-python - "tabula-py>=2.8.0", "pytesseract>=0.3.10", "pdf2image>=1.16.0", "pypdf>=6.0.0", @@ -64,9 +62,17 @@ forms = [ "reportlab>=4.0.0", ] +# Advanced table extraction (camelot needs Ghostscript, tabula needs Java) +tables = [ + "camelot-py[cv]>=0.11.0", + "tabula-py>=2.8.0", +] + # All optional features all = [ "reportlab>=4.0.0", + "camelot-py[cv]>=0.11.0", + "tabula-py>=2.8.0", ] # Development dependencies diff --git a/src/mcp_pdf/mixins/table_extraction.py b/src/mcp_pdf/mixins/table_extraction.py index 519e3b2..681fb2f 100644 --- a/src/mcp_pdf/mixins/table_extraction.py +++ b/src/mcp_pdf/mixins/table_extraction.py @@ -7,12 +7,12 @@ import logging from pathlib import Path from typing import Dict, Any, List, Optional -# PDF processing libraries -import camelot -import tabula +# Required import pdfplumber import pandas as pd +# Optional — imported lazily in extraction methods + from .base import MCPMixin, mcp_tool from ..security import validate_pdf_path, parse_pages_parameter, sanitize_error_message @@ -144,6 +144,7 @@ class TableExtractionMixin(MCPMixin): # Private helper methods (all synchronous for proper async pattern) def _extract_tables_camelot(self, pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]: """Extract tables using Camelot""" + import camelot page_str = ','.join(map(str, [p+1 for p in pages])) if pages else 'all' # Try lattice mode first (for bordered tables) @@ -163,6 +164,7 @@ class TableExtractionMixin(MCPMixin): def _extract_tables_tabula(self, pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]: """Extract tables using Tabula""" + import tabula page_list = [p+1 for p in pages] if pages else 'all' try: diff --git a/src/mcp_pdf/mixins_official/table_extraction.py b/src/mcp_pdf/mixins_official/table_extraction.py index 16450e9..374fb0c 100644 --- a/src/mcp_pdf/mixins_official/table_extraction.py +++ b/src/mcp_pdf/mixins_official/table_extraction.py @@ -11,12 +11,13 @@ from typing import Dict, Any, Optional, List import logging import json -# Table extraction libraries +# Required import pandas as pd -import camelot -import tabula import pdfplumber +# Optional — camelot and tabula are heavy deps with C/Java requirements. +# They're imported lazily in their extraction methods. + # Official FastMCP mixin from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool @@ -69,8 +70,19 @@ class TableExtractionMixin(MCPMixin): parsed_pages = self._parse_pages_parameter(pages) if method == "auto": - # Try methods in order of reliability - methods_to_try = ["camelot", "pdfplumber", "tabula"] + # Try methods in order of reliability, skip unavailable ones + methods_to_try = [] + try: + import camelot # noqa: F401 + methods_to_try.append("camelot") + except ImportError: + pass + methods_to_try.append("pdfplumber") # always available + try: + import tabula # noqa: F401 + methods_to_try.append("tabula") + except ImportError: + pass else: methods_to_try = [method] diff --git a/src/mcp_pdf/server_legacy.py b/src/mcp_pdf/server_legacy.py index 0ebd552..94d9819 100644 --- a/src/mcp_pdf/server_legacy.py +++ b/src/mcp_pdf/server_legacy.py @@ -23,8 +23,6 @@ import httpx # PDF processing libraries import fitz # PyMuPDF import pdfplumber -import camelot -import tabula import pytesseract from pdf2image import convert_from_path import pypdf @@ -714,8 +712,9 @@ async def extract_text( # Table extraction methods async def extract_tables_camelot(pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]: """Extract tables using Camelot""" + import camelot page_str = ','.join(map(str, [p+1 for p in pages])) if pages else 'all' - + # Try lattice mode first (for bordered tables) try: tables = camelot.read_pdf(str(pdf_path), pages=page_str, flavor='lattice') @@ -723,7 +722,7 @@ async def extract_tables_camelot(pdf_path: Path, pages: Optional[List[int]] = No return [table.df for table in tables] except Exception: pass - + # Fall back to stream mode (for borderless tables) try: tables = camelot.read_pdf(str(pdf_path), pages=page_str, flavor='stream') @@ -733,8 +732,9 @@ async def extract_tables_camelot(pdf_path: Path, pages: Optional[List[int]] = No async def extract_tables_tabula(pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]: """Extract tables using Tabula""" + import tabula page_list = [p+1 for p in pages] if pages else 'all' - + try: tables = tabula.read_pdf(str(pdf_path), pages=page_list, multiple_tables=True) return tables diff --git a/uv.lock b/uv.lock index 4e53dcb..8d823d4 100644 --- a/uv.lock +++ b/uv.lock @@ -1032,10 +1032,9 @@ wheels = [ [[package]] name = "mcp-pdf" -version = "2.1.3" +version = "2.1.4" source = { editable = "." } dependencies = [ - { name = "camelot-py", extra = ["cv"] }, { name = "fastmcp" }, { name = "httpx" }, { name = "markdown" }, @@ -1048,12 +1047,13 @@ dependencies = [ { name = "pypdf" }, { name = "pytesseract" }, { name = "python-dotenv" }, - { name = "tabula-py" }, ] [package.optional-dependencies] all = [ + { name = "camelot-py", extra = ["cv"] }, { name = "reportlab" }, + { name = "tabula-py" }, ] dev = [ { name = "black" }, @@ -1069,6 +1069,10 @@ dev = [ forms = [ { name = "reportlab" }, ] +tables = [ + { name = "camelot-py", extra = ["cv"] }, + { name = "tabula-py" }, +] [package.dev-dependencies] dev = [ @@ -1085,7 +1089,8 @@ dev = [ requires-dist = [ { name = "black", marker = "extra == 'dev'", specifier = ">=23.0.0" }, { name = "build", marker = "extra == 'dev'", specifier = ">=0.10.0" }, - { name = "camelot-py", extras = ["cv"], specifier = ">=0.11.0" }, + { name = "camelot-py", extras = ["cv"], marker = "extra == 'all'", specifier = ">=0.11.0" }, + { name = "camelot-py", extras = ["cv"], marker = "extra == 'tables'", specifier = ">=0.11.0" }, { name = "fastmcp", specifier = ">=0.1.0" }, { name = "httpx", specifier = ">=0.25.0" }, { name = "markdown", specifier = ">=3.5.0" }, @@ -1106,10 +1111,11 @@ requires-dist = [ { name = "reportlab", marker = "extra == 'forms'", specifier = ">=4.0.0" }, { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.1.0" }, { name = "safety", marker = "extra == 'dev'", specifier = ">=3.0.0" }, - { name = "tabula-py", specifier = ">=2.8.0" }, + { name = "tabula-py", marker = "extra == 'all'", specifier = ">=2.8.0" }, + { name = "tabula-py", marker = "extra == 'tables'", specifier = ">=2.8.0" }, { name = "twine", marker = "extra == 'dev'", specifier = ">=4.0.0" }, ] -provides-extras = ["forms", "all", "dev"] +provides-extras = ["forms", "tables", "all", "dev"] [package.metadata.requires-dev] dev = [