From 62d9b176c80a3bf9b4a505c2b0319e5cca6c6151 Mon Sep 17 00:00:00 2001 From: Ryan Malloy Date: Thu, 19 Feb 2026 15:51:41 -0700 Subject: [PATCH] =?UTF-8?q?=F0=9F=94=A7=20Replace=20hardcoded=20100MB=20PD?= =?UTF-8?q?F=20limit=20with=20MCP=5FPDF=5FMAX=5FSIZE=20env=20var?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Centralize PDF size limit in security.py, controlled by MCP_PDF_MAX_SIZE (in MB). Default: disabled (no limit). Set e.g. MCP_PDF_MAX_SIZE=500 to cap at 500MB. Remove unused self.max_file_size from all 13 mixins. --- CLAUDE.md | 3 ++- src/mcp_pdf/mixins_official/advanced_forms.py | 1 - src/mcp_pdf/mixins_official/annotations.py | 1 - .../mixins_official/content_analysis.py | 1 - .../mixins_official/document_analysis.py | 1 - .../mixins_official/document_assembly.py | 1 - .../mixins_official/form_management.py | 1 - .../mixins_official/image_processing.py | 1 - src/mcp_pdf/mixins_official/misc_tools.py | 1 - src/mcp_pdf/mixins_official/pdf_utilities.py | 1 - src/mcp_pdf/mixins_official/permit_forms.py | 1 - .../mixins_official/security_analysis.py | 1 - .../mixins_official/table_extraction.py | 1 - .../mixins_official/text_extraction.py | 1 - src/mcp_pdf/security.py | 24 +++++++++++------- src/mcp_pdf/server.py | 2 +- src/mcp_pdf/server_legacy.py | 25 ++++++++++--------- src/mcp_pdf/server_refactored.py | 3 +-- 18 files changed, 32 insertions(+), 38 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 96ac41c..96df5d0 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -136,6 +136,7 @@ Critical system dependencies: Environment variables (optional): - `TESSDATA_PREFIX`: Tesseract language data location - `PDF_TEMP_DIR`: Temporary file processing directory (defaults to `/tmp/mcp-pdf-processing`) +- `MCP_PDF_MAX_SIZE`: Maximum PDF file size in MB (e.g., `500` for 500MB). Set to `0` or leave empty to disable the limit (default: disabled) - `MCP_PDF_ALLOWED_PATHS`: Colon-separated list of allowed output directories (e.g., `/tmp:/home/user/documents:/var/output`) - If unset: Allows writes to any directory with security warnings - If set: Restricts file outputs to specified directories only @@ -151,7 +152,7 @@ This server implements defense-in-depth, but remember: **application-level secur **Application-Level Protections (Security Theater):** **Input Validation:** -- File size limits: 100MB for PDFs, 50MB for images +- File size limits: 50MB for images. PDF size limit controlled by `MCP_PDF_MAX_SIZE` env var (in MB); disabled by default - Page count limits: Max 1000 pages per document - Path traversal protection for all file operations - JSON input size limits (10KB) to prevent DoS attacks diff --git a/src/mcp_pdf/mixins_official/advanced_forms.py b/src/mcp_pdf/mixins_official/advanced_forms.py index b038e97..8ad654e 100644 --- a/src/mcp_pdf/mixins_official/advanced_forms.py +++ b/src/mcp_pdf/mixins_official/advanced_forms.py @@ -29,7 +29,6 @@ class AdvancedFormsMixin(MCPMixin): def __init__(self): super().__init__() - self.max_file_size = 100 * 1024 * 1024 # 100MB @mcp_tool( name="add_form_fields", diff --git a/src/mcp_pdf/mixins_official/annotations.py b/src/mcp_pdf/mixins_official/annotations.py index 5fabe65..421e006 100644 --- a/src/mcp_pdf/mixins_official/annotations.py +++ b/src/mcp_pdf/mixins_official/annotations.py @@ -29,7 +29,6 @@ class AnnotationsMixin(MCPMixin): def __init__(self): super().__init__() - self.max_file_size = 100 * 1024 * 1024 # 100MB @mcp_tool( name="add_sticky_notes", diff --git a/src/mcp_pdf/mixins_official/content_analysis.py b/src/mcp_pdf/mixins_official/content_analysis.py index f483a5f..74cd894 100644 --- a/src/mcp_pdf/mixins_official/content_analysis.py +++ b/src/mcp_pdf/mixins_official/content_analysis.py @@ -31,7 +31,6 @@ class ContentAnalysisMixin(MCPMixin): def __init__(self): super().__init__() - self.max_file_size = 100 * 1024 * 1024 # 100MB @mcp_tool( name="classify_content", diff --git a/src/mcp_pdf/mixins_official/document_analysis.py b/src/mcp_pdf/mixins_official/document_analysis.py index 7492495..e6f9868 100644 --- a/src/mcp_pdf/mixins_official/document_analysis.py +++ b/src/mcp_pdf/mixins_official/document_analysis.py @@ -30,7 +30,6 @@ class DocumentAnalysisMixin(MCPMixin): def __init__(self): super().__init__() - self.max_file_size = 100 * 1024 * 1024 # 100MB @mcp_tool( name="extract_metadata", diff --git a/src/mcp_pdf/mixins_official/document_assembly.py b/src/mcp_pdf/mixins_official/document_assembly.py index 075183f..8d133fa 100644 --- a/src/mcp_pdf/mixins_official/document_assembly.py +++ b/src/mcp_pdf/mixins_official/document_assembly.py @@ -29,7 +29,6 @@ class DocumentAssemblyMixin(MCPMixin): def __init__(self): super().__init__() - self.max_file_size = 100 * 1024 * 1024 # 100MB @mcp_tool( name="merge_pdfs", diff --git a/src/mcp_pdf/mixins_official/form_management.py b/src/mcp_pdf/mixins_official/form_management.py index 4ba0c4f..78cc64d 100644 --- a/src/mcp_pdf/mixins_official/form_management.py +++ b/src/mcp_pdf/mixins_official/form_management.py @@ -31,7 +31,6 @@ class FormManagementMixin(MCPMixin): def __init__(self): super().__init__() - self.max_file_size = 100 * 1024 * 1024 # 100MB @mcp_tool( name="extract_form_data", diff --git a/src/mcp_pdf/mixins_official/image_processing.py b/src/mcp_pdf/mixins_official/image_processing.py index ab55428..c0b678e 100644 --- a/src/mcp_pdf/mixins_official/image_processing.py +++ b/src/mcp_pdf/mixins_official/image_processing.py @@ -29,7 +29,6 @@ class ImageProcessingMixin(MCPMixin): def __init__(self): super().__init__() - self.max_file_size = 100 * 1024 * 1024 # 100MB @mcp_tool( name="extract_images", diff --git a/src/mcp_pdf/mixins_official/misc_tools.py b/src/mcp_pdf/mixins_official/misc_tools.py index c6ef821..9ca39b4 100644 --- a/src/mcp_pdf/mixins_official/misc_tools.py +++ b/src/mcp_pdf/mixins_official/misc_tools.py @@ -31,7 +31,6 @@ class MiscToolsMixin(MCPMixin): def __init__(self): super().__init__() - self.max_file_size = 100 * 1024 * 1024 # 100MB @mcp_tool( name="extract_links", diff --git a/src/mcp_pdf/mixins_official/pdf_utilities.py b/src/mcp_pdf/mixins_official/pdf_utilities.py index 2c9d0e7..2e2e9ce 100644 --- a/src/mcp_pdf/mixins_official/pdf_utilities.py +++ b/src/mcp_pdf/mixins_official/pdf_utilities.py @@ -32,7 +32,6 @@ class PDFUtilitiesMixin(MCPMixin): def __init__(self): super().__init__() - self.max_file_size = 100 * 1024 * 1024 # 100MB @mcp_tool( name="compare_pdfs", diff --git a/src/mcp_pdf/mixins_official/permit_forms.py b/src/mcp_pdf/mixins_official/permit_forms.py index 22d4eb5..cee2226 100644 --- a/src/mcp_pdf/mixins_official/permit_forms.py +++ b/src/mcp_pdf/mixins_official/permit_forms.py @@ -594,7 +594,6 @@ class PermitFormMixin(MCPMixin): def __init__(self): super().__init__() - self.max_file_size = 100 * 1024 * 1024 # 100MB def _load_field_definitions( self, diff --git a/src/mcp_pdf/mixins_official/security_analysis.py b/src/mcp_pdf/mixins_official/security_analysis.py index c4b6c93..c7d88c0 100644 --- a/src/mcp_pdf/mixins_official/security_analysis.py +++ b/src/mcp_pdf/mixins_official/security_analysis.py @@ -30,7 +30,6 @@ class SecurityAnalysisMixin(MCPMixin): def __init__(self): super().__init__() - self.max_file_size = 100 * 1024 * 1024 # 100MB @mcp_tool( name="analyze_pdf_security", diff --git a/src/mcp_pdf/mixins_official/table_extraction.py b/src/mcp_pdf/mixins_official/table_extraction.py index fa1fc53..16450e9 100644 --- a/src/mcp_pdf/mixins_official/table_extraction.py +++ b/src/mcp_pdf/mixins_official/table_extraction.py @@ -33,7 +33,6 @@ class TableExtractionMixin(MCPMixin): def __init__(self): super().__init__() - self.max_file_size = 100 * 1024 * 1024 # 100MB @mcp_tool( name="extract_tables", diff --git a/src/mcp_pdf/mixins_official/text_extraction.py b/src/mcp_pdf/mixins_official/text_extraction.py index c4af624..c78a357 100644 --- a/src/mcp_pdf/mixins_official/text_extraction.py +++ b/src/mcp_pdf/mixins_official/text_extraction.py @@ -32,7 +32,6 @@ class TextExtractionMixin(MCPMixin): def __init__(self): super().__init__() self.max_pages_per_chunk = 10 - self.max_file_size = 100 * 1024 * 1024 # 100MB @mcp_tool( name="extract_text", diff --git a/src/mcp_pdf/security.py b/src/mcp_pdf/security.py index 442d0e5..b0b50e8 100644 --- a/src/mcp_pdf/security.py +++ b/src/mcp_pdf/security.py @@ -20,7 +20,10 @@ import httpx logger = logging.getLogger(__name__) # Security Configuration -MAX_PDF_SIZE = 100 * 1024 * 1024 # 100MB +# MCP_PDF_MAX_SIZE: max PDF size in MB, or "0" / empty to disable the limit +_max_size_env = os.getenv("MCP_PDF_MAX_SIZE", "").strip() +MAX_PDF_SIZE = int(_max_size_env) * 1024 * 1024 if _max_size_env and _max_size_env != "0" else 0 + MAX_IMAGE_SIZE = 50 * 1024 * 1024 # 50MB MAX_PAGES_PROCESS = 1000 MAX_JSON_SIZE = 10000 # 10KB for JSON parameters @@ -113,10 +116,11 @@ async def validate_pdf_path(pdf_path: str) -> Path: if not path.is_file(): raise ValueError(f"Path is not a file: {path}") - # Check file size - file_size = path.stat().st_size - if file_size > MAX_PDF_SIZE: - raise ValueError(f"PDF file too large: {file_size / (1024*1024):.1f}MB > {MAX_PDF_SIZE / (1024*1024)}MB") + # Check file size (skip when MAX_PDF_SIZE is 0 / disabled) + if MAX_PDF_SIZE: + file_size = path.stat().st_size + if file_size > MAX_PDF_SIZE: + raise ValueError(f"PDF file too large: {file_size / (1024*1024):.1f}MB > {MAX_PDF_SIZE / (1024*1024):.0f}MB limit (set MCP_PDF_MAX_SIZE=0 to disable)") # Basic PDF header validation try: @@ -162,8 +166,7 @@ async def _download_url_safely(url: str) -> Path: # Check if already cached if cached_file.exists(): - # Validate cached file - if cached_file.stat().st_size <= MAX_PDF_SIZE: + if not MAX_PDF_SIZE or cached_file.stat().st_size <= MAX_PDF_SIZE: logger.info(f"Using cached PDF: {cached_file}") return cached_file else: @@ -185,10 +188,13 @@ async def _download_url_safely(url: str) -> Path: with open(cached_file, 'wb') as f: async for chunk in response.aiter_bytes(chunk_size=8192): downloaded_size += len(chunk) - if downloaded_size > MAX_PDF_SIZE: + if MAX_PDF_SIZE and downloaded_size > MAX_PDF_SIZE: f.close() cached_file.unlink() - raise ValueError(f"Downloaded file too large: {downloaded_size / (1024*1024):.1f}MB") + raise ValueError( + f"Downloaded file too large: {downloaded_size / (1024*1024):.1f}MB " + f"> {MAX_PDF_SIZE / (1024*1024):.0f}MB limit (set MCP_PDF_MAX_SIZE=0 to disable)" + ) f.write(chunk) # Set secure permissions diff --git a/src/mcp_pdf/server.py b/src/mcp_pdf/server.py index 2e2fb65..0ce03b0 100644 --- a/src/mcp_pdf/server.py +++ b/src/mcp_pdf/server.py @@ -59,7 +59,7 @@ class PDFServerOfficial: def _load_configuration(self) -> Dict[str, Any]: """Load server configuration from environment and defaults""" return { - "max_pdf_size": int(os.getenv("MAX_PDF_SIZE", str(100 * 1024 * 1024))), # 100MB default + "max_pdf_size": int(os.getenv("MCP_PDF_MAX_SIZE", "0")) * 1024 * 1024 if os.getenv("MCP_PDF_MAX_SIZE", "").strip() not in ("", "0") else 0, "cache_dir": Path(os.getenv("PDF_TEMP_DIR", "/tmp/mcp-pdf-processing")), "debug": os.getenv("DEBUG", "false").lower() == "true", "allowed_domains": os.getenv("ALLOWED_DOMAINS", "").split(",") if os.getenv("ALLOWED_DOMAINS") else [], diff --git a/src/mcp_pdf/server_legacy.py b/src/mcp_pdf/server_legacy.py index 3fad6c5..0ebd552 100644 --- a/src/mcp_pdf/server_legacy.py +++ b/src/mcp_pdf/server_legacy.py @@ -38,7 +38,9 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # Security Configuration -MAX_PDF_SIZE = 100 * 1024 * 1024 # 100MB +# MCP_PDF_MAX_SIZE: max PDF size in MB, or "0" / empty to disable the limit +_max_size_env = os.getenv("MCP_PDF_MAX_SIZE", "").strip() +MAX_PDF_SIZE = int(_max_size_env) * 1024 * 1024 if _max_size_env and _max_size_env != "0" else 0 MAX_IMAGE_SIZE = 50 * 1024 * 1024 # 50MB MAX_PAGES_PROCESS = 1000 MAX_JSON_SIZE = 10000 # 10KB for JSON parameters @@ -335,8 +337,7 @@ async def download_pdf_from_url(url: str) -> Path: # Check content length header content_length = response.headers.get('content-length') - if content_length and int(content_length) > MAX_PDF_SIZE: - raise ValueError(f"PDF file too large: {content_length} bytes > {MAX_PDF_SIZE}") + # Size limit delegated to security.py (MCP_PDF_MAX_SIZE env var) # Check content type content_type = response.headers.get("content-type", "").lower() @@ -355,16 +356,15 @@ async def download_pdf_from_url(url: str) -> Path: content = first_chunk async for chunk in response.aiter_bytes(chunk_size=8192): content += chunk - # Check size as we download - if len(content) > MAX_PDF_SIZE: - raise ValueError(f"PDF file too large: {len(content)} bytes > {MAX_PDF_SIZE}") + if MAX_PDF_SIZE and len(content) > MAX_PDF_SIZE: + raise ValueError(f"PDF file too large (set MCP_PDF_MAX_SIZE=0 to disable)") else: # Read all content with size checking content = b"" async for chunk in response.aiter_bytes(chunk_size=8192): content += chunk - if len(content) > MAX_PDF_SIZE: - raise ValueError(f"PDF file too large: {len(content)} bytes > {MAX_PDF_SIZE}") + if MAX_PDF_SIZE and len(content) > MAX_PDF_SIZE: + raise ValueError(f"PDF file too large (set MCP_PDF_MAX_SIZE=0 to disable)") # Double-check magic bytes if not content.startswith(b"%PDF"): @@ -410,10 +410,11 @@ async def validate_pdf_path(pdf_path: str) -> Path: if not path.suffix.lower() == '.pdf': raise ValueError(f"Not a PDF file: {pdf_path}") - # Check file size - file_size = path.stat().st_size - if file_size > MAX_PDF_SIZE: - raise ValueError(f"PDF file too large: {file_size} bytes > {MAX_PDF_SIZE}") + # Check file size (skip when MCP_PDF_MAX_SIZE is 0 / disabled) + if MAX_PDF_SIZE: + file_size = path.stat().st_size + if file_size > MAX_PDF_SIZE: + raise ValueError(f"PDF file too large (set MCP_PDF_MAX_SIZE=0 to disable)") return path diff --git a/src/mcp_pdf/server_refactored.py b/src/mcp_pdf/server_refactored.py index 8c52d36..d76175d 100644 --- a/src/mcp_pdf/server_refactored.py +++ b/src/mcp_pdf/server_refactored.py @@ -31,7 +31,6 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # Security Configuration -MAX_PDF_SIZE = 100 * 1024 * 1024 # 100MB MAX_IMAGE_SIZE = 50 * 1024 * 1024 # 50MB MAX_PAGES_PROCESS = 1000 MAX_JSON_SIZE = 10000 # 10KB for JSON parameters @@ -83,7 +82,7 @@ class PDFToolsServer: def _load_configuration(self) -> Dict[str, Any]: """Load server configuration from environment and defaults""" return { - "max_pdf_size": int(os.getenv("MAX_PDF_SIZE", MAX_PDF_SIZE)), + "max_pdf_size": int(os.getenv("MCP_PDF_MAX_SIZE", "0")) * 1024 * 1024 if os.getenv("MCP_PDF_MAX_SIZE", "").strip() not in ("", "0") else 0, "max_image_size": int(os.getenv("MAX_IMAGE_SIZE", MAX_IMAGE_SIZE)), "max_pages": int(os.getenv("MAX_PAGES_PROCESS", MAX_PAGES_PROCESS)), "processing_timeout": int(os.getenv("PROCESSING_TIMEOUT", PROCESSING_TIMEOUT)),