🔧 Replace hardcoded 100MB PDF limit with MCP_PDF_MAX_SIZE env var

Centralize PDF size limit in security.py, controlled by MCP_PDF_MAX_SIZE (in MB). Default: disabled (no limit). Set e.g. MCP_PDF_MAX_SIZE=500 to cap at 500MB. Remove unused self.max_file_size from all 13 mixins.
2026-02-19 15:51:41 -07:00 · 2026-02-19 15:51:41 -07:00 · 62d9b176c8
commit 62d9b176c8
parent 38af9ee2c9
18 changed files with 32 additions and 38 deletions
--- a/CLAUDE.md
+++ b/CLAUDE.md
@ -136,6 +136,7 @@ Critical system dependencies:
 Environment variables (optional):
 - `TESSDATA_PREFIX`: Tesseract language data location
 - `PDF_TEMP_DIR`: Temporary file processing directory (defaults to `/tmp/mcp-pdf-processing`)
+- `MCP_PDF_MAX_SIZE`: Maximum PDF file size in MB (e.g., `500` for 500MB). Set to `0` or leave empty to disable the limit (default: disabled)
 - `MCP_PDF_ALLOWED_PATHS`: Colon-separated list of allowed output directories (e.g., `/tmp:/home/user/documents:/var/output`)
  - If unset: Allows writes to any directory with security warnings
  - If set: Restricts file outputs to specified directories only
@ -151,7 +152,7 @@ This server implements defense-in-depth, but remember: **application-level secur
 **Application-Level Protections (Security Theater):**

 **Input Validation:**
- File size limits: 100MB for PDFs, 50MB for images
+- File size limits: 50MB for images. PDF size limit controlled by `MCP_PDF_MAX_SIZE` env var (in MB); disabled by default
 - Page count limits: Max 1000 pages per document
 - Path traversal protection for all file operations
 - JSON input size limits (10KB) to prevent DoS attacks
--- a/src/mcp_pdf/mixins_official/advanced_forms.py
+++ b/src/mcp_pdf/mixins_official/advanced_forms.py
@ -29,7 +29,6 @@ class AdvancedFormsMixin(MCPMixin):

    def __init__(self):
        super().__init__()
-        self.max_file_size = 100 * 1024 * 1024  # 100MB

    @mcp_tool(
        name="add_form_fields",
--- a/src/mcp_pdf/mixins_official/annotations.py
+++ b/src/mcp_pdf/mixins_official/annotations.py
@ -29,7 +29,6 @@ class AnnotationsMixin(MCPMixin):

    def __init__(self):
        super().__init__()
-        self.max_file_size = 100 * 1024 * 1024  # 100MB

    @mcp_tool(
        name="add_sticky_notes",
--- a/src/mcp_pdf/mixins_official/content_analysis.py
+++ b/src/mcp_pdf/mixins_official/content_analysis.py
@ -31,7 +31,6 @@ class ContentAnalysisMixin(MCPMixin):

    def __init__(self):
        super().__init__()
-        self.max_file_size = 100 * 1024 * 1024  # 100MB

    @mcp_tool(
        name="classify_content",
--- a/src/mcp_pdf/mixins_official/document_analysis.py
+++ b/src/mcp_pdf/mixins_official/document_analysis.py
@ -30,7 +30,6 @@ class DocumentAnalysisMixin(MCPMixin):

    def __init__(self):
        super().__init__()
-        self.max_file_size = 100 * 1024 * 1024  # 100MB

    @mcp_tool(
        name="extract_metadata",
--- a/src/mcp_pdf/mixins_official/document_assembly.py
+++ b/src/mcp_pdf/mixins_official/document_assembly.py
@ -29,7 +29,6 @@ class DocumentAssemblyMixin(MCPMixin):

    def __init__(self):
        super().__init__()
-        self.max_file_size = 100 * 1024 * 1024  # 100MB

    @mcp_tool(
        name="merge_pdfs",
--- a/src/mcp_pdf/mixins_official/form_management.py
+++ b/src/mcp_pdf/mixins_official/form_management.py
@ -31,7 +31,6 @@ class FormManagementMixin(MCPMixin):

    def __init__(self):
        super().__init__()
-        self.max_file_size = 100 * 1024 * 1024  # 100MB

    @mcp_tool(
        name="extract_form_data",
--- a/src/mcp_pdf/mixins_official/image_processing.py
+++ b/src/mcp_pdf/mixins_official/image_processing.py
@ -29,7 +29,6 @@ class ImageProcessingMixin(MCPMixin):

    def __init__(self):
        super().__init__()
-        self.max_file_size = 100 * 1024 * 1024  # 100MB

    @mcp_tool(
        name="extract_images",
--- a/src/mcp_pdf/mixins_official/misc_tools.py
+++ b/src/mcp_pdf/mixins_official/misc_tools.py
@ -31,7 +31,6 @@ class MiscToolsMixin(MCPMixin):

    def __init__(self):
        super().__init__()
-        self.max_file_size = 100 * 1024 * 1024  # 100MB

    @mcp_tool(
        name="extract_links",
--- a/src/mcp_pdf/mixins_official/pdf_utilities.py
+++ b/src/mcp_pdf/mixins_official/pdf_utilities.py
@ -32,7 +32,6 @@ class PDFUtilitiesMixin(MCPMixin):

    def __init__(self):
        super().__init__()
-        self.max_file_size = 100 * 1024 * 1024  # 100MB

    @mcp_tool(
        name="compare_pdfs",
--- a/src/mcp_pdf/mixins_official/permit_forms.py
+++ b/src/mcp_pdf/mixins_official/permit_forms.py
@ -594,7 +594,6 @@ class PermitFormMixin(MCPMixin):

    def __init__(self):
        super().__init__()
-        self.max_file_size = 100 * 1024 * 1024  # 100MB

    def _load_field_definitions(
        self,
--- a/src/mcp_pdf/mixins_official/security_analysis.py
+++ b/src/mcp_pdf/mixins_official/security_analysis.py
@ -30,7 +30,6 @@ class SecurityAnalysisMixin(MCPMixin):

    def __init__(self):
        super().__init__()
-        self.max_file_size = 100 * 1024 * 1024  # 100MB

    @mcp_tool(
        name="analyze_pdf_security",
--- a/src/mcp_pdf/mixins_official/table_extraction.py
+++ b/src/mcp_pdf/mixins_official/table_extraction.py
@ -33,7 +33,6 @@ class TableExtractionMixin(MCPMixin):

    def __init__(self):
        super().__init__()
-        self.max_file_size = 100 * 1024 * 1024  # 100MB

    @mcp_tool(
        name="extract_tables",
--- a/src/mcp_pdf/mixins_official/text_extraction.py
+++ b/src/mcp_pdf/mixins_official/text_extraction.py
@ -32,7 +32,6 @@ class TextExtractionMixin(MCPMixin):
    def __init__(self):
        super().__init__()
        self.max_pages_per_chunk = 10
-        self.max_file_size = 100 * 1024 * 1024  # 100MB

    @mcp_tool(
        name="extract_text",
--- a/src/mcp_pdf/security.py
+++ b/src/mcp_pdf/security.py
@ -20,7 +20,10 @@ import httpx
 logger = logging.getLogger(__name__)

 # Security Configuration
-MAX_PDF_SIZE = 100 * 1024 * 1024  # 100MB
+# MCP_PDF_MAX_SIZE: max PDF size in MB, or "0" / empty to disable the limit
+_max_size_env = os.getenv("MCP_PDF_MAX_SIZE", "").strip()
+MAX_PDF_SIZE = int(_max_size_env) * 1024 * 1024 if _max_size_env and _max_size_env != "0" else 0
+
 MAX_IMAGE_SIZE = 50 * 1024 * 1024  # 50MB
 MAX_PAGES_PROCESS = 1000
 MAX_JSON_SIZE = 10000  # 10KB for JSON parameters
@ -113,10 +116,11 @@ async def validate_pdf_path(pdf_path: str) -> Path:
    if not path.is_file():
        raise ValueError(f"Path is not a file: {path}")

-    # Check file size
-    file_size = path.stat().st_size
-    if file_size > MAX_PDF_SIZE:
-        raise ValueError(f"PDF file too large: {file_size / (1024*1024):.1f}MB > {MAX_PDF_SIZE / (1024*1024)}MB")
+    # Check file size (skip when MAX_PDF_SIZE is 0 / disabled)
+    if MAX_PDF_SIZE:
+        file_size = path.stat().st_size
+        if file_size > MAX_PDF_SIZE:
+            raise ValueError(f"PDF file too large: {file_size / (1024*1024):.1f}MB > {MAX_PDF_SIZE / (1024*1024):.0f}MB limit (set MCP_PDF_MAX_SIZE=0 to disable)")

    # Basic PDF header validation
    try:
@ -162,8 +166,7 @@ async def _download_url_safely(url: str) -> Path:

    # Check if already cached
    if cached_file.exists():
-        # Validate cached file
-        if cached_file.stat().st_size <= MAX_PDF_SIZE:
+        if not MAX_PDF_SIZE or cached_file.stat().st_size <= MAX_PDF_SIZE:
            logger.info(f"Using cached PDF: {cached_file}")
            return cached_file
        else:
@ -185,10 +188,13 @@ async def _download_url_safely(url: str) -> Path:
                with open(cached_file, 'wb') as f:
                    async for chunk in response.aiter_bytes(chunk_size=8192):
                        downloaded_size += len(chunk)
-                        if downloaded_size > MAX_PDF_SIZE:
+                        if MAX_PDF_SIZE and downloaded_size > MAX_PDF_SIZE:
                            f.close()
                            cached_file.unlink()
-                            raise ValueError(f"Downloaded file too large: {downloaded_size / (1024*1024):.1f}MB")
+                            raise ValueError(
+                                f"Downloaded file too large: {downloaded_size / (1024*1024):.1f}MB "
+                                f"> {MAX_PDF_SIZE / (1024*1024):.0f}MB limit (set MCP_PDF_MAX_SIZE=0 to disable)"
+                            )
                        f.write(chunk)

        # Set secure permissions
--- a/src/mcp_pdf/server.py
+++ b/src/mcp_pdf/server.py
@ -59,7 +59,7 @@ class PDFServerOfficial:
    def _load_configuration(self) -> Dict[str, Any]:
        """Load server configuration from environment and defaults"""
        return {
-            "max_pdf_size": int(os.getenv("MAX_PDF_SIZE", str(100 * 1024 * 1024))),  # 100MB default
+            "max_pdf_size": int(os.getenv("MCP_PDF_MAX_SIZE", "0")) * 1024 * 1024 if os.getenv("MCP_PDF_MAX_SIZE", "").strip() not in ("", "0") else 0,
            "cache_dir": Path(os.getenv("PDF_TEMP_DIR", "/tmp/mcp-pdf-processing")),
            "debug": os.getenv("DEBUG", "false").lower() == "true",
            "allowed_domains": os.getenv("ALLOWED_DOMAINS", "").split(",") if os.getenv("ALLOWED_DOMAINS") else [],
--- a/src/mcp_pdf/server_legacy.py
+++ b/src/mcp_pdf/server_legacy.py
@ -38,7 +38,9 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

 # Security Configuration
-MAX_PDF_SIZE = 100 * 1024 * 1024  # 100MB
+# MCP_PDF_MAX_SIZE: max PDF size in MB, or "0" / empty to disable the limit
+_max_size_env = os.getenv("MCP_PDF_MAX_SIZE", "").strip()
+MAX_PDF_SIZE = int(_max_size_env) * 1024 * 1024 if _max_size_env and _max_size_env != "0" else 0
 MAX_IMAGE_SIZE = 50 * 1024 * 1024  # 50MB
 MAX_PAGES_PROCESS = 1000
 MAX_JSON_SIZE = 10000  # 10KB for JSON parameters
@ -335,8 +337,7 @@ async def download_pdf_from_url(url: str) -> Path:
                
                # Check content length header
                content_length = response.headers.get('content-length')
-                if content_length and int(content_length) > MAX_PDF_SIZE:
-                    raise ValueError(f"PDF file too large: {content_length} bytes > {MAX_PDF_SIZE}")
+                # Size limit delegated to security.py (MCP_PDF_MAX_SIZE env var)
                
                # Check content type
                content_type = response.headers.get("content-type", "").lower()
@ -355,16 +356,15 @@ async def download_pdf_from_url(url: str) -> Path:
                    content = first_chunk
                    async for chunk in response.aiter_bytes(chunk_size=8192):
                        content += chunk
-                        # Check size as we download
-                        if len(content) > MAX_PDF_SIZE:
-                            raise ValueError(f"PDF file too large: {len(content)} bytes > {MAX_PDF_SIZE}")
+                        if MAX_PDF_SIZE and len(content) > MAX_PDF_SIZE:
+                            raise ValueError(f"PDF file too large (set MCP_PDF_MAX_SIZE=0 to disable)")
                else:
                    # Read all content with size checking
                    content = b""
                    async for chunk in response.aiter_bytes(chunk_size=8192):
                        content += chunk
-                        if len(content) > MAX_PDF_SIZE:
-                            raise ValueError(f"PDF file too large: {len(content)} bytes > {MAX_PDF_SIZE}")
+                        if MAX_PDF_SIZE and len(content) > MAX_PDF_SIZE:
+                            raise ValueError(f"PDF file too large (set MCP_PDF_MAX_SIZE=0 to disable)")
                
                # Double-check magic bytes
                if not content.startswith(b"%PDF"):
@ -410,10 +410,11 @@ async def validate_pdf_path(pdf_path: str) -> Path:
    if not path.suffix.lower() == '.pdf':
        raise ValueError(f"Not a PDF file: {pdf_path}")
    
-    # Check file size
-    file_size = path.stat().st_size
-    if file_size > MAX_PDF_SIZE:
-        raise ValueError(f"PDF file too large: {file_size} bytes > {MAX_PDF_SIZE}")
+    # Check file size (skip when MCP_PDF_MAX_SIZE is 0 / disabled)
+    if MAX_PDF_SIZE:
+        file_size = path.stat().st_size
+        if file_size > MAX_PDF_SIZE:
+            raise ValueError(f"PDF file too large (set MCP_PDF_MAX_SIZE=0 to disable)")
    
    return path

--- a/src/mcp_pdf/server_refactored.py
+++ b/src/mcp_pdf/server_refactored.py
@ -31,7 +31,6 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

 # Security Configuration
-MAX_PDF_SIZE = 100 * 1024 * 1024  # 100MB
 MAX_IMAGE_SIZE = 50 * 1024 * 1024  # 50MB
 MAX_PAGES_PROCESS = 1000
 MAX_JSON_SIZE = 10000  # 10KB for JSON parameters
@ -83,7 +82,7 @@ class PDFToolsServer:
    def _load_configuration(self) -> Dict[str, Any]:
        """Load server configuration from environment and defaults"""
        return {
-            "max_pdf_size": int(os.getenv("MAX_PDF_SIZE", MAX_PDF_SIZE)),
+            "max_pdf_size": int(os.getenv("MCP_PDF_MAX_SIZE", "0")) * 1024 * 1024 if os.getenv("MCP_PDF_MAX_SIZE", "").strip() not in ("", "0") else 0,
            "max_image_size": int(os.getenv("MAX_IMAGE_SIZE", MAX_IMAGE_SIZE)),
            "max_pages": int(os.getenv("MAX_PAGES_PROCESS", MAX_PAGES_PROCESS)),
            "processing_timeout": int(os.getenv("PROCESSING_TIMEOUT", PROCESSING_TIMEOUT)),