🔧 Replace hardcoded 100MB PDF limit with MCP_PDF_MAX_SIZE env var

Centralize PDF size limit in security.py, controlled by MCP_PDF_MAX_SIZE
(in MB). Default: disabled (no limit). Set e.g. MCP_PDF_MAX_SIZE=500 to
cap at 500MB. Remove unused self.max_file_size from all 13 mixins.
This commit is contained in:
Ryan Malloy 2026-02-19 15:51:41 -07:00
parent 38af9ee2c9
commit 62d9b176c8
18 changed files with 32 additions and 38 deletions

View File

@ -136,6 +136,7 @@ Critical system dependencies:
Environment variables (optional): Environment variables (optional):
- `TESSDATA_PREFIX`: Tesseract language data location - `TESSDATA_PREFIX`: Tesseract language data location
- `PDF_TEMP_DIR`: Temporary file processing directory (defaults to `/tmp/mcp-pdf-processing`) - `PDF_TEMP_DIR`: Temporary file processing directory (defaults to `/tmp/mcp-pdf-processing`)
- `MCP_PDF_MAX_SIZE`: Maximum PDF file size in MB (e.g., `500` for 500MB). Set to `0` or leave empty to disable the limit (default: disabled)
- `MCP_PDF_ALLOWED_PATHS`: Colon-separated list of allowed output directories (e.g., `/tmp:/home/user/documents:/var/output`) - `MCP_PDF_ALLOWED_PATHS`: Colon-separated list of allowed output directories (e.g., `/tmp:/home/user/documents:/var/output`)
- If unset: Allows writes to any directory with security warnings - If unset: Allows writes to any directory with security warnings
- If set: Restricts file outputs to specified directories only - If set: Restricts file outputs to specified directories only
@ -151,7 +152,7 @@ This server implements defense-in-depth, but remember: **application-level secur
**Application-Level Protections (Security Theater):** **Application-Level Protections (Security Theater):**
**Input Validation:** **Input Validation:**
- File size limits: 100MB for PDFs, 50MB for images - File size limits: 50MB for images. PDF size limit controlled by `MCP_PDF_MAX_SIZE` env var (in MB); disabled by default
- Page count limits: Max 1000 pages per document - Page count limits: Max 1000 pages per document
- Path traversal protection for all file operations - Path traversal protection for all file operations
- JSON input size limits (10KB) to prevent DoS attacks - JSON input size limits (10KB) to prevent DoS attacks

View File

@ -29,7 +29,6 @@ class AdvancedFormsMixin(MCPMixin):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.max_file_size = 100 * 1024 * 1024 # 100MB
@mcp_tool( @mcp_tool(
name="add_form_fields", name="add_form_fields",

View File

@ -29,7 +29,6 @@ class AnnotationsMixin(MCPMixin):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.max_file_size = 100 * 1024 * 1024 # 100MB
@mcp_tool( @mcp_tool(
name="add_sticky_notes", name="add_sticky_notes",

View File

@ -31,7 +31,6 @@ class ContentAnalysisMixin(MCPMixin):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.max_file_size = 100 * 1024 * 1024 # 100MB
@mcp_tool( @mcp_tool(
name="classify_content", name="classify_content",

View File

@ -30,7 +30,6 @@ class DocumentAnalysisMixin(MCPMixin):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.max_file_size = 100 * 1024 * 1024 # 100MB
@mcp_tool( @mcp_tool(
name="extract_metadata", name="extract_metadata",

View File

@ -29,7 +29,6 @@ class DocumentAssemblyMixin(MCPMixin):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.max_file_size = 100 * 1024 * 1024 # 100MB
@mcp_tool( @mcp_tool(
name="merge_pdfs", name="merge_pdfs",

View File

@ -31,7 +31,6 @@ class FormManagementMixin(MCPMixin):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.max_file_size = 100 * 1024 * 1024 # 100MB
@mcp_tool( @mcp_tool(
name="extract_form_data", name="extract_form_data",

View File

@ -29,7 +29,6 @@ class ImageProcessingMixin(MCPMixin):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.max_file_size = 100 * 1024 * 1024 # 100MB
@mcp_tool( @mcp_tool(
name="extract_images", name="extract_images",

View File

@ -31,7 +31,6 @@ class MiscToolsMixin(MCPMixin):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.max_file_size = 100 * 1024 * 1024 # 100MB
@mcp_tool( @mcp_tool(
name="extract_links", name="extract_links",

View File

@ -32,7 +32,6 @@ class PDFUtilitiesMixin(MCPMixin):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.max_file_size = 100 * 1024 * 1024 # 100MB
@mcp_tool( @mcp_tool(
name="compare_pdfs", name="compare_pdfs",

View File

@ -594,7 +594,6 @@ class PermitFormMixin(MCPMixin):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.max_file_size = 100 * 1024 * 1024 # 100MB
def _load_field_definitions( def _load_field_definitions(
self, self,

View File

@ -30,7 +30,6 @@ class SecurityAnalysisMixin(MCPMixin):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.max_file_size = 100 * 1024 * 1024 # 100MB
@mcp_tool( @mcp_tool(
name="analyze_pdf_security", name="analyze_pdf_security",

View File

@ -33,7 +33,6 @@ class TableExtractionMixin(MCPMixin):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.max_file_size = 100 * 1024 * 1024 # 100MB
@mcp_tool( @mcp_tool(
name="extract_tables", name="extract_tables",

View File

@ -32,7 +32,6 @@ class TextExtractionMixin(MCPMixin):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.max_pages_per_chunk = 10 self.max_pages_per_chunk = 10
self.max_file_size = 100 * 1024 * 1024 # 100MB
@mcp_tool( @mcp_tool(
name="extract_text", name="extract_text",

View File

@ -20,7 +20,10 @@ import httpx
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Security Configuration # Security Configuration
MAX_PDF_SIZE = 100 * 1024 * 1024 # 100MB # MCP_PDF_MAX_SIZE: max PDF size in MB, or "0" / empty to disable the limit
_max_size_env = os.getenv("MCP_PDF_MAX_SIZE", "").strip()
MAX_PDF_SIZE = int(_max_size_env) * 1024 * 1024 if _max_size_env and _max_size_env != "0" else 0
MAX_IMAGE_SIZE = 50 * 1024 * 1024 # 50MB MAX_IMAGE_SIZE = 50 * 1024 * 1024 # 50MB
MAX_PAGES_PROCESS = 1000 MAX_PAGES_PROCESS = 1000
MAX_JSON_SIZE = 10000 # 10KB for JSON parameters MAX_JSON_SIZE = 10000 # 10KB for JSON parameters
@ -113,10 +116,11 @@ async def validate_pdf_path(pdf_path: str) -> Path:
if not path.is_file(): if not path.is_file():
raise ValueError(f"Path is not a file: {path}") raise ValueError(f"Path is not a file: {path}")
# Check file size # Check file size (skip when MAX_PDF_SIZE is 0 / disabled)
file_size = path.stat().st_size if MAX_PDF_SIZE:
if file_size > MAX_PDF_SIZE: file_size = path.stat().st_size
raise ValueError(f"PDF file too large: {file_size / (1024*1024):.1f}MB > {MAX_PDF_SIZE / (1024*1024)}MB") if file_size > MAX_PDF_SIZE:
raise ValueError(f"PDF file too large: {file_size / (1024*1024):.1f}MB > {MAX_PDF_SIZE / (1024*1024):.0f}MB limit (set MCP_PDF_MAX_SIZE=0 to disable)")
# Basic PDF header validation # Basic PDF header validation
try: try:
@ -162,8 +166,7 @@ async def _download_url_safely(url: str) -> Path:
# Check if already cached # Check if already cached
if cached_file.exists(): if cached_file.exists():
# Validate cached file if not MAX_PDF_SIZE or cached_file.stat().st_size <= MAX_PDF_SIZE:
if cached_file.stat().st_size <= MAX_PDF_SIZE:
logger.info(f"Using cached PDF: {cached_file}") logger.info(f"Using cached PDF: {cached_file}")
return cached_file return cached_file
else: else:
@ -185,10 +188,13 @@ async def _download_url_safely(url: str) -> Path:
with open(cached_file, 'wb') as f: with open(cached_file, 'wb') as f:
async for chunk in response.aiter_bytes(chunk_size=8192): async for chunk in response.aiter_bytes(chunk_size=8192):
downloaded_size += len(chunk) downloaded_size += len(chunk)
if downloaded_size > MAX_PDF_SIZE: if MAX_PDF_SIZE and downloaded_size > MAX_PDF_SIZE:
f.close() f.close()
cached_file.unlink() cached_file.unlink()
raise ValueError(f"Downloaded file too large: {downloaded_size / (1024*1024):.1f}MB") raise ValueError(
f"Downloaded file too large: {downloaded_size / (1024*1024):.1f}MB "
f"> {MAX_PDF_SIZE / (1024*1024):.0f}MB limit (set MCP_PDF_MAX_SIZE=0 to disable)"
)
f.write(chunk) f.write(chunk)
# Set secure permissions # Set secure permissions

View File

@ -59,7 +59,7 @@ class PDFServerOfficial:
def _load_configuration(self) -> Dict[str, Any]: def _load_configuration(self) -> Dict[str, Any]:
"""Load server configuration from environment and defaults""" """Load server configuration from environment and defaults"""
return { return {
"max_pdf_size": int(os.getenv("MAX_PDF_SIZE", str(100 * 1024 * 1024))), # 100MB default "max_pdf_size": int(os.getenv("MCP_PDF_MAX_SIZE", "0")) * 1024 * 1024 if os.getenv("MCP_PDF_MAX_SIZE", "").strip() not in ("", "0") else 0,
"cache_dir": Path(os.getenv("PDF_TEMP_DIR", "/tmp/mcp-pdf-processing")), "cache_dir": Path(os.getenv("PDF_TEMP_DIR", "/tmp/mcp-pdf-processing")),
"debug": os.getenv("DEBUG", "false").lower() == "true", "debug": os.getenv("DEBUG", "false").lower() == "true",
"allowed_domains": os.getenv("ALLOWED_DOMAINS", "").split(",") if os.getenv("ALLOWED_DOMAINS") else [], "allowed_domains": os.getenv("ALLOWED_DOMAINS", "").split(",") if os.getenv("ALLOWED_DOMAINS") else [],

View File

@ -38,7 +38,9 @@ logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Security Configuration # Security Configuration
MAX_PDF_SIZE = 100 * 1024 * 1024 # 100MB # MCP_PDF_MAX_SIZE: max PDF size in MB, or "0" / empty to disable the limit
_max_size_env = os.getenv("MCP_PDF_MAX_SIZE", "").strip()
MAX_PDF_SIZE = int(_max_size_env) * 1024 * 1024 if _max_size_env and _max_size_env != "0" else 0
MAX_IMAGE_SIZE = 50 * 1024 * 1024 # 50MB MAX_IMAGE_SIZE = 50 * 1024 * 1024 # 50MB
MAX_PAGES_PROCESS = 1000 MAX_PAGES_PROCESS = 1000
MAX_JSON_SIZE = 10000 # 10KB for JSON parameters MAX_JSON_SIZE = 10000 # 10KB for JSON parameters
@ -335,8 +337,7 @@ async def download_pdf_from_url(url: str) -> Path:
# Check content length header # Check content length header
content_length = response.headers.get('content-length') content_length = response.headers.get('content-length')
if content_length and int(content_length) > MAX_PDF_SIZE: # Size limit delegated to security.py (MCP_PDF_MAX_SIZE env var)
raise ValueError(f"PDF file too large: {content_length} bytes > {MAX_PDF_SIZE}")
# Check content type # Check content type
content_type = response.headers.get("content-type", "").lower() content_type = response.headers.get("content-type", "").lower()
@ -355,16 +356,15 @@ async def download_pdf_from_url(url: str) -> Path:
content = first_chunk content = first_chunk
async for chunk in response.aiter_bytes(chunk_size=8192): async for chunk in response.aiter_bytes(chunk_size=8192):
content += chunk content += chunk
# Check size as we download if MAX_PDF_SIZE and len(content) > MAX_PDF_SIZE:
if len(content) > MAX_PDF_SIZE: raise ValueError(f"PDF file too large (set MCP_PDF_MAX_SIZE=0 to disable)")
raise ValueError(f"PDF file too large: {len(content)} bytes > {MAX_PDF_SIZE}")
else: else:
# Read all content with size checking # Read all content with size checking
content = b"" content = b""
async for chunk in response.aiter_bytes(chunk_size=8192): async for chunk in response.aiter_bytes(chunk_size=8192):
content += chunk content += chunk
if len(content) > MAX_PDF_SIZE: if MAX_PDF_SIZE and len(content) > MAX_PDF_SIZE:
raise ValueError(f"PDF file too large: {len(content)} bytes > {MAX_PDF_SIZE}") raise ValueError(f"PDF file too large (set MCP_PDF_MAX_SIZE=0 to disable)")
# Double-check magic bytes # Double-check magic bytes
if not content.startswith(b"%PDF"): if not content.startswith(b"%PDF"):
@ -410,10 +410,11 @@ async def validate_pdf_path(pdf_path: str) -> Path:
if not path.suffix.lower() == '.pdf': if not path.suffix.lower() == '.pdf':
raise ValueError(f"Not a PDF file: {pdf_path}") raise ValueError(f"Not a PDF file: {pdf_path}")
# Check file size # Check file size (skip when MCP_PDF_MAX_SIZE is 0 / disabled)
file_size = path.stat().st_size if MAX_PDF_SIZE:
if file_size > MAX_PDF_SIZE: file_size = path.stat().st_size
raise ValueError(f"PDF file too large: {file_size} bytes > {MAX_PDF_SIZE}") if file_size > MAX_PDF_SIZE:
raise ValueError(f"PDF file too large (set MCP_PDF_MAX_SIZE=0 to disable)")
return path return path

View File

@ -31,7 +31,6 @@ logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Security Configuration # Security Configuration
MAX_PDF_SIZE = 100 * 1024 * 1024 # 100MB
MAX_IMAGE_SIZE = 50 * 1024 * 1024 # 50MB MAX_IMAGE_SIZE = 50 * 1024 * 1024 # 50MB
MAX_PAGES_PROCESS = 1000 MAX_PAGES_PROCESS = 1000
MAX_JSON_SIZE = 10000 # 10KB for JSON parameters MAX_JSON_SIZE = 10000 # 10KB for JSON parameters
@ -83,7 +82,7 @@ class PDFToolsServer:
def _load_configuration(self) -> Dict[str, Any]: def _load_configuration(self) -> Dict[str, Any]:
"""Load server configuration from environment and defaults""" """Load server configuration from environment and defaults"""
return { return {
"max_pdf_size": int(os.getenv("MAX_PDF_SIZE", MAX_PDF_SIZE)), "max_pdf_size": int(os.getenv("MCP_PDF_MAX_SIZE", "0")) * 1024 * 1024 if os.getenv("MCP_PDF_MAX_SIZE", "").strip() not in ("", "0") else 0,
"max_image_size": int(os.getenv("MAX_IMAGE_SIZE", MAX_IMAGE_SIZE)), "max_image_size": int(os.getenv("MAX_IMAGE_SIZE", MAX_IMAGE_SIZE)),
"max_pages": int(os.getenv("MAX_PAGES_PROCESS", MAX_PAGES_PROCESS)), "max_pages": int(os.getenv("MAX_PAGES_PROCESS", MAX_PAGES_PROCESS)),
"processing_timeout": int(os.getenv("PROCESSING_TIMEOUT", PROCESSING_TIMEOUT)), "processing_timeout": int(os.getenv("PROCESSING_TIMEOUT", PROCESSING_TIMEOUT)),