🔧 Replace hardcoded 100MB PDF limit with MCP_PDF_MAX_SIZE env var
Centralize PDF size limit in security.py, controlled by MCP_PDF_MAX_SIZE (in MB). Default: disabled (no limit). Set e.g. MCP_PDF_MAX_SIZE=500 to cap at 500MB. Remove unused self.max_file_size from all 13 mixins.
This commit is contained in:
parent
38af9ee2c9
commit
62d9b176c8
@ -136,6 +136,7 @@ Critical system dependencies:
|
||||
Environment variables (optional):
|
||||
- `TESSDATA_PREFIX`: Tesseract language data location
|
||||
- `PDF_TEMP_DIR`: Temporary file processing directory (defaults to `/tmp/mcp-pdf-processing`)
|
||||
- `MCP_PDF_MAX_SIZE`: Maximum PDF file size in MB (e.g., `500` for 500MB). Set to `0` or leave empty to disable the limit (default: disabled)
|
||||
- `MCP_PDF_ALLOWED_PATHS`: Colon-separated list of allowed output directories (e.g., `/tmp:/home/user/documents:/var/output`)
|
||||
- If unset: Allows writes to any directory with security warnings
|
||||
- If set: Restricts file outputs to specified directories only
|
||||
@ -151,7 +152,7 @@ This server implements defense-in-depth, but remember: **application-level secur
|
||||
**Application-Level Protections (Security Theater):**
|
||||
|
||||
**Input Validation:**
|
||||
- File size limits: 100MB for PDFs, 50MB for images
|
||||
- File size limits: 50MB for images. PDF size limit controlled by `MCP_PDF_MAX_SIZE` env var (in MB); disabled by default
|
||||
- Page count limits: Max 1000 pages per document
|
||||
- Path traversal protection for all file operations
|
||||
- JSON input size limits (10KB) to prevent DoS attacks
|
||||
|
||||
@ -29,7 +29,6 @@ class AdvancedFormsMixin(MCPMixin):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.max_file_size = 100 * 1024 * 1024 # 100MB
|
||||
|
||||
@mcp_tool(
|
||||
name="add_form_fields",
|
||||
|
||||
@ -29,7 +29,6 @@ class AnnotationsMixin(MCPMixin):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.max_file_size = 100 * 1024 * 1024 # 100MB
|
||||
|
||||
@mcp_tool(
|
||||
name="add_sticky_notes",
|
||||
|
||||
@ -31,7 +31,6 @@ class ContentAnalysisMixin(MCPMixin):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.max_file_size = 100 * 1024 * 1024 # 100MB
|
||||
|
||||
@mcp_tool(
|
||||
name="classify_content",
|
||||
|
||||
@ -30,7 +30,6 @@ class DocumentAnalysisMixin(MCPMixin):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.max_file_size = 100 * 1024 * 1024 # 100MB
|
||||
|
||||
@mcp_tool(
|
||||
name="extract_metadata",
|
||||
|
||||
@ -29,7 +29,6 @@ class DocumentAssemblyMixin(MCPMixin):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.max_file_size = 100 * 1024 * 1024 # 100MB
|
||||
|
||||
@mcp_tool(
|
||||
name="merge_pdfs",
|
||||
|
||||
@ -31,7 +31,6 @@ class FormManagementMixin(MCPMixin):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.max_file_size = 100 * 1024 * 1024 # 100MB
|
||||
|
||||
@mcp_tool(
|
||||
name="extract_form_data",
|
||||
|
||||
@ -29,7 +29,6 @@ class ImageProcessingMixin(MCPMixin):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.max_file_size = 100 * 1024 * 1024 # 100MB
|
||||
|
||||
@mcp_tool(
|
||||
name="extract_images",
|
||||
|
||||
@ -31,7 +31,6 @@ class MiscToolsMixin(MCPMixin):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.max_file_size = 100 * 1024 * 1024 # 100MB
|
||||
|
||||
@mcp_tool(
|
||||
name="extract_links",
|
||||
|
||||
@ -32,7 +32,6 @@ class PDFUtilitiesMixin(MCPMixin):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.max_file_size = 100 * 1024 * 1024 # 100MB
|
||||
|
||||
@mcp_tool(
|
||||
name="compare_pdfs",
|
||||
|
||||
@ -594,7 +594,6 @@ class PermitFormMixin(MCPMixin):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.max_file_size = 100 * 1024 * 1024 # 100MB
|
||||
|
||||
def _load_field_definitions(
|
||||
self,
|
||||
|
||||
@ -30,7 +30,6 @@ class SecurityAnalysisMixin(MCPMixin):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.max_file_size = 100 * 1024 * 1024 # 100MB
|
||||
|
||||
@mcp_tool(
|
||||
name="analyze_pdf_security",
|
||||
|
||||
@ -33,7 +33,6 @@ class TableExtractionMixin(MCPMixin):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.max_file_size = 100 * 1024 * 1024 # 100MB
|
||||
|
||||
@mcp_tool(
|
||||
name="extract_tables",
|
||||
|
||||
@ -32,7 +32,6 @@ class TextExtractionMixin(MCPMixin):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.max_pages_per_chunk = 10
|
||||
self.max_file_size = 100 * 1024 * 1024 # 100MB
|
||||
|
||||
@mcp_tool(
|
||||
name="extract_text",
|
||||
|
||||
@ -20,7 +20,10 @@ import httpx
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Security Configuration
|
||||
MAX_PDF_SIZE = 100 * 1024 * 1024 # 100MB
|
||||
# MCP_PDF_MAX_SIZE: max PDF size in MB, or "0" / empty to disable the limit
|
||||
_max_size_env = os.getenv("MCP_PDF_MAX_SIZE", "").strip()
|
||||
MAX_PDF_SIZE = int(_max_size_env) * 1024 * 1024 if _max_size_env and _max_size_env != "0" else 0
|
||||
|
||||
MAX_IMAGE_SIZE = 50 * 1024 * 1024 # 50MB
|
||||
MAX_PAGES_PROCESS = 1000
|
||||
MAX_JSON_SIZE = 10000 # 10KB for JSON parameters
|
||||
@ -113,10 +116,11 @@ async def validate_pdf_path(pdf_path: str) -> Path:
|
||||
if not path.is_file():
|
||||
raise ValueError(f"Path is not a file: {path}")
|
||||
|
||||
# Check file size
|
||||
file_size = path.stat().st_size
|
||||
if file_size > MAX_PDF_SIZE:
|
||||
raise ValueError(f"PDF file too large: {file_size / (1024*1024):.1f}MB > {MAX_PDF_SIZE / (1024*1024)}MB")
|
||||
# Check file size (skip when MAX_PDF_SIZE is 0 / disabled)
|
||||
if MAX_PDF_SIZE:
|
||||
file_size = path.stat().st_size
|
||||
if file_size > MAX_PDF_SIZE:
|
||||
raise ValueError(f"PDF file too large: {file_size / (1024*1024):.1f}MB > {MAX_PDF_SIZE / (1024*1024):.0f}MB limit (set MCP_PDF_MAX_SIZE=0 to disable)")
|
||||
|
||||
# Basic PDF header validation
|
||||
try:
|
||||
@ -162,8 +166,7 @@ async def _download_url_safely(url: str) -> Path:
|
||||
|
||||
# Check if already cached
|
||||
if cached_file.exists():
|
||||
# Validate cached file
|
||||
if cached_file.stat().st_size <= MAX_PDF_SIZE:
|
||||
if not MAX_PDF_SIZE or cached_file.stat().st_size <= MAX_PDF_SIZE:
|
||||
logger.info(f"Using cached PDF: {cached_file}")
|
||||
return cached_file
|
||||
else:
|
||||
@ -185,10 +188,13 @@ async def _download_url_safely(url: str) -> Path:
|
||||
with open(cached_file, 'wb') as f:
|
||||
async for chunk in response.aiter_bytes(chunk_size=8192):
|
||||
downloaded_size += len(chunk)
|
||||
if downloaded_size > MAX_PDF_SIZE:
|
||||
if MAX_PDF_SIZE and downloaded_size > MAX_PDF_SIZE:
|
||||
f.close()
|
||||
cached_file.unlink()
|
||||
raise ValueError(f"Downloaded file too large: {downloaded_size / (1024*1024):.1f}MB")
|
||||
raise ValueError(
|
||||
f"Downloaded file too large: {downloaded_size / (1024*1024):.1f}MB "
|
||||
f"> {MAX_PDF_SIZE / (1024*1024):.0f}MB limit (set MCP_PDF_MAX_SIZE=0 to disable)"
|
||||
)
|
||||
f.write(chunk)
|
||||
|
||||
# Set secure permissions
|
||||
|
||||
@ -59,7 +59,7 @@ class PDFServerOfficial:
|
||||
def _load_configuration(self) -> Dict[str, Any]:
|
||||
"""Load server configuration from environment and defaults"""
|
||||
return {
|
||||
"max_pdf_size": int(os.getenv("MAX_PDF_SIZE", str(100 * 1024 * 1024))), # 100MB default
|
||||
"max_pdf_size": int(os.getenv("MCP_PDF_MAX_SIZE", "0")) * 1024 * 1024 if os.getenv("MCP_PDF_MAX_SIZE", "").strip() not in ("", "0") else 0,
|
||||
"cache_dir": Path(os.getenv("PDF_TEMP_DIR", "/tmp/mcp-pdf-processing")),
|
||||
"debug": os.getenv("DEBUG", "false").lower() == "true",
|
||||
"allowed_domains": os.getenv("ALLOWED_DOMAINS", "").split(",") if os.getenv("ALLOWED_DOMAINS") else [],
|
||||
|
||||
@ -38,7 +38,9 @@ logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Security Configuration
|
||||
MAX_PDF_SIZE = 100 * 1024 * 1024 # 100MB
|
||||
# MCP_PDF_MAX_SIZE: max PDF size in MB, or "0" / empty to disable the limit
|
||||
_max_size_env = os.getenv("MCP_PDF_MAX_SIZE", "").strip()
|
||||
MAX_PDF_SIZE = int(_max_size_env) * 1024 * 1024 if _max_size_env and _max_size_env != "0" else 0
|
||||
MAX_IMAGE_SIZE = 50 * 1024 * 1024 # 50MB
|
||||
MAX_PAGES_PROCESS = 1000
|
||||
MAX_JSON_SIZE = 10000 # 10KB for JSON parameters
|
||||
@ -335,8 +337,7 @@ async def download_pdf_from_url(url: str) -> Path:
|
||||
|
||||
# Check content length header
|
||||
content_length = response.headers.get('content-length')
|
||||
if content_length and int(content_length) > MAX_PDF_SIZE:
|
||||
raise ValueError(f"PDF file too large: {content_length} bytes > {MAX_PDF_SIZE}")
|
||||
# Size limit delegated to security.py (MCP_PDF_MAX_SIZE env var)
|
||||
|
||||
# Check content type
|
||||
content_type = response.headers.get("content-type", "").lower()
|
||||
@ -355,16 +356,15 @@ async def download_pdf_from_url(url: str) -> Path:
|
||||
content = first_chunk
|
||||
async for chunk in response.aiter_bytes(chunk_size=8192):
|
||||
content += chunk
|
||||
# Check size as we download
|
||||
if len(content) > MAX_PDF_SIZE:
|
||||
raise ValueError(f"PDF file too large: {len(content)} bytes > {MAX_PDF_SIZE}")
|
||||
if MAX_PDF_SIZE and len(content) > MAX_PDF_SIZE:
|
||||
raise ValueError(f"PDF file too large (set MCP_PDF_MAX_SIZE=0 to disable)")
|
||||
else:
|
||||
# Read all content with size checking
|
||||
content = b""
|
||||
async for chunk in response.aiter_bytes(chunk_size=8192):
|
||||
content += chunk
|
||||
if len(content) > MAX_PDF_SIZE:
|
||||
raise ValueError(f"PDF file too large: {len(content)} bytes > {MAX_PDF_SIZE}")
|
||||
if MAX_PDF_SIZE and len(content) > MAX_PDF_SIZE:
|
||||
raise ValueError(f"PDF file too large (set MCP_PDF_MAX_SIZE=0 to disable)")
|
||||
|
||||
# Double-check magic bytes
|
||||
if not content.startswith(b"%PDF"):
|
||||
@ -410,10 +410,11 @@ async def validate_pdf_path(pdf_path: str) -> Path:
|
||||
if not path.suffix.lower() == '.pdf':
|
||||
raise ValueError(f"Not a PDF file: {pdf_path}")
|
||||
|
||||
# Check file size
|
||||
file_size = path.stat().st_size
|
||||
if file_size > MAX_PDF_SIZE:
|
||||
raise ValueError(f"PDF file too large: {file_size} bytes > {MAX_PDF_SIZE}")
|
||||
# Check file size (skip when MCP_PDF_MAX_SIZE is 0 / disabled)
|
||||
if MAX_PDF_SIZE:
|
||||
file_size = path.stat().st_size
|
||||
if file_size > MAX_PDF_SIZE:
|
||||
raise ValueError(f"PDF file too large (set MCP_PDF_MAX_SIZE=0 to disable)")
|
||||
|
||||
return path
|
||||
|
||||
|
||||
@ -31,7 +31,6 @@ logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Security Configuration
|
||||
MAX_PDF_SIZE = 100 * 1024 * 1024 # 100MB
|
||||
MAX_IMAGE_SIZE = 50 * 1024 * 1024 # 50MB
|
||||
MAX_PAGES_PROCESS = 1000
|
||||
MAX_JSON_SIZE = 10000 # 10KB for JSON parameters
|
||||
@ -83,7 +82,7 @@ class PDFToolsServer:
|
||||
def _load_configuration(self) -> Dict[str, Any]:
|
||||
"""Load server configuration from environment and defaults"""
|
||||
return {
|
||||
"max_pdf_size": int(os.getenv("MAX_PDF_SIZE", MAX_PDF_SIZE)),
|
||||
"max_pdf_size": int(os.getenv("MCP_PDF_MAX_SIZE", "0")) * 1024 * 1024 if os.getenv("MCP_PDF_MAX_SIZE", "").strip() not in ("", "0") else 0,
|
||||
"max_image_size": int(os.getenv("MAX_IMAGE_SIZE", MAX_IMAGE_SIZE)),
|
||||
"max_pages": int(os.getenv("MAX_PAGES_PROCESS", MAX_PAGES_PROCESS)),
|
||||
"processing_timeout": int(os.getenv("PROCESSING_TIMEOUT", PROCESSING_TIMEOUT)),
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user