🔧 Replace hardcoded 100MB PDF limit with MCP_PDF_MAX_SIZE env var
Centralize PDF size limit in security.py, controlled by MCP_PDF_MAX_SIZE (in MB). Default: disabled (no limit). Set e.g. MCP_PDF_MAX_SIZE=500 to cap at 500MB. Remove unused self.max_file_size from all 13 mixins.
This commit is contained in:
parent
38af9ee2c9
commit
62d9b176c8
@ -136,6 +136,7 @@ Critical system dependencies:
|
|||||||
Environment variables (optional):
|
Environment variables (optional):
|
||||||
- `TESSDATA_PREFIX`: Tesseract language data location
|
- `TESSDATA_PREFIX`: Tesseract language data location
|
||||||
- `PDF_TEMP_DIR`: Temporary file processing directory (defaults to `/tmp/mcp-pdf-processing`)
|
- `PDF_TEMP_DIR`: Temporary file processing directory (defaults to `/tmp/mcp-pdf-processing`)
|
||||||
|
- `MCP_PDF_MAX_SIZE`: Maximum PDF file size in MB (e.g., `500` for 500MB). Set to `0` or leave empty to disable the limit (default: disabled)
|
||||||
- `MCP_PDF_ALLOWED_PATHS`: Colon-separated list of allowed output directories (e.g., `/tmp:/home/user/documents:/var/output`)
|
- `MCP_PDF_ALLOWED_PATHS`: Colon-separated list of allowed output directories (e.g., `/tmp:/home/user/documents:/var/output`)
|
||||||
- If unset: Allows writes to any directory with security warnings
|
- If unset: Allows writes to any directory with security warnings
|
||||||
- If set: Restricts file outputs to specified directories only
|
- If set: Restricts file outputs to specified directories only
|
||||||
@ -151,7 +152,7 @@ This server implements defense-in-depth, but remember: **application-level secur
|
|||||||
**Application-Level Protections (Security Theater):**
|
**Application-Level Protections (Security Theater):**
|
||||||
|
|
||||||
**Input Validation:**
|
**Input Validation:**
|
||||||
- File size limits: 100MB for PDFs, 50MB for images
|
- File size limits: 50MB for images. PDF size limit controlled by `MCP_PDF_MAX_SIZE` env var (in MB); disabled by default
|
||||||
- Page count limits: Max 1000 pages per document
|
- Page count limits: Max 1000 pages per document
|
||||||
- Path traversal protection for all file operations
|
- Path traversal protection for all file operations
|
||||||
- JSON input size limits (10KB) to prevent DoS attacks
|
- JSON input size limits (10KB) to prevent DoS attacks
|
||||||
|
|||||||
@ -29,7 +29,6 @@ class AdvancedFormsMixin(MCPMixin):
|
|||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.max_file_size = 100 * 1024 * 1024 # 100MB
|
|
||||||
|
|
||||||
@mcp_tool(
|
@mcp_tool(
|
||||||
name="add_form_fields",
|
name="add_form_fields",
|
||||||
|
|||||||
@ -29,7 +29,6 @@ class AnnotationsMixin(MCPMixin):
|
|||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.max_file_size = 100 * 1024 * 1024 # 100MB
|
|
||||||
|
|
||||||
@mcp_tool(
|
@mcp_tool(
|
||||||
name="add_sticky_notes",
|
name="add_sticky_notes",
|
||||||
|
|||||||
@ -31,7 +31,6 @@ class ContentAnalysisMixin(MCPMixin):
|
|||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.max_file_size = 100 * 1024 * 1024 # 100MB
|
|
||||||
|
|
||||||
@mcp_tool(
|
@mcp_tool(
|
||||||
name="classify_content",
|
name="classify_content",
|
||||||
|
|||||||
@ -30,7 +30,6 @@ class DocumentAnalysisMixin(MCPMixin):
|
|||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.max_file_size = 100 * 1024 * 1024 # 100MB
|
|
||||||
|
|
||||||
@mcp_tool(
|
@mcp_tool(
|
||||||
name="extract_metadata",
|
name="extract_metadata",
|
||||||
|
|||||||
@ -29,7 +29,6 @@ class DocumentAssemblyMixin(MCPMixin):
|
|||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.max_file_size = 100 * 1024 * 1024 # 100MB
|
|
||||||
|
|
||||||
@mcp_tool(
|
@mcp_tool(
|
||||||
name="merge_pdfs",
|
name="merge_pdfs",
|
||||||
|
|||||||
@ -31,7 +31,6 @@ class FormManagementMixin(MCPMixin):
|
|||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.max_file_size = 100 * 1024 * 1024 # 100MB
|
|
||||||
|
|
||||||
@mcp_tool(
|
@mcp_tool(
|
||||||
name="extract_form_data",
|
name="extract_form_data",
|
||||||
|
|||||||
@ -29,7 +29,6 @@ class ImageProcessingMixin(MCPMixin):
|
|||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.max_file_size = 100 * 1024 * 1024 # 100MB
|
|
||||||
|
|
||||||
@mcp_tool(
|
@mcp_tool(
|
||||||
name="extract_images",
|
name="extract_images",
|
||||||
|
|||||||
@ -31,7 +31,6 @@ class MiscToolsMixin(MCPMixin):
|
|||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.max_file_size = 100 * 1024 * 1024 # 100MB
|
|
||||||
|
|
||||||
@mcp_tool(
|
@mcp_tool(
|
||||||
name="extract_links",
|
name="extract_links",
|
||||||
|
|||||||
@ -32,7 +32,6 @@ class PDFUtilitiesMixin(MCPMixin):
|
|||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.max_file_size = 100 * 1024 * 1024 # 100MB
|
|
||||||
|
|
||||||
@mcp_tool(
|
@mcp_tool(
|
||||||
name="compare_pdfs",
|
name="compare_pdfs",
|
||||||
|
|||||||
@ -594,7 +594,6 @@ class PermitFormMixin(MCPMixin):
|
|||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.max_file_size = 100 * 1024 * 1024 # 100MB
|
|
||||||
|
|
||||||
def _load_field_definitions(
|
def _load_field_definitions(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@ -30,7 +30,6 @@ class SecurityAnalysisMixin(MCPMixin):
|
|||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.max_file_size = 100 * 1024 * 1024 # 100MB
|
|
||||||
|
|
||||||
@mcp_tool(
|
@mcp_tool(
|
||||||
name="analyze_pdf_security",
|
name="analyze_pdf_security",
|
||||||
|
|||||||
@ -33,7 +33,6 @@ class TableExtractionMixin(MCPMixin):
|
|||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.max_file_size = 100 * 1024 * 1024 # 100MB
|
|
||||||
|
|
||||||
@mcp_tool(
|
@mcp_tool(
|
||||||
name="extract_tables",
|
name="extract_tables",
|
||||||
|
|||||||
@ -32,7 +32,6 @@ class TextExtractionMixin(MCPMixin):
|
|||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.max_pages_per_chunk = 10
|
self.max_pages_per_chunk = 10
|
||||||
self.max_file_size = 100 * 1024 * 1024 # 100MB
|
|
||||||
|
|
||||||
@mcp_tool(
|
@mcp_tool(
|
||||||
name="extract_text",
|
name="extract_text",
|
||||||
|
|||||||
@ -20,7 +20,10 @@ import httpx
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Security Configuration
|
# Security Configuration
|
||||||
MAX_PDF_SIZE = 100 * 1024 * 1024 # 100MB
|
# MCP_PDF_MAX_SIZE: max PDF size in MB, or "0" / empty to disable the limit
|
||||||
|
_max_size_env = os.getenv("MCP_PDF_MAX_SIZE", "").strip()
|
||||||
|
MAX_PDF_SIZE = int(_max_size_env) * 1024 * 1024 if _max_size_env and _max_size_env != "0" else 0
|
||||||
|
|
||||||
MAX_IMAGE_SIZE = 50 * 1024 * 1024 # 50MB
|
MAX_IMAGE_SIZE = 50 * 1024 * 1024 # 50MB
|
||||||
MAX_PAGES_PROCESS = 1000
|
MAX_PAGES_PROCESS = 1000
|
||||||
MAX_JSON_SIZE = 10000 # 10KB for JSON parameters
|
MAX_JSON_SIZE = 10000 # 10KB for JSON parameters
|
||||||
@ -113,10 +116,11 @@ async def validate_pdf_path(pdf_path: str) -> Path:
|
|||||||
if not path.is_file():
|
if not path.is_file():
|
||||||
raise ValueError(f"Path is not a file: {path}")
|
raise ValueError(f"Path is not a file: {path}")
|
||||||
|
|
||||||
# Check file size
|
# Check file size (skip when MAX_PDF_SIZE is 0 / disabled)
|
||||||
file_size = path.stat().st_size
|
if MAX_PDF_SIZE:
|
||||||
if file_size > MAX_PDF_SIZE:
|
file_size = path.stat().st_size
|
||||||
raise ValueError(f"PDF file too large: {file_size / (1024*1024):.1f}MB > {MAX_PDF_SIZE / (1024*1024)}MB")
|
if file_size > MAX_PDF_SIZE:
|
||||||
|
raise ValueError(f"PDF file too large: {file_size / (1024*1024):.1f}MB > {MAX_PDF_SIZE / (1024*1024):.0f}MB limit (set MCP_PDF_MAX_SIZE=0 to disable)")
|
||||||
|
|
||||||
# Basic PDF header validation
|
# Basic PDF header validation
|
||||||
try:
|
try:
|
||||||
@ -162,8 +166,7 @@ async def _download_url_safely(url: str) -> Path:
|
|||||||
|
|
||||||
# Check if already cached
|
# Check if already cached
|
||||||
if cached_file.exists():
|
if cached_file.exists():
|
||||||
# Validate cached file
|
if not MAX_PDF_SIZE or cached_file.stat().st_size <= MAX_PDF_SIZE:
|
||||||
if cached_file.stat().st_size <= MAX_PDF_SIZE:
|
|
||||||
logger.info(f"Using cached PDF: {cached_file}")
|
logger.info(f"Using cached PDF: {cached_file}")
|
||||||
return cached_file
|
return cached_file
|
||||||
else:
|
else:
|
||||||
@ -185,10 +188,13 @@ async def _download_url_safely(url: str) -> Path:
|
|||||||
with open(cached_file, 'wb') as f:
|
with open(cached_file, 'wb') as f:
|
||||||
async for chunk in response.aiter_bytes(chunk_size=8192):
|
async for chunk in response.aiter_bytes(chunk_size=8192):
|
||||||
downloaded_size += len(chunk)
|
downloaded_size += len(chunk)
|
||||||
if downloaded_size > MAX_PDF_SIZE:
|
if MAX_PDF_SIZE and downloaded_size > MAX_PDF_SIZE:
|
||||||
f.close()
|
f.close()
|
||||||
cached_file.unlink()
|
cached_file.unlink()
|
||||||
raise ValueError(f"Downloaded file too large: {downloaded_size / (1024*1024):.1f}MB")
|
raise ValueError(
|
||||||
|
f"Downloaded file too large: {downloaded_size / (1024*1024):.1f}MB "
|
||||||
|
f"> {MAX_PDF_SIZE / (1024*1024):.0f}MB limit (set MCP_PDF_MAX_SIZE=0 to disable)"
|
||||||
|
)
|
||||||
f.write(chunk)
|
f.write(chunk)
|
||||||
|
|
||||||
# Set secure permissions
|
# Set secure permissions
|
||||||
|
|||||||
@ -59,7 +59,7 @@ class PDFServerOfficial:
|
|||||||
def _load_configuration(self) -> Dict[str, Any]:
|
def _load_configuration(self) -> Dict[str, Any]:
|
||||||
"""Load server configuration from environment and defaults"""
|
"""Load server configuration from environment and defaults"""
|
||||||
return {
|
return {
|
||||||
"max_pdf_size": int(os.getenv("MAX_PDF_SIZE", str(100 * 1024 * 1024))), # 100MB default
|
"max_pdf_size": int(os.getenv("MCP_PDF_MAX_SIZE", "0")) * 1024 * 1024 if os.getenv("MCP_PDF_MAX_SIZE", "").strip() not in ("", "0") else 0,
|
||||||
"cache_dir": Path(os.getenv("PDF_TEMP_DIR", "/tmp/mcp-pdf-processing")),
|
"cache_dir": Path(os.getenv("PDF_TEMP_DIR", "/tmp/mcp-pdf-processing")),
|
||||||
"debug": os.getenv("DEBUG", "false").lower() == "true",
|
"debug": os.getenv("DEBUG", "false").lower() == "true",
|
||||||
"allowed_domains": os.getenv("ALLOWED_DOMAINS", "").split(",") if os.getenv("ALLOWED_DOMAINS") else [],
|
"allowed_domains": os.getenv("ALLOWED_DOMAINS", "").split(",") if os.getenv("ALLOWED_DOMAINS") else [],
|
||||||
|
|||||||
@ -38,7 +38,9 @@ logging.basicConfig(level=logging.INFO)
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Security Configuration
|
# Security Configuration
|
||||||
MAX_PDF_SIZE = 100 * 1024 * 1024 # 100MB
|
# MCP_PDF_MAX_SIZE: max PDF size in MB, or "0" / empty to disable the limit
|
||||||
|
_max_size_env = os.getenv("MCP_PDF_MAX_SIZE", "").strip()
|
||||||
|
MAX_PDF_SIZE = int(_max_size_env) * 1024 * 1024 if _max_size_env and _max_size_env != "0" else 0
|
||||||
MAX_IMAGE_SIZE = 50 * 1024 * 1024 # 50MB
|
MAX_IMAGE_SIZE = 50 * 1024 * 1024 # 50MB
|
||||||
MAX_PAGES_PROCESS = 1000
|
MAX_PAGES_PROCESS = 1000
|
||||||
MAX_JSON_SIZE = 10000 # 10KB for JSON parameters
|
MAX_JSON_SIZE = 10000 # 10KB for JSON parameters
|
||||||
@ -335,8 +337,7 @@ async def download_pdf_from_url(url: str) -> Path:
|
|||||||
|
|
||||||
# Check content length header
|
# Check content length header
|
||||||
content_length = response.headers.get('content-length')
|
content_length = response.headers.get('content-length')
|
||||||
if content_length and int(content_length) > MAX_PDF_SIZE:
|
# Size limit delegated to security.py (MCP_PDF_MAX_SIZE env var)
|
||||||
raise ValueError(f"PDF file too large: {content_length} bytes > {MAX_PDF_SIZE}")
|
|
||||||
|
|
||||||
# Check content type
|
# Check content type
|
||||||
content_type = response.headers.get("content-type", "").lower()
|
content_type = response.headers.get("content-type", "").lower()
|
||||||
@ -355,16 +356,15 @@ async def download_pdf_from_url(url: str) -> Path:
|
|||||||
content = first_chunk
|
content = first_chunk
|
||||||
async for chunk in response.aiter_bytes(chunk_size=8192):
|
async for chunk in response.aiter_bytes(chunk_size=8192):
|
||||||
content += chunk
|
content += chunk
|
||||||
# Check size as we download
|
if MAX_PDF_SIZE and len(content) > MAX_PDF_SIZE:
|
||||||
if len(content) > MAX_PDF_SIZE:
|
raise ValueError(f"PDF file too large (set MCP_PDF_MAX_SIZE=0 to disable)")
|
||||||
raise ValueError(f"PDF file too large: {len(content)} bytes > {MAX_PDF_SIZE}")
|
|
||||||
else:
|
else:
|
||||||
# Read all content with size checking
|
# Read all content with size checking
|
||||||
content = b""
|
content = b""
|
||||||
async for chunk in response.aiter_bytes(chunk_size=8192):
|
async for chunk in response.aiter_bytes(chunk_size=8192):
|
||||||
content += chunk
|
content += chunk
|
||||||
if len(content) > MAX_PDF_SIZE:
|
if MAX_PDF_SIZE and len(content) > MAX_PDF_SIZE:
|
||||||
raise ValueError(f"PDF file too large: {len(content)} bytes > {MAX_PDF_SIZE}")
|
raise ValueError(f"PDF file too large (set MCP_PDF_MAX_SIZE=0 to disable)")
|
||||||
|
|
||||||
# Double-check magic bytes
|
# Double-check magic bytes
|
||||||
if not content.startswith(b"%PDF"):
|
if not content.startswith(b"%PDF"):
|
||||||
@ -410,10 +410,11 @@ async def validate_pdf_path(pdf_path: str) -> Path:
|
|||||||
if not path.suffix.lower() == '.pdf':
|
if not path.suffix.lower() == '.pdf':
|
||||||
raise ValueError(f"Not a PDF file: {pdf_path}")
|
raise ValueError(f"Not a PDF file: {pdf_path}")
|
||||||
|
|
||||||
# Check file size
|
# Check file size (skip when MCP_PDF_MAX_SIZE is 0 / disabled)
|
||||||
file_size = path.stat().st_size
|
if MAX_PDF_SIZE:
|
||||||
if file_size > MAX_PDF_SIZE:
|
file_size = path.stat().st_size
|
||||||
raise ValueError(f"PDF file too large: {file_size} bytes > {MAX_PDF_SIZE}")
|
if file_size > MAX_PDF_SIZE:
|
||||||
|
raise ValueError(f"PDF file too large (set MCP_PDF_MAX_SIZE=0 to disable)")
|
||||||
|
|
||||||
return path
|
return path
|
||||||
|
|
||||||
|
|||||||
@ -31,7 +31,6 @@ logging.basicConfig(level=logging.INFO)
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Security Configuration
|
# Security Configuration
|
||||||
MAX_PDF_SIZE = 100 * 1024 * 1024 # 100MB
|
|
||||||
MAX_IMAGE_SIZE = 50 * 1024 * 1024 # 50MB
|
MAX_IMAGE_SIZE = 50 * 1024 * 1024 # 50MB
|
||||||
MAX_PAGES_PROCESS = 1000
|
MAX_PAGES_PROCESS = 1000
|
||||||
MAX_JSON_SIZE = 10000 # 10KB for JSON parameters
|
MAX_JSON_SIZE = 10000 # 10KB for JSON parameters
|
||||||
@ -83,7 +82,7 @@ class PDFToolsServer:
|
|||||||
def _load_configuration(self) -> Dict[str, Any]:
|
def _load_configuration(self) -> Dict[str, Any]:
|
||||||
"""Load server configuration from environment and defaults"""
|
"""Load server configuration from environment and defaults"""
|
||||||
return {
|
return {
|
||||||
"max_pdf_size": int(os.getenv("MAX_PDF_SIZE", MAX_PDF_SIZE)),
|
"max_pdf_size": int(os.getenv("MCP_PDF_MAX_SIZE", "0")) * 1024 * 1024 if os.getenv("MCP_PDF_MAX_SIZE", "").strip() not in ("", "0") else 0,
|
||||||
"max_image_size": int(os.getenv("MAX_IMAGE_SIZE", MAX_IMAGE_SIZE)),
|
"max_image_size": int(os.getenv("MAX_IMAGE_SIZE", MAX_IMAGE_SIZE)),
|
||||||
"max_pages": int(os.getenv("MAX_PAGES_PROCESS", MAX_PAGES_PROCESS)),
|
"max_pages": int(os.getenv("MAX_PAGES_PROCESS", MAX_PAGES_PROCESS)),
|
||||||
"processing_timeout": int(os.getenv("PROCESSING_TIMEOUT", PROCESSING_TIMEOUT)),
|
"processing_timeout": int(os.getenv("PROCESSING_TIMEOUT", PROCESSING_TIMEOUT)),
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user