From b0477103d537fcd9114fef53c9bba2bfbc4cd5c7 Mon Sep 17 00:00:00 2001 From: Ryan Malloy Date: Tue, 20 Jan 2026 18:47:19 -0700 Subject: [PATCH] Add file_content parameter for hosted HTTP transport - Add MCP_ALLOW_LOCAL_FILES env var (default false for security) - All tools now accept file_content (base64) for remote document upload - Local file access blocked on hosted servers unless explicitly enabled - Update docker-compose to set MCP_ALLOW_LOCAL_FILES=false - Fix test assertions for updated function signatures --- docker-compose.yml | 23 +++--- src/mcwaddams/mixins/excel.py | 22 ++++-- src/mcwaddams/mixins/universal.py | 45 +++++++----- src/mcwaddams/mixins/word.py | 47 ++++++++---- src/mcwaddams/utils/caching.py | 114 ++++++++++++++++++++++++++---- tests/test_mixins.py | 5 +- tests/test_universal_mixin.py | 5 +- 7 files changed, 197 insertions(+), 64 deletions(-) mode change 100644 => 100755 src/mcwaddams/mixins/excel.py mode change 100644 => 100755 src/mcwaddams/mixins/universal.py mode change 100644 => 100755 src/mcwaddams/mixins/word.py diff --git a/docker-compose.yml b/docker-compose.yml index ef1fa85..c50cade 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -15,25 +15,24 @@ services: - MCP_PORT=8000 - DEBUG=${DEBUG:-false} - OFFICE_TEMP_DIR=/tmp/mcwaddams + # Security: Disable local file access for hosted server + # Clients must use file_content parameter to upload documents + - MCP_ALLOW_LOCAL_FILES=false volumes: # Temp directory for document processing - mcwaddams-temp:/tmp/mcwaddams networks: - caddy labels: - # Caddy-docker-proxy labels for /mcp endpoint + # Caddy-docker-proxy labels - direct reverse proxy (no path stripping) + # MCP is served at /mcp on the backend caddy: ${MCWADDAMS_HOST:-mcwaddams.l.supported.systems} - caddy.@mcp.path: /mcp/* - caddy.@mcp.path_strip: /mcp - caddy.handle: "@mcp" - caddy.handle.reverse_proxy: "{{upstreams 8000}}" - caddy.handle.reverse_proxy.flush_interval: "-1" - caddy.handle.reverse_proxy.transport: 
"http" - caddy.handle.reverse_proxy.transport.read_timeout: "0" - caddy.handle.reverse_proxy.transport.write_timeout: "0" - caddy.handle.reverse_proxy.stream_timeout: "24h" - caddy.handle.reverse_proxy.header_up.Connection: "{http.request.header.Connection}" - caddy.handle.reverse_proxy.header_up.Upgrade: "{http.request.header.Upgrade}" + caddy.reverse_proxy: "{{upstreams 8000}}" + caddy.reverse_proxy.flush_interval: "-1" + caddy.reverse_proxy.transport: "http" + caddy.reverse_proxy.transport.read_timeout: "0" + caddy.reverse_proxy.transport.write_timeout: "0" + caddy.reverse_proxy.stream_timeout: "24h" healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://127.0.0.1:8000/health')"] interval: 30s diff --git a/src/mcwaddams/mixins/excel.py b/src/mcwaddams/mixins/excel.py old mode 100644 new mode 100755 index 814b507..ce6b025 --- a/src/mcwaddams/mixins/excel.py +++ b/src/mcwaddams/mixins/excel.py @@ -17,6 +17,13 @@ from ..utils import ( ) +# Common field description for file_content parameter +FILE_CONTENT_DESC = ( + "Base64-encoded file content (for hosted/HTTP transport). " + "When provided, file_path is used only for extension detection." 
+) + + class ExcelMixin(MCPMixin): """Mixin containing Excel-specific tools for advanced spreadsheet processing.""" @@ -34,6 +41,7 @@ class ExcelMixin(MCPMixin): async def analyze_excel_data( self, file_path: str = Field(description="Path to Excel document or URL"), + file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC), sheet_names: List[str] = Field(default=[], description="Specific sheets to analyze (empty = all sheets)"), include_statistics: bool = Field(default=True, description="Include statistical analysis (mean, median, etc.)"), detect_data_types: bool = Field(default=True, description="Analyze and detect optimal data types"), @@ -42,8 +50,8 @@ class ExcelMixin(MCPMixin): """Analyze Excel data with comprehensive statistics and data quality assessment.""" start_time = time.time() - # Resolve and validate file - resolved_path = await resolve_office_file_path(file_path) + # Resolve and validate file (download if URL, or decode if content provided) + resolved_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path) validation = await validate_office_file(resolved_path) if validation["category"] not in ["excel"]: @@ -178,6 +186,7 @@ class ExcelMixin(MCPMixin): async def extract_excel_formulas( self, file_path: str = Field(description="Path to Excel document or URL"), + file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC), sheet_names: List[str] = Field(default=[], description="Specific sheets to process (empty = all sheets)"), include_values: bool = Field(default=True, description="Include calculated values alongside formulas"), analyze_dependencies: bool = Field(default=True, description="Analyze formula dependencies and references") @@ -186,8 +195,8 @@ class ExcelMixin(MCPMixin): start_time = time.time() import re - # Resolve and validate file - resolved_path = await resolve_office_file_path(file_path) + # Resolve and validate file (download if URL, or decode if 
content provided) + resolved_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path) validation = await validate_office_file(resolved_path) if validation["category"] not in ["excel"] or validation["extension"] == ".csv": @@ -288,6 +297,7 @@ class ExcelMixin(MCPMixin): async def create_excel_chart_data( self, file_path: str = Field(description="Path to Excel document or URL"), + file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC), sheet_name: str = Field(default="", description="Sheet to process (empty = first sheet)"), chart_type: str = Field(default="auto", description="Chart type: auto, bar, line, pie, scatter, histogram"), x_column: str = Field(default="", description="Column for X-axis (empty = auto-detect)"), @@ -297,8 +307,8 @@ class ExcelMixin(MCPMixin): """Generate chart-ready data and configurations from Excel spreadsheets.""" start_time = time.time() - # Resolve and validate file - resolved_path = await resolve_office_file_path(file_path) + # Resolve and validate file (download if URL, or decode if content provided) + resolved_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path) validation = await validate_office_file(resolved_path) if validation["category"] not in ["excel"]: diff --git a/src/mcwaddams/mixins/universal.py b/src/mcwaddams/mixins/universal.py old mode 100644 new mode 100755 index 5062da6..420091a --- a/src/mcwaddams/mixins/universal.py +++ b/src/mcwaddams/mixins/universal.py @@ -1,7 +1,7 @@ """Universal Office Tools Mixin - Format-agnostic tools that work across all Office document types.""" import time -from typing import Any +from typing import Any, Optional from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool from pydantic import Field @@ -17,6 +17,13 @@ from ..utils import ( from ..resources import resource_store, EmbeddedResource, ResourceStore +# Common field description for file_content parameter +FILE_CONTENT_DESC = ( 
+ "Base64-encoded file content (for hosted/HTTP transport). " + "When provided, file_path is used only for extension detection." +) + + class UniversalMixin(MCPMixin): """Mixin containing format-agnostic tools that work across Word, Excel, PowerPoint, and CSV files.""" @@ -27,6 +34,7 @@ class UniversalMixin(MCPMixin): async def extract_text( self, file_path: str = Field(description="Path to Office document or URL"), + file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC), preserve_formatting: bool = Field(default=False, description="Preserve text formatting and structure"), include_metadata: bool = Field(default=True, description="Include document metadata in output"), method: str = Field(default="auto", description="Extraction method: auto, primary, fallback") @@ -34,8 +42,8 @@ class UniversalMixin(MCPMixin): start_time = time.time() try: - # Resolve file path (download if URL) - local_path = await resolve_office_file_path(file_path) + # Resolve file path (download if URL, or decode if content provided) + local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path) # Validate file validation = await validate_office_file(local_path) @@ -85,6 +93,7 @@ class UniversalMixin(MCPMixin): async def extract_images( self, file_path: str = Field(description="Path to Office document or URL"), + file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC), min_width: int = Field(default=100, description="Minimum image width in pixels"), min_height: int = Field(default=100, description="Minimum image height in pixels"), output_format: str = Field(default="png", description="Output image format: png, jpg, jpeg"), @@ -93,8 +102,8 @@ class UniversalMixin(MCPMixin): start_time = time.time() try: - # Resolve file path - local_path = await resolve_office_file_path(file_path) + # Resolve file path (download if URL, or decode if content provided) + local_path = await 
resolve_office_file_path(file_path, file_content=file_content, filename=file_path) # Validate file validation = await validate_office_file(local_path) @@ -135,13 +144,14 @@ class UniversalMixin(MCPMixin): ) async def extract_metadata( self, - file_path: str = Field(description="Path to Office document or URL") + file_path: str = Field(description="Path to Office document or URL"), + file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC) ) -> dict[str, Any]: start_time = time.time() try: - # Resolve file path - local_path = await resolve_office_file_path(file_path) + # Resolve file path (download if URL, or decode if content provided) + local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path) # Validate file validation = await validate_office_file(local_path) @@ -175,11 +185,12 @@ class UniversalMixin(MCPMixin): ) async def detect_office_format( self, - file_path: str = Field(description="Path to Office document or URL") + file_path: str = Field(description="Path to Office document or URL"), + file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC) ) -> dict[str, Any]: try: - # Resolve file path - local_path = await resolve_office_file_path(file_path) + # Resolve file path (download if URL, or decode if content provided) + local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path) # Get comprehensive format detection format_info = await detect_format(local_path) @@ -199,13 +210,14 @@ class UniversalMixin(MCPMixin): ) async def analyze_document_health( self, - file_path: str = Field(description="Path to Office document or URL") + file_path: str = Field(description="Path to Office document or URL"), + file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC) ) -> dict[str, Any]: start_time = time.time() try: - # Resolve file path - local_path = await resolve_office_file_path(file_path) + # Resolve file path 
(download if URL, or decode if content provided) + local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path) # Validate file thoroughly validation = await validate_office_file(local_path) @@ -350,6 +362,7 @@ class UniversalMixin(MCPMixin): async def index_document( self, file_path: str = Field(description="Path to Office document or URL"), + file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC), include_images: bool = Field(default=True, description="Index embedded images"), include_chapters: bool = Field(default=True, description="Index chapters/sections (Word docs)"), include_sheets: bool = Field(default=True, description="Index sheets (Excel docs)"), @@ -362,8 +375,8 @@ class UniversalMixin(MCPMixin): """ start_time = time.time() - # Resolve and validate - local_path = await resolve_office_file_path(file_path) + # Resolve and validate (download if URL, or decode if content provided) + local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path) validation = await validate_office_file(local_path) if not validation["is_valid"]: raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}") diff --git a/src/mcwaddams/mixins/word.py b/src/mcwaddams/mixins/word.py old mode 100644 new mode 100755 index c97ed64..36418d5 --- a/src/mcwaddams/mixins/word.py +++ b/src/mcwaddams/mixins/word.py @@ -18,6 +18,13 @@ from ..utils import ( from ..pagination import paginate_document_conversion, PaginationParams +# Common field description for file_content parameter +FILE_CONTENT_DESC = ( + "Base64-encoded file content (for hosted/HTTP transport). " + "When provided, file_path is used only for extension detection." 
+) + + class WordMixin(MCPMixin): """Mixin containing Word-specific tools for advanced document processing.""" @@ -44,6 +51,7 @@ class WordMixin(MCPMixin): async def convert_to_markdown( self, file_path: str = Field(description="Path to Office document or URL"), + file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC), include_images: bool = Field(default=True, description="Include images in markdown output. When True, images are extracted to files and linked in the markdown."), image_mode: str = Field(default="files", description="Image handling mode: 'files' (default, saves to disk and links), 'base64' (embeds inline - WARNING: can create massive responses), or 'references' (metadata only, no content)"), max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding (only used when image_mode='base64')"), @@ -61,8 +69,8 @@ class WordMixin(MCPMixin): ) -> dict[str, Any]: start_time = time.time() - # Resolve file path - local_path = await resolve_office_file_path(file_path) + # Resolve file path (download if URL, or decode if content provided) + local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path) # Validate file validation = await validate_office_file(local_path) @@ -275,6 +283,7 @@ class WordMixin(MCPMixin): async def extract_word_tables( self, file_path: str = Field(description="Path to Word document or URL"), + file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC), include_styling: bool = Field(default=True, description="Include table styling information (borders, alignment, etc.)"), output_format: str = Field(default="structured", description="Output format: structured, csv, json, markdown"), preserve_merged_cells: bool = Field(default=True, description="Handle merged cells appropriately"), @@ -286,8 +295,8 @@ class WordMixin(MCPMixin): import json import io - # Resolve and validate file - resolved_path = await 
resolve_office_file_path(file_path) + # Resolve and validate file (download if URL, or decode if content provided) + resolved_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path) validation = await validate_office_file(resolved_path) if validation["category"] != "word": @@ -451,6 +460,7 @@ class WordMixin(MCPMixin): async def analyze_word_structure( self, file_path: str = Field(description="Path to Word document or URL"), + file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC), include_page_info: bool = Field(default=True, description="Include page layout and section information"), extract_outline: bool = Field(default=True, description="Extract document outline and heading hierarchy"), analyze_styles: bool = Field(default=True, description="Analyze custom styles and formatting patterns") @@ -458,8 +468,8 @@ class WordMixin(MCPMixin): """Analyze Word document structure and organization.""" start_time = time.time() - # Resolve and validate file - resolved_path = await resolve_office_file_path(file_path) + # Resolve and validate file (download if URL, or decode if content provided) + resolved_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path) validation = await validate_office_file(resolved_path) if validation["category"] != "word": @@ -646,6 +656,7 @@ class WordMixin(MCPMixin): async def get_document_outline( self, file_path: str = Field(description="Path to Word document or URL"), + file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC), include_word_counts: bool = Field(default=True, description="Include estimated word count per section"), detect_chapters: bool = Field(default=True, description="Detect and flag chapter headings specifically") ) -> dict[str, Any]: @@ -654,7 +665,7 @@ class WordMixin(MCPMixin): from docx.oxml.ns import qn start_time = time.time() - local_path = await resolve_office_file_path(file_path) + 
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path) validation = await validate_office_file(local_path) if not validation["is_valid"]: @@ -765,13 +776,14 @@ class WordMixin(MCPMixin): @handle_office_errors("Style consistency check") async def check_style_consistency( self, - file_path: str = Field(description="Path to Word document or URL") + file_path: str = Field(description="Path to Word document or URL"), + file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC) ) -> dict[str, Any]: """Check document for style and formatting consistency issues.""" from docx import Document start_time = time.time() - local_path = await resolve_office_file_path(file_path) + local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path) validation = await validate_office_file(local_path) if not validation["is_valid"]: @@ -924,6 +936,7 @@ class WordMixin(MCPMixin): self, file_path: str = Field(description="Path to Word document or URL"), query: str = Field(description="Text to search for (case-insensitive)"), + file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC), context_chars: int = Field(default=100, description="Number of characters of context before and after match"), max_results: int = Field(default=20, description="Maximum number of results to return") ) -> dict[str, Any]: @@ -931,7 +944,7 @@ class WordMixin(MCPMixin): from docx import Document start_time = time.time() - local_path = await resolve_office_file_path(file_path) + local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path) validation = await validate_office_file(local_path) if not validation["is_valid"]: @@ -1009,6 +1022,7 @@ class WordMixin(MCPMixin): async def extract_entities( self, file_path: str = Field(description="Path to Word document or URL"), + file_content: Optional[str] = Field(default=None, 
description=FILE_CONTENT_DESC), entity_types: str = Field(default="all", description="Entity types to extract: 'all', 'people', 'places', 'organizations', or comma-separated combination"), min_occurrences: int = Field(default=1, description="Minimum occurrences for an entity to be included"), include_context: bool = Field(default=True, description="Include sample context for each entity") @@ -1019,7 +1033,7 @@ class WordMixin(MCPMixin): import re start_time = time.time() - local_path = await resolve_office_file_path(file_path) + local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path) validation = await validate_office_file(local_path) if not validation["is_valid"]: @@ -1219,6 +1233,7 @@ class WordMixin(MCPMixin): async def get_chapter_summaries( self, file_path: str = Field(description="Path to Word document or URL"), + file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC), sentences_per_chapter: int = Field(default=3, description="Number of opening sentences to include per chapter"), include_word_counts: bool = Field(default=True, description="Include word count for each chapter") ) -> dict[str, Any]: @@ -1227,7 +1242,7 @@ class WordMixin(MCPMixin): import re start_time = time.time() - local_path = await resolve_office_file_path(file_path) + local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path) validation = await validate_office_file(local_path) if not validation["is_valid"]: @@ -1318,6 +1333,7 @@ class WordMixin(MCPMixin): async def save_reading_progress( self, file_path: str = Field(description="Path to Word document"), + file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC), chapter_number: int = Field(default=1, description="Current chapter number"), paragraph_index: int = Field(default=0, description="Current paragraph index"), notes: str = Field(default="", description="Optional notes about where you left off") 
@@ -1326,7 +1342,7 @@ class WordMixin(MCPMixin): import json from datetime import datetime - local_path = await resolve_office_file_path(file_path) + local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path) validation = await validate_office_file(local_path) if not validation["is_valid"]: @@ -1386,12 +1402,13 @@ class WordMixin(MCPMixin): @handle_office_errors("Get reading progress") async def get_reading_progress( self, - file_path: str = Field(description="Path to Word document") + file_path: str = Field(description="Path to Word document"), + file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC) ) -> dict[str, Any]: """Retrieve saved reading progress from bookmark file.""" import json - local_path = await resolve_office_file_path(file_path) + local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path) validation = await validate_office_file(local_path) if not validation["is_valid"]: diff --git a/src/mcwaddams/utils/caching.py b/src/mcwaddams/utils/caching.py index b72d09e..daa339f 100644 --- a/src/mcwaddams/utils/caching.py +++ b/src/mcwaddams/utils/caching.py @@ -4,6 +4,7 @@ import os import time import hashlib import tempfile +import base64 from pathlib import Path from typing import Optional, Dict, Any import aiofiles @@ -12,6 +13,11 @@ from urllib.parse import urlparse from .validation import OfficeFileError +# Environment variable to control local file access +# Default to False (secure) - set to "true" for local stdio transport +MCP_ALLOW_LOCAL_FILES = os.environ.get("MCP_ALLOW_LOCAL_FILES", "false").lower() == "true" + + class OfficeFileCache: """Simple file cache for downloaded Office documents.""" @@ -212,38 +218,120 @@ def get_cache() -> OfficeFileCache: return _global_cache -async def resolve_office_file_path(file_path: str, use_cache: bool = True) -> str: - """Resolve file path, downloading from URL if necessary. 
- +async def resolve_office_file_path( + file_path: str, + use_cache: bool = True, + file_content: Optional[str] = None, + filename: Optional[str] = None +) -> str: + """Resolve file path, downloading from URL if necessary, or decode inline content. + Args: - file_path: Local file path or URL + file_path: Local file path or URL (with file_content, only a fallback hint for extension detection) use_cache: Whether to use caching for URLs - + file_content: Base64-encoded file content (for hosted/HTTP transport) + filename: Original filename for extension detection (used with file_content) + Returns: - Local file path (downloaded if was URL) + Local file path (temp file if from content, downloaded if from URL) + + Security: + When MCP_ALLOW_LOCAL_FILES is not set to "true" (the default): + - Local file paths are rejected + - Only URLs and file_content are allowed + - This prevents hosted servers from accessing server-side files """ + # Priority 1: If file_content is provided, decode and write to temp file + if file_content: + return await _resolve_from_content(file_content, filename or file_path) + # Check if it's a URL parsed = urlparse(file_path) - if not (parsed.scheme and parsed.netloc): - # Local file path + is_url = bool(parsed.scheme and parsed.netloc) + + if not is_url: + # Local file path - check if allowed + if not MCP_ALLOW_LOCAL_FILES: + raise OfficeFileError( + "Local file access is disabled for this server. " + "Please use file_content parameter to upload document data, " + "or provide a URL. Set MCP_ALLOW_LOCAL_FILES=true to enable local files." 
+ ) return file_path - + # Validate URL scheme if parsed.scheme not in ['http', 'https']: raise OfficeFileError(f"Unsupported URL scheme: {parsed.scheme}") - + cache = get_cache() - + # Check cache first if use_cache and cache.is_cached(file_path): cached_path = cache.get_cached_path(file_path) if cached_path: return cached_path - + # Download and cache if use_cache: return await cache.cache_url(file_path) else: # Direct download without caching from .validation import download_office_file - return await download_office_file(file_path) \ No newline at end of file + return await download_office_file(file_path) + + +async def _resolve_from_content(file_content: str, filename_hint: str) -> str: + """Decode base64 content and write to a temp file. + + Args: + file_content: Base64-encoded file data + filename_hint: Filename or path to extract extension from + + Returns: + Path to temporary file containing decoded content + """ + try: + # Decode base64 content + content_bytes = base64.b64decode(file_content) + except Exception as e: + raise OfficeFileError(f"Invalid base64 content: {str(e)}") + + # Extract extension from filename hint + ext = Path(filename_hint).suffix.lower() + if not ext: + # Try to detect from content magic bytes + ext = _detect_extension_from_bytes(content_bytes) + + # Create temp file with correct extension + temp_dir = Path(tempfile.gettempdir()) / "mcp_office_uploads" + temp_dir.mkdir(exist_ok=True) + + # Generate unique filename + content_hash = hashlib.sha256(content_bytes).hexdigest()[:12] + temp_path = temp_dir / f"upload_{content_hash}{ext}" + + # Write content to temp file + async with aiofiles.open(temp_path, 'wb') as f: + await f.write(content_bytes) + + return str(temp_path) + + +def _detect_extension_from_bytes(content: bytes) -> str: + """Detect file extension from magic bytes.""" + # ZIP-based formats (docx, xlsx, pptx) + if content[:4] == b'PK\x03\x04': + # Could be docx, xlsx, or pptx - default to .docx + # Full detection would 
require reading internal XML + return ".docx" + + # OLE Compound Document (doc, xls, ppt) + if content[:8] == b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1': + return ".doc" + + # CSV (text-based, starts with a letter or quote; empty content must fall through to .bin) + if content[:1].isalpha() or content[:1] in (b'"', b"'"): + return ".csv" + + # Default + return ".bin" \ No newline at end of file diff --git a/tests/test_mixins.py b/tests/test_mixins.py index 9b9af4b..036c002 100644 --- a/tests/test_mixins.py +++ b/tests/test_mixins.py @@ -324,7 +324,10 @@ class TestMockingStrategies: assert result["document_metadata"] == mock_office_file["metadata"] # Verify mocks were called correctly - mock_resolve.assert_called_once_with(mock_office_file["path"]) + mock_resolve.assert_called_once() + # First positional arg should be the file path + call_args = mock_resolve.call_args + assert call_args[0][0] == mock_office_file["path"] mock_validate.assert_called_once_with(mock_office_file["path"]) mock_detect.assert_called_once_with(mock_office_file["path"]) diff --git a/tests/test_universal_mixin.py b/tests/test_universal_mixin.py index 2033ede..a6b80c5 100644 --- a/tests/test_universal_mixin.py +++ b/tests/test_universal_mixin.py @@ -408,7 +408,10 @@ class TestMockingPatterns: assert "structure" in result # Because preserve_formatting=True # Verify all mocks were called appropriately - mock_resolve.assert_called_once_with("/test/document.docx") + mock_resolve.assert_called_once() + # First positional arg should be the file path + call_args = mock_resolve.call_args + assert call_args[0][0] == "/test/document.docx" mock_validate.assert_called_once_with("/realistic/path/document.docx") mock_detect.assert_called_once_with("/realistic/path/document.docx") mock_extract.assert_called_once()