Add file_content parameter for hosted HTTP transport

- Add MCP_ALLOW_LOCAL_FILES env var (default false for security)
- All tools now accept file_content (base64) for remote document upload
- Local file access blocked on hosted servers unless explicitly enabled
- Update docker-compose to set MCP_ALLOW_LOCAL_FILES=false
- Fix test assertions for updated function signatures
2026-01-20 18:47:19 -07:00 · 2026-01-20 18:47:19 -07:00 · b0477103d5
commit b0477103d5
parent 483ed9121b
7 changed files with 197 additions and 64 deletions
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -15,25 +15,24 @@ services:
      - MCP_PORT=8000
      - DEBUG=${DEBUG:-false}
      - OFFICE_TEMP_DIR=/tmp/mcwaddams
      # Security: Disable local file access for hosted server
      # Clients must use file_content parameter to upload documents
      - MCP_ALLOW_LOCAL_FILES=false
    volumes:
      # Temp directory for document processing
      - mcwaddams-temp:/tmp/mcwaddams
    networks:
      - caddy
    labels:
-      # Caddy-docker-proxy labels for /mcp endpoint
+      # Caddy-docker-proxy labels - direct reverse proxy (no path stripping)
      # MCP is served at /mcp on the backend
      caddy: ${MCWADDAMS_HOST:-mcwaddams.l.supported.systems}
-      caddy.@mcp.path: /mcp/*
+      caddy.reverse_proxy: "{{upstreams 8000}}"
-      caddy.@mcp.path_strip: /mcp
+      caddy.reverse_proxy.flush_interval: "-1"
-      caddy.handle: "@mcp"
+      caddy.reverse_proxy.transport: "http"
-      caddy.handle.reverse_proxy: "{{upstreams 8000}}"
+      caddy.reverse_proxy.transport.read_timeout: "0"
-      caddy.handle.reverse_proxy.flush_interval: "-1"
+      caddy.reverse_proxy.transport.write_timeout: "0"
-      caddy.handle.reverse_proxy.transport: "http"
+      caddy.reverse_proxy.stream_timeout: "24h"
      caddy.handle.reverse_proxy.transport.read_timeout: "0"
      caddy.handle.reverse_proxy.transport.write_timeout: "0"
      caddy.handle.reverse_proxy.stream_timeout: "24h"
      caddy.handle.reverse_proxy.header_up.Connection: "{http.request.header.Connection}"
      caddy.handle.reverse_proxy.header_up.Upgrade: "{http.request.header.Upgrade}"
    healthcheck:
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://127.0.0.1:8000/health')"]
      interval: 30s
--- a/src/mcwaddams/mixins/excel.py
+++ b/src/mcwaddams/mixins/excel.py
@ -17,6 +17,13 @@ from ..utils import (
 )
 # Common field description for file_content parameter
 FILE_CONTENT_DESC = (
    "Base64-encoded file content (for hosted/HTTP transport). "
    "When provided, file_path is used only for extension detection."
 )
 class ExcelMixin(MCPMixin):
    """Mixin containing Excel-specific tools for advanced spreadsheet processing."""
@ -34,6 +41,7 @@ class ExcelMixin(MCPMixin):
    async def analyze_excel_data(
        self,
        file_path: str = Field(description="Path to Excel document or URL"),
        file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
        sheet_names: List[str] = Field(default=[], description="Specific sheets to analyze (empty = all sheets)"),
        include_statistics: bool = Field(default=True, description="Include statistical analysis (mean, median, etc.)"),
        detect_data_types: bool = Field(default=True, description="Analyze and detect optimal data types"),
@ -42,8 +50,8 @@ class ExcelMixin(MCPMixin):
        """Analyze Excel data with comprehensive statistics and data quality assessment."""
        start_time = time.time()
-        # Resolve and validate file
+        # Resolve and validate file (download if URL, or decode if content provided)
-        resolved_path = await resolve_office_file_path(file_path)
+        resolved_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
        validation = await validate_office_file(resolved_path)
        if validation["category"] not in ["excel"]:
@ -178,6 +186,7 @@ class ExcelMixin(MCPMixin):
    async def extract_excel_formulas(
        self,
        file_path: str = Field(description="Path to Excel document or URL"),
        file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
        sheet_names: List[str] = Field(default=[], description="Specific sheets to process (empty = all sheets)"),
        include_values: bool = Field(default=True, description="Include calculated values alongside formulas"),
        analyze_dependencies: bool = Field(default=True, description="Analyze formula dependencies and references")
@ -186,8 +195,8 @@ class ExcelMixin(MCPMixin):
        start_time = time.time()
        import re
-        # Resolve and validate file
+        # Resolve and validate file (download if URL, or decode if content provided)
-        resolved_path = await resolve_office_file_path(file_path)
+        resolved_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
        validation = await validate_office_file(resolved_path)
        if validation["category"] not in ["excel"] or validation["extension"] == ".csv":
@ -288,6 +297,7 @@ class ExcelMixin(MCPMixin):
    async def create_excel_chart_data(
        self,
        file_path: str = Field(description="Path to Excel document or URL"),
        file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
        sheet_name: str = Field(default="", description="Sheet to process (empty = first sheet)"),
        chart_type: str = Field(default="auto", description="Chart type: auto, bar, line, pie, scatter, histogram"),
        x_column: str = Field(default="", description="Column for X-axis (empty = auto-detect)"),
@ -297,8 +307,8 @@ class ExcelMixin(MCPMixin):
        """Generate chart-ready data and configurations from Excel spreadsheets."""
        start_time = time.time()
-        # Resolve and validate file
+        # Resolve and validate file (download if URL, or decode if content provided)
-        resolved_path = await resolve_office_file_path(file_path)
+        resolved_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
        validation = await validate_office_file(resolved_path)
        if validation["category"] not in ["excel"]:
--- a/src/mcwaddams/mixins/universal.py
+++ b/src/mcwaddams/mixins/universal.py
@ -1,7 +1,7 @@
 """Universal Office Tools Mixin - Format-agnostic tools that work across all Office document types."""
 import time
-from typing import Any
+from typing import Any, Optional
 from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool
 from pydantic import Field
@ -17,6 +17,13 @@ from ..utils import (
 from ..resources import resource_store, EmbeddedResource, ResourceStore
 # Common field description for file_content parameter
 FILE_CONTENT_DESC = (
    "Base64-encoded file content (for hosted/HTTP transport). "
    "When provided, file_path is used only for extension detection."
 )
 class UniversalMixin(MCPMixin):
    """Mixin containing format-agnostic tools that work across Word, Excel, PowerPoint, and CSV files."""
@ -27,6 +34,7 @@ class UniversalMixin(MCPMixin):
    async def extract_text(
        self,
        file_path: str = Field(description="Path to Office document or URL"),
        file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
        preserve_formatting: bool = Field(default=False, description="Preserve text formatting and structure"),
        include_metadata: bool = Field(default=True, description="Include document metadata in output"),
        method: str = Field(default="auto", description="Extraction method: auto, primary, fallback")
@ -34,8 +42,8 @@ class UniversalMixin(MCPMixin):
        start_time = time.time()
        try:
-            # Resolve file path (download if URL)
+            # Resolve file path (download if URL, or decode if content provided)
-            local_path = await resolve_office_file_path(file_path)
+            local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
            # Validate file
            validation = await validate_office_file(local_path)
@ -85,6 +93,7 @@ class UniversalMixin(MCPMixin):
    async def extract_images(
        self,
        file_path: str = Field(description="Path to Office document or URL"),
        file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
        min_width: int = Field(default=100, description="Minimum image width in pixels"),
        min_height: int = Field(default=100, description="Minimum image height in pixels"),
        output_format: str = Field(default="png", description="Output image format: png, jpg, jpeg"),
@ -93,8 +102,8 @@ class UniversalMixin(MCPMixin):
        start_time = time.time()
        try:
-            # Resolve file path
+            # Resolve file path (download if URL, or decode if content provided)
-            local_path = await resolve_office_file_path(file_path)
+            local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
            # Validate file
            validation = await validate_office_file(local_path)
@ -135,13 +144,14 @@ class UniversalMixin(MCPMixin):
    )
    async def extract_metadata(
        self,
-        file_path: str = Field(description="Path to Office document or URL")
+        file_path: str = Field(description="Path to Office document or URL"),
        file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC)
    ) -> dict[str, Any]:
        start_time = time.time()
        try:
-            # Resolve file path
+            # Resolve file path (download if URL, or decode if content provided)
-            local_path = await resolve_office_file_path(file_path)
+            local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
            # Validate file
            validation = await validate_office_file(local_path)
@ -175,11 +185,12 @@ class UniversalMixin(MCPMixin):
    )
    async def detect_office_format(
        self,
-        file_path: str = Field(description="Path to Office document or URL")
+        file_path: str = Field(description="Path to Office document or URL"),
        file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC)
    ) -> dict[str, Any]:
        try:
-            # Resolve file path
+            # Resolve file path (download if URL, or decode if content provided)
-            local_path = await resolve_office_file_path(file_path)
+            local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
            # Get comprehensive format detection
            format_info = await detect_format(local_path)
@ -199,13 +210,14 @@ class UniversalMixin(MCPMixin):
    )
    async def analyze_document_health(
        self,
-        file_path: str = Field(description="Path to Office document or URL")
+        file_path: str = Field(description="Path to Office document or URL"),
        file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC)
    ) -> dict[str, Any]:
        start_time = time.time()
        try:
-            # Resolve file path
+            # Resolve file path (download if URL, or decode if content provided)
-            local_path = await resolve_office_file_path(file_path)
+            local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
            # Validate file thoroughly
            validation = await validate_office_file(local_path)
@ -350,6 +362,7 @@ class UniversalMixin(MCPMixin):
    async def index_document(
        self,
        file_path: str = Field(description="Path to Office document or URL"),
        file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
        include_images: bool = Field(default=True, description="Index embedded images"),
        include_chapters: bool = Field(default=True, description="Index chapters/sections (Word docs)"),
        include_sheets: bool = Field(default=True, description="Index sheets (Excel docs)"),
@ -362,8 +375,8 @@ class UniversalMixin(MCPMixin):
        """
        start_time = time.time()
-        # Resolve and validate
+        # Resolve and validate (download if URL, or decode if content provided)
-        local_path = await resolve_office_file_path(file_path)
+        local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
        validation = await validate_office_file(local_path)
        if not validation["is_valid"]:
            raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")
--- a/src/mcwaddams/mixins/word.py
+++ b/src/mcwaddams/mixins/word.py
@ -18,6 +18,13 @@ from ..utils import (
 from ..pagination import paginate_document_conversion, PaginationParams
 # Common field description for file_content parameter
 FILE_CONTENT_DESC = (
    "Base64-encoded file content (for hosted/HTTP transport). "
    "When provided, file_path is used only for extension detection."
 )
 class WordMixin(MCPMixin):
    """Mixin containing Word-specific tools for advanced document processing."""
@ -44,6 +51,7 @@ class WordMixin(MCPMixin):
    async def convert_to_markdown(
        self,
        file_path: str = Field(description="Path to Office document or URL"),
        file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
        include_images: bool = Field(default=True, description="Include images in markdown output. When True, images are extracted to files and linked in the markdown."),
        image_mode: str = Field(default="files", description="Image handling mode: 'files' (default, saves to disk and links), 'base64' (embeds inline - WARNING: can create massive responses), or 'references' (metadata only, no content)"),
        max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding (only used when image_mode='base64')"),
@ -61,8 +69,8 @@ class WordMixin(MCPMixin):
    ) -> dict[str, Any]:
        start_time = time.time()
-        # Resolve file path
+        # Resolve file path (download if URL, or decode if content provided)
-        local_path = await resolve_office_file_path(file_path)
+        local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
        # Validate file
        validation = await validate_office_file(local_path)
@ -275,6 +283,7 @@ class WordMixin(MCPMixin):
    async def extract_word_tables(
        self,
        file_path: str = Field(description="Path to Word document or URL"),
        file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
        include_styling: bool = Field(default=True, description="Include table styling information (borders, alignment, etc.)"),
        output_format: str = Field(default="structured", description="Output format: structured, csv, json, markdown"),
        preserve_merged_cells: bool = Field(default=True, description="Handle merged cells appropriately"),
@ -286,8 +295,8 @@ class WordMixin(MCPMixin):
        import json
        import io
-        # Resolve and validate file
+        # Resolve and validate file (download if URL, or decode if content provided)
-        resolved_path = await resolve_office_file_path(file_path)
+        resolved_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
        validation = await validate_office_file(resolved_path)
        if validation["category"] != "word":
@ -451,6 +460,7 @@ class WordMixin(MCPMixin):
    async def analyze_word_structure(
        self,
        file_path: str = Field(description="Path to Word document or URL"),
        file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
        include_page_info: bool = Field(default=True, description="Include page layout and section information"),
        extract_outline: bool = Field(default=True, description="Extract document outline and heading hierarchy"),
        analyze_styles: bool = Field(default=True, description="Analyze custom styles and formatting patterns")
@ -458,8 +468,8 @@ class WordMixin(MCPMixin):
        """Analyze Word document structure and organization."""
        start_time = time.time()
-        # Resolve and validate file
+        # Resolve and validate file (download if URL, or decode if content provided)
-        resolved_path = await resolve_office_file_path(file_path)
+        resolved_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
        validation = await validate_office_file(resolved_path)
        if validation["category"] != "word":
@ -646,6 +656,7 @@ class WordMixin(MCPMixin):
    async def get_document_outline(
        self,
        file_path: str = Field(description="Path to Word document or URL"),
        file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
        include_word_counts: bool = Field(default=True, description="Include estimated word count per section"),
        detect_chapters: bool = Field(default=True, description="Detect and flag chapter headings specifically")
    ) -> dict[str, Any]:
@ -654,7 +665,7 @@ class WordMixin(MCPMixin):
        from docx.oxml.ns import qn
        start_time = time.time()
-        local_path = await resolve_office_file_path(file_path)
+        local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
        validation = await validate_office_file(local_path)
        if not validation["is_valid"]:
@ -765,13 +776,14 @@ class WordMixin(MCPMixin):
    @handle_office_errors("Style consistency check")
    async def check_style_consistency(
        self,
-        file_path: str = Field(description="Path to Word document or URL")
+        file_path: str = Field(description="Path to Word document or URL"),
        file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC)
    ) -> dict[str, Any]:
        """Check document for style and formatting consistency issues."""
        from docx import Document
        start_time = time.time()
-        local_path = await resolve_office_file_path(file_path)
+        local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
        validation = await validate_office_file(local_path)
        if not validation["is_valid"]:
@ -924,6 +936,7 @@ class WordMixin(MCPMixin):
        self,
        file_path: str = Field(description="Path to Word document or URL"),
        query: str = Field(description="Text to search for (case-insensitive)"),
        file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
        context_chars: int = Field(default=100, description="Number of characters of context before and after match"),
        max_results: int = Field(default=20, description="Maximum number of results to return")
    ) -> dict[str, Any]:
@ -931,7 +944,7 @@ class WordMixin(MCPMixin):
        from docx import Document
        start_time = time.time()
-        local_path = await resolve_office_file_path(file_path)
+        local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
        validation = await validate_office_file(local_path)
        if not validation["is_valid"]:
@ -1009,6 +1022,7 @@ class WordMixin(MCPMixin):
    async def extract_entities(
        self,
        file_path: str = Field(description="Path to Word document or URL"),
        file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
        entity_types: str = Field(default="all", description="Entity types to extract: 'all', 'people', 'places', 'organizations', or comma-separated combination"),
        min_occurrences: int = Field(default=1, description="Minimum occurrences for an entity to be included"),
        include_context: bool = Field(default=True, description="Include sample context for each entity")
@ -1019,7 +1033,7 @@ class WordMixin(MCPMixin):
        import re
        start_time = time.time()
-        local_path = await resolve_office_file_path(file_path)
+        local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
        validation = await validate_office_file(local_path)
        if not validation["is_valid"]:
@ -1219,6 +1233,7 @@ class WordMixin(MCPMixin):
    async def get_chapter_summaries(
        self,
        file_path: str = Field(description="Path to Word document or URL"),
        file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
        sentences_per_chapter: int = Field(default=3, description="Number of opening sentences to include per chapter"),
        include_word_counts: bool = Field(default=True, description="Include word count for each chapter")
    ) -> dict[str, Any]:
@ -1227,7 +1242,7 @@ class WordMixin(MCPMixin):
        import re
        start_time = time.time()
-        local_path = await resolve_office_file_path(file_path)
+        local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
        validation = await validate_office_file(local_path)
        if not validation["is_valid"]:
@ -1318,6 +1333,7 @@ class WordMixin(MCPMixin):
    async def save_reading_progress(
        self,
        file_path: str = Field(description="Path to Word document"),
        file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
        chapter_number: int = Field(default=1, description="Current chapter number"),
        paragraph_index: int = Field(default=0, description="Current paragraph index"),
        notes: str = Field(default="", description="Optional notes about where you left off")
@ -1326,7 +1342,7 @@ class WordMixin(MCPMixin):
        import json
        from datetime import datetime
-        local_path = await resolve_office_file_path(file_path)
+        local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
        validation = await validate_office_file(local_path)
        if not validation["is_valid"]:
@ -1386,12 +1402,13 @@ class WordMixin(MCPMixin):
    @handle_office_errors("Get reading progress")
    async def get_reading_progress(
        self,
-        file_path: str = Field(description="Path to Word document")
+        file_path: str = Field(description="Path to Word document"),
        file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC)
    ) -> dict[str, Any]:
        """Retrieve saved reading progress from bookmark file."""
        import json
-        local_path = await resolve_office_file_path(file_path)
+        local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
        validation = await validate_office_file(local_path)
        if not validation["is_valid"]:
--- a/src/mcwaddams/utils/caching.py
+++ b/src/mcwaddams/utils/caching.py
@ -4,6 +4,7 @@ import os
 import time
 import hashlib
 import tempfile
 import base64
 from pathlib import Path
 from typing import Optional, Dict, Any
 import aiofiles
@ -12,6 +13,11 @@ from urllib.parse import urlparse
 from .validation import OfficeFileError
 # Environment variable to control local file access.
 # Defaults to False (secure); set MCP_ALLOW_LOCAL_FILES to "true" (compared
 # case-insensitively) for local stdio transport. Any other value, including
 # "1" or "yes", leaves local file access disabled.
 # NOTE: the variable is read once, when this module is imported, so changing
 # the environment afterwards does not update this constant.
 MCP_ALLOW_LOCAL_FILES = os.environ.get("MCP_ALLOW_LOCAL_FILES", "false").lower() == "true"
 class OfficeFileCache:
    """Simple file cache for downloaded Office documents."""
@ -212,20 +218,45 @@ def get_cache() -> OfficeFileCache:
    return _global_cache
-async def resolve_office_file_path(file_path: str, use_cache: bool = True) -> str:
+async def resolve_office_file_path(
-    """Resolve file path, downloading from URL if necessary.
+    file_path: str,
    use_cache: bool = True,
    file_content: Optional[str] = None,
    filename: Optional[str] = None
 ) -> str:
    """Resolve file path, downloading from URL if necessary, or decode inline content.
    Args:
-        file_path: Local file path or URL
+        file_path: Local file path or URL (ignored if file_content provided)
        use_cache: Whether to use caching for URLs
        file_content: Base64-encoded file content (for hosted/HTTP transport)
        filename: Original filename for extension detection (used with file_content)
    Returns:
-        Local file path (downloaded if was URL)
+        Local file path (temp file if from content, downloaded if from URL)
    Security:
        When MCP_ALLOW_LOCAL_FILES=false (default for HTTP transport):
        - Local file paths are rejected
        - Only URLs and file_content are allowed
        - This prevents hosted servers from accessing server-side files
    """
    # Priority 1: If file_content is provided, decode and write to temp file
    if file_content:
        return await _resolve_from_content(file_content, filename or file_path)
    # Check if it's a URL
    parsed = urlparse(file_path)
-    if not (parsed.scheme and parsed.netloc):
+    is_url = bool(parsed.scheme and parsed.netloc)
-        # Local file path
+
    if not is_url:
        # Local file path - check if allowed
        if not MCP_ALLOW_LOCAL_FILES:
            raise OfficeFileError(
                "Local file access is disabled for this server. "
                "Please use file_content parameter to upload document data, "
                "or provide a URL. Set MCP_ALLOW_LOCAL_FILES=true to enable local files."
            )
        return file_path
    # Validate URL scheme
@ -247,3 +278,60 @@ async def resolve_office_file_path(file_path: str, use_cache: bool = True) -> st
        # Direct download without caching
        from .validation import download_office_file
        return await download_office_file(file_path)
 async def _resolve_from_content(file_content: str, filename_hint: str) -> str:
    """Decode base64 content and write it to a temp file.

    The file is placed in the "mcp_office_uploads" directory under the
    system temp directory and named after a truncated SHA-256 of the decoded
    bytes, so identical content with the same extension maps to the same path.

    Args:
        file_content: Base64-encoded file data. Whitespace (e.g. line
            wrapping) is ignored; any other character outside the base64
            alphabet is rejected instead of being silently dropped.
        filename_hint: Filename or path to extract the extension from. When
            it has no suffix, the extension is guessed from the decoded bytes.

    Returns:
        Path to the temporary file containing the decoded content.

    Raises:
        OfficeFileError: If file_content is not valid base64.
    """
    try:
        # Remove whitespace first so wrapped base64 still decodes, then decode
        # strictly: the non-validating decoder discards stray characters (for
        # example a "data:...;base64," prefix) and would produce corrupt bytes.
        compact_content = "".join(file_content.split())
        content_bytes = base64.b64decode(compact_content, validate=True)
    except (ValueError, TypeError) as e:
        # binascii.Error is a ValueError subclass; chain the cause for debugging.
        raise OfficeFileError(f"Invalid base64 content: {str(e)}") from e

    # Extract extension from filename hint
    ext = Path(filename_hint).suffix.lower()
    if not ext:
        # Try to detect from content magic bytes
        ext = _detect_extension_from_bytes(content_bytes)

    # Create the upload directory (and any missing parents) on first use
    temp_dir = Path(tempfile.gettempdir()) / "mcp_office_uploads"
    temp_dir.mkdir(parents=True, exist_ok=True)

    # Content-derived filename: first 12 hex digits of the SHA-256 digest
    content_hash = hashlib.sha256(content_bytes).hexdigest()[:12]
    temp_path = temp_dir / f"upload_{content_hash}{ext}"

    # Write content to temp file
    async with aiofiles.open(temp_path, 'wb') as f:
        await f.write(content_bytes)

    return str(temp_path)
 def _detect_extension_from_bytes(content: bytes) -> str:
    """Guess a file extension from the leading bytes of *content*.

    Args:
        content: Raw (already decoded) file bytes.

    Returns:
        ".docx", ".xlsx" or ".pptx" for ZIP-based packages (".docx" when the
        package type cannot be determined), ".doc" for OLE compound
        documents, ".csv" when the data starts with a letter or a quote
        character, and ".bin" otherwise (including for empty input).
    """
    import io
    import zipfile

    # ZIP-based formats (docx, xlsx, pptx)
    if content[:4] == b'PK\x03\x04':
        try:
            with zipfile.ZipFile(io.BytesIO(content)) as archive:
                member_names = archive.namelist()
        except (zipfile.BadZipFile, NotImplementedError, ValueError, OSError, EOFError):
            # Truncated or unusual archive: keep the best-effort default below.
            member_names = []
        # Each OOXML package type keeps its main parts in a distinct folder.
        for folder, extension in (("word/", ".docx"), ("xl/", ".xlsx"), ("ppt/", ".pptx")):
            if any(name.startswith(folder) for name in member_names):
                return extension
        return ".docx"

    # OLE Compound Document (doc, xls, ppt)
    if content[:8] == b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1':
        return ".doc"

    # CSV (text-based, starts with a letter or a quote character).
    # The emptiness guard matters: b'' is "in" every bytes object, so without
    # it empty input would be reported as CSV.
    first_byte = content[:1]
    if first_byte and (first_byte.isalpha() or first_byte in b'"\''):
        return ".csv"

    # Default
    return ".bin"
--- a/tests/test_mixins.py
+++ b/tests/test_mixins.py
@ -324,7 +324,10 @@ class TestMockingStrategies:
                assert result["document_metadata"] == mock_office_file["metadata"]
                # Verify mocks were called correctly
-                mock_resolve.assert_called_once_with(mock_office_file["path"])
+                mock_resolve.assert_called_once()
                # First positional arg should be the file path
                call_args = mock_resolve.call_args
                assert call_args[0][0] == mock_office_file["path"]
                mock_validate.assert_called_once_with(mock_office_file["path"])
                mock_detect.assert_called_once_with(mock_office_file["path"])
--- a/tests/test_universal_mixin.py
+++ b/tests/test_universal_mixin.py
@ -408,7 +408,10 @@ class TestMockingPatterns:
                            assert "structure" in result  # Because preserve_formatting=True
                            # Verify all mocks were called appropriately
-                            mock_resolve.assert_called_once_with("/test/document.docx")
+                            mock_resolve.assert_called_once()
                            # First positional arg should be the file path
                            call_args = mock_resolve.call_args
                            assert call_args[0][0] == "/test/document.docx"
                            mock_validate.assert_called_once_with("/realistic/path/document.docx")
                            mock_detect.assert_called_once_with("/realistic/path/document.docx")
                            mock_extract.assert_called_once()