Add file_content parameter for hosted HTTP transport

- Add MCP_ALLOW_LOCAL_FILES env var (default false for security)
- All tools now accept file_content (base64) for remote document upload
- Local file access blocked on hosted servers unless explicitly enabled
- Update docker-compose to set MCP_ALLOW_LOCAL_FILES=false
- Fix test assertions for updated function signatures
This commit is contained in:
Ryan Malloy 2026-01-20 18:47:19 -07:00
parent 483ed9121b
commit b0477103d5
7 changed files with 197 additions and 64 deletions

View File

@ -15,25 +15,24 @@ services:
- MCP_PORT=8000
- DEBUG=${DEBUG:-false}
- OFFICE_TEMP_DIR=/tmp/mcwaddams
# Security: Disable local file access for hosted server
# Clients must use file_content parameter to upload documents
- MCP_ALLOW_LOCAL_FILES=false
volumes:
# Temp directory for document processing
- mcwaddams-temp:/tmp/mcwaddams
networks:
- caddy
labels:
# Caddy-docker-proxy labels for /mcp endpoint
# Caddy-docker-proxy labels - direct reverse proxy (no path stripping)
# MCP is served at /mcp on the backend
caddy: ${MCWADDAMS_HOST:-mcwaddams.l.supported.systems}
caddy.@mcp.path: /mcp/*
caddy.@mcp.path_strip: /mcp
caddy.handle: "@mcp"
caddy.handle.reverse_proxy: "{{upstreams 8000}}"
caddy.handle.reverse_proxy.flush_interval: "-1"
caddy.handle.reverse_proxy.transport: "http"
caddy.handle.reverse_proxy.transport.read_timeout: "0"
caddy.handle.reverse_proxy.transport.write_timeout: "0"
caddy.handle.reverse_proxy.stream_timeout: "24h"
caddy.handle.reverse_proxy.header_up.Connection: "{http.request.header.Connection}"
caddy.handle.reverse_proxy.header_up.Upgrade: "{http.request.header.Upgrade}"
caddy.reverse_proxy: "{{upstreams 8000}}"
caddy.reverse_proxy.flush_interval: "-1"
caddy.reverse_proxy.transport: "http"
caddy.reverse_proxy.transport.read_timeout: "0"
caddy.reverse_proxy.transport.write_timeout: "0"
caddy.reverse_proxy.stream_timeout: "24h"
healthcheck:
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://127.0.0.1:8000/health')"]
interval: 30s

22
src/mcwaddams/mixins/excel.py Normal file → Executable file
View File

@ -17,6 +17,13 @@ from ..utils import (
)
# Common field description for file_content parameter
FILE_CONTENT_DESC = (
"Base64-encoded file content (for hosted/HTTP transport). "
"When provided, file_path is used only for extension detection."
)
class ExcelMixin(MCPMixin):
"""Mixin containing Excel-specific tools for advanced spreadsheet processing."""
@ -34,6 +41,7 @@ class ExcelMixin(MCPMixin):
async def analyze_excel_data(
self,
file_path: str = Field(description="Path to Excel document or URL"),
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
sheet_names: List[str] = Field(default=[], description="Specific sheets to analyze (empty = all sheets)"),
include_statistics: bool = Field(default=True, description="Include statistical analysis (mean, median, etc.)"),
detect_data_types: bool = Field(default=True, description="Analyze and detect optimal data types"),
@ -42,8 +50,8 @@ class ExcelMixin(MCPMixin):
"""Analyze Excel data with comprehensive statistics and data quality assessment."""
start_time = time.time()
# Resolve and validate file
resolved_path = await resolve_office_file_path(file_path)
# Resolve and validate file (download if URL, or decode if content provided)
resolved_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
validation = await validate_office_file(resolved_path)
if validation["category"] not in ["excel"]:
@ -178,6 +186,7 @@ class ExcelMixin(MCPMixin):
async def extract_excel_formulas(
self,
file_path: str = Field(description="Path to Excel document or URL"),
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
sheet_names: List[str] = Field(default=[], description="Specific sheets to process (empty = all sheets)"),
include_values: bool = Field(default=True, description="Include calculated values alongside formulas"),
analyze_dependencies: bool = Field(default=True, description="Analyze formula dependencies and references")
@ -186,8 +195,8 @@ class ExcelMixin(MCPMixin):
start_time = time.time()
import re
# Resolve and validate file
resolved_path = await resolve_office_file_path(file_path)
# Resolve and validate file (download if URL, or decode if content provided)
resolved_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
validation = await validate_office_file(resolved_path)
if validation["category"] not in ["excel"] or validation["extension"] == ".csv":
@ -288,6 +297,7 @@ class ExcelMixin(MCPMixin):
async def create_excel_chart_data(
self,
file_path: str = Field(description="Path to Excel document or URL"),
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
sheet_name: str = Field(default="", description="Sheet to process (empty = first sheet)"),
chart_type: str = Field(default="auto", description="Chart type: auto, bar, line, pie, scatter, histogram"),
x_column: str = Field(default="", description="Column for X-axis (empty = auto-detect)"),
@ -297,8 +307,8 @@ class ExcelMixin(MCPMixin):
"""Generate chart-ready data and configurations from Excel spreadsheets."""
start_time = time.time()
# Resolve and validate file
resolved_path = await resolve_office_file_path(file_path)
# Resolve and validate file (download if URL, or decode if content provided)
resolved_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
validation = await validate_office_file(resolved_path)
if validation["category"] not in ["excel"]:

45
src/mcwaddams/mixins/universal.py Normal file → Executable file
View File

@ -1,7 +1,7 @@
"""Universal Office Tools Mixin - Format-agnostic tools that work across all Office document types."""
import time
from typing import Any
from typing import Any, Optional
from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool
from pydantic import Field
@ -17,6 +17,13 @@ from ..utils import (
from ..resources import resource_store, EmbeddedResource, ResourceStore
# Common field description for file_content parameter
FILE_CONTENT_DESC = (
"Base64-encoded file content (for hosted/HTTP transport). "
"When provided, file_path is used only for extension detection."
)
class UniversalMixin(MCPMixin):
"""Mixin containing format-agnostic tools that work across Word, Excel, PowerPoint, and CSV files."""
@ -27,6 +34,7 @@ class UniversalMixin(MCPMixin):
async def extract_text(
self,
file_path: str = Field(description="Path to Office document or URL"),
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
preserve_formatting: bool = Field(default=False, description="Preserve text formatting and structure"),
include_metadata: bool = Field(default=True, description="Include document metadata in output"),
method: str = Field(default="auto", description="Extraction method: auto, primary, fallback")
@ -34,8 +42,8 @@ class UniversalMixin(MCPMixin):
start_time = time.time()
try:
# Resolve file path (download if URL)
local_path = await resolve_office_file_path(file_path)
# Resolve file path (download if URL, or decode if content provided)
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
# Validate file
validation = await validate_office_file(local_path)
@ -85,6 +93,7 @@ class UniversalMixin(MCPMixin):
async def extract_images(
self,
file_path: str = Field(description="Path to Office document or URL"),
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
min_width: int = Field(default=100, description="Minimum image width in pixels"),
min_height: int = Field(default=100, description="Minimum image height in pixels"),
output_format: str = Field(default="png", description="Output image format: png, jpg, jpeg"),
@ -93,8 +102,8 @@ class UniversalMixin(MCPMixin):
start_time = time.time()
try:
# Resolve file path
local_path = await resolve_office_file_path(file_path)
# Resolve file path (download if URL, or decode if content provided)
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
# Validate file
validation = await validate_office_file(local_path)
@ -135,13 +144,14 @@ class UniversalMixin(MCPMixin):
)
async def extract_metadata(
self,
file_path: str = Field(description="Path to Office document or URL")
file_path: str = Field(description="Path to Office document or URL"),
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC)
) -> dict[str, Any]:
start_time = time.time()
try:
# Resolve file path
local_path = await resolve_office_file_path(file_path)
# Resolve file path (download if URL, or decode if content provided)
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
# Validate file
validation = await validate_office_file(local_path)
@ -175,11 +185,12 @@ class UniversalMixin(MCPMixin):
)
async def detect_office_format(
self,
file_path: str = Field(description="Path to Office document or URL")
file_path: str = Field(description="Path to Office document or URL"),
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC)
) -> dict[str, Any]:
try:
# Resolve file path
local_path = await resolve_office_file_path(file_path)
# Resolve file path (download if URL, or decode if content provided)
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
# Get comprehensive format detection
format_info = await detect_format(local_path)
@ -199,13 +210,14 @@ class UniversalMixin(MCPMixin):
)
async def analyze_document_health(
self,
file_path: str = Field(description="Path to Office document or URL")
file_path: str = Field(description="Path to Office document or URL"),
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC)
) -> dict[str, Any]:
start_time = time.time()
try:
# Resolve file path
local_path = await resolve_office_file_path(file_path)
# Resolve file path (download if URL, or decode if content provided)
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
# Validate file thoroughly
validation = await validate_office_file(local_path)
@ -350,6 +362,7 @@ class UniversalMixin(MCPMixin):
async def index_document(
self,
file_path: str = Field(description="Path to Office document or URL"),
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
include_images: bool = Field(default=True, description="Index embedded images"),
include_chapters: bool = Field(default=True, description="Index chapters/sections (Word docs)"),
include_sheets: bool = Field(default=True, description="Index sheets (Excel docs)"),
@ -362,8 +375,8 @@ class UniversalMixin(MCPMixin):
"""
start_time = time.time()
# Resolve and validate
local_path = await resolve_office_file_path(file_path)
# Resolve and validate (download if URL, or decode if content provided)
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
validation = await validate_office_file(local_path)
if not validation["is_valid"]:
raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")

47
src/mcwaddams/mixins/word.py Normal file → Executable file
View File

@ -18,6 +18,13 @@ from ..utils import (
from ..pagination import paginate_document_conversion, PaginationParams
# Common field description for file_content parameter
FILE_CONTENT_DESC = (
"Base64-encoded file content (for hosted/HTTP transport). "
"When provided, file_path is used only for extension detection."
)
class WordMixin(MCPMixin):
"""Mixin containing Word-specific tools for advanced document processing."""
@ -44,6 +51,7 @@ class WordMixin(MCPMixin):
async def convert_to_markdown(
self,
file_path: str = Field(description="Path to Office document or URL"),
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
include_images: bool = Field(default=True, description="Include images in markdown output. When True, images are extracted to files and linked in the markdown."),
image_mode: str = Field(default="files", description="Image handling mode: 'files' (default, saves to disk and links), 'base64' (embeds inline - WARNING: can create massive responses), or 'references' (metadata only, no content)"),
max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding (only used when image_mode='base64')"),
@ -61,8 +69,8 @@ class WordMixin(MCPMixin):
) -> dict[str, Any]:
start_time = time.time()
# Resolve file path
local_path = await resolve_office_file_path(file_path)
# Resolve file path (download if URL, or decode if content provided)
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
# Validate file
validation = await validate_office_file(local_path)
@ -275,6 +283,7 @@ class WordMixin(MCPMixin):
async def extract_word_tables(
self,
file_path: str = Field(description="Path to Word document or URL"),
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
include_styling: bool = Field(default=True, description="Include table styling information (borders, alignment, etc.)"),
output_format: str = Field(default="structured", description="Output format: structured, csv, json, markdown"),
preserve_merged_cells: bool = Field(default=True, description="Handle merged cells appropriately"),
@ -286,8 +295,8 @@ class WordMixin(MCPMixin):
import json
import io
# Resolve and validate file
resolved_path = await resolve_office_file_path(file_path)
# Resolve and validate file (download if URL, or decode if content provided)
resolved_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
validation = await validate_office_file(resolved_path)
if validation["category"] != "word":
@ -451,6 +460,7 @@ class WordMixin(MCPMixin):
async def analyze_word_structure(
self,
file_path: str = Field(description="Path to Word document or URL"),
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
include_page_info: bool = Field(default=True, description="Include page layout and section information"),
extract_outline: bool = Field(default=True, description="Extract document outline and heading hierarchy"),
analyze_styles: bool = Field(default=True, description="Analyze custom styles and formatting patterns")
@ -458,8 +468,8 @@ class WordMixin(MCPMixin):
"""Analyze Word document structure and organization."""
start_time = time.time()
# Resolve and validate file
resolved_path = await resolve_office_file_path(file_path)
# Resolve and validate file (download if URL, or decode if content provided)
resolved_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
validation = await validate_office_file(resolved_path)
if validation["category"] != "word":
@ -646,6 +656,7 @@ class WordMixin(MCPMixin):
async def get_document_outline(
self,
file_path: str = Field(description="Path to Word document or URL"),
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
include_word_counts: bool = Field(default=True, description="Include estimated word count per section"),
detect_chapters: bool = Field(default=True, description="Detect and flag chapter headings specifically")
) -> dict[str, Any]:
@ -654,7 +665,7 @@ class WordMixin(MCPMixin):
from docx.oxml.ns import qn
start_time = time.time()
local_path = await resolve_office_file_path(file_path)
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
validation = await validate_office_file(local_path)
if not validation["is_valid"]:
@ -765,13 +776,14 @@ class WordMixin(MCPMixin):
@handle_office_errors("Style consistency check")
async def check_style_consistency(
self,
file_path: str = Field(description="Path to Word document or URL")
file_path: str = Field(description="Path to Word document or URL"),
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC)
) -> dict[str, Any]:
"""Check document for style and formatting consistency issues."""
from docx import Document
start_time = time.time()
local_path = await resolve_office_file_path(file_path)
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
validation = await validate_office_file(local_path)
if not validation["is_valid"]:
@ -924,6 +936,7 @@ class WordMixin(MCPMixin):
self,
file_path: str = Field(description="Path to Word document or URL"),
query: str = Field(description="Text to search for (case-insensitive)"),
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
context_chars: int = Field(default=100, description="Number of characters of context before and after match"),
max_results: int = Field(default=20, description="Maximum number of results to return")
) -> dict[str, Any]:
@ -931,7 +944,7 @@ class WordMixin(MCPMixin):
from docx import Document
start_time = time.time()
local_path = await resolve_office_file_path(file_path)
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
validation = await validate_office_file(local_path)
if not validation["is_valid"]:
@ -1009,6 +1022,7 @@ class WordMixin(MCPMixin):
async def extract_entities(
self,
file_path: str = Field(description="Path to Word document or URL"),
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
entity_types: str = Field(default="all", description="Entity types to extract: 'all', 'people', 'places', 'organizations', or comma-separated combination"),
min_occurrences: int = Field(default=1, description="Minimum occurrences for an entity to be included"),
include_context: bool = Field(default=True, description="Include sample context for each entity")
@ -1019,7 +1033,7 @@ class WordMixin(MCPMixin):
import re
start_time = time.time()
local_path = await resolve_office_file_path(file_path)
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
validation = await validate_office_file(local_path)
if not validation["is_valid"]:
@ -1219,6 +1233,7 @@ class WordMixin(MCPMixin):
async def get_chapter_summaries(
self,
file_path: str = Field(description="Path to Word document or URL"),
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
sentences_per_chapter: int = Field(default=3, description="Number of opening sentences to include per chapter"),
include_word_counts: bool = Field(default=True, description="Include word count for each chapter")
) -> dict[str, Any]:
@ -1227,7 +1242,7 @@ class WordMixin(MCPMixin):
import re
start_time = time.time()
local_path = await resolve_office_file_path(file_path)
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
validation = await validate_office_file(local_path)
if not validation["is_valid"]:
@ -1318,6 +1333,7 @@ class WordMixin(MCPMixin):
async def save_reading_progress(
self,
file_path: str = Field(description="Path to Word document"),
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
chapter_number: int = Field(default=1, description="Current chapter number"),
paragraph_index: int = Field(default=0, description="Current paragraph index"),
notes: str = Field(default="", description="Optional notes about where you left off")
@ -1326,7 +1342,7 @@ class WordMixin(MCPMixin):
import json
from datetime import datetime
local_path = await resolve_office_file_path(file_path)
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
validation = await validate_office_file(local_path)
if not validation["is_valid"]:
@ -1386,12 +1402,13 @@ class WordMixin(MCPMixin):
@handle_office_errors("Get reading progress")
async def get_reading_progress(
self,
file_path: str = Field(description="Path to Word document")
file_path: str = Field(description="Path to Word document"),
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC)
) -> dict[str, Any]:
"""Retrieve saved reading progress from bookmark file."""
import json
local_path = await resolve_office_file_path(file_path)
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
validation = await validate_office_file(local_path)
if not validation["is_valid"]:

View File

@ -4,6 +4,7 @@ import os
import time
import hashlib
import tempfile
import base64
from pathlib import Path
from typing import Optional, Dict, Any
import aiofiles
@ -12,6 +13,11 @@ from urllib.parse import urlparse
from .validation import OfficeFileError
# Environment variable to control local file access
# Default to False (secure) - set to "true" for local stdio transport
MCP_ALLOW_LOCAL_FILES = os.environ.get("MCP_ALLOW_LOCAL_FILES", "false").lower() == "true"
class OfficeFileCache:
"""Simple file cache for downloaded Office documents."""
@ -212,38 +218,120 @@ def get_cache() -> OfficeFileCache:
return _global_cache
async def resolve_office_file_path(file_path: str, use_cache: bool = True) -> str:
"""Resolve file path, downloading from URL if necessary.
async def resolve_office_file_path(
file_path: str,
use_cache: bool = True,
file_content: Optional[str] = None,
filename: Optional[str] = None
) -> str:
"""Resolve file path, downloading from URL if necessary, or decode inline content.
Args:
file_path: Local file path or URL
file_path: Local file path or URL (ignored if file_content provided)
use_cache: Whether to use caching for URLs
file_content: Base64-encoded file content (for hosted/HTTP transport)
filename: Original filename for extension detection (used with file_content)
Returns:
Local file path (downloaded if was URL)
Local file path (temp file if from content, downloaded if from URL)
Security:
When MCP_ALLOW_LOCAL_FILES=false (default for HTTP transport):
- Local file paths are rejected
- Only URLs and file_content are allowed
- This prevents hosted servers from accessing server-side files
"""
# Priority 1: If file_content is provided, decode and write to temp file
if file_content:
return await _resolve_from_content(file_content, filename or file_path)
# Check if it's a URL
parsed = urlparse(file_path)
if not (parsed.scheme and parsed.netloc):
# Local file path
is_url = bool(parsed.scheme and parsed.netloc)
if not is_url:
# Local file path - check if allowed
if not MCP_ALLOW_LOCAL_FILES:
raise OfficeFileError(
"Local file access is disabled for this server. "
"Please use file_content parameter to upload document data, "
"or provide a URL. Set MCP_ALLOW_LOCAL_FILES=true to enable local files."
)
return file_path
# Validate URL scheme
if parsed.scheme not in ['http', 'https']:
raise OfficeFileError(f"Unsupported URL scheme: {parsed.scheme}")
cache = get_cache()
# Check cache first
if use_cache and cache.is_cached(file_path):
cached_path = cache.get_cached_path(file_path)
if cached_path:
return cached_path
# Download and cache
if use_cache:
return await cache.cache_url(file_path)
else:
# Direct download without caching
from .validation import download_office_file
return await download_office_file(file_path)
return await download_office_file(file_path)
async def _resolve_from_content(file_content: str, filename_hint: str) -> str:
    """Decode base64 content and write it to a temp file.

    Args:
        file_content: Base64-encoded file data.
        filename_hint: Filename or path whose suffix supplies the extension
            of the temp file. When it has no suffix, the extension is
            guessed from the decoded bytes.

    Returns:
        Path (as a string) to the temporary file holding the decoded content.

    Raises:
        OfficeFileError: If ``file_content`` cannot be base64-decoded.
    """
    try:
        content_bytes = base64.b64decode(file_content)
    except (ValueError, TypeError) as e:
        # binascii.Error (bad padding/alphabet) is a ValueError subclass;
        # TypeError covers non-string input. Chain the cause so the original
        # traceback is preserved for debugging.
        raise OfficeFileError(f"Invalid base64 content: {str(e)}") from e

    # Prefer the extension carried by the filename hint; fall back to
    # sniffing the magic bytes when the hint has no suffix.
    ext = Path(filename_hint).suffix.lower()
    if not ext:
        ext = _detect_extension_from_bytes(content_bytes)

    # Uploads live in a dedicated subdirectory of the system temp dir.
    # parents=True keeps this from failing if the temp root is missing.
    temp_dir = Path(tempfile.gettempdir()) / "mcp_office_uploads"
    temp_dir.mkdir(parents=True, exist_ok=True)

    # The name is derived from the content hash, so identical uploads with
    # the same extension map to the same path.
    content_hash = hashlib.sha256(content_bytes).hexdigest()[:12]
    temp_path = temp_dir / f"upload_{content_hash}{ext}"

    async with aiofiles.open(temp_path, 'wb') as f:
        await f.write(content_bytes)

    return str(temp_path)
def _detect_extension_from_bytes(content: bytes) -> str:
"""Detect file extension from magic bytes."""
# ZIP-based formats (docx, xlsx, pptx)
if content[:4] == b'PK\x03\x04':
# Could be docx, xlsx, or pptx - default to .docx
# Full detection would require reading internal XML
return ".docx"
# OLE Compound Document (doc, xls, ppt)
if content[:8] == b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1':
return ".doc"
# CSV (text-based, starts with printable characters)
if content[:1].isalpha() or content[:1] in b'"\'':
return ".csv"
# Default
return ".bin"

View File

@ -324,7 +324,10 @@ class TestMockingStrategies:
assert result["document_metadata"] == mock_office_file["metadata"]
# Verify mocks were called correctly
mock_resolve.assert_called_once_with(mock_office_file["path"])
mock_resolve.assert_called_once()
# First positional arg should be the file path
call_args = mock_resolve.call_args
assert call_args[0][0] == mock_office_file["path"]
mock_validate.assert_called_once_with(mock_office_file["path"])
mock_detect.assert_called_once_with(mock_office_file["path"])

View File

@ -408,7 +408,10 @@ class TestMockingPatterns:
assert "structure" in result # Because preserve_formatting=True
# Verify all mocks were called appropriately
mock_resolve.assert_called_once_with("/test/document.docx")
mock_resolve.assert_called_once()
# First positional arg should be the file path
call_args = mock_resolve.call_args
assert call_args[0][0] == "/test/document.docx"
mock_validate.assert_called_once_with("/realistic/path/document.docx")
mock_detect.assert_called_once_with("/realistic/path/document.docx")
mock_extract.assert_called_once()