Add file_content parameter for hosted HTTP transport
- Add MCP_ALLOW_LOCAL_FILES env var (defaults to false for security)
- All tools now accept file_content (base64) for remote document upload
- Local file access is blocked on hosted servers unless explicitly enabled
- Update docker-compose to set MCP_ALLOW_LOCAL_FILES=false
- Fix test assertions for updated function signatures
This commit is contained in:
parent
483ed9121b
commit
b0477103d5
@ -15,25 +15,24 @@ services:
|
|||||||
- MCP_PORT=8000
|
- MCP_PORT=8000
|
||||||
- DEBUG=${DEBUG:-false}
|
- DEBUG=${DEBUG:-false}
|
||||||
- OFFICE_TEMP_DIR=/tmp/mcwaddams
|
- OFFICE_TEMP_DIR=/tmp/mcwaddams
|
||||||
|
# Security: Disable local file access for hosted server
|
||||||
|
# Clients must use file_content parameter to upload documents
|
||||||
|
- MCP_ALLOW_LOCAL_FILES=false
|
||||||
volumes:
|
volumes:
|
||||||
# Temp directory for document processing
|
# Temp directory for document processing
|
||||||
- mcwaddams-temp:/tmp/mcwaddams
|
- mcwaddams-temp:/tmp/mcwaddams
|
||||||
networks:
|
networks:
|
||||||
- caddy
|
- caddy
|
||||||
labels:
|
labels:
|
||||||
# Caddy-docker-proxy labels for /mcp endpoint
|
# Caddy-docker-proxy labels - direct reverse proxy (no path stripping)
|
||||||
|
# MCP is served at /mcp on the backend
|
||||||
caddy: ${MCWADDAMS_HOST:-mcwaddams.l.supported.systems}
|
caddy: ${MCWADDAMS_HOST:-mcwaddams.l.supported.systems}
|
||||||
caddy.@mcp.path: /mcp/*
|
caddy.reverse_proxy: "{{upstreams 8000}}"
|
||||||
caddy.@mcp.path_strip: /mcp
|
caddy.reverse_proxy.flush_interval: "-1"
|
||||||
caddy.handle: "@mcp"
|
caddy.reverse_proxy.transport: "http"
|
||||||
caddy.handle.reverse_proxy: "{{upstreams 8000}}"
|
caddy.reverse_proxy.transport.read_timeout: "0"
|
||||||
caddy.handle.reverse_proxy.flush_interval: "-1"
|
caddy.reverse_proxy.transport.write_timeout: "0"
|
||||||
caddy.handle.reverse_proxy.transport: "http"
|
caddy.reverse_proxy.stream_timeout: "24h"
|
||||||
caddy.handle.reverse_proxy.transport.read_timeout: "0"
|
|
||||||
caddy.handle.reverse_proxy.transport.write_timeout: "0"
|
|
||||||
caddy.handle.reverse_proxy.stream_timeout: "24h"
|
|
||||||
caddy.handle.reverse_proxy.header_up.Connection: "{http.request.header.Connection}"
|
|
||||||
caddy.handle.reverse_proxy.header_up.Upgrade: "{http.request.header.Upgrade}"
|
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://127.0.0.1:8000/health')"]
|
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://127.0.0.1:8000/health')"]
|
||||||
interval: 30s
|
interval: 30s
|
||||||
|
|||||||
22
src/mcwaddams/mixins/excel.py
Normal file → Executable file
22
src/mcwaddams/mixins/excel.py
Normal file → Executable file
@ -17,6 +17,13 @@ from ..utils import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Common field description for file_content parameter
|
||||||
|
FILE_CONTENT_DESC = (
|
||||||
|
"Base64-encoded file content (for hosted/HTTP transport). "
|
||||||
|
"When provided, file_path is used only for extension detection."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class ExcelMixin(MCPMixin):
|
class ExcelMixin(MCPMixin):
|
||||||
"""Mixin containing Excel-specific tools for advanced spreadsheet processing."""
|
"""Mixin containing Excel-specific tools for advanced spreadsheet processing."""
|
||||||
|
|
||||||
@ -34,6 +41,7 @@ class ExcelMixin(MCPMixin):
|
|||||||
async def analyze_excel_data(
|
async def analyze_excel_data(
|
||||||
self,
|
self,
|
||||||
file_path: str = Field(description="Path to Excel document or URL"),
|
file_path: str = Field(description="Path to Excel document or URL"),
|
||||||
|
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
|
||||||
sheet_names: List[str] = Field(default=[], description="Specific sheets to analyze (empty = all sheets)"),
|
sheet_names: List[str] = Field(default=[], description="Specific sheets to analyze (empty = all sheets)"),
|
||||||
include_statistics: bool = Field(default=True, description="Include statistical analysis (mean, median, etc.)"),
|
include_statistics: bool = Field(default=True, description="Include statistical analysis (mean, median, etc.)"),
|
||||||
detect_data_types: bool = Field(default=True, description="Analyze and detect optimal data types"),
|
detect_data_types: bool = Field(default=True, description="Analyze and detect optimal data types"),
|
||||||
@ -42,8 +50,8 @@ class ExcelMixin(MCPMixin):
|
|||||||
"""Analyze Excel data with comprehensive statistics and data quality assessment."""
|
"""Analyze Excel data with comprehensive statistics and data quality assessment."""
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
# Resolve and validate file
|
# Resolve and validate file (download if URL, or decode if content provided)
|
||||||
resolved_path = await resolve_office_file_path(file_path)
|
resolved_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
|
||||||
validation = await validate_office_file(resolved_path)
|
validation = await validate_office_file(resolved_path)
|
||||||
|
|
||||||
if validation["category"] not in ["excel"]:
|
if validation["category"] not in ["excel"]:
|
||||||
@ -178,6 +186,7 @@ class ExcelMixin(MCPMixin):
|
|||||||
async def extract_excel_formulas(
|
async def extract_excel_formulas(
|
||||||
self,
|
self,
|
||||||
file_path: str = Field(description="Path to Excel document or URL"),
|
file_path: str = Field(description="Path to Excel document or URL"),
|
||||||
|
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
|
||||||
sheet_names: List[str] = Field(default=[], description="Specific sheets to process (empty = all sheets)"),
|
sheet_names: List[str] = Field(default=[], description="Specific sheets to process (empty = all sheets)"),
|
||||||
include_values: bool = Field(default=True, description="Include calculated values alongside formulas"),
|
include_values: bool = Field(default=True, description="Include calculated values alongside formulas"),
|
||||||
analyze_dependencies: bool = Field(default=True, description="Analyze formula dependencies and references")
|
analyze_dependencies: bool = Field(default=True, description="Analyze formula dependencies and references")
|
||||||
@ -186,8 +195,8 @@ class ExcelMixin(MCPMixin):
|
|||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
import re
|
import re
|
||||||
|
|
||||||
# Resolve and validate file
|
# Resolve and validate file (download if URL, or decode if content provided)
|
||||||
resolved_path = await resolve_office_file_path(file_path)
|
resolved_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
|
||||||
validation = await validate_office_file(resolved_path)
|
validation = await validate_office_file(resolved_path)
|
||||||
|
|
||||||
if validation["category"] not in ["excel"] or validation["extension"] == ".csv":
|
if validation["category"] not in ["excel"] or validation["extension"] == ".csv":
|
||||||
@ -288,6 +297,7 @@ class ExcelMixin(MCPMixin):
|
|||||||
async def create_excel_chart_data(
|
async def create_excel_chart_data(
|
||||||
self,
|
self,
|
||||||
file_path: str = Field(description="Path to Excel document or URL"),
|
file_path: str = Field(description="Path to Excel document or URL"),
|
||||||
|
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
|
||||||
sheet_name: str = Field(default="", description="Sheet to process (empty = first sheet)"),
|
sheet_name: str = Field(default="", description="Sheet to process (empty = first sheet)"),
|
||||||
chart_type: str = Field(default="auto", description="Chart type: auto, bar, line, pie, scatter, histogram"),
|
chart_type: str = Field(default="auto", description="Chart type: auto, bar, line, pie, scatter, histogram"),
|
||||||
x_column: str = Field(default="", description="Column for X-axis (empty = auto-detect)"),
|
x_column: str = Field(default="", description="Column for X-axis (empty = auto-detect)"),
|
||||||
@ -297,8 +307,8 @@ class ExcelMixin(MCPMixin):
|
|||||||
"""Generate chart-ready data and configurations from Excel spreadsheets."""
|
"""Generate chart-ready data and configurations from Excel spreadsheets."""
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
# Resolve and validate file
|
# Resolve and validate file (download if URL, or decode if content provided)
|
||||||
resolved_path = await resolve_office_file_path(file_path)
|
resolved_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
|
||||||
validation = await validate_office_file(resolved_path)
|
validation = await validate_office_file(resolved_path)
|
||||||
|
|
||||||
if validation["category"] not in ["excel"]:
|
if validation["category"] not in ["excel"]:
|
||||||
|
|||||||
45
src/mcwaddams/mixins/universal.py
Normal file → Executable file
45
src/mcwaddams/mixins/universal.py
Normal file → Executable file
@ -1,7 +1,7 @@
|
|||||||
"""Universal Office Tools Mixin - Format-agnostic tools that work across all Office document types."""
|
"""Universal Office Tools Mixin - Format-agnostic tools that work across all Office document types."""
|
||||||
|
|
||||||
import time
|
import time
|
||||||
from typing import Any
|
from typing import Any, Optional
|
||||||
|
|
||||||
from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool
|
from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool
|
||||||
from pydantic import Field
|
from pydantic import Field
|
||||||
@ -17,6 +17,13 @@ from ..utils import (
|
|||||||
from ..resources import resource_store, EmbeddedResource, ResourceStore
|
from ..resources import resource_store, EmbeddedResource, ResourceStore
|
||||||
|
|
||||||
|
|
||||||
|
# Common field description for file_content parameter
|
||||||
|
FILE_CONTENT_DESC = (
|
||||||
|
"Base64-encoded file content (for hosted/HTTP transport). "
|
||||||
|
"When provided, file_path is used only for extension detection."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class UniversalMixin(MCPMixin):
|
class UniversalMixin(MCPMixin):
|
||||||
"""Mixin containing format-agnostic tools that work across Word, Excel, PowerPoint, and CSV files."""
|
"""Mixin containing format-agnostic tools that work across Word, Excel, PowerPoint, and CSV files."""
|
||||||
|
|
||||||
@ -27,6 +34,7 @@ class UniversalMixin(MCPMixin):
|
|||||||
async def extract_text(
|
async def extract_text(
|
||||||
self,
|
self,
|
||||||
file_path: str = Field(description="Path to Office document or URL"),
|
file_path: str = Field(description="Path to Office document or URL"),
|
||||||
|
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
|
||||||
preserve_formatting: bool = Field(default=False, description="Preserve text formatting and structure"),
|
preserve_formatting: bool = Field(default=False, description="Preserve text formatting and structure"),
|
||||||
include_metadata: bool = Field(default=True, description="Include document metadata in output"),
|
include_metadata: bool = Field(default=True, description="Include document metadata in output"),
|
||||||
method: str = Field(default="auto", description="Extraction method: auto, primary, fallback")
|
method: str = Field(default="auto", description="Extraction method: auto, primary, fallback")
|
||||||
@ -34,8 +42,8 @@ class UniversalMixin(MCPMixin):
|
|||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Resolve file path (download if URL)
|
# Resolve file path (download if URL, or decode if content provided)
|
||||||
local_path = await resolve_office_file_path(file_path)
|
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
|
||||||
|
|
||||||
# Validate file
|
# Validate file
|
||||||
validation = await validate_office_file(local_path)
|
validation = await validate_office_file(local_path)
|
||||||
@ -85,6 +93,7 @@ class UniversalMixin(MCPMixin):
|
|||||||
async def extract_images(
|
async def extract_images(
|
||||||
self,
|
self,
|
||||||
file_path: str = Field(description="Path to Office document or URL"),
|
file_path: str = Field(description="Path to Office document or URL"),
|
||||||
|
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
|
||||||
min_width: int = Field(default=100, description="Minimum image width in pixels"),
|
min_width: int = Field(default=100, description="Minimum image width in pixels"),
|
||||||
min_height: int = Field(default=100, description="Minimum image height in pixels"),
|
min_height: int = Field(default=100, description="Minimum image height in pixels"),
|
||||||
output_format: str = Field(default="png", description="Output image format: png, jpg, jpeg"),
|
output_format: str = Field(default="png", description="Output image format: png, jpg, jpeg"),
|
||||||
@ -93,8 +102,8 @@ class UniversalMixin(MCPMixin):
|
|||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Resolve file path
|
# Resolve file path (download if URL, or decode if content provided)
|
||||||
local_path = await resolve_office_file_path(file_path)
|
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
|
||||||
|
|
||||||
# Validate file
|
# Validate file
|
||||||
validation = await validate_office_file(local_path)
|
validation = await validate_office_file(local_path)
|
||||||
@ -135,13 +144,14 @@ class UniversalMixin(MCPMixin):
|
|||||||
)
|
)
|
||||||
async def extract_metadata(
|
async def extract_metadata(
|
||||||
self,
|
self,
|
||||||
file_path: str = Field(description="Path to Office document or URL")
|
file_path: str = Field(description="Path to Office document or URL"),
|
||||||
|
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC)
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Resolve file path
|
# Resolve file path (download if URL, or decode if content provided)
|
||||||
local_path = await resolve_office_file_path(file_path)
|
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
|
||||||
|
|
||||||
# Validate file
|
# Validate file
|
||||||
validation = await validate_office_file(local_path)
|
validation = await validate_office_file(local_path)
|
||||||
@ -175,11 +185,12 @@ class UniversalMixin(MCPMixin):
|
|||||||
)
|
)
|
||||||
async def detect_office_format(
|
async def detect_office_format(
|
||||||
self,
|
self,
|
||||||
file_path: str = Field(description="Path to Office document or URL")
|
file_path: str = Field(description="Path to Office document or URL"),
|
||||||
|
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC)
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
try:
|
try:
|
||||||
# Resolve file path
|
# Resolve file path (download if URL, or decode if content provided)
|
||||||
local_path = await resolve_office_file_path(file_path)
|
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
|
||||||
|
|
||||||
# Get comprehensive format detection
|
# Get comprehensive format detection
|
||||||
format_info = await detect_format(local_path)
|
format_info = await detect_format(local_path)
|
||||||
@ -199,13 +210,14 @@ class UniversalMixin(MCPMixin):
|
|||||||
)
|
)
|
||||||
async def analyze_document_health(
|
async def analyze_document_health(
|
||||||
self,
|
self,
|
||||||
file_path: str = Field(description="Path to Office document or URL")
|
file_path: str = Field(description="Path to Office document or URL"),
|
||||||
|
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC)
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Resolve file path
|
# Resolve file path (download if URL, or decode if content provided)
|
||||||
local_path = await resolve_office_file_path(file_path)
|
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
|
||||||
|
|
||||||
# Validate file thoroughly
|
# Validate file thoroughly
|
||||||
validation = await validate_office_file(local_path)
|
validation = await validate_office_file(local_path)
|
||||||
@ -350,6 +362,7 @@ class UniversalMixin(MCPMixin):
|
|||||||
async def index_document(
|
async def index_document(
|
||||||
self,
|
self,
|
||||||
file_path: str = Field(description="Path to Office document or URL"),
|
file_path: str = Field(description="Path to Office document or URL"),
|
||||||
|
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
|
||||||
include_images: bool = Field(default=True, description="Index embedded images"),
|
include_images: bool = Field(default=True, description="Index embedded images"),
|
||||||
include_chapters: bool = Field(default=True, description="Index chapters/sections (Word docs)"),
|
include_chapters: bool = Field(default=True, description="Index chapters/sections (Word docs)"),
|
||||||
include_sheets: bool = Field(default=True, description="Index sheets (Excel docs)"),
|
include_sheets: bool = Field(default=True, description="Index sheets (Excel docs)"),
|
||||||
@ -362,8 +375,8 @@ class UniversalMixin(MCPMixin):
|
|||||||
"""
|
"""
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
# Resolve and validate
|
# Resolve and validate (download if URL, or decode if content provided)
|
||||||
local_path = await resolve_office_file_path(file_path)
|
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
|
||||||
validation = await validate_office_file(local_path)
|
validation = await validate_office_file(local_path)
|
||||||
if not validation["is_valid"]:
|
if not validation["is_valid"]:
|
||||||
raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")
|
raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")
|
||||||
|
|||||||
47
src/mcwaddams/mixins/word.py
Normal file → Executable file
47
src/mcwaddams/mixins/word.py
Normal file → Executable file
@ -18,6 +18,13 @@ from ..utils import (
|
|||||||
from ..pagination import paginate_document_conversion, PaginationParams
|
from ..pagination import paginate_document_conversion, PaginationParams
|
||||||
|
|
||||||
|
|
||||||
|
# Common field description for file_content parameter
|
||||||
|
FILE_CONTENT_DESC = (
|
||||||
|
"Base64-encoded file content (for hosted/HTTP transport). "
|
||||||
|
"When provided, file_path is used only for extension detection."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class WordMixin(MCPMixin):
|
class WordMixin(MCPMixin):
|
||||||
"""Mixin containing Word-specific tools for advanced document processing."""
|
"""Mixin containing Word-specific tools for advanced document processing."""
|
||||||
|
|
||||||
@ -44,6 +51,7 @@ class WordMixin(MCPMixin):
|
|||||||
async def convert_to_markdown(
|
async def convert_to_markdown(
|
||||||
self,
|
self,
|
||||||
file_path: str = Field(description="Path to Office document or URL"),
|
file_path: str = Field(description="Path to Office document or URL"),
|
||||||
|
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
|
||||||
include_images: bool = Field(default=True, description="Include images in markdown output. When True, images are extracted to files and linked in the markdown."),
|
include_images: bool = Field(default=True, description="Include images in markdown output. When True, images are extracted to files and linked in the markdown."),
|
||||||
image_mode: str = Field(default="files", description="Image handling mode: 'files' (default, saves to disk and links), 'base64' (embeds inline - WARNING: can create massive responses), or 'references' (metadata only, no content)"),
|
image_mode: str = Field(default="files", description="Image handling mode: 'files' (default, saves to disk and links), 'base64' (embeds inline - WARNING: can create massive responses), or 'references' (metadata only, no content)"),
|
||||||
max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding (only used when image_mode='base64')"),
|
max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding (only used when image_mode='base64')"),
|
||||||
@ -61,8 +69,8 @@ class WordMixin(MCPMixin):
|
|||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
# Resolve file path
|
# Resolve file path (download if URL, or decode if content provided)
|
||||||
local_path = await resolve_office_file_path(file_path)
|
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
|
||||||
|
|
||||||
# Validate file
|
# Validate file
|
||||||
validation = await validate_office_file(local_path)
|
validation = await validate_office_file(local_path)
|
||||||
@ -275,6 +283,7 @@ class WordMixin(MCPMixin):
|
|||||||
async def extract_word_tables(
|
async def extract_word_tables(
|
||||||
self,
|
self,
|
||||||
file_path: str = Field(description="Path to Word document or URL"),
|
file_path: str = Field(description="Path to Word document or URL"),
|
||||||
|
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
|
||||||
include_styling: bool = Field(default=True, description="Include table styling information (borders, alignment, etc.)"),
|
include_styling: bool = Field(default=True, description="Include table styling information (borders, alignment, etc.)"),
|
||||||
output_format: str = Field(default="structured", description="Output format: structured, csv, json, markdown"),
|
output_format: str = Field(default="structured", description="Output format: structured, csv, json, markdown"),
|
||||||
preserve_merged_cells: bool = Field(default=True, description="Handle merged cells appropriately"),
|
preserve_merged_cells: bool = Field(default=True, description="Handle merged cells appropriately"),
|
||||||
@ -286,8 +295,8 @@ class WordMixin(MCPMixin):
|
|||||||
import json
|
import json
|
||||||
import io
|
import io
|
||||||
|
|
||||||
# Resolve and validate file
|
# Resolve and validate file (download if URL, or decode if content provided)
|
||||||
resolved_path = await resolve_office_file_path(file_path)
|
resolved_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
|
||||||
validation = await validate_office_file(resolved_path)
|
validation = await validate_office_file(resolved_path)
|
||||||
|
|
||||||
if validation["category"] != "word":
|
if validation["category"] != "word":
|
||||||
@ -451,6 +460,7 @@ class WordMixin(MCPMixin):
|
|||||||
async def analyze_word_structure(
|
async def analyze_word_structure(
|
||||||
self,
|
self,
|
||||||
file_path: str = Field(description="Path to Word document or URL"),
|
file_path: str = Field(description="Path to Word document or URL"),
|
||||||
|
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
|
||||||
include_page_info: bool = Field(default=True, description="Include page layout and section information"),
|
include_page_info: bool = Field(default=True, description="Include page layout and section information"),
|
||||||
extract_outline: bool = Field(default=True, description="Extract document outline and heading hierarchy"),
|
extract_outline: bool = Field(default=True, description="Extract document outline and heading hierarchy"),
|
||||||
analyze_styles: bool = Field(default=True, description="Analyze custom styles and formatting patterns")
|
analyze_styles: bool = Field(default=True, description="Analyze custom styles and formatting patterns")
|
||||||
@ -458,8 +468,8 @@ class WordMixin(MCPMixin):
|
|||||||
"""Analyze Word document structure and organization."""
|
"""Analyze Word document structure and organization."""
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
# Resolve and validate file
|
# Resolve and validate file (download if URL, or decode if content provided)
|
||||||
resolved_path = await resolve_office_file_path(file_path)
|
resolved_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
|
||||||
validation = await validate_office_file(resolved_path)
|
validation = await validate_office_file(resolved_path)
|
||||||
|
|
||||||
if validation["category"] != "word":
|
if validation["category"] != "word":
|
||||||
@ -646,6 +656,7 @@ class WordMixin(MCPMixin):
|
|||||||
async def get_document_outline(
|
async def get_document_outline(
|
||||||
self,
|
self,
|
||||||
file_path: str = Field(description="Path to Word document or URL"),
|
file_path: str = Field(description="Path to Word document or URL"),
|
||||||
|
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
|
||||||
include_word_counts: bool = Field(default=True, description="Include estimated word count per section"),
|
include_word_counts: bool = Field(default=True, description="Include estimated word count per section"),
|
||||||
detect_chapters: bool = Field(default=True, description="Detect and flag chapter headings specifically")
|
detect_chapters: bool = Field(default=True, description="Detect and flag chapter headings specifically")
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
@ -654,7 +665,7 @@ class WordMixin(MCPMixin):
|
|||||||
from docx.oxml.ns import qn
|
from docx.oxml.ns import qn
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
local_path = await resolve_office_file_path(file_path)
|
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
|
||||||
|
|
||||||
validation = await validate_office_file(local_path)
|
validation = await validate_office_file(local_path)
|
||||||
if not validation["is_valid"]:
|
if not validation["is_valid"]:
|
||||||
@ -765,13 +776,14 @@ class WordMixin(MCPMixin):
|
|||||||
@handle_office_errors("Style consistency check")
|
@handle_office_errors("Style consistency check")
|
||||||
async def check_style_consistency(
|
async def check_style_consistency(
|
||||||
self,
|
self,
|
||||||
file_path: str = Field(description="Path to Word document or URL")
|
file_path: str = Field(description="Path to Word document or URL"),
|
||||||
|
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC)
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
"""Check document for style and formatting consistency issues."""
|
"""Check document for style and formatting consistency issues."""
|
||||||
from docx import Document
|
from docx import Document
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
local_path = await resolve_office_file_path(file_path)
|
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
|
||||||
|
|
||||||
validation = await validate_office_file(local_path)
|
validation = await validate_office_file(local_path)
|
||||||
if not validation["is_valid"]:
|
if not validation["is_valid"]:
|
||||||
@ -924,6 +936,7 @@ class WordMixin(MCPMixin):
|
|||||||
self,
|
self,
|
||||||
file_path: str = Field(description="Path to Word document or URL"),
|
file_path: str = Field(description="Path to Word document or URL"),
|
||||||
query: str = Field(description="Text to search for (case-insensitive)"),
|
query: str = Field(description="Text to search for (case-insensitive)"),
|
||||||
|
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
|
||||||
context_chars: int = Field(default=100, description="Number of characters of context before and after match"),
|
context_chars: int = Field(default=100, description="Number of characters of context before and after match"),
|
||||||
max_results: int = Field(default=20, description="Maximum number of results to return")
|
max_results: int = Field(default=20, description="Maximum number of results to return")
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
@ -931,7 +944,7 @@ class WordMixin(MCPMixin):
|
|||||||
from docx import Document
|
from docx import Document
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
local_path = await resolve_office_file_path(file_path)
|
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
|
||||||
|
|
||||||
validation = await validate_office_file(local_path)
|
validation = await validate_office_file(local_path)
|
||||||
if not validation["is_valid"]:
|
if not validation["is_valid"]:
|
||||||
@ -1009,6 +1022,7 @@ class WordMixin(MCPMixin):
|
|||||||
async def extract_entities(
|
async def extract_entities(
|
||||||
self,
|
self,
|
||||||
file_path: str = Field(description="Path to Word document or URL"),
|
file_path: str = Field(description="Path to Word document or URL"),
|
||||||
|
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
|
||||||
entity_types: str = Field(default="all", description="Entity types to extract: 'all', 'people', 'places', 'organizations', or comma-separated combination"),
|
entity_types: str = Field(default="all", description="Entity types to extract: 'all', 'people', 'places', 'organizations', or comma-separated combination"),
|
||||||
min_occurrences: int = Field(default=1, description="Minimum occurrences for an entity to be included"),
|
min_occurrences: int = Field(default=1, description="Minimum occurrences for an entity to be included"),
|
||||||
include_context: bool = Field(default=True, description="Include sample context for each entity")
|
include_context: bool = Field(default=True, description="Include sample context for each entity")
|
||||||
@ -1019,7 +1033,7 @@ class WordMixin(MCPMixin):
|
|||||||
import re
|
import re
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
local_path = await resolve_office_file_path(file_path)
|
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
|
||||||
|
|
||||||
validation = await validate_office_file(local_path)
|
validation = await validate_office_file(local_path)
|
||||||
if not validation["is_valid"]:
|
if not validation["is_valid"]:
|
||||||
@ -1219,6 +1233,7 @@ class WordMixin(MCPMixin):
|
|||||||
async def get_chapter_summaries(
|
async def get_chapter_summaries(
|
||||||
self,
|
self,
|
||||||
file_path: str = Field(description="Path to Word document or URL"),
|
file_path: str = Field(description="Path to Word document or URL"),
|
||||||
|
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
|
||||||
sentences_per_chapter: int = Field(default=3, description="Number of opening sentences to include per chapter"),
|
sentences_per_chapter: int = Field(default=3, description="Number of opening sentences to include per chapter"),
|
||||||
include_word_counts: bool = Field(default=True, description="Include word count for each chapter")
|
include_word_counts: bool = Field(default=True, description="Include word count for each chapter")
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
@ -1227,7 +1242,7 @@ class WordMixin(MCPMixin):
|
|||||||
import re
|
import re
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
local_path = await resolve_office_file_path(file_path)
|
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
|
||||||
|
|
||||||
validation = await validate_office_file(local_path)
|
validation = await validate_office_file(local_path)
|
||||||
if not validation["is_valid"]:
|
if not validation["is_valid"]:
|
||||||
@ -1318,6 +1333,7 @@ class WordMixin(MCPMixin):
|
|||||||
async def save_reading_progress(
|
async def save_reading_progress(
|
||||||
self,
|
self,
|
||||||
file_path: str = Field(description="Path to Word document"),
|
file_path: str = Field(description="Path to Word document"),
|
||||||
|
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC),
|
||||||
chapter_number: int = Field(default=1, description="Current chapter number"),
|
chapter_number: int = Field(default=1, description="Current chapter number"),
|
||||||
paragraph_index: int = Field(default=0, description="Current paragraph index"),
|
paragraph_index: int = Field(default=0, description="Current paragraph index"),
|
||||||
notes: str = Field(default="", description="Optional notes about where you left off")
|
notes: str = Field(default="", description="Optional notes about where you left off")
|
||||||
@ -1326,7 +1342,7 @@ class WordMixin(MCPMixin):
|
|||||||
import json
|
import json
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
local_path = await resolve_office_file_path(file_path)
|
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
|
||||||
|
|
||||||
validation = await validate_office_file(local_path)
|
validation = await validate_office_file(local_path)
|
||||||
if not validation["is_valid"]:
|
if not validation["is_valid"]:
|
||||||
@ -1386,12 +1402,13 @@ class WordMixin(MCPMixin):
|
|||||||
@handle_office_errors("Get reading progress")
|
@handle_office_errors("Get reading progress")
|
||||||
async def get_reading_progress(
|
async def get_reading_progress(
|
||||||
self,
|
self,
|
||||||
file_path: str = Field(description="Path to Word document")
|
file_path: str = Field(description="Path to Word document"),
|
||||||
|
file_content: Optional[str] = Field(default=None, description=FILE_CONTENT_DESC)
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
"""Retrieve saved reading progress from bookmark file."""
|
"""Retrieve saved reading progress from bookmark file."""
|
||||||
import json
|
import json
|
||||||
|
|
||||||
local_path = await resolve_office_file_path(file_path)
|
local_path = await resolve_office_file_path(file_path, file_content=file_content, filename=file_path)
|
||||||
|
|
||||||
validation = await validate_office_file(local_path)
|
validation = await validate_office_file(local_path)
|
||||||
if not validation["is_valid"]:
|
if not validation["is_valid"]:
|
||||||
|
|||||||
@ -4,6 +4,7 @@ import os
|
|||||||
import time
|
import time
|
||||||
import hashlib
|
import hashlib
|
||||||
import tempfile
|
import tempfile
|
||||||
|
import base64
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, Dict, Any
|
from typing import Optional, Dict, Any
|
||||||
import aiofiles
|
import aiofiles
|
||||||
@ -12,6 +13,11 @@ from urllib.parse import urlparse
|
|||||||
from .validation import OfficeFileError
|
from .validation import OfficeFileError
|
||||||
|
|
||||||
|
|
||||||
|
# Environment variable to control local file access
|
||||||
|
# Default to False (secure) - set to "true" for local stdio transport
|
||||||
|
MCP_ALLOW_LOCAL_FILES = os.environ.get("MCP_ALLOW_LOCAL_FILES", "false").lower() == "true"
|
||||||
|
|
||||||
|
|
||||||
class OfficeFileCache:
|
class OfficeFileCache:
|
||||||
"""Simple file cache for downloaded Office documents."""
|
"""Simple file cache for downloaded Office documents."""
|
||||||
|
|
||||||
@ -212,20 +218,45 @@ def get_cache() -> OfficeFileCache:
|
|||||||
return _global_cache
|
return _global_cache
|
||||||
|
|
||||||
|
|
||||||
async def resolve_office_file_path(file_path: str, use_cache: bool = True) -> str:
|
async def resolve_office_file_path(
|
||||||
"""Resolve file path, downloading from URL if necessary.
|
file_path: str,
|
||||||
|
use_cache: bool = True,
|
||||||
|
file_content: Optional[str] = None,
|
||||||
|
filename: Optional[str] = None
|
||||||
|
) -> str:
|
||||||
|
"""Resolve file path, downloading from URL if necessary, or decode inline content.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
file_path: Local file path or URL
|
file_path: Local file path or URL (ignored if file_content provided)
|
||||||
use_cache: Whether to use caching for URLs
|
use_cache: Whether to use caching for URLs
|
||||||
|
file_content: Base64-encoded file content (for hosted/HTTP transport)
|
||||||
|
filename: Original filename for extension detection (used with file_content)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Local file path (downloaded if was URL)
|
Local file path (temp file if from content, downloaded if from URL)
|
||||||
|
|
||||||
|
Security:
|
||||||
|
When MCP_ALLOW_LOCAL_FILES=false (default for HTTP transport):
|
||||||
|
- Local file paths are rejected
|
||||||
|
- Only URLs and file_content are allowed
|
||||||
|
- This prevents hosted servers from accessing server-side files
|
||||||
"""
|
"""
|
||||||
|
# Priority 1: If file_content is provided, decode and write to temp file
|
||||||
|
if file_content:
|
||||||
|
return await _resolve_from_content(file_content, filename or file_path)
|
||||||
|
|
||||||
# Check if it's a URL
|
# Check if it's a URL
|
||||||
parsed = urlparse(file_path)
|
parsed = urlparse(file_path)
|
||||||
if not (parsed.scheme and parsed.netloc):
|
is_url = bool(parsed.scheme and parsed.netloc)
|
||||||
# Local file path
|
|
||||||
|
if not is_url:
|
||||||
|
# Local file path - check if allowed
|
||||||
|
if not MCP_ALLOW_LOCAL_FILES:
|
||||||
|
raise OfficeFileError(
|
||||||
|
"Local file access is disabled for this server. "
|
||||||
|
"Please use file_content parameter to upload document data, "
|
||||||
|
"or provide a URL. Set MCP_ALLOW_LOCAL_FILES=true to enable local files."
|
||||||
|
)
|
||||||
return file_path
|
return file_path
|
||||||
|
|
||||||
# Validate URL scheme
|
# Validate URL scheme
|
||||||
@ -247,3 +278,60 @@ async def resolve_office_file_path(file_path: str, use_cache: bool = True) -> st
|
|||||||
# Direct download without caching
|
# Direct download without caching
|
||||||
from .validation import download_office_file
|
from .validation import download_office_file
|
||||||
return await download_office_file(file_path)
|
return await download_office_file(file_path)
|
||||||
|
|
||||||
|
|
||||||
|
async def _resolve_from_content(file_content: str, filename_hint: str) -> str:
    """Decode base64 content and write it to a temp file.

    Args:
        file_content: Base64-encoded file data.
        filename_hint: Filename or path whose suffix supplies the extension.
            It is caller-supplied and therefore untrusted: a suffix that is
            empty or contains anything other than letters/digits is ignored
            and the extension is detected from the content instead.

    Returns:
        Path to the temporary file containing the decoded content.

    Raises:
        OfficeFileError: If ``file_content`` cannot be decoded as base64.
    """
    try:
        content_bytes = base64.b64decode(file_content)
    except (ValueError, TypeError) as e:
        # binascii.Error (bad padding / bad characters) is a ValueError
        # subclass; TypeError covers a non-str/bytes argument. Chain the
        # cause so the original traceback is preserved.
        raise OfficeFileError(f"Invalid base64 content: {e}") from e

    # Only accept a plain alphanumeric suffix such as ".docx". Anything else
    # (no suffix, or junk like ".docx?token=abc") falls back to magic bytes
    # so untrusted text never ends up in the temp filename.
    ext = Path(filename_hint).suffix.lower()
    if not ext[1:].isalnum():
        ext = _detect_extension_from_bytes(content_bytes)

    # Upload directory lives under the system temp dir.
    temp_dir = Path(tempfile.gettempdir()) / "mcp_office_uploads"
    temp_dir.mkdir(parents=True, exist_ok=True)

    # The filename is derived from the content hash, so uploading the same
    # bytes again resolves to the same path.
    content_hash = hashlib.sha256(content_bytes).hexdigest()[:12]
    temp_path = temp_dir / f"upload_{content_hash}{ext}"

    async with aiofiles.open(temp_path, 'wb') as f:
        await f.write(content_bytes)

    return str(temp_path)
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_extension_from_bytes(content: bytes) -> str:
|
||||||
|
"""Detect file extension from magic bytes."""
|
||||||
|
# ZIP-based formats (docx, xlsx, pptx)
|
||||||
|
if content[:4] == b'PK\x03\x04':
|
||||||
|
# Could be docx, xlsx, or pptx - default to .docx
|
||||||
|
# Full detection would require reading internal XML
|
||||||
|
return ".docx"
|
||||||
|
|
||||||
|
# OLE Compound Document (doc, xls, ppt)
|
||||||
|
if content[:8] == b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1':
|
||||||
|
return ".doc"
|
||||||
|
|
||||||
|
# CSV (text-based, starts with printable characters)
|
||||||
|
if content[:1].isalpha() or content[:1] in b'"\'':
|
||||||
|
return ".csv"
|
||||||
|
|
||||||
|
# Default
|
||||||
|
return ".bin"
|
||||||
@ -324,7 +324,10 @@ class TestMockingStrategies:
|
|||||||
assert result["document_metadata"] == mock_office_file["metadata"]
|
assert result["document_metadata"] == mock_office_file["metadata"]
|
||||||
|
|
||||||
# Verify mocks were called correctly
|
# Verify mocks were called correctly
|
||||||
mock_resolve.assert_called_once_with(mock_office_file["path"])
|
mock_resolve.assert_called_once()
|
||||||
|
# First positional arg should be the file path
|
||||||
|
call_args = mock_resolve.call_args
|
||||||
|
assert call_args[0][0] == mock_office_file["path"]
|
||||||
mock_validate.assert_called_once_with(mock_office_file["path"])
|
mock_validate.assert_called_once_with(mock_office_file["path"])
|
||||||
mock_detect.assert_called_once_with(mock_office_file["path"])
|
mock_detect.assert_called_once_with(mock_office_file["path"])
|
||||||
|
|
||||||
|
|||||||
@ -408,7 +408,10 @@ class TestMockingPatterns:
|
|||||||
assert "structure" in result # Because preserve_formatting=True
|
assert "structure" in result # Because preserve_formatting=True
|
||||||
|
|
||||||
# Verify all mocks were called appropriately
|
# Verify all mocks were called appropriately
|
||||||
mock_resolve.assert_called_once_with("/test/document.docx")
|
mock_resolve.assert_called_once()
|
||||||
|
# First positional arg should be the file path
|
||||||
|
call_args = mock_resolve.call_args
|
||||||
|
assert call_args[0][0] == "/test/document.docx"
|
||||||
mock_validate.assert_called_once_with("/realistic/path/document.docx")
|
mock_validate.assert_called_once_with("/realistic/path/document.docx")
|
||||||
mock_detect.assert_called_once_with("/realistic/path/document.docx")
|
mock_detect.assert_called_once_with("/realistic/path/document.docx")
|
||||||
mock_extract.assert_called_once()
|
mock_extract.assert_called_once()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user