From 772bcac0dfc250fee4f6076176b6962eb68ea891 Mon Sep 17 00:00:00 2001 From: Ryan Malloy Date: Wed, 18 Feb 2026 15:01:43 -0700 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20File-first=20output=20for=20extr?= =?UTF-8?q?act=5Ftext=20and=20pdf=5Fto=5Fmarkdown?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both tools now write to disk by default and return file path + short preview instead of full content inline. Prevents MCP context overflow on large PDFs. Set inline=True for the old behavior. pdf_to_markdown always extracts images to ./images/ with relative paths (no more dead pdf-image:// URIs). extract_text writes a .txt file. --- CLAUDE.md | 10 +- .../mixins_official/image_processing.py | 207 +++++++++--------- .../mixins_official/text_extraction.py | 128 ++++++++--- 3 files changed, 205 insertions(+), 140 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index d818df8..ce15c16 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -87,11 +87,11 @@ uv publish ### Tool Categories -1. **Text Extraction**: `extract_text` - Intelligent method selection with automatic chunking for large files +1. **Text Extraction**: `extract_text` - Writes extracted text to a .txt file by default, returns path + preview. Set `inline=True` for full text in response. 2. **Table Extraction**: `extract_tables` - Auto-fallback through Camelot → pdfplumber → Tabula 3. **OCR Processing**: `ocr_pdf` - Tesseract with preprocessing options 4. **Document Analysis**: `is_scanned_pdf`, `get_document_structure`, `extract_metadata` -5. **Format Conversion**: `pdf_to_markdown` - Convert PDF to markdown. With `output_directory`, extracts images to disk with relative `./images/` paths. Without, uses `pdf-image://` MCP resource URIs (legacy). +5. **Format Conversion**: `pdf_to_markdown` - Writes markdown + extracted images to disk by default, returns path + preview. Images use relative `./images/` paths. Set `inline=True` for full markdown in response. 6. **Image Processing**: `extract_images` - Extract images with custom output paths and clean summary output 7. **Link Extraction**: `extract_links` - Extract all hyperlinks with page filtering and type categorization 8. **PDF Forms**: `extract_form_data`, `create_form_pdf`, `fill_form_pdf`, `add_form_fields` - Complete form lifecycle management @@ -103,9 +103,9 @@ uv publish **Optimized for MCP Context Management:** - **Custom Output Paths**: `extract_images` allows users to specify where images are saved - **Clean Summary Output**: Returns concise extraction summary instead of verbose image metadata -- **Resource URIs**: `pdf_to_markdown` uses `pdf-image://{image_id}` protocol when no `output_directory` is set (legacy mode) -- **Disk-Based Images**: When `output_directory` is provided, `pdf_to_markdown` extracts images to `{output_directory}/images/` with relative `./images/` paths — compatible with Starlight, browsers, and standard renderers -- **Prevents Context Overflow**: Avoids verbose output that fills client message windows +- **File-First Output**: `extract_text` and `pdf_to_markdown` write results to files by default, returning paths + short previews instead of full content — prevents MCP context overflow on large PDFs +- **Disk-Based Images**: `pdf_to_markdown` always extracts images to `{output_directory}/images/` with relative `./images/` paths — compatible with Starlight, browsers, and standard renderers +- **Inline Escape Hatch**: Both tools accept `inline=True` to return full content in the response for small queries - **User Control**: Flexible output directory support with automatic directory creation ### Intelligent Fallbacks and Token Management diff --git a/src/mcp_pdf/mixins_official/image_processing.py b/src/mcp_pdf/mixins_official/image_processing.py index bc9f0b4..d490251 100644 --- a/src/mcp_pdf/mixins_official/image_processing.py +++ b/src/mcp_pdf/mixins_official/image_processing.py @@ -215,10 +215,10 @@ class ImageProcessingMixin(MCPMixin): @mcp_tool( name="pdf_to_markdown", description=( - "Convert PDF to markdown. When output_directory is provided, images are " - "extracted to {output_directory}/images/ with relative ./images/ paths in " - "the markdown — ready for Starlight, browsers, or any renderer. " - "Without output_directory, images use pdf-image:// MCP resource URIs." + "Convert PDF to markdown and write to a .md file. Images are extracted " + "to {output_directory}/images/ with relative ./images/ paths. Returns " + "the output file path and a short preview — full markdown is in the file. " + "Set inline=True to get full markdown in the response instead." ) ) async def pdf_to_markdown( @@ -231,30 +231,29 @@ class ImageProcessingMixin(MCPMixin): min_width: int = 100, min_height: int = 100, image_format: str = "png", - save_markdown: bool = False + inline: bool = False ) -> Dict[str, Any]: """ - Convert PDF to clean markdown format. + Convert PDF to clean markdown format and write to file. - Two image modes: - - With output_directory: extracts images to disk, uses relative paths in markdown. - Images are filtered by min_width/min_height (matching extract_images behavior). - - Without output_directory: uses pdf-image:// MCP resource URIs (legacy behavior). + By default, writes markdown to a file and extracts images to an images/ + subdirectory with relative paths. Returns file path + summary to avoid + filling the MCP context window. Set inline=True for full markdown in response. Args: pdf_path: Path to PDF file or HTTPS URL pages: Page numbers to convert (comma-separated, 1-based), None for all include_images: Whether to include images in markdown include_metadata: Whether to include document metadata - output_directory: Directory for extracted images and optional markdown file. - When set, images go to {output_directory}/images/ with relative paths. - min_width: Minimum image width to extract (only when output_directory is set) - min_height: Minimum image height to extract (only when output_directory is set) - image_format: Image format - "png" or "jpg" (only when output_directory is set) - save_markdown: Save markdown to {output_directory}/{filename}.md + output_directory: Directory for output .md file and images/ subdirectory. + Defaults to a temp directory if not specified. + min_width: Minimum image width to extract (filters small decorative images) + min_height: Minimum image height to extract (filters small decorative images) + image_format: Image format - "png" or "jpg" + inline: Return full markdown in response instead of writing to file Returns: - Dictionary containing markdown content and metadata + Dictionary with output_file path and summary, or full markdown if inline=True """ start_time = time.time() @@ -273,17 +272,18 @@ class ImageProcessingMixin(MCPMixin): pages_to_process = parsed_pages if parsed_pages else list(range(total_pages)) pages_to_process = [p for p in pages_to_process if 0 <= p < total_pages] - # Setup output directory for image extraction - images_dir = None + # Setup output directory — always needed (file output is the default) images_extracted = 0 images_skipped = 0 extracted_image_info = [] if output_directory: output_dir = validate_output_path(output_directory) - output_dir.mkdir(parents=True, exist_ok=True) - images_dir = output_dir / "images" - images_dir.mkdir(parents=True, exist_ok=True) + else: + output_dir = Path(tempfile.mkdtemp(prefix="pdf_markdown_")) + output_dir.mkdir(parents=True, exist_ok=True) + images_dir = output_dir / "images" + images_dir.mkdir(parents=True, exist_ok=True) markdown_parts = [] @@ -322,53 +322,44 @@ class ImageProcessingMixin(MCPMixin): for img_index, img in enumerate(image_list): try: alt_text = f"Image {img_index + 1} from page {page_num + 1}" + xref = img[0] + pix = fitz.Pixmap(doc, xref) - if images_dir: - # Disk mode: extract image, filter by size, save to images/ - xref = img[0] - pix = fitz.Pixmap(doc, xref) - - if pix.width < min_width or pix.height < min_height: - images_skipped += 1 - pix = None - continue - - # Convert CMYK to RGB if necessary - if pix.n - pix.alpha >= 4: - pix = fitz.Pixmap(fitz.csRGB, pix) - - base_name = input_pdf_path.stem - filename = f"{base_name}_page_{page_num + 1}_img_{img_index + 1}.{image_format}" - img_path = images_dir / filename - - if image_format.lower() in ["jpg", "jpeg"]: - pix.save(str(img_path), "JPEG") - else: - pix.save(str(img_path), "PNG") - - file_size = img_path.stat().st_size - extracted_image_info.append({ - "filename": filename, - "path": str(img_path), - "page": page_num + 1, - "width": pix.width, - "height": pix.height, - "size_bytes": file_size - }) - images_extracted += 1 + if pix.width < min_width or pix.height < min_height: + images_skipped += 1 pix = None + continue - markdown_parts.append(f"![{alt_text}](./images/{filename})\n\n") + # Convert CMYK to RGB if necessary + if pix.n - pix.alpha >= 4: + pix = fitz.Pixmap(fitz.csRGB, pix) + + base_name = input_pdf_path.stem + filename = f"{base_name}_page_{page_num + 1}_img_{img_index + 1}.{image_format}" + img_path = images_dir / filename + + if image_format.lower() in ["jpg", "jpeg"]: + pix.save(str(img_path), "JPEG") else: - # Legacy mode: pdf-image:// MCP resource URI - image_id = f"page_{page_num + 1}_img_{img_index + 1}" - mcp_uri = f"pdf-image://{image_id}" - markdown_parts.append(f"![{alt_text}]({mcp_uri})\n\n") + pix.save(str(img_path), "PNG") + + file_size = img_path.stat().st_size + extracted_image_info.append({ + "filename": filename, + "path": str(img_path), + "page": page_num + 1, + "width": pix.width, + "height": pix.height, + "size_bytes": file_size + }) + images_extracted += 1 + pix = None + + markdown_parts.append(f"![{alt_text}](./images/{filename})\n\n") except Exception as e: logger.warning(f"Failed to process image {img_index + 1} on page {page_num + 1}: {e}") - if images_dir: - images_skipped += 1 + images_skipped += 1 except Exception as e: logger.warning(f"Failed to process page {page_num + 1}: {e}") @@ -379,42 +370,57 @@ class ImageProcessingMixin(MCPMixin): # Combine all markdown parts full_markdown = "".join(markdown_parts) - # Save markdown file if requested - markdown_path = None - if save_markdown and output_directory: - md_path = output_dir / f"{input_pdf_path.stem}.md" - with open(md_path, 'w', encoding='utf-8') as f: - f.write(full_markdown) - markdown_path = str(md_path) - # Calculate statistics word_count = len(full_markdown.split()) line_count = len(full_markdown.split('\n')) char_count = len(full_markdown) - result = { - "success": True, - "markdown": full_markdown, - "conversion_summary": { - "pages_converted": len(pages_to_process), - "total_pages": total_pages, - "word_count": word_count, - "line_count": line_count, - "character_count": char_count, - "includes_images": include_images, - "includes_metadata": include_metadata, - "images_extracted": images_extracted, - "images_skipped": images_skipped - }, - "file_info": { - "input_path": str(input_pdf_path), - "pages_processed": pages or "all" - }, - "conversion_time": round(time.time() - start_time, 2) + conversion_summary = { + "pages_converted": len(pages_to_process), + "total_pages": total_pages, + "word_count": word_count, + "line_count": line_count, + "character_count": char_count, + "images_extracted": images_extracted, + "images_skipped": images_skipped } - if images_dir: - result["image_output"] = { + # Inline mode: return full markdown in response + if inline: + return { + "success": True, + "markdown": full_markdown, + "conversion_summary": conversion_summary, + "image_output": { + "images_directory": str(images_dir), + "images": extracted_image_info + }, + "file_info": { + "input_path": str(input_pdf_path), + "pages_processed": pages or "all" + }, + "conversion_time": round(time.time() - start_time, 2) + } + + # File output mode (default): write .md file, return path + summary + md_path = output_dir / f"{input_pdf_path.stem}.md" + with open(md_path, 'w', encoding='utf-8') as f: + f.write(full_markdown) + + # Build preview (first ~500 chars at sentence boundary) + preview = full_markdown[:500] + if len(full_markdown) > 500: + last_period = preview.rfind('.') + if last_period > 300: + preview = preview[:last_period + 1] + preview += " [...]" + + return { + "success": True, + "output_file": str(md_path), + "markdown_preview": preview, + "conversion_summary": conversion_summary, + "image_output": { "images_directory": str(images_dir), "images_extracted": images_extracted, "images_skipped": images_skipped, @@ -424,17 +430,14 @@ class ImageProcessingMixin(MCPMixin): "image_format": image_format }, "images": extracted_image_info - } - else: - result["mcp_integration"] = { - "image_uri_format": "pdf-image://{image_id}", - "description": "Images use MCP resource URIs. Set output_directory for disk-based images with relative paths." - } - - if markdown_path: - result["markdown_path"] = markdown_path - - return result + }, + "file_info": { + "input_path": str(input_pdf_path), + "output_directory": str(output_dir), + "pages_processed": pages or "all" + }, + "conversion_time": round(time.time() - start_time, 2) + } except Exception as e: error_msg = sanitize_error_message(str(e)) diff --git a/src/mcp_pdf/mixins_official/text_extraction.py b/src/mcp_pdf/mixins_official/text_extraction.py index 518b526..c4af624 100644 --- a/src/mcp_pdf/mixins_official/text_extraction.py +++ b/src/mcp_pdf/mixins_official/text_extraction.py @@ -3,8 +3,8 @@ Text Extraction Mixin - PDF text extraction, OCR, and scanned PDF detection Uses official fastmcp.contrib.mcp_mixin pattern """ -import asyncio import time +import tempfile from pathlib import Path from typing import Dict, Any, Optional, List import logging @@ -18,7 +18,7 @@ import io # Official FastMCP mixin from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool -from ..security import validate_pdf_path, sanitize_error_message +from ..security import validate_pdf_path, validate_output_path, sanitize_error_message logger = logging.getLogger(__name__) @@ -36,30 +36,43 @@ class TextExtractionMixin(MCPMixin): @mcp_tool( name="extract_text", - description="Extract text from PDF with intelligent method selection and automatic chunking for large files" + description=( + "Extract text from PDF and write to a .txt file. Returns the output " + "file path and a short preview — full text is in the file, not in the " + "response. Use output_directory to control where the file is saved, " + "or set inline=True to get full text in the response instead." + ) ) async def extract_text( self, pdf_path: str, pages: Optional[str] = None, method: str = "auto", + preserve_layout: bool = False, + output_directory: Optional[str] = None, + inline: bool = False, chunk_pages: int = 10, - max_tokens: int = 20000, - preserve_layout: bool = False + max_tokens: int = 20000 ) -> Dict[str, Any]: """ Extract text from PDF with intelligent method selection. + By default, writes extracted text to a file and returns the path with + a short preview. This prevents large extractions from filling the MCP + context window. Set inline=True for the old behavior (full text in response). + Args: pdf_path: Path to PDF file or HTTPS URL pages: Page numbers to extract (comma-separated, 1-based), None for all method: Extraction method ("auto", "pymupdf", "pdfplumber", "pypdf") - chunk_pages: Number of pages per chunk for large files - max_tokens: Maximum tokens per response to prevent overflow preserve_layout: Whether to preserve text layout and formatting + output_directory: Directory to save the text file (default: temp directory) + inline: Return full text in response instead of writing to file + chunk_pages: Pages per chunk when inline=True (ignored for file output) + max_tokens: Max chars when inline=True (ignored for file output) Returns: - Dictionary containing extracted text and metadata + Dictionary with output_file path and summary, or full text if inline=True """ start_time = time.time() @@ -84,44 +97,93 @@ class TextExtractionMixin(MCPMixin): "extraction_time": 0 } - # Check if chunking is needed - if len(pages_to_extract) > chunk_pages: - return await self._extract_text_chunked( - doc, path, pages_to_extract, method, chunk_pages, - max_tokens, preserve_layout, start_time - ) + # Inline mode: old behavior with chunking/truncation + if inline: + if len(pages_to_extract) > chunk_pages: + return await self._extract_text_chunked( + doc, path, pages_to_extract, method, chunk_pages, + max_tokens, preserve_layout, start_time + ) - # Extract text from specified pages + extraction_result = await self._extract_text_from_pages( + doc, pages_to_extract, method, preserve_layout + ) + doc.close() + + if len(extraction_result["text"]) > max_tokens: + truncated_text = extraction_result["text"][:max_tokens] + last_period = truncated_text.rfind('.') + if last_period > max_tokens * 0.8: + truncated_text = truncated_text[:last_period + 1] + extraction_result["text"] = truncated_text + extraction_result["truncated"] = True + extraction_result["truncation_reason"] = f"Response too large (>{max_tokens} chars)" + + extraction_result.update({ + "success": True, + "file_info": { + "path": str(path), + "total_pages": total_pages, + "pages_extracted": len(pages_to_extract), + "pages_requested": pages or "all" + }, + "extraction_time": round(time.time() - start_time, 2) + }) + return extraction_result + + # File output mode (default): extract all requested pages, write to file extraction_result = await self._extract_text_from_pages( doc, pages_to_extract, method, preserve_layout ) - doc.close() - # Check token limit and truncate if necessary - if len(extraction_result["text"]) > max_tokens: - truncated_text = extraction_result["text"][:max_tokens] - # Try to truncate at sentence boundary - last_period = truncated_text.rfind('.') - if last_period > max_tokens * 0.8: # If we can find a good break point - truncated_text = truncated_text[:last_period + 1] + full_text = extraction_result["text"] - extraction_result["text"] = truncated_text - extraction_result["truncated"] = True - extraction_result["truncation_reason"] = f"Response too large (>{max_tokens} chars)" + # Setup output directory + if output_directory: + output_dir = validate_output_path(output_directory) + output_dir.mkdir(parents=True, exist_ok=True) + else: + output_dir = Path(tempfile.mkdtemp(prefix="pdf_text_")) - extraction_result.update({ + # Write text to file + output_filename = f"{path.stem}.txt" + output_path = output_dir / output_filename + with open(output_path, 'w', encoding='utf-8') as f: + f.write(full_text) + + # Build preview (first ~500 chars at sentence boundary) + preview = full_text[:500] + if len(full_text) > 500: + last_period = preview.rfind('.') + if last_period > 300: + preview = preview[:last_period + 1] + preview += " [...]" + + word_count = len(full_text.split()) + char_count = len(full_text) + file_size = output_path.stat().st_size + + return { "success": True, - "file_info": { - "path": str(path), - "total_pages": total_pages, + "output_file": str(output_path), + "text_preview": preview, + "extraction_summary": { + "word_count": word_count, + "character_count": char_count, + "file_size_bytes": file_size, + "file_size_kb": round(file_size / 1024, 1), "pages_extracted": len(pages_to_extract), + "total_pages": total_pages, + "method_used": extraction_result.get("method_used", method) + }, + "file_info": { + "input_path": str(path), + "total_pages": total_pages, "pages_requested": pages or "all" }, "extraction_time": round(time.time() - start_time, 2) - }) - - return extraction_result + } except Exception as e: error_msg = sanitize_error_message(str(e))