🐛 Fix pdf_to_markdown broken image references

pdf_to_markdown generated pdf-image:// URIs that never resolved — the
resource handler only existed in the legacy server. Add an output_directory
parameter: when set, images are extracted to disk and referenced with
relative ./images/ paths. Without it, the existing pdf-image:// behavior is
preserved for backward compatibility.

Also adds min_width/min_height filtering (matching extract_images), an
image_format parameter, and a save_markdown option, and fixes the missing
extract_vector_graphics entry in list_capabilities.
This commit is contained in:
Ryan Malloy 2026-02-12 20:24:19 -07:00
parent febe6dae13
commit 8b5783585f
3 changed files with 119 additions and 23 deletions

View File

@ -91,7 +91,7 @@ uv publish
2. **Table Extraction**: `extract_tables` - Auto-fallback through Camelot → pdfplumber → Tabula 2. **Table Extraction**: `extract_tables` - Auto-fallback through Camelot → pdfplumber → Tabula
3. **OCR Processing**: `ocr_pdf` - Tesseract with preprocessing options 3. **OCR Processing**: `ocr_pdf` - Tesseract with preprocessing options
4. **Document Analysis**: `is_scanned_pdf`, `get_document_structure`, `extract_metadata` 4. **Document Analysis**: `is_scanned_pdf`, `get_document_structure`, `extract_metadata`
5. **Format Conversion**: `pdf_to_markdown` - Clean markdown with MCP resource URIs for images 5. **Format Conversion**: `pdf_to_markdown` - Convert PDF to markdown. With `output_directory`, extracts images to disk with relative `./images/` paths. Without, uses `pdf-image://` MCP resource URIs (legacy).
6. **Image Processing**: `extract_images` - Extract images with custom output paths and clean summary output 6. **Image Processing**: `extract_images` - Extract images with custom output paths and clean summary output
7. **Link Extraction**: `extract_links` - Extract all hyperlinks with page filtering and type categorization 7. **Link Extraction**: `extract_links` - Extract all hyperlinks with page filtering and type categorization
8. **PDF Forms**: `extract_form_data`, `create_form_pdf`, `fill_form_pdf`, `add_form_fields` - Complete form lifecycle management 8. **PDF Forms**: `extract_form_data`, `create_form_pdf`, `fill_form_pdf`, `add_form_fields` - Complete form lifecycle management
@ -103,7 +103,8 @@ uv publish
**Optimized for MCP Context Management:** **Optimized for MCP Context Management:**
- **Custom Output Paths**: `extract_images` allows users to specify where images are saved - **Custom Output Paths**: `extract_images` allows users to specify where images are saved
- **Clean Summary Output**: Returns concise extraction summary instead of verbose image metadata - **Clean Summary Output**: Returns concise extraction summary instead of verbose image metadata
- **Resource URIs**: `pdf_to_markdown` uses `pdf-image://{image_id}` protocol for seamless client integration - **Resource URIs**: `pdf_to_markdown` uses `pdf-image://{image_id}` protocol when no `output_directory` is set (legacy mode)
- **Disk-Based Images**: When `output_directory` is provided, `pdf_to_markdown` extracts images to `{output_directory}/images/` with relative `./images/` paths — compatible with Starlight, browsers, and standard renderers
- **Prevents Context Overflow**: Avoids verbose output that fills client message windows - **Prevents Context Overflow**: Avoids verbose output that fills client message windows
- **User Control**: Flexible output directory support with automatic directory creation - **User Control**: Flexible output directory support with automatic directory creation

View File

@ -3,19 +3,14 @@ Image Processing Mixin - PDF image extraction and markdown conversion
Uses official fastmcp.contrib.mcp_mixin pattern Uses official fastmcp.contrib.mcp_mixin pattern
""" """
import asyncio
import time import time
import tempfile import tempfile
import json
from pathlib import Path from pathlib import Path
from typing import Dict, Any, Optional, List from typing import Dict, Any, Optional, List
import logging import logging
# PDF and image processing libraries # PDF and image processing libraries
import fitz # PyMuPDF import fitz # PyMuPDF
from PIL import Image
import io
import base64
# Official FastMCP mixin # Official FastMCP mixin
from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool
@ -219,23 +214,44 @@ class ImageProcessingMixin(MCPMixin):
@mcp_tool( @mcp_tool(
name="pdf_to_markdown", name="pdf_to_markdown",
description="Convert PDF to markdown with MCP resource URIs" description=(
"Convert PDF to markdown. When output_directory is provided, images are "
"extracted to {output_directory}/images/ with relative ./images/ paths in "
"the markdown — ready for Starlight, browsers, or any renderer. "
"Without output_directory, images use pdf-image:// MCP resource URIs."
)
) )
async def pdf_to_markdown( async def pdf_to_markdown(
self, self,
pdf_path: str, pdf_path: str,
pages: Optional[str] = None, pages: Optional[str] = None,
include_images: bool = True, include_images: bool = True,
include_metadata: bool = True include_metadata: bool = True,
output_directory: Optional[str] = None,
min_width: int = 100,
min_height: int = 100,
image_format: str = "png",
save_markdown: bool = False
) -> Dict[str, Any]: ) -> Dict[str, Any]:
""" """
Convert PDF to clean markdown format with MCP resource URIs for images. Convert PDF to clean markdown format.
Two image modes:
- With output_directory: extracts images to disk, uses relative paths in markdown.
Images are filtered by min_width/min_height (matching extract_images behavior).
- Without output_directory: uses pdf-image:// MCP resource URIs (legacy behavior).
Args: Args:
pdf_path: Path to PDF file or HTTPS URL pdf_path: Path to PDF file or HTTPS URL
pages: Page numbers to convert (comma-separated, 1-based), None for all pages: Page numbers to convert (comma-separated, 1-based), None for all
include_images: Whether to include images in markdown include_images: Whether to include images in markdown
include_metadata: Whether to include document metadata include_metadata: Whether to include document metadata
output_directory: Directory for extracted images and optional markdown file.
When set, images go to {output_directory}/images/ with relative paths.
min_width: Minimum image width to extract (only when output_directory is set)
min_height: Minimum image height to extract (only when output_directory is set)
image_format: Image format - "png" or "jpg" (only when output_directory is set)
save_markdown: Save markdown to {output_directory}/{filename}.md
Returns: Returns:
Dictionary containing markdown content and metadata Dictionary containing markdown content and metadata
@ -257,6 +273,18 @@ class ImageProcessingMixin(MCPMixin):
pages_to_process = parsed_pages if parsed_pages else list(range(total_pages)) pages_to_process = parsed_pages if parsed_pages else list(range(total_pages))
pages_to_process = [p for p in pages_to_process if 0 <= p < total_pages] pages_to_process = [p for p in pages_to_process if 0 <= p < total_pages]
# Setup output directory for image extraction
images_dir = None
images_extracted = 0
images_skipped = 0
extracted_image_info = []
if output_directory:
output_dir = validate_output_path(output_directory)
output_dir.mkdir(parents=True, exist_ok=True)
images_dir = output_dir / "images"
images_dir.mkdir(parents=True, exist_ok=True)
markdown_parts = [] markdown_parts = []
# Add metadata if requested # Add metadata if requested
@ -293,16 +321,54 @@ class ImageProcessingMixin(MCPMixin):
for img_index, img in enumerate(image_list): for img_index, img in enumerate(image_list):
try: try:
# Create MCP resource URI for the image
image_id = f"page_{page_num + 1}_img_{img_index + 1}"
mcp_uri = f"pdf-image://{image_id}"
# Add markdown image reference
alt_text = f"Image {img_index + 1} from page {page_num + 1}" alt_text = f"Image {img_index + 1} from page {page_num + 1}"
markdown_parts.append(f"![{alt_text}]({mcp_uri})\n\n")
if images_dir:
# Disk mode: extract image, filter by size, save to images/
xref = img[0]
pix = fitz.Pixmap(doc, xref)
if pix.width < min_width or pix.height < min_height:
images_skipped += 1
pix = None
continue
# Convert CMYK to RGB if necessary
if pix.n - pix.alpha >= 4:
pix = fitz.Pixmap(fitz.csRGB, pix)
base_name = input_pdf_path.stem
filename = f"{base_name}_page_{page_num + 1}_img_{img_index + 1}.{image_format}"
img_path = images_dir / filename
if image_format.lower() in ["jpg", "jpeg"]:
pix.save(str(img_path), "JPEG")
else:
pix.save(str(img_path), "PNG")
file_size = img_path.stat().st_size
extracted_image_info.append({
"filename": filename,
"path": str(img_path),
"page": page_num + 1,
"width": pix.width,
"height": pix.height,
"size_bytes": file_size
})
images_extracted += 1
pix = None
markdown_parts.append(f"![{alt_text}](./images/{filename})\n\n")
else:
# Legacy mode: pdf-image:// MCP resource URI
image_id = f"page_{page_num + 1}_img_{img_index + 1}"
mcp_uri = f"pdf-image://{image_id}"
markdown_parts.append(f"![{alt_text}]({mcp_uri})\n\n")
except Exception as e: except Exception as e:
logger.warning(f"Failed to process image {img_index + 1} on page {page_num + 1}: {e}") logger.warning(f"Failed to process image {img_index + 1} on page {page_num + 1}: {e}")
if images_dir:
images_skipped += 1
except Exception as e: except Exception as e:
logger.warning(f"Failed to process page {page_num + 1}: {e}") logger.warning(f"Failed to process page {page_num + 1}: {e}")
@ -313,12 +379,20 @@ class ImageProcessingMixin(MCPMixin):
# Combine all markdown parts # Combine all markdown parts
full_markdown = "".join(markdown_parts) full_markdown = "".join(markdown_parts)
# Save markdown file if requested
markdown_path = None
if save_markdown and output_directory:
md_path = output_dir / f"{input_pdf_path.stem}.md"
with open(md_path, 'w', encoding='utf-8') as f:
f.write(full_markdown)
markdown_path = str(md_path)
# Calculate statistics # Calculate statistics
word_count = len(full_markdown.split()) word_count = len(full_markdown.split())
line_count = len(full_markdown.split('\n')) line_count = len(full_markdown.split('\n'))
char_count = len(full_markdown) char_count = len(full_markdown)
return { result = {
"success": True, "success": True,
"markdown": full_markdown, "markdown": full_markdown,
"conversion_summary": { "conversion_summary": {
@ -328,11 +402,9 @@ class ImageProcessingMixin(MCPMixin):
"line_count": line_count, "line_count": line_count,
"character_count": char_count, "character_count": char_count,
"includes_images": include_images, "includes_images": include_images,
"includes_metadata": include_metadata "includes_metadata": include_metadata,
}, "images_extracted": images_extracted,
"mcp_integration": { "images_skipped": images_skipped
"image_uri_format": "pdf-image://{image_id}",
"description": "Images use MCP resource URIs for seamless client integration"
}, },
"file_info": { "file_info": {
"input_path": str(input_pdf_path), "input_path": str(input_pdf_path),
@ -341,6 +413,29 @@ class ImageProcessingMixin(MCPMixin):
"conversion_time": round(time.time() - start_time, 2) "conversion_time": round(time.time() - start_time, 2)
} }
if images_dir:
result["image_output"] = {
"images_directory": str(images_dir),
"images_extracted": images_extracted,
"images_skipped": images_skipped,
"filter_settings": {
"min_width": min_width,
"min_height": min_height,
"image_format": image_format
},
"images": extracted_image_info
}
else:
result["mcp_integration"] = {
"image_uri_format": "pdf-image://{image_id}",
"description": "Images use MCP resource URIs. Set output_directory for disk-based images with relative paths."
}
if markdown_path:
result["markdown_path"] = markdown_path
return result
except Exception as e: except Exception as e:
error_msg = sanitize_error_message(str(e)) error_msg = sanitize_error_message(str(e))
logger.error(f"PDF to markdown conversion failed: {error_msg}") logger.error(f"PDF to markdown conversion failed: {error_msg}")

View File

@ -137,7 +137,7 @@ class PDFServerOfficial:
"form_management": ["extract_form_data", "fill_form_pdf", "create_form_pdf"], "form_management": ["extract_form_data", "fill_form_pdf", "create_form_pdf"],
"document_assembly": ["merge_pdfs", "split_pdf", "reorder_pdf_pages"], "document_assembly": ["merge_pdfs", "split_pdf", "reorder_pdf_pages"],
"annotations": ["add_sticky_notes", "add_highlights", "add_stamps", "extract_all_annotations"], "annotations": ["add_sticky_notes", "add_highlights", "add_stamps", "extract_all_annotations"],
"image_processing": ["extract_images", "pdf_to_markdown"] "image_processing": ["extract_images", "pdf_to_markdown", "extract_vector_graphics"]
} }
} }