✨ Chapter-aware PDF extraction: detect_structure, split_pdf_by_structure, batch_extract
New StructureDetectionMixin with 3 tools:
- detect_structure: finds chapters/sections via bookmarks, font-size heuristics, numbering patterns, and user-supplied regex
- split_pdf_by_structure: auto-splits a PDF into per-chapter directories with markdown + images + vectors in one call
- batch_extract: processes N user-specified page ranges from one PDF

Enhanced pdf_to_markdown:
- output_filename parameter for custom .md filenames
- vector_diagnostics reporting for skipped pages
- vector_fallback_raster: renders sub-threshold pages as PNG at 150 DPI

Bumps version to 2.1.0.
This commit is contained in:
parent
5161a5f952
commit
823318ec15
@ -91,12 +91,13 @@ uv publish
|
||||
2. **Table Extraction**: `extract_tables` - Auto-fallback through Camelot → pdfplumber → Tabula
|
||||
3. **OCR Processing**: `ocr_pdf` - Tesseract with preprocessing options
|
||||
4. **Document Analysis**: `is_scanned_pdf`, `get_document_structure`, `extract_metadata`
|
||||
5. **Format Conversion**: `pdf_to_markdown` - Writes markdown + extracted raster images and vector graphics (SVG) to disk by default, returns path + preview. Images use relative `./images/` paths, vectors use `./vectors/` paths. Set `inline=True` for full markdown in response. Set `include_vectors=False` to skip vector extraction.
|
||||
5. **Format Conversion**: `pdf_to_markdown` - Writes markdown + extracted raster images and vector graphics (SVG) to disk by default, returns path + preview. Images use relative `./images/` paths, vectors use `./vectors/` paths. Set `inline=True` for full markdown in response. Set `include_vectors=False` to skip vector extraction. Use `output_filename` to override the default .md filename. When `include_vectors=True`, returns `vector_diagnostics` showing which pages had drawings below the complexity threshold. Set `vector_fallback_raster=True` to render those sub-threshold pages as full-page raster images (PNG at 150 DPI) instead of skipping them.
|
||||
6. **Image Processing**: `extract_images` - Extract images with custom output paths and clean summary output
|
||||
7. **Link Extraction**: `extract_links` - Extract all hyperlinks with page filtering and type categorization
|
||||
8. **PDF Forms**: `extract_form_data`, `create_form_pdf`, `fill_form_pdf`, `add_form_fields` - Complete form lifecycle management
|
||||
9. **Document Assembly**: `merge_pdfs`, `split_pdf_by_pages`, `reorder_pdf_pages` - PDF manipulation and organization
|
||||
10. **Annotations & Markup**: `add_sticky_notes`, `add_highlights`, `add_stamps`, `add_video_notes`, `extract_all_annotations` - Collaboration and multimedia review tools
|
||||
11. **Structure Detection**: `detect_structure`, `split_pdf_by_structure`, `batch_extract` - Chapter-aware document analysis and extraction. `detect_structure` finds headings via bookmarks, font-size heuristics, and numbering patterns. `split_pdf_by_structure` auto-splits into per-chapter directories with markdown + images. `batch_extract` processes user-specified page ranges in a single call (replaces 24+ individual tool calls).
|
||||
|
||||
### MCP Client-Friendly Design
|
||||
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
[project]
|
||||
name = "mcp-pdf"
|
||||
version = "2.0.14"
|
||||
version = "2.1.0"
|
||||
description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more"
|
||||
authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}]
|
||||
readme = "README.md"
|
||||
|
||||
@ -17,6 +17,7 @@ from .security_analysis import SecurityAnalysisMixin
|
||||
from .content_analysis import ContentAnalysisMixin
|
||||
from .pdf_utilities import PDFUtilitiesMixin
|
||||
from .misc_tools import MiscToolsMixin
|
||||
from .structure_detection import StructureDetectionMixin
|
||||
|
||||
__all__ = [
|
||||
"TextExtractionMixin",
|
||||
@ -31,4 +32,5 @@ __all__ = [
|
||||
"ContentAnalysisMixin",
|
||||
"PDFUtilitiesMixin",
|
||||
"MiscToolsMixin",
|
||||
"StructureDetectionMixin",
|
||||
]
|
||||
@ -218,7 +218,10 @@ class ImageProcessingMixin(MCPMixin):
|
||||
"extracted to {output_directory}/images/ and vector graphics (charts, "
|
||||
"schematics, diagrams) to {output_directory}/vectors/ as SVG. Returns "
|
||||
"the output file path and a short preview — full markdown is in the file. "
|
||||
"Set inline=True to get full markdown in the response instead."
|
||||
"Set inline=True to get full markdown in the response instead. "
|
||||
"Use output_filename to override the default .md filename. "
|
||||
"Set vector_fallback_raster=True to render pages with sub-threshold "
|
||||
"drawings as raster images instead of skipping them entirely."
|
||||
)
|
||||
)
|
||||
async def pdf_to_markdown(
|
||||
@ -228,6 +231,7 @@ class ImageProcessingMixin(MCPMixin):
|
||||
include_images: bool = True,
|
||||
include_metadata: bool = True,
|
||||
output_directory: Optional[str] = None,
|
||||
output_filename: Optional[str] = None,
|
||||
min_width: int = 100,
|
||||
min_height: int = 100,
|
||||
image_format: str = "png",
|
||||
@ -235,6 +239,7 @@ class ImageProcessingMixin(MCPMixin):
|
||||
include_vectors: bool = True,
|
||||
vector_min_drawings: int = 5,
|
||||
vector_min_complexity: int = 50,
|
||||
vector_fallback_raster: bool = False,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Convert PDF to clean markdown format and write to file.
|
||||
@ -252,6 +257,8 @@ class ImageProcessingMixin(MCPMixin):
|
||||
include_metadata: Whether to include document metadata
|
||||
output_directory: Directory for output .md file and images/ subdirectory.
|
||||
Defaults to a temp directory if not specified.
|
||||
output_filename: Custom filename for the output .md file (e.g., "chapter_1.md").
|
||||
Defaults to the PDF filename with .md extension.
|
||||
min_width: Minimum image width to extract (filters small decorative images)
|
||||
min_height: Minimum image height to extract (filters small decorative images)
|
||||
image_format: Image format - "png" or "jpg"
|
||||
@ -260,6 +267,10 @@ class ImageProcessingMixin(MCPMixin):
|
||||
Detects charts, schematics, and technical drawings automatically.
|
||||
vector_min_drawings: Minimum drawing count per page to consider (default: 5)
|
||||
vector_min_complexity: Minimum total path items for extraction (default: 50)
|
||||
vector_fallback_raster: When True, pages with drawings below the vector
|
||||
complexity threshold are rendered as full-page raster images (PNG at
|
||||
150 DPI) instead of being skipped. Captures charts and diagrams that
|
||||
are too simple for SVG extraction but still visually meaningful.
|
||||
|
||||
Returns:
|
||||
Dictionary with output_file path and summary, or full markdown if inline=True
|
||||
@ -285,8 +296,10 @@ class ImageProcessingMixin(MCPMixin):
|
||||
images_extracted = 0
|
||||
images_skipped = 0
|
||||
vectors_extracted = 0
|
||||
raster_fallbacks = 0
|
||||
extracted_image_info = []
|
||||
extracted_vector_info = []
|
||||
vector_diagnostics = []
|
||||
|
||||
if output_directory:
|
||||
output_dir = validate_output_path(output_directory)
|
||||
@ -403,6 +416,49 @@ class ImageProcessingMixin(MCPMixin):
|
||||
markdown_parts.append(
|
||||
f"\n\n"
|
||||
)
|
||||
elif drawings:
|
||||
# Page has drawings but below SVG complexity threshold
|
||||
diag_entry = {
|
||||
"page": page_num + 1,
|
||||
"drawing_count": len(drawings),
|
||||
"total_path_items": sum(len(d.get("items", [])) for d in drawings),
|
||||
"raster_images_on_page": len(page.get_images()),
|
||||
}
|
||||
|
||||
if vector_fallback_raster:
|
||||
# Render full page as raster image at 150 DPI
|
||||
try:
|
||||
base_name = input_pdf_path.stem
|
||||
pix = page.get_pixmap(dpi=150)
|
||||
fallback_filename = f"{base_name}_page_{page_num + 1}_fallback.png"
|
||||
fallback_path = images_dir / fallback_filename
|
||||
pix.save(str(fallback_path))
|
||||
file_size = fallback_path.stat().st_size
|
||||
extracted_image_info.append({
|
||||
"filename": fallback_filename,
|
||||
"path": str(fallback_path),
|
||||
"page": page_num + 1,
|
||||
"width": pix.width,
|
||||
"height": pix.height,
|
||||
"size_bytes": file_size,
|
||||
"type": "vector_fallback",
|
||||
})
|
||||
raster_fallbacks += 1
|
||||
pix = None
|
||||
markdown_parts.append(
|
||||
f"\n\n"
|
||||
)
|
||||
diag_entry["reason"] = "raster_fallback_rendered"
|
||||
except Exception as fb_exc:
|
||||
logger.warning(
|
||||
"Raster fallback failed for page %d: %s",
|
||||
page_num + 1, fb_exc,
|
||||
)
|
||||
diag_entry["reason"] = "raster_fallback_failed"
|
||||
else:
|
||||
diag_entry["reason"] = "below_complexity_threshold"
|
||||
|
||||
vector_diagnostics.append(diag_entry)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to extract vectors from page {page_num + 1}: {e}")
|
||||
|
||||
@ -429,6 +485,7 @@ class ImageProcessingMixin(MCPMixin):
|
||||
"images_extracted": images_extracted,
|
||||
"images_skipped": images_skipped,
|
||||
"vectors_extracted": vectors_extracted,
|
||||
"raster_fallbacks": raster_fallbacks,
|
||||
}
|
||||
|
||||
# Inline mode: return full markdown in response
|
||||
@ -453,10 +510,22 @@ class ImageProcessingMixin(MCPMixin):
|
||||
"vectors_extracted": vectors_extracted,
|
||||
"vectors": extracted_vector_info,
|
||||
}
|
||||
if include_vectors:
|
||||
result["vector_diagnostics"] = {
|
||||
"pages_with_vectors": vectors_extracted,
|
||||
"pages_with_drawings_skipped": len(vector_diagnostics),
|
||||
"pages_analyzed": len(pages_to_process),
|
||||
"skipped_pages": vector_diagnostics[:20],
|
||||
}
|
||||
return result
|
||||
|
||||
# File output mode (default): write .md file, return path + summary
|
||||
md_path = output_dir / f"{input_pdf_path.stem}.md"
|
||||
if output_filename:
|
||||
if not output_filename.endswith('.md'):
|
||||
output_filename += '.md'
|
||||
md_path = output_dir / output_filename
|
||||
else:
|
||||
md_path = output_dir / f"{input_pdf_path.stem}.md"
|
||||
with open(md_path, 'w', encoding='utf-8') as f:
|
||||
f.write(full_markdown)
|
||||
|
||||
@ -497,6 +566,13 @@ class ImageProcessingMixin(MCPMixin):
|
||||
"vectors_extracted": vectors_extracted,
|
||||
"vectors": extracted_vector_info,
|
||||
}
|
||||
if include_vectors:
|
||||
result["vector_diagnostics"] = {
|
||||
"pages_with_vectors": vectors_extracted,
|
||||
"pages_with_drawings_skipped": len(vector_diagnostics),
|
||||
"pages_analyzed": len(pages_to_process),
|
||||
"skipped_pages": vector_diagnostics[:20],
|
||||
}
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
|
||||
1022
src/mcp_pdf/mixins_official/structure_detection.py
Normal file
1022
src/mcp_pdf/mixins_official/structure_detection.py
Normal file
File diff suppressed because it is too large
Load Diff
@ -24,6 +24,7 @@ from .mixins_official.security_analysis import SecurityAnalysisMixin
|
||||
from .mixins_official.content_analysis import ContentAnalysisMixin
|
||||
from .mixins_official.pdf_utilities import PDFUtilitiesMixin
|
||||
from .mixins_official.misc_tools import MiscToolsMixin
|
||||
from .mixins_official.structure_detection import StructureDetectionMixin
|
||||
from .mixins_official.permit_forms import PermitFormMixin
|
||||
|
||||
# Configure logging
|
||||
@ -80,6 +81,7 @@ class PDFServerOfficial:
|
||||
ContentAnalysisMixin,
|
||||
PDFUtilitiesMixin,
|
||||
MiscToolsMixin,
|
||||
StructureDetectionMixin,
|
||||
PermitFormMixin,
|
||||
]
|
||||
|
||||
@ -137,7 +139,8 @@ class PDFServerOfficial:
|
||||
"form_management": ["extract_form_data", "fill_form_pdf", "create_form_pdf"],
|
||||
"document_assembly": ["merge_pdfs", "split_pdf", "reorder_pdf_pages"],
|
||||
"annotations": ["add_sticky_notes", "add_highlights", "add_stamps", "extract_all_annotations"],
|
||||
"image_processing": ["extract_images", "pdf_to_markdown", "extract_vector_graphics"]
|
||||
"image_processing": ["extract_images", "pdf_to_markdown", "extract_vector_graphics"],
|
||||
"structure_detection": ["detect_structure", "split_pdf_by_structure", "batch_extract"]
|
||||
}
|
||||
}
|
||||
|
||||
@ -162,7 +165,7 @@ def main():
|
||||
from importlib.metadata import version
|
||||
package_version = version("mcp-pdf")
|
||||
except:
|
||||
package_version = "2.0.12"
|
||||
package_version = "2.1.0"
|
||||
|
||||
logger.info(f"🎬 MCP PDF Tools Server v{package_version} (Official Pattern)")
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user