📉 File-first output for detect_structure — 20× context reduction

detect_structure now writes full JSON to disk and returns a compact
summary (~1k tokens) instead of the full structure tree (~20k tokens).
Prevents MCP context overflow on large documents. Set inline=True to
get full data in response (used internally by split_pdf_by_structure).
This commit is contained in:
Ryan Malloy 2026-03-04 17:12:36 -07:00
parent 56ab8356bc
commit a23fd8467a
4 changed files with 90 additions and 19 deletions

View File

@ -97,7 +97,7 @@ uv publish
8. **PDF Forms**: `extract_form_data`, `create_form_pdf`, `fill_form_pdf`, `add_form_fields` - Complete form lifecycle management 8. **PDF Forms**: `extract_form_data`, `create_form_pdf`, `fill_form_pdf`, `add_form_fields` - Complete form lifecycle management
9. **Document Assembly**: `merge_pdfs`, `split_pdf_by_pages`, `reorder_pdf_pages` - PDF manipulation and organization 9. **Document Assembly**: `merge_pdfs`, `split_pdf_by_pages`, `reorder_pdf_pages` - PDF manipulation and organization
10. **Annotations & Markup**: `add_sticky_notes`, `add_highlights`, `add_stamps`, `add_video_notes`, `extract_all_annotations` - Collaboration and multimedia review tools 10. **Annotations & Markup**: `add_sticky_notes`, `add_highlights`, `add_stamps`, `add_video_notes`, `extract_all_annotations` - Collaboration and multimedia review tools
11. **Structure Detection**: `detect_structure`, `split_pdf_by_structure`, `batch_extract` - Chapter-aware document analysis and extraction. `detect_structure` finds headings via bookmarks, font-size heuristics, and numbering patterns. `split_pdf_by_structure` auto-splits into per-chapter directories with markdown + images. `batch_extract` processes user-specified page ranges in a single call (replaces 24+ individual tool calls). 11. **Structure Detection**: `detect_structure`, `split_pdf_by_structure`, `batch_extract` - Chapter-aware document analysis and extraction. `detect_structure` finds headings via bookmarks, font-size heuristics, and numbering patterns. Writes full structure to a JSON file by default, returns compact summary + path (~1k tokens vs ~20k inline). Set `inline=True` for full data in response. `split_pdf_by_structure` auto-splits into per-chapter directories with markdown + images. `batch_extract` processes user-specified page ranges in a single call (replaces 24+ individual tool calls).
### MCP Client-Friendly Design ### MCP Client-Friendly Design

View File

@ -1,6 +1,6 @@
[project] [project]
name = "mcp-pdf" name = "mcp-pdf"
version = "2.1.1" version = "2.1.2"
description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more" description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more"
authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}] authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}]
readme = "README.md" readme = "README.md"

View File

@ -63,8 +63,9 @@ class StructureDetectionMixin(MCPMixin):
description=( description=(
"Detect logical structure (chapters, sections, headings) of a PDF " "Detect logical structure (chapters, sections, headings) of a PDF "
"using bookmarks, font-size analysis, and numbering patterns. " "using bookmarks, font-size analysis, and numbering patterns. "
"Returns a hierarchical section tree and a flat boundary list " "By default writes full structure to a JSON file and returns a "
"with confidence scores for each detected heading." "compact summary with the file path. Set inline=True to return "
"the complete structure in the response (use for small documents)."
), ),
) )
async def detect_structure( async def detect_structure(
@ -75,6 +76,8 @@ class StructureDetectionMixin(MCPMixin):
heading_pattern: Optional[str] = None, heading_pattern: Optional[str] = None,
max_heading_levels: int = 3, max_heading_levels: int = 3,
min_confidence: float = 0.5, min_confidence: float = 0.5,
output_directory: Optional[str] = None,
inline: bool = False,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
""" """
Detect logical document structure. Detect logical document structure.
@ -91,10 +94,15 @@ class StructureDetectionMixin(MCPMixin):
heading_pattern: Optional user-supplied regex for headings. heading_pattern: Optional user-supplied regex for headings.
max_heading_levels: Maximum heading depth to report (1-6). max_heading_levels: Maximum heading depth to report (1-6).
min_confidence: Drop boundaries below this confidence (0-1). min_confidence: Drop boundaries below this confidence (0-1).
output_directory: Directory for the structure JSON file.
Defaults to the same directory as the PDF.
inline: If True, return full structure in the response instead
of writing to a file. Useful for small documents or internal
calls. Default: False.
Returns: Returns:
Dict with success flag, hierarchical structure, flat boundaries, Dict with success flag, compact summary + file path (default),
detection metadata, and timing. or full hierarchical structure + flat boundaries (inline=True).
""" """
start_time = time.time() start_time = time.time()
@ -214,20 +222,73 @@ class StructureDetectionMixin(MCPMixin):
# Build hierarchical tree # Build hierarchical tree
sections = self._boundaries_to_sections(flat_boundaries, total_pages) sections = self._boundaries_to_sections(flat_boundaries, total_pages)
detection_info = {
"strategies_used": strategies_used,
"bookmarks_found": bookmarks_found,
"body_font": body_font_info,
"heading_fonts": heading_font_info,
"total_pages": total_pages,
}
full_structure = {
"sections": sections,
"flat_boundaries": flat_boundaries,
}
elapsed = round(time.time() - start_time, 2)
# ── Inline mode: return everything in the response ──
if inline:
return {
"success": True,
"structure": full_structure,
"detection_info": detection_info,
"detection_time": elapsed,
}
# ── File-first mode (default): write JSON, return summary ──
if output_directory:
out_dir = Path(validate_output_path(output_directory))
else:
out_dir = path.parent
out_dir.mkdir(parents=True, exist_ok=True)
json_filename = f"{path.stem}_structure.json"
json_path = out_dir / json_filename
full_result = {
"structure": full_structure,
"detection_info": detection_info,
"detection_time": elapsed,
}
json_path.write_text(
json.dumps(full_result, indent=2, ensure_ascii=False),
encoding="utf-8",
)
# Build compact summary: top-level sections with subsection counts
summary_sections = []
for sec in sections:
sub_count = self._count_subsections(sec)
summary_sections.append({
"title": sec["title"],
"level": sec["level"],
"pages": f"{sec['page_start']}-{sec['page_end']}",
"confidence": sec["confidence"],
"method": sec["detection_method"],
"subsections": sub_count,
})
return { return {
"success": True, "success": True,
"structure": { "output_file": str(json_path),
"sections": sections, "summary": {
"flat_boundaries": flat_boundaries, "total_boundaries": len(flat_boundaries),
"top_level_sections": len(sections),
"sections": summary_sections,
}, },
"detection_info": { "detection_info": detection_info,
"strategies_used": strategies_used, "detection_time": elapsed,
"bookmarks_found": bookmarks_found,
"body_font": body_font_info,
"heading_fonts": heading_font_info,
"total_pages": total_pages,
},
"detection_time": round(time.time() - start_time, 2),
} }
except Exception as e: except Exception as e:
@ -645,6 +706,15 @@ class StructureDetectionMixin(MCPMixin):
) )
section["page_end"] = max(section["page_end"], child_max) section["page_end"] = max(section["page_end"], child_max)
@staticmethod
def _count_subsections(section: Dict[str, Any]) -> int:
    """Return the number of descendant subsections of *section*.

    Walks the ``"subsections"`` lists with an explicit stack and counts
    the children found at every nesting depth. A node that has no
    ``"subsections"`` key contributes zero.
    """
    descendant_count = 0
    pending = [section]
    while pending:
        node = pending.pop()
        children = node.get("subsections", [])
        descendant_count += len(children)
        # Queue the children so their own subsections are counted too.
        pending.extend(children)
    return descendant_count
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Filesystem-safe name helper (for downstream splitting tools) # Filesystem-safe name helper (for downstream splitting tools)
# ------------------------------------------------------------------ # ------------------------------------------------------------------
@ -725,12 +795,13 @@ class StructureDetectionMixin(MCPMixin):
output_dir = Path(validate_output_path(output_directory)) output_dir = Path(validate_output_path(output_directory))
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
# Step 1: Detect structure # Step 1: Detect structure (inline=True for internal use)
structure_result = await self.detect_structure( structure_result = await self.detect_structure(
pdf_path=pdf_path, pdf_path=pdf_path,
strategies=strategies, strategies=strategies,
heading_pattern=heading_pattern, heading_pattern=heading_pattern,
min_confidence=min_confidence, min_confidence=min_confidence,
inline=True,
) )
if not structure_result.get("success"): if not structure_result.get("success"):

2
uv.lock generated
View File

@ -1032,7 +1032,7 @@ wheels = [
[[package]] [[package]]
name = "mcp-pdf" name = "mcp-pdf"
version = "2.1.0" version = "2.1.1"
source = { editable = "." } source = { editable = "." }
dependencies = [ dependencies = [
{ name = "camelot-py", extra = ["cv"] }, { name = "camelot-py", extra = ["cv"] },