📉 File-first output for detect_structure — 20× context reduction
detect_structure now writes the full JSON to disk and returns a compact summary (~1k tokens) instead of the full structure tree (~20k tokens). This prevents MCP context overflow on large documents. Set inline=True to get the full data in the response (used internally by split_pdf_by_structure).
This commit is contained in:
parent
56ab8356bc
commit
a23fd8467a
@ -97,7 +97,7 @@ uv publish
|
|||||||
8. **PDF Forms**: `extract_form_data`, `create_form_pdf`, `fill_form_pdf`, `add_form_fields` - Complete form lifecycle management
|
8. **PDF Forms**: `extract_form_data`, `create_form_pdf`, `fill_form_pdf`, `add_form_fields` - Complete form lifecycle management
|
||||||
9. **Document Assembly**: `merge_pdfs`, `split_pdf_by_pages`, `reorder_pdf_pages` - PDF manipulation and organization
|
9. **Document Assembly**: `merge_pdfs`, `split_pdf_by_pages`, `reorder_pdf_pages` - PDF manipulation and organization
|
||||||
10. **Annotations & Markup**: `add_sticky_notes`, `add_highlights`, `add_stamps`, `add_video_notes`, `extract_all_annotations` - Collaboration and multimedia review tools
|
10. **Annotations & Markup**: `add_sticky_notes`, `add_highlights`, `add_stamps`, `add_video_notes`, `extract_all_annotations` - Collaboration and multimedia review tools
|
||||||
11. **Structure Detection**: `detect_structure`, `split_pdf_by_structure`, `batch_extract` - Chapter-aware document analysis and extraction. `detect_structure` finds headings via bookmarks, font-size heuristics, and numbering patterns. `split_pdf_by_structure` auto-splits into per-chapter directories with markdown + images. `batch_extract` processes user-specified page ranges in a single call (replaces 24+ individual tool calls).
|
11. **Structure Detection**: `detect_structure`, `split_pdf_by_structure`, `batch_extract` - Chapter-aware document analysis and extraction. `detect_structure` finds headings via bookmarks, font-size heuristics, and numbering patterns. By default it writes the full structure to a JSON file and returns a compact summary plus the file path (~1k tokens vs ~20k inline). Set `inline=True` to get the full data in the response. `split_pdf_by_structure` auto-splits into per-chapter directories with markdown + images. `batch_extract` processes user-specified page ranges in a single call (replaces 24+ individual tool calls).
|
||||||
|
|
||||||
### MCP Client-Friendly Design
|
### MCP Client-Friendly Design
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
[project]
|
[project]
|
||||||
name = "mcp-pdf"
|
name = "mcp-pdf"
|
||||||
version = "2.1.1"
|
version = "2.1.2"
|
||||||
description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more"
|
description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more"
|
||||||
authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}]
|
authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}]
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
|
|||||||
@ -63,8 +63,9 @@ class StructureDetectionMixin(MCPMixin):
|
|||||||
description=(
|
description=(
|
||||||
"Detect logical structure (chapters, sections, headings) of a PDF "
|
"Detect logical structure (chapters, sections, headings) of a PDF "
|
||||||
"using bookmarks, font-size analysis, and numbering patterns. "
|
"using bookmarks, font-size analysis, and numbering patterns. "
|
||||||
"Returns a hierarchical section tree and a flat boundary list "
|
"By default writes full structure to a JSON file and returns a "
|
||||||
"with confidence scores for each detected heading."
|
"compact summary with the file path. Set inline=True to return "
|
||||||
|
"the complete structure in the response (use for small documents)."
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
async def detect_structure(
|
async def detect_structure(
|
||||||
@ -75,6 +76,8 @@ class StructureDetectionMixin(MCPMixin):
|
|||||||
heading_pattern: Optional[str] = None,
|
heading_pattern: Optional[str] = None,
|
||||||
max_heading_levels: int = 3,
|
max_heading_levels: int = 3,
|
||||||
min_confidence: float = 0.5,
|
min_confidence: float = 0.5,
|
||||||
|
output_directory: Optional[str] = None,
|
||||||
|
inline: bool = False,
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Detect logical document structure.
|
Detect logical document structure.
|
||||||
@ -91,10 +94,15 @@ class StructureDetectionMixin(MCPMixin):
|
|||||||
heading_pattern: Optional user-supplied regex for headings.
|
heading_pattern: Optional user-supplied regex for headings.
|
||||||
max_heading_levels: Maximum heading depth to report (1-6).
|
max_heading_levels: Maximum heading depth to report (1-6).
|
||||||
min_confidence: Drop boundaries below this confidence (0-1).
|
min_confidence: Drop boundaries below this confidence (0-1).
|
||||||
|
output_directory: Directory for the structure JSON file.
|
||||||
|
Defaults to the same directory as the PDF.
|
||||||
|
inline: If True, return full structure in the response instead
|
||||||
|
of writing to a file. Useful for small documents or internal
|
||||||
|
calls. Default: False.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dict with success flag, hierarchical structure, flat boundaries,
|
Dict with success flag, compact summary + file path (default),
|
||||||
detection metadata, and timing.
|
or full hierarchical structure + flat boundaries (inline=True).
|
||||||
"""
|
"""
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
@ -214,20 +222,73 @@ class StructureDetectionMixin(MCPMixin):
|
|||||||
# Build hierarchical tree
|
# Build hierarchical tree
|
||||||
sections = self._boundaries_to_sections(flat_boundaries, total_pages)
|
sections = self._boundaries_to_sections(flat_boundaries, total_pages)
|
||||||
|
|
||||||
|
detection_info = {
|
||||||
|
"strategies_used": strategies_used,
|
||||||
|
"bookmarks_found": bookmarks_found,
|
||||||
|
"body_font": body_font_info,
|
||||||
|
"heading_fonts": heading_font_info,
|
||||||
|
"total_pages": total_pages,
|
||||||
|
}
|
||||||
|
|
||||||
|
full_structure = {
|
||||||
|
"sections": sections,
|
||||||
|
"flat_boundaries": flat_boundaries,
|
||||||
|
}
|
||||||
|
|
||||||
|
elapsed = round(time.time() - start_time, 2)
|
||||||
|
|
||||||
|
# ── Inline mode: return everything in the response ──
|
||||||
|
if inline:
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"structure": full_structure,
|
||||||
|
"detection_info": detection_info,
|
||||||
|
"detection_time": elapsed,
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── File-first mode (default): write JSON, return summary ──
|
||||||
|
if output_directory:
|
||||||
|
out_dir = Path(validate_output_path(output_directory))
|
||||||
|
else:
|
||||||
|
out_dir = path.parent
|
||||||
|
|
||||||
|
out_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
json_filename = f"{path.stem}_structure.json"
|
||||||
|
json_path = out_dir / json_filename
|
||||||
|
|
||||||
|
full_result = {
|
||||||
|
"structure": full_structure,
|
||||||
|
"detection_info": detection_info,
|
||||||
|
"detection_time": elapsed,
|
||||||
|
}
|
||||||
|
json_path.write_text(
|
||||||
|
json.dumps(full_result, indent=2, ensure_ascii=False),
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Build compact summary: top-level sections with subsection counts
|
||||||
|
summary_sections = []
|
||||||
|
for sec in sections:
|
||||||
|
sub_count = self._count_subsections(sec)
|
||||||
|
summary_sections.append({
|
||||||
|
"title": sec["title"],
|
||||||
|
"level": sec["level"],
|
||||||
|
"pages": f"{sec['page_start']}-{sec['page_end']}",
|
||||||
|
"confidence": sec["confidence"],
|
||||||
|
"method": sec["detection_method"],
|
||||||
|
"subsections": sub_count,
|
||||||
|
})
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"success": True,
|
"success": True,
|
||||||
"structure": {
|
"output_file": str(json_path),
|
||||||
"sections": sections,
|
"summary": {
|
||||||
"flat_boundaries": flat_boundaries,
|
"total_boundaries": len(flat_boundaries),
|
||||||
|
"top_level_sections": len(sections),
|
||||||
|
"sections": summary_sections,
|
||||||
},
|
},
|
||||||
"detection_info": {
|
"detection_info": detection_info,
|
||||||
"strategies_used": strategies_used,
|
"detection_time": elapsed,
|
||||||
"bookmarks_found": bookmarks_found,
|
|
||||||
"body_font": body_font_info,
|
|
||||||
"heading_fonts": heading_font_info,
|
|
||||||
"total_pages": total_pages,
|
|
||||||
},
|
|
||||||
"detection_time": round(time.time() - start_time, 2),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -645,6 +706,15 @@ class StructureDetectionMixin(MCPMixin):
|
|||||||
)
|
)
|
||||||
section["page_end"] = max(section["page_end"], child_max)
|
section["page_end"] = max(section["page_end"], child_max)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _count_subsections(section: Dict[str, Any]) -> int:
|
||||||
|
"""Recursively count all subsections (direct + nested)."""
|
||||||
|
subs = section.get("subsections", [])
|
||||||
|
total = len(subs)
|
||||||
|
for sub in subs:
|
||||||
|
total += StructureDetectionMixin._count_subsections(sub)
|
||||||
|
return total
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
# Filesystem-safe name helper (for downstream splitting tools)
|
# Filesystem-safe name helper (for downstream splitting tools)
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
@ -725,12 +795,13 @@ class StructureDetectionMixin(MCPMixin):
|
|||||||
output_dir = Path(validate_output_path(output_directory))
|
output_dir = Path(validate_output_path(output_directory))
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
# Step 1: Detect structure
|
# Step 1: Detect structure (inline=True for internal use)
|
||||||
structure_result = await self.detect_structure(
|
structure_result = await self.detect_structure(
|
||||||
pdf_path=pdf_path,
|
pdf_path=pdf_path,
|
||||||
strategies=strategies,
|
strategies=strategies,
|
||||||
heading_pattern=heading_pattern,
|
heading_pattern=heading_pattern,
|
||||||
min_confidence=min_confidence,
|
min_confidence=min_confidence,
|
||||||
|
inline=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
if not structure_result.get("success"):
|
if not structure_result.get("success"):
|
||||||
|
|||||||
2
uv.lock
generated
2
uv.lock
generated
@ -1032,7 +1032,7 @@ wheels = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "mcp-pdf"
|
name = "mcp-pdf"
|
||||||
version = "2.1.0"
|
version = "2.1.1"
|
||||||
source = { editable = "." }
|
source = { editable = "." }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "camelot-py", extra = ["cv"] },
|
{ name = "camelot-py", extra = ["cv"] },
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user