From a23fd8467a44d307598b114f7d6d37895c589c54 Mon Sep 17 00:00:00 2001 From: Ryan Malloy Date: Wed, 4 Mar 2026 17:12:36 -0700 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=89=20File-first=20output=20for=20dete?= =?UTF-8?q?ct=5Fstructure=20=E2=80=94=2020=C3=97=20context=20reduction?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit detect_structure now writes full JSON to disk and returns a compact summary (~1k tokens) instead of the full structure tree (~20k tokens). Prevents MCP context overflow on large documents. Set inline=True to get full data in response (used internally by split_pdf_by_structure). --- CLAUDE.md | 2 +- pyproject.toml | 2 +- .../mixins_official/structure_detection.py | 103 +++++++++++++++--- uv.lock | 2 +- 4 files changed, 90 insertions(+), 19 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 238794e..db00a51 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -97,7 +97,7 @@ uv publish 8. **PDF Forms**: `extract_form_data`, `create_form_pdf`, `fill_form_pdf`, `add_form_fields` - Complete form lifecycle management 9. **Document Assembly**: `merge_pdfs`, `split_pdf_by_pages`, `reorder_pdf_pages` - PDF manipulation and organization 10. **Annotations & Markup**: `add_sticky_notes`, `add_highlights`, `add_stamps`, `add_video_notes`, `extract_all_annotations` - Collaboration and multimedia review tools -11. **Structure Detection**: `detect_structure`, `split_pdf_by_structure`, `batch_extract` - Chapter-aware document analysis and extraction. `detect_structure` finds headings via bookmarks, font-size heuristics, and numbering patterns. `split_pdf_by_structure` auto-splits into per-chapter directories with markdown + images. `batch_extract` processes user-specified page ranges in a single call (replaces 24+ individual tool calls). +11. **Structure Detection**: `detect_structure`, `split_pdf_by_structure`, `batch_extract` - Chapter-aware document analysis and extraction. `detect_structure` finds headings via bookmarks, font-size heuristics, and numbering patterns. Writes full structure to a JSON file by default, returns compact summary + path (~1k tokens vs ~20k inline). Set `inline=True` for full data in response. `split_pdf_by_structure` auto-splits into per-chapter directories with markdown + images. `batch_extract` processes user-specified page ranges in a single call (replaces 24+ individual tool calls). ### MCP Client-Friendly Design diff --git a/pyproject.toml b/pyproject.toml index 678c158..7aa8b25 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "mcp-pdf" -version = "2.1.1" +version = "2.1.2" description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more" authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}] readme = "README.md" diff --git a/src/mcp_pdf/mixins_official/structure_detection.py b/src/mcp_pdf/mixins_official/structure_detection.py index dfb9e75..97bc054 100644 --- a/src/mcp_pdf/mixins_official/structure_detection.py +++ b/src/mcp_pdf/mixins_official/structure_detection.py @@ -63,8 +63,9 @@ class StructureDetectionMixin(MCPMixin): description=( "Detect logical structure (chapters, sections, headings) of a PDF " "using bookmarks, font-size analysis, and numbering patterns. " - "Returns a hierarchical section tree and a flat boundary list " - "with confidence scores for each detected heading." + "By default writes full structure to a JSON file and returns a " + "compact summary with the file path. Set inline=True to return " + "the complete structure in the response (use for small documents)." ), ) async def detect_structure( @@ -75,6 +76,8 @@ class StructureDetectionMixin(MCPMixin): heading_pattern: Optional[str] = None, max_heading_levels: int = 3, min_confidence: float = 0.5, + output_directory: Optional[str] = None, + inline: bool = False, ) -> Dict[str, Any]: """ Detect logical document structure. @@ -91,10 +94,15 @@ class StructureDetectionMixin(MCPMixin): heading_pattern: Optional user-supplied regex for headings. max_heading_levels: Maximum heading depth to report (1-6). min_confidence: Drop boundaries below this confidence (0-1). + output_directory: Directory for the structure JSON file. + Defaults to the same directory as the PDF. + inline: If True, return full structure in the response instead + of writing to a file. Useful for small documents or internal + calls. Default: False. Returns: - Dict with success flag, hierarchical structure, flat boundaries, - detection metadata, and timing. + Dict with success flag, compact summary + file path (default), + or full hierarchical structure + flat boundaries (inline=True). """ start_time = time.time() @@ -214,20 +222,73 @@ class StructureDetectionMixin(MCPMixin): # Build hierarchical tree sections = self._boundaries_to_sections(flat_boundaries, total_pages) + detection_info = { + "strategies_used": strategies_used, + "bookmarks_found": bookmarks_found, + "body_font": body_font_info, + "heading_fonts": heading_font_info, + "total_pages": total_pages, + } + + full_structure = { + "sections": sections, + "flat_boundaries": flat_boundaries, + } + + elapsed = round(time.time() - start_time, 2) + + # ── Inline mode: return everything in the response ── + if inline: + return { + "success": True, + "structure": full_structure, + "detection_info": detection_info, + "detection_time": elapsed, + } + + # ── File-first mode (default): write JSON, return summary ── + if output_directory: + out_dir = Path(validate_output_path(output_directory)) + else: + out_dir = path.parent + + out_dir.mkdir(parents=True, exist_ok=True) + json_filename = f"{path.stem}_structure.json" + json_path = out_dir / json_filename + + full_result = { + "structure": full_structure, + "detection_info": detection_info, + "detection_time": elapsed, + } + json_path.write_text( + json.dumps(full_result, indent=2, ensure_ascii=False), + encoding="utf-8", + ) + + # Build compact summary: top-level sections with subsection counts + summary_sections = [] + for sec in sections: + sub_count = self._count_subsections(sec) + summary_sections.append({ + "title": sec["title"], + "level": sec["level"], + "pages": f"{sec['page_start']}-{sec['page_end']}", + "confidence": sec["confidence"], + "method": sec["detection_method"], + "subsections": sub_count, + }) + return { "success": True, - "structure": { - "sections": sections, - "flat_boundaries": flat_boundaries, + "output_file": str(json_path), + "summary": { + "total_boundaries": len(flat_boundaries), + "top_level_sections": len(sections), + "sections": summary_sections, }, - "detection_info": { - "strategies_used": strategies_used, - "bookmarks_found": bookmarks_found, - "body_font": body_font_info, - "heading_fonts": heading_font_info, - "total_pages": total_pages, - }, - "detection_time": round(time.time() - start_time, 2), + "detection_info": detection_info, + "detection_time": elapsed, } except Exception as e: @@ -645,6 +706,15 @@ class StructureDetectionMixin(MCPMixin): ) section["page_end"] = max(section["page_end"], child_max) + @staticmethod + def _count_subsections(section: Dict[str, Any]) -> int: + """Recursively count all subsections (direct + nested).""" + subs = section.get("subsections", []) + total = len(subs) + for sub in subs: + total += StructureDetectionMixin._count_subsections(sub) + return total + # ------------------------------------------------------------------ # Filesystem-safe name helper (for downstream splitting tools) # ------------------------------------------------------------------ @@ -725,12 +795,13 @@ class StructureDetectionMixin(MCPMixin): output_dir = Path(validate_output_path(output_directory)) output_dir.mkdir(parents=True, exist_ok=True) - # Step 1: Detect structure + # Step 1: Detect structure (inline=True for internal use) structure_result = await self.detect_structure( pdf_path=pdf_path, strategies=strategies, heading_pattern=heading_pattern, min_confidence=min_confidence, + inline=True, ) if not structure_result.get("success"): diff --git a/uv.lock b/uv.lock index fa2a051..31cc8dd 100644 --- a/uv.lock +++ b/uv.lock @@ -1032,7 +1032,7 @@ wheels = [ [[package]] name = "mcp-pdf" -version = "2.1.0" +version = "2.1.1" source = { editable = "." } dependencies = [ { name = "camelot-py", extra = ["cv"] },