📉 File-first output for detect_structure — 20× context reduction

`detect_structure` now writes the full structure JSON to disk and returns a compact summary (~1k tokens) instead of the full structure tree (~20k tokens), which prevents MCP context overflow on large documents. Set `inline=True` to get the full data in the response (used internally by `split_pdf_by_structure`).
2026-03-04 17:12:36 -07:00 · 2026-03-04 17:12:36 -07:00 · a23fd8467a
commit a23fd8467a
parent 56ab8356bc
4 changed files with 90 additions and 19 deletions
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -97,7 +97,7 @@ uv publish
 8. **PDF Forms**: `extract_form_data`, `create_form_pdf`, `fill_form_pdf`, `add_form_fields` - Complete form lifecycle management
 9. **Document Assembly**: `merge_pdfs`, `split_pdf_by_pages`, `reorder_pdf_pages` - PDF manipulation and organization
 10. **Annotations & Markup**: `add_sticky_notes`, `add_highlights`, `add_stamps`, `add_video_notes`, `extract_all_annotations` - Collaboration and multimedia review tools
-11. **Structure Detection**: `detect_structure`, `split_pdf_by_structure`, `batch_extract` - Chapter-aware document analysis and extraction. `detect_structure` finds headings via bookmarks, font-size heuristics, and numbering patterns. `split_pdf_by_structure` auto-splits into per-chapter directories with markdown + images. `batch_extract` processes user-specified page ranges in a single call (replaces 24+ individual tool calls).
+11. **Structure Detection**: `detect_structure`, `split_pdf_by_structure`, `batch_extract` - Chapter-aware document analysis and extraction. `detect_structure` finds headings via bookmarks, font-size heuristics, and numbering patterns. By default it writes the full structure to a JSON file and returns a compact summary plus the file path (~1k tokens vs ~20k inline); set `inline=True` to return the full data in the response instead. `split_pdf_by_structure` auto-splits into per-chapter directories with markdown + images. `batch_extract` processes user-specified page ranges in a single call (replaces 24+ individual tool calls).

 ### MCP Client-Friendly Design

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "mcp-pdf"
-version = "2.1.1"
+version = "2.1.2"
 description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more"
 authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}]
 readme = "README.md"
--- a/src/mcp_pdf/mixins_official/structure_detection.py
+++ b/src/mcp_pdf/mixins_official/structure_detection.py
@@ -63,8 +63,9 @@ class StructureDetectionMixin(MCPMixin):
        description=(
            "Detect logical structure (chapters, sections, headings) of a PDF "
            "using bookmarks, font-size analysis, and numbering patterns. "
-            "Returns a hierarchical section tree and a flat boundary list "
-            "with confidence scores for each detected heading."
+            "By default writes full structure to a JSON file and returns a "
+            "compact summary with the file path. Set inline=True to return "
+            "the complete structure in the response (use for small documents)."
        ),
    )
    async def detect_structure(
@@ -75,6 +76,8 @@ class StructureDetectionMixin(MCPMixin):
        heading_pattern: Optional[str] = None,
        max_heading_levels: int = 3,
        min_confidence: float = 0.5,
+        output_directory: Optional[str] = None,
+        inline: bool = False,
    ) -> Dict[str, Any]:
        """
        Detect logical document structure.
@@ -91,10 +94,15 @@ class StructureDetectionMixin(MCPMixin):
            heading_pattern: Optional user-supplied regex for headings.
            max_heading_levels: Maximum heading depth to report (1-6).
            min_confidence: Drop boundaries below this confidence (0-1).
+            output_directory: Directory for the structure JSON file.
+                Defaults to the same directory as the PDF.
+            inline: If True, return full structure in the response instead
+                of writing to a file. Useful for small documents or internal
+                calls. Default: False.

        Returns:
-            Dict with success flag, hierarchical structure, flat boundaries,
-            detection metadata, and timing.
+            Dict with success flag, compact summary + file path (default),
+            or full hierarchical structure + flat boundaries (inline=True).
        """
        start_time = time.time()

@@ -214,20 +222,73 @@ class StructureDetectionMixin(MCPMixin):
            # Build hierarchical tree
            sections = self._boundaries_to_sections(flat_boundaries, total_pages)

+            detection_info = {
+                "strategies_used": strategies_used,
+                "bookmarks_found": bookmarks_found,
+                "body_font": body_font_info,
+                "heading_fonts": heading_font_info,
+                "total_pages": total_pages,
+            }
+
+            full_structure = {
+                "sections": sections,
+                "flat_boundaries": flat_boundaries,
+            }
+
+            elapsed = round(time.time() - start_time, 2)
+
+            # ── Inline mode: return everything in the response ──
+            if inline:
+                return {
+                    "success": True,
+                    "structure": full_structure,
+                    "detection_info": detection_info,
+                    "detection_time": elapsed,
+                }
+
+            # ── File-first mode (default): write JSON, return summary ──
+            if output_directory:
+                out_dir = Path(validate_output_path(output_directory))
+            else:
+                out_dir = path.parent
+
+            out_dir.mkdir(parents=True, exist_ok=True)
+            json_filename = f"{path.stem}_structure.json"
+            json_path = out_dir / json_filename
+
+            full_result = {
+                "structure": full_structure,
+                "detection_info": detection_info,
+                "detection_time": elapsed,
+            }
+            json_path.write_text(
+                json.dumps(full_result, indent=2, ensure_ascii=False),
+                encoding="utf-8",
+            )
+
+            # Build compact summary: top-level sections with subsection counts
+            summary_sections = []
+            for sec in sections:
+                sub_count = self._count_subsections(sec)
+                summary_sections.append({
+                    "title": sec["title"],
+                    "level": sec["level"],
+                    "pages": f"{sec['page_start']}-{sec['page_end']}",
+                    "confidence": sec["confidence"],
+                    "method": sec["detection_method"],
+                    "subsections": sub_count,
+                })
+
            return {
                "success": True,
-                "structure": {
-                    "sections": sections,
-                    "flat_boundaries": flat_boundaries,
+                "output_file": str(json_path),
+                "summary": {
+                    "total_boundaries": len(flat_boundaries),
+                    "top_level_sections": len(sections),
+                    "sections": summary_sections,
                },
-                "detection_info": {
-                    "strategies_used": strategies_used,
-                    "bookmarks_found": bookmarks_found,
-                    "body_font": body_font_info,
-                    "heading_fonts": heading_font_info,
-                    "total_pages": total_pages,
-                },
-                "detection_time": round(time.time() - start_time, 2),
+                "detection_info": detection_info,
+                "detection_time": elapsed,
            }

        except Exception as e:
@@ -645,6 +706,15 @@ class StructureDetectionMixin(MCPMixin):
                )
                section["page_end"] = max(section["page_end"], child_max)

+    @staticmethod
+    def _count_subsections(section: Dict[str, Any]) -> int:
+        """Recursively count all subsections (direct + nested)."""
+        subs = section.get("subsections", [])
+        total = len(subs)
+        for sub in subs:
+            total += StructureDetectionMixin._count_subsections(sub)
+        return total
+
    # ------------------------------------------------------------------
    # Filesystem-safe name helper (for downstream splitting tools)
    # ------------------------------------------------------------------
@@ -725,12 +795,13 @@ class StructureDetectionMixin(MCPMixin):
            output_dir = Path(validate_output_path(output_directory))
            output_dir.mkdir(parents=True, exist_ok=True)

-            # Step 1: Detect structure
+            # Step 1: Detect structure (inline=True for internal use)
            structure_result = await self.detect_structure(
                pdf_path=pdf_path,
                strategies=strategies,
                heading_pattern=heading_pattern,
                min_confidence=min_confidence,
+                inline=True,
            )

            if not structure_result.get("success"):
--- a/uv.lock
+++ b/uv.lock
@@ -1032,7 +1032,7 @@ wheels = [

 [[package]]
 name = "mcp-pdf"
-version = "2.1.0"
+version = "2.1.1"
 source = { editable = "." }
 dependencies = [
    { name = "camelot-py", extra = ["cv"] },