"""
Structure Detection Mixin - Detect document structure via bookmarks, font analysis,
and numbering/regex patterns. Produces hierarchical section trees and flat boundary
lists suitable for downstream splitting and batch extraction.

Uses official fastmcp.contrib.mcp_mixin pattern.
"""

import json
import re
import time
import logging
from collections import defaultdict
from pathlib import Path
from typing import Dict, Any, Optional, List, Tuple

import fitz  # PyMuPDF

from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool

from ..security import validate_pdf_path, validate_output_path, sanitize_error_message
from .utils import parse_pages_parameter
from .image_processing import ImageProcessingMixin

logger = logging.getLogger(__name__)

# Common section-heading patterns, each paired with the heading level it implies.
# Entries are (regex, level) tuples. The detectors in this module apply them with
# re.IGNORECASE and stop at the first pattern that matches, so order matters:
# keyword forms come first and the deeper numbered form (1.2.3) precedes the
# shallower ones (1.2, 1.).
# NOTE(review): because matching is case-insensitive and the roman-numeral class
# has no trailing word boundary, a line such as "part in ..." also matches
# (the "i" is accepted as a numeral) — consider appending \b if this is noisy.
_NUMBERING_PATTERNS = [
    # "Chapter 1", "CHAPTER IV"
    (r"^(?:chapter|ch\.?)\s+(?:\d+|[IVXLCDM]+)", 1),
    # "Part 1", "PART III"
    (r"^(?:part)\s+(?:\d+|[IVXLCDM]+)", 1),
    # "ANNEX A", "Annex 1"
    (r"^(?:annex|appendix)\s+[A-Z0-9]+", 1),
    # "Section 2.3"
    (r"^(?:section)\s+\d+(?:\.\d+)*", 2),
    # "1.2.3 Title text" (numbered headings like 1., 1.2, 1.2.3);
    # each requires at least one non-space character after the number
    (r"^\d+\.\d+\.\d+(?:\.\d+)*\s+\S", 3),
    (r"^\d+\.\d+\s+\S", 2),
    (r"^\d+\.\s+\S", 1),
]


class StructureDetectionMixin(MCPMixin):
    """
    Detects document structure from bookmarks, font-size analysis, and
    numbering/regex patterns.  Produces a hierarchical section tree and a
    flat boundary list that downstream tools (split_pdf_by_structure,
    batch_extract) can consume directly.

    Uses the official FastMCP mixin pattern.
    """

    def __init__(self):
        """Initialise the mixin by delegating to the ``MCPMixin`` constructor."""
        # No state of its own is set up here.
        super().__init__()

    # ------------------------------------------------------------------
    # Public MCP tool
    # ------------------------------------------------------------------

    @mcp_tool(
        name="detect_structure",
        description=(
            "Detect logical structure (chapters, sections, headings) of a PDF "
            "using bookmarks, font-size analysis, and numbering patterns. "
            "By default writes full structure to a JSON file and returns a "
            "compact summary with the file path. Set inline=True to return "
            "the complete structure in the response (use for small documents)."
        ),
    )
    async def detect_structure(
        self,
        pdf_path: str,
        pages: Optional[str] = None,
        strategies: str = "auto",
        heading_pattern: Optional[str] = None,
        max_heading_levels: int = 3,
        min_confidence: float = 0.5,
        output_directory: Optional[str] = None,
        inline: bool = False,
    ) -> Dict[str, Any]:
        """
        Detect logical document structure.

        Args:
            pdf_path: Path to PDF file or HTTPS URL.
            pages: Pages to analyse (comma-separated, 1-based). None = all.
                If none of the requested pages exist in the document, all
                pages are analysed. Bookmark detection always reads the
                whole table of contents regardless of this selection.
            strategies: Detection strategy (case-insensitive) —
                "auto"      run bookmarks, fonts and numbering, then merge.
                "bookmarks" bookmarks only.
                "fonts"     font-size heuristic only.
                "numbering" regex / numbering patterns only.
                "all"       run every strategy and merge (same as "auto").
            heading_pattern: Optional user-supplied regex for headings. It is
                applied in addition to whatever ``strategies`` selects.
            max_heading_levels: Maximum heading depth to report (clamped to 1-6).
            min_confidence: Drop boundaries below this confidence (0-1).
            output_directory: Directory for the structure JSON file.
                Defaults to the same directory as the PDF.
            inline: If True, return full structure in the response instead
                of writing to a file. Useful for small documents or internal
                calls. Default: False.

        Returns:
            Dict with ``success`` and ``detection_time`` plus either
            ``structure`` / ``detection_info`` (inline=True) or a compact
            summary with ``output_file`` and ``preview`` (default). On any
            failure ``success`` is False and ``error`` holds a sanitized message.
        """
        start_time = time.time()

        try:
            path = await validate_pdf_path(pdf_path)
            doc = fitz.open(str(path))

            # Everything that needs the open document happens inside this
            # try/finally so the handle is released even when a step raises
            # (e.g. a malformed ``pages`` string).
            try:
                total_pages = len(doc)

                # Determine which pages to process
                parsed_pages = parse_pages_parameter(pages)
                if parsed_pages is not None:
                    pages_to_process = sorted(
                        p for p in parsed_pages if 0 <= p < total_pages
                    )
                else:
                    pages_to_process = list(range(total_pages))

                # An empty selection (all requested pages out of range)
                # falls back to the whole document.
                if not pages_to_process:
                    pages_to_process = list(range(total_pages))

                max_heading_levels = max(1, min(6, max_heading_levels))

                # Collect detections per strategy
                all_detections: List[List[Dict[str, Any]]] = []
                strategies_used: List[str] = []
                bookmarks_found = 0
                body_font_info: Dict[str, Any] = {}
                heading_font_info: Dict[int, Dict[str, Any]] = {}

                strategies_lower = strategies.strip().lower()

                # Each strategy is best-effort: a failure is logged and the
                # remaining strategies still run.

                # --- Bookmarks ---
                run_bookmarks = strategies_lower in ("auto", "bookmarks", "all")
                bookmark_detections: List[Dict[str, Any]] = []
                if run_bookmarks:
                    try:
                        bookmark_detections = self._detect_by_bookmarks(doc)
                        bookmarks_found = len(bookmark_detections)
                        if bookmark_detections:
                            strategies_used.append("bookmarks")
                            all_detections.append(bookmark_detections)
                    except Exception as exc:
                        logger.warning("Bookmark detection failed: %s", exc)

                # --- Fonts ---
                run_fonts = strategies_lower in ("auto", "fonts", "all")
                if run_fonts:
                    try:
                        font_detections, body_info, heading_info = (
                            self._detect_by_fonts(
                                doc, pages_to_process, max_heading_levels
                            )
                        )
                        body_font_info = body_info
                        heading_font_info = heading_info
                        if font_detections:
                            strategies_used.append("fonts")
                            all_detections.append(font_detections)
                    except Exception as exc:
                        logger.warning("Font-based detection failed: %s", exc)

                # --- Numbering / built-in patterns ---
                run_numbering = strategies_lower in ("auto", "numbering", "all")
                if run_numbering:
                    try:
                        numbering_detections = self._detect_by_numbering(
                            doc, pages_to_process
                        )
                        if numbering_detections:
                            strategies_used.append("numbering")
                            all_detections.append(numbering_detections)
                    except Exception as exc:
                        logger.warning("Numbering detection failed: %s", exc)

                # --- User-supplied regex ---
                if heading_pattern:
                    try:
                        user_detections = self._detect_by_pattern(
                            doc, pages_to_process, heading_pattern
                        )
                        if user_detections:
                            strategies_used.append("user_regex")
                            all_detections.append(user_detections)
                    except Exception as exc:
                        logger.warning("User-regex detection failed: %s", exc)
            finally:
                doc.close()

            # Merge all detections. There is no special-casing per strategy
            # here: the merge step raises the confidence of any heading that
            # more than one detection agrees on.
            merged = self._merge_detections(*all_detections)

            # Filter by min_confidence and max_heading_levels
            filtered = [
                b for b in merged
                if b["confidence"] >= min_confidence
                and b["level"] <= max_heading_levels
            ]

            # Sort by page, then by the detector-supplied in-page position
            filtered.sort(key=lambda b: (b["page"], b.get("_sort_y", 0)))

            # Strip internal sort keys
            flat_boundaries = []
            for b in filtered:
                entry = {
                    "title": b["title"],
                    "level": b["level"],
                    "page": b["page"],
                    "confidence": round(b["confidence"], 3),
                    "detection_method": b["detection_method"],
                }
                flat_boundaries.append(entry)

            # Build hierarchical tree
            sections = self._boundaries_to_sections(flat_boundaries, total_pages)

            detection_info = {
                "strategies_used": strategies_used,
                "bookmarks_found": bookmarks_found,
                "body_font": body_font_info,
                "heading_fonts": heading_font_info,
                "total_pages": total_pages,
            }

            full_structure = {
                "sections": sections,
                "flat_boundaries": flat_boundaries,
            }

            elapsed = round(time.time() - start_time, 2)

            # ── Inline mode: return everything in the response ──
            if inline:
                return {
                    "success": True,
                    "structure": full_structure,
                    "detection_info": detection_info,
                    "detection_time": elapsed,
                }

            # ── File-first mode (default): write JSON, return summary ──
            if output_directory:
                out_dir = Path(validate_output_path(output_directory))
            else:
                out_dir = path.parent

            out_dir.mkdir(parents=True, exist_ok=True)
            json_filename = f"{path.stem}_structure.json"
            json_path = out_dir / json_filename

            full_result = {
                "structure": full_structure,
                "detection_info": detection_info,
                "detection_time": elapsed,
            }
            json_path.write_text(
                json.dumps(full_result, indent=2, ensure_ascii=False),
                encoding="utf-8",
            )

            # Build compact preview lines of the form "p1-30: Title (5 sub)",
            # limited to the first few top-level sections.
            max_preview = 10
            preview_lines = []
            for sec in sections[:max_preview]:
                sub_count = self._count_subsections(sec)
                sub_info = f" ({sub_count} sub)" if sub_count else ""
                preview_lines.append(
                    f"p{sec['page_start']}-{sec['page_end']}: "
                    f"{sec['title'][:60]}{sub_info}"
                )
            if len(sections) > max_preview:
                preview_lines.append(
                    f"... and {len(sections) - max_preview} more sections"
                )

            return {
                "success": True,
                "output_file": str(json_path),
                "total_boundaries": len(flat_boundaries),
                "top_level_sections": len(sections),
                "strategies_used": strategies_used,
                "total_pages": total_pages,
                "preview": preview_lines,
                "detection_time": elapsed,
            }

        except Exception as e:
            # Tool boundary: report the failure to the caller instead of raising.
            error_msg = sanitize_error_message(str(e))
            logger.error("Structure detection failed: %s", error_msg)
            return {
                "success": False,
                "error": error_msg,
                "detection_time": round(time.time() - start_time, 2),
            }

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _detect_by_bookmarks(
        self, doc: fitz.Document
    ) -> List[Dict[str, Any]]:
        """Extract boundaries from PDF bookmarks / table of contents."""
        toc = doc.get_toc()
        boundaries: List[Dict[str, Any]] = []
        for level, title, page_num in toc:
            title_clean = title.strip()
            if not title_clean:
                continue
            boundaries.append(
                {
                    "title": title_clean,
                    "level": level,
                    "page": page_num,  # 1-based from fitz
                    "confidence": 0.95,
                    "detection_method": "bookmarks",
                    "_sort_y": 0,
                }
            )
        return boundaries

    def _detect_by_fonts(
        self,
        doc: fitz.Document,
        pages_to_process: List[int],
        max_levels: int,
    ) -> Tuple[List[Dict[str, Any]], Dict[str, Any], Dict[int, Dict[str, Any]]]:
        """
        Detect headings by font-size histogram analysis.

        Returns (boundaries, body_font_info, heading_font_map).
        """
        # Pass 1: build a histogram of font sizes weighted by character count
        size_char_count: Dict[float, int] = defaultdict(int)
        size_font_name: Dict[float, str] = {}

        for page_idx in pages_to_process:
            page = doc[page_idx]
            text_dict = page.get_text("dict")
            for block in text_dict.get("blocks", []):
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        sz = round(span["size"], 1)
                        chars = len(span["text"])
                        if chars == 0:
                            continue
                        size_char_count[sz] += chars
                        # Keep the most-seen font name for each size
                        if sz not in size_font_name or size_char_count[sz] > 0:
                            size_font_name[sz] = span.get("font", "")

        if not size_char_count:
            return [], {}, {}

        # Body size = font size with highest total character count
        body_size = max(size_char_count, key=size_char_count.get)
        body_font_name = size_font_name.get(body_size, "")
        body_font_info = {"size": body_size, "name": body_font_name}

        # Heading candidates: sizes > body_size * 1.15
        threshold = body_size * 1.15
        heading_sizes = sorted(
            [sz for sz in size_char_count if sz > threshold], reverse=True
        )

        if not heading_sizes:
            return [], body_font_info, {}

        # Cluster heading sizes into at most max_levels levels.
        # Sizes within 1pt of each other collapse into one level.
        levels: List[List[float]] = []
        for sz in heading_sizes:
            placed = False
            for cluster in levels:
                if abs(sz - cluster[0]) <= 1.0:
                    cluster.append(sz)
                    placed = True
                    break
            if not placed:
                if len(levels) < max_levels:
                    levels.append([sz])
                # else: ignore smaller heading sizes beyond max_levels

        # Map each font size to its heading level (1 = largest)
        size_to_level: Dict[float, int] = {}
        heading_font_map: Dict[int, Dict[str, Any]] = {}
        for idx, cluster in enumerate(levels):
            level = idx + 1
            representative = max(cluster)
            heading_font_map[level] = {
                "size": representative,
                "name": size_font_name.get(representative, ""),
            }
            for sz in cluster:
                size_to_level[sz] = level

        # Pass 2: collect heading spans
        boundaries: List[Dict[str, Any]] = []
        for page_idx in pages_to_process:
            page = doc[page_idx]
            text_dict = page.get_text("dict")
            for block in text_dict.get("blocks", []):
                for line in block.get("lines", []):
                    line_text_parts: List[str] = []
                    line_size: Optional[float] = None
                    line_is_bold = False
                    line_y = line.get("bbox", [0, 0, 0, 0])[1]

                    spans = line.get("spans", [])

                    # First pass: identify which spans are heading-sized
                    span_roles = []
                    for span in spans:
                        sz = round(span["size"], 1)
                        is_heading = sz in size_to_level
                        span_roles.append((span, sz, is_heading))

                    # Second pass: collect heading spans AND sandwiched
                    # non-heading spans (superscripts like ² in I²C)
                    for idx, (span, sz, is_heading) in enumerate(span_roles):
                        if is_heading:
                            line_text_parts.append(span["text"])
                            line_size = sz
                            if span.get("flags", 0) & 16:
                                line_is_bold = True
                        elif line_text_parts and idx + 1 < len(span_roles):
                            # Non-heading span between heading spans —
                            # likely a superscript/subscript (e.g. ² in I²C)
                            if span_roles[idx + 1][2]:  # next span is heading
                                line_text_parts.append(span["text"])

                    if not line_text_parts or line_size is None:
                        continue

                    heading_text = "".join(line_text_parts).strip()
                    if not heading_text:
                        continue

                    # Confidence scoring
                    confidence = 0.70
                    # Boost for bold
                    if line_is_bold:
                        confidence += 0.07
                    # Boost for short text (likely a heading, not a paragraph)
                    if len(heading_text) < 100:
                        confidence += 0.06
                    # Boost if text matches a common numbering pattern
                    for pat, _ in _NUMBERING_PATTERNS:
                        if re.match(pat, heading_text, re.IGNORECASE):
                            confidence += 0.07
                            break
                    confidence = min(confidence, 0.90)

                    level = size_to_level[line_size]
                    # page is 1-based for the boundary dict
                    boundaries.append(
                        {
                            "title": heading_text,
                            "level": level,
                            "page": page_idx + 1,
                            "confidence": confidence,
                            "detection_method": "fonts",
                            "_sort_y": line_y,
                        }
                    )

        # De-duplicate near-identical entries on the same page (same text, same page)
        seen: set = set()
        deduped: List[Dict[str, Any]] = []
        for b in boundaries:
            key = (b["page"], b["title"][:60])
            if key not in seen:
                seen.add(key)
                deduped.append(b)

        return deduped, body_font_info, heading_font_map

    def _detect_by_numbering(
        self, doc: fitz.Document, pages_to_process: List[int]
    ) -> List[Dict[str, Any]]:
        """
        Detect headings using the built-in numbering/chapter patterns.

        Only the first 200 characters of each page's text are searched, and
        at most one boundary is produced per page: the first pattern in
        ``_NUMBERING_PATTERNS`` that matches anywhere in that window wins.

        Args:
            doc: Open document, indexed by 0-based page number.
            pages_to_process: 0-based page indices to inspect.

        Returns:
            Boundary dicts with a 1-based ``page``.
        """
        search_flags = re.IGNORECASE | re.MULTILINE
        found: List[Dict[str, Any]] = []

        for page_index in pages_to_process:
            page_text = doc[page_index].get_text()
            opening_line = page_text.split("\n", 1)[0].strip() if page_text else ""
            window = page_text[:200]

            for regex, level in _NUMBERING_PATTERNS:
                hit = re.search(regex, window, search_flags)
                if not hit:
                    continue

                # The heading runs from the match start to the end of its line
                begin = hit.start()
                newline_at = window.find("\n", begin)
                if newline_at == -1:
                    heading = window[begin:].strip()
                else:
                    heading = window[begin:newline_at].strip()

                # Keep titles short; prefer cutting at a space past column 40
                if len(heading) > 80:
                    heading = heading[:80].rstrip()
                    cut = heading.rfind(" ", 40)
                    if cut > 0:
                        heading = heading[:cut]

                # A match that opens the page's first line is more trustworthy
                token = hit.group(0).strip()
                opens_page = token.lower() == opening_line.lower()[:len(token)]

                found.append(
                    {
                        "title": heading,
                        "level": level,
                        "page": page_index + 1,
                        "confidence": 0.80 if opens_page else 0.70,
                        "detection_method": "numbering",
                        "_sort_y": 0,
                    }
                )
                # One heading per page: stop after the first matching pattern
                break

        return found

    def _detect_by_pattern(
        self,
        doc: fitz.Document,
        pages_to_process: List[int],
        pattern: str,
    ) -> List[Dict[str, Any]]:
        """Apply a user-supplied regex to page text."""
        try:
            compiled = re.compile(pattern, re.IGNORECASE | re.MULTILINE)
        except re.error as exc:
            logger.warning("Invalid user heading_pattern regex: %s", exc)
            return []

        boundaries: List[Dict[str, Any]] = []

        for page_idx in pages_to_process:
            page = doc[page_idx]
            text = page.get_text()

            for match in compiled.finditer(text):
                title = match.group(0).strip()
                if not title:
                    continue
                if len(title) > 120:
                    title = title[:120].rstrip()

                boundaries.append(
                    {
                        "title": title,
                        "level": 1,  # User patterns default to level 1
                        "page": page_idx + 1,
                        "confidence": 0.85,
                        "detection_method": "user_regex",
                        "_sort_y": match.start(),
                    }
                )

        return boundaries

    # ------------------------------------------------------------------
    # Merge and tree-building
    # ------------------------------------------------------------------

    def _merge_detections(
        self, *detection_lists: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Merge multiple detection lists, de-duplicating boundaries that
        refer to the same heading (same page +/-1, similar title).
        When merging, take the max confidence and combine method names.
        """
        if not detection_lists:
            return []

        # Flatten
        all_items: List[Dict[str, Any]] = []
        for dl in detection_lists:
            all_items.extend(dl)

        if not all_items:
            return []

        # Sort by page then sort_y
        all_items.sort(key=lambda b: (b["page"], b.get("_sort_y", 0)))

        merged: List[Dict[str, Any]] = []

        for item in all_items:
            matched = False
            for existing in merged:
                # Same page (+/-1) and similar title
                if abs(existing["page"] - item["page"]) <= 1:
                    if self._titles_similar(existing["title"], item["title"]):
                        # Merge: boost confidence, combine methods
                        existing["confidence"] = min(
                            0.99,
                            max(existing["confidence"], item["confidence"]) + 0.05,
                        )
                        methods = set(existing["detection_method"].split("+"))
                        methods.add(item["detection_method"])
                        existing["detection_method"] = "+".join(sorted(methods))
                        # Keep the smaller (more prominent) level
                        existing["level"] = min(existing["level"], item["level"])
                        matched = True
                        break
            if not matched:
                merged.append(dict(item))

        return merged

    @staticmethod
    def _titles_similar(a: str, b: str) -> bool:
        """Check whether two heading titles are similar enough to merge."""
        a_norm = re.sub(r"\s+", " ", a.strip().lower())
        b_norm = re.sub(r"\s+", " ", b.strip().lower())
        if a_norm == b_norm:
            return True
        # One contains the other (common with partial extractions)
        if a_norm in b_norm or b_norm in a_norm:
            return True
        # Compare first 40 chars (handles trailing differences)
        if len(a_norm) > 10 and len(b_norm) > 10:
            return a_norm[:40] == b_norm[:40]
        return False

    def _boundaries_to_sections(
        self,
        boundaries: List[Dict[str, Any]],
        total_pages: int,
    ) -> List[Dict[str, Any]]:
        """
        Convert a flat sorted boundary list into a hierarchical section tree.
        Each section gets page_start, page_end, and nested subsections.
        """
        if not boundaries:
            return []

        # Assign page_end to each boundary: runs until the next boundary's page - 1
        enriched: List[Dict[str, Any]] = []
        for i, b in enumerate(boundaries):
            page_start = b["page"]
            if i + 1 < len(boundaries):
                page_end = boundaries[i + 1]["page"] - 1
                # Ensure page_end >= page_start
                page_end = max(page_end, page_start)
            else:
                page_end = total_pages
            enriched.append(
                {
                    "title": b["title"],
                    "level": b["level"],
                    "page_start": page_start,
                    "page_end": page_end,
                    "confidence": b["confidence"],
                    "detection_method": b["detection_method"],
                    "subsections": [],
                }
            )

        # Build tree using a stack-based approach
        root_sections: List[Dict[str, Any]] = []
        stack: List[Dict[str, Any]] = []  # stack of currently open sections

        for section in enriched:
            # Pop sections from the stack that are at the same level or deeper
            while stack and stack[-1]["level"] >= section["level"]:
                stack.pop()

            if stack:
                # This section is a child of the top of the stack
                stack[-1]["subsections"].append(section)
            else:
                # Top-level section
                root_sections.append(section)

            stack.append(section)

        # Adjust page_end for parent sections to encompass children
        self._fix_parent_page_ends(root_sections, total_pages)

        return root_sections

    def _fix_parent_page_ends(
        self, sections: List[Dict[str, Any]], total_pages: int
    ) -> None:
        """Recursively ensure parent page_end covers all children."""
        for section in sections:
            if section["subsections"]:
                self._fix_parent_page_ends(section["subsections"], total_pages)
                child_max = max(
                    child["page_end"] for child in section["subsections"]
                )
                section["page_end"] = max(section["page_end"], child_max)

    @staticmethod
    def _count_subsections(section: Dict[str, Any]) -> int:
        """Recursively count all subsections (direct + nested)."""
        subs = section.get("subsections", [])
        total = len(subs)
        for sub in subs:
            total += StructureDetectionMixin._count_subsections(sub)
        return total

    # ------------------------------------------------------------------
    # Filesystem-safe name helper (for downstream splitting tools)
    # ------------------------------------------------------------------

    @staticmethod
    def _sanitize_dirname(title: str) -> str:
        """
        Convert a heading title into a filesystem-safe directory name.

        Replaces special characters with underscores, strips leading/trailing
        underscores and whitespace, and truncates to 50 characters at a word
        boundary for clean directory listings.
        """
        # Replace anything that isn't alphanumeric, space, hyphen, or underscore
        safe = re.sub(r"[^\w\s-]", "_", title)
        # Collapse runs of whitespace / underscores
        safe = re.sub(r"[\s_]+", "_", safe)
        # Strip leading/trailing underscores and whitespace
        safe = safe.strip("_ ")
        # Truncate at word boundary for clean names
        if len(safe) > 50:
            truncated = safe[:50]
            last_sep = truncated.rfind("_", 20)
            if last_sep > 0:
                truncated = truncated[:last_sep]
            safe = truncated.rstrip("_")
        return safe or "untitled"

    # ------------------------------------------------------------------
    # Tool 2: split_pdf_by_structure
    # ------------------------------------------------------------------

    @mcp_tool(
        name="split_pdf_by_structure",
        description=(
            "Detect document structure then split the PDF into per-chapter/section "
            "directories. Each section gets its own PDF and optionally markdown + images. "
            "Combines detect_structure + split + pdf_to_markdown into one operation."
        ),
    )
    async def split_pdf_by_structure(
        self,
        pdf_path: str,
        output_directory: str,
        split_level: int = 1,
        include_markdown: bool = True,
        include_images: bool = True,
        include_vectors: bool = True,
        strategies: str = "auto",
        heading_pattern: Optional[str] = None,
        min_confidence: float = 0.5,
        output_format: str = "markdown",
    ) -> Dict[str, Any]:
        """
        Detect structure and split a PDF into per-section directories.

        Boundaries reported by ``detect_structure`` are kept when their level
        is ``<= split_level`` and their confidence is ``>= min_confidence``.
        Each kept boundary opens a section that runs up to the page before the
        next kept boundary (never less than one page); the last section runs
        to the final page of the document.  Pages that precede the first kept
        boundary are not written to any section.

        Each section is written to ``<output_directory>/<NN>_<title>/``, where
        ``NN`` is the zero-padded section index and ``<title>`` is the heading
        text passed through ``self._sanitize_dirname``.

        Args:
            pdf_path: Path to PDF file or HTTPS URL.
            output_directory: Root directory for section output folders.
            split_level: Heading level to split on (1=chapters, 2=sections, etc.).
            include_markdown: Convert each split PDF to markdown.
            include_images: Extract raster images during markdown conversion.
            include_vectors: Extract vector graphics during markdown conversion.
            strategies: Detection strategy for structure detection.
            heading_pattern: Optional user-supplied regex for headings.
            min_confidence: Drop boundaries below this confidence (0-1).
            output_format: "markdown", "pdf", or "both" (case-insensitive).
                With "markdown", the intermediate per-section PDF is deleted
                once its markdown conversion succeeds; if conversion fails or
                ``include_markdown`` is False the PDF is kept.

        Returns:
            On success, a dict with ``success`` (True), ``sections_created``,
            ``output_directory``, ``sections`` (one dict per section holding
            ``title``, ``page_start``, ``page_end``, ``directory``,
            ``pdf_path``, ``markdown_path``, ``images_extracted`` and
            ``vectors_extracted``), ``detected_structure`` and ``split_time``.
            On failure, a dict with ``success`` (False), ``error`` and
            ``split_time``; ``detected_structure`` is also included when
            detection worked but no boundary passed the filters.
        """
        start_time = time.time()

        # Reject unknown formats before doing any I/O.  Previously an
        # unrecognised value silently behaved like "pdf".
        normalized_format = (
            output_format.strip().lower() if isinstance(output_format, str) else ""
        )
        if normalized_format not in ("markdown", "pdf", "both"):
            return {
                "success": False,
                "error": (
                    f"Invalid output_format {output_format!r}; expected "
                    f"'markdown', 'pdf', or 'both'."
                ),
                "split_time": round(time.time() - start_time, 2),
            }
        output_format = normalized_format

        try:
            # Validate inputs
            path = await validate_pdf_path(pdf_path)
            output_dir = Path(validate_output_path(output_directory))
            output_dir.mkdir(parents=True, exist_ok=True)

            # Step 1: Detect structure (inline=True for internal use)
            structure_result = await self.detect_structure(
                pdf_path=pdf_path,
                strategies=strategies,
                heading_pattern=heading_pattern,
                min_confidence=min_confidence,
                inline=True,
            )

            if not structure_result.get("success"):
                return {
                    "success": False,
                    "error": structure_result.get("error", "Structure detection failed"),
                    "split_time": round(time.time() - start_time, 2),
                }

            flat_boundaries = structure_result["structure"]["flat_boundaries"]

            # Step 2: Keep boundaries at or above the requested split_level
            split_boundaries = [
                b for b in flat_boundaries
                if b["level"] <= split_level and b["confidence"] >= min_confidence
            ]

            if not split_boundaries:
                return {
                    "success": False,
                    "error": (
                        f"No boundaries found at level <= {split_level} with "
                        f"confidence >= {min_confidence}. Try lowering min_confidence "
                        f"or increasing split_level."
                    ),
                    "detected_structure": structure_result["structure"],
                    "split_time": round(time.time() - start_time, 2),
                }

            sections_results = []

            # The source document must be released even if a section fails
            # part-way through, hence the try/finally around the whole loop.
            source_doc = fitz.open(str(path))
            try:
                total_pages = len(source_doc)

                # Step 3: Compute page ranges from adjacent boundaries
                for i, boundary in enumerate(split_boundaries):
                    page_start = boundary["page"]  # 1-based
                    if i + 1 < len(split_boundaries):
                        # Two boundaries on the same page would give an empty
                        # range; fall back to a single-page section instead.
                        page_end = max(split_boundaries[i + 1]["page"] - 1, page_start)
                    else:
                        page_end = total_pages

                    title = boundary["title"]
                    clean_title = self._sanitize_dirname(title)
                    # The index prefix keeps directories unique and ordered
                    # even when two headings sanitize to the same name.
                    section_dir = output_dir / f"{i:02d}_{clean_title}"
                    section_dir.mkdir(parents=True, exist_ok=True)

                    # Step 4a: Create split PDF
                    section_pdf_path = section_dir / f"{clean_title}.pdf"
                    new_doc = fitz.open()
                    try:
                        new_doc.insert_pdf(
                            source_doc,
                            from_page=page_start - 1,  # convert to 0-based
                            to_page=page_end - 1,       # convert to 0-based
                        )
                        new_doc.save(str(section_pdf_path))
                    finally:
                        new_doc.close()

                    # Step 4b: Optionally convert to markdown
                    md_path = None
                    images_extracted = 0
                    vectors_extracted = 0

                    if include_markdown and output_format in ("markdown", "both"):
                        # Best effort: a failed conversion is logged and the
                        # section is still reported with its split PDF.
                        try:
                            img_mixin = ImageProcessingMixin()
                            md_result = await img_mixin.pdf_to_markdown(
                                pdf_path=str(section_pdf_path),
                                output_directory=str(section_dir),
                                output_filename=f"{clean_title}.md",
                                include_images=include_images,
                                include_vectors=include_vectors,
                            )
                            if md_result.get("success"):
                                md_path = md_result.get("output_file")
                                summary = md_result.get("conversion_summary", {})
                                images_extracted = summary.get("images_extracted", 0)
                                vectors_extracted = summary.get("vectors_extracted", 0)
                        except Exception as md_exc:
                            logger.warning(
                                "Markdown conversion failed for section '%s': %s",
                                title, md_exc,
                            )

                    # If output_format is "markdown" only, remove the split PDF,
                    # but only once the markdown actually exists.
                    if output_format == "markdown" and md_path:
                        try:
                            section_pdf_path.unlink()
                            section_pdf_path = None
                        except OSError as unlink_exc:
                            # Keep reporting the PDF path since the file is
                            # still on disk.
                            logger.warning(
                                "Could not remove intermediate PDF for section '%s': %s",
                                title, unlink_exc,
                            )

                    sections_results.append({
                        "title": title,
                        "page_start": page_start,
                        "page_end": page_end,
                        "directory": str(section_dir),
                        "pdf_path": str(section_pdf_path) if section_pdf_path else None,
                        "markdown_path": str(md_path) if md_path else None,
                        "images_extracted": images_extracted,
                        "vectors_extracted": vectors_extracted,
                    })
            finally:
                source_doc.close()

            return {
                "success": True,
                "sections_created": len(sections_results),
                "output_directory": str(output_dir),
                "sections": sections_results,
                "detected_structure": structure_result["structure"],
                "split_time": round(time.time() - start_time, 2),
            }

        except Exception as e:
            error_msg = sanitize_error_message(str(e))
            logger.error("split_pdf_by_structure failed: %s", error_msg)
            return {
                "success": False,
                "error": error_msg,
                "split_time": round(time.time() - start_time, 2),
            }

    # ------------------------------------------------------------------
    # Tool 3: batch_extract
    # ------------------------------------------------------------------

    @mcp_tool(
        name="batch_extract",
        description=(
            "Extract multiple page ranges from a single PDF, each producing its own "
            "markdown + images + vectors in a separate output directory. Replaces "
            "24+ individual tool calls with a single operation."
        ),
    )
    async def batch_extract(
        self,
        pdf_path: str,
        sections: str,
        include_images: bool = True,
        include_vectors: bool = True,
    ) -> Dict[str, Any]:
        """
        Extract multiple page ranges from a single PDF into separate directories.

        The source PDF is validated and opened once.  For every entry in
        ``sections`` the requested page range is copied into its own PDF in
        that entry's output directory and then converted to markdown.  A
        failure in one entry is recorded on that entry and does not stop the
        remaining entries from being processed.

        Args:
            pdf_path: Path to PDF file or HTTPS URL.
            sections: JSON string — a non-empty list of objects, each with:
                - "pages": page range string, e.g. "11-80" (parsed by
                  ``self._parse_page_range``)
                - "output_dir": output directory path for this section
                - "name": human-readable name for the section (defaults to
                  ``section_NN`` using the entry's index)
            include_images: Extract raster images during markdown conversion.
            include_vectors: Extract vector graphics during markdown conversion.

        Returns:
            Dict with ``success``, ``sections_processed``, ``sections`` and
            ``batch_time``.  The top-level ``success`` only says the batch
            ran; inspect each entry of ``sections`` for its own outcome:

            - entries whose page range was split out carry ``name``,
              ``pages``, ``success``, ``output_directory``, ``pdf_path`` and
              ``markdown_result``; ``success`` mirrors the markdown
              conversion's own ``success`` flag;
            - entries that could not be processed carry ``name``, ``pages``,
              ``success`` (False) and ``error``.

            If the sections JSON is invalid or the source PDF cannot be
            opened, the dict holds ``success`` (False), ``error`` and
            ``batch_time``.
        """
        start_time = time.time()

        try:
            # Parse sections JSON
            try:
                section_list = json.loads(sections)
            except (json.JSONDecodeError, TypeError) as parse_err:
                return {
                    "success": False,
                    "error": f"Invalid sections JSON: {parse_err}",
                    "batch_time": round(time.time() - start_time, 2),
                }

            if not isinstance(section_list, list) or not section_list:
                return {
                    "success": False,
                    "error": "sections must be a non-empty JSON array",
                    "batch_time": round(time.time() - start_time, 2),
                }

            # Validate and open the source PDF once; every section reads from it.
            path = await validate_pdf_path(pdf_path)
            source_doc = fitz.open(str(path))
            results = []

            # Release the source document even if something unexpected
            # escapes the per-section handling below.
            try:
                total_pages = len(source_doc)

                for idx, section in enumerate(section_list):
                    default_name = f"section_{idx:02d}"

                    # A non-object entry (string, number, null, ...) has no
                    # .get(); report it per-section instead of letting the
                    # AttributeError abort the whole batch.
                    if not isinstance(section, dict):
                        results.append({
                            "name": default_name,
                            "pages": "",
                            "success": False,
                            "error": "Section entry must be a JSON object",
                        })
                        continue

                    section_name = section.get("name", default_name)
                    pages_str = section.get("pages", "")
                    section_output_dir = section.get("output_dir", "")

                    if not pages_str or not section_output_dir:
                        results.append({
                            "name": section_name,
                            "pages": pages_str,
                            "success": False,
                            "error": "Missing 'pages' or 'output_dir' field",
                        })
                        continue

                    try:
                        # Validate and create this section's output directory
                        out_dir = Path(validate_output_path(section_output_dir))
                        out_dir.mkdir(parents=True, exist_ok=True)

                        # Parse the page range (e.g. "11-80") into 1-based bounds
                        page_start, page_end = self._parse_page_range(pages_str, total_pages)

                        # Create split PDF
                        clean_name = self._sanitize_dirname(section_name)
                        section_pdf_path = out_dir / f"{clean_name}.pdf"
                        new_doc = fitz.open()
                        try:
                            new_doc.insert_pdf(
                                source_doc,
                                from_page=page_start - 1,  # convert to 0-based
                                to_page=page_end - 1,       # convert to 0-based
                            )
                            new_doc.save(str(section_pdf_path))
                        finally:
                            new_doc.close()

                        # Convert to markdown.  Best effort: a conversion
                        # failure is reported inside markdown_result while
                        # the split PDF is still returned.
                        md_result = None
                        try:
                            img_mixin = ImageProcessingMixin()
                            md_result = await img_mixin.pdf_to_markdown(
                                pdf_path=str(section_pdf_path),
                                output_directory=str(out_dir),
                                output_filename=f"{clean_name}.md",
                                include_images=include_images,
                                include_vectors=include_vectors,
                            )
                        except Exception as md_exc:
                            # Sanitize like every other error surfaced to the
                            # caller so raw exception text is not returned.
                            md_error = sanitize_error_message(str(md_exc))
                            logger.warning(
                                "Markdown conversion failed for '%s': %s",
                                section_name, md_error,
                            )
                            md_result = {"success": False, "error": md_error}

                        results.append({
                            "name": section_name,
                            "pages": pages_str,
                            # Mirrors the conversion outcome so every entry
                            # exposes a "success" flag, not just failed ones.
                            "success": (
                                isinstance(md_result, dict)
                                and bool(md_result.get("success"))
                            ),
                            "output_directory": str(out_dir),
                            "pdf_path": str(section_pdf_path),
                            "markdown_result": md_result,
                        })

                    except Exception as sec_exc:
                        error_msg = sanitize_error_message(str(sec_exc))
                        logger.warning(
                            "batch_extract section '%s' failed: %s",
                            section_name, error_msg,
                        )
                        results.append({
                            "name": section_name,
                            "pages": pages_str,
                            "success": False,
                            "error": error_msg,
                        })
            finally:
                source_doc.close()

            return {
                "success": True,
                "sections_processed": len(results),
                "sections": results,
                "batch_time": round(time.time() - start_time, 2),
            }

        except Exception as e:
            error_msg = sanitize_error_message(str(e))
            logger.error("batch_extract failed: %s", error_msg)
            return {
                "success": False,
                "error": error_msg,
                "batch_time": round(time.time() - start_time, 2),
            }

    # ------------------------------------------------------------------
    # Page range parsing helper
    # ------------------------------------------------------------------

    @staticmethod
    def _parse_page_range(pages_str: str, total_pages: int) -> Tuple[int, int]:
        """
        Parse a page range string like "11-80" into (start, end) 1-based ints.

        Supports formats:
            "11-80"   -> (11, 80)
            "5"       -> (5, 5)
            "11-end"  -> (11, total_pages)

        Raises ValueError on invalid input.
        """
        pages_str = pages_str.strip()

        if "-" in pages_str:
            parts = pages_str.split("-", 1)
            start_str = parts[0].strip()
            end_str = parts[1].strip()

            page_start = int(start_str)
            if end_str.lower() == "end":
                page_end = total_pages
            else:
                page_end = int(end_str)
        else:
            page_start = int(pages_str)
            page_end = page_start

        # Clamp to valid range
        page_start = max(1, min(page_start, total_pages))
        page_end = max(page_start, min(page_end, total_pages))

        return page_start, page_end