🐛 Fix superscript handling and directory name truncation in detect_structure

- Two-pass span collection includes sandwiched non-heading spans (e.g. ² in I²C)
  so superscripts between heading-sized spans aren't dropped
- Join heading line parts without spaces ("".join) for proper glyph concatenation
- Cap numbering-pattern title at first newline + 80 chars with word boundary break
- Reduce _sanitize_dirname max from 80→50 chars with word-boundary truncation
This commit is contained in:
Ryan Malloy 2026-03-02 02:14:26 -07:00
parent 823318ec15
commit 56ab8356bc
3 changed files with 39 additions and 14 deletions

View File

@ -1,6 +1,6 @@
[project] [project]
name = "mcp-pdf" name = "mcp-pdf"
version = "2.1.0" version = "2.1.1"
description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more" description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more"
authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}] authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}]
readme = "README.md" readme = "README.md"

View File

@ -352,18 +352,33 @@ class StructureDetectionMixin(MCPMixin):
line_is_bold = False line_is_bold = False
line_y = line.get("bbox", [0, 0, 0, 0])[1] line_y = line.get("bbox", [0, 0, 0, 0])[1]
for span in line.get("spans", []): spans = line.get("spans", [])
# First pass: identify which spans are heading-sized
span_roles = []
for span in spans:
sz = round(span["size"], 1) sz = round(span["size"], 1)
if sz in size_to_level: is_heading = sz in size_to_level
span_roles.append((span, sz, is_heading))
# Second pass: collect heading spans AND sandwiched
# non-heading spans (superscripts like ² in I²C)
for idx, (span, sz, is_heading) in enumerate(span_roles):
if is_heading:
line_text_parts.append(span["text"]) line_text_parts.append(span["text"])
line_size = sz line_size = sz
if span.get("flags", 0) & 16: if span.get("flags", 0) & 16:
line_is_bold = True line_is_bold = True
elif line_text_parts and idx + 1 < len(span_roles):
# Non-heading span between heading spans —
# likely a superscript/subscript (e.g. ² in I²C)
if span_roles[idx + 1][2]: # next span is heading
line_text_parts.append(span["text"])
if not line_text_parts or line_size is None: if not line_text_parts or line_size is None:
continue continue
heading_text = " ".join(line_text_parts).strip() heading_text = "".join(line_text_parts).strip()
if not heading_text: if not heading_text:
continue continue
@ -423,13 +438,18 @@ class StructureDetectionMixin(MCPMixin):
match = re.search(pat, search_text, re.IGNORECASE | re.MULTILINE) match = re.search(pat, search_text, re.IGNORECASE | re.MULTILINE)
if match: if match:
matched_text = match.group(0).strip() matched_text = match.group(0).strip()
# Try to grab the rest of the line as the heading title # Grab the heading title up to the first newline
line_end = search_text.find("\n", match.end()) line_end = search_text.find("\n", match.start())
if line_end == -1: if line_end == -1:
line_end = min(match.end() + 120, len(search_text)) line_end = len(search_text)
title = search_text[match.start():line_end].strip() title = search_text[match.start():line_end].strip()
if len(title) > 120: # Cap title length to avoid grabbing full sentences
title = title[:120].rstrip() if len(title) > 80:
title = title[:80].rstrip()
# Try to break at a word boundary
last_space = title.rfind(" ", 40)
if last_space > 0:
title = title[:last_space]
# Confidence varies: exact first-line match is higher # Confidence varies: exact first-line match is higher
confidence = 0.70 confidence = 0.70
@ -635,7 +655,8 @@ class StructureDetectionMixin(MCPMixin):
Convert a heading title into a filesystem-safe directory name. Convert a heading title into a filesystem-safe directory name.
Replaces special characters with underscores, strips leading/trailing Replaces special characters with underscores, strips leading/trailing
underscores and whitespace, and truncates to 80 characters. underscores and whitespace, and truncates to 50 characters at a word
boundary for clean directory listings.
""" """
# Replace anything that isn't alphanumeric, space, hyphen, or underscore # Replace anything that isn't alphanumeric, space, hyphen, or underscore
safe = re.sub(r"[^\w\s-]", "_", title) safe = re.sub(r"[^\w\s-]", "_", title)
@ -643,9 +664,13 @@ class StructureDetectionMixin(MCPMixin):
safe = re.sub(r"[\s_]+", "_", safe) safe = re.sub(r"[\s_]+", "_", safe)
# Strip leading/trailing underscores and whitespace # Strip leading/trailing underscores and whitespace
safe = safe.strip("_ ") safe = safe.strip("_ ")
# Truncate # Truncate at word boundary for clean names
if len(safe) > 80: if len(safe) > 50:
safe = safe[:80].rstrip("_") truncated = safe[:50]
last_sep = truncated.rfind("_", 20)
if last_sep > 0:
truncated = truncated[:last_sep]
safe = truncated.rstrip("_")
return safe or "untitled" return safe or "untitled"
# ------------------------------------------------------------------ # ------------------------------------------------------------------

2
uv.lock generated
View File

@ -1032,7 +1032,7 @@ wheels = [
[[package]] [[package]]
name = "mcp-pdf" name = "mcp-pdf"
version = "2.0.14" version = "2.1.0"
source = { editable = "." } source = { editable = "." }
dependencies = [ dependencies = [
{ name = "camelot-py", extra = ["cv"] }, { name = "camelot-py", extra = ["cv"] },