🐛 Fix superscript handling and directory name truncation in detect_structure
- Two-pass span collection now includes non-heading spans sandwiched between heading-sized spans (e.g. the ² in I²C),
  so superscripts inside a heading aren't dropped
- Join heading line parts without spaces ("".join) for proper glyph concatenation
- Cap numbering-pattern title at the first newline, then at 80 chars with a word-boundary break
- Reduce _sanitize_dirname max from 80→50 chars with word-boundary truncation
This commit is contained in:
parent
823318ec15
commit
56ab8356bc
@ -1,6 +1,6 @@
|
|||||||
[project]
|
[project]
|
||||||
name = "mcp-pdf"
|
name = "mcp-pdf"
|
||||||
version = "2.1.0"
|
version = "2.1.1"
|
||||||
description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more"
|
description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more"
|
||||||
authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}]
|
authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}]
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
|
|||||||
@ -352,18 +352,33 @@ class StructureDetectionMixin(MCPMixin):
|
|||||||
line_is_bold = False
|
line_is_bold = False
|
||||||
line_y = line.get("bbox", [0, 0, 0, 0])[1]
|
line_y = line.get("bbox", [0, 0, 0, 0])[1]
|
||||||
|
|
||||||
for span in line.get("spans", []):
|
spans = line.get("spans", [])
|
||||||
|
|
||||||
|
# First pass: identify which spans are heading-sized
|
||||||
|
span_roles = []
|
||||||
|
for span in spans:
|
||||||
sz = round(span["size"], 1)
|
sz = round(span["size"], 1)
|
||||||
if sz in size_to_level:
|
is_heading = sz in size_to_level
|
||||||
|
span_roles.append((span, sz, is_heading))
|
||||||
|
|
||||||
|
# Second pass: collect heading spans AND sandwiched
|
||||||
|
# non-heading spans (superscripts like ² in I²C)
|
||||||
|
for idx, (span, sz, is_heading) in enumerate(span_roles):
|
||||||
|
if is_heading:
|
||||||
line_text_parts.append(span["text"])
|
line_text_parts.append(span["text"])
|
||||||
line_size = sz
|
line_size = sz
|
||||||
if span.get("flags", 0) & 16:
|
if span.get("flags", 0) & 16:
|
||||||
line_is_bold = True
|
line_is_bold = True
|
||||||
|
elif line_text_parts and idx + 1 < len(span_roles):
|
||||||
|
# Non-heading span between heading spans —
|
||||||
|
# likely a superscript/subscript (e.g. ² in I²C)
|
||||||
|
if span_roles[idx + 1][2]: # next span is heading
|
||||||
|
line_text_parts.append(span["text"])
|
||||||
|
|
||||||
if not line_text_parts or line_size is None:
|
if not line_text_parts or line_size is None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
heading_text = " ".join(line_text_parts).strip()
|
heading_text = "".join(line_text_parts).strip()
|
||||||
if not heading_text:
|
if not heading_text:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -423,13 +438,18 @@ class StructureDetectionMixin(MCPMixin):
|
|||||||
match = re.search(pat, search_text, re.IGNORECASE | re.MULTILINE)
|
match = re.search(pat, search_text, re.IGNORECASE | re.MULTILINE)
|
||||||
if match:
|
if match:
|
||||||
matched_text = match.group(0).strip()
|
matched_text = match.group(0).strip()
|
||||||
# Try to grab the rest of the line as the heading title
|
# Grab the heading title up to the first newline
|
||||||
line_end = search_text.find("\n", match.end())
|
line_end = search_text.find("\n", match.start())
|
||||||
if line_end == -1:
|
if line_end == -1:
|
||||||
line_end = min(match.end() + 120, len(search_text))
|
line_end = len(search_text)
|
||||||
title = search_text[match.start():line_end].strip()
|
title = search_text[match.start():line_end].strip()
|
||||||
if len(title) > 120:
|
# Cap title length to avoid grabbing full sentences
|
||||||
title = title[:120].rstrip()
|
if len(title) > 80:
|
||||||
|
title = title[:80].rstrip()
|
||||||
|
# Try to break at a word boundary
|
||||||
|
last_space = title.rfind(" ", 40)
|
||||||
|
if last_space > 0:
|
||||||
|
title = title[:last_space]
|
||||||
|
|
||||||
# Confidence varies: exact first-line match is higher
|
# Confidence varies: exact first-line match is higher
|
||||||
confidence = 0.70
|
confidence = 0.70
|
||||||
@ -635,7 +655,8 @@ class StructureDetectionMixin(MCPMixin):
|
|||||||
Convert a heading title into a filesystem-safe directory name.
|
Convert a heading title into a filesystem-safe directory name.
|
||||||
|
|
||||||
Replaces special characters with underscores, strips leading/trailing
|
Replaces special characters with underscores, strips leading/trailing
|
||||||
underscores and whitespace, and truncates to 80 characters.
|
underscores and whitespace, and truncates to 50 characters at a word
|
||||||
|
boundary for clean directory listings.
|
||||||
"""
|
"""
|
||||||
# Replace anything that isn't alphanumeric, space, hyphen, or underscore
|
# Replace anything that isn't alphanumeric, space, hyphen, or underscore
|
||||||
safe = re.sub(r"[^\w\s-]", "_", title)
|
safe = re.sub(r"[^\w\s-]", "_", title)
|
||||||
@ -643,9 +664,13 @@ class StructureDetectionMixin(MCPMixin):
|
|||||||
safe = re.sub(r"[\s_]+", "_", safe)
|
safe = re.sub(r"[\s_]+", "_", safe)
|
||||||
# Strip leading/trailing underscores and whitespace
|
# Strip leading/trailing underscores and whitespace
|
||||||
safe = safe.strip("_ ")
|
safe = safe.strip("_ ")
|
||||||
# Truncate
|
# Truncate at word boundary for clean names
|
||||||
if len(safe) > 80:
|
if len(safe) > 50:
|
||||||
safe = safe[:80].rstrip("_")
|
truncated = safe[:50]
|
||||||
|
last_sep = truncated.rfind("_", 20)
|
||||||
|
if last_sep > 0:
|
||||||
|
truncated = truncated[:last_sep]
|
||||||
|
safe = truncated.rstrip("_")
|
||||||
return safe or "untitled"
|
return safe or "untitled"
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
|
|||||||
2
uv.lock
generated
2
uv.lock
generated
@ -1032,7 +1032,7 @@ wheels = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "mcp-pdf"
|
name = "mcp-pdf"
|
||||||
version = "2.0.14"
|
version = "2.1.0"
|
||||||
source = { editable = "." }
|
source = { editable = "." }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "camelot-py", extra = ["cv"] },
|
{ name = "camelot-py", extra = ["cv"] },
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user