📉 File-first output for ocr_pdf, slim split_pdf_by_structure response
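ocr_pdf: writes the OCR text to a .txt file by default and returns the file path plus a short preview instead of the full text dump (~17k tokens → ~500 tokens). Pass inline=True to get the full text back in the response as before; note that the inline response no longer includes pages_failed, ocr_settings, or file_info. split_pdf_by_structure: each entry in sections is now a one-line summary string (page range, truncated title, image/vector counts) instead of a dict of paths and counts. Removed the detected_structure dump from both the success response and the no-boundaries error response; the error message now reports the total number of boundaries detected instead.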
This commit is contained in:
parent
d413438fea
commit
057aa5be40
@ -1,6 +1,6 @@
|
|||||||
[project]
|
[project]
|
||||||
name = "mcp-pdf"
|
name = "mcp-pdf"
|
||||||
version = "2.1.5"
|
version = "2.1.6"
|
||||||
description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more"
|
description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more"
|
||||||
authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}]
|
authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}]
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
|
|||||||
@ -826,9 +826,9 @@ class StructureDetectionMixin(MCPMixin):
|
|||||||
"error": (
|
"error": (
|
||||||
f"No boundaries found at level <= {split_level} with "
|
f"No boundaries found at level <= {split_level} with "
|
||||||
f"confidence >= {min_confidence}. Try lowering min_confidence "
|
f"confidence >= {min_confidence}. Try lowering min_confidence "
|
||||||
f"or increasing split_level."
|
f"or increasing split_level. "
|
||||||
|
f"({len(flat_boundaries)} total boundaries detected)"
|
||||||
),
|
),
|
||||||
"detected_structure": structure_result["structure"],
|
|
||||||
"split_time": round(time.time() - start_time, 2),
|
"split_time": round(time.time() - start_time, 2),
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -897,16 +897,10 @@ class StructureDetectionMixin(MCPMixin):
|
|||||||
except OSError:
|
except OSError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
sections_results.append({
|
sections_results.append(
|
||||||
"title": title,
|
f"p{page_start}-{page_end}: {title[:60]} "
|
||||||
"page_start": page_start,
|
f"({images_extracted} img, {vectors_extracted} vec)"
|
||||||
"page_end": page_end,
|
)
|
||||||
"directory": str(section_dir),
|
|
||||||
"pdf_path": str(section_pdf_path) if section_pdf_path else None,
|
|
||||||
"markdown_path": str(md_path) if md_path else None,
|
|
||||||
"images_extracted": images_extracted,
|
|
||||||
"vectors_extracted": vectors_extracted,
|
|
||||||
})
|
|
||||||
|
|
||||||
source_doc.close()
|
source_doc.close()
|
||||||
|
|
||||||
@ -915,7 +909,6 @@ class StructureDetectionMixin(MCPMixin):
|
|||||||
"sections_created": len(sections_results),
|
"sections_created": len(sections_results),
|
||||||
"output_directory": str(output_dir),
|
"output_directory": str(output_dir),
|
||||||
"sections": sections_results,
|
"sections": sections_results,
|
||||||
"detected_structure": structure_result["structure"],
|
|
||||||
"split_time": round(time.time() - start_time, 2),
|
"split_time": round(time.time() - start_time, 2),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -195,7 +195,11 @@ class TextExtractionMixin(MCPMixin):
|
|||||||
|
|
||||||
@mcp_tool(
|
@mcp_tool(
|
||||||
name="ocr_pdf",
|
name="ocr_pdf",
|
||||||
description="Perform OCR on scanned PDFs with preprocessing options"
|
description=(
|
||||||
|
"Perform OCR on scanned PDFs. By default writes extracted text "
|
||||||
|
"to a .txt file and returns the path with a short preview. "
|
||||||
|
"Set inline=True to return full OCR text in the response."
|
||||||
|
)
|
||||||
)
|
)
|
||||||
async def ocr_pdf(
|
async def ocr_pdf(
|
||||||
self,
|
self,
|
||||||
@ -203,7 +207,9 @@ class TextExtractionMixin(MCPMixin):
|
|||||||
pages: Optional[str] = None,
|
pages: Optional[str] = None,
|
||||||
languages: List[str] = ["eng"],
|
languages: List[str] = ["eng"],
|
||||||
dpi: int = 300,
|
dpi: int = 300,
|
||||||
preprocess: bool = True
|
preprocess: bool = True,
|
||||||
|
output_directory: Optional[str] = None,
|
||||||
|
inline: bool = False,
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Perform OCR on scanned PDF pages.
|
Perform OCR on scanned PDF pages.
|
||||||
@ -214,9 +220,14 @@ class TextExtractionMixin(MCPMixin):
|
|||||||
languages: List of language codes for OCR
|
languages: List of language codes for OCR
|
||||||
dpi: DPI for image rendering
|
dpi: DPI for image rendering
|
||||||
preprocess: Whether to preprocess images for better OCR
|
preprocess: Whether to preprocess images for better OCR
|
||||||
|
output_directory: Directory for the OCR text file.
|
||||||
|
Defaults to a temp directory.
|
||||||
|
inline: If True, return full OCR text in the response.
|
||||||
|
Default: False (write to file, return path + preview).
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dictionary containing OCR results
|
Dictionary containing OCR file path and summary, or full text
|
||||||
|
if inline=True
|
||||||
"""
|
"""
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
@ -294,25 +305,54 @@ class TextExtractionMixin(MCPMixin):
|
|||||||
# Calculate overall statistics
|
# Calculate overall statistics
|
||||||
successful_pages = [r for r in ocr_results if "error" not in r]
|
successful_pages = [r for r in ocr_results if "error" not in r]
|
||||||
avg_confidence = sum(r["confidence"] for r in successful_pages) / len(successful_pages) if successful_pages else 0
|
avg_confidence = sum(r["confidence"] for r in successful_pages) / len(successful_pages) if successful_pages else 0
|
||||||
|
full_text = "\n\n".join(total_text)
|
||||||
|
word_count = len(full_text.split())
|
||||||
|
elapsed = round(time.time() - start_time, 2)
|
||||||
|
|
||||||
|
# ── Inline mode: return everything in the response ──
|
||||||
|
if inline:
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"text": full_text,
|
||||||
|
"pages_processed": len(pages_to_process),
|
||||||
|
"pages_successful": len(successful_pages),
|
||||||
|
"overall_confidence": round(avg_confidence, 2),
|
||||||
|
"page_results": ocr_results,
|
||||||
|
"ocr_time": elapsed,
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── File-first mode (default): write text, return summary ──
|
||||||
|
if output_directory:
|
||||||
|
out_dir = Path(validate_output_path(output_directory))
|
||||||
|
else:
|
||||||
|
out_dir = Path(tempfile.mkdtemp(prefix="pdf_ocr_"))
|
||||||
|
out_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
output_filename = f"{path.stem}_ocr.txt"
|
||||||
|
output_path = out_dir / output_filename
|
||||||
|
output_path.write_text(full_text, encoding="utf-8")
|
||||||
|
|
||||||
|
# Build preview (first ~500 chars at sentence boundary)
|
||||||
|
preview = full_text[:500]
|
||||||
|
if len(full_text) > 500:
|
||||||
|
last_period = preview.rfind(".")
|
||||||
|
if last_period > 300:
|
||||||
|
preview = preview[:last_period + 1]
|
||||||
|
preview += " [...]"
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"success": True,
|
"success": True,
|
||||||
"text": "\n\n".join(total_text),
|
"output_file": str(output_path),
|
||||||
"pages_processed": len(pages_to_process),
|
"text_preview": preview,
|
||||||
"pages_successful": len(successful_pages),
|
"ocr_summary": {
|
||||||
"pages_failed": len(pages_to_process) - len(successful_pages),
|
"word_count": word_count,
|
||||||
"overall_confidence": round(avg_confidence, 2),
|
"character_count": len(full_text),
|
||||||
"page_results": ocr_results,
|
"pages_processed": len(pages_to_process),
|
||||||
"ocr_settings": {
|
"pages_successful": len(successful_pages),
|
||||||
"languages": languages,
|
"pages_failed": len(pages_to_process) - len(successful_pages),
|
||||||
"dpi": dpi,
|
"overall_confidence": round(avg_confidence, 2),
|
||||||
"preprocessing": preprocess
|
|
||||||
},
|
},
|
||||||
"file_info": {
|
"ocr_time": elapsed,
|
||||||
"path": str(path),
|
|
||||||
"total_pages": total_pages
|
|
||||||
},
|
|
||||||
"ocr_time": round(time.time() - start_time, 2)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user