📉 File-first output for ocr_pdf, slim split_pdf_by_structure response
ocr_pdf: now writes the OCR text to a file by default and returns the file path plus a short preview instead of dumping the full text into the response (~17k tokens → ~500 tokens). Pass inline=True to get the old behavior. split_pdf_by_structure: sections are now one-line summaries instead of full path objects, and the detected_structure dump has been removed from the response.
This commit is contained in:
parent
d413438fea
commit
057aa5be40
@ -1,6 +1,6 @@
|
||||
[project]
|
||||
name = "mcp-pdf"
|
||||
version = "2.1.5"
|
||||
version = "2.1.6"
|
||||
description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more"
|
||||
authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}]
|
||||
readme = "README.md"
|
||||
|
||||
@ -826,9 +826,9 @@ class StructureDetectionMixin(MCPMixin):
|
||||
"error": (
|
||||
f"No boundaries found at level <= {split_level} with "
|
||||
f"confidence >= {min_confidence}. Try lowering min_confidence "
|
||||
f"or increasing split_level."
|
||||
f"or increasing split_level. "
|
||||
f"({len(flat_boundaries)} total boundaries detected)"
|
||||
),
|
||||
"detected_structure": structure_result["structure"],
|
||||
"split_time": round(time.time() - start_time, 2),
|
||||
}
|
||||
|
||||
@ -897,16 +897,10 @@ class StructureDetectionMixin(MCPMixin):
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
sections_results.append({
|
||||
"title": title,
|
||||
"page_start": page_start,
|
||||
"page_end": page_end,
|
||||
"directory": str(section_dir),
|
||||
"pdf_path": str(section_pdf_path) if section_pdf_path else None,
|
||||
"markdown_path": str(md_path) if md_path else None,
|
||||
"images_extracted": images_extracted,
|
||||
"vectors_extracted": vectors_extracted,
|
||||
})
|
||||
sections_results.append(
|
||||
f"p{page_start}-{page_end}: {title[:60]} "
|
||||
f"({images_extracted} img, {vectors_extracted} vec)"
|
||||
)
|
||||
|
||||
source_doc.close()
|
||||
|
||||
@ -915,7 +909,6 @@ class StructureDetectionMixin(MCPMixin):
|
||||
"sections_created": len(sections_results),
|
||||
"output_directory": str(output_dir),
|
||||
"sections": sections_results,
|
||||
"detected_structure": structure_result["structure"],
|
||||
"split_time": round(time.time() - start_time, 2),
|
||||
}
|
||||
|
||||
|
||||
@ -195,7 +195,11 @@ class TextExtractionMixin(MCPMixin):
|
||||
|
||||
@mcp_tool(
|
||||
name="ocr_pdf",
|
||||
description="Perform OCR on scanned PDFs with preprocessing options"
|
||||
description=(
|
||||
"Perform OCR on scanned PDFs. By default writes extracted text "
|
||||
"to a .txt file and returns the path with a short preview. "
|
||||
"Set inline=True to return full OCR text in the response."
|
||||
)
|
||||
)
|
||||
async def ocr_pdf(
|
||||
self,
|
||||
@ -203,7 +207,9 @@ class TextExtractionMixin(MCPMixin):
|
||||
pages: Optional[str] = None,
|
||||
languages: List[str] = ["eng"],
|
||||
dpi: int = 300,
|
||||
preprocess: bool = True
|
||||
preprocess: bool = True,
|
||||
output_directory: Optional[str] = None,
|
||||
inline: bool = False,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Perform OCR on scanned PDF pages.
|
||||
@ -214,9 +220,14 @@ class TextExtractionMixin(MCPMixin):
|
||||
languages: List of language codes for OCR
|
||||
dpi: DPI for image rendering
|
||||
preprocess: Whether to preprocess images for better OCR
|
||||
output_directory: Directory for the OCR text file.
|
||||
Defaults to a temp directory.
|
||||
inline: If True, return full OCR text in the response.
|
||||
Default: False (write to file, return path + preview).
|
||||
|
||||
Returns:
|
||||
Dictionary containing OCR results
|
||||
Dictionary containing OCR file path and summary, or full text
|
||||
if inline=True
|
||||
"""
|
||||
start_time = time.time()
|
||||
|
||||
@ -294,25 +305,54 @@ class TextExtractionMixin(MCPMixin):
|
||||
# Calculate overall statistics
|
||||
successful_pages = [r for r in ocr_results if "error" not in r]
|
||||
avg_confidence = sum(r["confidence"] for r in successful_pages) / len(successful_pages) if successful_pages else 0
|
||||
full_text = "\n\n".join(total_text)
|
||||
word_count = len(full_text.split())
|
||||
elapsed = round(time.time() - start_time, 2)
|
||||
|
||||
# ── Inline mode: return everything in the response ──
|
||||
if inline:
|
||||
return {
|
||||
"success": True,
|
||||
"text": full_text,
|
||||
"pages_processed": len(pages_to_process),
|
||||
"pages_successful": len(successful_pages),
|
||||
"overall_confidence": round(avg_confidence, 2),
|
||||
"page_results": ocr_results,
|
||||
"ocr_time": elapsed,
|
||||
}
|
||||
|
||||
# ── File-first mode (default): write text, return summary ──
|
||||
if output_directory:
|
||||
out_dir = Path(validate_output_path(output_directory))
|
||||
else:
|
||||
out_dir = Path(tempfile.mkdtemp(prefix="pdf_ocr_"))
|
||||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
output_filename = f"{path.stem}_ocr.txt"
|
||||
output_path = out_dir / output_filename
|
||||
output_path.write_text(full_text, encoding="utf-8")
|
||||
|
||||
# Build preview (first ~500 chars at sentence boundary)
|
||||
preview = full_text[:500]
|
||||
if len(full_text) > 500:
|
||||
last_period = preview.rfind(".")
|
||||
if last_period > 300:
|
||||
preview = preview[:last_period + 1]
|
||||
preview += " [...]"
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"text": "\n\n".join(total_text),
|
||||
"pages_processed": len(pages_to_process),
|
||||
"pages_successful": len(successful_pages),
|
||||
"pages_failed": len(pages_to_process) - len(successful_pages),
|
||||
"overall_confidence": round(avg_confidence, 2),
|
||||
"page_results": ocr_results,
|
||||
"ocr_settings": {
|
||||
"languages": languages,
|
||||
"dpi": dpi,
|
||||
"preprocessing": preprocess
|
||||
"output_file": str(output_path),
|
||||
"text_preview": preview,
|
||||
"ocr_summary": {
|
||||
"word_count": word_count,
|
||||
"character_count": len(full_text),
|
||||
"pages_processed": len(pages_to_process),
|
||||
"pages_successful": len(successful_pages),
|
||||
"pages_failed": len(pages_to_process) - len(successful_pages),
|
||||
"overall_confidence": round(avg_confidence, 2),
|
||||
},
|
||||
"file_info": {
|
||||
"path": str(path),
|
||||
"total_pages": total_pages
|
||||
},
|
||||
"ocr_time": round(time.time() - start_time, 2)
|
||||
"ocr_time": elapsed,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user