From b2d9073f04979f1fa45a6ed5eded46da4d7ab1ec Mon Sep 17 00:00:00 2001 From: Ryan Malloy Date: Tue, 5 May 2026 16:21:09 -0600 Subject: [PATCH] =?UTF-8?q?Add=20markdown=5Fto=5Fpdf=20tool=20=E2=80=94=20?= =?UTF-8?q?convert=20.md=20to=20PDF=20via=20pandoc?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New tool in ImageProcessingMixin (sibling of pdf_to_markdown). Accepts either a markdown file path or inline markdown text, writes a PDF to a caller-specified output path. Engine selection auto-detects what's available on PATH, preferring quality: xelatex > pdflatex > tectonic > weasyprint > wkhtmltopdf. Caller can force a specific engine or pass raw pandoc args for advanced cases. pypandoc is gated behind a new [markdown] optional extra so the base install stays lean. The tool surfaces clear errors if pypandoc, pandoc, or all PDF engines are missing. Bumps to v2.2.0 (new feature, minor bump). --- CLAUDE.md | 2 +- pyproject.toml | 9 +- .../mixins_official/image_processing.py | 204 +++++++++++++++++- uv.lock | 19 +- 4 files changed, 229 insertions(+), 5 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index db00a51..d7c4a3c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -91,7 +91,7 @@ uv publish 2. **Table Extraction**: `extract_tables` - Auto-fallback through Camelot → pdfplumber → Tabula 3. **OCR Processing**: `ocr_pdf` - Tesseract with preprocessing options 4. **Document Analysis**: `is_scanned_pdf`, `get_document_structure`, `extract_metadata` -5. **Format Conversion**: `pdf_to_markdown` - Writes markdown + extracted raster images and vector graphics (SVG) to disk by default, returns path + preview. Images use relative `./images/` paths, vectors use `./vectors/` paths. Set `inline=True` for full markdown in response. Set `include_vectors=False` to skip vector extraction. Use `output_filename` to override the default .md filename. 
When `include_vectors=True`, returns `vector_diagnostics` showing which pages had drawings below the complexity threshold. Set `vector_fallback_raster=True` to render those sub-threshold pages as full-page raster images (PNG at 150 DPI) instead of skipping them. +5. **Format Conversion**: `pdf_to_markdown` - Writes markdown + extracted raster images and vector graphics (SVG) to disk by default, returns path + preview. Images use relative `./images/` paths, vectors use `./vectors/` paths. Set `inline=True` for full markdown in response. Set `include_vectors=False` to skip vector extraction. Use `output_filename` to override the default .md filename. When `include_vectors=True`, returns `vector_diagnostics` showing which pages had drawings below the complexity threshold. Set `vector_fallback_raster=True` to render those sub-threshold pages as full-page raster images (PNG at 150 DPI) instead of skipping them. `markdown_to_pdf` - Reverse direction: converts a `.md` file or inline markdown text to PDF using pandoc. Auto-detects available PDF engines (xelatex, pdflatex, tectonic, weasyprint, wkhtmltopdf) and picks the best one on PATH. Pass `pdf_engine` to override or `extra_args` for raw pandoc options. Requires `pip install mcp-pdf[markdown]` and the pandoc binary. 6. **Image Processing**: `extract_images` - Extract images with custom output paths and clean summary output 7. **Link Extraction**: `extract_links` - Extract all hyperlinks with page filtering and type categorization 8. 
@mcp_tool(
    name="markdown_to_pdf",
    description=(
        "Convert a Markdown file (or inline text) to PDF using pandoc. "
        "Auto-detects available PDF engines (xelatex, pdflatex, tectonic, "
        "weasyprint, wkhtmltopdf) and uses the first one found on PATH, in that order. "
        "Pass pdf_engine to override, or extra_args for custom pandoc options "
        "(e.g. ['-V', 'geometry:margin=1in']). Requires pandoc binary on host "
        "and at least one PDF engine. Install with: pip install mcp-pdf[markdown]"
    )
)
async def markdown_to_pdf(
    self,
    output_path: str,
    markdown_path: Optional[str] = None,
    markdown_text: Optional[str] = None,
    pdf_engine: Optional[str] = None,
    toc: bool = False,
    title: Optional[str] = None,
    author: Optional[str] = None,
    date: Optional[str] = None,
    base_path: Optional[str] = None,
    extra_args: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """
    Convert Markdown to PDF using pandoc as the parser and a PDF engine as the backend.

    Provide either `markdown_path` (a .md file) or `markdown_text` (an inline
    string), but not both. The PDF is written to `output_path`, which must end
    in ".pdf".

    Engine selection:
        If `pdf_engine` is None, the first engine found on PATH is used, in the
        order xelatex > pdflatex > tectonic > weasyprint > wkhtmltopdf. There is
        no retry with the next engine if the chosen one fails. If a specific
        engine is requested but not on PATH, an error is returned listing the
        engines that were detected.

        HTML-based engines (weasyprint, wkhtmltopdf, pagedjs-cli, prince) are
        driven through pandoc's HTML writer; every other engine goes through
        the LaTeX writer (pypandoc's handling of to="pdf").

    Args:
        output_path: Where to write the resulting PDF (required, must end in ".pdf").
        markdown_path: Path to a .md file. Mutually exclusive with markdown_text.
        markdown_text: Inline markdown content. Mutually exclusive with markdown_path.
        pdf_engine: Force a specific engine ("xelatex", "pdflatex", "tectonic",
            "weasyprint", "wkhtmltopdf"). Default: auto-detect.
        toc: Generate a table of contents from headings.
        title: Document title, passed to pandoc as `-M title=...`.
        author: Document author, passed to pandoc as `-M author=...`.
        date: Document date string, passed to pandoc as `-M date=...`.
        base_path: Resource resolution base for relative image references.
            Defaults to the markdown file's directory when markdown_path is used.
        extra_args: Additional raw pandoc CLI arguments (advanced).

    Returns:
        On success: dict with output_path, file_size, file_size_kb, engine_used,
        detected_engines, toc and conversion_time.
        On failure: dict with "error" and conversion_time (plus detected_engines
        when a requested engine is missing). Errors are returned, not raised.
    """
    import asyncio
    import shutil

    start_time = time.time()

    ENGINE_PREFERENCE = ["xelatex", "pdflatex", "tectonic", "weasyprint", "wkhtmltopdf"]
    # Engines that pandoc only accepts together with its HTML writer.
    HTML_ENGINES = frozenset({"weasyprint", "wkhtmltopdf", "pagedjs-cli", "prince"})

    def _failure(message: str, **extra: Any) -> Dict[str, Any]:
        # Build the error payload; conversion_time is always the last key.
        payload: Dict[str, Any] = {"error": message}
        payload.update(extra)
        payload["conversion_time"] = round(time.time() - start_time, 2)
        return payload

    try:
        # Optional dep check — pypandoc is gated behind the [markdown] extra
        try:
            import pypandoc
        except ImportError:
            return _failure(
                "pypandoc is not installed. Install with: "
                "pip install mcp-pdf[markdown] (also requires the pandoc "
                "binary and a PDF engine on PATH)"
            )

        # Verify pandoc binary is reachable — pypandoc raises OSError if missing
        try:
            pypandoc.get_pandoc_version()
        except OSError:
            return _failure(
                "pandoc binary not found on PATH. Install pandoc: "
                "https://pandoc.org/installing.html"
            )

        # Validate input — exactly one of markdown_path or markdown_text
        if bool(markdown_path) == bool(markdown_text):
            return _failure("Provide exactly one of markdown_path or markdown_text")

        # Check the source file before touching the filesystem for the output,
        # so a bad input path does not leave an empty output directory behind.
        source_path = None
        if markdown_path:
            source_path = Path(markdown_path).resolve()
            if not source_path.is_file():
                return _failure(f"Markdown file not found: {markdown_path}")

        # Validate output path
        output = validate_output_path(output_path)
        # pypandoc rejects PDF targets that do not end in ".pdf", and pandoc
        # itself decides to emit PDF from the extension — fail early and clearly.
        if not str(output).endswith(".pdf"):
            return _failure("output_path must end with '.pdf'")
        output.parent.mkdir(parents=True, exist_ok=True)

        # Detect available PDF engines on PATH
        available_engines = [e for e in ENGINE_PREFERENCE if shutil.which(e)]

        # Pick engine: explicit override or first available
        if pdf_engine:
            if not shutil.which(pdf_engine):
                return _failure(
                    f"Requested PDF engine '{pdf_engine}' not found on PATH. "
                    f"Available engines: {available_engines or 'none'}",
                    detected_engines=available_engines,
                )
            engine = pdf_engine
        else:
            if not available_engines:
                return _failure(
                    "No PDF engine found on PATH. Install one of: "
                    + ", ".join(ENGINE_PREFERENCE)
                )
            engine = available_engines[0]

        # pypandoc maps to="pdf" onto pandoc's LaTeX writer, which pandoc refuses
        # to combine with HTML-based engines. Route those through the HTML writer;
        # the ".pdf" output extension still makes pandoc produce a PDF.
        engine_name = Path(engine).stem.lower()
        target_format = "html" if engine_name in HTML_ENGINES else "pdf"

        # Build pandoc arguments
        args: List[str] = [f"--pdf-engine={engine}"]
        if toc:
            args.append("--toc")
        if title:
            args.extend(["-M", f"title={title}"])
        if author:
            args.extend(["-M", f"author={author}"])
        if date:
            args.extend(["-M", f"date={date}"])

        # Resource path for relative image refs — defaults to source dir
        if base_path:
            resource_dir = Path(base_path).resolve()
        elif source_path is not None:
            resource_dir = source_path.parent
        else:
            resource_dir = None

        if resource_dir:
            args.extend(["--resource-path", str(resource_dir)])

        # NOTE(review): extra_args is forwarded to pandoc verbatim. Presumably it
        # can override the options built above (e.g. --output, --pdf-engine) or
        # enable filters that run code (--filter / --lua-filter) — confirm whether
        # these should be rejected when the caller is untrusted.
        if extra_args:
            args.extend(extra_args)

        # Convert — file path or inline text. pandoc runs as a blocking
        # subprocess, so push it to a worker thread to keep the event loop free.
        if source_path is not None:
            await asyncio.to_thread(
                pypandoc.convert_file,
                str(source_path),
                to=target_format,
                outputfile=str(output),
                extra_args=args,
            )
        else:
            await asyncio.to_thread(
                pypandoc.convert_text,
                markdown_text,
                to=target_format,
                format="md",
                outputfile=str(output),
                extra_args=args,
            )

        file_size = output.stat().st_size

        return {
            "output_path": str(output),
            "file_size": file_size,
            "file_size_kb": round(file_size / 1024, 2),
            "engine_used": engine,
            "detected_engines": available_engines,
            "toc": toc,
            "conversion_time": round(time.time() - start_time, 2),
        }

    except RuntimeError as e:
        # pypandoc raises RuntimeError for pandoc subprocess failures —
        # the message often contains the engine's stderr, which is the most
        # useful signal a user can get for typesetting errors
        error_msg = sanitize_error_message(str(e))
        logger.error(f"Markdown to PDF conversion failed: {error_msg}")
        return _failure(f"Pandoc conversion failed: {error_msg}")
    except Exception as e:
        error_msg = sanitize_error_message(str(e))
        logger.error(f"Markdown to PDF failed: {error_msg}")
        return _failure(error_msg)
"sha256:b4cd5124d05737944636cf45fc37ce5824f10e707b0342efe109c7b6bd37a9cc", size = 18735124, upload-time = "2025-07-02T21:31:10.992Z" }, ] +[[package]] +name = "pypandoc" +version = "1.17" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ea/d6/410615fc433e5d1eacc00db2044ae2a9c82302df0d35366fe2bd15de024d/pypandoc-1.17.tar.gz", hash = "sha256:51179abfd6e582a25ed03477541b48836b5bba5a4c3b282a547630793934d799", size = 69071, upload-time = "2026-03-14T22:39:07.21Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0c/86/e2ffa604eacfbec3f430b1d850e7e04c4101eca1a5828f9ae54bf51dfba4/pypandoc-1.17-py3-none-any.whl", hash = "sha256:01fdbffa61edb9f8e82e8faad6954efcb7b6f8f0634aead4d89e322a00225a67", size = 23554, upload-time = "2026-03-14T22:38:46.007Z" }, +] + [[package]] name = "pyparsing" version = "3.2.3"