From b2d9073f04979f1fa45a6ed5eded46da4d7ab1ec Mon Sep 17 00:00:00 2001
From: Ryan Malloy <ryan@supported.systems>
Date: Tue, 5 May 2026 16:21:09 -0600
Subject: [PATCH] =?UTF-8?q?Add=20markdown=5Fto=5Fpdf=20tool=20=E2=80=94=20?=
 =?UTF-8?q?convert=20.md=20to=20PDF=20via=20pandoc?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New tool in ImageProcessingMixin (sibling of pdf_to_markdown). Accepts
either a markdown file path or inline markdown text, writes a PDF to a
caller-specified output path.

Engine selection auto-detects what's available on PATH, preferring quality:
xelatex > pdflatex > tectonic > weasyprint > wkhtmltopdf. Caller can force
a specific engine or pass raw pandoc args for advanced cases.

pypandoc is gated behind a new [markdown] optional extra so the base
install stays lean. The tool surfaces clear errors if pypandoc, pandoc,
or all PDF engines are missing.

Bumps to v2.2.0 (new feature, minor bump).
---
 CLAUDE.md                                     |   2 +-
 pyproject.toml                                |   9 +-
 .../mixins_official/image_processing.py       | 204 +++++++++++++++++-
 uv.lock                                       |  19 +-
 4 files changed, 229 insertions(+), 5 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index db00a51..d7c4a3c 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -91,7 +91,7 @@ uv publish
 2. **Table Extraction**: `extract_tables` - Auto-fallback through Camelot → pdfplumber → Tabula
 3. **OCR Processing**: `ocr_pdf` - Tesseract with preprocessing options
 4. **Document Analysis**: `is_scanned_pdf`, `get_document_structure`, `extract_metadata`
-5. **Format Conversion**: `pdf_to_markdown` - Writes markdown + extracted raster images and vector graphics (SVG) to disk by default, returns path + preview. Images use relative `./images/` paths, vectors use `./vectors/` paths. Set `inline=True` for full markdown in response. Set `include_vectors=False` to skip vector extraction. Use `output_filename` to override the default .md filename. When `include_vectors=True`, returns `vector_diagnostics` showing which pages had drawings below the complexity threshold. Set `vector_fallback_raster=True` to render those sub-threshold pages as full-page raster images (PNG at 150 DPI) instead of skipping them.
+5. **Format Conversion**: `pdf_to_markdown` - Writes markdown + extracted raster images and vector graphics (SVG) to disk by default, returns path + preview. Images use relative `./images/` paths, vectors use `./vectors/` paths. Set `inline=True` for full markdown in response. Set `include_vectors=False` to skip vector extraction. Use `output_filename` to override the default .md filename. When `include_vectors=True`, returns `vector_diagnostics` showing which pages had drawings below the complexity threshold. Set `vector_fallback_raster=True` to render those sub-threshold pages as full-page raster images (PNG at 150 DPI) instead of skipping them. `markdown_to_pdf` - Reverse direction: converts a `.md` file or inline markdown text to PDF using pandoc. Auto-detects available PDF engines (xelatex, pdflatex, tectonic, weasyprint, wkhtmltopdf) and picks the best one on PATH. Pass `pdf_engine` to override or `extra_args` for raw pandoc options. Requires `pip install mcp-pdf[markdown]`, the pandoc binary, and at least one of those PDF engines on PATH.
 6. **Image Processing**: `extract_images` - Extract images with custom output paths and clean summary output
 7. **Link Extraction**: `extract_links` - Extract all hyperlinks with page filtering and type categorization
 8. **PDF Forms**: `extract_form_data`, `create_form_pdf`, `fill_form_pdf`, `add_form_fields` - Complete form lifecycle management
diff --git a/pyproject.toml b/pyproject.toml
index 637801c..c9768c8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "mcp-pdf"
-version = "2.1.7"
+version = "2.2.0"
 description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more"
 authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}]
 readme = "README.md"
@@ -68,11 +68,18 @@ tables = [
     "tabula-py>=2.8.0",
 ]
 
+# Markdown → PDF conversion (requires pandoc binary + a PDF engine such as
+# xelatex, pdflatex, tectonic, weasyprint, or wkhtmltopdf)
+markdown = [
+    "pypandoc>=1.13",
+]
+
 # All optional features
 all = [
     "reportlab>=4.0.0",
     "camelot-py[cv]>=0.11.0",
     "tabula-py>=2.8.0",
+    "pypandoc>=1.13",
 ]
 
 # Development dependencies
diff --git a/src/mcp_pdf/mixins_official/image_processing.py b/src/mcp_pdf/mixins_official/image_processing.py
index 363f936..16b8e86 100644
--- a/src/mcp_pdf/mixins_official/image_processing.py
+++ b/src/mcp_pdf/mixins_official/image_processing.py
@@ -990,4 +990,206 @@ class ImageProcessingMixin(MCPMixin):
         # Match floating point numbers in SVG
         simplified = re.sub(r'-?\d+\.\d{3,}', reduce_precision, svg_content)
 
-        return simplified
\ No newline at end of file
+        return simplified
+
+    @mcp_tool(
+        name="markdown_to_pdf",
+        description=(
+            "Convert a Markdown file (or inline text) to PDF using pandoc. "
+            "Auto-detects available PDF engines (xelatex, pdflatex, tectonic, "
+            "weasyprint, wkhtmltopdf) and uses the first one found, in that order. "
+            "Pass pdf_engine to override, or extra_args for custom pandoc options "
+            "(e.g. ['-V', 'geometry:margin=1in']). Requires pandoc binary on host "
+            "and at least one PDF engine. Install with: pip install mcp-pdf[markdown]"
+        )
+    )
+    async def markdown_to_pdf(
+        self,
+        output_path: str,
+        markdown_path: Optional[str] = None,
+        markdown_text: Optional[str] = None,
+        pdf_engine: Optional[str] = None,
+        toc: bool = False,
+        title: Optional[str] = None,
+        author: Optional[str] = None,
+        date: Optional[str] = None,
+        base_path: Optional[str] = None,
+        extra_args: Optional[List[str]] = None,
+    ) -> Dict[str, Any]:
+        """
+        Convert Markdown to PDF using pandoc as the parser and one of several
+        available PDF rendering engines as the backend.
+
+        Provide either `markdown_path` (a .md file) or `markdown_text` (an
+        inline string), but not both. The output PDF is written to `output_path`.
+
+        Engine selection:
+            If `pdf_engine` is None, the first engine found on PATH is used,
+            preferring quality: xelatex > pdflatex > tectonic > weasyprint > wkhtmltopdf.
+            There is no retry with a second engine if the chosen one fails.
+            If a specific engine is requested but not on PATH, an error is returned
+            listing what *is* available.
+
+        Args:
+            output_path: Where to write the resulting PDF (required).
+            markdown_path: Path to a .md file. Mutually exclusive with markdown_text.
+            markdown_text: Inline markdown content. Mutually exclusive with markdown_path.
+            pdf_engine: Force a specific engine ("xelatex", "pdflatex", "tectonic",
+                "weasyprint", "wkhtmltopdf"). Default: auto-detect.
+            toc: Generate a table of contents from headings.
+            title: Document title (overrides any YAML frontmatter title).
+            author: Document author (overrides any YAML frontmatter author).
+            date: Document date string (overrides any YAML frontmatter date).
+            base_path: Resource resolution base for relative image references.
+                Defaults to the markdown file's directory when markdown_path is used.
+            extra_args: Additional raw pandoc CLI arguments (advanced).
+
+        Returns:
+            Dict with output_path, file_size, engine_used, conversion_time, and
+            detected_engines on success. Failures are not raised: a dict with an
+            "error" message and conversion_time is returned instead.
+        """
+        import asyncio
+        import shutil
+
+        start_time = time.time()
+        ENGINE_PREFERENCE = ["xelatex", "pdflatex", "tectonic", "weasyprint", "wkhtmltopdf"]
+
+        try:
+            # Optional dep check — pypandoc is gated behind the [markdown] extra
+            try:
+                import pypandoc
+            except ImportError:
+                return {
+                    "error": (
+                        "pypandoc is not installed. Install with: "
+                        "pip install mcp-pdf[markdown]   (also requires the pandoc "
+                        "binary and a PDF engine on PATH)"
+                    ),
+                    "conversion_time": round(time.time() - start_time, 2),
+                }
+
+            # Verify pandoc binary is reachable — pypandoc raises OSError if missing
+            try:
+                pypandoc.get_pandoc_version()
+            except OSError:
+                return {
+                    "error": (
+                        "pandoc binary not found on PATH. Install pandoc: "
+                        "https://pandoc.org/installing.html"
+                    ),
+                    "conversion_time": round(time.time() - start_time, 2),
+                }
+
+            # Validate input — exactly one of markdown_path or markdown_text
+            if bool(markdown_path) == bool(markdown_text):
+                return {
+                    "error": "Provide exactly one of markdown_path or markdown_text",
+                    "conversion_time": round(time.time() - start_time, 2),
+                }
+
+            # Resolve the source once and fail before any filesystem side effects
+            source_path = Path(markdown_path).resolve() if markdown_path else None
+            if source_path is not None and not source_path.is_file():
+                return {
+                    "error": f"Markdown file not found: {markdown_path}",
+                    "conversion_time": round(time.time() - start_time, 2),
+                }
+
+            # Validate output path (its directory is created later, just before converting)
+            output = validate_output_path(output_path)
+
+            # Detect available PDF engines on PATH
+            available_engines = [e for e in ENGINE_PREFERENCE if shutil.which(e)]
+
+            # Pick engine: explicit override or first available
+            if pdf_engine:
+                if not shutil.which(pdf_engine):
+                    return {
+                        "error": (
+                            f"Requested PDF engine '{pdf_engine}' not found on PATH. "
+                            f"Available engines: {available_engines or 'none'}"
+                        ),
+                        "detected_engines": available_engines,
+                        "conversion_time": round(time.time() - start_time, 2),
+                    }
+                engine = pdf_engine
+            else:
+                if not available_engines:
+                    return {
+                        "error": (
+                            "No PDF engine found on PATH. Install one of: "
+                            + ", ".join(ENGINE_PREFERENCE)
+                        ),
+                        "conversion_time": round(time.time() - start_time, 2),
+                    }
+                engine = available_engines[0]
+
+            # Build pandoc arguments
+            args: List[str] = [f"--pdf-engine={engine}"]
+            if toc:
+                args.append("--toc")
+            if title:
+                args.extend(["-M", f"title={title}"])
+            if author:
+                args.extend(["-M", f"author={author}"])
+            if date:
+                args.extend(["-M", f"date={date}"])
+
+            # Resource path for relative image refs — defaults to source dir
+            if base_path:
+                resource_dir = Path(base_path).resolve()
+            elif source_path is not None:
+                resource_dir = source_path.parent
+            else:
+                resource_dir = None
+            if resource_dir:
+                args.extend(["--resource-path", str(resource_dir)])
+
+            # NOTE(review): forwarded to pandoc verbatim and not vetted here — options such
+            # as -o/--output or --lua-filter bypass validate_output_path; confirm trust model.
+            if extra_args:
+                args.extend(extra_args)
+
+            # Create the output directory late, then convert in a worker thread —
+            # pypandoc blocks until pandoc exits, which can take many seconds
+            output.parent.mkdir(parents=True, exist_ok=True)
+            if source_path is not None:
+                await asyncio.to_thread(
+                    pypandoc.convert_file, str(source_path),
+                    to="pdf", outputfile=str(output), extra_args=args,
+                )
+            else:
+                await asyncio.to_thread(
+                    pypandoc.convert_text, markdown_text,
+                    to="pdf", format="md", outputfile=str(output), extra_args=args,
+                )
+
+            file_size = output.stat().st_size
+
+            return {
+                "output_path": str(output),
+                "file_size": file_size,
+                "file_size_kb": round(file_size / 1024, 2),
+                "engine_used": engine,
+                "detected_engines": available_engines,
+                "toc": toc,
+                "conversion_time": round(time.time() - start_time, 2),
+            }
+
+        except RuntimeError as e:
+            # pypandoc raises RuntimeError for pandoc subprocess failures; the message
+            # usually carries the engine's stderr, the best clue for typesetting errors
+            error_msg = sanitize_error_message(str(e))
+            logger.error(f"Markdown to PDF conversion failed: {error_msg}")
+            return {
+                "error": f"Pandoc conversion failed: {error_msg}",
+                "conversion_time": round(time.time() - start_time, 2),
+            }
+        except Exception as e:
+            error_msg = sanitize_error_message(str(e))
+            logger.error(f"Markdown to PDF failed: {error_msg}")
+            return {
+                "error": error_msg,
+                "conversion_time": round(time.time() - start_time, 2),
+            }
\ No newline at end of file
diff --git a/uv.lock b/uv.lock
index 3f15096..93c1a63 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1032,7 +1032,7 @@ wheels = [
 
 [[package]]
 name = "mcp-pdf"
-version = "2.1.7"
+version = "2.2.0"
 source = { editable = "." }
 dependencies = [
     { name = "fastmcp" },
@@ -1052,6 +1052,7 @@ dependencies = [
 [package.optional-dependencies]
 all = [
     { name = "camelot-py", extra = ["cv"] },
+    { name = "pypandoc" },
     { name = "reportlab" },
     { name = "tabula-py" },
 ]
@@ -1069,6 +1070,9 @@ dev = [
 forms = [
     { name = "reportlab" },
 ]
+markdown = [
+    { name = "pypandoc" },
+]
 tables = [
     { name = "camelot-py", extra = ["cv"] },
     { name = "tabula-py" },
@@ -1102,6 +1106,8 @@ requires-dist = [
     { name = "pip-audit", marker = "extra == 'dev'", specifier = ">=2.0.0" },
     { name = "pydantic", specifier = ">=2.0.0" },
     { name = "pymupdf", specifier = ">=1.23.0" },
+    { name = "pypandoc", marker = "extra == 'all'", specifier = ">=1.13" },
+    { name = "pypandoc", marker = "extra == 'markdown'", specifier = ">=1.13" },
     { name = "pypdf", specifier = ">=6.0.0" },
     { name = "pytesseract", specifier = ">=0.3.10" },
     { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0.0" },
@@ -1115,7 +1121,7 @@ requires-dist = [
     { name = "tabula-py", marker = "extra == 'tables'", specifier = ">=2.8.0" },
     { name = "twine", marker = "extra == 'dev'", specifier = ">=4.0.0" },
 ]
-provides-extras = ["forms", "tables", "all", "dev"]
+provides-extras = ["forms", "tables", "markdown", "all", "dev"]
 
 [package.metadata.requires-dev]
 dev = [
@@ -1938,6 +1944,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/4a/26/8c72973b8833a72785cedc3981eb59b8ac7075942718bbb7b69b352cdde4/pymupdf-1.26.3-cp39-abi3-win_amd64.whl", hash = "sha256:b4cd5124d05737944636cf45fc37ce5824f10e707b0342efe109c7b6bd37a9cc", size = 18735124, upload-time = "2025-07-02T21:31:10.992Z" },
 ]
 
+[[package]]
+name = "pypandoc"
+version = "1.17"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ea/d6/410615fc433e5d1eacc00db2044ae2a9c82302df0d35366fe2bd15de024d/pypandoc-1.17.tar.gz", hash = "sha256:51179abfd6e582a25ed03477541b48836b5bba5a4c3b282a547630793934d799", size = 69071, upload-time = "2026-03-14T22:39:07.21Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0c/86/e2ffa604eacfbec3f430b1d850e7e04c4101eca1a5828f9ae54bf51dfba4/pypandoc-1.17-py3-none-any.whl", hash = "sha256:01fdbffa61edb9f8e82e8faad6954efcb7b6f8f0634aead4d89e322a00225a67", size = 23554, upload-time = "2026-03-14T22:38:46.007Z" },
+]
+
 [[package]]
 name = "pyparsing"
 version = "3.2.3"