From 214f72b034bc4e9ea8e3a50fd9864ee0be33f30e Mon Sep 17 00:00:00 2001
From: havelentzas <harrison.velentzas@gmail.com>
Date: Thu, 4 Jun 2026 21:59:01 +0200
Subject: [PATCH] Added information and error message for Python 3.14 users
 facing marker-pdf issues.

---
 README.md                 | 4 +++-
 pyproject.toml            | 1 +
 src/presubmit/pipeline.py | 9 ++++++++-
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 0f31ce3..13e1170 100644
--- a/README.md
+++ b/README.md
@@ -34,7 +34,7 @@ The 7 stages that genuinely need page rasters — the math chain (`01e`, `01e2`,
 
 ## Requirements
 
-- Python 3.10+
+- Python 3.10+ (Python 3.14 users: PDF input is broken; see note below)
 - An Anthropic API key ([console.anthropic.com](https://console.anthropic.com/))
 - `marker-pdf` (installed automatically via `pip install -e .`; pulls in PyTorch and a few GB of ML models on first use — see "PDF handling and cost" above)
 - `qpdf` on `PATH` for PDF preprocessing (optional; python fallback exists)
@@ -53,6 +53,8 @@ pip install -e .
 
 Initial install takes 5–10 minutes because of `marker-pdf` and its PyTorch dependency. The first PDF conversion downloads model weights (~3–5 GB) the first time.
 
+> **Python 3.14: PDF input is broken; `.md` / `.tex` input works fine.** `marker-pdf` depends on `surya-ocr` 0.12.x, which was built against `transformers` 4.43.x. On Python 3.14 you cannot downgrade `transformers` to a compatible version: doing so requires `tokenizers` 0.20.x, which cannot be compiled from source because `pyo3` 0.22.5 only supports Python up to 3.13. The `transformers` version that does install in a Python 3.14 venv (4.57+) is incompatible with surya's composition config classes, causing `KeyError: 'encoder'` and follow-on errors during PDF conversion. Workaround: convert your PDF to markdown first (e.g. with `pymupdf4llm`) and pass the `.md` file — the pipeline accepts `.md` / `.markdown` / `.txt` / `.tex` directly and skips marker entirely. To use PDF input, run on Python 3.12 or 3.13.
+
 ## API key setup
 
 `presubmit` calls the Anthropic API directly via the official Python SDK. It does **not** authenticate via the `claude` CLI's OAuth subscription or via your claude.ai login — those are different auth surfaces. You need a personal API key on your Anthropic account.
diff --git a/pyproject.toml b/pyproject.toml
index ebdc17d..e7dee8a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,6 +20,7 @@ classifiers = [
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
     "Topic :: Scientific/Engineering",
     "Topic :: Text Processing",
 ]
diff --git a/src/presubmit/pipeline.py b/src/presubmit/pipeline.py
index 42bfdb4..7a40251 100644
--- a/src/presubmit/pipeline.py
+++ b/src/presubmit/pipeline.py
@@ -188,10 +188,17 @@ def _marker_convert(pdf_path: Path, md_path: Path) -> None:
             ".md / .markdown / .tex file as input to bypass marker."
         ) from e
     except Exception as e:
+        py314 = sys.version_info >= (3, 14)
+        hint = (
+            " PDF input is broken on Python 3.14 — pass a .md file instead, "
+            "or downgrade to Python 3.12/3.13. See README."
+        ) if py314 else (
+            " Pass a pre-converted .md file to bypass marker."
+        )
         raise PipelineError(
             f"marker-pdf failed converting {pdf_path}. "
             f"Latest error: {e}. "
-            f"Earlier (<=0.x API) error: {last_err}."
+            f"Earlier (<=0.x API) error: {last_err}.{hint}"
         ) from e