From 214f72b034bc4e9ea8e3a50fd9864ee0be33f30e Mon Sep 17 00:00:00 2001 From: havelentzas Date: Thu, 4 Jun 2026 21:59:01 +0200 Subject: [PATCH] Added information and error message for Python 3.14 users facing marker-pdf issues. --- README.md | 4 +++- pyproject.toml | 1 + src/presubmit/pipeline.py | 9 ++++++++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0f31ce3..13e1170 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ The 7 stages that genuinely need page rasters — the math chain (`01e`, `01e2`, ## Requirements -- Python 3.10+ +- Python 3.10+ (Python 3.14 users: PDF input is broken; see note below) - An Anthropic API key ([console.anthropic.com](https://console.anthropic.com/)) - `marker-pdf` (installed automatically via `pip install -e .`; pulls in PyTorch and a few GB of ML models on first use — see "PDF handling and cost" above) - `qpdf` on `PATH` for PDF preprocessing (optional; python fallback exists) @@ -53,6 +53,8 @@ pip install -e . Initial install takes 5–10 minutes because of `marker-pdf` and its PyTorch dependency. The first PDF conversion downloads model weights (~3–5 GB) the first time. +> **Python 3.14: PDF input is broken; `.md` / `.tex` input works fine.** `marker-pdf` depends on `surya-ocr` 0.12.x, which was built against `transformers` 4.43.x. On Python 3.14, `tokenizers` 0.20.x (which a downgraded `transformers` requires) cannot be compiled from source because `pyo3` 0.22.5 only supports up to Python 3.13. The `transformers` version that installs in a Python 3.14 venv (4.57+) has breaking changes for surya's composition config classes, causing `KeyError: 'encoder'` and subsequent errors during PDF conversion. Workaround: convert your PDF to markdown first (e.g. with `pymupdf4llm`) and pass the `.md` file — the pipeline accepts `.md` / `.markdown` / `.txt` / `.tex` directly and skips marker entirely. To use PDF input, run on Python 3.12 or 3.13. 
+ ## API key setup `presubmit` calls the Anthropic API directly via the official Python SDK. It does **not** authenticate via the `claude` CLI's OAuth subscription or via your claude.ai login — those are different auth surfaces. You need a personal API key on your Anthropic account. diff --git a/pyproject.toml b/pyproject.toml index ebdc17d..e7dee8a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ classifiers = [ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "Topic :: Scientific/Engineering", "Topic :: Text Processing", ] diff --git a/src/presubmit/pipeline.py b/src/presubmit/pipeline.py index 42bfdb4..7a40251 100644 --- a/src/presubmit/pipeline.py +++ b/src/presubmit/pipeline.py @@ -188,10 +188,17 @@ def _marker_convert(pdf_path: Path, md_path: Path) -> None: ".md / .markdown / .tex file as input to bypass marker." ) from e except Exception as e: + py314 = sys.version_info >= (3, 14) + hint = ( + " PDF input is broken on Python 3.14 — pass a .md file instead, " + "or downgrade to Python 3.12/3.13. See README." + ) if py314 else ( + " Pass a pre-converted .md file to bypass marker." + ) raise PipelineError( f"marker-pdf failed converting {pdf_path}. " f"Latest error: {e}. " - f"Earlier (<=0.x API) error: {last_err}." + f"Earlier (<=0.x API) error: {last_err}.{hint}" ) from e