From 91cc60956e227cda2910ace0845af3b45ba72943 Mon Sep 17 00:00:00 2001 From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com> Date: Fri, 24 Apr 2026 16:04:29 +0530 Subject: [PATCH 1/9] chore: add rust-toolchain and justfile for consistent dev tooling rust-toolchain.toml pins every contributor and CI invocation to the same stable toolchain with rustfmt, clippy, and the wasm32-unknown-unknown target. Previously CI used dtolnay/rust-toolchain@stable while contributors installed their own; minor version drift between them could produce clippy lint discrepancies at merge time. justfile captures the common build / test / lint commands documented in CLAUDE.md as executable recipes. `just` (no args) prints the full list, and the common flows (build, test, check, fmt, clean-all) are one step each so local iteration matches the pre-commit chain. --- justfile | 83 +++++++++++++++++++++++++++++++++++++++++++++ rust-toolchain.toml | 4 +++ 2 files changed, 87 insertions(+) create mode 100644 justfile create mode 100644 rust-toolchain.toml diff --git a/justfile b/justfile new file mode 100644 index 0000000..78158be --- /dev/null +++ b/justfile @@ -0,0 +1,83 @@ +# paperjam — common developer commands +# +# Run `just` (no args) for the list. Requires `just` (https://just.systems). + +default: + @just --list + +# --- Build --------------------------------------------------------------- + +# Build + install the Python extension into the current venv (release). +build: + uv run maturin develop --release + +# Build + install the Python extension in debug mode (fast rebuild). +build-dev: + uv run maturin develop + +# Build the whole Rust workspace. +build-rust: + cargo build --workspace + +# Compile-check the wasm target (no linking). +build-wasm: + cargo check -p paperjam-wasm --target wasm32-unknown-unknown + +# Build the Docusaurus docs site. +build-docs: + cd docs-site && npm ci && npm run build + +# Render crate API docs into target/doc. 
+rustdoc: + cargo doc --workspace --no-deps + +# --- Test ---------------------------------------------------------------- + +# Run the full Rust test suite. +test-rust: + cargo test --workspace + +# Run the Python test suite (requires `just build` first). +test-py: + uv run pytest tests/python/ -v + +# Run both Rust and Python test suites. +test: test-rust test-py + +# Run the table-accuracy harness (slower, requires fixtures). +test-accuracy: + uv run pytest tests/python/ -m accuracy -v + +# --- Lint / format ------------------------------------------------------- + +# Apply all autoformatters and run every pre-commit hook. +check: + pre-commit run --all-files + +# Rust-only checks (fmt + clippy). +check-rust: + cargo fmt --all --check + cargo clippy --workspace --all-targets -- -D warnings + +# Python-only checks (ruff + mypy). +check-py: + uv run ruff check py_src/ tests/ + uv run ruff format --check py_src/ tests/ + uv run mypy py_src/ tests/ examples/ --ignore-missing-imports + +# Apply autoformatters (doesn't fail on remaining lint issues). +fmt: + cargo fmt --all + uv run ruff format py_src/ tests/ + uv run ruff check --fix py_src/ tests/ + +# --- Clean --------------------------------------------------------------- + +# Remove Rust build artifacts. +clean: + cargo clean + +# Remove Rust build artifacts and Python caches. +clean-all: clean + rm -rf .pytest_cache .mypy_cache .ruff_cache + find . 
-type d -name __pycache__ -exec rm -rf {} + diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 0000000..c38c8a8 --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,4 @@ +[toolchain] +channel = "stable" +components = ["rustfmt", "clippy"] +targets = ["wasm32-unknown-unknown"] From 6eb020bb7e9d8f02eeabe7d2634c99a446e0d363 Mon Sep 17 00:00:00 2001 From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com> Date: Fri, 24 Apr 2026 16:04:42 +0530 Subject: [PATCH 2/9] chore(async): stop force-enabling signatures/validation on core paperjam-async currently only reaches into paperjam_core::render, yet its manifest force-enabled the signatures and validation features on paperjam-core for every consumer. Downstream crates that need those features (paperjam-py does, explicitly) keep working unchanged; lightweight async consumers no longer drag in the x509-parser / cms / rsa / p256 / sha1 / pkcs8 / spki / ureq / rustls / roxmltree tree. --- crates/paperjam-async/Cargo.toml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/crates/paperjam-async/Cargo.toml b/crates/paperjam-async/Cargo.toml index bd05547..1deaf83 100644 --- a/crates/paperjam-async/Cargo.toml +++ b/crates/paperjam-async/Cargo.toml @@ -6,7 +6,12 @@ license.workspace = true description = "Async wrappers for paperjam operations via tokio::spawn_blocking" [dependencies] -paperjam-core = { path = "../paperjam-core", features = ["render", "signatures", "validation"] } +# paperjam-async currently only uses `paperjam_core::render`, so we enable +# that feature here but leave `signatures`/`validation` to be opted in by +# the downstream crate that actually needs them (paperjam-py does this +# explicitly). Keeps the async surface lightweight for callers that only +# want basic document operations. 
+paperjam-core = { path = "../paperjam-core", features = ["render"] } paperjam-model = { path = "../paperjam-model" } paperjam-convert = { path = "../paperjam-convert", optional = true } tokio = { workspace = true } From 8646fe5a7c48b4cf4b66c86e5752226566d458ae Mon Sep 17 00:00:00 2001 From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com> Date: Fri, 24 Apr 2026 16:04:56 +0530 Subject: [PATCH 3/9] docs: crate-level rustdoc across the workspace Every library crate now has a `//!` summary describing its scope, its entry points, and how it fits into the broader paperjam ecosystem. Uniform style: plain prose, no intra-doc links in crate-level summaries (simpler to maintain, no rustdoc link warnings to manage). Also fixes two pre-existing rustdoc warnings uncovered along the way: an `[OPTIONAL]` literal in signature/tsa.rs that rustdoc was parsing as an intra-doc link, and a bare URL in model/annotations.rs flagged for auto-linking. The PyO3 `PyDocument` and `PyPage` classes get class-level docs that clarify they are the native layer beneath the pure-Python `paperjam.Document` / `paperjam.Page` wrappers. After this commit `cargo doc --workspace --no-deps` produces zero warnings. 
--- crates/paperjam-async/src/lib.rs | 8 ++++++++ crates/paperjam-convert/src/lib.rs | 8 ++++++++ crates/paperjam-core/src/lib.rs | 20 ++++++++++++++++++++ crates/paperjam-core/src/signature/tsa.rs | 2 +- crates/paperjam-docx/src/lib.rs | 8 ++++++++ crates/paperjam-epub/src/lib.rs | 9 +++++++++ crates/paperjam-html/src/lib.rs | 7 +++++++ crates/paperjam-mcp/src/lib.rs | 10 ++++++++++ crates/paperjam-model/src/annotations.rs | 2 +- crates/paperjam-model/src/lib.rs | 10 ++++++++++ crates/paperjam-pipeline/src/lib.rs | 10 ++++++++++ crates/paperjam-pptx/src/lib.rs | 6 ++++++ crates/paperjam-py/src/document.rs | 8 +++++++- crates/paperjam-py/src/lib.rs | 8 ++++++++ crates/paperjam-py/src/page.rs | 6 ++++++ crates/paperjam-wasm/src/lib.rs | 8 ++++++++ crates/paperjam-xlsx/src/lib.rs | 7 +++++++ 17 files changed, 134 insertions(+), 3 deletions(-) diff --git a/crates/paperjam-async/src/lib.rs b/crates/paperjam-async/src/lib.rs index 9688833..30deb05 100644 --- a/crates/paperjam-async/src/lib.rs +++ b/crates/paperjam-async/src/lib.rs @@ -1,3 +1,11 @@ +//! Tokio-native async wrappers around paperjam's blocking operations. +//! +//! Each heavy operation (`open`, `save`, `render`, `to_markdown`, +//! `merge`, `redact_text`, ...) is re-exposed as an `async fn` that runs +//! the blocking work on `tokio::spawn_blocking`. This is what powers the +//! `paperjam.aopen` / `paperjam.arender_*` / `paperjam.amerge` helpers on +//! the Python side. + pub mod document; pub mod generic; pub mod page; diff --git a/crates/paperjam-convert/src/lib.rs b/crates/paperjam-convert/src/lib.rs index f8e6395..f97efea 100644 --- a/crates/paperjam-convert/src/lib.rs +++ b/crates/paperjam-convert/src/lib.rs @@ -1,3 +1,11 @@ +//! Cross-format document conversion. +//! +//! Orchestrates conversion between every pair of formats supported by the +//! paperjam workspace (PDF, DOCX, XLSX, PPTX, HTML, EPUB, Markdown). Each +//! format crate is an optional dependency so consumers only pay for the +//! 
formats they want; features named after the source and target crates +//! gate those conversions in and out. + pub mod convert; pub mod detect; pub mod error; diff --git a/crates/paperjam-core/src/lib.rs b/crates/paperjam-core/src/lib.rs index 9836e0d..8e1d503 100644 --- a/crates/paperjam-core/src/lib.rs +++ b/crates/paperjam-core/src/lib.rs @@ -1,3 +1,23 @@ +//! Pure-Rust PDF engine: parsing, text and table extraction, page +//! manipulation, form fields, digital signatures, encryption, rendering, +//! and PDF/A / PDF/UA validation. +//! +//! `paperjam-core` is the PDF-specific implementation behind the +//! `paperjam` library. Non-PDF formats live in sibling crates +//! (`paperjam-docx`, `paperjam-xlsx`, `paperjam-pptx`, `paperjam-html`, +//! `paperjam-epub`); cross-format operations go through `paperjam-convert`. +//! +//! Heavy optional pieces are feature-gated: +//! +//! | Feature | Enables | +//! |--------------|----------------------------------------------------------| +//! | `render` | page-to-image rasterisation via pdfium | +//! | `signatures` | sign / verify / inspect digital signatures | +//! | `ltv` | long-term validation (TSA, OCSP, CRL embedding) | +//! | `validation` | PDF/A and PDF/UA conformance checks | +//! | `parallel` | rayon-based parallel processing (default on) | +//! | `mmap` | memory-mapped file access for large documents | + pub mod annotations; pub mod bookmarks; #[cfg(feature = "validation")] diff --git a/crates/paperjam-core/src/signature/tsa.rs b/crates/paperjam-core/src/signature/tsa.rs index 7c89031..7ccccfb 100644 --- a/crates/paperjam-core/src/signature/tsa.rs +++ b/crates/paperjam-core/src/signature/tsa.rs @@ -54,7 +54,7 @@ pub fn build_timestamp_request(signature_value: &[u8]) -> Result> { /// Parse an RFC 3161 timestamp response and extract the TimeStampToken. 
/// -/// The response is: SEQUENCE { status PKIStatusInfo, timeStampToken [OPTIONAL] } +/// The response is: `SEQUENCE { status PKIStatusInfo, timeStampToken [OPTIONAL] }` /// We check the status integer and return the token bytes. pub fn parse_timestamp_response(resp_bytes: &[u8]) -> Result> { // Minimal DER parsing: skip the outer SEQUENCE, read PKIStatusInfo, extract token diff --git a/crates/paperjam-docx/src/lib.rs b/crates/paperjam-docx/src/lib.rs index 72bfb0e..fd0a358 100644 --- a/crates/paperjam-docx/src/lib.rs +++ b/crates/paperjam-docx/src/lib.rs @@ -1,3 +1,11 @@ +//! DOCX (Office Open XML word-processing) support for the paperjam +//! ecosystem. +//! +//! Reads and writes `.docx` files and exposes text, tables, and metadata +//! through the `DocumentTrait` implementation on `DocxDocument`. Body +//! parsing is delegated to `docx-rs`; an internal size-capped ZIP reader +//! handles the metadata parts the upstream API does not expose. + mod document; mod error; mod image; diff --git a/crates/paperjam-epub/src/lib.rs b/crates/paperjam-epub/src/lib.rs index d85af44..e408216 100644 --- a/crates/paperjam-epub/src/lib.rs +++ b/crates/paperjam-epub/src/lib.rs @@ -1,3 +1,12 @@ +//! EPUB document support for the paperjam ecosystem. +//! +//! Parses EPUB archives (container.xml → OPF → spine) and exposes each +//! chapter as an `HtmlDocument`, delegating per-chapter rendering to +//! `paperjam-html`. Implements `DocumentTrait` so EPUB files participate +//! in the shared model (chapter → page). +//! +//! Archive reads are size-capped internally to mitigate zip-bomb attacks. + mod document; mod error; mod image; diff --git a/crates/paperjam-html/src/lib.rs b/crates/paperjam-html/src/lib.rs index 723b07d..922b117 100644 --- a/crates/paperjam-html/src/lib.rs +++ b/crates/paperjam-html/src/lib.rs @@ -1,3 +1,10 @@ +//! HTML document support for the paperjam ecosystem. +//! +//! Parses HTML bytes via `scraper`, extracts text and tables, and +//! 
implements `DocumentTrait` so HTML documents share the same API +//! surface as the office formats. Also used by `paperjam-epub` for +//! chapter content (EPUB spine entries are XHTML). + mod document; mod error; pub mod image; diff --git a/crates/paperjam-mcp/src/lib.rs b/crates/paperjam-mcp/src/lib.rs index c65a1c7..92895bb 100644 --- a/crates/paperjam-mcp/src/lib.rs +++ b/crates/paperjam-mcp/src/lib.rs @@ -1,3 +1,13 @@ +//! Model Context Protocol server for paperjam. +//! +//! Exposes document operations (open, extract, convert, manipulate, +//! render, sign, validate, run pipelines, …) as MCP tools that a local +//! assistant (Claude Code, Cursor, Claude Desktop) can invoke. All +//! path resolution goes through a sandbox rooted at the server's +//! configured working directory; absolute paths escaping the sandbox +//! are rejected unless the operator opts out with +//! `--allow-absolute-paths`. + pub mod error; pub mod session; diff --git a/crates/paperjam-model/src/annotations.rs b/crates/paperjam-model/src/annotations.rs index 159eee2..acd5764 100644 --- a/crates/paperjam-model/src/annotations.rs +++ b/crates/paperjam-model/src/annotations.rs @@ -1,7 +1,7 @@ /// Where a link annotation points to. #[derive(Debug, Clone)] pub enum LinkDestination { - /// External URI (e.g. "https://example.com"). + /// External URI (e.g. `https://example.com`). Uri(String), /// Go to a specific page within the document. GoTo { page: u32 }, diff --git a/crates/paperjam-model/src/lib.rs b/crates/paperjam-model/src/lib.rs index 8613b0a..29039a8 100644 --- a/crates/paperjam-model/src/lib.rs +++ b/crates/paperjam-model/src/lib.rs @@ -1,3 +1,13 @@ +//! Format-agnostic types and traits shared across the paperjam workspace. +//! +//! Holds the stable data model — bookmarks, metadata, tables, text layout, +//! annotations, structure blocks — plus the `DocumentTrait` that every +//! format crate (`paperjam-docx`, `paperjam-xlsx`, ...) implements. +//! +//! 
This crate intentionally has no format-specific dependencies, so +//! downstream crates can depend on it without pulling in parsers they +//! do not use. + pub mod document; pub mod format; diff --git a/crates/paperjam-pipeline/src/lib.rs b/crates/paperjam-pipeline/src/lib.rs index 90071a4..6c9bac3 100644 --- a/crates/paperjam-pipeline/src/lib.rs +++ b/crates/paperjam-pipeline/src/lib.rs @@ -1,3 +1,13 @@ +//! Declarative multi-step document workflows defined in YAML or JSON. +//! +//! A pipeline is a sequence of steps — open, extract, convert, redact, +//! merge, save — applied to one or more input files. The engine runs +//! steps serially or in parallel and returns a per-file summary +//! (success / failure / skipped). +//! +//! Used by the `pj pipeline` CLI subcommand and the `run_pipeline` MCP +//! tool. + pub mod builder; pub mod context; pub mod definition; diff --git a/crates/paperjam-pptx/src/lib.rs b/crates/paperjam-pptx/src/lib.rs index 7fe2f6b..07e8e7a 100644 --- a/crates/paperjam-pptx/src/lib.rs +++ b/crates/paperjam-pptx/src/lib.rs @@ -1,3 +1,9 @@ +//! PPTX (Office Open XML presentation) support for the paperjam ecosystem. +//! +//! Parses `.pptx` archives slide-by-slide, extracts text blocks and +//! tables from slide XML, and implements `DocumentTrait` so presentations +//! participate in the shared model (slide → page). + pub mod document; pub mod error; pub mod markdown; diff --git a/crates/paperjam-py/src/document.rs b/crates/paperjam-py/src/document.rs index 493f500..c03ff6a 100644 --- a/crates/paperjam-py/src/document.rs +++ b/crates/paperjam-py/src/document.rs @@ -6,7 +6,13 @@ use std::sync::Arc; use crate::errors::to_py_err; use crate::page::PyPage; -/// Internal Rust document, exposed to Python as _paperjam.RustDocument. +/// Native PDF document handle, exposed to Python as `_paperjam.RustDocument`. 
+/// +/// Users normally access this via the higher-level `paperjam.Document` +/// wrapper in `py_src/paperjam/_document.py`, which adds ergonomic +/// defaults and composed operations on top of the raw bindings. +/// Holds an `Arc` so the same underlying PDF can be shared +/// across threads and Python tasks without copying. #[pyclass(name = "RustDocument")] pub struct PyDocument { pub(crate) inner: Arc, diff --git a/crates/paperjam-py/src/lib.rs b/crates/paperjam-py/src/lib.rs index 6fa5475..047dc5e 100644 --- a/crates/paperjam-py/src/lib.rs +++ b/crates/paperjam-py/src/lib.rs @@ -1,3 +1,11 @@ +//! PyO3 bindings that expose paperjam's Rust engine to Python as the +//! `_paperjam` native extension module. +//! +//! The Python package (`py_src/paperjam/`) wraps these raw bindings with +//! a more idiomatic API. Every PyO3-exposed symbol registered here must +//! also appear in `py_src/paperjam/_paperjam.pyi` so static type checkers +//! can see the extension's surface. + use pyo3::prelude::*; #[cfg(feature = "formats")] diff --git a/crates/paperjam-py/src/page.rs b/crates/paperjam-py/src/page.rs index aebe5a2..dbac9c4 100644 --- a/crates/paperjam-py/src/page.rs +++ b/crates/paperjam-py/src/page.rs @@ -5,6 +5,12 @@ use std::sync::Arc; use crate::errors::to_py_err; +/// Native PDF page handle, exposed to Python as `_paperjam.RustPage`. +/// +/// Obtained via `RustDocument.page(n)`. Users normally access this +/// through the higher-level `paperjam.Page` wrapper in +/// `py_src/paperjam/_page.py`, which adds ergonomic defaults and +/// composed operations on top of the raw bindings. #[pyclass(name = "RustPage")] pub struct PyPage { pub(crate) inner: Arc, diff --git a/crates/paperjam-wasm/src/lib.rs b/crates/paperjam-wasm/src/lib.rs index 43c9d27..936b9f1 100644 --- a/crates/paperjam-wasm/src/lib.rs +++ b/crates/paperjam-wasm/src/lib.rs @@ -1,3 +1,11 @@ +//! WebAssembly bindings for paperjam, exposed via `wasm-bindgen`. +//! +//! 
Builds with `wasm-pack build --target web`. The generated JS + WASM +//! pair powers the interactive playground on the docs site. +//! Functionality is a subset of the native engine — rendering and +//! signatures are omitted on wasm, and compression is pure-Rust to avoid +//! `libz-sys` on `wasm32-unknown-unknown`. + use std::sync::Arc; use paperjam_core::document::Document; diff --git a/crates/paperjam-xlsx/src/lib.rs b/crates/paperjam-xlsx/src/lib.rs index 21d192a..8e925f2 100644 --- a/crates/paperjam-xlsx/src/lib.rs +++ b/crates/paperjam-xlsx/src/lib.rs @@ -1,3 +1,10 @@ +//! XLSX (Office Open XML spreadsheet) support for the paperjam ecosystem. +//! +//! Reads `.xlsx` workbooks via `calamine` and writes them via +//! `rust_xlsxwriter`. Each sheet's rows are exposed as stringified cells, +//! and the crate implements `DocumentTrait` so workbooks participate in +//! the shared model (sheet → page, cell → text). + pub mod document; pub mod error; pub mod markdown; From 22fd55215f3473756dc26d302610e3e5af717dbe Mon Sep 17 00:00:00 2001 From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com> Date: Fri, 24 Apr 2026 16:05:05 +0530 Subject: [PATCH 4/9] chore(ci): run docs workflow on PRs and install wasm-opt The docs workflow previously fired only on pushes to main, so docs regressions (broken wasm builds, Docusaurus compile errors, bad links) were invisible until after merge. Now PRs with matching paths run the full build (without deploying) so problems surface in the PR check run. Also installs binaryen, whose wasm-opt binary wasm-pack invokes automatically when present on PATH. Release-mode WASM bundles shrink by 20-30% with no code changes. Concurrency group is keyed on ref so PR builds and deploy builds don't cancel each other; the deploy job is skipped on pull_request events to preserve production pages behaviour. 
--- .github/workflows/docs.yml | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 342d3a0..c832a33 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -7,6 +7,12 @@ on: - "docs-site/**" - "crates/paperjam-wasm/**" - ".github/workflows/docs.yml" + pull_request: + branches: [main] + paths: + - "docs-site/**" + - "crates/paperjam-wasm/**" + - ".github/workflows/docs.yml" workflow_dispatch: permissions: @@ -15,7 +21,7 @@ permissions: id-token: write concurrency: - group: pages + group: pages-${{ github.ref }} cancel-in-progress: false jobs: @@ -35,6 +41,11 @@ jobs: - name: Install wasm-pack run: curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh + - name: Install binaryen (wasm-opt) + # wasm-pack invokes wasm-opt automatically if present on PATH; + # binaryen ships wasm-opt. Drops release-mode wasm size 20-30%. + run: sudo apt-get update && sudo apt-get install -y binaryen + - name: Build WASM run: wasm-pack build crates/paperjam-wasm --target web --release --out-dir ../../docs-site/static/wasm @@ -52,12 +63,14 @@ jobs: run: cd docs-site && npm run build - name: Upload Pages artifact + if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' uses: actions/upload-pages-artifact@v5 with: path: docs-site/build deploy: needs: build + if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' runs-on: ubuntu-latest environment: name: github-pages From 7d9fafd080ee22453f4f60c62fac85132238e42f Mon Sep 17 00:00:00 2001 From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com> Date: Fri, 24 Apr 2026 16:05:12 +0530 Subject: [PATCH 5/9] docs(changelog): record [Unreleased] entries since 0.2.0 Document the audit-driven work that has landed on main but hasn't been cut into a release yet: the ZIP-entry and MCP sandbox security hardening (#69), the panic-surface cleanup in the PDF engine (#70), the 
form-bindings stub sync and metadata / docs refresh (#68), plus the tooling, docs, and paperjam-async feature adjustments from this polish branch. --- CHANGELOG.md | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ec1502..0ce3492 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,64 @@ and this project adheres to [Semantic Versioning](https://semver.org/). ## [Unreleased] +### Security + +- Bound ZIP entry reads in EPUB, PPTX, and DOCX parsers. A crafted archive + declaring a tiny compressed size could previously expand to multi-GB on + decompression; entries are now rejected when the declared or observed + decompressed size exceeds a per-entry cap. +- Cap `Vec::with_capacity` preallocations in XLSX sheet parsing and PPTX + slide parsing at reasonable ceilings so attacker-controlled counts can + no longer trigger large allocations up front. +- `paperjam-mcp`: resolved paths are now sandboxed to the configured + working directory by default. Absolute paths and `..` traversal that + escape the working dir are rejected with a structured error. Operators + can opt out with `--allow-absolute-paths` (or + `ServerConfig::allow_absolute_paths`). + +### Fixed + +- Replace panic-prone `f64::partial_cmp(..).unwrap()` in table detection + (`table/{grid,lattice,stream}.rs`) with `total_cmp`, so malformed PDFs + producing NaN coordinates no longer crash the parser. +- Replace `get_object_mut().unwrap()` / `as_dict_mut().unwrap()` / + `from_utf8().unwrap()` across the stamp, watermark, bookmarks, and + PDF/UA validation modules with structured `PdfError` returns. Malformed + PDFs now surface typed errors instead of panicking the process. +- Stub drift: add `modify_form_field`, `add_form_field`, and the + `fill_form.generate_appearances` parameter to `_paperjam.pyi` so mypy + sees the full PyO3 surface. + +### Added + +- Crate-level `//!` rustdoc summaries on every workspace crate. 
+- `rust-toolchain.toml` pins the contributor toolchain to stable with + `rustfmt`, `clippy`, and the `wasm32-unknown-unknown` target. +- `justfile` with shortcuts for common build / test / lint tasks. +- `[profile.release]` with thin LTO, `codegen-units = 1`, and symbol + strip. Adds a `release-with-debug` profile for profiling. + +### Changed + +- `paperjam-async` no longer force-enables `signatures` and `validation` + on `paperjam-core`. Consumers that need them (e.g. `paperjam-py`) + continue to enable them explicitly; lightweight async users no longer + drag in the full signing / validation stack. +- Docs site CI now builds on pull requests (without deploying) so docs + regressions are caught pre-merge. Binaryen's `wasm-opt` is installed + so release WASM bundles are size-optimized. + +### Docs + +- README: CLI examples now use the correct `pj` binary name and accurate + flags; removed the nonexistent `extract tables --format csv` flag. +- `docs-site/docs/getting-started/installation.md`: replace leftover + Sphinx build instructions with the Docusaurus workflow, fix the + clone org, expand the feature-flag table. +- `pyproject.toml`: fill in multi-format description, `readme`, + `project.urls`, and extra classifiers/keywords so the PyPI page is + populated. Drop the stale Sphinx `[docs]` extra. + ## [0.2.0] — 2026-04-04 ### Added From 9af7fc5055e65c8991ee66f6c556aa6be2a39c3a Mon Sep 17 00:00:00 2001 From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com> Date: Fri, 24 Apr 2026 16:16:11 +0530 Subject: [PATCH 6/9] fix(ci): install pinned binaryen release instead of apt binaryen MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ubuntu's apt-shipped binaryen is ~v108, which predates the default enablement of bulk-memory and sign-extension instructions in rustc output. 
The result is wasm-pack invoking /usr/bin/wasm-opt on a valid modern wasm module and wasm-opt rejecting it with "[wasm-validator error] Bulk memory operation (bulk memory is disabled)" — observed on the PR #71 run. Download and install a pinned binaryen release tarball from the upstream GitHub releases page. version_119 is known-good against the current rustc and supports all default features. Future bumps change one env var. --- .github/workflows/docs.yml | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index c832a33..23abc57 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -42,9 +42,21 @@ jobs: run: curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh - name: Install binaryen (wasm-opt) - # wasm-pack invokes wasm-opt automatically if present on PATH; - # binaryen ships wasm-opt. Drops release-mode wasm size 20-30%. - run: sudo apt-get update && sudo apt-get install -y binaryen + # wasm-pack invokes wasm-opt automatically when it is on PATH. + # Ubuntu's apt package is too old to validate modern rustc output + # (rustc 1.95+ emits bulk-memory and sign-extension instructions + # by default, unsupported by apt-binaryen ~v108), so we install a + # pinned upstream release from the binaryen GitHub releases page. 
+ env: + BINARYEN_VERSION: version_119 + run: | + set -euo pipefail + tarball="binaryen-${BINARYEN_VERSION}-x86_64-linux.tar.gz" + url="https://github.com/WebAssembly/binaryen/releases/download/${BINARYEN_VERSION}/${tarball}" + curl -fsSL "$url" -o "/tmp/${tarball}" + tar -xzf "/tmp/${tarball}" -C /tmp + sudo install -m 0755 "/tmp/binaryen-${BINARYEN_VERSION}/bin/wasm-opt" /usr/local/bin/wasm-opt + wasm-opt --version - name: Build WASM run: wasm-pack build crates/paperjam-wasm --target web --release --out-dir ../../docs-site/static/wasm From db29661aebe0c8374e2b7d7d9b05c1e88f02a6b6 Mon Sep 17 00:00:00 2001 From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com> Date: Fri, 24 Apr 2026 16:18:11 +0530 Subject: [PATCH 7/9] chore(ci): verify binaryen tarball checksum and cache across runs Harden the binaryen install step that landed in the previous commit: - SHA256-pin the downloaded tarball (value verified against a local download of version_119). Guards against upstream tampering or an accidental silent swap. - Split the version-check into a dedicated Verify step so the log shows the installed wasm-opt version unambiguously. - Wrap the install in actions/cache keyed on the pinned version so subsequent runs skip the download. Saves ~3-5s per run. --- .github/workflows/docs.yml | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 23abc57..8509006 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -41,22 +41,37 @@ jobs: - name: Install wasm-pack run: curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh + # wasm-pack invokes wasm-opt automatically when it is on PATH. + # Ubuntu's apt-packaged binaryen (~v108) is too old to validate + # modern rustc output — rustc 1.95+ emits bulk-memory and + # sign-extension instructions by default, which that wasm-opt + # rejects. 
We install a pinned upstream release, verify its + # SHA256, and cache it across runs. + - name: Cache binaryen + id: cache-binaryen + uses: actions/cache@v4 + env: + BINARYEN_VERSION: version_119 + with: + path: /usr/local/bin/wasm-opt + key: binaryen-${{ env.BINARYEN_VERSION }}-x86_64-linux + - name: Install binaryen (wasm-opt) - # wasm-pack invokes wasm-opt automatically when it is on PATH. - # Ubuntu's apt package is too old to validate modern rustc output - # (rustc 1.95+ emits bulk-memory and sign-extension instructions - # by default, unsupported by apt-binaryen ~v108), so we install a - # pinned upstream release from the binaryen GitHub releases page. + if: steps.cache-binaryen.outputs.cache-hit != 'true' env: BINARYEN_VERSION: version_119 + BINARYEN_SHA256: 716bcf9f5f36a6f466239fbb09a925eeaf54c46411ccefac979ec649e7c06d2d run: | set -euo pipefail tarball="binaryen-${BINARYEN_VERSION}-x86_64-linux.tar.gz" url="https://github.com/WebAssembly/binaryen/releases/download/${BINARYEN_VERSION}/${tarball}" curl -fsSL "$url" -o "/tmp/${tarball}" + echo "${BINARYEN_SHA256} /tmp/${tarball}" | sha256sum --check --strict tar -xzf "/tmp/${tarball}" -C /tmp sudo install -m 0755 "/tmp/binaryen-${BINARYEN_VERSION}/bin/wasm-opt" /usr/local/bin/wasm-opt - wasm-opt --version + + - name: Verify wasm-opt + run: wasm-opt --version - name: Build WASM run: wasm-pack build crates/paperjam-wasm --target web --release --out-dir ../../docs-site/static/wasm From 8298aeb955bdba33f1960a3c322abb4f22dbe2c3 Mon Sep 17 00:00:00 2001 From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com> Date: Fri, 24 Apr 2026 16:21:06 +0530 Subject: [PATCH 8/9] fix(wasm): tell wasm-pack to enable bulk-memory and sign-ext in wasm-opt rustc 1.82+ emits bulk-memory and sign-extension instructions in its default wasm output. 
wasm-pack's baseline wasm-opt invocation ("-O") does not pass --enable-bulk-memory / --enable-sign-ext, so even a modern binaryen rejects the module with "Bulk memory operations require bulk memory [--enable-bulk-memory]" during validation. Configure the flags in paperjam-wasm's Cargo.toml metadata block so wasm-pack invokes wasm-opt with the right feature set. This is what was blocking CI #71 even after installing a modern binaryen. --- crates/paperjam-wasm/Cargo.toml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/crates/paperjam-wasm/Cargo.toml b/crates/paperjam-wasm/Cargo.toml index be89264..38a8635 100644 --- a/crates/paperjam-wasm/Cargo.toml +++ b/crates/paperjam-wasm/Cargo.toml @@ -34,3 +34,11 @@ pptx = ["dep:paperjam-pptx", "paperjam-convert/pptx"] html = ["dep:paperjam-html", "paperjam-convert/html"] epub = ["dep:paperjam-epub", "paperjam-convert/epub"] all-formats = ["pdf", "docx", "xlsx", "pptx", "html", "epub"] + +# wasm-pack reads this block to decide how to invoke wasm-opt on the +# release bundle. rustc 1.82+ emits bulk-memory and sign-extension ops +# by default, and wasm-pack's default wasm-opt invocation ("-O") leaves +# those features disabled — we enable them here so the optimiser can +# validate the input. Everything else mirrors wasm-pack's -O level 2. +[package.metadata.wasm-pack.profile.release] +wasm-opt = ["-O", "--enable-bulk-memory", "--enable-sign-ext"] From fa7dde96382e58af65852073f2012a0361c14371 Mon Sep 17 00:00:00 2001 From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com> Date: Fri, 24 Apr 2026 16:25:23 +0530 Subject: [PATCH 9/9] fix(wasm): extend wasm-opt feature set to the full rustc default list Rust 1.87 / LLVM 20 enabled bulk-memory and nontrapping-fptoint in the default wasm32-unknown-unknown feature set, alongside the previously-defaulted multivalue, mutable-globals, reference-types, and sign-ext. 
wasm-pack's baseline "-O" invocation of wasm-opt does not pass any of them, so the optimiser rejects a perfectly valid rustc-emitted module. The previous commit only enabled bulk-memory and sign-ext, which exposed a follow-on validator error on `i32.trunc_sat_f64_s` (nontrapping-fptoint). Rather than re-play whack-a-mole for each feature, pass the full list that matches the rustc default set documented in the wasm32-unknown-unknown platform-support page. Ref: https://doc.rust-lang.org/rustc/platform-support/wasm32-unknown-unknown.html --- crates/paperjam-wasm/Cargo.toml | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/crates/paperjam-wasm/Cargo.toml b/crates/paperjam-wasm/Cargo.toml index 38a8635..d9b643e 100644 --- a/crates/paperjam-wasm/Cargo.toml +++ b/crates/paperjam-wasm/Cargo.toml @@ -36,9 +36,17 @@ epub = ["dep:paperjam-epub", "paperjam-convert/epub"] all-formats = ["pdf", "docx", "xlsx", "pptx", "html", "epub"] # wasm-pack reads this block to decide how to invoke wasm-opt on the -# release bundle. rustc 1.82+ emits bulk-memory and sign-extension ops -# by default, and wasm-pack's default wasm-opt invocation ("-O") leaves -# those features disabled — we enable them here so the optimiser can -# validate the input. Everything else mirrors wasm-pack's -O level 2. +# release bundle. Modern rustc (1.87+) emits several wasm 2.0 features +# in its default output — we enable the same set here so wasm-opt can +# validate and optimise the module. The list mirrors the feature set +# rustc enables by default on wasm32-unknown-unknown. [package.metadata.wasm-pack.profile.release] -wasm-opt = ["-O", "--enable-bulk-memory", "--enable-sign-ext"] +wasm-opt = [ + "-O", + "--enable-bulk-memory", + "--enable-sign-ext", + "--enable-nontrapping-float-to-int", + "--enable-mutable-globals", + "--enable-reference-types", + "--enable-multivalue", +]