diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 342d3a0..8509006 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -7,6 +7,12 @@ on: - "docs-site/**" - "crates/paperjam-wasm/**" - ".github/workflows/docs.yml" + pull_request: + branches: [main] + paths: + - "docs-site/**" + - "crates/paperjam-wasm/**" + - ".github/workflows/docs.yml" workflow_dispatch: permissions: @@ -15,7 +21,7 @@ permissions: id-token: write concurrency: - group: pages + group: pages-${{ github.ref }} cancel-in-progress: false jobs: @@ -35,6 +41,38 @@ jobs: - name: Install wasm-pack run: curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh + # wasm-pack invokes wasm-opt automatically when it is on PATH. + # Ubuntu's apt-packaged binaryen (~v108) is too old to validate + # modern rustc output — rustc 1.87+ emits bulk-memory and + # sign-extension instructions by default, which that wasm-opt + # rejects. We install a pinned upstream release, verify its + # SHA256, and cache it across runs. 
+ - name: Cache binaryen + id: cache-binaryen + uses: actions/cache@v4 + env: + BINARYEN_VERSION: version_119 + with: + path: /usr/local/bin/wasm-opt + key: binaryen-${{ env.BINARYEN_VERSION }}-x86_64-linux + + - name: Install binaryen (wasm-opt) + if: steps.cache-binaryen.outputs.cache-hit != 'true' + env: + BINARYEN_VERSION: version_119 + BINARYEN_SHA256: 716bcf9f5f36a6f466239fbb09a925eeaf54c46411ccefac979ec649e7c06d2d + run: | + set -euo pipefail + tarball="binaryen-${BINARYEN_VERSION}-x86_64-linux.tar.gz" + url="https://github.com/WebAssembly/binaryen/releases/download/${BINARYEN_VERSION}/${tarball}" + curl -fsSL "$url" -o "/tmp/${tarball}" + echo "${BINARYEN_SHA256} /tmp/${tarball}" | sha256sum --check --strict + tar -xzf "/tmp/${tarball}" -C /tmp + sudo install -m 0755 "/tmp/binaryen-${BINARYEN_VERSION}/bin/wasm-opt" /usr/local/bin/wasm-opt + + - name: Verify wasm-opt + run: wasm-opt --version + - name: Build WASM run: wasm-pack build crates/paperjam-wasm --target web --release --out-dir ../../docs-site/static/wasm @@ -52,12 +90,14 @@ jobs: run: cd docs-site && npm run build - name: Upload Pages artifact + if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' uses: actions/upload-pages-artifact@v5 with: path: docs-site/build deploy: needs: build + if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' runs-on: ubuntu-latest environment: name: github-pages diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ec1502..0ce3492 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,64 @@ and this project adheres to [Semantic Versioning](https://semver.org/). ## [Unreleased] +### Security + +- Bound ZIP entry reads in EPUB, PPTX, and DOCX parsers. A crafted archive + declaring a tiny compressed size could previously expand to multi-GB on + decompression; entries are now rejected when the declared or observed + decompressed size exceeds a per-entry cap. 
+- Cap `Vec::with_capacity` preallocations in XLSX sheet parsing and PPTX + slide parsing at reasonable ceilings so attacker-controlled counts can + no longer trigger large allocations up front. +- `paperjam-mcp`: resolved paths are now sandboxed to the configured + working directory by default. Absolute paths and `..` traversal that + escape the working dir are rejected with a structured error. Operators + can opt out with `--allow-absolute-paths` (or + `ServerConfig::allow_absolute_paths`). + +### Fixed + +- Replace panic-prone `f64::partial_cmp(..).unwrap()` in table detection + (`table/{grid,lattice,stream}.rs`) with `total_cmp`, so malformed PDFs + producing NaN coordinates no longer crash the parser. +- Replace `get_object_mut().unwrap()` / `as_dict_mut().unwrap()` / + `from_utf8().unwrap()` across the stamp, watermark, bookmarks, and + PDF/UA validation modules with structured `PdfError` returns. Malformed + PDFs now surface typed errors instead of panicking the process. +- Stub drift: add `modify_form_field`, `add_form_field`, and the + `fill_form.generate_appearances` parameter to `_paperjam.pyi` so mypy + sees the full PyO3 surface. + +### Added + +- Crate-level `//!` rustdoc summaries on every workspace crate. +- `rust-toolchain.toml` pins the contributor toolchain to stable with + `rustfmt`, `clippy`, and the `wasm32-unknown-unknown` target. +- `justfile` with shortcuts for common build / test / lint tasks. +- `[profile.release]` with thin LTO, `codegen-units = 1`, and symbol + strip. Adds a `release-with-debug` profile for profiling. + +### Changed + +- `paperjam-async` no longer force-enables `signatures` and `validation` + on `paperjam-core`. Consumers that need them (e.g. `paperjam-py`) + continue to enable them explicitly; lightweight async users no longer + drag in the full signing / validation stack. +- Docs site CI now builds on pull requests (without deploying) so docs + regressions are caught pre-merge. 
Binaryen's `wasm-opt` is installed + so release WASM bundles are size-optimized. + +### Docs + +- README: CLI examples now use the correct `pj` binary name and accurate + flags; removed the nonexistent `extract tables --format csv` flag. +- `docs-site/docs/getting-started/installation.md`: replace leftover + Sphinx build instructions with the Docusaurus workflow, fix the + clone org, expand the feature-flag table. +- `pyproject.toml`: fill in multi-format description, `readme`, + `project.urls`, and extra classifiers/keywords so the PyPI page is + populated. Drop the stale Sphinx `[docs]` extra. + ## [0.2.0] — 2026-04-04 ### Added diff --git a/crates/paperjam-async/Cargo.toml b/crates/paperjam-async/Cargo.toml index bd05547..1deaf83 100644 --- a/crates/paperjam-async/Cargo.toml +++ b/crates/paperjam-async/Cargo.toml @@ -6,7 +6,12 @@ license.workspace = true description = "Async wrappers for paperjam operations via tokio::spawn_blocking" [dependencies] -paperjam-core = { path = "../paperjam-core", features = ["render", "signatures", "validation"] } +# paperjam-async currently only uses `paperjam_core::render`, so we enable +# that feature here but leave `signatures`/`validation` to be opted in by +# the downstream crate that actually needs them (paperjam-py does this +# explicitly). Keeps the async surface lightweight for callers that only +# want basic document operations. +paperjam-core = { path = "../paperjam-core", features = ["render"] } paperjam-model = { path = "../paperjam-model" } paperjam-convert = { path = "../paperjam-convert", optional = true } tokio = { workspace = true } diff --git a/crates/paperjam-async/src/lib.rs b/crates/paperjam-async/src/lib.rs index 9688833..30deb05 100644 --- a/crates/paperjam-async/src/lib.rs +++ b/crates/paperjam-async/src/lib.rs @@ -1,3 +1,11 @@ +//! Tokio-native async wrappers around paperjam's blocking operations. +//! +//! Each heavy operation (`open`, `save`, `render`, `to_markdown`, +//! `merge`, `redact_text`, ...) 
is re-exposed as an `async fn` that runs +//! the blocking work on `tokio::spawn_blocking`. This is what powers the +//! `paperjam.aopen` / `paperjam.arender_*` / `paperjam.amerge` helpers on +//! the Python side. + pub mod document; pub mod generic; pub mod page; diff --git a/crates/paperjam-convert/src/lib.rs b/crates/paperjam-convert/src/lib.rs index f8e6395..f97efea 100644 --- a/crates/paperjam-convert/src/lib.rs +++ b/crates/paperjam-convert/src/lib.rs @@ -1,3 +1,11 @@ +//! Cross-format document conversion. +//! +//! Orchestrates conversion between every pair of formats supported by the +//! paperjam workspace (PDF, DOCX, XLSX, PPTX, HTML, EPUB, Markdown). Each +//! format crate is an optional dependency so consumers only pay for the +//! formats they want; features named after the source and target crates +//! gate those conversions in and out. + pub mod convert; pub mod detect; pub mod error; diff --git a/crates/paperjam-core/src/lib.rs b/crates/paperjam-core/src/lib.rs index 9836e0d..8e1d503 100644 --- a/crates/paperjam-core/src/lib.rs +++ b/crates/paperjam-core/src/lib.rs @@ -1,3 +1,23 @@ +//! Pure-Rust PDF engine: parsing, text and table extraction, page +//! manipulation, form fields, digital signatures, encryption, rendering, +//! and PDF/A / PDF/UA validation. +//! +//! `paperjam-core` is the PDF-specific implementation behind the +//! `paperjam` library. Non-PDF formats live in sibling crates +//! (`paperjam-docx`, `paperjam-xlsx`, `paperjam-pptx`, `paperjam-html`, +//! `paperjam-epub`); cross-format operations go through `paperjam-convert`. +//! +//! Heavy optional pieces are feature-gated: +//! +//! | Feature | Enables | +//! |--------------|----------------------------------------------------------| +//! | `render` | page-to-image rasterisation via pdfium | +//! | `signatures` | sign / verify / inspect digital signatures | +//! | `ltv` | long-term validation (TSA, OCSP, CRL embedding) | +//! 
| `validation` | PDF/A and PDF/UA conformance checks | +//! | `parallel` | rayon-based parallel processing (default on) | +//! | `mmap` | memory-mapped file access for large documents | + pub mod annotations; pub mod bookmarks; #[cfg(feature = "validation")] diff --git a/crates/paperjam-core/src/signature/tsa.rs b/crates/paperjam-core/src/signature/tsa.rs index 7c89031..7ccccfb 100644 --- a/crates/paperjam-core/src/signature/tsa.rs +++ b/crates/paperjam-core/src/signature/tsa.rs @@ -54,7 +54,7 @@ pub fn build_timestamp_request(signature_value: &[u8]) -> Result> { /// Parse an RFC 3161 timestamp response and extract the TimeStampToken. /// -/// The response is: SEQUENCE { status PKIStatusInfo, timeStampToken [OPTIONAL] } +/// The response is: `SEQUENCE { status PKIStatusInfo, timeStampToken [OPTIONAL] }` /// We check the status integer and return the token bytes. pub fn parse_timestamp_response(resp_bytes: &[u8]) -> Result> { // Minimal DER parsing: skip the outer SEQUENCE, read PKIStatusInfo, extract token diff --git a/crates/paperjam-docx/src/lib.rs b/crates/paperjam-docx/src/lib.rs index 72bfb0e..fd0a358 100644 --- a/crates/paperjam-docx/src/lib.rs +++ b/crates/paperjam-docx/src/lib.rs @@ -1,3 +1,11 @@ +//! DOCX (Office Open XML word-processing) support for the paperjam +//! ecosystem. +//! +//! Reads and writes `.docx` files and exposes text, tables, and metadata +//! through the `DocumentTrait` implementation on `DocxDocument`. Body +//! parsing is delegated to `docx-rs`; an internal size-capped ZIP reader +//! handles the metadata parts the upstream API does not expose. + mod document; mod error; mod image; diff --git a/crates/paperjam-epub/src/lib.rs b/crates/paperjam-epub/src/lib.rs index d85af44..e408216 100644 --- a/crates/paperjam-epub/src/lib.rs +++ b/crates/paperjam-epub/src/lib.rs @@ -1,3 +1,12 @@ +//! EPUB document support for the paperjam ecosystem. +//! +//! Parses EPUB archives (container.xml → OPF → spine) and exposes each +//! 
chapter as an `HtmlDocument`, delegating per-chapter rendering to +//! `paperjam-html`. Implements `DocumentTrait` so EPUB files participate +//! in the shared model (chapter → page). +//! +//! Archive reads are size-capped internally to mitigate zip-bomb attacks. + mod document; mod error; mod image; diff --git a/crates/paperjam-html/src/lib.rs b/crates/paperjam-html/src/lib.rs index 723b07d..922b117 100644 --- a/crates/paperjam-html/src/lib.rs +++ b/crates/paperjam-html/src/lib.rs @@ -1,3 +1,10 @@ +//! HTML document support for the paperjam ecosystem. +//! +//! Parses HTML bytes via `scraper`, extracts text and tables, and +//! implements `DocumentTrait` so HTML documents share the same API +//! surface as the office formats. Also used by `paperjam-epub` for +//! chapter content (EPUB spine entries are XHTML). + mod document; mod error; pub mod image; diff --git a/crates/paperjam-mcp/src/lib.rs b/crates/paperjam-mcp/src/lib.rs index c65a1c7..92895bb 100644 --- a/crates/paperjam-mcp/src/lib.rs +++ b/crates/paperjam-mcp/src/lib.rs @@ -1,3 +1,13 @@ +//! Model Context Protocol server for paperjam. +//! +//! Exposes document operations (open, extract, convert, manipulate, +//! render, sign, validate, run pipelines, …) as MCP tools that a local +//! assistant (Claude Code, Cursor, Claude Desktop) can invoke. All +//! path resolution goes through a sandbox rooted at the server's +//! configured working directory; absolute paths escaping the sandbox +//! are rejected unless the operator opts out with +//! `--allow-absolute-paths`. + pub mod error; pub mod session; diff --git a/crates/paperjam-model/src/annotations.rs b/crates/paperjam-model/src/annotations.rs index 159eee2..acd5764 100644 --- a/crates/paperjam-model/src/annotations.rs +++ b/crates/paperjam-model/src/annotations.rs @@ -1,7 +1,7 @@ /// Where a link annotation points to. #[derive(Debug, Clone)] pub enum LinkDestination { - /// External URI (e.g. "https://example.com"). + /// External URI (e.g. 
`https://example.com`). Uri(String), /// Go to a specific page within the document. GoTo { page: u32 }, diff --git a/crates/paperjam-model/src/lib.rs b/crates/paperjam-model/src/lib.rs index 8613b0a..29039a8 100644 --- a/crates/paperjam-model/src/lib.rs +++ b/crates/paperjam-model/src/lib.rs @@ -1,3 +1,13 @@ +//! Format-agnostic types and traits shared across the paperjam workspace. +//! +//! Holds the stable data model — bookmarks, metadata, tables, text layout, +//! annotations, structure blocks — plus the `DocumentTrait` that every +//! format crate (`paperjam-docx`, `paperjam-xlsx`, ...) implements. +//! +//! This crate intentionally has no format-specific dependencies, so +//! downstream crates can depend on it without pulling in parsers they +//! do not use. + pub mod document; pub mod format; diff --git a/crates/paperjam-pipeline/src/lib.rs b/crates/paperjam-pipeline/src/lib.rs index 90071a4..6c9bac3 100644 --- a/crates/paperjam-pipeline/src/lib.rs +++ b/crates/paperjam-pipeline/src/lib.rs @@ -1,3 +1,13 @@ +//! Declarative multi-step document workflows defined in YAML or JSON. +//! +//! A pipeline is a sequence of steps — open, extract, convert, redact, +//! merge, save — applied to one or more input files. The engine runs +//! steps serially or in parallel and returns a per-file summary +//! (success / failure / skipped). +//! +//! Used by the `pj pipeline` CLI subcommand and the `run_pipeline` MCP +//! tool. + pub mod builder; pub mod context; pub mod definition; diff --git a/crates/paperjam-pptx/src/lib.rs b/crates/paperjam-pptx/src/lib.rs index 7fe2f6b..07e8e7a 100644 --- a/crates/paperjam-pptx/src/lib.rs +++ b/crates/paperjam-pptx/src/lib.rs @@ -1,3 +1,9 @@ +//! PPTX (Office Open XML presentation) support for the paperjam ecosystem. +//! +//! Parses `.pptx` archives slide-by-slide, extracts text blocks and +//! tables from slide XML, and implements `DocumentTrait` so presentations +//! participate in the shared model (slide → page). 
+ pub mod document; pub mod error; pub mod markdown; diff --git a/crates/paperjam-py/src/document.rs b/crates/paperjam-py/src/document.rs index 493f500..c03ff6a 100644 --- a/crates/paperjam-py/src/document.rs +++ b/crates/paperjam-py/src/document.rs @@ -6,7 +6,13 @@ use std::sync::Arc; use crate::errors::to_py_err; use crate::page::PyPage; -/// Internal Rust document, exposed to Python as _paperjam.RustDocument. +/// Native PDF document handle, exposed to Python as `_paperjam.RustDocument`. +/// +/// Users normally access this via the higher-level `paperjam.Document` +/// wrapper in `py_src/paperjam/_document.py`, which adds ergonomic +/// defaults and composed operations on top of the raw bindings. +/// Holds an `Arc` so the same underlying PDF can be shared +/// across threads and Python tasks without copying. #[pyclass(name = "RustDocument")] pub struct PyDocument { pub(crate) inner: Arc, diff --git a/crates/paperjam-py/src/lib.rs b/crates/paperjam-py/src/lib.rs index 6fa5475..047dc5e 100644 --- a/crates/paperjam-py/src/lib.rs +++ b/crates/paperjam-py/src/lib.rs @@ -1,3 +1,11 @@ +//! PyO3 bindings that expose paperjam's Rust engine to Python as the +//! `_paperjam` native extension module. +//! +//! The Python package (`py_src/paperjam/`) wraps these raw bindings with +//! a more idiomatic API. Every PyO3-exposed symbol registered here must +//! also appear in `py_src/paperjam/_paperjam.pyi` so static type checkers +//! can see the extension's surface. + use pyo3::prelude::*; #[cfg(feature = "formats")] diff --git a/crates/paperjam-py/src/page.rs b/crates/paperjam-py/src/page.rs index aebe5a2..dbac9c4 100644 --- a/crates/paperjam-py/src/page.rs +++ b/crates/paperjam-py/src/page.rs @@ -5,6 +5,12 @@ use std::sync::Arc; use crate::errors::to_py_err; +/// Native PDF page handle, exposed to Python as `_paperjam.RustPage`. +/// +/// Obtained via `RustDocument.page(n)`. 
Users normally access this +/// through the higher-level `paperjam.Page` wrapper in +/// `py_src/paperjam/_page.py`, which adds ergonomic defaults and +/// composed operations on top of the raw bindings. #[pyclass(name = "RustPage")] pub struct PyPage { pub(crate) inner: Arc, diff --git a/crates/paperjam-wasm/Cargo.toml b/crates/paperjam-wasm/Cargo.toml index be89264..d9b643e 100644 --- a/crates/paperjam-wasm/Cargo.toml +++ b/crates/paperjam-wasm/Cargo.toml @@ -34,3 +34,19 @@ pptx = ["dep:paperjam-pptx", "paperjam-convert/pptx"] html = ["dep:paperjam-html", "paperjam-convert/html"] epub = ["dep:paperjam-epub", "paperjam-convert/epub"] all-formats = ["pdf", "docx", "xlsx", "pptx", "html", "epub"] + +# wasm-pack reads this block to decide how to invoke wasm-opt on the +# release bundle. Modern rustc (1.82+) emits several wasm 2.0 features +# in its default output — we enable the same set here so wasm-opt can +# validate and optimise the module. The list mirrors the feature set +# rustc enables by default on wasm32-unknown-unknown. +[package.metadata.wasm-pack.profile.release] +wasm-opt = [ + "-O", + "--enable-bulk-memory", + "--enable-sign-ext", + "--enable-nontrapping-float-to-int", + "--enable-mutable-globals", + "--enable-reference-types", + "--enable-multivalue", +] diff --git a/crates/paperjam-wasm/src/lib.rs b/crates/paperjam-wasm/src/lib.rs index 43c9d27..936b9f1 100644 --- a/crates/paperjam-wasm/src/lib.rs +++ b/crates/paperjam-wasm/src/lib.rs @@ -1,3 +1,11 @@ +//! WebAssembly bindings for paperjam, exposed via `wasm-bindgen`. +//! +//! Builds with `wasm-pack build --target web`. The generated JS + WASM +//! pair powers the interactive playground on the docs site. +//! Functionality is a subset of the native engine — rendering and +//! signatures are omitted on wasm, and compression is pure-Rust to avoid +//! `libz-sys` on `wasm32-unknown-unknown`. 
+ use std::sync::Arc; use paperjam_core::document::Document; diff --git a/crates/paperjam-xlsx/src/lib.rs b/crates/paperjam-xlsx/src/lib.rs index 21d192a..8e925f2 100644 --- a/crates/paperjam-xlsx/src/lib.rs +++ b/crates/paperjam-xlsx/src/lib.rs @@ -1,3 +1,10 @@ +//! XLSX (Office Open XML spreadsheet) support for the paperjam ecosystem. +//! +//! Reads `.xlsx` workbooks via `calamine` and writes them via +//! `rust_xlsxwriter`. Each sheet's rows are exposed as stringified cells, +//! and the crate implements `DocumentTrait` so workbooks participate in +//! the shared model (sheet → page, cell → text). + pub mod document; pub mod error; pub mod markdown; diff --git a/justfile b/justfile new file mode 100644 index 0000000..78158be --- /dev/null +++ b/justfile @@ -0,0 +1,83 @@ +# paperjam — common developer commands +# +# Run `just` (no args) for the list. Requires `just` (https://just.systems). + +default: + @just --list + +# --- Build --------------------------------------------------------------- + +# Build + install the Python extension into the current venv (release). +build: + uv run maturin develop --release + +# Build + install the Python extension in debug mode (fast rebuild). +build-dev: + uv run maturin develop + +# Build the whole Rust workspace. +build-rust: + cargo build --workspace + +# Compile-check the wasm target (no linking). +build-wasm: + cargo check -p paperjam-wasm --target wasm32-unknown-unknown + +# Build the Docusaurus docs site. +build-docs: + cd docs-site && npm ci && npm run build + +# Render crate API docs into target/doc. +rustdoc: + cargo doc --workspace --no-deps + +# --- Test ---------------------------------------------------------------- + +# Run the full Rust test suite. +test-rust: + cargo test --workspace + +# Run the Python test suite (requires `just build` first). +test-py: + uv run pytest tests/python/ -v + +# Run both Rust and Python test suites. 
+test: test-rust test-py + +# Run the table-accuracy harness (slower, requires fixtures). +test-accuracy: + uv run pytest tests/python/ -m accuracy -v + +# --- Lint / format ------------------------------------------------------- + +# Apply all autoformatters and run every pre-commit hook. +check: + pre-commit run --all-files + +# Rust-only checks (fmt + clippy). +check-rust: + cargo fmt --all --check + cargo clippy --workspace --all-targets -- -D warnings + +# Python-only checks (ruff + mypy). +check-py: + uv run ruff check py_src/ tests/ + uv run ruff format --check py_src/ tests/ + uv run mypy py_src/ tests/ examples/ --ignore-missing-imports + +# Apply autoformatters (doesn't fail on remaining lint issues). +fmt: + cargo fmt --all + uv run ruff format py_src/ tests/ + uv run ruff check --fix --exit-zero py_src/ tests/ + +# --- Clean --------------------------------------------------------------- + +# Remove Rust build artifacts. +clean: + cargo clean + +# Remove Rust build artifacts and Python caches. +clean-all: clean + rm -rf .pytest_cache .mypy_cache .ruff_cache + find . -type d -name __pycache__ -exec rm -rf {} + diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 0000000..c38c8a8 --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,4 @@ +[toolchain] +channel = "stable" +components = ["rustfmt", "clippy"] +targets = ["wasm32-unknown-unknown"]