diff --git a/crates/paperjam-epub/Cargo.toml b/crates/paperjam-epub/Cargo.toml
index 9c69809..a3738f4 100644
--- a/crates/paperjam-epub/Cargo.toml
+++ b/crates/paperjam-epub/Cargo.toml
@@ -7,7 +7,7 @@ license.workspace = true
 description = "EPUB e-book processing for the paperjam ecosystem"
 
 [dependencies]
-paperjam-model = { path = "../paperjam-model" }
+paperjam-model = { path = "../paperjam-model", features = ["zip_safety"] }
 paperjam-html = { path = "../paperjam-html" }
 zip = { version = "2", default-features = false, features = ["deflate"] }
 quick-xml = "0.37"
diff --git a/crates/paperjam-epub/src/error.rs b/crates/paperjam-epub/src/error.rs
index 6cbb9c5..d26bd67 100644
--- a/crates/paperjam-epub/src/error.rs
+++ b/crates/paperjam-epub/src/error.rs
@@ -1,25 +1,30 @@
 #[derive(Debug, thiserror::Error)]
 pub enum EpubError {
+    /// The outer archive header is invalid or could not be opened.
     #[error("invalid ZIP archive: {0}")]
     Zip(#[from] zip::result::ZipError),
 
+    /// Non-archive I/O failure (e.g. cloning raw bytes).
     #[error("I/O error: {0}")]
     Io(#[from] std::io::Error),
 
+    /// XML parse failure.
     #[error("XML parse error: {0}")]
     Xml(String),
 
+    /// Chapter content failed to parse as HTML.
     #[error("HTML parse error: {0}")]
     Html(#[from] paperjam_html::HtmlError),
 
-    #[error("missing entry in EPUB: {0}")]
-    MissingEntry(String),
-
+    /// The EPUB layout (container.xml, OPF, spine) was malformed.
     #[error("invalid EPUB structure: {0}")]
     InvalidStructure(String),
 
-    #[error("EPUB entry `{name}` is too large ({size} bytes, limit {limit})")]
-    EntryTooLarge { name: String, size: u64, limit: u64 },
+    /// A bounded archive read hit one of the configured safety limits
+    /// (per-entry size, total-byte budget, entry count, or compression
+    /// ratio), or could not locate a named entry.
+    #[error(transparent)]
+    Archive(#[from] paperjam_model::zip_safety::ZipSafetyError),
 }
 
 impl From<quick_xml::Error> for EpubError {
diff --git a/crates/paperjam-epub/src/lib.rs b/crates/paperjam-epub/src/lib.rs
index e408216..90184e7 100644
--- a/crates/paperjam-epub/src/lib.rs
+++ b/crates/paperjam-epub/src/lib.rs
@@ -13,7 +13,6 @@ mod image;
 mod markdown;
 mod metadata;
 pub mod parser;
-mod safe_read;
 mod structure;
 mod table;
 mod text;
diff --git a/crates/paperjam-epub/src/parser.rs b/crates/paperjam-epub/src/parser.rs
index 932e08b..1f035b6 100644
--- a/crates/paperjam-epub/src/parser.rs
+++ b/crates/paperjam-epub/src/parser.rs
@@ -5,16 +5,17 @@ use quick_xml::Reader;
 
 use crate::document::{ChapterData, EpubDocument, OpfMetadata, TocEntry};
 use crate::error::{EpubError, Result};
-use crate::safe_read::{read_entry_bytes, read_entry_string};
 use crate::toc;
+use paperjam_model::zip_safety::{ArchiveLimits, SafeArchive};
 
 /// Parse an EPUB document from raw bytes.
 pub fn parse_epub(bytes: &[u8]) -> Result<EpubDocument> {
     let cursor = std::io::Cursor::new(bytes);
     let mut archive = zip::ZipArchive::new(cursor)?;
+    let mut safe = SafeArchive::new(&mut archive, ArchiveLimits::DEFAULT);
 
     // 1. Find the OPF path from container.xml.
-    let container_xml = read_entry_string(&mut archive, "META-INF/container.xml")?;
+    let container_xml = safe.read_entry_string("META-INF/container.xml")?;
     let opf_path = parse_container_xml(&container_xml)?;
     let opf_base_dir = opf_path
         .rsplit_once('/')
@@ -22,18 +23,20 @@ pub fn parse_epub(bytes: &[u8]) -> Result<EpubDocument> {
         .unwrap_or_default();
 
     // 2. Parse OPF: metadata, manifest, spine.
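+    // Like the container.xml read above, this read is budgeted: `safe`
+    // accumulates byte and entry counts across the whole scan.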
-    let opf_xml = read_entry_string(&mut archive, &opf_path)?;
+    let opf_xml = safe.read_entry_string(&opf_path)?;
     let (opf_metadata, manifest, spine) = parse_opf(&opf_xml)?;
 
     // 3. Parse TOC.
-    let toc_entries = parse_toc_from_manifest(&mut archive, &manifest, &opf_base_dir);
+    let toc_entries = parse_toc_from_manifest(&mut safe, &manifest, &opf_base_dir);
 
     // 4. Read chapters in spine order.
     let mut chapters = Vec::new();
     for (idx, spine_idref) in spine.iter().enumerate() {
         if let Some(href) = manifest.get(spine_idref) {
             let full_path = resolve_path(&opf_base_dir, href);
-            match read_entry_bytes(&mut archive, &full_path) {
+            match safe.read_entry_bytes(&full_path) {
                 Ok(html_bytes) => {
                     let html_doc = paperjam_html::HtmlDocument::from_bytes(&html_bytes)?;
                     let title = find_toc_title(&toc_entries, href);
@@ -52,7 +53,7 @@ pub fn parse_epub(bytes: &[u8]) -> Result<EpubDocument> {
     }
 
     // 5. Collect archive images.
-    let archive_images = collect_images(&mut archive, &manifest, &opf_base_dir);
+    let archive_images = collect_images(&mut safe, &manifest, &opf_base_dir);
 
     Ok(EpubDocument {
         chapters,
@@ -252,7 +253,7 @@ fn find_toc_title(entries: &[TocEntry], href: &str) -> Option<String> {
 
 /// Parse TOC from the manifest, trying NCX first then nav.xhtml.
 fn parse_toc_from_manifest(
-    archive: &mut zip::ZipArchive<std::io::Cursor<&[u8]>>,
+    safe: &mut SafeArchive<'_, std::io::Cursor<&[u8]>>,
     manifest: &HashMap<String, String>,
     opf_base_dir: &str,
 ) -> Vec<TocEntry> {
@@ -260,7 +261,7 @@ fn parse_toc_from_manifest(
     for (id, href) in manifest {
         if id == "ncx" || href.ends_with(".ncx") {
             let full_path = resolve_path(opf_base_dir, href);
-            if let Ok(xml) = read_entry_string(archive, &full_path) {
+            if let Ok(xml) = safe.read_entry_string(&full_path) {
                 let entries = toc::parse_ncx(&xml);
                 if !entries.is_empty() {
                     return entries;
@@ -273,7 +274,7 @@ fn parse_toc_from_manifest(
     for href in manifest.values() {
         if href.contains("nav") && (href.ends_with(".xhtml") || href.ends_with(".html")) {
             let full_path = resolve_path(opf_base_dir, href);
-            if let Ok(html_bytes) = read_entry_bytes(archive, &full_path) {
+            if let Ok(html_bytes) = safe.read_entry_bytes(&full_path) {
                 let entries = toc::parse_nav_xhtml(&html_bytes);
                 if !entries.is_empty() {
                     return entries;
@@ -287,7 +288,7 @@ fn parse_toc_from_manifest(
 
 /// Collect image files from the archive based on manifest media types.
 fn collect_images(
-    archive: &mut zip::ZipArchive<std::io::Cursor<&[u8]>>,
+    safe: &mut SafeArchive<'_, std::io::Cursor<&[u8]>>,
     manifest: &HashMap<String, String>,
     opf_base_dir: &str,
 ) -> Vec<(String, Vec<u8>)> {
@@ -298,7 +299,7 @@ fn collect_images(
         let lower = href.to_ascii_lowercase();
         if image_extensions.iter().any(|ext| lower.ends_with(ext)) {
             let full_path = resolve_path(opf_base_dir, href);
-            if let Ok(data) = read_entry_bytes(archive, &full_path) {
+            if let Ok(data) = safe.read_entry_bytes(&full_path) {
                 images.push((href.clone(), data));
             }
         }
diff --git a/crates/paperjam-epub/src/safe_read.rs b/crates/paperjam-epub/src/safe_read.rs
deleted file mode 100644
index 167800d..0000000
--- a/crates/paperjam-epub/src/safe_read.rs
+++ /dev/null
@@ -1,115 +0,0 @@
-//! Bounded readers for ZIP entries to mitigate decompression-bomb attacks.
-//!
-//! EPUB files are ZIP archives; malicious inputs can declare tiny compressed
-//! sizes that expand to gigabytes. We cap the decompressed length on every
-//! entry we pull out of the archive.
-
-use std::io::Read;
-
-use crate::error::{EpubError, Result};
-
-/// Per-entry decompressed byte limit. Normal EPUB entries (XHTML chapters,
-/// cover images, fonts) are comfortably under this. A document that exceeds
-/// it is either pathological or malicious.
-pub const MAX_ENTRY_BYTES: u64 = 100 * 1024 * 1024;
-
-pub fn read_entry_string(
-    archive: &mut zip::ZipArchive<std::io::Cursor<&[u8]>>,
-    name: &str,
-) -> Result<String> {
-    let mut entry = archive
-        .by_name(name)
-        .map_err(|_| EpubError::MissingEntry(name.to_string()))?;
-
-    let declared = entry.size();
-    if declared > MAX_ENTRY_BYTES {
-        return Err(EpubError::EntryTooLarge {
-            name: name.to_string(),
-            size: declared,
-            limit: MAX_ENTRY_BYTES,
-        });
-    }
-
-    let mut buf = String::new();
-    let read = (&mut entry)
-        .take(MAX_ENTRY_BYTES + 1)
-        .read_to_string(&mut buf)?;
-    if read as u64 > MAX_ENTRY_BYTES {
-        return Err(EpubError::EntryTooLarge {
-            name: name.to_string(),
-            size: read as u64,
-            limit: MAX_ENTRY_BYTES,
-        });
-    }
-    Ok(buf)
-}
-
-pub fn read_entry_bytes(
-    archive: &mut zip::ZipArchive<std::io::Cursor<&[u8]>>,
-    name: &str,
-) -> Result<Vec<u8>> {
-    let mut entry = archive
-        .by_name(name)
-        .map_err(|_| EpubError::MissingEntry(name.to_string()))?;
-
-    let declared = entry.size();
-    if declared > MAX_ENTRY_BYTES {
-        return Err(EpubError::EntryTooLarge {
-            name: name.to_string(),
-            size: declared,
-            limit: MAX_ENTRY_BYTES,
-        });
-    }
-
-    let mut buf = Vec::new();
-    let read = (&mut entry)
-        .take(MAX_ENTRY_BYTES + 1)
-        .read_to_end(&mut buf)?;
-    if read as u64 > MAX_ENTRY_BYTES {
-        return Err(EpubError::EntryTooLarge {
-            name: name.to_string(),
-            size: read as u64,
-            limit: MAX_ENTRY_BYTES,
-        });
-    }
-    Ok(buf)
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    use std::io::{Cursor, Write};
-
-    fn build_archive_with_entry(name: &str, contents: &[u8]) -> Vec<u8> {
-        let mut buf = Vec::new();
-        {
-            let mut w = zip::ZipWriter::new(Cursor::new(&mut buf));
-            w.start_file::<_, ()>(name, zip::write::SimpleFileOptions::default())
-                .unwrap();
-            w.write_all(contents).unwrap();
-            w.finish().unwrap();
-        }
-        buf
-    }
-
-    #[test]
-    fn small_entry_reads_normally() {
-        let bytes = build_archive_with_entry("hello.txt", b"hello world");
-        let bytes_slice: &[u8] = &bytes;
-        let mut archive = zip::ZipArchive::new(Cursor::new(bytes_slice)).unwrap();
-        let s = read_entry_string(&mut archive, "hello.txt").unwrap();
-        assert_eq!(s, "hello world");
-    }
-
-    #[test]
-    fn oversized_entry_is_rejected_by_declared_size() {
-        // Build an archive whose single entry exceeds the per-entry cap.
-        let blob = vec![b'a'; (MAX_ENTRY_BYTES as usize) + 1];
-        let bytes = build_archive_with_entry("big.bin", &blob);
-        let bytes_slice: &[u8] = &bytes;
-        let mut archive = zip::ZipArchive::new(Cursor::new(bytes_slice)).unwrap();
-        let err = read_entry_bytes(&mut archive, "big.bin").unwrap_err();
-        assert!(matches!(err, EpubError::EntryTooLarge { .. }));
-    }
-}
diff --git a/crates/paperjam-model/Cargo.toml b/crates/paperjam-model/Cargo.toml
index e40c9c3..e9a9aa1 100644
--- a/crates/paperjam-model/Cargo.toml
+++ b/crates/paperjam-model/Cargo.toml
@@ -7,3 +7,19 @@ license.workspace = true
 description = "Pure data types and traits for the paperjam document processing ecosystem"
 
 [dependencies]
+thiserror = { workspace = true, optional = true }
+zip = { version = "2", default-features = false, features = ["deflate"], optional = true }
+
+[features]
+# Shared hardened ZIP reader used by the OOXML/EPUB format crates.
+# Off by default so the PDF engine and other non-zip consumers stay
+# dependency-free.
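+#
+# Downstream crates opt in from their own manifests, as the EPUB and
+# PPTX crates in this change do:
+#   paperjam-model = { path = "../paperjam-model", features = ["zip_safety"] }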
+zip_safety = ["dep:zip", "dep:thiserror"]
+
+[dev-dependencies]
+thiserror = { workspace = true }
+zip = { version = "2", default-features = false, features = ["deflate"] }
diff --git a/crates/paperjam-model/src/lib.rs b/crates/paperjam-model/src/lib.rs
index 29039a8..e45af85 100644
--- a/crates/paperjam-model/src/lib.rs
+++ b/crates/paperjam-model/src/lib.rs
@@ -11,6 +11,9 @@ pub mod document;
 pub mod format;
 
+#[cfg(feature = "zip_safety")]
+pub mod zip_safety;
+
 pub mod annotations;
 pub mod bookmarks;
 pub mod conversion;
diff --git a/crates/paperjam-model/src/zip_safety.rs b/crates/paperjam-model/src/zip_safety.rs
new file mode 100644
index 0000000..1c7bddd
--- /dev/null
+++ b/crates/paperjam-model/src/zip_safety.rs
@@ -0,0 +1,382 @@
+//! Shared hardened ZIP reader for the OOXML- and EPUB-family format
+//! crates.
+//!
+//! ZIP-based document formats (DOCX, PPTX, XLSX, EPUB) are easy
+//! decompression-bomb vectors: a malicious archive can declare a tiny
+//! compressed size that expands to gigabytes, pack millions of trivially
+//! empty entries, or pair a small compressed entry with a huge
+//! decompressed size (compression-ratio attack).
+//!
+//! [`SafeArchive`] wraps a [`zip::ZipArchive`] and enforces four
+//! independent caps across the lifetime of a single archive scan:
+//!
+//! 1. **Per-entry decompressed size** — no single entry can exceed
+//!    [`ArchiveLimits::max_entry_bytes`].
+//! 2. **Total decompressed bytes** — the running sum of bytes read
+//!    across all entries cannot exceed [`ArchiveLimits::max_total_bytes`].
+//! 3. **Entry count** — at most [`ArchiveLimits::max_entries`] entries
+//!    will be decompressed.
+//! 4. **Compression ratio** — declared decompressed size divided by
+//!    compressed size cannot exceed [`ArchiveLimits::max_ratio`].
+//!
+//! Each check rejects with a structured [`ZipSafetyError`] so malicious
+//! archives produce typed errors instead of an OOM.
+//!
+//! This module is compiled only when the `zip_safety` feature is
+//! enabled, keeping `paperjam-model`'s default dependency surface empty.
+
+use std::io::{Read, Seek};
+
+/// Limits applied to a single archive scan. Tune individual fields as
+/// needed; [`ArchiveLimits::DEFAULT`] is sized for typical office-family
+/// documents and EPUBs.
+#[derive(Debug, Clone, Copy)]
+pub struct ArchiveLimits {
+    /// Largest decompressed size permitted for any single entry, in
+    /// bytes.
+    pub max_entry_bytes: u64,
+    /// Running total decompressed-byte budget across the whole scan.
+    pub max_total_bytes: u64,
+    /// Maximum number of entries [`SafeArchive`] will decompress.
+    pub max_entries: usize,
+    /// Maximum `entry.size() / entry.compressed_size()` ratio. Entries
+    /// with `compressed_size() == 0` are exempt from this check and are
+    /// governed by `max_entry_bytes` only.
+    pub max_ratio: u64,
+}
+
+impl ArchiveLimits {
+    /// Default limits tuned for ordinary office and EPUB documents.
+    /// Real-world files fit comfortably; anything exceeding these is
+    /// almost certainly pathological or malicious.
+    pub const DEFAULT: Self = Self {
+        max_entry_bytes: 100 * 1024 * 1024, // 100 MB
+        max_total_bytes: 500 * 1024 * 1024, // 500 MB
+        max_entries: 10_000,
+        max_ratio: 100,
+    };
+}
+
+impl Default for ArchiveLimits {
+    fn default() -> Self {
+        Self::DEFAULT
+    }
+}
+
+/// Errors surfaced by [`SafeArchive`].
+///
+/// Format crates typically wrap this with `#[from]` inside their own
+/// error type.
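+///
+/// A minimal sketch of that wrapping, for a hypothetical format crate
+/// (`FormatError` is illustrative, not a real consumer):
+///
+/// ```ignore
+/// #[derive(Debug, thiserror::Error)]
+/// pub enum FormatError {
+///     /// Bounded archive reads forward their limit and lookup
+///     /// failures verbatim.
+///     #[error(transparent)]
+///     Archive(#[from] paperjam_model::zip_safety::ZipSafetyError),
+/// }
+/// ```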
+#[derive(Debug, thiserror::Error)]
+pub enum ZipSafetyError {
+    /// Propagated from the underlying [`zip`] crate.
+    #[error("ZIP error: {0}")]
+    Zip(#[from] zip::result::ZipError),
+
+    /// Propagated from an I/O read on a ZIP entry.
+    #[error("I/O error reading ZIP entry: {0}")]
+    Io(#[from] std::io::Error),
+
+    /// A named entry was not present in the archive.
+    #[error("ZIP entry `{0}` not found")]
+    MissingEntry(String),
+
+    /// An entry was read but its bytes were not valid UTF-8.
+    #[error("ZIP entry `{name}` is not valid UTF-8: {source}")]
+    InvalidUtf8 {
+        name: String,
+        #[source]
+        source: std::string::FromUtf8Error,
+    },
+
+    /// A single entry's decompressed size exceeded the per-entry cap.
+    #[error("ZIP entry `{name}` is too large ({size} bytes, limit {limit})")]
+    EntryTooLarge { name: String, size: u64, limit: u64 },
+
+    /// The aggregate decompressed-byte budget was exhausted.
+    #[error(
+        "ZIP archive exceeded total decompressed-byte budget \
+         (attempted {attempted} bytes, limit {limit})"
+    )]
+    TotalExceeded { attempted: u64, limit: u64 },
+
+    /// The archive contained more entries than the scan will decompress.
+    #[error("ZIP archive has too many entries (limit {limit})")]
+    TooManyEntries { limit: usize },
+
+    /// A single entry's declared compression ratio exceeds the cap.
+    #[error(
+        "ZIP entry `{name}` has compression ratio {ratio}x, \
+         which exceeds the {limit}x safety cap"
+    )]
+    CompressionRatioExceeded {
+        name: String,
+        ratio: u64,
+        limit: u64,
+    },
+}
+
+/// A [`zip::ZipArchive`] wrapper that enforces [`ArchiveLimits`] across
+/// every read.
+///
+/// `SafeArchive` borrows the underlying archive mutably and accumulates
+/// running totals for the lifetime of the scan. A fresh scan needs a
+/// fresh `SafeArchive`; reusing one across archives would leak state
+/// between them.
+pub struct SafeArchive<'a, R: Read + Seek> {
+    archive: &'a mut zip::ZipArchive<R>,
+    limits: ArchiveLimits,
+    bytes_consumed: u64,
+    entries_consumed: usize,
+}
+
+impl<'a, R: Read + Seek> SafeArchive<'a, R> {
+    /// Wrap an archive with the given limits.
+    pub fn new(archive: &'a mut zip::ZipArchive<R>, limits: ArchiveLimits) -> Self {
+        Self {
+            archive,
+            limits,
+            bytes_consumed: 0,
+            entries_consumed: 0,
+        }
+    }
+
+    /// Read a named entry and decode it as UTF-8 text.
+    pub fn read_entry_string(&mut self, name: &str) -> Result<String, ZipSafetyError> {
+        let bytes = self.read_entry_bytes(name)?;
+        String::from_utf8(bytes).map_err(|source| ZipSafetyError::InvalidUtf8 {
+            name: name.to_string(),
+            source,
+        })
+    }
+
+    /// Read a named entry as raw bytes.
+    pub fn read_entry_bytes(&mut self, name: &str) -> Result<Vec<u8>, ZipSafetyError> {
+        self.check_and_bump_entry_count()?;
+
+        let mut entry = self
+            .archive
+            .by_name(name)
+            .map_err(|_| ZipSafetyError::MissingEntry(name.to_string()))?;
+
+        let declared = entry.size();
+        let compressed = entry.compressed_size();
+        check_entry_declared(name, declared, compressed, &self.limits)?;
+
+        let remaining_budget = self
+            .limits
+            .max_total_bytes
+            .saturating_sub(self.bytes_consumed);
+        let per_read_cap = self.limits.max_entry_bytes.min(remaining_budget);
+
+        // The `+ 1` on the `take` below lets us distinguish "exactly at
+        // cap" from "lied about size and actually exceeded"; without the
+        // extra byte an attacker could silently truncate to the cap.
+        let mut buf = Vec::with_capacity(declared.min(per_read_cap) as usize);
+        let read = (&mut entry)
+            .take(per_read_cap + 1)
+            .read_to_end(&mut buf)? as u64;
+        if read > per_read_cap {
+            return Err(if read > self.limits.max_entry_bytes {
+                ZipSafetyError::EntryTooLarge {
+                    name: name.to_string(),
+                    size: read,
+                    limit: self.limits.max_entry_bytes,
+                }
+            } else {
+                ZipSafetyError::TotalExceeded {
+                    attempted: self.bytes_consumed.saturating_add(read),
+                    limit: self.limits.max_total_bytes,
+                }
+            });
+        }
+
+        self.bytes_consumed = self.bytes_consumed.saturating_add(read);
+        Ok(buf)
+    }
+
+    /// Total decompressed bytes read so far.
+    pub fn bytes_consumed(&self) -> u64 {
+        self.bytes_consumed
+    }
+
+    /// Number of entries decompressed so far.
+    pub fn entries_consumed(&self) -> usize {
+        self.entries_consumed
+    }
+
+    fn check_and_bump_entry_count(&mut self) -> Result<(), ZipSafetyError> {
+        if self.entries_consumed >= self.limits.max_entries {
+            return Err(ZipSafetyError::TooManyEntries {
+                limit: self.limits.max_entries,
+            });
+        }
+        self.entries_consumed += 1;
+        Ok(())
+    }
+}
+
+fn check_entry_declared(
+    name: &str,
+    declared: u64,
+    compressed: u64,
+    limits: &ArchiveLimits,
+) -> Result<(), ZipSafetyError> {
+    if declared > limits.max_entry_bytes {
+        return Err(ZipSafetyError::EntryTooLarge {
+            name: name.to_string(),
+            size: declared,
+            limit: limits.max_entry_bytes,
+        });
+    }
+    if let Some(ratio) = declared.checked_div(compressed) {
+        if ratio > limits.max_ratio {
+            return Err(ZipSafetyError::CompressionRatioExceeded {
+                name: name.to_string(),
+                ratio,
+                limit: limits.max_ratio,
+            });
+        }
+    }
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::io::{Cursor, Write};
+
+    fn build_archive(entries: &[(&str, &[u8])]) -> Vec<u8> {
+        let mut buf = Vec::new();
+        {
+            let mut w = zip::ZipWriter::new(Cursor::new(&mut buf));
+            for (name, bytes) in entries {
+                w.start_file::<_, ()>(*name, zip::write::SimpleFileOptions::default())
+                    .unwrap();
+                w.write_all(bytes).unwrap();
+            }
+            w.finish().unwrap();
+        }
+        buf
+    }
+
+    fn open(bytes: &[u8]) -> zip::ZipArchive<Cursor<&[u8]>> {
+        zip::ZipArchive::new(Cursor::new(bytes)).unwrap()
+    }
+
+    #[test]
+    fn reads_small_entry_successfully() {
+        let bytes = build_archive(&[("hello.txt", b"hello world")]);
+        let mut zip = open(&bytes);
+        let mut safe = SafeArchive::new(&mut zip, ArchiveLimits::DEFAULT);
+        assert_eq!(safe.read_entry_string("hello.txt").unwrap(), "hello world");
+        assert_eq!(safe.entries_consumed(), 1);
+        assert_eq!(safe.bytes_consumed(), 11);
+    }
+
+    #[test]
+    fn missing_entry_returns_structured_error() {
+        let bytes = build_archive(&[("a", b"x")]);
+        let mut zip = open(&bytes);
+        let mut safe = SafeArchive::new(&mut zip, ArchiveLimits::DEFAULT);
+        let err = safe.read_entry_bytes("not_there").unwrap_err();
+        assert!(matches!(err, ZipSafetyError::MissingEntry(name) if name == "not_there"));
+    }
+
+    #[test]
+    fn rejects_entry_larger_than_per_entry_cap() {
+        let limits = ArchiveLimits {
+            max_entry_bytes: 16,
+            ..ArchiveLimits::DEFAULT
+        };
+        let bytes = build_archive(&[("big.bin", &[b'a'; 32])]);
+        let mut zip = open(&bytes);
+        let mut safe = SafeArchive::new(&mut zip, limits);
+        assert!(matches!(
+            safe.read_entry_bytes("big.bin"),
+            Err(ZipSafetyError::EntryTooLarge { .. })
+        ));
+    }
+
+    #[test]
+    fn rejects_when_running_total_exceeds_budget() {
+        let limits = ArchiveLimits {
+            max_total_bytes: 20,
+            ..ArchiveLimits::DEFAULT
+        };
+        let bytes = build_archive(&[("a", &[b'x'; 16]), ("b", &[b'y'; 16])]);
+        let mut zip = open(&bytes);
+        let mut safe = SafeArchive::new(&mut zip, limits);
+        safe.read_entry_bytes("a").unwrap();
+        assert!(matches!(
+            safe.read_entry_bytes("b"),
+            Err(ZipSafetyError::TotalExceeded { .. })
+        ));
+    }
+
+    #[test]
+    fn rejects_when_entry_count_exceeded() {
+        let limits = ArchiveLimits {
+            max_entries: 1,
+            ..ArchiveLimits::DEFAULT
+        };
+        let bytes = build_archive(&[("a", b"x"), ("b", b"y")]);
+        let mut zip = open(&bytes);
+        let mut safe = SafeArchive::new(&mut zip, limits);
+        safe.read_entry_bytes("a").unwrap();
+        assert!(matches!(
+            safe.read_entry_bytes("b"),
+            Err(ZipSafetyError::TooManyEntries { .. })
+        ));
+    }
+
+    #[test]
+    fn rejects_compression_ratio_bomb() {
+        // Deflated 1 MB of a single byte compresses ~1000x; with the
+        // ratio cap at 10x that must be rejected.
+        let payload = vec![b'a'; 1_000_000];
+        let bytes = {
+            let mut buf = Vec::new();
+            {
+                let mut w = zip::ZipWriter::new(Cursor::new(&mut buf));
+                let options = zip::write::SimpleFileOptions::default()
+                    .compression_method(zip::CompressionMethod::Deflated);
+                w.start_file::<_, ()>("bomb.bin", options).unwrap();
+                w.write_all(&payload).unwrap();
+                w.finish().unwrap();
+            }
+            buf
+        };
+        let limits = ArchiveLimits {
+            max_ratio: 10,
+            ..ArchiveLimits::DEFAULT
+        };
+        let mut zip = open(&bytes);
+        let mut safe = SafeArchive::new(&mut zip, limits);
+        assert!(matches!(
+            safe.read_entry_bytes("bomb.bin"),
+            Err(ZipSafetyError::CompressionRatioExceeded { .. })
+        ));
+    }
+
+    #[test]
+    fn non_utf8_bytes_return_structured_error() {
+        let bytes = build_archive(&[("bad", &[0xff, 0xfe, 0xfd])]);
+        let mut zip = open(&bytes);
+        let mut safe = SafeArchive::new(&mut zip, ArchiveLimits::DEFAULT);
+        let err = safe.read_entry_string("bad").unwrap_err();
+        assert!(matches!(err, ZipSafetyError::InvalidUtf8 { .. }));
+    }
+}
diff --git a/crates/paperjam-pptx/Cargo.toml b/crates/paperjam-pptx/Cargo.toml
index 111513f..bdec08f 100644
--- a/crates/paperjam-pptx/Cargo.toml
+++ b/crates/paperjam-pptx/Cargo.toml
@@ -7,7 +7,7 @@ license.workspace = true
 description = "PPTX presentation processing for the paperjam ecosystem"
 
 [dependencies]
-paperjam-model = { path = "../paperjam-model" }
+paperjam-model = { path = "../paperjam-model", features = ["zip_safety"] }
 zip = { version = "2", default-features = false, features = ["deflate"] }
 quick-xml = "0.37"
 thiserror = { workspace = true }
diff --git a/crates/paperjam-pptx/src/error.rs b/crates/paperjam-pptx/src/error.rs
index 4217ad9..1725b42 100644
--- a/crates/paperjam-pptx/src/error.rs
+++ b/crates/paperjam-pptx/src/error.rs
@@ -1,11 +1,11 @@
 /// Errors that can occur during PPTX processing.
 #[derive(Debug, thiserror::Error)]
 pub enum PptxError {
-    /// The file is not a valid ZIP archive.
+    /// The outer archive header is invalid or could not be opened.
     #[error("invalid ZIP archive: {0}")]
     Zip(#[from] zip::result::ZipError),
 
-    /// I/O error while reading the archive.
+    /// Non-archive I/O failure.
     #[error("I/O error: {0}")]
     Io(#[from] std::io::Error),
 
@@ -13,17 +13,15 @@ pub enum PptxError {
     #[error("XML parse error: {0}")]
     Xml(String),
 
-    /// A required entry is missing from the PPTX archive.
-    #[error("missing entry in PPTX: {0}")]
-    MissingEntry(String),
-
-    /// The PPTX structure is invalid or unsupported.
+    /// The PPTX layout (slide XMLs, rels, metadata) was malformed.
     #[error("invalid PPTX structure: {0}")]
     InvalidStructure(String),
 
-    /// A ZIP entry exceeded the per-entry decompressed byte limit.
-    #[error("PPTX entry `{name}` is too large ({size} bytes, limit {limit})")]
-    EntryTooLarge { name: String, size: u64, limit: u64 },
+    /// A bounded archive read hit one of the configured safety limits
+    /// (per-entry size, total-byte budget, entry count, or compression
+    /// ratio), or could not locate a named entry.
+    #[error(transparent)]
+    Archive(#[from] paperjam_model::zip_safety::ZipSafetyError),
 }
 
 impl From<quick_xml::Error> for PptxError {
diff --git a/crates/paperjam-pptx/src/lib.rs b/crates/paperjam-pptx/src/lib.rs
index 07e8e7a..a64cc39 100644
--- a/crates/paperjam-pptx/src/lib.rs
+++ b/crates/paperjam-pptx/src/lib.rs
@@ -9,7 +9,6 @@ pub mod error;
 pub mod markdown;
 pub mod metadata;
 pub mod parser;
-mod safe_read;
 pub mod structure;
 pub mod table;
 pub mod text;
diff --git a/crates/paperjam-pptx/src/metadata.rs b/crates/paperjam-pptx/src/metadata.rs
index 4be3080..44b9bea 100644
--- a/crates/paperjam-pptx/src/metadata.rs
+++ b/crates/paperjam-pptx/src/metadata.rs
@@ -1,9 +1,8 @@
 use crate::error::{PptxError, Result};
-use crate::safe_read::read_entry_string;
+use paperjam_model::zip_safety::{SafeArchive, ZipSafetyError};
 use quick_xml::events::Event;
 use quick_xml::Reader;
-use std::io::Read;
-use zip::ZipArchive;
+use std::io::{Read, Seek};
 
 /// Internal representation of PPTX metadata extracted from `docProps/core.xml`.
 #[derive(Debug, Clone, Default)]
@@ -36,13 +35,14 @@ impl PptxMetadata {
 }
 
 /// Parse metadata from `docProps/core.xml` inside the PPTX ZIP archive.
-pub fn parse_metadata<R: Read + std::io::Seek>(
-    archive: &mut ZipArchive<R>,
-) -> Result<PptxMetadata> {
-    let xml = match read_entry_string(archive, "docProps/core.xml") {
+pub fn parse_metadata<R: Read + Seek>(safe: &mut SafeArchive<'_, R>) -> Result<PptxMetadata> {
+    // A DOCX/PPTX produced without core.xml is legal (the file is just
+    // missing metadata); only the `MissingEntry` flavour of the safety
+    // error counts as a graceful fallback here.
+    let xml = match safe.read_entry_string("docProps/core.xml") {
         Ok(buf) => buf,
-        Err(PptxError::MissingEntry(_)) => return Ok(PptxMetadata::default()),
-        Err(e) => return Err(e),
+        Err(ZipSafetyError::MissingEntry(_)) => return Ok(PptxMetadata::default()),
+        Err(e) => return Err(e.into()),
     };
 
     let mut reader = Reader::from_str(&xml);
diff --git a/crates/paperjam-pptx/src/parser.rs b/crates/paperjam-pptx/src/parser.rs
index acd7f01..b029dd6 100644
--- a/crates/paperjam-pptx/src/parser.rs
+++ b/crates/paperjam-pptx/src/parser.rs
@@ -1,8 +1,8 @@
 use crate::document::{PptxDocument, SlideData, TextBlock};
 use crate::error::{PptxError, Result};
 use crate::metadata;
-use crate::safe_read::read_entry_string;
 use paperjam_model::table::{Cell, Row, Table, TableStrategy};
+use paperjam_model::zip_safety::{ArchiveLimits, SafeArchive};
 use quick_xml::events::Event;
 use quick_xml::Reader;
 use std::io::Read;
@@ -13,16 +13,20 @@ pub fn parse_pptx(bytes: &[u8]) -> Result<PptxDocument> {
     let cursor = std::io::Cursor::new(bytes);
     let mut archive = ZipArchive::new(cursor)?;
 
-    let meta = metadata::parse_metadata(&mut archive)?;
+    // `slide_names` is sourced from the archive listing, not a
+    // `SafeArchive` read, so it does not count against the byte/entry
+    // budget.
     let slide_names = find_slide_files(&archive);
 
-    // Cap the initial allocation — slide_names comes from the archive
-    // listing and is attacker-controlled.
+    let mut safe = SafeArchive::new(&mut archive, ArchiveLimits::DEFAULT);
+    let meta = metadata::parse_metadata(&mut safe)?;
+
+    // Cap the initial allocation — slide_names is attacker-controlled.
     let mut slides = Vec::with_capacity(slide_names.len().min(4096));
     for (idx, name) in slide_names.iter().enumerate() {
-        let slide_xml = read_entry_string(&mut archive, name)?;
+        let slide_xml = safe.read_entry_string(name)?;
         let notes_path = format!("ppt/notesSlides/notesSlide{}.xml", idx + 1);
-        let notes_xml = read_entry_string(&mut archive, &notes_path).ok();
+        let notes_xml = safe.read_entry_string(&notes_path).ok();
         let slide = parse_slide(&slide_xml, idx + 1, notes_xml.as_deref())?;
         slides.push(slide);
     }
diff --git a/crates/paperjam-pptx/src/safe_read.rs b/crates/paperjam-pptx/src/safe_read.rs
deleted file mode 100644
index 2a2a3b0..0000000
--- a/crates/paperjam-pptx/src/safe_read.rs
+++ /dev/null
@@ -1,40 +0,0 @@
-//! Bounded readers for ZIP entries to mitigate decompression-bomb attacks.
-
-use std::io::{Read, Seek};
-
-use crate::error::{PptxError, Result};
-
-/// Per-entry decompressed byte limit. Typical slide XML is tens of KB; even
-/// large decks rarely exceed a few MB per entry.
-pub const MAX_ENTRY_BYTES: u64 = 100 * 1024 * 1024;
-
-pub fn read_entry_string<R: Read + Seek>(
-    archive: &mut zip::ZipArchive<R>,
-    name: &str,
-) -> Result<String> {
-    let mut entry = archive
-        .by_name(name)
-        .map_err(|_| PptxError::MissingEntry(name.to_string()))?;
-
-    let declared = entry.size();
-    if declared > MAX_ENTRY_BYTES {
-        return Err(PptxError::EntryTooLarge {
-            name: name.to_string(),
-            size: declared,
-            limit: MAX_ENTRY_BYTES,
-        });
-    }
-
-    let mut buf = String::new();
-    let read = (&mut entry)
-        .take(MAX_ENTRY_BYTES + 1)
-        .read_to_string(&mut buf)?;
-    if read as u64 > MAX_ENTRY_BYTES {
-        return Err(PptxError::EntryTooLarge {
-            name: name.to_string(),
-            size: read as u64,
-            limit: MAX_ENTRY_BYTES,
-        });
-    }
-    Ok(buf)
-}