Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion crates/paperjam-epub/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ license.workspace = true
description = "EPUB e-book processing for the paperjam ecosystem"

[dependencies]
paperjam-model = { path = "../paperjam-model" }
paperjam-model = { path = "../paperjam-model", features = ["zip_safety"] }
paperjam-html = { path = "../paperjam-html" }
zip = { version = "2", default-features = false, features = ["deflate"] }
quick-xml = "0.37"
Expand Down
15 changes: 10 additions & 5 deletions crates/paperjam-epub/src/error.rs
Original file line number Diff line number Diff line change
@@ -1,25 +1,30 @@
// NOTE(review): this listing was taken from a rendered diff of error.rs
// ("10 additions & 5 deletions", hunk -1,25 +1,30) that carries no +/- markers.
// `MissingEntry` and `EntryTooLarge` are presumably the removed lines, superseded
// by `Archive` — confirm against the merged file before relying on them.
/// Error type for the EPUB crate.
///
/// Variants carrying `#[from]` are converted from their source error type;
/// the remaining variants carry a message or details supplied where the error
/// is constructed.
#[derive(Debug, thiserror::Error)]
pub enum EpubError {
/// The outer archive header is invalid or could not be opened.
#[error("invalid ZIP archive: {0}")]
Zip(#[from] zip::result::ZipError),

/// Non-archive I/O failure (e.g. cloning raw bytes).
#[error("I/O error: {0}")]
Io(#[from] std::io::Error),

/// XML parse failure.
#[error("XML parse error: {0}")]
Xml(String),

/// Chapter content failed to parse as HTML.
#[error("HTML parse error: {0}")]
Html(#[from] paperjam_html::HtmlError),

/// An expected entry was missing from the EPUB.
/// The payload is interpolated into the message; presumably it is the
/// entry's path inside the archive — confirm at the construction sites.
#[error("missing entry in EPUB: {0}")]
MissingEntry(String),

/// The EPUB layout (container.xml, OPF, spine) was malformed.
#[error("invalid EPUB structure: {0}")]
InvalidStructure(String),

/// An entry was rejected as too large: `name` identifies the entry,
/// `size` is its size in bytes and `limit` is the bound it was compared
/// against (presumably also in bytes — confirm against the limit's definition).
#[error("EPUB entry `{name}` is too large ({size} bytes, limit {limit})")]
EntryTooLarge { name: String, size: u64, limit: u64 },
/// A bounded archive read hit one of the configured safety limits
/// (per-entry size, total-byte budget, entry count, or compression
/// ratio), or could not locate a named entry.
#[error(transparent)]
Archive(#[from] paperjam_model::zip_safety::ZipSafetyError),
}

impl From<quick_xml::Error> for EpubError {
Expand Down
1 change: 0 additions & 1 deletion crates/paperjam-epub/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ mod image;
mod markdown;
mod metadata;
pub mod parser;
mod safe_read;
mod structure;
mod table;
mod text;
Expand Down
23 changes: 12 additions & 11 deletions crates/paperjam-epub/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,35 +5,36 @@ use quick_xml::Reader;

use crate::document::{ChapterData, EpubDocument, OpfMetadata, TocEntry};
use crate::error::{EpubError, Result};
use crate::safe_read::{read_entry_bytes, read_entry_string};
use crate::toc;
use paperjam_model::zip_safety::{ArchiveLimits, SafeArchive};

/// Parse an EPUB document from raw bytes.
pub fn parse_epub(bytes: &[u8]) -> Result<EpubDocument> {
let cursor = std::io::Cursor::new(bytes);
let mut archive = zip::ZipArchive::new(cursor)?;
let mut safe = SafeArchive::new(&mut archive, ArchiveLimits::DEFAULT);

// 1. Find the OPF path from container.xml.
let container_xml = read_entry_string(&mut archive, "META-INF/container.xml")?;
let container_xml = safe.read_entry_string("META-INF/container.xml")?;
let opf_path = parse_container_xml(&container_xml)?;
let opf_base_dir = opf_path
.rsplit_once('/')
.map(|(d, _)| d.to_string())
.unwrap_or_default();

// 2. Parse OPF: metadata, manifest, spine.
let opf_xml = read_entry_string(&mut archive, &opf_path)?;
let opf_xml = safe.read_entry_string(&opf_path)?;
let (opf_metadata, manifest, spine) = parse_opf(&opf_xml)?;

// 3. Parse TOC.
let toc_entries = parse_toc_from_manifest(&mut archive, &manifest, &opf_base_dir);
let toc_entries = parse_toc_from_manifest(&mut safe, &manifest, &opf_base_dir);

// 4. Read chapters in spine order.
let mut chapters = Vec::new();
for (idx, spine_idref) in spine.iter().enumerate() {
if let Some(href) = manifest.get(spine_idref) {
let full_path = resolve_path(&opf_base_dir, href);
match read_entry_bytes(&mut archive, &full_path) {
match safe.read_entry_bytes(&full_path) {
Ok(html_bytes) => {
let html_doc = paperjam_html::HtmlDocument::from_bytes(&html_bytes)?;
let title = find_toc_title(&toc_entries, href);
Expand All @@ -52,7 +53,7 @@ pub fn parse_epub(bytes: &[u8]) -> Result<EpubDocument> {
}

// 5. Collect archive images.
let archive_images = collect_images(&mut archive, &manifest, &opf_base_dir);
let archive_images = collect_images(&mut safe, &manifest, &opf_base_dir);

Ok(EpubDocument {
chapters,
Expand Down Expand Up @@ -252,15 +253,15 @@ fn find_toc_title(entries: &[TocEntry], href: &str) -> Option<String> {

/// Parse TOC from the manifest, trying NCX first then nav.xhtml.
fn parse_toc_from_manifest(
archive: &mut zip::ZipArchive<std::io::Cursor<&[u8]>>,
safe: &mut SafeArchive<'_, std::io::Cursor<&[u8]>>,
manifest: &HashMap<String, String>,
opf_base_dir: &str,
) -> Vec<TocEntry> {
// Look for NCX file (usually id="ncx" or ends with .ncx).
for (id, href) in manifest {
if id == "ncx" || href.ends_with(".ncx") {
let full_path = resolve_path(opf_base_dir, href);
if let Ok(xml) = read_entry_string(archive, &full_path) {
if let Ok(xml) = safe.read_entry_string(&full_path) {
let entries = toc::parse_ncx(&xml);
if !entries.is_empty() {
return entries;
Expand All @@ -273,7 +274,7 @@ fn parse_toc_from_manifest(
for href in manifest.values() {
if href.contains("nav") && (href.ends_with(".xhtml") || href.ends_with(".html")) {
let full_path = resolve_path(opf_base_dir, href);
if let Ok(html_bytes) = read_entry_bytes(archive, &full_path) {
if let Ok(html_bytes) = safe.read_entry_bytes(&full_path) {
let entries = toc::parse_nav_xhtml(&html_bytes);
if !entries.is_empty() {
return entries;
Expand All @@ -287,7 +288,7 @@ fn parse_toc_from_manifest(

/// Collect image files from the archive based on manifest media types.
fn collect_images(
archive: &mut zip::ZipArchive<std::io::Cursor<&[u8]>>,
safe: &mut SafeArchive<'_, std::io::Cursor<&[u8]>>,
manifest: &HashMap<String, String>,
opf_base_dir: &str,
) -> Vec<(String, Vec<u8>)> {
Expand All @@ -298,7 +299,7 @@ fn collect_images(
let lower = href.to_ascii_lowercase();
if image_extensions.iter().any(|ext| lower.ends_with(ext)) {
let full_path = resolve_path(opf_base_dir, href);
if let Ok(data) = read_entry_bytes(archive, &full_path) {
if let Ok(data) = safe.read_entry_bytes(&full_path) {
images.push((href.clone(), data));
}
}
Expand Down
115 changes: 0 additions & 115 deletions crates/paperjam-epub/src/safe_read.rs

This file was deleted.

12 changes: 12 additions & 0 deletions crates/paperjam-model/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,15 @@ license.workspace = true
description = "Pure data types and traits for the paperjam document processing ecosystem"

[dependencies]
thiserror = { workspace = true, optional = true }
zip = { version = "2", default-features = false, features = ["deflate"], optional = true }

[features]
# Shared hardened ZIP reader used by the OOXML/EPUB format crates.
# Off by default so the PDF engine and other non-zip consumers stay
# dependency-free.
zip_safety = ["dep:zip", "dep:thiserror"]

[dev-dependencies]
thiserror = { workspace = true }
zip = { version = "2", default-features = false, features = ["deflate"] }
3 changes: 3 additions & 0 deletions crates/paperjam-model/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
pub mod document;
pub mod format;

#[cfg(feature = "zip_safety")]
pub mod zip_safety;

pub mod annotations;
pub mod bookmarks;
pub mod conversion;
Expand Down
Loading
Loading