Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ The library uses a pluggable parser architecture in `spacypdfreader/parsers/`:

Each parser implements a `parser(pdf_path: str, page_number: int, **kwargs)` function that returns text for a single page.

A parser may optionally expose a `batch_parser(pdf_path: str, pages: Iterable[int], **kwargs)` function (attached as `parser.batch_parser`) that returns text for many pages in a single pass. pdfminer provides one to avoid re-parsing the whole PDF once per page (which is O(n^2) work). When a `batch_parser` is available, `pdf_reader()` prefers it over per-page calls.

### spaCy Custom Extensions

The library registers several custom attributes on spaCy tokens and docs:
Expand All @@ -100,8 +102,10 @@ These extensions are registered in `spacypdfreader/spacypdfreader.py` at module

1. PDF path and spaCy Language object provided to `pdf_reader()`
2. PDF page count determined using pdfminer's `PDFParser`
3. Pages extracted in parallel (if `n_processes` specified) or sequentially
4. Each page text converted to a spaCy `Doc` via `nlp.pipe()`
3. Text extracted from the requested pages:
- If the parser exposes a `batch_parser` (e.g. pdfminer), the PDF is parsed once in a single pass.
- Otherwise pages are extracted per-page, in parallel via a `ThreadPool` when `n_processes` is set (this only helps parsers that release the GIL, such as the pytesseract OCR parser), or sequentially.
4. Each page text converted to a spaCy `Doc` via `nlp.pipe()` (single process)
5. Page numbers assigned to all tokens
6. Individual page `Doc` objects combined using `Doc.from_docs()`
7. Custom extensions set on the combined doc
Expand All @@ -111,7 +115,8 @@ These extensions are registered in `spacypdfreader/spacypdfreader.py` at module
- This library breaks spaCy convention: it does NOT use `nlp.add_pipe()` because text extraction must happen before spaCy processing
- Page numbers use 1-based indexing in the public API (but pdfminer uses 0-based internally)
- When using pdfminer parser, do NOT pass `page_numbers` kwarg - use `page_range` instead
- Multiprocessing uses `ThreadPool` not `ProcessPool` (see imports in spacypdfreader.py:4)
- Per-page parsing uses `ThreadPool` (not `ProcessPool`) when `n_processes` is set; this only speeds up parsers that release the GIL (e.g. pytesseract). The default pdfminer parser instead uses a single-pass `batch_parser` and ignores `n_processes` for extraction.
- `nlp.pipe()` is intentionally run single-process: spaCy's `n_process` has a large static cost that only pays off for thousands of texts, but a PDF yields just one text per page, so parallelizing it would slow down typical documents.

## Testing Notes

Expand Down
101 changes: 96 additions & 5 deletions src/spacypdfreader/parsers/pdfminer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,17 @@
from io import StringIO
from typing import Dict, Iterable, List

from pdfminer.converter import TextConverter
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage

# Arguments for the `ValueError` raised when the unsupported `page_numbers`
# kwarg is passed. This stays a tuple because the raise sites unpack it
# (`raise ValueError(*_PAGE_NUMBERS_ERROR)`); it holds exactly one string so
# the exception renders as a single readable message instead of a tuple of
# fragments ending in an empty example.
_PAGE_NUMBERS_ERROR = (
    "The `page_numbers` kwarg is not valid when using the pdfminer parser. "
    "Please use `page_range` instead. For example: "
    "`pdf_reader(pdf_path, nlp, page_range=(1, 3))`",
)


def parser(pdf_path: str, page_number: int, **kwargs):
Expand Down Expand Up @@ -57,11 +70,7 @@ def parser(pdf_path: str, page_number: int, **kwargs):
# Check whether the user has provided the `page_numbers` kwarg. This is not
# valid, so raise an error. See: https://github.com/SamEdwardes/spacypdfreader/issues/16
if "page_numbers" in kwargs:
raise ValueError(
"The `page_numbers` kwarg is not valid when using the pdfminer parser. "
"Please use `page_range` instead. For example: ",
"``",
)
raise ValueError(*_PAGE_NUMBERS_ERROR)

# pdfminer uses zero-indexed page numbers, so subtract 1 from the
# one-indexed page number.
Expand All @@ -70,6 +79,88 @@ def parser(pdf_path: str, page_number: int, **kwargs):
return text


def batch_parser(pdf_path: str, pages: Iterable[int], **kwargs) -> List[str]:
    """Convert several PDF pages to text in a single pass with pdfminer.

    Calling [`parser`][spacypdfreader.parsers.pdfminer.parser] once per page
    re-opens and re-parses the entire PDF every time, which is roughly O(n^2)
    work for an n page document. `batch_parser` parses the document a single
    time and returns the text for each requested page. The output for a given
    page is intended to match what `parser` returns for that page.

    `spacypdfreader.pdf_reader` uses this automatically when the pdfminer
    parser is selected (it is attached as `parser.batch_parser`).

    Parameters:
        pdf_path: Path to a PDF file.
        pages: One indexed page numbers to convert (e.g. the first page of the
            PDF is page 1, as opposed to page 0). The iterable is consumed
            once; it may be in any order and may contain duplicates.
        **kwargs: Forwarded to `_extract_text_per_page`, which accepts the
            same-named options as
            [`pdfminer.high_level.extract_text`](https://pdfminersix.readthedocs.io/en/latest/reference/highlevel.html#extract-text):
            `password`, `maxpages`, `caching`, `codec` and `laparams`. Any
            other keyword raises `TypeError`.

    Returns:
        A list of strings, one per entry in `pages`, in the same order. A page
        for which the extractor produced no entry yields an empty string
        instead of raising `KeyError`.

    Raises:
        ValueError: If the unsupported `page_numbers` kwarg is passed.
    """
    if "page_numbers" in kwargs:
        raise ValueError(*_PAGE_NUMBERS_ERROR)

    # pdfminer uses zero indexed page numbers. Materialise the list up front
    # because `pages` may be a one-shot iterator and is needed twice below.
    zero_indexed = [page - 1 for page in pages]
    text_by_page = _extract_text_per_page(pdf_path, zero_indexed, **kwargs)
    # `.get` keeps the result aligned with `pages` even when a requested page
    # has no entry in the extractor's mapping.
    return [text_by_page.get(page, "") for page in zero_indexed]


def _extract_text_per_page(
    pdf_path: str,
    page_numbers: Iterable[int],
    password: str = "",
    maxpages: int = 0,
    caching: bool = True,
    codec: str = "utf-8",
    laparams: "LAParams" = None,
) -> Dict[int, str]:
    """Extract text from the requested (zero indexed) pages in a single pass.

    This mirrors `pdfminer.high_level.extract_text` but opens and parses the
    document only once, capturing the text for each requested page separately.

    Parameters:
        pdf_path: Path to a PDF file.
        page_numbers: Zero indexed page numbers to extract. Duplicates are
            collapsed; order does not matter.
        password: Password used to open an encrypted PDF.
        maxpages: If non-zero, stop once a requested page whose one indexed
            position in the document is at least `maxpages` has been
            extracted. `0` means no limit.
        caching: Passed to `PDFResourceManager` and `PDFPage.get_pages`.
        codec: Passed to `TextConverter`.
        laparams: Layout analysis parameters. A default `LAParams()` is used
            when `None`.

    Returns:
        A dict with one key per distinct requested page number, mapping to
        that page's text. Pages that were never reached (not present in the
        document, or cut off by `maxpages`) map to an empty string.
    """
    wanted = set(page_numbers)
    # Give every requested page an entry up front so callers can index the
    # result without a KeyError, whatever the document actually contains.
    text_by_page: Dict[int, str] = {number: "" for number in sorted(wanted)}
    if not wanted:
        # Nothing to extract: avoid opening and parsing the file at all.
        return text_by_page

    if laparams is None:
        laparams = LAParams()

    last_wanted = max(wanted)

    with open(pdf_path, "rb") as in_file:
        rsrcmgr = PDFResourceManager(caching=caching)
        # Enumerate every page so `page_number` is the real document index.
        # Letting `get_pages` filter and then zipping against the requested
        # numbers would mis-pair text whenever a requested number is not in
        # the document, because such numbers are skipped silently.
        pdf_pages = PDFPage.get_pages(
            in_file,
            password=password,
            caching=caching,
        )
        for page_number, page in enumerate(pdf_pages):
            if page_number in wanted:
                with StringIO() as output:
                    device = TextConverter(
                        rsrcmgr, output, codec=codec, laparams=laparams
                    )
                    try:
                        interpreter = PDFPageInterpreter(rsrcmgr, device)
                        interpreter.process_page(page)
                        text_by_page[page_number] = output.getvalue()
                    finally:
                        # Release the device even if page processing fails.
                        device.close()
                # Same stopping rule `PDFPage.get_pages` applies for
                # `maxpages`: it is only checked after a selected page.
                if maxpages and maxpages <= page_number + 1:
                    break
            if page_number >= last_wanted:
                # All requested pages have been seen; skip the rest.
                break

    return text_by_page


# Expose the single-pass extractor as an attribute of the per-page `parser`
# function. Code that only holds a reference to `parser` (such as
# `spacypdfreader.pdf_reader`, which looks it up with
# `getattr(pdf_parser, "batch_parser", None)`) can then discover and prefer it.
parser.batch_parser = batch_parser


class PdfminerParser:
"""This class has been included for backwards compatibility. Do not use."""

Expand Down
42 changes: 27 additions & 15 deletions src/spacypdfreader/spacypdfreader.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import warnings
from functools import partial
from multiprocessing.pool import ThreadPool as Pool
from multiprocessing.pool import ThreadPool
from typing import Any, Callable, Iterable, Optional

import spacy
Expand Down Expand Up @@ -176,25 +176,37 @@ def pdf_reader(
console.print(f"PDF contains {num_pages} pages.")
console.print(f"Extracting text from {start_page} to {end_page}...")

# Handle multiprocessing
if n_processes:
with Pool(n_processes) as p:
# Extract the text from each page.
page_numbers = list(range(start_page, end_page + 1))
batch_parser = getattr(pdf_parser, "batch_parser", None)

if batch_parser is not None:
# Single-pass parsers (e.g. pdfminer) parse the whole PDF once instead
# of re-opening and re-parsing it for every page. This is faster than
# parallelising the per-page calls, so multiprocessing is not needed
# for extraction here.
texts = batch_parser(pdf_path, page_numbers, **kwargs)
elif n_processes:
# Per-page parsers (e.g. pytesseract OCR) that release the GIL during
# subprocess / native calls benefit from a thread pool. Note that pure
# python, CPU-bound parsers will see little benefit due to the GIL.
with ThreadPool(n_processes) as pool:
partial_worker = partial(pdf_parser, pdf_path, **kwargs)
args = list(range(start_page, end_page + 1))
texts = p.map(partial_worker, args)

# Handle non-multiprocessing
texts = pool.map(partial_worker, page_numbers)
else:
texts = []
for page_num in range(start_page, end_page + 1):
text = pdf_parser(pdf_path=pdf_path, page_number=page_num, **kwargs)
texts.append(text)

# Convert text to spaCy Doc objects.
texts = [
pdf_parser(pdf_path=pdf_path, page_number=page_num, **kwargs)
for page_num in page_numbers
]

# Convert text to spaCy Doc objects. `nlp.pipe` is kept single process on
# purpose: spaCy's multiprocessing has a large static cost and only pays
# off for thousands of texts, whereas a PDF yields just one text per page,
# so `n_process > 1` would slow down typical documents.
if verbose:
console.print("Converting text to [blue bold]spaCy[/] Doc...")

docs = [doc for doc in nlp.pipe(texts)]
docs = list(nlp.pipe(texts))
for idx, doc in enumerate(docs):
page_num = idx + start_page
for token in doc:
Expand Down
Loading