diff --git a/CLAUDE.md b/CLAUDE.md index 0f4e673..f8323d8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -83,6 +83,8 @@ The library uses a pluggable parser architecture in `spacypdfreader/parsers/`: Each parser implements a `parser(pdf_path: str, page_number: int, **kwargs)` function that returns text for a single page. +A parser may optionally expose a `batch_parser(pdf_path: str, pages: Iterable[int], **kwargs)` function (attached as `parser.batch_parser`) that returns text for many pages in a single pass. pdfminer provides one to avoid re-parsing the whole PDF once per page (which is O(n^2) work). When a `batch_parser` is available, `pdf_reader()` prefers it over per-page calls. + ### spaCy Custom Extensions The library registers several custom attributes on spaCy tokens and docs: @@ -100,8 +102,10 @@ These extensions are registered in `spacypdfreader/spacypdfreader.py` at module 1. PDF path and spaCy Language object provided to `pdf_reader()` 2. PDF page count determined using pdfminer's `PDFParser` -3. Pages extracted in parallel (if `n_processes` specified) or sequentially -4. Each page text converted to a spaCy `Doc` via `nlp.pipe()` +3. Text extracted from the requested pages: + - If the parser exposes a `batch_parser` (e.g. pdfminer), the PDF is parsed once in a single pass. + - Otherwise pages are extracted per-page, in parallel via a `ThreadPool` when `n_processes` is set (this only helps parsers that release the GIL, such as the pytesseract OCR parser), or sequentially. +4. Each page text converted to a spaCy `Doc` via `nlp.pipe()` (single process) 5. Page numbers assigned to all tokens 6. Individual page `Doc` objects combined using `Doc.from_docs()` 7. 
Custom extensions set on the combined doc @@ -111,7 +115,8 @@ These extensions are registered in `spacypdfreader/spacypdfreader.py` at module - This library breaks spaCy convention: it does NOT use `nlp.add_pipe()` because text extraction must happen before spaCy processing - Page numbers use 1-based indexing in the public API (but pdfminer uses 0-based internally) - When using pdfminer parser, do NOT pass `page_numbers` kwarg - use `page_range` instead -- Multiprocessing uses `ThreadPool` not `ProcessPool` (see imports in spacypdfreader.py:4) +- Per-page parsing uses `ThreadPool` (not `ProcessPool`) when `n_processes` is set; this only speeds up parsers that release the GIL (e.g. pytesseract). The default pdfminer parser instead uses a single-pass `batch_parser` and ignores `n_processes` for extraction. +- `nlp.pipe()` is intentionally run single-process: spaCy's `n_process` has a large static cost that only pays off for thousands of texts, but a PDF yields just one text per page, so parallelizing it would slow down typical documents. ## Testing Notes diff --git a/src/spacypdfreader/parsers/pdfminer.py b/src/spacypdfreader/parsers/pdfminer.py index f06bff7..b72c207 100644 --- a/src/spacypdfreader/parsers/pdfminer.py +++ b/src/spacypdfreader/parsers/pdfminer.py @@ -1,4 +1,17 @@ +from io import StringIO +from typing import Dict, Iterable, List + +from pdfminer.converter import TextConverter from pdfminer.high_level import extract_text +from pdfminer.layout import LAParams +from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager +from pdfminer.pdfpage import PDFPage + +_PAGE_NUMBERS_ERROR = ( + "The `page_numbers` kwarg is not valid when using the pdfminer parser. " + "Please use `page_range` instead. For example: ", + "``", +) def parser(pdf_path: str, page_number: int, **kwargs): @@ -57,11 +70,7 @@ def parser(pdf_path: str, page_number: int, **kwargs): # Check to see if the users has provided the `page_numbers` kwarg. This is not # valid. 
def batch_parser(pdf_path: str, pages: Iterable[int], **kwargs) -> List[str]:
    """Convert several PDF pages to text in a single pass with pdfminer.

    Calling [`parser`][spacypdfreader.parsers.pdfminer.parser] once per page
    re-opens and re-parses the entire PDF every time, which is roughly O(n^2)
    work for an n page document. `batch_parser` parses the document a single
    time and returns the text for each requested page.

    `spacypdfreader.pdf_reader` uses this automatically when the pdfminer
    parser is selected (it is attached as `parser.batch_parser`).

    Parameters:
        pdf_path: Path to a PDF file.
        pages: One indexed page numbers to convert (e.g. the first page of the
            PDF is page 1, as opposed to page 0). Duplicates are allowed.
        **kwargs: Forwarded to `_extract_text_per_page`, which accepts the
            same options as
            [`pdfminer.high_level.extract_text`](https://pdfminersix.readthedocs.io/en/latest/reference/highlevel.html#extract-text)
            except `page_numbers`: `password`, `maxpages`, `caching`, `codec`
            and `laparams`. Any other keyword raises `TypeError`.

    Returns:
        A list of strings, one per entry of `pages` and in the same order. A
        page that was not extracted (it does not exist in the document, or it
        lies beyond the point where `maxpages` stopped extraction) yields an
        empty string rather than raising `KeyError`.

    Raises:
        ValueError: If the `page_numbers` kwarg is supplied.
    """
    if "page_numbers" in kwargs:
        raise ValueError(*_PAGE_NUMBERS_ERROR)

    # pdfminer uses zero indexed page numbers.
    zero_indexed = [page - 1 for page in pages]
    text_by_page = _extract_text_per_page(pdf_path, zero_indexed, **kwargs)
    # `.get` keeps the result aligned with `pages` even when a requested page
    # was never produced by pdfminer.
    return [text_by_page.get(page, "") for page in zero_indexed]


def _extract_text_per_page(
    pdf_path: str,
    page_numbers: Iterable[int],
    password: str = "",
    maxpages: int = 0,
    caching: bool = True,
    codec: str = "utf-8",
    laparams: LAParams = None,
) -> Dict[int, str]:
    """Extract text from the requested (zero indexed) pages in a single pass.

    This mirrors `pdfminer.high_level.extract_text` but opens and parses the
    document only once, capturing the text for each requested page separately.

    Parameters:
        pdf_path: Path to a PDF file.
        page_numbers: Zero indexed page numbers to extract.
        password: Password used to open an encrypted document.
        maxpages: When non-zero, extraction stops after the first extracted
            page whose one indexed position is `>= maxpages`.
        caching: Forwarded to `PDFResourceManager` and `PDFPage.get_pages`.
        codec: Forwarded to `TextConverter`.
        laparams: Layout analysis parameters; a default `LAParams()` is used
            when `None`.

    Returns:
        A dict mapping each extracted zero indexed page number to its text.
        Requested pages that were not reached are absent from the dict.
    """
    if laparams is None:
        laparams = LAParams()

    requested = set(page_numbers)
    text_by_page: Dict[int, str] = {}

    with open(pdf_path, "rb") as in_file:
        if not requested:
            # Nothing to extract: avoid parsing the document at all.
            return text_by_page

        rsrcmgr = PDFResourceManager(caching=caching)
        # Enumerate every page ourselves instead of zipping the requested
        # numbers against a pre-filtered page stream: the page number then
        # always comes from the page's real position in the document, so a
        # request for a missing page can never shift text onto another page.
        pdf_pages = PDFPage.get_pages(in_file, password=password, caching=caching)
        remaining = len(requested)
        for page_number, page in enumerate(pdf_pages):
            if page_number not in requested:
                continue
            with StringIO() as output:
                device = TextConverter(
                    rsrcmgr, output, codec=codec, laparams=laparams
                )
                try:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    interpreter.process_page(page)
                    text_by_page[page_number] = output.getvalue()
                finally:
                    # Close the device even when page processing fails.
                    device.close()
            remaining -= 1
            # Stop once everything requested has been captured, or once the
            # `maxpages` limit is reached on an extracted page.
            if remaining == 0 or (maxpages and maxpages <= page_number + 1):
                break

    return text_by_page


# Expose the single-pass extractor on the per-page parser so that
# `spacypdfreader.pdf_reader` can discover and prefer it.
parser.batch_parser = batch_parser
Do not use.""" diff --git a/src/spacypdfreader/spacypdfreader.py b/src/spacypdfreader/spacypdfreader.py index 5216a7a..0a68822 100644 --- a/src/spacypdfreader/spacypdfreader.py +++ b/src/spacypdfreader/spacypdfreader.py @@ -1,7 +1,7 @@ import os import warnings from functools import partial -from multiprocessing.pool import ThreadPool as Pool +from multiprocessing.pool import ThreadPool from typing import Any, Callable, Iterable, Optional import spacy @@ -176,25 +176,37 @@ def pdf_reader( console.print(f"PDF contains {num_pages} pages.") console.print(f"Extracting text from {start_page} to {end_page}...") - # Handle multiprocessing - if n_processes: - with Pool(n_processes) as p: + # Extract the text from each page. + page_numbers = list(range(start_page, end_page + 1)) + batch_parser = getattr(pdf_parser, "batch_parser", None) + + if batch_parser is not None: + # Single-pass parsers (e.g. pdfminer) parse the whole PDF once instead + # of re-opening and re-parsing it for every page. This is faster than + # parallelising the per-page calls, so multiprocessing is not needed + # for extraction here. + texts = batch_parser(pdf_path, page_numbers, **kwargs) + elif n_processes: + # Per-page parsers (e.g. pytesseract OCR) that release the GIL during + # subprocess / native calls benefit from a thread pool. Note that pure + # python, CPU-bound parsers will see little benefit due to the GIL. + with ThreadPool(n_processes) as pool: partial_worker = partial(pdf_parser, pdf_path, **kwargs) - args = list(range(start_page, end_page + 1)) - texts = p.map(partial_worker, args) - - # Handle non-multiprocessing + texts = pool.map(partial_worker, page_numbers) else: - texts = [] - for page_num in range(start_page, end_page + 1): - text = pdf_parser(pdf_path=pdf_path, page_number=page_num, **kwargs) - texts.append(text) - - # Convert text to spaCy Doc objects. 
+ texts = [ + pdf_parser(pdf_path=pdf_path, page_number=page_num, **kwargs) + for page_num in page_numbers + ] + + # Convert text to spaCy Doc objects. `nlp.pipe` is kept single process on + # purpose: spaCy's multiprocessing has a large static cost and only pays + # off for thousands of texts, whereas a PDF yields just one text per page, + # so `n_process > 1` would slow down typical documents. if verbose: console.print("Converting text to [blue bold]spaCy[/] Doc...") - docs = [doc for doc in nlp.pipe(texts)] + docs = list(nlp.pipe(texts)) for idx, doc in enumerate(docs): page_num = idx + start_page for token in doc: