SamEdwardes · SamEdwardes · Jun 18, 2026
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -83,6 +83,8 @@ The library uses a pluggable parser architecture in `spacypdfreader/parsers/`:
 
 Each parser implements a `parser(pdf_path: str, page_number: int, **kwargs)` function that returns text for a single page.
 
+A parser may optionally expose a `batch_parser(pdf_path: str, pages: Iterable[int], **kwargs)` function (attached as `parser.batch_parser`) that returns text for many pages in a single pass. The pdfminer parser provides one to avoid re-parsing the whole PDF once per page (roughly O(n^2) work for an n-page document). When a `batch_parser` is available, `pdf_reader()` prefers it over per-page calls.
+
 ### spaCy Custom Extensions
 
 The library registers several custom attributes on spaCy tokens and docs:
@@ -100,8 +102,10 @@ These extensions are registered in `spacypdfreader/spacypdfreader.py` at module
 
 1. PDF path and spaCy Language object provided to `pdf_reader()`
 2. PDF page count determined using pdfminer's `PDFParser`
-3. Pages extracted in parallel (if `n_processes` specified) or sequentially
-4. Each page text converted to a spaCy `Doc` via `nlp.pipe()`
+3. Text extracted from the requested pages:
+   - If the parser exposes a `batch_parser` (e.g. pdfminer), the PDF is parsed once in a single pass.
+   - Otherwise pages are extracted one at a time: sequentially, or in parallel via a `ThreadPool` when `n_processes` is set (this only helps parsers that release the GIL, such as the pytesseract OCR parser).
+4. Each page text converted to a spaCy `Doc` via `nlp.pipe()` (single process)
 5. Page numbers assigned to all tokens
 6. Individual page `Doc` objects combined using `Doc.from_docs()`
 7. Custom extensions set on the combined doc
@@ -111,7 +115,8 @@ These extensions are registered in `spacypdfreader/spacypdfreader.py` at module
 - This library breaks spaCy convention: it does NOT use `nlp.add_pipe()` because text extraction must happen before spaCy processing
 - Page numbers use 1-based indexing in the public API (but pdfminer uses 0-based internally)
 - When using pdfminer parser, do NOT pass `page_numbers` kwarg - use `page_range` instead
-- Multiprocessing uses `ThreadPool` not `ProcessPool` (see imports in spacypdfreader.py:4)
+- Per-page parsing uses `ThreadPool` (not `ProcessPool`) when `n_processes` is set; this only speeds up parsers that release the GIL (e.g. pytesseract). The default pdfminer parser instead uses a single-pass `batch_parser` and ignores `n_processes` for extraction.
+- `nlp.pipe()` is intentionally run single-process: spaCy's `n_process` has a large static cost that only pays off for thousands of texts, but a PDF yields just one text per page, so parallelizing it would slow down typical documents.
 
 ## Testing Notes
 

diff --git a/src/spacypdfreader/parsers/pdfminer.py b/src/spacypdfreader/parsers/pdfminer.py
@@ -1,4 +1,17 @@
+from io import StringIO
+from typing import Dict, Iterable, List
+
+from pdfminer.converter import TextConverter
 from pdfminer.high_level import extract_text
+from pdfminer.layout import LAParams
+from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
+from pdfminer.pdfpage import PDFPage
+
+_PAGE_NUMBERS_ERROR = (  # two args, unpacked via ValueError(*_PAGE_NUMBERS_ERROR)
+    "The `page_numbers` kwarg is not valid when using the pdfminer parser. "
+    "Please use `page_range` instead. For example: ",
+    "``",
+)
 
 
 def parser(pdf_path: str, page_number: int, **kwargs):
@@ -57,11 +70,7 @@ def parser(pdf_path: str, page_number: int, **kwargs):
     # Check to see if the users has provided the `page_numbers` kwarg. This is not
     # valid. So raise an error. See: https://github.com/SamEdwardes/spacypdfreader/issues/16
     if "page_numbers" in kwargs:
-        raise ValueError(
-            "The `page_numbers` kwarg is not valid when using the pdfminer parser. "
-            "Please use `page_range` instead. For example: ",
-            "``",
-        )
+        raise ValueError(*_PAGE_NUMBERS_ERROR)
 
     # pdfminer uses zero indexed page numbers. Therefore need to remove 1
     # from the page count.
@@ -70,6 +79,88 @@ def parser(pdf_path: str, page_number: int, **kwargs):
     return text
 
 
+def batch_parser(pdf_path: str, pages: Iterable[int], **kwargs) -> List[str]:
+    """Convert several PDF pages to text in a single pass with pdfminer.
+
+    Calling [`parser`][spacypdfreader.parsers.pdfminer.parser] once per page
+    re-opens and re-parses the entire PDF every time, which is roughly O(n^2)
+    work for an n page document. `batch_parser` parses the document a single
+    time and returns the text for each requested page.
+
+    `spacypdfreader.pdf_reader` finds it as `parser.batch_parser` and prefers it.
+
+    Parameters:
+        pdf_path: Path to a PDF file.
+        pages: One indexed page numbers to convert (e.g. the first page of the
+            PDF is page 1, as opposed to page 0).
+        **kwargs: Forwarded to this module's single-pass extractor, which only
+            accepts `password`, `maxpages`, `caching`, `codec` and `laparams`
+            (the same names `pdfminer.high_level.extract_text` uses).
+
+    Returns:
+        A list of strings, one per page, in the same order as `pages`.
+
+    Raises:
+        ValueError: If the unsupported `page_numbers` kwarg is supplied.
+    """
+    if "page_numbers" in kwargs:
+        raise ValueError(*_PAGE_NUMBERS_ERROR)
+
+    # pdfminer uses zero indexed page numbers.
+    zero_indexed = [page - 1 for page in pages]
+    text_by_page = _extract_text_per_page(pdf_path, zero_indexed, **kwargs)
+    return [text_by_page[page] for page in zero_indexed]
+
+
+def _extract_text_per_page(
+    pdf_path: str,
+    page_numbers: Iterable[int],
+    password: str = "",
+    maxpages: int = 0,
+    caching: bool = True,
+    codec: str = "utf-8",
+    laparams: LAParams = None,
+) -> Dict[int, str]:
+    """Extract text from the requested (zero indexed) pages in a single pass.
+
+    Mirrors `pdfminer.high_level.extract_text` but parses the document once.
+    Every requested page gets a key; a page the document never yields (out of
+    range, or past the `maxpages` cut-off) maps to an empty string.
+    """
+    if laparams is None:
+        laparams = LAParams()
+
+    # Pre-fill so callers can look up any requested page without a KeyError.
+    text_by_page: Dict[int, str] = {number: "" for number in page_numbers}
+    if not text_by_page:
+        return text_by_page
+
+    with open(pdf_path, "rb") as in_file:
+        rsrcmgr = PDFResourceManager(caching=caching)
+        # Number pages ourselves: zip() against a filtered stream can mis-pair.
+        pdf_pages = PDFPage.get_pages(in_file, password=password, caching=caching)
+        for page_number, page in enumerate(pdf_pages):
+            if page_number not in text_by_page:
+                continue
+            with StringIO() as output:
+                device = TextConverter(
+                    rsrcmgr, output, codec=codec, laparams=laparams
+                )
+                PDFPageInterpreter(rsrcmgr, device).process_page(page)
+                text_by_page[page_number] = output.getvalue()
+                device.close()
+            # Mirror the `maxpages` cut-off `PDFPage.get_pages` applies itself.
+            if maxpages and maxpages <= page_number + 1:
+                break
+
+    return text_by_page
+
+
+# Expose the single-pass extractor on the per-page parser so that
+# `spacypdfreader.pdf_reader` can discover and prefer it.
+parser.batch_parser = batch_parser
+
+
 class PdfminerParser:
     """This class has bee included for backwards compatibility. Do not use."""
 

diff --git a/src/spacypdfreader/spacypdfreader.py b/src/spacypdfreader/spacypdfreader.py
@@ -1,7 +1,7 @@
 import os
 import warnings
 from functools import partial
-from multiprocessing.pool import ThreadPool as Pool
+from multiprocessing.pool import ThreadPool
 from typing import Any, Callable, Iterable, Optional
 
 import spacy
@@ -176,25 +176,37 @@ def pdf_reader(
         console.print(f"PDF contains {num_pages} pages.")
         console.print(f"Extracting text from {start_page} to {end_page}...")
 
-    # Handle multiprocessing
-    if n_processes:
-        with Pool(n_processes) as p:
+    # Extract the text from each page.
+    page_numbers = list(range(start_page, end_page + 1))
+    batch_parser = getattr(pdf_parser, "batch_parser", None)
+
+    if batch_parser is not None:
+        # Single-pass parsers (e.g. pdfminer) parse the whole PDF once instead
+        # of re-opening and re-parsing it for every page. This is faster than
+        # parallelising the per-page calls, so multiprocessing is not needed
+        # for extraction here.
+        texts = batch_parser(pdf_path, page_numbers, **kwargs)
+    elif n_processes:
+        # Per-page parsers (e.g. pytesseract OCR) that release the GIL during
+        # subprocess / native calls benefit from a thread pool. Note that pure
+        # python, CPU-bound parsers will see little benefit due to the GIL.
+        with ThreadPool(n_processes) as pool:
             partial_worker = partial(pdf_parser, pdf_path, **kwargs)
-            args = list(range(start_page, end_page + 1))
-            texts = p.map(partial_worker, args)
-
-    # Handle non-multiprocessing
+            texts = pool.map(partial_worker, page_numbers)
     else:
-        texts = []
-        for page_num in range(start_page, end_page + 1):
-            text = pdf_parser(pdf_path=pdf_path, page_number=page_num, **kwargs)
-            texts.append(text)
-
-    # Convert text to spaCy Doc objects.
+        texts = [
+            pdf_parser(pdf_path=pdf_path, page_number=page_num, **kwargs)
+            for page_num in page_numbers
+        ]
+
+    # Convert text to spaCy Doc objects. `nlp.pipe` is kept single process on
+    # purpose: spaCy's multiprocessing has a large static cost and only pays
+    # off for thousands of texts, whereas a PDF yields just one text per page,
+    # so `n_process > 1` would slow down typical documents.
     if verbose:
         console.print("Converting text to [blue bold]spaCy[/] Doc...")
 
-    docs = [doc for doc in nlp.pipe(texts)]
+    docs = list(nlp.pipe(texts))
     for idx, doc in enumerate(docs):
         page_num = idx + start_page
         for token in doc: