diff --git a/.github/workflows/dts-validator.yml b/.github/workflows/dts-validator.yml index dfde4a1..b41d42c 100644 --- a/.github/workflows/dts-validator.yml +++ b/.github/workflows/dts-validator.yml @@ -2,8 +2,6 @@ name: DTS-Validator on: push: - branches: - - main pull_request: jobs: @@ -39,10 +37,10 @@ jobs: run: sleep 5 # Adjust as needed - name: Install and run tests - run: | + run: | cd dts-validator python -m venv env source env/bin/activate pip install poetry poetry install - pytest --entry-endpoint=http://localhost:5000 \ No newline at end of file + pytest --entry-endpoint=http://localhost:5000 diff --git a/dapytains/processor.py b/dapytains/processor.py index c6ee05d..0eaf509 100644 --- a/dapytains/processor.py +++ b/dapytains/processor.py @@ -7,7 +7,7 @@ try: if saxon_version == "PE": import saxoncpe as saxonlib - elif saxon_version == "PE": + elif saxon_version == "EE": import saxoncee as saxonlib else: import saxonche as saxonlib diff --git a/dapytains/tei/citeStructure.py b/dapytains/tei/citeStructure.py index 7e87756..1c31f1f 100644 --- a/dapytains/tei/citeStructure.py +++ b/dapytains/tei/citeStructure.py @@ -2,9 +2,10 @@ from typing import Dict, List, Optional from dataclasses import dataclass, field from collections import namedtuple, defaultdict -from functools import cmp_to_key from dapytains.processor import get_xpath_proc, saxonlib +_pos_re = re.compile(r'\[(\d+)\]') + @dataclass class CiteData: @@ -281,15 +282,6 @@ def find_refs_from_branches( units = [] xpath_prefix = "./" if unit else "" - # Custom comparison function to compare nodes by document order - def compare_nodes_by_doc_order(node1, node2): - # Check if node1 precedes node2 in document order - precedes = xpath_proc.evaluate_single(f'{node1.xpath} << {node2.xpath}').string_value - if precedes == "true": - return -1 # node1 comes before node2 - - return 1 - unsorted = [] for s in structure: unsorted.extend( @@ -303,7 +295,19 @@ def compare_nodes_by_doc_order(node1, node2): _simple_node(ref, self.generate_xpath(ref), struct) for ref, struct in unsorted ] - unsorted = sorted(unsorted, key=cmp_to_key(compare_nodes_by_doc_order)) + # Generate a positional path key for each node once (O(n) JVM calls) and sort + # natively, rather than calling back into Saxon for every pairwise comparison + # (which would cost O(n log n) JVM round-trips). + def _doc_order_key(node): + # Count ALL preceding element siblings (not just same-name) so that + # mixed-name siblings at the same level sort in document order. + path_str = str(xpath_proc.evaluate_single( + f"string-join(for $n in ({node.xpath}/ancestor-or-self::*) " + f"return concat('/', name($n), '[', 1 + count($n/preceding-sibling::*), ']'), '')" + )) + return tuple(int(x) for x in _pos_re.findall(path_str)) + + unsorted = sorted(unsorted, key=_doc_order_key) units = [] for elem in unsorted: diff --git a/dapytains/tei/document.py b/dapytains/tei/document.py index 68934ec..2d24e74 100644 --- a/dapytains/tei/document.py +++ b/dapytains/tei/document.py @@ -37,21 +37,19 @@ def xpath_walk(xpath: List[str]) -> Tuple[str, List[str], List[str]]: return current_filled, queue, [xpath[0]] if len(xpath) > 1 else [] -def is_traversing_xpath(parent: saxonlib.PyXdmNode, xpath: str, processor: saxonlib.PySaxonProcessor) -> bool: +def is_traversing_xpath(xpath_proc: saxonlib.PyXPathProcessor, xpath: str) -> bool: """ Check if an XPath is traversing more than one level - :param parent: + :param xpath_proc: XPath processor with context already set to the parent node :param xpath: :return: """ - xpath_proc = get_xpath_proc(parent, processor=processor) if xpath.startswith(".//"): # If the XPath starts with .//, we try to see if we have a direct child that matches drct_xpath = xpath.replace(".//", "./", 1) if xpath_proc.effective_boolean_value(f"head({xpath}) is head({drct_xpath})"): return False - else: - return True + return True return False @@ -70,7 +68,7 @@ def xpath_walk_step(parent: saxonlib.PyXdmNode, xpath: str, processor: saxonlib. xpath_proc = get_xpath_proc(parent, processor=processor) # We check first for loops, because that changes the xpath if xpath.startswith(".//"): - if is_traversing_xpath(parent, xpath, processor=processor): + if is_traversing_xpath(xpath_proc, xpath): return xpath_proc.evaluate_single(f"./*[{xpath}]"), True else: return xpath_proc.evaluate_single(xpath), False @@ -174,11 +172,12 @@ def copy_node( _add_space_tail(element, node, processor=processor) return element elif parent is not None: - if not parent.getchildren(): + existing = parent.getchildren() + if not existing: if not isinstance(parent, (StringElement, ObjectifiedElement)): parent.text = unescape((parent.text or "") + element) else: - parent.getchildren()[-1].tail = unescape(element) + existing[-1].tail = unescape(element) return parent if node is None: @@ -373,7 +372,7 @@ def reconstruct_doc( # Given that both XPath returns the same node, we still need to check if end is looping # We optimize by avoiding this check when start and end are the same - if start_xpath != end_xpath and is_traversing_xpath(root, current_end, processor=processor): + if start_xpath != end_xpath and is_traversing_xpath(xpath_proc, current_end): queue_end = end_xpath # If we have a child XPath, then continue the job @@ -407,7 +406,7 @@ def reconstruct_doc( # Given that both XPath returns the same node, we still need to check if end is looping # We optimize by avoiding this check when start and end are the same - if start_xpath != end_xpath and is_traversing_xpath(root, current_end, processor=processor): + if start_xpath != end_xpath and is_traversing_xpath(xpath_proc, current_end): queue_end = end_xpath reconstruct_doc( @@ -434,7 +433,7 @@ def reconstruct_doc( # Given that both XPath returns the same node, we still need to check if end is looping # We optimize by avoiding this check when start and end are the same - if start_xpath != end_xpath and is_traversing_xpath(root, current_end, processor=processor): + if start_xpath != end_xpath and is_traversing_xpath(xpath_proc, current_end): queue_end = end_xpath new_tree = reconstruct_doc( @@ -486,8 +485,7 @@ def reconstruct_doc( sib_current_end = clean_xpath_for_following(current_end, end_is_traversing) # We look for siblings between start and end matches - xpath = get_xpath_proc(root, processor=processor) - for sibling in xpath_eval(xpath, f"./node()[preceding-sibling::{sib_current_start} and following-sibling::{sib_current_end}]"): + for sibling in xpath_eval(xpath_proc, f"./node()[preceding-sibling::{sib_current_start} and following-sibling::{sib_current_end}]"): copy_node(sibling, include_children=True, parent=new_tree, processor=processor) # Here we reached the end, logically. @@ -622,7 +620,7 @@ def get_passage(self, ref_or_start: Optional[str], end: Optional[str] = None, tr return root def get_reffs(self, tree: Optional[str] = None): - tree = self.citeStructure[tree or self.default_tree] + tree: CiteStructureParser = self.citeStructure[tree or self.default_tree] return tree.find_refs(root=self.xml, structure=tree.structure) def get_next(self, tree, unit) -> Optional[CitableUnit]: diff --git a/tests/test_parallel_app.py b/tests/test_parallel_app.py new file mode 100644 index 0000000..c77df4a --- /dev/null +++ b/tests/test_parallel_app.py @@ -0,0 +1,241 @@ +""" +Parallel-worker tests for the Flask app. + +Strategy: spawn 2 Flask server subprocesses on separate ports (simulating 2 +gunicorn prefork workers — each has its own Python process and JVM instance), +then hammer both with concurrent HTTP requests via requests + ThreadPoolExecutor. + +NOTE: Saxon's C/JVM bindings are NOT thread-safe; concurrent Saxon calls from +the *same* process will crash. The safe parallelism model is one OS process per +concurrent client, which is exactly what prefork servers (gunicorn, uwsgi) use. +These tests exercise that model directly. +""" +import concurrent.futures +import os +import socket +import sys +import tempfile +import time +import urllib.parse + +import pytest +import requests + +_basedir = os.path.abspath(os.path.dirname(__file__)) +_project_root = os.path.dirname(_basedir) +_catalog = os.path.join(_basedir, "catalog", "example-collection.xml") + +# Resources present in the test catalog +_RESOURCE_TEXT = "https://foo.bar/text" # base_tei.xml (Luke/Mark) +_RESOURCE_MULTI = "https://example.org/resource1" # multiple_tree.xml + +# ───────────────────────────────────────────────────────────────────────────── +# Helpers +# ───────────────────────────────────────────────────────────────────────────── + +def _free_port() -> int: + with socket.socket() as s: + s.bind(("127.0.0.1", 0)) + return s.getsockname()[1] + + +_SERVER_SCRIPT = """\ +import sys, os +sys.path.insert(0, sys.argv[1]) # project root +port = int(sys.argv[2]) +db_path = sys.argv[3] +catalog = sys.argv[4] + +from flask import Flask +from dapytains.app.app import create_app +from dapytains.app.ingest import store_catalog +from dapytains.metadata.xml_parser import parse + +app = Flask(__name__) +app, db = create_app(app) +app.config["SQLALCHEMY_DATABASE_URI"] = f"sqlite:///{db_path}" +app.config["SQLALCHEMY_TRACK_MODIFICATIONS"] = False +db.init_app(app) + +with app.app_context(): + db.create_all() + catalog_data, _ = parse(catalog) + store_catalog(catalog_data) + +# threaded=False: one request at a time per process — safe with Saxon +app.run(host="127.0.0.1", port=port, threaded=False, use_reloader=False) +""" + + +def _start_server(port: int, db_path: str): + """Spawn a Flask server subprocess and return the Popen handle.""" + import subprocess + return subprocess.Popen( + [sys.executable, "-c", _SERVER_SCRIPT, + _project_root, str(port), db_path, _catalog], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + + +def _wait_ready(port: int, retries: int = 30, delay: float = 0.3) -> None: + """Poll until the server accepts connections or raise TimeoutError.""" + for _ in range(retries): + try: + requests.get(f"http://127.0.0.1:{port}/", timeout=1) + return + except requests.exceptions.ConnectionError: + time.sleep(delay) + raise TimeoutError(f"Server on port {port} did not start in time") + + +# ───────────────────────────────────────────────────────────────────────────── +# Fixtures +# ───────────────────────────────────────────────────────────────────────────── + +@pytest.fixture(scope="module") +def two_workers(): + """Start 2 Flask worker processes and yield their base URLs.""" + import subprocess + + tmpdir = tempfile.mkdtemp() + port1, port2 = _free_port(), _free_port() + db1 = os.path.join(tmpdir, "worker1.db") + db2 = os.path.join(tmpdir, "worker2.db") + + proc1 = _start_server(port1, db1) + proc2 = _start_server(port2, db2) + + try: + _wait_ready(port1) + _wait_ready(port2) + yield [f"http://127.0.0.1:{port1}", f"http://127.0.0.1:{port2}"] + finally: + proc1.terminate() + proc2.terminate() + proc1.wait(timeout=10) + proc2.wait(timeout=10) + + +# ───────────────────────────────────────────────────────────────────────────── +# Reference responses (computed against a single worker during setup) +# ───────────────────────────────────────────────────────────────────────────── + +def _doc_url(base: str, resource: str, **kwargs) -> str: + params = {"resource": resource, **kwargs} + return f"{base}/document/?{urllib.parse.urlencode(params)}" + + +def _nav_url(base: str, resource: str, **kwargs) -> str: + params = {"resource": resource, **kwargs} + return f"{base}/navigation/?{urllib.parse.urlencode(params)}" + + +# ───────────────────────────────────────────────────────────────────────────── +# Tests +# ───────────────────────────────────────────────────────────────────────────── + +class TestTwoWorkersParallel: + + def test_workers_are_up(self, two_workers): + """Both workers answer the entry-point.""" + for base in two_workers: + r = requests.get(f"{base}/") + assert r.status_code == 200 + assert "collection" in r.json() + + def test_sequential_passages_per_worker(self, two_workers): + """Each worker handles N sequential document requests correctly.""" + refs = ["Luke 1:1", "Luke 1:2", "Mark 1:1"] + for base in two_workers: + responses = [ + requests.get(_doc_url(base, _RESOURCE_TEXT, ref=ref)) + for ref in refs + ] + for r in responses: + assert r.status_code == 200 + assert r.content # non-empty XML + + def test_parallel_requests_to_both_workers(self, two_workers): + """Send 20 document requests distributed across both workers in parallel. + + Requests use threads only for HTTP I/O; Saxon runs inside separate server + processes so there is no intra-process concurrency on the JVM. + """ + resource = _RESOURCE_TEXT + ref = "Luke 1:1" + urls = [_doc_url(base, resource, ref=ref) for base in two_workers] * 10 + + with concurrent.futures.ThreadPoolExecutor(max_workers=8) as pool: + results = list(pool.map(requests.get, urls)) + + assert all(r.status_code == 200 for r in results), [ + r.text for r in results if r.status_code != 200 + ] + # All responses must be identical XML + bodies = [r.text for r in results] + assert all(b == bodies[0] for b in bodies), "inconsistent XML across workers" + + def test_parallel_range_requests(self, two_workers): + """Range passages delivered in parallel from both workers are consistent.""" + urls = [ + _doc_url(base, _RESOURCE_TEXT, start="Luke 1:1", end="Luke 1#1") + for base in two_workers + ] * 8 + + with concurrent.futures.ThreadPoolExecutor(max_workers=8) as pool: + results = list(pool.map(requests.get, urls)) + + assert all(r.status_code == 200 for r in results) + bodies = [r.text for r in results] + assert all(b == bodies[0] for b in bodies), "range XML must be identical across workers" + + def test_parallel_navigation_requests(self, two_workers): + """Navigation endpoint delivers consistent results in parallel from both workers.""" + urls = [_nav_url(base, _RESOURCE_TEXT, down=1) for base in two_workers] * 8 + + with concurrent.futures.ThreadPoolExecutor(max_workers=8) as pool: + results = list(pool.map(requests.get, urls)) + + assert all(r.status_code == 200 for r in results) + # member identifiers must be identical across workers + member_sets = [ + {m["identifier"] for m in r.json().get("member", [])} + for r in results + ] + assert all(m == member_sets[0] for m in member_sets) + + def test_parallel_mixed_requests(self, two_workers): + """Mix of single-passage, range, and navigation requests in parallel.""" + base1, base2 = two_workers + urls = ( + [_doc_url(base1, _RESOURCE_TEXT, ref="Luke 1:1")] * 4 + + [_doc_url(base2, _RESOURCE_TEXT, ref="Luke 1:2")] * 4 + + [_doc_url(base1, _RESOURCE_TEXT, start="Luke 1:1", end="Luke 1#1")] * 4 + + [_nav_url(base2, _RESOURCE_TEXT, down=1)] * 4 + ) + + with concurrent.futures.ThreadPoolExecutor(max_workers=8) as pool: + results = list(pool.map(requests.get, urls)) + + assert all(r.status_code == 200 for r in results), [ + (r.url, r.status_code, r.text[:120]) for r in results if r.status_code != 200 + ] + + def test_repeated_parallel_bursts(self, two_workers): + """Run several bursts of parallel requests; state must stay consistent between bursts.""" + resource = _RESOURCE_TEXT + ref = "Luke 1:1" + urls = [_doc_url(base, resource, ref=ref) for base in two_workers] * 5 + + reference_body = None + for burst in range(3): + with concurrent.futures.ThreadPoolExecutor(max_workers=6) as pool: + results = list(pool.map(requests.get, urls)) + assert all(r.status_code == 200 for r in results), f"burst {burst} had failures" + burst_body = results[0].text + if reference_body is None: + reference_body = burst_body + assert all(r.text == reference_body for r in results), ( + f"burst {burst}: response drifted from the reference" + ) diff --git a/tests/test_successive_calls.py b/tests/test_successive_calls.py new file mode 100644 index 0000000..ec9c844 --- /dev/null +++ b/tests/test_successive_calls.py @@ -0,0 +1,317 @@ +""" +Tests for processor-reuse stability: verifies that sharing or reusing Saxon +XPath/XQuery processors across successive calls and across multiple Document +instances does not corrupt context or results. +""" +import os.path + +from lxml.etree import tostring + +from dapytains.processor import get_processor, get_xpath_proc +from dapytains.tei.citeStructure import CiteStructureParser +from dapytains.tei.document import Document + +local_dir = os.path.join(os.path.dirname(__file__), "tei") +_project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + + + + +# ───────────────────────────────────────────────────────────── +# Successive calls on the SAME Document instance +# ───────────────────────────────────────────────────────────── + +def test_successive_single_passages(): + """get_passage called repeatedly on one Document must give stable results.""" + doc = Document(f"{local_dir}/base_tei.xml") + expected_1_1 = tostring(doc.get_passage("Luke 1:1"), encoding=str) + expected_1_2 = tostring(doc.get_passage("Luke 1:2"), encoding=str) + + # Call again — shared XPath processor state must not bleed between calls + assert tostring(doc.get_passage("Luke 1:1"), encoding=str) == expected_1_1 + assert tostring(doc.get_passage("Luke 1:2"), encoding=str) == expected_1_2 + assert tostring(doc.get_passage("Luke 1:1"), encoding=str) == expected_1_1 + + +def test_successive_range_then_single(): + """A range call followed by a single-ref call must not corrupt context. + + reconstruct_doc calls xpath_proc.set_context(result_end) mid-function; + this test confirms that state does not leak into the next call. + """ + doc = Document(f"{local_dir}/base_tei.xml") + expected_single = tostring(doc.get_passage("Luke 1:1"), encoding=str) + expected_range = tostring(doc.get_passage("Luke 1:1", "Luke 1#1"), encoding=str) + + for _ in range(3): + assert tostring(doc.get_passage("Luke 1:1", "Luke 1#1"), encoding=str) == expected_range + assert tostring(doc.get_passage("Luke 1:1"), encoding=str) == expected_single + + +def test_successive_reffs_and_passage(): + """get_reffs and get_passage interleaved on the same Document must be stable.""" + doc = Document(f"{local_dir}/base_tei.xml") + reffs_1 = [r.ref for r in doc.get_reffs()] + passage = tostring(doc.get_passage("Luke 1:1"), encoding=str) + reffs_2 = [r.ref for r in doc.get_reffs()] + + assert reffs_1 == reffs_2, ( + "get_reffs must return the same refs before and after get_passage" + ) + assert tostring(doc.get_passage("Luke 1:1"), encoding=str) == passage + + +def test_successive_calls_with_traversing_xpath(): + """Documents using .// XPaths exercise is_traversing_xpath; calls must be stable.""" + doc = Document(f"{local_dir}/tei_with_two_traversing_with_n.xml") + expected_wide = tostring(doc.get_passage("Luke 1:1", "Luke 1#3"), encoding=str) + expected_narrow = tostring(doc.get_passage("Luke 1:1", "Luke 1#1"), encoding=str) + + for _ in range(3): + assert tostring(doc.get_passage("Luke 1:1", "Luke 1#3"), encoding=str) == expected_wide + assert tostring(doc.get_passage("Luke 1:1", "Luke 1#1"), encoding=str) == expected_narrow + + +def test_successive_calls_lb_milestone(): + """Milestone (lb) documents: successive range and single calls stay consistent.""" + doc = Document(f"{local_dir}/lb_same_ab.xml") + single = tostring(doc.get_passage("2"), encoding=str) + rng = tostring(doc.get_passage("2", "4"), encoding=str) + last = tostring(doc.get_passage("5"), encoding=str) + + for _ in range(3): + assert tostring(doc.get_passage("2"), encoding=str) == single + assert tostring(doc.get_passage("2", "4"), encoding=str) == rng + assert tostring(doc.get_passage("5"), encoding=str) == last + + +def test_successive_calls_multiple_trees(): + """Switching between trees on the same Document must always return correct results.""" + doc = Document(f"{local_dir}/multiple_tree.xml") + default_passage = tostring(doc.get_passage("I"), encoding=str) + alpha_passage = tostring(doc.get_passage("div-002", tree="alpha"), encoding=str) + + for _ in range(3): + assert tostring(doc.get_passage("I"), encoding=str) == default_passage + assert tostring(doc.get_passage("div-002", tree="alpha"), encoding=str) == alpha_passage + assert tostring(doc.get_passage("I"), encoding=str) == default_passage + + +# ───────────────────────────────────────────────────────────── +# Multiple Document instances in memory simultaneously +# ───────────────────────────────────────────────────────────── + +def test_two_docs_in_memory_independent(): + """Two Document objects with separate processors must not share state.""" + doc_a = Document(f"{local_dir}/base_tei.xml") + doc_b = Document(f"{local_dir}/simple_doc.xml") + + result_a = tostring(doc_a.get_passage("Luke 1:1"), encoding=str) + result_b = tostring(doc_b.get_passage("1"), encoding=str) + + # Re-query after the other document has been used + assert tostring(doc_a.get_passage("Luke 1:1"), encoding=str) == result_a + assert tostring(doc_b.get_passage("1"), encoding=str) == result_b + + +def test_two_docs_same_file_independent(): + """Two Document objects from the same file must produce identical, independent results.""" + doc_a = Document(f"{local_dir}/base_tei.xml") + doc_b = Document(f"{local_dir}/base_tei.xml") + + single_a = tostring(doc_a.get_passage("Luke 1:1"), encoding=str) + single_b = tostring(doc_b.get_passage("Luke 1:1"), encoding=str) + assert single_a == single_b + + range_a = tostring(doc_a.get_passage("Luke 1:1", "Luke 1#1"), encoding=str) + range_b = tostring(doc_b.get_passage("Luke 1:1", "Luke 1#1"), encoding=str) + assert range_a == range_b + + # Using doc_a must not affect doc_b + tostring(doc_a.get_passage("Luke 1:2"), encoding=str) + assert tostring(doc_b.get_passage("Luke 1:1"), encoding=str) == single_b + assert tostring(doc_b.get_passage("Luke 1:1", "Luke 1#1"), encoding=str) == range_b + + +def test_interleaved_calls_two_different_docs(): + """Interleave passage calls between two structurally different documents.""" + doc_a = Document(f"{local_dir}/base_tei.xml") + doc_b = Document(f"{local_dir}/tei_with_two_traversing_with_n.xml") + + expected_a = tostring(doc_a.get_passage("Luke 1:1"), encoding=str) + expected_b = tostring(doc_b.get_passage("Luke 1:1", "Luke 1#3"), encoding=str) + + for _ in range(3): + assert tostring(doc_a.get_passage("Luke 1:1"), encoding=str) == expected_a + assert tostring(doc_b.get_passage("Luke 1:1", "Luke 1#3"), encoding=str) == expected_b + assert tostring(doc_a.get_passage("Luke 1:1"), encoding=str) == expected_a + + +def test_three_docs_in_memory(): + """Three Document objects alive simultaneously all return correct results.""" + doc_a = Document(f"{local_dir}/base_tei.xml") + doc_b = Document(f"{local_dir}/simple_doc.xml") + doc_c = Document(f"{local_dir}/lb_same_ab.xml") + + expected_a = tostring(doc_a.get_passage("Luke 1:1"), encoding=str) + expected_b = tostring(doc_b.get_passage("2", "3"), encoding=str) + expected_c = tostring(doc_c.get_passage("2", "4"), encoding=str) + + # Query in rotated order + assert tostring(doc_c.get_passage("2", "4"), encoding=str) == expected_c + assert tostring(doc_a.get_passage("Luke 1:1"), encoding=str) == expected_a + assert tostring(doc_b.get_passage("2", "3"), encoding=str) == expected_b + assert tostring(doc_b.get_passage("2", "3"), encoding=str) == expected_b + assert tostring(doc_c.get_passage("2", "4"), encoding=str) == expected_c + assert tostring(doc_a.get_passage("Luke 1:1"), encoding=str) == expected_a + + +# ───────────────────────────────────────────────────────────── +# Multiple XML nodes via a SHARED PySaxonProcessor +# ───────────────────────────────────────────────────────────── + +def test_shared_processor_two_docs(): + """Two Documents sharing a PySaxonProcessor must stay independent.""" + processor = get_processor() + doc_a = Document(f"{local_dir}/base_tei.xml", processor=processor) + doc_b = Document(f"{local_dir}/simple_doc.xml", processor=processor) + + result_a = tostring(doc_a.get_passage("Luke 1:1"), encoding=str) + result_b = tostring(doc_b.get_passage("1"), encoding=str) + + for _ in range(3): + assert tostring(doc_a.get_passage("Luke 1:1"), encoding=str) == result_a + assert tostring(doc_b.get_passage("1"), encoding=str) == result_b + + +def test_shared_processor_interleaved_reffs(): + """get_reffs on two docs sharing a processor must return independent, stable results.""" + processor = get_processor() + doc_a = Document(f"{local_dir}/base_tei.xml", processor=processor) + doc_b = Document(f"{local_dir}/simple_doc.xml", processor=processor) + + reffs_a1 = [r.ref for r in doc_a.get_reffs()] + reffs_b1 = [r.ref for r in doc_b.get_reffs()] + + # Query in the opposite order + reffs_b2 = [r.ref for r in doc_b.get_reffs()] + reffs_a2 = [r.ref for r in doc_a.get_reffs()] + + assert reffs_a1 == reffs_a2, "doc_a refs must be stable" + assert reffs_b1 == reffs_b2, "doc_b refs must be stable" + assert set(reffs_a1) != set(reffs_b1), "different documents must have different top-level refs" + + +def test_shared_processor_passage_and_reffs_interleaved(): + """Mix get_passage and get_reffs across two docs on a shared processor.""" + processor = get_processor() + doc_a = Document(f"{local_dir}/base_tei.xml", processor=processor) + doc_b = Document(f"{local_dir}/simple_doc.xml", processor=processor) + + passage_a = tostring(doc_a.get_passage("Luke 1:1"), encoding=str) + reffs_b = [r.ref for r in doc_b.get_reffs()] + passage_b = tostring(doc_b.get_passage("2"), encoding=str) + reffs_a = [r.ref for r in doc_a.get_reffs()] + + assert tostring(doc_a.get_passage("Luke 1:1"), encoding=str) == passage_a + assert [r.ref for r in doc_b.get_reffs()] == reffs_b + assert tostring(doc_b.get_passage("2"), encoding=str) == passage_b + assert [r.ref for r in doc_a.get_reffs()] == reffs_a + + +# ───────────────────────────────────────────────────────────── +# CiteStructureParser successive calls and multiple parsers +# ───────────────────────────────────────────────────────────── + +_MIXED_CHILDREN_XML = """ + + + + + + + + + +
T1
T2
T3
+
A
B
C
+
""" + + +def test_citestructure_successive_find_refs(): + """find_refs called multiple times on the same parser must produce stable results.""" + processor = get_processor() + TEI = processor.parse_xml(xml_text=_MIXED_CHILDREN_XML) + xp = get_xpath_proc(elem=TEI, processor=processor) + parser = CiteStructureParser( + xp.evaluate_single("/TEI/teiHeader/encodingDesc/refsDecl[1]"), + processor=processor + ) + + for _ in range(3): + roots = parser.find_refs(root=TEI, structure=parser.structure) + refs = [r.ref for r in roots] + assert refs == ["Luke", "Mark"], "top-level refs must be stable" + + luke_children = [c.ref for c in roots[0].children[0].children] + assert luke_children == ["Luke 1:1", "Luke 1:2", "Luke 1#1"], ( + "mixed-type children must stay in document order" + ) + + mark_children = [c.ref for c in roots[1].children[0].children] + assert mark_children == ["Mark 1:1", "Mark 1#1", "Mark 1:2"], ( + "mixed-type children in Mark must follow document order" + ) + + +def test_citestructure_two_parsers_shared_processor(): + """Two CiteStructureParsers on different XML nodes sharing a processor must not interfere.""" + processor = get_processor() + + tei_a = processor.parse_xml(xml_text=""" + + + +
A
B
""") + + tei_b = processor.parse_xml(xml_text=""" + + + +
X
Y
""") + + xp_a = get_xpath_proc(elem=tei_a, processor=processor) + xp_b = get_xpath_proc(elem=tei_b, processor=processor) + parser_a = CiteStructureParser( + xp_a.evaluate_single("/TEI/teiHeader/encodingDesc/refsDecl[1]"), processor=processor + ) + parser_b = CiteStructureParser( + xp_b.evaluate_single("/TEI/teiHeader/encodingDesc/refsDecl[1]"), processor=processor + ) + + refs_a = [r.ref for r in parser_a.find_refs(root=tei_a, structure=parser_a.structure)] + refs_b = [r.ref for r in parser_b.find_refs(root=tei_b, structure=parser_b.structure)] + + assert refs_a == ["1", "2"] + assert refs_b == ["alpha", "beta"] + + # Swap query order — must still give correct results + assert [r.ref for r in parser_b.find_refs(root=tei_b, structure=parser_b.structure)] == refs_b + assert [r.ref for r in parser_a.find_refs(root=tei_a, structure=parser_a.structure)] == refs_a + + +def test_citestructure_generate_xpath_stable(): + """generate_xpath must return consistent results across repeated calls.""" + processor = get_processor() + TEI = processor.parse_xml(xml_text=_MIXED_CHILDREN_XML) + xp = get_xpath_proc(elem=TEI, processor=processor) + parser = CiteStructureParser( + xp.evaluate_single("/TEI/teiHeader/encodingDesc/refsDecl[1]"), + processor=processor + ) + + for _ in range(5): + assert parser.generate_xpath("Luke 1:2") == "//body/div[@n='Luke']/div[position()=1]/div[position()=2]" + assert parser.generate_xpath("Mark 1#1") == "//body/div[@n='Mark']/div[position()=1]/l[position()=1]" + assert parser.generate_xpath("Luke") == "//body/div[@n='Luke']" +