From 397633f120aadfa054d6a1d0e98b86e0cfc2533b Mon Sep 17 00:00:00 2001 From: Seyed Yahya Shirazi Date: Tue, 9 Jun 2026 21:17:01 -0700 Subject: [PATCH 1/2] fix(citations): drop impossible pre-publication citation buckets OpenAlex citing works sometimes carry bad publication years, producing citation counts in years before the paper existed (e.g. the BIDS 2016 paper showed a 2013 bucket). Floor each canonical paper's histogram at its earliest version publication year. - resolve_work returns the work id AND publication_year (ResolvedWork). - sync_citing_papers takes the minimum publication year across the version group (the preprint, when present) and drops any count in earlier years. Tests: pre-publication bucket dropped; floor uses the earliest version year. --- src/knowledge/openalex_citations.py | 29 +++++-- src/knowledge/papers_sync.py | 16 +++- .../test_knowledge/test_openalex_citations.py | 25 ++++-- tests/test_knowledge/test_papers_sync.py | 82 +++++++++++++++++++ 4 files changed, 138 insertions(+), 14 deletions(-) diff --git a/src/knowledge/openalex_citations.py b/src/knowledge/openalex_citations.py index 3bdfeb8..9b5d9f8 100644 --- a/src/knowledge/openalex_citations.py +++ b/src/knowledge/openalex_citations.py @@ -38,6 +38,14 @@ class CitingPaper: url: str +@dataclass +class ResolvedWork: + """An OpenAlex work id paired with its publication year.""" + + work_id: str + publication_year: int | None + + def _strip_id(value: str | None) -> str: """Reduce an OpenAlex IRI (https://openalex.org/W123) to its bare id.""" if not value: @@ -91,18 +99,29 @@ def _params(self, **extra: object) -> dict[str, object]: params["api_key"] = self._api_key return params - def resolve_work_id(self, doi: str) -> str | None: - """Resolve a DOI to its OpenAlex work id (e.g. ``W2128495200``).""" + def resolve_work(self, doi: str) -> ResolvedWork | None: + """Resolve a DOI to its OpenAlex work id and publication year. + + The year lets callers floor a citation histogram at the paper's own + publication, dropping impossible pre-publication citation buckets. + """ resp = self._client.get( f"{OPENALEX_BASE}/works/doi:{doi}", - params=self._params(select="id"), + params=self._params(select="id,publication_year"), ) if resp.status_code == 404: logger.warning("OpenAlex has no work for DOI %s", doi) return None resp.raise_for_status() - work_id = _strip_id(resp.json().get("id")) - return work_id or None + data = resp.json() + work_id = _strip_id(data.get("id")) + if not work_id: + return None + year = data.get("publication_year") + return ResolvedWork( + work_id=work_id, + publication_year=year if isinstance(year, int) else None, + ) @staticmethod def _cites_filter(work_ids: str | Sequence[str]) -> str: diff --git a/src/knowledge/papers_sync.py b/src/knowledge/papers_sync.py index c0d2f69..3452ded 100644 --- a/src/knowledge/papers_sync.py +++ b/src/knowledge/papers_sync.py @@ -450,15 +450,25 @@ def sync_citing_papers( for doi in dois: try: # Resolve the primary DOI plus any version aliases to a group of - # OpenAlex work ids; citations across the group are merged. + # OpenAlex works; citations across the group are merged. group_dois = [doi, *aliases.get(doi, [])] - work_ids = [wid for d in group_dois if (wid := client.resolve_work_id(d))] - if not work_ids: + resolved = [w for d in group_dois if (w := client.resolve_work(d))] + if not resolved: logger.warning("Skipping citations: cannot resolve DOI %s", doi) continue + work_ids = [w.work_id for w in resolved] + + # The earliest publication year across the version group (the + # preprint, if any) is the floor: a paper cannot be cited before + # it exists, so drop impossible pre-publication buckets that come + # from citing works with bad dates. + pub_years = [w.publication_year for w in resolved if w.publication_year] + floor_year = min(pub_years) if pub_years else None # 1. Complete per-year counts (source of truth for the chart). counts = client.counts_by_year(work_ids) + if floor_year is not None: + counts = {y: c for y, c in counts.items() if y >= floor_year} if not counts: # A canonical paper with zero citations is implausible; an # empty histogram almost always means a transient OpenAlex diff --git a/tests/test_knowledge/test_openalex_citations.py b/tests/test_knowledge/test_openalex_citations.py index dd2f921..8ea7640 100644 --- a/tests/test_knowledge/test_openalex_citations.py +++ b/tests/test_knowledge/test_openalex_citations.py @@ -48,21 +48,34 @@ def test_strip_doi(self): assert _strip_doi(None) is None -class TestResolveWorkId: - def test_resolves_doi_to_work_id(self): +class TestResolveWork: + def test_resolves_doi_to_work_and_year(self): def handler(request: httpx.Request) -> httpx.Response: assert "/works/doi:10.1/x" in str(request.url) - return httpx.Response(200, json={"id": "https://openalex.org/W999"}) + return httpx.Response( + 200, json={"id": "https://openalex.org/W999", "publication_year": 2019} + ) + + with _client(handler) as c: + resolved = c.resolve_work("10.1/x") + assert resolved.work_id == "W999" + assert resolved.publication_year == 2019 + + def test_missing_year_is_none(self): + def handler(_request: httpx.Request) -> httpx.Response: + return httpx.Response(200, json={"id": "https://openalex.org/W1"}) with _client(handler) as c: - assert c.resolve_work_id("10.1/x") == "W999" + resolved = c.resolve_work("10.1/x") + assert resolved.work_id == "W1" + assert resolved.publication_year is None def test_unresolved_doi_returns_none(self): def handler(_request: httpx.Request) -> httpx.Response: return httpx.Response(404, json={"error": "not found"}) with _client(handler) as c: - assert c.resolve_work_id("10.1/missing") is None + assert c.resolve_work("10.1/missing") is None def test_includes_mailto_param(self): seen = {} @@ -72,7 +85,7 @@ def handler(request: httpx.Request) -> httpx.Response: return httpx.Response(200, json={"id": "https://openalex.org/W1"}) with _client(handler) as c: - c.resolve_work_id("10.1/x") + c.resolve_work("10.1/x") assert seen["mailto"] == "t@example.org" diff --git a/tests/test_knowledge/test_papers_sync.py b/tests/test_knowledge/test_papers_sync.py index ad6ce47..a34b542 100644 --- a/tests/test_knowledge/test_papers_sync.py +++ b/tests/test_knowledge/test_papers_sync.py @@ -413,6 +413,88 @@ def factory(**_kwargs): assert stored == 0 assert stats.total == 0 + def test_drops_prepublication_citations(self, tmp_path: Path, monkeypatch) -> None: + # The work was published in 2016; a citing bucket dated 2013 is + # impossible (bad OpenAlex date) and must be dropped from the histogram. + def handler(request: httpx.Request) -> httpx.Response: + if "/works/doi:" in str(request.url): + return httpx.Response( + 200, json={"id": "https://openalex.org/W1", "publication_year": 2016} + ) + if request.url.params.get("group_by"): + return httpx.Response( + 200, + json={ + "group_by": [ + {"key": "2013", "count": 2}, # before publication + {"key": "2016", "count": 5}, + {"key": "2020", "count": 9}, + ] + }, + ) + return httpx.Response(200, json={"meta": {"next_cursor": None}, "results": []}) + + def factory(**_kwargs): + return OpenAlexCitationClient( + client=httpx.Client(transport=httpx.MockTransport(handler)) + ) + + monkeypatch.setattr(ps, "OpenAlexCitationClient", factory) + + db_path = tmp_path / "knowledge" / "test.db" + with patch("src.knowledge.db.get_db_path", return_value=db_path): + init_db("test") + sync_citing_papers(["10.1/canon"], project="test") + stats = get_citation_stats("test") + + assert stats.by_paper == {"10.1/canon": {"2016": 5, "2020": 9}} + assert "2013" not in stats.per_year + + def test_floor_is_earliest_version_year(self, tmp_path: Path, monkeypatch) -> None: + # Primary published 2025, preprint 2024 -> floor is 2024 (the preprint). + def handler(request: httpx.Request) -> httpx.Response: + url = str(request.url) + if "/works/doi:10.1234/published" in url: + return httpx.Response( + 200, json={"id": "https://openalex.org/W1", "publication_year": 2025} + ) + if "/works/doi:10.1101/preprint" in url: + return httpx.Response( + 200, json={"id": "https://openalex.org/W2", "publication_year": 2024} + ) + if request.url.params.get("group_by"): + return httpx.Response( + 200, + json={ + "group_by": [ + {"key": "2023", "count": 3}, # before the preprint + {"key": "2024", "count": 7}, + {"key": "2025", "count": 11}, + ] + }, + ) + return httpx.Response(200, json={"meta": {"next_cursor": None}, "results": []}) + + def factory(**_kwargs): + return OpenAlexCitationClient( + client=httpx.Client(transport=httpx.MockTransport(handler)) + ) + + monkeypatch.setattr(ps, "OpenAlexCitationClient", factory) + + db_path = tmp_path / "knowledge" / "test.db" + with patch("src.knowledge.db.get_db_path", return_value=db_path): + init_db("test") + sync_citing_papers( + ["10.1234/published"], + project="test", + aliases={"10.1234/published": ["10.1101/preprint"]}, + ) + stats = get_citation_stats("test") + + # 2023 (before the 2024 preprint) dropped; 2024+ kept. + assert stats.by_paper == {"10.1234/published": {"2024": 7, "2025": 11}} + def test_version_aliases_merge_into_primary(self, tmp_path: Path, monkeypatch) -> None: # Primary + preprint resolve to W1/W2; counts are queried as a group and # attributed to the primary DOI. From a28297b6415d1bb025b2e3e051b68a7a3210b83b Mon Sep 17 00:00:00 2001 From: Seyed Yahya Shirazi Date: Tue, 9 Jun 2026 21:19:21 -0700 Subject: [PATCH 2/2] fix(citations): address review on pre-publication floor - Use 'is not None' for publication_year filter (don't drop a year-0 edge). - Test that an over-high floor (bogus future year) leaves existing counts intact via the empty-counts guard rather than wiping them. --- src/knowledge/papers_sync.py | 2 +- tests/test_knowledge/test_papers_sync.py | 30 ++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/knowledge/papers_sync.py b/src/knowledge/papers_sync.py index 3452ded..b7181b5 100644 --- a/src/knowledge/papers_sync.py +++ b/src/knowledge/papers_sync.py @@ -462,7 +462,7 @@ def sync_citing_papers( # preprint, if any) is the floor: a paper cannot be cited before # it exists, so drop impossible pre-publication buckets that come # from citing works with bad dates. - pub_years = [w.publication_year for w in resolved if w.publication_year] + pub_years = [w.publication_year for w in resolved if w.publication_year is not None] floor_year = min(pub_years) if pub_years else None # 1. Complete per-year counts (source of truth for the chart). diff --git a/tests/test_knowledge/test_papers_sync.py b/tests/test_knowledge/test_papers_sync.py index a34b542..881707e 100644 --- a/tests/test_knowledge/test_papers_sync.py +++ b/tests/test_knowledge/test_papers_sync.py @@ -450,6 +450,36 @@ def factory(**_kwargs): assert stats.by_paper == {"10.1/canon": {"2016": 5, "2020": 9}} assert "2013" not in stats.per_year + def test_over_aggressive_floor_preserves_existing(self, tmp_path: Path, monkeypatch) -> None: + # If OpenAlex reports a bogus future year for the canonical work, every + # current bucket is floored out; the empty-counts guard must then keep + # the existing histogram rather than wiping it. + def handler(request: httpx.Request) -> httpx.Response: + if "/works/doi:" in str(request.url): + return httpx.Response( + 200, json={"id": "https://openalex.org/W1", "publication_year": 2099} + ) + if request.url.params.get("group_by"): + return httpx.Response(200, json={"group_by": [{"key": "2024", "count": 10}]}) + return httpx.Response(200, json={"meta": {"next_cursor": None}, "results": []}) + + def factory(**_kwargs): + return OpenAlexCitationClient( + client=httpx.Client(transport=httpx.MockTransport(handler)) + ) + + monkeypatch.setattr(ps, "OpenAlexCitationClient", factory) + + db_path = tmp_path / "knowledge" / "test.db" + with patch("src.knowledge.db.get_db_path", return_value=db_path): + init_db("test") + replace_citation_counts("10.1/canon", {2024: 50}, project="test") + sync_citing_papers(["10.1/canon"], project="test") + stats = get_citation_stats("test") + + # Existing data preserved, not wiped to empty by the over-high floor. + assert stats.by_paper == {"10.1/canon": {"2024": 50}} + def test_floor_is_earliest_version_year(self, tmp_path: Path, monkeypatch) -> None: # Primary published 2025, preprint 2024 -> floor is 2024 (the preprint). def handler(request: httpx.Request) -> httpx.Response: