diff --git a/src/api/scheduler.py b/src/api/scheduler.py index 555f278..a14e96c 100644 --- a/src/api/scheduler.py +++ b/src/api/scheduler.py @@ -137,6 +137,7 @@ def _run_papers_sync_for_community(community_id: str) -> bool: project=community_id, openalex_api_key=settings.openalex_api_key, openalex_email=settings.openalex_email, + aliases=citations.aliases, ) total += citing_count diff --git a/src/assistants/bids/config.yaml b/src/assistants/bids/config.yaml index 924a660..fa9f670 100644 --- a/src/assistants/bids/config.yaml +++ b/src/assistants/bids/config.yaml @@ -590,6 +590,10 @@ citations: "10.1038/s41597-024-03559-8": "Motion-BIDS (Jeung 2024)" "10.1038/s41597-025-05543-2": "MRS-BIDS (Bouchard 2025)" "10.1371/journal.pcbi.1005209": "BIDS Apps (Gorgolewski 2017)" + # Merge preprint + published versions so split OpenAlex citations accumulate + aliases: + "10.1371/journal.pcbi.1005209": # BIDS Apps published (PLoS Comp Biol) + - "10.1101/079145" # BIDS Apps bioRxiv preprint (2016) # Expose the citation dashboard as a public, read-only JSON feed # (GET /bids/citations). FAQ feed stays off: BIDS has no FAQ pipeline configured. diff --git a/src/assistants/eeglab/config.yaml b/src/assistants/eeglab/config.yaml index c57b51e..31c0d8b 100644 --- a/src/assistants/eeglab/config.yaml +++ b/src/assistants/eeglab/config.yaml @@ -433,6 +433,10 @@ citations: "10.1016/j.neuroimage.2019.05.026": "ICLabel (Pion-Tonachini 2019)" "10.3389/fninf.2015.00016": "PREP (Bigdely-Shamlo 2015)" "10.1162/IMAG.a.136": "LSL (Kothe 2025)" + # Merge preprint + published versions so split OpenAlex citations accumulate + aliases: + "10.1162/IMAG.a.136": # LSL published (Imaging Neuroscience) + - "10.1101/2024.02.13.580071" # LSL bioRxiv preprint (2024) # Expose generated FAQ entries and citation stats as public, read-only JSON feeds # (GET /eeglab/faq and GET /eeglab/citations). Off by default platform-wide. 
diff --git a/src/cli/sync.py b/src/cli/sync.py index f57bbec..a893eed 100644 --- a/src/cli/sync.py +++ b/src/cli/sync.py @@ -99,6 +99,14 @@ def _get_community_paper_dois(community_id: str) -> list[str]: return [] +def _get_community_paper_aliases(community_id: str) -> dict[str, list[str]]: + """Get the primary-DOI -> version-DOIs alias map from the registry.""" + info = registry.get(community_id) + if info and info.community_config and info.community_config.citations: + return info.community_config.citations.aliases + return {} + + def _get_all_community_ids() -> list[str]: """Get all registered community IDs.""" return [info.id for info in registry.list_all()] @@ -372,7 +380,11 @@ def sync_papers( if dois: console.print(f"\n[dim]Syncing citations for {len(dois)} DOI(s)...[/dim]") with console.status("[green]Syncing citations...[/green]"): - citing_count = sync_citing_papers(dois, project=community) + citing_count = sync_citing_papers( + dois, + project=community, + aliases=_get_community_paper_aliases(community), + ) results_by_source["citing"] = citing_count total += citing_count console.print(f"[dim]Recent citing papers stored: {citing_count}[/dim]") @@ -585,7 +597,11 @@ def sync_all( # sync_citing_papers' own default cap, not the per-query --limit. 
if dois: with console.status("[green]Syncing citing papers...[/green]"): - citing_count = sync_citing_papers(dois, project=comm_id) + citing_count = sync_citing_papers( + dois, + project=comm_id, + aliases=_get_community_paper_aliases(comm_id), + ) paper_total += citing_count console.print(f"[green]Papers: {paper_total} items[/green]") diff --git a/src/core/config/community.py b/src/core/config/community.py index e9db7e6..057cdda 100644 --- a/src/core/config/community.py +++ b/src/core/config/community.py @@ -274,6 +274,45 @@ def validate_paper_labels(cls, v: dict[str, str]) -> dict[str, str]: normalized[clean_doi] = label return normalized + aliases: dict[str, list[str]] = Field(default_factory=dict) + """Version DOIs to merge into a canonical paper's citation count. + + Maps a primary DOI (from ``dois``) to other DOIs for the *same paper* + (typically a preprint and the published version). OpenAlex splits citations + across version records, so the citation sync queries them together and + deduplicates, attributing the merged per-year counts to the primary DOI. + Example: '10.1162/IMAG.a.136' -> ['10.1101/2024.02.13.580071']. Keys and + values are normalized like ``dois``.""" + + @field_validator("aliases") + @classmethod + def validate_aliases(cls, v: dict[str, list[str]]) -> dict[str, list[str]]: + """Normalize and validate primary + alias DOIs (same rules as ``dois``).""" + doi_pattern = re.compile(r"^10\.\d{4,}/[^\s]+$") + + def _clean(doi: str) -> str: + cleaned = re.sub(r"^(https?://)?(dx\.)?doi\.org/", "", doi.strip()) + if cleaned and not doi_pattern.match(cleaned): + raise ValueError(f"Invalid DOI in aliases (expected '10.xxxx/yyyy'): {doi}") + return cleaned + + normalized: dict[str, list[str]] = {} + for primary, versions in v.items(): + clean_primary = _clean(primary) + if not clean_primary: + continue + clean_versions: list[str] = [] + for d in versions: + clean = _clean(d) + if not clean: + # An empty version entry (e.g. 
`- ""`) is an authoring slip + # that would silently drop a version from the merge. + raise ValueError(f"Empty alias version DOI for primary '{primary}'") + if clean not in clean_versions: + clean_versions.append(clean) + normalized[clean_primary] = clean_versions + return normalized + @field_validator("queries") @classmethod def validate_queries(cls, v: list[str]) -> list[str]: @@ -304,6 +343,18 @@ def validate_dois(cls, v: list[str]) -> list[str]: # Deduplicate return list(dict.fromkeys(normalized)) + @model_validator(mode="after") + def validate_alias_primaries_in_dois(self) -> "CitationConfig": + """Every alias primary DOI must be a tracked DOI, else the merge is a no-op. + + Runs after field validators, so both ``dois`` and ``aliases`` keys are + already normalized and directly comparable. + """ + unknown = set(self.aliases) - set(self.dois) + if unknown: + raise ValueError(f"aliases primary DOIs not present in dois: {sorted(unknown)}") + return self + class DiscourseCategoryConfig(BaseModel): """A Discourse category to sync.""" diff --git a/src/knowledge/openalex_citations.py b/src/knowledge/openalex_citations.py index db66b84..3bdfeb8 100644 --- a/src/knowledge/openalex_citations.py +++ b/src/knowledge/openalex_citations.py @@ -15,6 +15,7 @@ """ import logging +from collections.abc import Sequence from dataclasses import dataclass import httpx @@ -103,15 +104,28 @@ def resolve_work_id(self, doi: str) -> str | None: work_id = _strip_id(resp.json().get("id")) return work_id or None - def counts_by_year(self, work_id: str) -> dict[int, int]: - """Return the complete per-year count of works citing ``work_id``. + @staticmethod + def _cites_filter(work_ids: str | Sequence[str]) -> str: + """Build a ``cites:`` filter, OR-joining multiple work ids with ``|``. - Uses OpenAlex ``group_by`` so the counts are exact and uncapped, - independent of how many citing papers are stored. 
+        OpenAlex deduplicates across an OR group, so every version of a paper
+        (preprint, published) is counted once; blank or repeated ids are dropped.
+        """
+        raw = [work_ids] if isinstance(work_ids, str) else work_ids
+        if not (ids := list(dict.fromkeys(w for w in raw if w))):  # "" and repeats removed
+            raise ValueError("work_ids must contain at least one OpenAlex work id")
+        return "cites:" + "|".join(ids)
+
+ def counts_by_year(self, work_ids: str | Sequence[str]) -> dict[int, int]: + """Return the complete per-year count of works citing ``work_ids``. + + Accepts one work id or several (a version group); multiple ids are + OR-joined and deduplicated by OpenAlex. Uses ``group_by`` so the counts + are exact and uncapped, independent of how many papers are stored. """ resp = self._client.get( f"{OPENALEX_BASE}/works", - params=self._params(filter=f"cites:{work_id}", group_by="publication_year"), + params=self._params(filter=self._cites_filter(work_ids), group_by="publication_year"), ) resp.raise_for_status() counts: dict[int, int] = {} @@ -123,12 +137,16 @@ def counts_by_year(self, work_id: str) -> dict[int, int]: counts[year] = int(group.get("count", 0)) return counts - def recent_citing_papers(self, work_id: str, limit: int = 2000) -> list[CitingPaper]: - """Collect up to ``limit`` most-recent works citing ``work_id``. + def recent_citing_papers( + self, work_ids: str | Sequence[str], limit: int = 2000 + ) -> list[CitingPaper]: + """Collect up to ``limit`` most-recent works citing ``work_ids``. - Cursor-paginates ``sort=publication_date:desc`` so the stored sample is - the newest citations rather than an arbitrary first page. + Accepts one work id or a version group (OR-joined, deduplicated by + OpenAlex). Cursor-paginates ``sort=publication_date:desc`` so the stored + sample is the newest citations rather than an arbitrary first page.
""" + cites_filter = self._cites_filter(work_ids) papers: list[CitingPaper] = [] cursor: str | None = "*" # Bound the page count: a highly-cited work may have title-less records @@ -141,7 +159,7 @@ def recent_citing_papers(self, work_id: str, limit: int = 2000) -> list[CitingPa resp = self._client.get( f"{OPENALEX_BASE}/works", params=self._params( - filter=f"cites:{work_id}", + filter=cites_filter, sort="publication_date:desc", select="id,doi,title,publication_date", cursor=cursor, @@ -173,7 +191,7 @@ def recent_citing_papers(self, work_id: str, limit: int = 2000) -> list[CitingPa if pages >= max_pages and cursor: logger.warning( "recent_citing_papers hit page cap for %s (%d pages, %d stored)", - work_id, + cites_filter, pages, len(papers), ) diff --git a/src/knowledge/papers_sync.py b/src/knowledge/papers_sync.py index cba7f03..c0d2f69 100644 --- a/src/knowledge/papers_sync.py +++ b/src/knowledge/papers_sync.py @@ -405,6 +405,7 @@ def sync_citing_papers( project: str = "hed", openalex_api_key: str | None = None, openalex_email: str | None = None, + aliases: dict[str, list[str]] | None = None, ) -> int: """Sync citation data for the given canonical DOIs from OpenAlex. @@ -418,14 +419,21 @@ def sync_citing_papers( 2. The latest ``max_results`` citing papers (publication date descending), upserted into the ``papers`` table for the search corpus. + When a DOI has version ``aliases`` (e.g. a preprint plus the published + version), every version is resolved and queried together: OpenAlex splits + citations across version records, so OR-joining and deduplicating them + recovers the true count, attributed to the primary DOI. + Args: - dois: Canonical DOIs to track citations for (bare ``10.xxxx/yyyy``). - Unresolved DOIs are skipped with a warning. + dois: Canonical (primary) DOIs to track citations for. Unresolvable + DOIs are skipped with a warning. max_results: Maximum number of recent citing papers stored per DOI. 
Does not limit the per-year counts, which are always complete. project: Project/community ID for database isolation. openalex_api_key: Optional OpenAlex API key for premium throughput. openalex_email: Optional email for the OpenAlex polite pool. + aliases: Optional map of primary DOI -> additional version DOIs whose + citations merge into the primary; unresolved versions are logged. Returns: Total citing papers stored across all DOIs (counts are uncapped). @@ -435,34 +443,52 @@ def sync_citing_papers( email = openalex_email or _OPENALEX_EMAIL or "" api_key = openalex_api_key or _OPENALEX_API_KEY or "" + aliases = aliases or {} total_stored = 0 with OpenAlexCitationClient(email=email, api_key=api_key) as client: for doi in dois: try: - work_id = client.resolve_work_id(doi) - if not work_id:
+                # Resolve the primary DOI plus any version aliases to a group of
+                # OpenAlex work ids; citations across the group are merged.
+                # Versions may resolve to the same work, so each id is kept once.
+                work_ids: list[str] = []
+                unresolved: list[str] = []
+                for version_doi in dict.fromkeys([doi, *aliases.get(doi, [])]):
+                    wid = client.resolve_work_id(version_doi)
+                    if not wid:
+                        unresolved.append(version_doi)
+                    elif wid not in work_ids:
+                        work_ids.append(wid)
+                if not work_ids:
                     logger.warning("Skipping citations: cannot resolve DOI %s", doi)
                     continue
+                if unresolved:
+                    # An unresolved version would silently undercount the merged total.
+                    logger.warning(
+                        "Merging citations for %s without unresolved version DOI(s): %s",
+                        doi,
+                        ", ".join(unresolved),
+                    )
 # 1. Complete per-year counts (source of truth for the chart). - counts = client.counts_by_year(work_id) + counts = client.counts_by_year(work_ids) if not counts: # A canonical paper with zero citations is implausible; an # empty histogram almost always means a transient OpenAlex # gap. Do not wipe existing counts on a likely-bad read. logger.warning( - "Empty citation histogram for %s (work %s); keeping existing " + "Empty citation histogram for %s (works %s); keeping existing " "counts and skipping this DOI", doi, - work_id, + work_ids, ) continue replace_citation_counts(doi, counts, project) total_citations = sum(counts.values()) # 2. Latest citing papers for the search corpus.
- papers = client.recent_citing_papers(work_id, limit=max_results) + papers = client.recent_citing_papers(work_ids, limit=max_results) stored = _store_citing_papers(papers, project, cites_doi=doi) update_sync_metadata("citations", f"citing_{doi}", total_citations, project) diff --git a/tests/test_core/test_config/test_community.py b/tests/test_core/test_config/test_community.py index de11eb1..de942d5 100644 --- a/tests/test_core/test_config/test_community.py +++ b/tests/test_core/test_config/test_community.py @@ -219,12 +219,39 @@ def test_paper_labels_dedup_last_wins(self) -> None: """Two keys that normalize to the same DOI collapse to one (last wins).""" config = CitationConfig( paper_labels={ "https://doi.org/10.1234/x": "Label A", "10.1234/x": "Label B", } ) assert config.paper_labels == {"10.1234/x": "Label B"} + def test_aliases_default_empty(self) -> None: + assert CitationConfig().aliases == {} + + def test_aliases_normalizes_primary_and_versions(self) -> None: + config = CitationConfig( + dois=["10.1234/primary"], + aliases={ + "https://doi.org/10.1234/primary": [ + "https://doi.org/10.1101/preprint", + "10.1101/preprint", # duplicate after normalization + ] + }, + ) + assert config.aliases == {"10.1234/primary": ["10.1101/preprint"]} + + def test_aliases_rejects_invalid_doi(self) -> None: + with pytest.raises(ValidationError, match="Invalid DOI in aliases"): + CitationConfig(dois=["10.1234/primary"], aliases={"10.1234/primary": ["not-a-doi"]}) + + def test_aliases_rejects_empty_version(self) -> None: + with pytest.raises(ValidationError, match="Empty alias version DOI"): + CitationConfig(dois=["10.1234/primary"], aliases={"10.1234/primary": [""]}) + + def test_aliases_primary_must_be_in_dois(self) -> None: + with pytest.raises(ValidationError, match="not present in dois"): + CitationConfig(dois=["10.1234/a"], aliases={"10.1234/b": ["10.1101/x"]}) + def test_deduplicates_queries(self) -> None: """Should
deduplicate queries.""" config = CitationConfig(queries=["query 1", "query 1", "query 2"]) diff --git a/tests/test_knowledge/test_openalex_citations.py b/tests/test_knowledge/test_openalex_citations.py index 7e270bb..dd2f921 100644 --- a/tests/test_knowledge/test_openalex_citations.py +++ b/tests/test_knowledge/test_openalex_citations.py @@ -16,6 +16,21 @@ ) +class TestCitesFilter: + def test_single_work_id(self): + assert OpenAlexCitationClient._cites_filter("W1") == "cites:W1" + + def test_multiple_work_ids_or_joined(self): + assert OpenAlexCitationClient._cites_filter(["W1", "W2", "W3"]) == "cites:W1|W2|W3" + + def test_filters_empty_ids(self): + assert OpenAlexCitationClient._cites_filter(["W1", "", "W2"]) == "cites:W1|W2" + + def test_empty_raises(self): + with pytest.raises(ValueError, match="at least one"): + OpenAlexCitationClient._cites_filter([]) + + def _client(handler) -> OpenAlexCitationClient: transport = httpx.MockTransport(handler) return OpenAlexCitationClient(email="t@example.org", client=httpx.Client(transport=transport)) @@ -82,6 +97,18 @@ def handler(request: httpx.Request) -> httpx.Response: counts = c.counts_by_year("W1") assert counts == {2024: 10, 2023: 5, 2022: 2} + def test_version_group_uses_or_joined_filter(self): + seen = {} + + def handler(request: httpx.Request) -> httpx.Response: + seen["filter"] = request.url.params.get("filter") + return httpx.Response(200, json={"group_by": [{"key": "2024", "count": 5}]}) + + with _client(handler) as c: + counts = c.counts_by_year(["W1", "W2"]) + assert seen["filter"] == "cites:W1|W2" + assert counts == {2024: 5} + def test_skips_non_year_buckets(self): def handler(_request: httpx.Request) -> httpx.Response: return httpx.Response( diff --git a/tests/test_knowledge/test_papers_sync.py b/tests/test_knowledge/test_papers_sync.py index 9c4ba93..ad6ce47 100644 --- a/tests/test_knowledge/test_papers_sync.py +++ b/tests/test_knowledge/test_papers_sync.py @@ -413,6 +413,44 @@ def factory(**_kwargs): 
assert stored == 0 assert stats.total == 0 + def test_version_aliases_merge_into_primary(self, tmp_path: Path, monkeypatch) -> None: + # Primary + preprint resolve to W1/W2; counts are queried as a group and + # attributed to the primary DOI. + seen = {} + + def handler(request: httpx.Request) -> httpx.Response: + url = str(request.url) + if "/works/doi:10.1/primary" in url: + return httpx.Response(200, json={"id": "https://openalex.org/W1"}) + if "/works/doi:10.1/preprint" in url: + return httpx.Response(200, json={"id": "https://openalex.org/W2"}) + if request.url.params.get("group_by"): + seen["filter"] = request.url.params.get("filter") + return httpx.Response(200, json={"group_by": [{"key": "2024", "count": 12}]}) + return httpx.Response(200, json={"meta": {"next_cursor": None}, "results": []}) + + def factory(**_kwargs): + return OpenAlexCitationClient( + client=httpx.Client(transport=httpx.MockTransport(handler)) + ) + + monkeypatch.setattr(ps, "OpenAlexCitationClient", factory) + + db_path = tmp_path / "knowledge" / "test.db" + with patch("src.knowledge.db.get_db_path", return_value=db_path): + init_db("test") + sync_citing_papers( + ["10.1/primary"], + project="test", + aliases={"10.1/primary": ["10.1/preprint"]}, + ) + stats = get_citation_stats("test") + + # Both work ids were OR-joined into one cites filter... + assert seen["filter"] == "cites:W1|W2" + # ...and the merged count is attributed to the primary DOI. + assert stats.by_paper == {"10.1/primary": {"2024": 12}} + def test_empty_counts_does_not_wipe_existing(self, tmp_path: Path, monkeypatch) -> None: # An empty histogram (likely a transient API gap) must not erase the # previously stored counts for that canonical DOI.