Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/api/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ def _run_papers_sync_for_community(community_id: str) -> bool:
project=community_id,
openalex_api_key=settings.openalex_api_key,
openalex_email=settings.openalex_email,
aliases=citations.aliases,
)
total += citing_count

Expand Down
4 changes: 4 additions & 0 deletions src/assistants/bids/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,10 @@ citations:
"10.1038/s41597-024-03559-8": "Motion-BIDS (Jeung 2024)"
"10.1038/s41597-025-05543-2": "MRS-BIDS (Bouchard 2025)"
"10.1371/journal.pcbi.1005209": "BIDS Apps (Gorgolewski 2017)"
# Merge preprint and published versions of a paper: OpenAlex splits citations across version records, so they are counted together under the primary DOI
aliases:
"10.1371/journal.pcbi.1005209": # BIDS Apps published (PLoS Comp Biol)
- "10.1101/079145" # BIDS Apps bioRxiv preprint (2016)

# Expose the citation dashboard as a public, read-only JSON feed
# (GET /bids/citations). FAQ feed stays off: BIDS has no FAQ pipeline configured.
Expand Down
4 changes: 4 additions & 0 deletions src/assistants/eeglab/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,10 @@ citations:
"10.1016/j.neuroimage.2019.05.026": "ICLabel (Pion-Tonachini 2019)"
"10.3389/fninf.2015.00016": "PREP (Bigdely-Shamlo 2015)"
"10.1162/IMAG.a.136": "LSL (Kothe 2025)"
# Merge preprint and published versions of a paper: OpenAlex splits citations across version records, so they are counted together under the primary DOI
aliases:
"10.1162/IMAG.a.136": # LSL published (Imaging Neuroscience)
- "10.1101/2024.02.13.580071" # LSL bioRxiv preprint (2024)

# Expose generated FAQ entries and citation stats as public, read-only JSON feeds
# (GET /eeglab/faq and GET /eeglab/citations). Off by default platform-wide.
Expand Down
20 changes: 18 additions & 2 deletions src/cli/sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,14 @@ def _get_community_paper_dois(community_id: str) -> list[str]:
return []


def _get_community_paper_aliases(community_id: str) -> dict[str, list[str]]:
    """Return the primary-DOI -> version-DOIs alias map for a community.

    An empty dict is returned when the registry has no entry for
    ``community_id``, when the entry carries no community config, or when that
    config has no ``citations`` section.
    """
    entry = registry.get(community_id)
    if not entry:
        return {}
    config = entry.community_config
    if not config or not config.citations:
        return {}
    return config.citations.aliases


def _get_all_community_ids() -> list[str]:
    """Return the ``id`` of every community the registry lists."""
    return [community.id for community in registry.list_all()]
Expand Down Expand Up @@ -372,7 +380,11 @@ def sync_papers(
if dois:
console.print(f"\n[dim]Syncing citations for {len(dois)} DOI(s)...[/dim]")
with console.status("[green]Syncing citations...[/green]"):
citing_count = sync_citing_papers(dois, project=community)
citing_count = sync_citing_papers(
dois,
project=community,
aliases=_get_community_paper_aliases(community),
)
results_by_source["citing"] = citing_count
total += citing_count
console.print(f"[dim]Recent citing papers stored: {citing_count}[/dim]")
Expand Down Expand Up @@ -585,7 +597,11 @@ def sync_all(
# sync_citing_papers' own default cap, not the per-query --limit.
if dois:
with console.status("[green]Syncing citing papers...[/green]"):
citing_count = sync_citing_papers(dois, project=comm_id)
citing_count = sync_citing_papers(
dois,
project=comm_id,
aliases=_get_community_paper_aliases(comm_id),
)
paper_total += citing_count

console.print(f"[green]Papers: {paper_total} items[/green]")
Expand Down
51 changes: 51 additions & 0 deletions src/core/config/community.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,45 @@ def validate_paper_labels(cls, v: dict[str, str]) -> dict[str, str]:
normalized[clean_doi] = label
return normalized

aliases: dict[str, list[str]] = Field(default_factory=dict)
"""Version DOIs to merge into a canonical paper's citation count.

Maps a primary DOI (from ``dois``) to other DOIs for the *same paper*
(typically a preprint and the published version). OpenAlex splits citations
across version records, so the citation sync queries them together and
deduplicates, attributing the merged per-year counts to the primary DOI.
Example: '10.1162/IMAG.a.136' -> ['10.1101/2024.02.13.580071']. Keys and
values are normalized like ``dois``."""

@field_validator("aliases")
@classmethod
def validate_aliases(cls, v: dict[str, list[str]]) -> dict[str, list[str]]:
    """Normalize and validate the primary and version DOIs of the alias map.

    Each DOI is stripped of surrounding whitespace and of a leading
    ``doi.org/`` / ``dx.doi.org/`` URL prefix (with or without scheme), then
    checked against the bare ``10.xxxx/yyyy`` shape.

    Args:
        v: Raw mapping of primary DOI -> list of version DOIs.

    Returns:
        Mapping keyed by normalized primary DOI. Each value lists the
        normalized version DOIs in first-seen order, without duplicates and
        without the primary itself.

    Raises:
        ValueError: If a non-empty DOI does not look like ``10.xxxx/yyyy``,
            or if a version entry is empty after normalization.
    """
    doi_pattern = re.compile(r"^10\.\d{4,}/[^\s]+$")

    def _clean(doi: str) -> str:
        cleaned = re.sub(r"^(https?://)?(dx\.)?doi\.org/", "", doi.strip())
        if cleaned and not doi_pattern.match(cleaned):
            raise ValueError(f"Invalid DOI in aliases (expected '10.xxxx/yyyy'): {doi}")
        return cleaned

    normalized: dict[str, list[str]] = {}
    for primary, versions in v.items():
        clean_primary = _clean(primary)
        if not clean_primary:
            # A blank key carries no primary to attach versions to.
            continue
        # Two raw keys (e.g. URL form and bare form) can normalize to the same
        # primary. Extend the existing list instead of overwriting it, so the
        # versions listed under the earlier key are not silently lost.
        clean_versions = normalized.setdefault(clean_primary, [])
        for d in versions:
            clean = _clean(d)
            if not clean:
                # An empty version entry (e.g. `- ""`) is an authoring slip
                # that would silently drop a version from the merge.
                raise ValueError(f"Empty alias version DOI for primary '{primary}'")
            if clean == clean_primary:
                # Listing the primary as its own version adds nothing.
                continue
            if clean not in clean_versions:
                clean_versions.append(clean)
    return normalized

@field_validator("queries")
@classmethod
def validate_queries(cls, v: list[str]) -> list[str]:
Expand Down Expand Up @@ -304,6 +343,18 @@ def validate_dois(cls, v: list[str]) -> list[str]:
# Deduplicate
return list(dict.fromkeys(normalized))

@model_validator(mode="after")
def validate_alias_primaries_in_dois(self) -> "CitationConfig":
    """Reject alias maps whose primary DOI is not one of the tracked ``dois``.

    An alias keyed by an untracked DOI would make the merge a no-op. This
    runs after the field validators, so ``dois`` and the ``aliases`` keys are
    already normalized and can be compared directly.

    Raises:
        ValueError: Listing (sorted) every alias primary missing from ``dois``.
    """
    tracked = set(self.dois)
    unknown = sorted(primary for primary in self.aliases if primary not in tracked)
    if unknown:
        raise ValueError(f"aliases primary DOIs not present in dois: {unknown}")
    return self


class DiscourseCategoryConfig(BaseModel):
"""A Discourse category to sync."""
Expand Down
40 changes: 29 additions & 11 deletions src/knowledge/openalex_citations.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"""

import logging
from collections.abc import Sequence
from dataclasses import dataclass

import httpx
Expand Down Expand Up @@ -103,15 +104,28 @@ def resolve_work_id(self, doi: str) -> str | None:
work_id = _strip_id(resp.json().get("id"))
return work_id or None

def counts_by_year(self, work_id: str) -> dict[int, int]:
"""Return the complete per-year count of works citing ``work_id``.
@staticmethod
def _cites_filter(work_ids: str | Sequence[str]) -> str:
"""Build a ``cites:`` filter, OR-joining multiple work ids with ``|``.

Uses OpenAlex ``group_by`` so the counts are exact and uncapped,
independent of how many citing papers are stored.
OpenAlex deduplicates across an OR group, so passing every version of a
paper (preprint + published) yields the merged, non-double-counted set.
"""
ids = [work_ids] if isinstance(work_ids, str) else [w for w in work_ids if w]
if not ids:
raise ValueError("work_ids must contain at least one OpenAlex work id")
return "cites:" + "|".join(ids)

def counts_by_year(self, work_ids: str | Sequence[str]) -> dict[int, int]:
"""Return the complete per-year count of works citing ``work_ids``.

Accepts one work id or several (a version group); multiple ids are
OR-joined and deduplicated by OpenAlex. Uses ``group_by`` so the counts
are exact and uncapped, independent of how many papers are stored.
"""
resp = self._client.get(
f"{OPENALEX_BASE}/works",
params=self._params(filter=f"cites:{work_id}", group_by="publication_year"),
params=self._params(filter=self._cites_filter(work_ids), group_by="publication_year"),
)
resp.raise_for_status()
counts: dict[int, int] = {}
Expand All @@ -123,12 +137,16 @@ def counts_by_year(self, work_id: str) -> dict[int, int]:
counts[year] = int(group.get("count", 0))
return counts

def recent_citing_papers(self, work_id: str, limit: int = 2000) -> list[CitingPaper]:
"""Collect up to ``limit`` most-recent works citing ``work_id``.
def recent_citing_papers(
self, work_ids: str | Sequence[str], limit: int = 2000
) -> list[CitingPaper]:
"""Collect up to ``limit`` most-recent works citing ``work_ids``.

Cursor-paginates ``sort=publication_date:desc`` so the stored sample is
the newest citations rather than an arbitrary first page.
Accepts one work id or a version group (OR-joined, deduplicated by
OpenAlex). Cursor-paginates ``sort=publication_date:desc`` so the stored
sample is the newest citations rather than an arbitrary first page.
"""
cites_filter = self._cites_filter(work_ids)
papers: list[CitingPaper] = []
cursor: str | None = "*"
# Bound the page count: a highly-cited work may have title-less records
Expand All @@ -141,7 +159,7 @@ def recent_citing_papers(self, work_id: str, limit: int = 2000) -> list[CitingPa
resp = self._client.get(
f"{OPENALEX_BASE}/works",
params=self._params(
filter=f"cites:{work_id}",
filter=cites_filter,
sort="publication_date:desc",
select="id,doi,title,publication_date",
cursor=cursor,
Expand Down Expand Up @@ -173,7 +191,7 @@ def recent_citing_papers(self, work_id: str, limit: int = 2000) -> list[CitingPa
if pages >= max_pages and cursor:
logger.warning(
"recent_citing_papers hit page cap for %s (%d pages, %d stored)",
work_id,
cites_filter,
pages,
len(papers),
)
Expand Down
28 changes: 20 additions & 8 deletions src/knowledge/papers_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,7 @@ def sync_citing_papers(
project: str = "hed",
openalex_api_key: str | None = None,
openalex_email: str | None = None,
aliases: dict[str, list[str]] | None = None,
) -> int:
"""Sync citation data for the given canonical DOIs from OpenAlex.

Expand All @@ -418,14 +419,21 @@ def sync_citing_papers(
2. The latest ``max_results`` citing papers (publication date descending),
upserted into the ``papers`` table for the search corpus.

When a DOI has version ``aliases`` (e.g. a preprint plus the published
version), every version is resolved and queried together: OpenAlex splits
citations across version records, so OR-joining and deduplicating them
recovers the true count, attributed to the primary DOI.

Args:
dois: Canonical DOIs to track citations for (bare ``10.xxxx/yyyy``).
Unresolved DOIs are skipped with a warning.
dois: Canonical (primary) DOIs to track citations for. Unresolvable
DOIs are skipped with a warning.
max_results: Maximum number of recent citing papers stored per DOI.
Does not limit the per-year counts, which are always complete.
project: Project/community ID for database isolation.
openalex_api_key: Optional OpenAlex API key for premium throughput.
openalex_email: Optional email for the OpenAlex polite pool.
aliases: Optional map of primary DOI -> additional version DOIs whose
citations merge into the primary.

Returns:
Total citing papers stored across all DOIs (counts are uncapped).
Expand All @@ -435,34 +443,38 @@ def sync_citing_papers(

email = openalex_email or _OPENALEX_EMAIL or ""
api_key = openalex_api_key or _OPENALEX_API_KEY or ""
aliases = aliases or {}

total_stored = 0
with OpenAlexCitationClient(email=email, api_key=api_key) as client:
for doi in dois:
try:
work_id = client.resolve_work_id(doi)
if not work_id:
# Resolve the primary DOI plus any version aliases to a group of
# OpenAlex work ids; citations across the group are merged.
group_dois = [doi, *aliases.get(doi, [])]
work_ids = [wid for d in group_dois if (wid := client.resolve_work_id(d))]
if not work_ids:
logger.warning("Skipping citations: cannot resolve DOI %s", doi)
continue

# 1. Complete per-year counts (source of truth for the chart).
counts = client.counts_by_year(work_id)
counts = client.counts_by_year(work_ids)
if not counts:
# A canonical paper with zero citations is implausible; an
# empty histogram almost always means a transient OpenAlex
# gap. Do not wipe existing counts on a likely-bad read.
logger.warning(
"Empty citation histogram for %s (work %s); keeping existing "
"Empty citation histogram for %s (works %s); keeping existing "
"counts and skipping this DOI",
doi,
work_id,
work_ids,
)
continue
replace_citation_counts(doi, counts, project)
total_citations = sum(counts.values())

# 2. Latest citing papers for the search corpus.
papers = client.recent_citing_papers(work_id, limit=max_results)
papers = client.recent_citing_papers(work_ids, limit=max_results)
stored = _store_citing_papers(papers, project, cites_doi=doi)

update_sync_metadata("citations", f"citing_{doi}", total_citations, project)
Expand Down
29 changes: 28 additions & 1 deletion tests/test_core/test_config/test_community.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,12 +219,39 @@ def test_paper_labels_dedup_last_wins(self) -> None:
"""Two keys that normalize to the same DOI collapse to one (last wins)."""
config = CitationConfig(
paper_labels={
"https://doi.org/10.1234/x": "Label A",
"https://doi.org/10.1234/x": "Label B",
"10.1234/x": "Label B",
}
)
assert config.paper_labels == {"10.1234/x": "Label B"}

def test_aliases_default_empty(self) -> None:
    """A config built without ``aliases`` exposes an empty alias map."""
    config = CitationConfig()
    assert config.aliases == {}

def test_aliases_normalizes_primary_and_versions(self) -> None:
    """URL-form primary and version DOIs become bare DOIs, de-duplicated."""
    url_primary = "https://doi.org/10.1234/primary"
    versions = [
        "https://doi.org/10.1101/preprint",
        "10.1101/preprint",  # same DOI once the URL prefix is stripped
    ]
    config = CitationConfig(dois=["10.1234/primary"], aliases={url_primary: versions})
    expected = {"10.1234/primary": ["10.1101/preprint"]}
    assert config.aliases == expected

def test_aliases_rejects_invalid_doi(self) -> None:
    """A version that is not shaped like a DOI fails validation."""
    bad_aliases = {"10.1234/primary": ["not-a-doi"]}
    with pytest.raises(ValidationError, match="Invalid DOI in aliases"):
        CitationConfig(dois=["10.1234/primary"], aliases=bad_aliases)

def test_aliases_rejects_empty_version(self) -> None:
    """An empty-string version entry fails validation instead of being dropped."""
    bad_aliases = {"10.1234/primary": [""]}
    with pytest.raises(ValidationError, match="Empty alias version DOI"):
        CitationConfig(dois=["10.1234/primary"], aliases=bad_aliases)

def test_aliases_primary_must_be_in_dois(self) -> None:
    """An alias keyed by a DOI missing from ``dois`` fails validation."""
    untracked_aliases = {"10.1234/b": ["10.1101/x"]}
    with pytest.raises(ValidationError, match="not present in dois"):
        CitationConfig(dois=["10.1234/a"], aliases=untracked_aliases)

def test_deduplicates_queries(self) -> None:
"""Should deduplicate queries."""
config = CitationConfig(queries=["query 1", "query 1", "query 2"])
Expand Down
27 changes: 27 additions & 0 deletions tests/test_knowledge/test_openalex_citations.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,21 @@
)


class TestCitesFilter:
    """Checks for ``OpenAlexCitationClient._cites_filter``."""

    def test_single_work_id(self):
        """A lone id string is prefixed with ``cites:``."""
        built = OpenAlexCitationClient._cites_filter("W1")
        assert built == "cites:W1"

    def test_multiple_work_ids_or_joined(self):
        """Several ids are joined with ``|`` in the order given."""
        built = OpenAlexCitationClient._cites_filter(["W1", "W2", "W3"])
        assert built == "cites:W1|W2|W3"

    def test_filters_empty_ids(self):
        """Empty ids inside a sequence are left out of the filter."""
        built = OpenAlexCitationClient._cites_filter(["W1", "", "W2"])
        assert built == "cites:W1|W2"

    def test_empty_raises(self):
        """An empty sequence is rejected with ``ValueError``."""
        with pytest.raises(ValueError, match="at least one"):
            OpenAlexCitationClient._cites_filter([])


def _client(handler) -> OpenAlexCitationClient:
    """Build a citation client whose HTTP calls are served by ``handler``.

    ``handler`` is wrapped in an ``httpx.MockTransport``, and the resulting
    ``httpx.Client`` is passed to the citation client.
    """
    mock_http = httpx.Client(transport=httpx.MockTransport(handler))
    return OpenAlexCitationClient(email="t@example.org", client=mock_http)
Expand Down Expand Up @@ -82,6 +97,18 @@ def handler(request: httpx.Request) -> httpx.Response:
counts = c.counts_by_year("W1")
assert counts == {2024: 10, 2023: 5, 2022: 2}

def test_version_group_uses_or_joined_filter(self):
    """A list of work ids is sent as a single ``|``-joined ``cites:`` filter."""
    captured = {}

    def handler(request: httpx.Request) -> httpx.Response:
        # Record the filter the client actually put on the wire.
        captured["filter"] = request.url.params.get("filter")
        payload = {"group_by": [{"key": "2024", "count": 5}]}
        return httpx.Response(200, json=payload)

    with _client(handler) as c:
        counts = c.counts_by_year(["W1", "W2"])
    assert captured["filter"] == "cites:W1|W2"
    assert counts == {2024: 5}

def test_skips_non_year_buckets(self):
def handler(_request: httpx.Request) -> httpx.Response:
return httpx.Response(
Expand Down
Loading
Loading