OpenScience-Collective · neuromechanist · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026
diff --git a/src/api/scheduler.py b/src/api/scheduler.py
@@ -137,6 +137,7 @@ def _run_papers_sync_for_community(community_id: str) -> bool:
                 project=community_id,
                 openalex_api_key=settings.openalex_api_key,
                 openalex_email=settings.openalex_email,
+                aliases=citations.aliases,
             )
             total += citing_count
 

diff --git a/src/assistants/bids/config.yaml b/src/assistants/bids/config.yaml
@@ -590,6 +590,10 @@ citations:
     "10.1038/s41597-024-03559-8": "Motion-BIDS (Jeung 2024)"
     "10.1038/s41597-025-05543-2": "MRS-BIDS (Bouchard 2025)"
     "10.1371/journal.pcbi.1005209": "BIDS Apps (Gorgolewski 2017)"
+  # Merge preprint + published versions so split OpenAlex citations accumulate
+  aliases:
+    "10.1371/journal.pcbi.1005209":       # BIDS Apps published (PLoS Comp Biol)
+      - "10.1101/079145"                  # BIDS Apps bioRxiv preprint (2016)
 
 # Expose the citation dashboard as a public, read-only JSON feed
 # (GET /bids/citations). FAQ feed stays off: BIDS has no FAQ pipeline configured.

diff --git a/src/assistants/eeglab/config.yaml b/src/assistants/eeglab/config.yaml
@@ -433,6 +433,10 @@ citations:
     "10.1016/j.neuroimage.2019.05.026": "ICLabel (Pion-Tonachini 2019)"
     "10.3389/fninf.2015.00016": "PREP (Bigdely-Shamlo 2015)"
     "10.1162/IMAG.a.136": "LSL (Kothe 2025)"
+  # Merge preprint + published versions so split OpenAlex citations accumulate
+  aliases:
+    "10.1162/IMAG.a.136":                 # LSL published (Imaging Neuroscience)
+      - "10.1101/2024.02.13.580071"       # LSL bioRxiv preprint (2024)
 
 # Expose generated FAQ entries and citation stats as public, read-only JSON feeds
 # (GET /eeglab/faq and GET /eeglab/citations). Off by default platform-wide.

diff --git a/src/cli/sync.py b/src/cli/sync.py
@@ -99,6 +99,14 @@ def _get_community_paper_dois(community_id: str) -> list[str]:
     return []
 
 
+def _get_community_paper_aliases(community_id: str) -> dict[str, list[str]]:
+    """Look up the community's primary-DOI -> version-DOIs alias map in the registry."""
+    info = registry.get(community_id)
+    if not (info and info.community_config and info.community_config.citations):
+        return {}
+    return info.community_config.citations.aliases
+
+
 def _get_all_community_ids() -> list[str]:
     """Get all registered community IDs."""
     return [info.id for info in registry.list_all()]
@@ -372,7 +380,11 @@ def sync_papers(
         if dois:
             console.print(f"\n[dim]Syncing citations for {len(dois)} DOI(s)...[/dim]")
             with console.status("[green]Syncing citations...[/green]"):
-                citing_count = sync_citing_papers(dois, project=community)
+                citing_count = sync_citing_papers(
+                    dois,
+                    project=community,
+                    aliases=_get_community_paper_aliases(community),
+                )
             results_by_source["citing"] = citing_count
             total += citing_count
             console.print(f"[dim]Recent citing papers stored: {citing_count}[/dim]")
@@ -585,7 +597,11 @@ def sync_all(
             # sync_citing_papers' own default cap, not the per-query --limit.
             if dois:
                 with console.status("[green]Syncing citing papers...[/green]"):
-                    citing_count = sync_citing_papers(dois, project=comm_id)
+                    citing_count = sync_citing_papers(
+                        dois,
+                        project=comm_id,
+                        aliases=_get_community_paper_aliases(comm_id),
+                    )
                 paper_total += citing_count
 
             console.print(f"[green]Papers: {paper_total} items[/green]")

diff --git a/src/core/config/community.py b/src/core/config/community.py
@@ -274,6 +274,45 @@ def validate_paper_labels(cls, v: dict[str, str]) -> dict[str, str]:
             normalized[clean_doi] = label
         return normalized
 
+    aliases: dict[str, list[str]] = Field(default_factory=dict)
+    """Version DOIs to merge into a canonical paper's citation count.
+
+    Maps a primary DOI (from ``dois``) to other DOIs for the *same paper*
+    (typically a preprint and the published version). OpenAlex splits citations
+    across version records, so the citation sync queries them together and
+    deduplicates, attributing the merged per-year counts to the primary DOI.
+    Example: '10.1162/IMAG.a.136' -> ['10.1101/2024.02.13.580071']. Keys and
+    values are normalized like ``dois``."""
+
+    @field_validator("aliases")
+    @classmethod
+    def validate_aliases(cls, v: dict[str, list[str]]) -> dict[str, list[str]]:
+        """Normalize and validate primary + alias DOIs (same rules as ``dois``)."""
+        doi_pattern = re.compile(r"^10\.\d{4,}/[^\s]+$")
+
+        def _clean(doi: str) -> str:
+            cleaned = re.sub(r"^(https?://)?(dx\.)?doi\.org/", "", doi.strip())
+            if cleaned and not doi_pattern.match(cleaned):
+                raise ValueError(f"Invalid DOI in aliases (expected '10.xxxx/yyyy'): {doi}")
+            return cleaned
+
+        normalized: dict[str, list[str]] = {}
+        for primary, versions in v.items():
+            clean_primary = _clean(primary)
+            if not clean_primary:
+                continue
+            # Merge rather than overwrite when two keys normalize to one primary.
+            clean_versions = normalized.setdefault(clean_primary, [])
+            for d in versions:
+                clean = _clean(d)
+                if not clean:
+                    # An empty version entry (e.g. `- ""`) is an authoring slip
+                    # that would silently drop a version from the merge.
+                    raise ValueError(f"Empty alias version DOI for primary '{primary}'")
+                if clean != clean_primary and clean not in clean_versions:
+                    clean_versions.append(clean)
+        return normalized
+
     @field_validator("queries")
     @classmethod
     def validate_queries(cls, v: list[str]) -> list[str]:
@@ -304,6 +343,18 @@ def validate_dois(cls, v: list[str]) -> list[str]:
         # Deduplicate
         return list(dict.fromkeys(normalized))
 
+    @model_validator(mode="after")
+    def validate_alias_primaries_in_dois(self) -> "CitationConfig":
+        """Reject alias primaries that are not tracked DOIs (such a merge is a no-op).
+
+        Field validators have already run at this point, so ``aliases`` keys and
+        ``dois`` share one normalized form and can be compared as plain strings.
+        """
+        untracked = set(self.aliases).difference(self.dois)
+        if not untracked:
+            return self
+        raise ValueError(f"aliases primary DOIs not present in dois: {sorted(untracked)}")
+
 
 class DiscourseCategoryConfig(BaseModel):
     """A Discourse category to sync."""

diff --git a/src/knowledge/openalex_citations.py b/src/knowledge/openalex_citations.py
@@ -15,6 +15,7 @@
 """
 
 import logging
+from collections.abc import Sequence
 from dataclasses import dataclass
 
 import httpx
@@ -103,15 +104,28 @@ def resolve_work_id(self, doi: str) -> str | None:
         work_id = _strip_id(resp.json().get("id"))
         return work_id or None
 
-    def counts_by_year(self, work_id: str) -> dict[int, int]:
-        """Return the complete per-year count of works citing ``work_id``.
+    @staticmethod
+    def _cites_filter(work_ids: str | Sequence[str]) -> str:
+        """Build a ``cites:`` filter, OR-joining multiple work ids with ``|``.
 
-        Uses OpenAlex ``group_by`` so the counts are exact and uncapped,
-        independent of how many citing papers are stored.
+        OpenAlex deduplicates an OR group, so a paper's versions (preprint + published)
+        merge without double counting. Empty ids are dropped; ``ValueError`` if none remain.
+        """
+        ids = [w for w in ([work_ids] if isinstance(work_ids, str) else work_ids) if w]
+        if not ids:
+            raise ValueError("work_ids must contain at least one OpenAlex work id")
+        return "cites:" + "|".join(ids)
+
+    def counts_by_year(self, work_ids: str | Sequence[str]) -> dict[int, int]:
+        """Return the complete per-year count of works citing ``work_ids``.
+
+        Accepts one work id or several (a version group); multiple ids are
+        OR-joined and deduplicated by OpenAlex. Uses ``group_by`` so the counts
+        are exact and uncapped, independent of how many papers are stored.
         """
         resp = self._client.get(
             f"{OPENALEX_BASE}/works",
-            params=self._params(filter=f"cites:{work_id}", group_by="publication_year"),
+            params=self._params(filter=self._cites_filter(work_ids), group_by="publication_year"),
         )
         resp.raise_for_status()
         counts: dict[int, int] = {}
@@ -123,12 +137,16 @@ def counts_by_year(self, work_id: str) -> dict[int, int]:
             counts[year] = int(group.get("count", 0))
         return counts
 
-    def recent_citing_papers(self, work_id: str, limit: int = 2000) -> list[CitingPaper]:
-        """Collect up to ``limit`` most-recent works citing ``work_id``.
+    def recent_citing_papers(
+        self, work_ids: str | Sequence[str], limit: int = 2000
+    ) -> list[CitingPaper]:
+        """Collect up to ``limit`` most-recent works citing ``work_ids``.
 
-        Cursor-paginates ``sort=publication_date:desc`` so the stored sample is
-        the newest citations rather than an arbitrary first page.
+        Accepts one work id or a version group (OR-joined, deduplicated by
+        OpenAlex). Cursor-paginates ``sort=publication_date:desc`` so the stored
+        sample is the newest citations rather than an arbitrary first page.
         """
+        cites_filter = self._cites_filter(work_ids)
         papers: list[CitingPaper] = []
         cursor: str | None = "*"
         # Bound the page count: a highly-cited work may have title-less records
@@ -141,7 +159,7 @@ def recent_citing_papers(self, work_id: str, limit: int = 2000) -> list[CitingPa
             resp = self._client.get(
                 f"{OPENALEX_BASE}/works",
                 params=self._params(
-                    filter=f"cites:{work_id}",
+                    filter=cites_filter,
                     sort="publication_date:desc",
                     select="id,doi,title,publication_date",
                     cursor=cursor,
@@ -173,7 +191,7 @@ def recent_citing_papers(self, work_id: str, limit: int = 2000) -> list[CitingPa
         if pages >= max_pages and cursor:
             logger.warning(
                 "recent_citing_papers hit page cap for %s (%d pages, %d stored)",
-                work_id,
+                cites_filter,
                 pages,
                 len(papers),
             )

diff --git a/src/knowledge/papers_sync.py b/src/knowledge/papers_sync.py
@@ -405,6 +405,7 @@ def sync_citing_papers(
     project: str = "hed",
     openalex_api_key: str | None = None,
     openalex_email: str | None = None,
+    aliases: dict[str, list[str]] | None = None,
 ) -> int:
     """Sync citation data for the given canonical DOIs from OpenAlex.
 
@@ -418,14 +419,21 @@ def sync_citing_papers(
     2. The latest ``max_results`` citing papers (publication date descending),
        upserted into the ``papers`` table for the search corpus.
 
+    When a DOI has version ``aliases`` (e.g. a preprint plus the published
+    version), every version is resolved and queried together: OpenAlex splits
+    citations across version records, so OR-joining and deduplicating them
+    recovers the true count, attributed to the primary DOI.
+
     Args:
-        dois: Canonical DOIs to track citations for (bare ``10.xxxx/yyyy``).
-            Unresolved DOIs are skipped with a warning.
+        dois: Canonical (primary) DOIs to track citations for. Unresolvable
+            DOIs are skipped with a warning.
         max_results: Maximum number of recent citing papers stored per DOI.
             Does not limit the per-year counts, which are always complete.
         project: Project/community ID for database isolation.
         openalex_api_key: Optional OpenAlex API key for premium throughput.
         openalex_email: Optional email for the OpenAlex polite pool.
+        aliases: Optional map of primary DOI -> additional version DOIs whose
+            citations merge into the primary.
 
     Returns:
         Total citing papers stored across all DOIs (counts are uncapped).
@@ -435,34 +443,47 @@ def sync_citing_papers(
 
     email = openalex_email or _OPENALEX_EMAIL or ""
     api_key = openalex_api_key or _OPENALEX_API_KEY or ""
+    aliases = aliases or {}
 
     total_stored = 0
     with OpenAlexCitationClient(email=email, api_key=api_key) as client:
         for doi in dois:
             try:
-                work_id = client.resolve_work_id(doi)
-                if not work_id:
+                # Resolve the primary DOI plus any version aliases to a group of
+                # OpenAlex work ids (deduplicated); citations across the group merge.
+                group_dois = list(dict.fromkeys([doi, *aliases.get(doi, [])]))
+                resolved = {d: client.resolve_work_id(d) for d in group_dois}
+                work_ids = list(dict.fromkeys(wid for wid in resolved.values() if wid))
+                if not work_ids:
                     logger.warning("Skipping citations: cannot resolve DOI %s", doi)
                     continue
+                unresolved = [d for d, wid in resolved.items() if not wid]
+                if unresolved:
+                    # A silently dropped version would undercount the merge.
+                    logger.warning(
+                        "Unresolved version DOI(s) for %s: %s; merged counts may be incomplete",
+                        doi,
+                        unresolved,
+                    )
 
                 # 1. Complete per-year counts (source of truth for the chart).
-                counts = client.counts_by_year(work_id)
+                counts = client.counts_by_year(work_ids)
                 if not counts:
                     # A canonical paper with zero citations is implausible; an
                     # empty histogram almost always means a transient OpenAlex
                     # gap. Do not wipe existing counts on a likely-bad read.
                     logger.warning(
-                        "Empty citation histogram for %s (work %s); keeping existing "
+                        "Empty citation histogram for %s (works %s); keeping existing "
                         "counts and skipping this DOI",
                         doi,
-                        work_id,
+                        work_ids,
                     )
                     continue
                 replace_citation_counts(doi, counts, project)
                 total_citations = sum(counts.values())
 
                 # 2. Latest citing papers for the search corpus.
-                papers = client.recent_citing_papers(work_id, limit=max_results)
+                papers = client.recent_citing_papers(work_ids, limit=max_results)
                 stored = _store_citing_papers(papers, project, cites_doi=doi)
 
                 update_sync_metadata("citations", f"citing_{doi}", total_citations, project)

diff --git a/tests/test_core/test_config/test_community.py b/tests/test_core/test_config/test_community.py
@@ -219,12 +219,39 @@ def test_paper_labels_dedup_last_wins(self) -> None:
         """Two keys that normalize to the same DOI collapse to one (last wins)."""
         config = CitationConfig(
             paper_labels={
                 "https://doi.org/10.1234/x": "Label A",
                 "10.1234/x": "Label B",
             }
         )
         assert config.paper_labels == {"10.1234/x": "Label B"}
 
+    def test_aliases_default_empty(self) -> None:
+        assert CitationConfig().aliases == {}
+
+    def test_aliases_normalizes_primary_and_versions(self) -> None:
+        config = CitationConfig(
+            dois=["10.1234/primary"],
+            aliases={
+                "https://doi.org/10.1234/primary": [
+                    "https://doi.org/10.1101/preprint",
+                    "10.1101/preprint",  # duplicate after normalization
+                ]
+            },
+        )
+        assert config.aliases == {"10.1234/primary": ["10.1101/preprint"]}
+
+    def test_aliases_rejects_invalid_doi(self) -> None:
+        with pytest.raises(ValidationError, match="Invalid DOI in aliases"):
+            CitationConfig(dois=["10.1234/primary"], aliases={"10.1234/primary": ["not-a-doi"]})
+
+    def test_aliases_rejects_empty_version(self) -> None:
+        with pytest.raises(ValidationError, match="Empty alias version DOI"):
+            CitationConfig(dois=["10.1234/primary"], aliases={"10.1234/primary": [""]})
+
+    def test_aliases_primary_must_be_in_dois(self) -> None:
+        with pytest.raises(ValidationError, match="not present in dois"):
+            CitationConfig(dois=["10.1234/a"], aliases={"10.1234/b": ["10.1101/x"]})
+
     def test_deduplicates_queries(self) -> None:
         """Should deduplicate queries."""
         config = CitationConfig(queries=["query 1", "query 1", "query 2"])

diff --git a/tests/test_knowledge/test_openalex_citations.py b/tests/test_knowledge/test_openalex_citations.py
@@ -16,6 +16,21 @@
 )
 
 
+class TestCitesFilter:
+    def test_single_work_id(self):
+        assert OpenAlexCitationClient._cites_filter("W1") == "cites:W1"
+
+    def test_multiple_work_ids_or_joined(self):
+        assert OpenAlexCitationClient._cites_filter(["W1", "W2", "W3"]) == "cites:W1|W2|W3"
+
+    def test_filters_empty_ids(self):
+        assert OpenAlexCitationClient._cites_filter(["W1", "", "W2"]) == "cites:W1|W2"
+
+    def test_empty_raises(self):
+        with pytest.raises(ValueError, match="at least one"):
+            OpenAlexCitationClient._cites_filter([])
+
+
 def _client(handler) -> OpenAlexCitationClient:
     transport = httpx.MockTransport(handler)
     return OpenAlexCitationClient(email="t@example.org", client=httpx.Client(transport=transport))
@@ -82,6 +97,18 @@ def handler(request: httpx.Request) -> httpx.Response:
             counts = c.counts_by_year("W1")
         assert counts == {2024: 10, 2023: 5, 2022: 2}
 
+    def test_version_group_uses_or_joined_filter(self):
+        seen = {}
+
+        def handler(request: httpx.Request) -> httpx.Response:
+            seen["filter"] = request.url.params.get("filter")
+            return httpx.Response(200, json={"group_by": [{"key": "2024", "count": 5}]})
+
+        with _client(handler) as c:
+            counts = c.counts_by_year(["W1", "W2"])
+        assert seen["filter"] == "cites:W1|W2"
+        assert counts == {2024: 5}
+
     def test_skips_non_year_buckets(self):
         def handler(_request: httpx.Request) -> httpx.Response:
             return httpx.Response(