Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/wags_tails/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from wags_tails.moa import MoaData
from wags_tails.mondo import MondoData
from wags_tails.ncbi import NcbiGeneData, NcbiGenomeData
from wags_tails.ncbi_gene_summary import NcbiGeneSummaryData
from wags_tails.ncbi_lrg_refseqgene import NcbiLrgRefSeqGeneData
from wags_tails.ncbi_mane import NcbiManeRefSeqGenomicData, NcbiManeSummaryData
from wags_tails.ncit import NcitData
Expand Down Expand Up @@ -48,6 +49,7 @@
"MoaData",
"MondoData",
"NcbiGeneData",
"NcbiGeneSummaryData",
"NcbiGenomeData",
"NcbiLrgRefSeqGeneData",
"NcbiManeRefSeqGenomicData",
Expand Down
32 changes: 32 additions & 0 deletions src/wags_tails/ncbi_gene_summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""Fetch the NCBI gene summary file.

The file is updated daily at https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_summary.gz

See https://ftp.ncbi.nlm.nih.gov/gene/DATA/README for more information.
"""

from datetime import UTC, datetime
from pathlib import Path

from wags_tails.base_source import DataSource
from wags_tails.utils.downloads import download_http, handle_gzip
from wags_tails.utils.versioning import DATE_VERSION_PATTERN


class NcbiGeneSummaryData(DataSource):
    """Manage retrieval of the NCBI ``gene_summary`` data file."""

    _src_name = "ncbi_gene_summary"
    _filetype = "tsv"

    def _get_latest_version(self) -> str:
        """Produce the version string for the most recent data.

        The version is simply the current UTC date rendered with
        ``DATE_VERSION_PATTERN``; no remote lookup is performed here.

        :return: date-based version string
        """
        today = datetime.now(UTC)
        return today.strftime(DATE_VERSION_PATTERN)

    def _download_data(self, version: str, outfile: Path) -> None:  # noqa: ARG002
        """Fetch the gene summary file and store it at the given path.

        :param version: version to acquire (not used -- the same fixed URL is
            requested regardless of its value)
        :param outfile: location and filename for final data file
        """
        # The response is passed through ``handle_gzip`` (presumably to unpack
        # the .gz payload before it lands at ``outfile`` -- confirm in
        # ``wags_tails.utils.downloads``).
        download_http(
            "https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_summary.gz",
            outfile,
            handler=handle_gzip,
            tqdm_params=self._tqdm_params,
        )
Binary file added tests/fixtures/gene_summary.gz
Binary file not shown.
77 changes: 77 additions & 0 deletions tests/test_ncbi_gene_summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""Test NCBI gene summary data source."""

import json
from datetime import UTC, datetime
from pathlib import Path

import pytest
import requests_mock

from wags_tails import NcbiGeneSummaryData


@pytest.fixture
def ncbi_gs_data_dir(base_data_dir: Path):
    """Create (if needed) and return the ``ncbi_gene_summary`` data directory"""
    gene_summary_dir = base_data_dir / "ncbi_gene_summary"
    # Missing parents are created; an already-existing directory is not an error
    gene_summary_dir.mkdir(parents=True, exist_ok=True)
    return gene_summary_dir


@pytest.fixture
def ncbi_gs(ncbi_gs_data_dir: Path):
    """Build a silent ``NcbiGeneSummaryData`` instance rooted at the test data directory"""
    fetcher = NcbiGeneSummaryData(ncbi_gs_data_dir, silent=True)
    return fetcher


@pytest.fixture(scope="module")
def info_response(fixture_dir):
    """Load ``gene_summary.gz`` from the fixture directory and parse it as JSON.

    NOTE(review): no test in this module requests this fixture, and it looks
    like a leftover copied from another data source's tests. The fixture file
    is a binary ``.gz`` archive, yet it is opened in text mode and handed to
    ``json.load`` -- this would presumably raise if the fixture were ever
    used. Confirm and consider removing it.
    """
    # Opened with the default (text) mode, unlike the binary read in ``ncbi_gs_file``
    with (fixture_dir / "gene_summary.gz").open() as f:
        return json.load(f)


@pytest.fixture(scope="module")
def ncbi_gs_file(fixture_dir):
    """Provide the raw bytes of the ``gene_summary.gz`` fixture file"""
    return (fixture_dir / "gene_summary.gz").read_bytes()


def test_get_latest(
    ncbi_gs: NcbiGeneSummaryData,
    ncbi_gs_data_dir: Path,
    ncbi_gs_file: bytes,
    monkeypatch,
):
    """Check ``get_latest`` argument validation, download, and reuse of a saved file."""
    # The two flags are mutually exclusive
    with pytest.raises(
        ValueError, match="Cannot set both `force_refresh` and `from_local`"
    ):
        ncbi_gs.get_latest(from_local=True, force_refresh=True)

    # A local-only lookup is expected to fail before anything has been fetched
    with pytest.raises(FileNotFoundError):
        ncbi_gs.get_latest(from_local=True)

    # Pin "now" so the date-derived version string is deterministic
    frozen_now = datetime(2026, 1, 1, tzinfo=UTC)

    class MockDateTime(datetime):
        @classmethod
        def now(cls, tz=None):  # noqa: ARG003
            return frozen_now

    monkeypatch.setattr("wags_tails.ncbi_gene_summary.datetime", MockDateTime)

    expected_path = ncbi_gs_data_dir / "ncbi_gene_summary_20260101.tsv"

    with requests_mock.Mocker() as m:
        m.get(
            "https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_summary.gz",
            content=ncbi_gs_file,
        )
        # First pass triggers the single HTTP request; the second pass must
        # return the same file without issuing another request.
        for _ in range(2):
            path, version = ncbi_gs.get_latest()
            assert path == expected_path
            assert path.exists()
            assert version == "20260101"
            assert m.call_count == 1
Loading