diff --git a/src/wags_tails/__init__.py b/src/wags_tails/__init__.py
index d9d4b47..69923e1 100644
--- a/src/wags_tails/__init__.py
+++ b/src/wags_tails/__init__.py
@@ -18,6 +18,7 @@
 from wags_tails.moa import MoaData
 from wags_tails.mondo import MondoData
 from wags_tails.ncbi import NcbiGeneData, NcbiGenomeData
+from wags_tails.ncbi_gene_summary import NcbiGeneSummaryData
 from wags_tails.ncbi_lrg_refseqgene import NcbiLrgRefSeqGeneData
 from wags_tails.ncbi_mane import NcbiManeRefSeqGenomicData, NcbiManeSummaryData
 from wags_tails.ncit import NcitData
@@ -48,6 +49,7 @@
     "MoaData",
     "MondoData",
     "NcbiGeneData",
+    "NcbiGeneSummaryData",
     "NcbiGenomeData",
     "NcbiLrgRefSeqGeneData",
     "NcbiManeRefSeqGenomicData",
diff --git a/src/wags_tails/ncbi_gene_summary.py b/src/wags_tails/ncbi_gene_summary.py
new file mode 100644
index 0000000..166c3a6
--- /dev/null
+++ b/src/wags_tails/ncbi_gene_summary.py
@@ -0,0 +1,39 @@
+"""Get NCBI gene summary file
+
+Updated daily at https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_summary.gz
+
+See https://ftp.ncbi.nlm.nih.gov/gene/DATA/README for more info
+"""
+
+from datetime import UTC, datetime
+from pathlib import Path
+
+from wags_tails.base_source import DataSource
+from wags_tails.utils.downloads import download_http, handle_gzip
+from wags_tails.utils.versioning import DATE_VERSION_PATTERN
+
+
+class NcbiGeneSummaryData(DataSource):
+    """Provide access to NCBI gene_summary file"""
+
+    _src_name = "ncbi_gene_summary"
+    _filetype = "tsv"
+
+    def _get_latest_version(self) -> str:
+        """Retrieve latest version value.
+
+        The source file is updated daily, so the current UTC date is used as the
+        version.
+
+        :return: latest version value
+        """
+        return datetime.now(UTC).strftime(DATE_VERSION_PATTERN)
+
+    def _download_data(self, version: str, outfile: Path) -> None:  # noqa: ARG002
+        """Download data file to specified location.
+
+        :param version: version to acquire (unused: the download URL is not versioned)
+        :param outfile: location and filename for final data file
+        """
+        url = "https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_summary.gz"
+        download_http(url, outfile, handler=handle_gzip, tqdm_params=self._tqdm_params)
diff --git a/tests/fixtures/gene_summary.gz b/tests/fixtures/gene_summary.gz
new file mode 100644
index 0000000..da639f5
Binary files /dev/null and b/tests/fixtures/gene_summary.gz differ
diff --git a/tests/test_ncbi_gene_summary.py b/tests/test_ncbi_gene_summary.py
new file mode 100644
index 0000000..be14639
--- /dev/null
+++ b/tests/test_ncbi_gene_summary.py
@@ -0,0 +1,73 @@
+"""Test NCBI gene summary data source."""
+
+from datetime import UTC, datetime
+from pathlib import Path
+
+import pytest
+import requests_mock
+
+from wags_tails import NcbiGeneSummaryData
+
+
+@pytest.fixture
+def ncbi_gs_data_dir(base_data_dir: Path):
+    """Provide fixture for ncbi gene summary wags-tails directory"""
+    directory = base_data_dir / "ncbi_gene_summary"
+    directory.mkdir(exist_ok=True, parents=True)
+    return directory
+
+
+@pytest.fixture
+def ncbi_gs(ncbi_gs_data_dir: Path):
+    """Provide fixture for fetcher instance"""
+    return NcbiGeneSummaryData(ncbi_gs_data_dir, silent=True)
+
+
+@pytest.fixture(scope="module")
+def ncbi_gs_file(fixture_dir):
+    """Provide fixture for raw bytes of the gzipped NCBI gene summary file"""
+    with (fixture_dir / "gene_summary.gz").open("rb") as f:
+        return f.read()
+
+
+def test_get_latest(
+    ncbi_gs: NcbiGeneSummaryData,
+    ncbi_gs_data_dir: Path,
+    ncbi_gs_file: bytes,
+    monkeypatch,
+):
+    """Test argument validation, initial download, and reuse of the local file."""
+    with pytest.raises(
+        ValueError, match="Cannot set both `force_refresh` and `from_local`"
+    ):
+        ncbi_gs.get_latest(from_local=True, force_refresh=True)
+
+    with pytest.raises(FileNotFoundError):
+        ncbi_gs.get_latest(from_local=True)
+
+    # The version comes from the current date, so pin "now" to keep the expected
+    # file name and version string stable.
+    class MockDateTime(datetime):
+        @classmethod
+        def now(cls, tz=None):  # noqa: ARG003
+            return datetime(2026, 1, 1, tzinfo=UTC)
+
+    monkeypatch.setattr("wags_tails.ncbi_gene_summary.datetime", MockDateTime)
+
+    with requests_mock.Mocker() as m:
+        m.get(
+            "https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_summary.gz",
+            content=ncbi_gs_file,
+        )
+        path, version = ncbi_gs.get_latest()
+        assert path == ncbi_gs_data_dir / "ncbi_gene_summary_20260101.tsv"
+        assert path.exists()
+        assert version == "20260101"
+        assert m.call_count == 1
+
+        # A second call must reuse the file fetched above without a new request.
+        path, version = ncbi_gs.get_latest()
+        assert path == ncbi_gs_data_dir / "ncbi_gene_summary_20260101.tsv"
+        assert path.exists()
+        assert version == "20260101"
+        assert m.call_count == 1