From 71a688a4a04295429c6c4ae6044f79fc81674b47 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 24 Apr 2026 11:56:48 -0400 Subject: [PATCH 1/3] feat: add ncbi gene summary --- src/wags_tails/__init__.py | 2 + src/wags_tails/base_source.py | 15 +++--- src/wags_tails/ncbi_gene_summary.py | 32 ++++++++++++ tests/fixtures/gene_summary.gz | Bin 0 -> 626 bytes tests/test_ncbi_gene_summary.py | 77 ++++++++++++++++++++++++++++ 5 files changed, 119 insertions(+), 7 deletions(-) create mode 100644 src/wags_tails/ncbi_gene_summary.py create mode 100644 tests/fixtures/gene_summary.gz create mode 100644 tests/test_ncbi_gene_summary.py diff --git a/src/wags_tails/__init__.py b/src/wags_tails/__init__.py index d9d4b47..69923e1 100644 --- a/src/wags_tails/__init__.py +++ b/src/wags_tails/__init__.py @@ -18,6 +18,7 @@ from wags_tails.moa import MoaData from wags_tails.mondo import MondoData from wags_tails.ncbi import NcbiGeneData, NcbiGenomeData +from wags_tails.ncbi_gene_summary import NcbiGeneSummaryData from wags_tails.ncbi_lrg_refseqgene import NcbiLrgRefSeqGeneData from wags_tails.ncbi_mane import NcbiManeRefSeqGenomicData, NcbiManeSummaryData from wags_tails.ncit import NcitData @@ -48,6 +49,7 @@ "MoaData", "MondoData", "NcbiGeneData", + "NcbiGeneSummaryData", "NcbiGenomeData", "NcbiLrgRefSeqGeneData", "NcbiManeRefSeqGenomicData", diff --git a/src/wags_tails/base_source.py b/src/wags_tails/base_source.py index f6f4ccc..1e5c48b 100644 --- a/src/wags_tails/base_source.py +++ b/src/wags_tails/base_source.py @@ -31,7 +31,7 @@ class DataSource(abc.ABC): # required attributes _src_name: str - _filetype: str + _filetype: str | None _versioned: bool = True def __init__(self, data_dir: Path | None = None, silent: bool = True) -> None: @@ -89,15 +89,16 @@ def get_latest( msg = "Cannot set both `force_refresh` and `from_local`" raise ValueError(msg) + filetype_suffix = f".{self._filetype}" if self._filetype else "" if from_local: file_glob = ( - f"{self._src_name}_*.{self._filetype}" + f"{self._src_name}_*{filetype_suffix}" if self._versioned - else f"{self._src_name}.{self._filetype}" + else f"{self._src_name}{filetype_suffix}" ) file_path = get_latest_local_file(self.data_dir, file_glob) version = ( - parse_file_version(file_path, f"{self._src_name}_(.+).{self._filetype}") + parse_file_version(file_path, f"{self._src_name}_(.+){filetype_suffix}") if self._versioned else "" ) @@ -105,16 +106,16 @@ def get_latest( latest_version = self._get_latest_version() latest_file = ( - f"{self._src_name}_{latest_version}.{self._filetype}" + f"{self._src_name}_{latest_version}{filetype_suffix}" if self._versioned - else f"{self._src_name}.{self._filetype}" + else f"{self._src_name}{filetype_suffix}" ) latest_file_path = self.data_dir / latest_file if (not force_refresh) and latest_file_path.exists(): _logger.debug( "Found existing file, %s, matching latest version %s.", latest_file_path.name, - latest_version if latest_version else "(unversioned)", + latest_version or "(unversioned)", ) return latest_file_path, latest_version self._download_data(latest_version, latest_file_path) diff --git a/src/wags_tails/ncbi_gene_summary.py b/src/wags_tails/ncbi_gene_summary.py new file mode 100644 index 0000000..166c3a6 --- /dev/null +++ b/src/wags_tails/ncbi_gene_summary.py @@ -0,0 +1,32 @@ +"""Get NCBI gene summary file + +Updated daily at https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_summary.gz + +See https://ftp.ncbi.nlm.nih.gov/gene/DATA/README for more info +""" + +from datetime import UTC, datetime +from pathlib import Path + +from wags_tails.base_source import DataSource +from wags_tails.utils.downloads import download_http, handle_gzip +from wags_tails.utils.versioning import DATE_VERSION_PATTERN + + +class NcbiGeneSummaryData(DataSource): + """Provide access to NCBI gene_summary file""" + + _src_name = "ncbi_gene_summary" + _filetype = "tsv" + + def _get_latest_version(self) -> str: + return datetime.now(UTC).strftime(DATE_VERSION_PATTERN) + + def _download_data(self, version: str, outfile: Path) -> None: # noqa: ARG002 + """Download data file to specified location. + + :param version: version to acquire + :param outfile: location and filename for final data file + """ + url = "https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_summary.gz" + download_http(url, outfile, handler=handle_gzip, tqdm_params=self._tqdm_params) diff --git a/tests/fixtures/gene_summary.gz b/tests/fixtures/gene_summary.gz new file mode 100644 index 0000000000000000000000000000000000000000..da639f5cc5bda68cc216e31e3aa8c2b06ab39410 GIT binary patch literal 626 zcmV-&0*(D2iwFo&z3OQI17~G!WnXi3ZEaz4c>wKGyOP^546L26Kxt=A9^1K$<1&+L z+@$cO%cP(vSi%eiG9cwU-`59}{JP{1VpWnT5ZGPpuGY)tWVv`;Z$51v*OMLYpYivk zB`bdS{cD72OZhniBM7nal3==86st$zGz0crSYHv>&{wC%q`s2);N1Z$eSzR(Dv}tE zRwyL$Y#a#Ioiq6Z{2pp#1bhvGG3Fz^)pjnCMd!3kz)NkIVjH16g4yrUraZLtsnMZ| zESZuVCH=`2^uw;*+l*}v1Ct)!8BM7OwV|FXN_T8gOhnjj!9)l~x$13eLeR3@H`ka7 z`KZ11CV7+;}g`qn0mk@T=L z0~{F49^j=R+vNHCPI~-*;jqW#gELVL15dgxc@If(=<`b023X4k-@8yt}NBh^?34rQZdy&N6wE1#u=`K_rK4v&z=D!Qm@RciZQ07goyK z*GzAyzBN)~8jPYq#do!`ZUaho8kCt!FeJIz3!OZ7Wznj`^N`lF3alnp?`C|rMpVxe zH&oK#N`VSbrhU5@HFv#%GSD6(sa`uhs~W(@rp`Z4r(75dMMG4x~T$I#zb=$B*Y$Iy?V|L;Tp M2Nz` Date: Fri, 24 Apr 2026 12:35:40 -0400 Subject: [PATCH 2/3] rm --- src/wags_tails/base_source.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wags_tails/base_source.py b/src/wags_tails/base_source.py index 1e5c48b..db88b95 100644 --- a/src/wags_tails/base_source.py +++ b/src/wags_tails/base_source.py @@ -31,7 +31,7 @@ class DataSource(abc.ABC): # required attributes _src_name: str - _filetype: str | None + _filetype: str _versioned: bool = True def __init__(self, data_dir: Path | None = None, silent: bool = True) -> None: From e1ada81cce73d157f5dd5c9bb833184e933bfc0b Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 24 Apr 2026 12:36:09 -0400 Subject: [PATCH 3/3] revert --- src/wags_tails/base_source.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/wags_tails/base_source.py b/src/wags_tails/base_source.py index db88b95..f6f4ccc 100644 --- a/src/wags_tails/base_source.py +++ b/src/wags_tails/base_source.py @@ -89,16 +89,15 @@ def get_latest( msg = "Cannot set both `force_refresh` and `from_local`" raise ValueError(msg) - filetype_suffix = f".{self._filetype}" if self._filetype else "" if from_local: file_glob = ( - f"{self._src_name}_*{filetype_suffix}" + f"{self._src_name}_*.{self._filetype}" if self._versioned - else f"{self._src_name}{filetype_suffix}" + else f"{self._src_name}.{self._filetype}" ) file_path = get_latest_local_file(self.data_dir, file_glob) version = ( - parse_file_version(file_path, f"{self._src_name}_(.+){filetype_suffix}") + parse_file_version(file_path, f"{self._src_name}_(.+).{self._filetype}") if self._versioned else "" ) @@ -106,16 +105,16 @@ def get_latest( latest_version = self._get_latest_version() latest_file = ( - f"{self._src_name}_{latest_version}{filetype_suffix}" + f"{self._src_name}_{latest_version}.{self._filetype}" if self._versioned - else f"{self._src_name}{filetype_suffix}" + else f"{self._src_name}.{self._filetype}" ) latest_file_path = self.data_dir / latest_file if (not force_refresh) and latest_file_path.exists(): _logger.debug( "Found existing file, %s, matching latest version %s.", latest_file_path.name, - latest_version or "(unversioned)", + latest_version if latest_version else "(unversioned)", ) return latest_file_path, latest_version self._download_data(latest_version, latest_file_path)