From 7e2b6f942f174aa1942e189d45be07f16a20e616 Mon Sep 17 00:00:00 2001
From: AndrewVFranco <129307231+AndrewVFranco@users.noreply.github.com>
Date: Sat, 4 Apr 2026 02:36:36 -0700
Subject: [PATCH 1/3] Add pubmed data retrieval function

---
 .idea/.gitignore        | 10 ----------
 requirements.txt        |  6 +++++-
 src/core/config.py      |  4 +++-
 src/retrieval/pubmed.py | 35 +++++++++++++++++++++++++++++++++++
 4 files changed, 43 insertions(+), 12 deletions(-)
 delete mode 100644 .idea/.gitignore
 create mode 100644 src/retrieval/pubmed.py

diff --git a/.idea/.gitignore b/.idea/.gitignore
deleted file mode 100644
index ab1f416..0000000
--- a/.idea/.gitignore
+++ /dev/null
@@ -1,10 +0,0 @@
-# Default ignored files
-/shelf/
-/workspace.xml
-# Ignored default folder with query files
-/queries/
-# Datasource local storage ignored files
-/dataSources/
-/dataSources.local.xml
-# Editor-based HTTP Client requests
-/httpRequests/
diff --git a/requirements.txt b/requirements.txt
index f1fa448..109cd41 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,8 @@ python-json-logger>=2.0.0
 ruff>=0.4.0
 
 # Pytest
-pytest>=9.0.0
\ No newline at end of file
+pytest>=9.0.0
+
+# Pubmed requests
+requests>=2.33.0
+lxml>=6.0.2
\ No newline at end of file
diff --git a/src/core/config.py b/src/core/config.py
index 98fc734..1a1084f 100644
--- a/src/core/config.py
+++ b/src/core/config.py
@@ -27,4 +27,6 @@ class Settings(BaseSettings):
     DEBUG: bool = True
 
     class Config:
-        env_file = ".env"
\ No newline at end of file
+        env_file = ".env"
+
+settings = Settings()
\ No newline at end of file
diff --git a/src/retrieval/pubmed.py b/src/retrieval/pubmed.py
new file mode 100644
index 0000000..ed418c1
--- /dev/null
+++ b/src/retrieval/pubmed.py
@@ -0,0 +1,35 @@
+import requests
+from src.core.config import settings
+
+def search_pubmed(query: str, max_results: int = 10) -> list[dict]:
+    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+
+    esearch_params = {
+        "db": "pubmed",
+        "term": query,
+        "retmax": max_results,
+        "retmode": "json",
+        "api_key": settings.NCBI_API_KEY
+    }
+
+    try:
+        response = requests.get(f"{base_url}esearch.fcgi", params=esearch_params)
+        response.raise_for_status()
+        pmids = response.json()["esearchresult"]["idlist"]
+
+        pmids = ",".join(pmids)
+
+        efetch_params = {
+            "db": "pubmed",
+            "id": pmids,
+            "retmode": "XML",
+            "api_key": settings.NCBI_API_KEY
+        }
+
+        raw_data = requests.get(f"{base_url}efetch.fcgi", params=efetch_params)
+        raw_data.raise_for_status()
+
+        return raw_data.text
+    except Exception as e:
+        print(f"Error: {e}")
+

From 47a0e7a7c336fe74cf65fda25391d9b2f3235994 Mon Sep 17 00:00:00 2001
From: AndrewVFranco <129307231+AndrewVFranco@users.noreply.github.com>
Date: Sat, 4 Apr 2026 02:49:02 -0700
Subject: [PATCH 2/3] Add xml parsing helper function

---
 .gitignore              |  1 +
 src/retrieval/pubmed.py | 25 ++++++++++++++++++++++++-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 5316331..5a2e476 100644
--- a/.gitignore
+++ b/.gitignore
@@ -209,3 +209,4 @@ __marimo__/
 # General
 *.DS_Store
 **.DS_Store
+.idea/
diff --git a/src/retrieval/pubmed.py b/src/retrieval/pubmed.py
index ed418c1..a79a6e2 100644
--- a/src/retrieval/pubmed.py
+++ b/src/retrieval/pubmed.py
@@ -1,8 +1,28 @@
 import requests
+from lxml import etree
 from src.core.config import settings
 
+def parse_data(xml_text: str) -> list[dict]:
+    article_list = []
+    root = etree.fromstring(xml_text.encode("utf-8"))
+
+    articles = root.findall(".//PubmedArticle")
+
+    # For each article, extract fields using findtext
+    for article in articles:
+        pmid = article.findtext(".//PMID")
+        title = article.findtext(".//ArticleTitle")
+        abstract = article.findtext(".//AbstractText")
+        article_data = {"pmid": pmid, "title": title, "abstract": abstract}
+        if article_data["abstract"] is not None:
+            article_list.append(article_data)
+
+    return article_list
+
+
 def search_pubmed(query: str, max_results: int = 10) -> list[dict]:
     base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+    article_list = []
 
     esearch_params = {
         "db": "pubmed",
         "term": query,
@@ -29,7 +49,10 @@ def search_pubmed(query: str, max_results: int = 10) -> list[dict]:
         raw_data = requests.get(f"{base_url}efetch.fcgi", params=efetch_params)
         raw_data.raise_for_status()
 
-        return raw_data.text
+        article_list = parse_data(raw_data.text)
+
+        return article_list
     except Exception as e:
         print(f"Error: {e}")
 
+

From 5f25b347d7029c17b95da5a285a5b11ad80ea30f Mon Sep 17 00:00:00 2001
From: AndrewVFranco <129307231+AndrewVFranco@users.noreply.github.com>
Date: Sat, 4 Apr 2026 02:54:34 -0700
Subject: [PATCH 3/3] Fix path to key in config

---
 src/core/config.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/core/config.py b/src/core/config.py
index 1a1084f..f03c7e6 100644
--- a/src/core/config.py
+++ b/src/core/config.py
@@ -1,5 +1,5 @@
 from pydantic_settings import BaseSettings
-
+from pathlib import Path
 
 class Settings(BaseSettings):
     # NCBI
@@ -27,6 +27,6 @@ class Settings(BaseSettings):
     DEBUG: bool = True
 
     class Config:
-        env_file = ".env"
+        env_file = Path(__file__).resolve().parent.parent.parent / ".env"
 
 settings = Settings()
\ No newline at end of file