diff --git a/.gitignore b/.gitignore index 5316331..5a2e476 100644 --- a/.gitignore +++ b/.gitignore @@ -209,3 +209,4 @@ __marimo__/ # General *.DS_Store **.DS_Store +.idea/ diff --git a/src/core/config.py b/src/core/config.py index 1a1084f..f03c7e6 100644 --- a/src/core/config.py +++ b/src/core/config.py @@ -1,5 +1,5 @@ from pydantic_settings import BaseSettings - +from pathlib import Path class Settings(BaseSettings): # NCBI @@ -27,6 +27,6 @@ class Settings(BaseSettings): DEBUG: bool = True class Config: - env_file = ".env" + env_file = Path(__file__).resolve().parent.parent.parent / ".env" settings = Settings() \ No newline at end of file diff --git a/src/retrieval/pubmed.py b/src/retrieval/pubmed.py index ed418c1..a79a6e2 100644 --- a/src/retrieval/pubmed.py +++ b/src/retrieval/pubmed.py @@ -1,8 +1,28 @@ import requests +from lxml import etree from src.core.config import settings +def parse_data(xml_text: str) -> list[dict]: + article_list = [] + root = etree.fromstring(xml_text.encode("utf-8")) + + articles = root.findall(".//PubmedArticle") + + # For each article, extract fields using findtext + for article in articles: + pmid = article.findtext(".//PMID") + title = article.findtext(".//ArticleTitle") + abstract = article.findtext(".//AbstractText") + article_data = {"pmid": pmid, "title": title, "abstract": abstract} + if article_data["abstract"] is not None: + article_list.append(article_data) + + return article_list + + def search_pubmed(query: str, max_results: int = 10) -> list[dict]: base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" + article_list = [] esearch_params = { "db": "pubmed", @@ -29,7 +49,10 @@ def search_pubmed(query: str, max_results: int = 10) -> list[dict]: raw_data = requests.get(f"{base_url}efetch.fcgi", params=efetch_params) raw_data.raise_for_status() - return raw_data.text + article_list = parse_data(raw_data.text) + + return article_list except Exception as e: print(f"Error: {e}") +