From c1884a83b63d7caa402aeaaa7b0e7d208ba3adbf Mon Sep 17 00:00:00 2001 From: AndrewVFranco <129307231+AndrewVFranco@users.noreply.github.com> Date: Tue, 7 Apr 2026 22:21:16 -0700 Subject: [PATCH 1/3] Change database dependencies from ChromaDB to Pinecone --- .env.example | 4 ++-- requirements.txt | 17 ++++++++++++++--- src/core/config.py | 6 +++--- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/.env.example b/.env.example index ed87c53..86dd02e 100644 --- a/.env.example +++ b/.env.example @@ -5,8 +5,8 @@ OLLAMA_SERVER_PORT=11434 OLLAMA_MODEL=llama3.2 OLLAMA_MAX_TOKENS=1024 OLLAMA_TEMPERATURE=0.1 -CHROMA_DB_LOCATION=data/chroma_db -CHROMA_COLLECTION_NAME=pubmed_abstracts +PINECONE_API_KEY=your_key_here +PINECONE_INDEX_NAME=your_index_name MLFLOW_TRACKING_URI=http://localhost:5000 MLFLOW_ARTIFACT_LOCATION=logs/mlflow FASTAPI_HOST=0.0.0.0 diff --git a/requirements.txt b/requirements.txt index b09a671..902c846 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,6 +15,17 @@ pytest>=9.0.0 requests>=2.33.0 lxml>=6.0.2 -# ChromaDB -chromadb>=0.4.0 -sentence-transformers>=2.2.0 \ No newline at end of file +# Pinecone +pinecone>=3.0.0 + +# Agent framework +langchain>=0.2.0 +langchain-openai>=0.1.0 +langchain-community>=0.2.0 +langgraph>=0.1.0 + +# Redis caching +redis>=5.0.0 + +# OpenAI LLM +openai>=1.0.0 \ No newline at end of file diff --git a/src/core/config.py b/src/core/config.py index e7bef83..303a628 100644 --- a/src/core/config.py +++ b/src/core/config.py @@ -15,9 +15,9 @@ class Settings(BaseSettings): OLLAMA_MAX_TOKENS: int = 1024 OLLAMA_TEMPERATURE: float = 0.1 - # ChromaDB - CHROMA_DB_LOCATION: str = str(Path(__file__).resolve().parent.parent.parent / "data" / "chroma_db") - CHROMA_COLLECTION_NAME: str = "pubmed_abstracts" + # Pinecone + PINECONE_API_KEY = str + PINECONE_INDEX_NAME = str # MLflow MLFLOW_TRACKING_URI: str = "http://localhost:5000" From eddad7c6a07dba41fa36acd4629ddf9d0284fdb5 Mon Sep 17 00:00:00 2001 From: AndrewVFranco <129307231+AndrewVFranco@users.noreply.github.com> Date: Tue, 7 Apr 2026 22:24:05 -0700 Subject: [PATCH 2/3] Add BioBERT embedding model to requirements.txt --- requirements.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/requirements.txt b/requirements.txt index 902c846..85dd180 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,6 +18,9 @@ lxml>=6.0.2 # Pinecone pinecone>=3.0.0 +# BioBERT embedding model +sentence-transformers>=2.2.0 + # Agent framework langchain>=0.2.0 langchain-openai>=0.1.0 From 68b5802f4dfc677964acbfb7f8cd3611f85a910a Mon Sep 17 00:00:00 2001 From: AndrewVFranco <129307231+AndrewVFranco@users.noreply.github.com> Date: Tue, 7 Apr 2026 22:43:34 -0700 Subject: [PATCH 3/3] Update vector_store to use Pinecone --- src/core/config.py | 4 ++-- src/retrieval/vector_store.py | 45 ++++++++++++++++++++++------------- 2 files changed, 31 insertions(+), 18 deletions(-) diff --git a/src/core/config.py b/src/core/config.py index 303a628..5c210a8 100644 --- a/src/core/config.py +++ b/src/core/config.py @@ -16,8 +16,8 @@ class Settings(BaseSettings): OLLAMA_TEMPERATURE: float = 0.1 # Pinecone - PINECONE_API_KEY = str - PINECONE_INDEX_NAME = str + PINECONE_API_KEY: str + PINECONE_INDEX_NAME: str # MLflow MLFLOW_TRACKING_URI: str = "http://localhost:5000" diff --git a/src/retrieval/vector_store.py b/src/retrieval/vector_store.py index 03f8a8b..a2ab6f8 100644 --- a/src/retrieval/vector_store.py +++ b/src/retrieval/vector_store.py @@ -1,7 +1,9 @@ -import chromadb +from pinecone import Pinecone from sentence_transformers import SentenceTransformer from src.core.config import settings import os +from src.retrieval.pubmed import search_pubmed +import sys os.environ["HUGGING_FACE_HUB_TOKEN"] = settings.HF_TOKEN model = SentenceTransformer("pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb") @@ -16,30 +18,41 @@ def _embed_text(text: str) -> list[float]: def get_collection(): global _collection if _collection is None: - print(settings.CHROMA_DB_LOCATION) - client = chromadb.PersistentClient(path=settings.CHROMA_DB_LOCATION) - _collection = client.get_or_create_collection(name=settings.CHROMA_COLLECTION_NAME) + pc = Pinecone(api_key=settings.PINECONE_API_KEY) + _collection = pc.Index(settings.PINECONE_INDEX_NAME) return _collection def add_abstracts(abstracts: list[dict]): - ids = [] - embeddings = [] - documents = [] - metadatas = [] + data_list = [] for item in abstracts: - ids.append(item["pmid"]) - embeddings.append(_embed_text(item["abstract"])) - documents.append(item["abstract"]) - metadatas.append({"pmid": item["pmid"], "title": item["title"]}) - - get_collection().add(ids=ids, embeddings=embeddings, documents=documents, metadatas=metadatas) + data = {'id': item["pmid"], + 'values': _embed_text(item["abstract"]), + 'metadata': { + "title": item["title"], + "abstract": item["abstract"], + "pmid": item["pmid"] + } + } + data_list.append(data) + + get_collection().upsert(vectors=data_list) def query_abstracts(query: str, n_results: int = 5) -> list[dict]: embedding = _embed_text(query) results = get_collection().query( - query_embeddings=[embedding], - n_results=n_results + vector=embedding, + top_k=n_results, + include_metadata=True ) return results + +def main(): + search_results = search_pubmed("myocardial infarction", max_results=5) + add_abstracts(search_results) + results = query_abstracts("chest pain treatment") + print(results) + +if __name__ == "__main__": + sys.exit(main())