DaneshjouLab
diff --git a/‎.gitignore‎
Lines changed: 16 additions & 3 deletions b/‎.gitignore‎
Lines changed: 16 additions & 3 deletions
diff --git a/‎README.MD‎
Lines changed: 27 additions & 9 deletions b/‎README.MD‎
Lines changed: 27 additions & 9 deletions
diff --git a/‎assets/annotations_diagram.svg‎
Lines changed: 1 addition & 0 deletions b/‎assets/annotations_diagram.svg‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎data/README.md‎
Lines changed: 17 additions & 0 deletions b/‎data/README.md‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎…h_articles/saved_data/pmcid_mapping.json‎ ‎data/pmcid_mapping.json‎src/fetch_articles/saved_data/pmcid_mapping.json renamed to data/pmcid_mapping.json b/‎…h_articles/saved_data/pmcid_mapping.json‎ ‎data/pmcid_mapping.json‎src/fetch_articles/saved_data/pmcid_mapping.json renamed to data/pmcid_mapping.json
diff --git a/‎pixi.toml‎
Lines changed: 3 additions & 1 deletion b/‎pixi.toml‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎src/benchmark/README.md‎
Lines changed: 4 additions & 0 deletions b/‎src/benchmark/README.md‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎src/benchmark/__init__.py‎ b/‎src/benchmark/__init__.py‎
diff --git a/‎src/dataset/README.md‎
Lines changed: 8 additions & 0 deletions b/‎src/dataset/README.md‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎src/dataset/__init__.py‎ b/‎src/dataset/__init__.py‎
@@ -19,6 +19,19 @@ __pycache__
 .env
 
 # data
-src/load_data/saved_data/
-src/fetch_articles/saved_data/downloaded_pmcids.json
-src/fetch_articles/saved_data/articles/
+data/articles/
+data/variantAnnotations/
+data/unique_pmcids.json
+data/pmid_list.json
+data/downloaded_pmcids.json
+
+*.zip
+*.tar.gz
+*.tar.bz2
+*.tar.xz
+*.tar.lzma
+*.tar.lz
+*.tar.lzo
+
+.DS_Store
+
@@ -6,19 +6,37 @@
 
 # AutoGKB
 
-
+Goals:
+1. Fetch annotated articles from variantAnnotations stored in PharmGKB API
+2. Create a general benchmark that outputs a score for an extraction system
+Given: Article, Ground Truth Variants (manually extracted and recorded in var_drug_ann.tsv)
+Input: Extracted Variants
+Output: Score 
+3. Build a system for extracting drug-related variant annotations from an article, i.e., associations in which the variant affects a drug dose, response, metabolism, etc.
+4. Continuously fetch new pharmacogenomic articles
 
 ## Description
 
 This repository contains Python scripts for running and building a Pharmacogenomic Agentic system to annotate and label genetic variants based on their phenotypical associations from journal articles. 
 
 
 ## Progress Tracker
-| Task | Status |
-| --- | --- |
-| Download the zip of variants from pharmgkb | ✅  |
-| Get a PMID list from the variants tsv (column PMID) | ✅ |
-| Convert the PMID to PMCID | ✅ |
-| Update to use non-official pmid to pmcid | |
-| Fetch the content from the PMCID |  |
-| Create pairing of annotations to article | |
+| Category | Task | Status |
+| --- | --- | --- |
+| Initial Download | Download the zip of variants from pharmgkb | ✅  |
+|                  | Get a PMID list from the variants tsv (column PMID) | ✅ |
+|                  | Convert the PMID to PMCID | ✅ |
+|                  | Update to use non-official pmid to pmcid (Aaron's method) | |
+|                  | Fetch the content from the PMCID | ✅ |
+| Benchmark        | Create pairings of annotations to articles | |
+|                  | Create a naive score of number of matches | |
+|                  | Create group wise score | |
+|                  | Look into advanced scoring based on distance from truth per term | |
+| Workflows        | Integrate Aaron's current approach | |
+|                  | Document on individual annotation meanings | |
+|                  | Delegate annotation groupings to team members | |
+| New Article Fetching | Replicate PharmGKB current workflow | |
+
+## System Overview
+![Annotations Diagram](assets/annotations_diagram.svg)
+
@@ -0,0 +1,17 @@
+# Data
+
+This directory contains the primary data files used by the AutoGKB project.
+
+## Directory Structure
+
+- **articles/** - Contains XML files of articles from PubMed Central (PMC), identified by their PMCID (e.g., PMC1234567.xml). These articles are used for text mining and information extraction.
+
+- **variantAnnotations/** - Contains clinical variant annotations and related data:
+  - `var_drug_ann.tsv` - Variant-drug annotations. This is what is used in this repo.
+  - This can be downloaded using `download_and_extract_variant_annotations` from the `load_variants` module
+
+- **Support Files**:
+  - `pmcid_mapping.json` - Maps between PMIDs and PMCIDs
+  - `unique_pmcids.json` - List of unique PMCIDs in the dataset
+  - `pmid_list.json` - List of PMIDs in the dataset
+  - `downloaded_pmcids.json` - Tracking which PMCIDs have been downloaded
@@ -12,7 +12,9 @@ platforms = ["osx-arm64"]
 version = "0.1.0"
 
 [tasks]
-update-downloaded-pmcids = "python -c 'from src.fetch_articles.article_downloader import update_downloaded_pmcids; update_downloaded_pmcids()'"
+download-variants = "python -m src.load_variants.load_clinical_variants"
+update-download-map = "python -c 'from src.fetch_articles.article_downloader import update_downloaded_pmcids; update_downloaded_pmcids()'"
+download-articles = "python -m src.fetch_articles.article_downloader"
 
 [dependencies]
 seaborn = ">=0.13.2,<0.14"
 
@@ -0,0 +1,4 @@
+# Benchmark
+
+## Functions
+1. Calculate the naive difference between an extracted variant and the ground truth variant on Variant Annotation ID
@@ -0,0 +1,8 @@
+# Dataset
+
+## Goal
+Convert the loaded files into a dataset where the annotations and raw text are paired with each other
+
+## Subgoals
+1. Understand the formats of the annotations
+2. Choose a format for the dataset