Skip to content

Commit 55c443b

Browse files
feat: bulleted study params
1 parent 0872b54 commit 55c443b

10 files changed

Lines changed: 525 additions & 284 deletions

data/annotations/PMC11730665.json

Lines changed: 84 additions & 42 deletions
Large diffs are not rendered by default.

data/annotations/PMC4737107.json

Lines changed: 94 additions & 86 deletions
Large diffs are not rendered by default.

data/annotations/PMC5712579.json

Lines changed: 86 additions & 47 deletions
Large diffs are not rendered by default.

data/annotations/PMC5728534.json

Lines changed: 92 additions & 49 deletions
Large diffs are not rendered by default.

data/annotations/PMC5749368.json

Lines changed: 97 additions & 35 deletions
Large diffs are not rendered by default.

pixi.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ download-articles = "python -m src.fetch_articles.article_downloader"
1818
download-data = "gdown --fuzzy https://drive.google.com/file/d/1qtQWvi0x_k5_JofgrfsgkWzlIdb6isr9/view && unzip autogkb-data.zip && rm autogkb-data.zip"
1919
setup-repo = "pixi install && pixi run download-data"
2020
copy-markdown = "python -m src.copy_markdown"
21-
annotation-pipeline = "python -m src.components.annotation_pipeline"
21+
annotation-pipeline = "python -m src.annotation_pipeline"
2222
test-citations = "python -m src.citations.one_shot_citations"
2323

2424
[dependencies]

src/annotation_pipeline.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def run(self, save_path: str = "data/annotations"):
6767
logger.info(
6868
f"Adding Citations to Study Parameters using OneShotCitations with model {self.citation_model}"
6969
)
70-
for field_name in self.study_parameters.model_fields:
70+
for field_name in self.study_parameters.__class__.model_fields:
7171
if (
7272
field_name != "additional_resource_links"
7373
): # Skip non-ParameterWithCitations field

src/citations/one_shot_citations.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
- P-value: {annotation.p_value}
3737
3838
From the following article text, find the top sentence from the article that contains the p-value for the pharmacogenomic relationship.
39-
If a table provides the exact p-value, return the table header (## Table X: ..., etc.) as your sentence.
39+
If a table provides the exact p-value, return the table header (## Table X: ..., etc.) as your sentence. But prefer to use a sentence from the article text if it also provides the p-value.
4040
Article text:
4141
"{article_text}"
4242

src/deprecated/variant_association_pipeline.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
get_association_types,
2020
AssociationType,
2121
)
22-
from src.drug_annotation import extract_drug_annotations
22+
from src.deprecated.drug_annotation import extract_drug_annotations
2323
from src.deprecated.phenotype_annotation_extraction import (
2424
extract_phenotype_annotations,
2525
)

src/study_parameters.py

Lines changed: 68 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,44 @@
11
from pydantic import BaseModel
2-
from typing import List, Optional
2+
from typing import List, Optional, Union
33
from src.inference import PMCIDGenerator
44
from loguru import logger
55
import os
66
import json
7+
import re
78

89

910
class ParameterWithCitations(BaseModel):
1011
"""Model for a parameter with its content and citations"""
1112

12-
content: str
13+
content: Union[str, List[str]] # Can store either string or list of strings
1314
citations: Optional[List[str]] = None
1415

1516

17+
# A single leading list marker:
#   "•" (optionally followed by whitespace),
#   "-", "*" or "+" followed by whitespace,
#   or an ordinal such as "1.", "2)" or "3]" followed by whitespace / end of line.
# Whitespace is required after "-", "*", "+" and ordinals so that real content
# such as "-0.5", "1.5 mg", "95% CI" or markdown bold ("**term**") is preserved.
_BULLET_MARKER = re.compile(r"^(?:•\s*|[-*+]\s+|\d+[.)\]](?:\s+|$))")

# Characters that can make up a line holding only markers and no content.
_MARKER_ONLY_CHARS = "•-*+ \t"


def parse_bullets_to_list(text: str) -> List[str]:
    """Parse bulleted text into a list of strings.

    Each non-blank line becomes one item, with exactly one leading list
    marker (bullet, dash, asterisk, plus, or "1." / "1)" style ordinal)
    removed. Only the marker itself is stripped: numbers, signs and markdown
    emphasis that begin the actual content are kept, so "- 50 patients"
    yields "50 patients" and "* **Age**: 45" yields "**Age**: 45".

    Args:
        text: Raw text, typically a markdown bulleted list, one item per line.

    Returns:
        The cleaned items in their original order. An empty or
        whitespace-only input returns an empty list. If no line yields any
        content, the stripped input is returned as a single item.
    """
    if not text or not text.strip():
        return []

    bullets = []
    for raw_line in text.strip().split("\n"):
        line = raw_line.strip()
        # Skip blank lines and lines made up solely of marker characters
        # (e.g. a lone "-" or a "---" rule); they carry no content.
        if not line.strip(_MARKER_ONLY_CHARS):
            continue

        # count=1 plus the "^" anchor guarantees only the marker is removed,
        # never numeric content that follows it.
        cleaned_line = _BULLET_MARKER.sub("", line, count=1).strip()
        if cleaned_line:
            bullets.append(cleaned_line)

    # If no bullets were found, return the original text as a single item
    return bullets if bullets else [text.strip()]
41+
1642
class StudyParameters(BaseModel):
1743
summary: ParameterWithCitations
1844
study_type: ParameterWithCitations
@@ -22,6 +48,7 @@ class StudyParameters(BaseModel):
2248
allele_frequency: ParameterWithCitations
2349
additional_resource_links: List[str]
2450

51+
bulleted_output_queue = "Format the response as a bulleted list. Keep each bullet point concise (1-2 sentences maximum). If the format of the response is term: value, then have the term bolded (**term**) and the value in plain text. Do not include any other text and use markdown formatting for your response."
2552

2653
class StudyParametersGenerator:
2754
"""
@@ -43,7 +70,8 @@ def __init__(self, pmcid: str, model: str = "gpt-4o"):
4370
def get_summary(self) -> str:
4471
"""Extract a short 2-3 sentence summary of the study."""
4572
prompt = "Provide a short 2-3 sentence summary of the study motivation, design, and results."
46-
return self.generator.generate(prompt)
73+
output_queues = "Format the response as a short paragraph without using any bullet points."
74+
return self.generator.generate(prompt + output_queues)
4775

4876
def get_study_type(self) -> str:
4977
"""Extract the study type with explanation."""
@@ -68,34 +96,37 @@ def get_study_type(self) -> str:
6896

6997
return self.generator.generate(prompt)
7098

71-
def get_participant_info(self) -> str:
99+
def get_participant_info(self) -> List[str]:
72100
"""Extract participant information with explanation."""
73101
prompt = """What are the details about the participants in this study? Include age, gender, ethnicity, pre-existing conditions and any other relevant characteristics. Also breakdown this information by study group if applicable."""
74-
output_queues = "Don't use bullets points, use plain text. Keep response length to one paragraph (4-5 sentences) maximum."
75-
76-
return self.generator.generate(prompt + output_queues)
102+
output_queues = bulleted_output_queue
103+
response = self.generator.generate(prompt + output_queues)
104+
return parse_bullets_to_list(response)
77105

78-
def get_study_design(self) -> str:
106+
def get_study_design(self) -> List[str]:
79107
"""Extract study design information with explanation."""
80108
prompt = """Describe the study design, including the study population, sample size, and any other relevant details about how the study was conducted."""
81-
output_queues = "Don't use bullets points, use plain text. Keep response length to one paragraph (4-5 sentences) maximum."
82-
return self.generator.generate(prompt + output_queues)
109+
output_queues = bulleted_output_queue
110+
response = self.generator.generate(prompt + output_queues)
111+
return parse_bullets_to_list(response)
83112

84-
def get_study_results(self) -> str:
113+
def get_study_results(self) -> List[str]:
85114
"""Extract study results with explanation."""
86115
prompt = """What are the main study results and findings? Pay key attention to report any ratio statistics (hazard ratio, odds ratio, etc.) and p-values."""
87-
output_queues = "Don't use bullets points, use plain text. Keep response length to one paragraph (4-5 sentences) maximum."
88-
return self.generator.generate(prompt + output_queues)
116+
output_queues = bulleted_output_queue
117+
response = self.generator.generate(prompt + output_queues)
118+
return parse_bullets_to_list(response)
89119

90-
def get_allele_frequency(self) -> str:
120+
def get_allele_frequency(self) -> List[str]:
91121
"""Extract allele frequency information with explanation."""
92122
prompt = """What information is provided about allele frequencies of variants in the study population? Include the allele frequency in the studied cohorts and experiments if relevant."""
93-
output_queues = "Don't use bullets points, use plain text. Keep response length to one paragraph (2-3 sentences) maximum."
94-
return self.generator.generate(prompt + output_queues)
123+
output_queues = bulleted_output_queue
124+
response = self.generator.generate(prompt + output_queues)
125+
return parse_bullets_to_list(response)
95126

96127
def get_additional_resource_links(self) -> List[str]:
97128
"""Extract additional resource links."""
98-
prompt = """What additional resources or links are provided in the study, such as study protocols or data? This should not include other papers or references, but solely information that pertains to the design/execution of this study. Return as a list of links/resources."""
129+
prompt = """What additional resources or links are provided in the study, such as study protocols or data? This should not include other papers or references, but solely information that pertains to the design/execution of this study. Return as a list of links/resources in markdown format."""
99130

100131
response = self.generator.generate(prompt)
101132
# Parse the response to extract links if it's a string
@@ -152,16 +183,32 @@ def test_study_parameters():
152183
print(f" {study_parameters.study_type.content}")
153184

154185
print(f"\n👥 PARTICIPANT INFO:")
155-
print(f" {study_parameters.participant_info.content}")
186+
if isinstance(study_parameters.participant_info.content, list):
187+
for i, item in enumerate(study_parameters.participant_info.content, 1):
188+
print(f" • {item}")
189+
else:
190+
print(f" {study_parameters.participant_info.content}")
156191

157192
print(f"\n🔬 STUDY DESIGN:")
158-
print(f" {study_parameters.study_design.content}")
193+
if isinstance(study_parameters.study_design.content, list):
194+
for i, item in enumerate(study_parameters.study_design.content, 1):
195+
print(f" • {item}")
196+
else:
197+
print(f" {study_parameters.study_design.content}")
159198

160199
print(f"\n📊 STUDY RESULTS:")
161-
print(f" {study_parameters.study_results.content}")
200+
if isinstance(study_parameters.study_results.content, list):
201+
for i, item in enumerate(study_parameters.study_results.content, 1):
202+
print(f" • {item}")
203+
else:
204+
print(f" {study_parameters.study_results.content}")
162205

163206
print(f"\n🧬 ALLELE FREQUENCY:")
164-
print(f" {study_parameters.allele_frequency.content}")
207+
if isinstance(study_parameters.allele_frequency.content, list):
208+
for i, item in enumerate(study_parameters.allele_frequency.content, 1):
209+
print(f" • {item}")
210+
else:
211+
print(f" {study_parameters.allele_frequency.content}")
165212

166213
print(f"\n🔗 ADDITIONAL RESOURCES:")
167214
if study_parameters.additional_resource_links:

0 commit comments

Comments
 (0)