From e6031e0f14c2da3a20c618d7ca04e788d4881dfb Mon Sep 17 00:00:00 2001 From: Joe C Date: Sun, 8 Oct 2017 22:07:09 -0400 Subject: [PATCH 1/5] JC added opening the file, calculating seqLength, calculating GC percentage, completed first histogram, notes added --- Exercise7answers.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100755 Exercise7answers.py diff --git a/Exercise7answers.py b/Exercise7answers.py new file mode 100755 index 0000000..c70ab25 --- /dev/null +++ b/Exercise7answers.py @@ -0,0 +1,38 @@ +# open fasta file +InFile=open("Lecture11.fasta","r") + +#create lists for storing information about sequences +sequenceID=[] +sequenceLength=[] +percentGC=[] + +#loop through each line of fasta file to process sequences +for Line in InFile: + # remove newline character from file line + Line=Line.strip() + # if a sequence record + if '>' in Line: + # add the sequence ID (except the ">" character) to the sequenceID list + sequenceID.append(Line[1:]) + # if a sequence line + else: + # get the number of characters in the sequence and convert to a float to avoid integer division + seqLen=float(len(Line)) + # count the number of G's and C's + nG=Line.count("G") + nC=Line.count("C") + + # append values to the lists + sequenceLength.append(seqLen) + percentGC.append((nG+nC)/seqLen*100) + +import pandas +# combine lists into dataframe +seqDF = pandas.DataFrame(list(zip(sequenceID,sequenceLength,percentGC)),columns=['sequenceID','sequenceLength','percentGC']) + +from plotnine import * +# histogram of sequence length +histogram1=ggplot(seqDF,aes(x="sequenceLength")) +histogram1+geom_histogram()+theme_classic() + + From 430cf4e3d37fe3b676eb7f5d96224463b0093811 Mon Sep 17 00:00:00 2001 From: Joe C Date: Sun, 8 Oct 2017 22:17:49 -0400 Subject: [PATCH 2/5] JC added histogram2, changed colors and optimized bin size --- Exercise7answers.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Exercise7answers.py b/Exercise7answers.py index c70ab25..92bfab3 100755 --- a/Exercise7answers.py +++ b/Exercise7answers.py @@ -33,6 +33,12 @@ from plotnine import * # histogram of sequence length histogram1=ggplot(seqDF,aes(x="sequenceLength")) -histogram1+geom_histogram()+theme_classic() +histogram1+geom_histogram(binwidth=20,fill='blue',color='black')+theme_classic() + +# histogram of percentGC +histogram2=ggplot(seqDF,aes(x="percentGC")) +histogram2+geom_histogram()+theme_classic() +# changing colors and bins +histogram2+geom_histogram(binwidth=15,fill='yellow',color='black')+theme_classic() From 35a4c22bfd9e5e65cc578493e476642714a5ede0 Mon Sep 17 00:00:00 2001 From: Joe C Date: Thu, 12 Oct 2017 14:25:56 -0400 Subject: [PATCH 3/5] question 1 complete. no question 2. working on question 3 part 1 with part 2 finished.JC --- Exercise7answers.py | 50 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/Exercise7answers.py b/Exercise7answers.py index 92bfab3..fc3c110 100755 --- a/Exercise7answers.py +++ b/Exercise7answers.py @@ -1,3 +1,8 @@ +# Exercise 7 + + +#Question 1 + # open fasta file InFile=open("Lecture11.fasta","r") @@ -41,4 +46,49 @@ # changing colors and bins histogram2+geom_histogram(binwidth=15,fill='yellow',color='black')+theme_classic() +#Question 2 + + +#Question 3 + +# open data file +q3north=0 +q3south=0 +q3west=0 +q3east=0 + + +for i in range(0,len(q3),1): + if q3.region[i]=="north": + q3north=q3north+q3.observations[i] + elif q3.region[i]=="south": + q3south=q3south+q3.observations[i] + elif q3.region[i]=="east": + q3east=q3east+q3.observations[i] + elif q3.region[i]=="west": + q3west=q3west+q3.observations[i] + +q3north +q3south +q3west +q3east + +avgnorth=(sum(q3.region=="north")/len(q3north)) +sum(q3.region=="south") +sum(q3.region=="east") +sum(q3.region=="west") + + +import pandas +q3=pandas.read_csv("data.txt", sep=",", header=0) +average=q3.groupby('region')['observations'].mean() + +regions=q3.groupby('region')['region'] +seqDF=pandas.DataFrame(list(zip(average)),columns=['regions','observations'] set='\t') +df=average.to_frame(name=none) + +#scatter plot with jitter applied +from plotnine import * +q3sp=ggplot(q3,aes(x="region",y="observations")) +q3sp+geom_jitter()+coord_cartesian() From db9483d7f283d6443e561a204adff9e6044659bb Mon Sep 17 00:00:00 2001 From: Joe C Date: Thu, 12 Oct 2017 15:59:44 -0400 Subject: [PATCH 4/5] finished question 3. JC --- Exercise7answers.py | 43 ++++++++++++------------------------------- 1 file changed, 12 insertions(+), 31 deletions(-) diff --git a/Exercise7answers.py b/Exercise7answers.py index fc3c110..8fa59fa 100755 --- a/Exercise7answers.py +++ b/Exercise7answers.py @@ -51,44 +51,25 @@ #Question 3 -# open data file -q3north=0 -q3south=0 -q3west=0 -q3east=0 - - -for i in range(0,len(q3),1): - if q3.region[i]=="north": - q3north=q3north+q3.observations[i] - elif q3.region[i]=="south": - q3south=q3south+q3.observations[i] - elif q3.region[i]=="east": - q3east=q3east+q3.observations[i] - elif q3.region[i]=="west": - q3west=q3west+q3.observations[i] - -q3north -q3south -q3west -q3east - -avgnorth=(sum(q3.region=="north")/len(q3north)) -sum(q3.region=="south") -sum(q3.region=="east") -sum(q3.region=="west") - - +#open file and import pandas import pandas q3=pandas.read_csv("data.txt", sep=",", header=0) + +#group by the region and find the mean of each region average=q3.groupby('region')['observations'].mean() +#print to dataframe +df=average.to_frame() +#add the region rows - can find the order if you print the previous variable +df['region']=["east", "north", "south", "west"] -regions=q3.groupby('region')['region'] -seqDF=pandas.DataFrame(list(zip(average)),columns=['regions','observations'] set='\t') -df=average.to_frame(name=none) +#making a bar graph with the avg of the corresponding regions +from plotnine import * +q3bp=ggplot(df)+theme_classic()+xlab("region")+ylab("observations") +q3bp+geom_bar(aes(x="region",y="observations"),stat="summary",) #scatter plot with jitter applied from plotnine import * q3sp=ggplot(q3,aes(x="region",y="observations")) q3sp+geom_jitter()+coord_cartesian() + From f3ad056a999ed40db0b4aa00d672e104ec0d486b Mon Sep 17 00:00:00 2001 From: Patrick Doherty Date: Thu, 12 Oct 2017 23:00:53 -0400 Subject: [PATCH 5/5] Scatter plot with trend line --- Ex7_02.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100755 Ex7_02.py diff --git a/Ex7_02.py b/Ex7_02.py new file mode 100755 index 0000000..4a711c1 --- /dev/null +++ b/Ex7_02.py @@ -0,0 +1,14 @@ +import pandas as pd +import numpy as np +data=pd.read_csv("dataset.csv") +import matplotlib.pyplot as plt +plt.title('Percent of Planted Corn in USA that is GMO') +plt.ylabel('Percent') +plt.xlabel('Year') +plt.xlim([2000,2018]) +plt.scatter(data.Year, data.Percent) +x=data.Year +y=data.Percent +z = np.polyfit(x,y, 1) +p = np.poly1d(z) +plt.plot(x,p(x),"r--")