From 9b779de780ea1f904691723bb0ca9144b22538b6 Mon Sep 17 00:00:00 2001 From: Katherine Date: Fri, 6 Oct 2017 10:54:14 -0400 Subject: [PATCH 01/16] created python script file --- exercise7 | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100755 exercise7 diff --git a/exercise7 b/exercise7 new file mode 100755 index 0000000..e69de29 From 43a71cf0e9a89477d489a602d081f2e06dc60140 Mon Sep 17 00:00:00 2001 From: Michelle Corley Date: Fri, 6 Oct 2017 11:19:15 -0400 Subject: [PATCH 02/16] initial commit of part 1 script, no plots yet --- part1script.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 part1script.py diff --git a/part1script.py b/part1script.py new file mode 100644 index 0000000..3ac5694 --- /dev/null +++ b/part1script.py @@ -0,0 +1,31 @@ +import pandas + +#Question 1 +InFile=open("Lecture11.fasta","r") + +#create lists for storing information about sequences +sequenceID=[] +sequenceLength=[] +percentGC=[] +meltingTemp=[] + +#loop through each line in fasta file to process sequences +for Line in InFile: + Line=Line.strip() + if '>' in Line: + sequenceID.append(Line[1:]) + else: + seqLen=float(len(Line)) + nG=Line.count("G") + nC=Line.count("C") + + #append values to lists + sequenceLength.append(seqLen) + percentGC.append((nG+nC)/seqLen*100) + print(percentGC) + +#combine lists into dataframe +seqDF = pandas.DataFrame(list(zip(sequenceID,sequenceLength,percentGC)),columns=['sequenceID','sequenceLength','percentGC']) + +#close file +InFile.close() From ceeed73691b2543642867e7d8ac411c9b9dbf90f Mon Sep 17 00:00:00 2001 From: Katherine Date: Mon, 9 Oct 2017 12:08:53 -0400 Subject: [PATCH 03/16] Added to Q1, loaded Q3 dataset and made blank lists --- exercise7 | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/exercise7 b/exercise7 index e69de29..ca0f9fb 100755 --- a/exercise7 +++ b/exercise7 @@ -0,0 +1,59 @@ +####Exercise7#### +#Question 1 +#load dataset +import pandas +InFile=open("Lecture11.fasta","r") +#create lists for storing information about sequences +sequenceID=[] +sequenceLength=[] +percentGC=[] +meltingTemp=[] + +for Line in InFile: +# remove newline character from file line + Line=Line.strip() + if '>' in Line: + sequenceID.append(Line[1:]) + else: + seqLen=float(len(Line)) + # count the number of G's and C's + nG=Line.count("G") + nC=Line.count("C") +# if the sequence is 14 or fewer bases calculate melting temperature + if seqLen<=14: + Tm=2*(nG+nC)+2*seqLen + else: + Tm=-9999 +#append values to list +sequenceLength.append(seqLen) +percentGC.append((nG+nC)/seqLen*100) +meltingTemp.append(Tm) + +#Histogram of sequence lengths +import plotnine +from plotnine import * +p=(ggplot(data=None) + + aes(x="sequenceID",y="seqLen") + + geom_histogram(binwidth=2)) + +#Question 2 + + +#Question 3 +#load the dataset +import pandas +Data = pandas.read_csv("data.txt", sep='\t', lineterminator='\r') +#create lists for storing information about sequences +North=[] +East=[] +South=[] +West=[] +for Pop in Data: + if Data.region[Pop]=="north": + North.append(line[1:]) + elif Data.region[Pop]=="east": + East.append(line[1:]) + elif Data.region[Pop]=="south": + South.append(line[1:]) + else: + West.append(line[1:]) \ No newline at end of file From 493ca778e896a6b0811502072b2c6c18e449a751 Mon Sep 17 00:00:00 2001 From: Katherine Date: Mon, 9 Oct 2017 16:32:20 -0400 Subject: [PATCH 04/16] Completed Questions 1 and 3 --- exercise7 | 49 +++++++++++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/exercise7 b/exercise7 index ca0f9fb..9500604 100755 --- a/exercise7 +++ b/exercise7 @@ -3,6 +3,7 @@ #load dataset import pandas InFile=open("Lecture11.fasta","r") +InFile=close() #create lists for storing information about sequences sequenceID=[] sequenceLength=[] @@ -12,29 +13,32 @@ meltingTemp=[] for Line in InFile: # remove newline character from file line Line=Line.strip() + print (Line) if '>' in Line: sequenceID.append(Line[1:]) else: - seqLen=float(len(Line)) + Seqlength = float(len(Line)) + print (Seqlength) + sequenceLength.append(Seqlength) # count the number of G's and C's nG=Line.count("G") + print (nG) nC=Line.count("C") -# if the sequence is 14 or fewer bases calculate melting temperature - if seqLen<=14: - Tm=2*(nG+nC)+2*seqLen - else: - Tm=-9999 + print (nC) #append values to list -sequenceLength.append(seqLen) -percentGC.append((nG+nC)/seqLen*100) -meltingTemp.append(Tm) + gcTotal = (nG+nC)/Seqlength*100 + percentGC.append(gcTotal) + +#dataframe of resulting info +seqDF = pandas.DataFrame(list(zip(sequenceID,sequenceLength,percentGC)),columns=['sequenceID','sequenceLength','percentGC']) +InFile.close() #Histogram of sequence lengths import plotnine from plotnine import * -p=(ggplot(data=None) + - aes(x="sequenceID",y="seqLen") + - geom_histogram(binwidth=2)) +p=(ggplot(data=seqDF) + + aes(x="sequenceLength") + + geom_histogram(binwidth=5)) #Question 2 @@ -42,18 +46,11 @@ p=(ggplot(data=None) + #Question 3 #load the dataset import pandas -Data = pandas.read_csv("data.txt", sep='\t', lineterminator='\r') +import numpy +Data = pandas.read_csv("data.txt", sep=',') +print (Data) #create lists for storing information about sequences -North=[] -East=[] -South=[] -West=[] -for Pop in Data: - if Data.region[Pop]=="north": - North.append(line[1:]) - elif Data.region[Pop]=="east": - East.append(line[1:]) - elif Data.region[Pop]=="south": - South.append(line[1:]) - else: - West.append(line[1:]) \ No newline at end of file +import plotnine +from plotnine import * +d=ggplot(Data)+theme_classic()+xlab("region")+ylab("Average") +d+geom_bar(aes(x="factor(region)",y="observations"),stat="summary",fun_y=numpy.mean) From 7017b00e59961b83740195cb3e8be516006e592c Mon Sep 17 00:00:00 2001 From: Katherine Date: Mon, 9 Oct 2017 18:28:18 -0400 Subject: [PATCH 05/16] Cleaned up code and added comments for Q1 and Q3 --- exercise7 | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/exercise7 b/exercise7 index 9500604..090024d 100755 --- a/exercise7 +++ b/exercise7 @@ -9,14 +9,16 @@ sequenceID=[] sequenceLength=[] percentGC=[] meltingTemp=[] - +#for loop to sort out sequence lines and append their lengths for Line in InFile: # remove newline character from file line Line=Line.strip() print (Line) + # carrot lines separated from sequence lines if '>' in Line: sequenceID.append(Line[1:]) else: + # Create new seqlength dataframe and append lengths Seqlength = float(len(Line)) print (Seqlength) sequenceLength.append(Seqlength) @@ -25,12 +27,14 @@ for Line in InFile: print (nG) nC=Line.count("C") print (nC) -#append values to list + # append values to list gcTotal = (nG+nC)/Seqlength*100 percentGC.append(gcTotal) #dataframe of resulting info seqDF = pandas.DataFrame(list(zip(sequenceID,sequenceLength,percentGC)),columns=['sequenceID','sequenceLength','percentGC']) +#to make infile management easier +InFile=open("Lecture11.fasta","r") InFile.close() #Histogram of sequence lengths @@ -38,7 +42,7 @@ import plotnine from plotnine import * p=(ggplot(data=seqDF) + aes(x="sequenceLength") + - geom_histogram(binwidth=5)) + geom_histogram(binwidth=4)) #Question 2 @@ -49,7 +53,8 @@ import pandas import numpy Data = pandas.read_csv("data.txt", sep=',') print (Data) -#create lists for storing information about sequences + +#making bar graph with region as x and ave as y import plotnine from plotnine import * d=ggplot(Data)+theme_classic()+xlab("region")+ylab("Average") From d28b56599388b47dff5707081f7ffe84de3b641d Mon Sep 17 00:00:00 2001 From: Michelle Corley Date: Mon, 9 Oct 2017 22:13:29 -0400 Subject: [PATCH 06/16] histogram of sequence length --- part1script.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/part1script.py b/part1script.py index 3ac5694..ef5ac0f 100644 --- a/part1script.py +++ b/part1script.py @@ -1,4 +1,7 @@ +import numpy import pandas +from plotnine import * + #Question 1 InFile=open("Lecture11.fasta","r") @@ -11,9 +14,10 @@ #loop through each line in fasta file to process sequences for Line in InFile: - Line=Line.strip() + Line=Line.strip() #removes white space, tab, space, newline characters if '>' in Line: sequenceID.append(Line[1:]) + #print(Line[1:]) else: seqLen=float(len(Line)) nG=Line.count("G") @@ -22,10 +26,17 @@ #append values to lists sequenceLength.append(seqLen) percentGC.append((nG+nC)/seqLen*100) - print(percentGC) #combine lists into dataframe seqDF = pandas.DataFrame(list(zip(sequenceID,sequenceLength,percentGC)),columns=['sequenceID','sequenceLength','percentGC']) +#min(seqDF.sequenceLength) #close file InFile.close() + +#plots histogram of sequence length +b=ggplot(seqDF,aes(x="sequenceLength")) +b+geom_histogram(binwidth=5)+theme_classic() + + + From 4755988e1700553f8b557e8fa1337ce39a3c1f08 Mon Sep 17 00:00:00 2001 From: Michelle Corley Date: Tue, 10 Oct 2017 11:54:25 -0400 Subject: [PATCH 07/16] percentGC histogram added --- part1script.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/part1script.py b/part1script.py index ef5ac0f..eedaed5 100644 --- a/part1script.py +++ b/part1script.py @@ -38,5 +38,12 @@ b=ggplot(seqDF,aes(x="sequenceLength")) b+geom_histogram(binwidth=5)+theme_classic() +#plots histogram of sequence length +b=ggplot(seqDF,aes(x="percentGC")) +b+geom_histogram(binwidth=5)+theme_classic() + + + + From 48664baf35f5235960ed550639a49c179d823311 Mon Sep 17 00:00:00 2001 From: Michelle Corley Date: Tue, 10 Oct 2017 13:52:37 -0400 Subject: [PATCH 08/16] part 1 script updated --- part1script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/part1script.py b/part1script.py index eedaed5..eceada6 100644 --- a/part1script.py +++ b/part1script.py @@ -38,7 +38,7 @@ b=ggplot(seqDF,aes(x="sequenceLength")) b+geom_histogram(binwidth=5)+theme_classic() -#plots histogram of sequence length +#plots histogram of percent GC b=ggplot(seqDF,aes(x="percentGC")) b+geom_histogram(binwidth=5)+theme_classic() From 0deb9a8942799885719766bbfe36d516110c7bb6 Mon Sep 17 00:00:00 2001 From: Michelle Corley Date: Tue, 10 Oct 2017 13:53:21 -0400 Subject: [PATCH 09/16] part 3 script with scatter plot --- part3script.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 part3script.py diff --git a/part3script.py b/part3script.py new file mode 100644 index 0000000..595e38c --- /dev/null +++ b/part3script.py @@ -0,0 +1,16 @@ +#Question 3 +#load the dataset +import pandas +import numpy +Data = pandas.read_csv("data.txt", sep=',') +#print (Data) + +#making bar graph with region as x and ave as y +import plotnine +from plotnine import * +d=ggplot(Data)+theme_classic()+xlab("region")+ylab("Average") +d+geom_bar(aes(x="factor(region)",y="observations"),stat="summary",fun_y=numpy.mean) + +#scatter plot of all observations +a=ggplot(Data,aes(x="region",y="observations")) +a+geom_jitter()+coord_cartesian() From 36b5e959f437f2891b625dcf6b54d12bd21dfa1a Mon Sep 17 00:00:00 2001 From: Katherine Date: Wed, 11 Oct 2017 19:51:16 -0400 Subject: [PATCH 10/16] Recommit because we were accidentally working in separate files. Making sure all code is in exercise7.py --- exercise7 | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/exercise7 b/exercise7 index 090024d..ac5f02e 100755 --- a/exercise7 +++ b/exercise7 @@ -44,6 +44,11 @@ p=(ggplot(data=seqDF) + aes(x="sequenceLength") + geom_histogram(binwidth=4)) +#Histogram of Percent GC +g=(ggplot(data=seqDF) + + aes(x="percentGC") + + geom_histogram(binwidth=5)) + #Question 2 @@ -59,3 +64,7 @@ import plotnine from plotnine import * d=ggplot(Data)+theme_classic()+xlab("region")+ylab("Average") d+geom_bar(aes(x="factor(region)",y="observations"),stat="summary",fun_y=numpy.mean) + +#scatter plot of everything observed +a=ggplot(Data,aes(x="region",y="observations")) +a+geom_jitter()+coord_cartesian() From b96c3cd1b09f6d78eea69194d43c8e9b5a1b9197 Mon Sep 17 00:00:00 2001 From: Michelle Corley Date: Thu, 12 Oct 2017 13:07:39 -0400 Subject: [PATCH 11/16] trouble with plotting --- part2script.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 part2script.py diff --git a/part2script.py b/part2script.py new file mode 100644 index 0000000..b647df3 --- /dev/null +++ b/part2script.py @@ -0,0 +1,11 @@ +import numpy +import pandas +import plotnine +from plotnine import * + +Part2=pandas.read_csv("part2datacopy.txt", sep="",", header=0) +#print(Part2) + +#plotting data in scatterplot with trendline +a=ggplot(Part2,aes(x="oil changes per year",y="cost of repairs($)"))+theme_classic()+geom_point() +a+xlab("oil changes per year")+ylab("cost of repairs($)")+stat_smooth(method="lm") From b5b3c8bde263497e6acda5d09e8f6f2afcc932b1 Mon Sep 17 00:00:00 2001 From: Michelle Corley Date: Thu, 12 Oct 2017 13:09:11 -0400 Subject: [PATCH 12/16] data for part2 --- part2datacopy.txt | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 part2datacopy.txt diff --git a/part2datacopy.txt b/part2datacopy.txt new file mode 100644 index 0000000..903b508 --- /dev/null +++ b/part2datacopy.txt @@ -0,0 +1,5 @@ +“oil changes per year”,”cost of repairs($)” “3”,300 “5”,300 “2”,500 “3”,400 “1”,700 “4”,400 “6”,100 “4”,250 “3”,450 +“2”,650 +“0”,600 +“10”,0 +“7”,150 \ No newline at end of file From 1dd0597e32eafe32ada673607a6b753ea7ff1ff4 Mon Sep 17 00:00:00 2001 From: Katherine Date: Thu, 12 Oct 2017 14:49:58 -0400 Subject: [PATCH 13/16] Removed quotation marks via command line --- part2datacopy.txt | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/part2datacopy.txt b/part2datacopy.txt index 903b508..82b3374 100644 --- a/part2datacopy.txt +++ b/part2datacopy.txt @@ -1,5 +1 @@ -“oil changes per year”,”cost of repairs($)” “3”,300 “5”,300 “2”,500 “3”,400 “1”,700 “4”,400 “6”,100 “4”,250 “3”,450 -“2”,650 -“0”,600 -“10”,0 -“7”,150 \ No newline at end of file +oil changes per year,cost of repairs($) 3,300 5,300 2,500 3,400 1,700 4,400 6,100 4,250 3,450 2,650 0,600 10,0 7,150 \ No newline at end of file From 0834cbf5cea7caf2258d0a00b141eecfd461aaab Mon Sep 17 00:00:00 2001 From: Katherine Date: Thu, 12 Oct 2017 14:50:49 -0400 Subject: [PATCH 14/16] Fixed importing file issue. It works now! --- part2script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/part2script.py b/part2script.py index b647df3..6f3d765 100644 --- a/part2script.py +++ b/part2script.py @@ -3,7 +3,7 @@ import plotnine from plotnine import * -Part2=pandas.read_csv("part2datacopy.txt", sep="",", header=0) +Part2=pandas.read_csv("part2datacopy.txt", sep=",") #print(Part2) #plotting data in scatterplot with trendline From c51647a3f40c0063235765de91318ea4dd81b1df Mon Sep 17 00:00:00 2001 From: Katherine Date: Thu, 12 Oct 2017 14:51:18 -0400 Subject: [PATCH 15/16] Added Michelle's Question 2 script. This file contains all 3 Questions --- exercise7 | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/exercise7 b/exercise7 index ac5f02e..1d1c33c 100755 --- a/exercise7 +++ b/exercise7 @@ -50,7 +50,18 @@ g=(ggplot(data=seqDF) + geom_histogram(binwidth=5)) #Question 2 +import numpy +import pandas +import plotnine +from plotnine import * + +#read in file +Part2=pandas.read_csv("part2datacopy.txt", sep=",") +#print(Part2) +#plotting data in scatterplot with trendline +a=ggplot(Part2,aes(x="oil changes per year",y="cost of repairs($)"))+theme_classic()+geom_point() +a+xlab("oil changes per year")+ylab("cost of repairs($)")+stat_smooth(method="lm") #Question 3 #load the dataset From 6088aaa62b133a797fc36216d560d35d4dde8e10 Mon Sep 17 00:00:00 2001 From: Katherine Date: Thu, 12 Oct 2017 15:25:53 -0400 Subject: [PATCH 16/16] Added print commands to Q1 --- exercise7 | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/exercise7 b/exercise7 index 1d1c33c..f471461 100755 --- a/exercise7 +++ b/exercise7 @@ -3,7 +3,7 @@ #load dataset import pandas InFile=open("Lecture11.fasta","r") -InFile=close() +#InFile=close() #create lists for storing information about sequences sequenceID=[] sequenceLength=[] @@ -34,7 +34,7 @@ for Line in InFile: #dataframe of resulting info seqDF = pandas.DataFrame(list(zip(sequenceID,sequenceLength,percentGC)),columns=['sequenceID','sequenceLength','percentGC']) #to make infile management easier -InFile=open("Lecture11.fasta","r") +#InFile=open("Lecture11.fasta","r") InFile.close() #Histogram of sequence lengths @@ -43,11 +43,12 @@ from plotnine import * p=(ggplot(data=seqDF) + aes(x="sequenceLength") + geom_histogram(binwidth=4)) - +p #Histogram of Percent GC g=(ggplot(data=seqDF) + aes(x="percentGC") + geom_histogram(binwidth=5)) +g #Question 2 import numpy