From 84d6133185c6ad9ac40a36284f3ec16c313898de Mon Sep 17 00:00:00 2001 From: ayamasaki2011 Date: Fri, 6 Oct 2017 11:14:57 -0400 Subject: [PATCH 01/12] AEY: Script for calculating sequence lengths and %GC content from fasta file --- Exercise7.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100755 Exercise7.py diff --git a/Exercise7.py b/Exercise7.py new file mode 100755 index 0000000..4faeb6b --- /dev/null +++ b/Exercise7.py @@ -0,0 +1,20 @@ +#Part 1 +import pandas + +InFile=open("Lecture11.fasta","r") #Open fasta file as read-only + +sequenceLength=[] #Set up variables to accept/store sequence data as it is calculated +percentGC=[] + +for line in InFile: #Loop through each line in fasta file + if '>' in line: #Check line for >, if present, skip to next line + continue + else: + seqLen=float(len(line)) #Calculate length of sequence + nG=line.count("G") #Count individual G and C contents + nC=line.count("C") + percGC=float(((nG+nC)/seqLen)*100) #Calculate % GC + + sequenceLength.append(seqLen) #Append length of individual sequences to list + percentGC.append(percGC) #Append %GC of individual sequences to list + From 534e9ce7701ffe52f38c1f4777263c11e98edd3f Mon Sep 17 00:00:00 2001 From: ayamasaki2011 Date: Thu, 12 Oct 2017 11:45:12 -0400 Subject: [PATCH 02/12] AEY: Add plot of GC content to script --- Exercise7.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Exercise7.py b/Exercise7.py index 4faeb6b..54dcab3 100755 --- a/Exercise7.py +++ b/Exercise7.py @@ -1,5 +1,6 @@ #Part 1 import pandas +from plotnine import * InFile=open("Lecture11.fasta","r") #Open fasta file as read-only @@ -18,3 +19,9 @@ sequenceLength.append(seqLen) #Append length of individual sequences to list percentGC.append(percGC) #Append %GC of individual sequences to list +seqDF=pandas.DataFrame(list(zip(sequenceLength,percentGC)),columns=['sequenceLength','percentGC']) +a=ggplot(seqDF, aes(x="sequenceLength")) +a+geom_histogram()+theme_classic() + 
+b=ggplot(seqDF, aes(x="percentGC")) +b+geom_histogram()+theme_classic() From 90ecfc31c1b2d4b3c89a1803b9651c467f7056d5 Mon Sep 17 00:00:00 2001 From: ayamasaki2011 Date: Thu, 12 Oct 2017 11:46:45 -0400 Subject: [PATCH 03/12] AEY: Add line in script to close file --- Exercise7.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Exercise7.py b/Exercise7.py index 54dcab3..e7e0caa 100755 --- a/Exercise7.py +++ b/Exercise7.py @@ -25,3 +25,5 @@ b=ggplot(seqDF, aes(x="percentGC")) b+geom_histogram()+theme_classic() + +InFile.close() \ No newline at end of file From fbaf8a2b3d30b534dcfce33059eb415985d06969 Mon Sep 17 00:00:00 2001 From: ayamasaki2011 Date: Thu, 12 Oct 2017 11:48:29 -0400 Subject: [PATCH 04/12] AEY: Comment remaining lines of code --- Exercise7.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Exercise7.py b/Exercise7.py index e7e0caa..52b7988 100755 --- a/Exercise7.py +++ b/Exercise7.py @@ -19,11 +19,11 @@ sequenceLength.append(seqLen) #Append length of individual sequences to list percentGC.append(percGC) #Append %GC of individual sequences to list -seqDF=pandas.DataFrame(list(zip(sequenceLength,percentGC)),columns=['sequenceLength','percentGC']) -a=ggplot(seqDF, aes(x="sequenceLength")) -a+geom_histogram()+theme_classic() +seqDF=pandas.DataFrame(list(zip(sequenceLength,percentGC)),columns=['sequenceLength','percentGC']) #combine lists into dataframe for easier plotting +a=ggplot(seqDF, aes(x="sequenceLength")) #Create plot of sequence lengths +a+geom_histogram()+theme_classic() #Plot as histogram -b=ggplot(seqDF, aes(x="percentGC")) -b+geom_histogram()+theme_classic() +b=ggplot(seqDF, aes(x="percentGC")) #Create plot of %GC +b+geom_histogram()+theme_classic() #Plot as histogram -InFile.close() \ No newline at end of file +InFile.close() #Close file \ No newline at end of file From e507b14eeef5e619acdda219abf967f3ad127eb6 Mon Sep 17 00:00:00 2001 From: ayamasaki2011 Date: Thu, 12 Oct 2017 12:24:15 -0400 Subject: [PATCH 
05/12] AEY: Start script for Part 3 - calculate mean of populations --- Exercise7.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/Exercise7.py b/Exercise7.py index 52b7988..dd9a1ab 100755 --- a/Exercise7.py +++ b/Exercise7.py @@ -26,4 +26,14 @@ b=ggplot(seqDF, aes(x="percentGC")) #Create plot of %GC b+geom_histogram()+theme_classic() #Plot as histogram -InFile.close() #Close file \ No newline at end of file +InFile.close() #Close file + +#Part 2 +import numpy + +data=pandas.read_csv("data.txt", header=0, sep=",") #Open file as data frame + +nMean=numpy.mean(data[data.region=="north"]) #Calculate mean for each direction/population +eMean=numpy.mean(data[data.region=="east"]) +wMean=numpy.mean(data[data.region=="west"]) +sMean=numpy.mean(data[data.region=="south"]) \ No newline at end of file From 68e6535399046cf930dee47b7f711434856cf561 Mon Sep 17 00:00:00 2001 From: ayamasaki2011 Date: Thu, 12 Oct 2017 14:39:58 -0400 Subject: [PATCH 06/12] AEY: Plot means onto bar graph --- Exercise7.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/Exercise7.py b/Exercise7.py index dd9a1ab..ba8d2b1 100755 --- a/Exercise7.py +++ b/Exercise7.py @@ -28,12 +28,29 @@ InFile.close() #Close file -#Part 2 +#Part 3 import numpy data=pandas.read_csv("data.txt", header=0, sep=",") #Open file as data frame -nMean=numpy.mean(data[data.region=="north"]) #Calculate mean for each direction/population -eMean=numpy.mean(data[data.region=="east"]) -wMean=numpy.mean(data[data.region=="west"]) -sMean=numpy.mean(data[data.region=="south"]) \ No newline at end of file +dataN=data[data.region=="north"] #Subset data frame & find mean for all populations +nMean=numpy.mean(dataN.observations) + +dataE=data[data.region=="east"] +eMean=numpy.mean(dataE.observations) + +dataW=data[data.region=="west"] +wMean=numpy.mean(dataW.observations) + +dataS=data[data.region=="south"] +sMean=numpy.mean(dataS.observations) + 
+means=pandas.DataFrame(columns=('region', 'mean')) #Combine means into new data frame +means.region='north','south','east','west' +means.iloc[0,1]=nMean +means.iloc[1,1]=sMean +means.iloc[2,1]=eMean +means.iloc[3,1]=wMean + +c=ggplot(means, aes(x="region",y="mean")) #Plot means on bar graph +c+geom_col() From 5c2294b78091ac4c8526743c80c9be35cd1cfee2 Mon Sep 17 00:00:00 2001 From: ayamasaki2011 Date: Thu, 12 Oct 2017 14:45:16 -0400 Subject: [PATCH 07/12] AEY: Plot data observations on scatter plot; answer question in comments --- Exercise7.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/Exercise7.py b/Exercise7.py index ba8d2b1..a8c4a28 100755 --- a/Exercise7.py +++ b/Exercise7.py @@ -53,4 +53,13 @@ means.iloc[3,1]=wMean c=ggplot(means, aes(x="region",y="mean")) #Plot means on bar graph -c+geom_col() +c+geom_col()+theme_classic() + +d=ggplot(data, aes(x="region", y="observations")) #Plot all observations on scatter plot +d+geom_jitter()+theme_classic() + +#Graphs tell different stories - only on the scatter plot does it become apparent that the observations +#in the south region are two discrete populations, rather than a continuous spread like the others. +#Additionally, the mean for the West region makes it look as though it has the smallest values, whereas +#the scatterplot shows that it has both the lowest and the highest values, over a very large spread. +#The mean barplot is really only an accurate representation for the North region. 
\ No newline at end of file From fbb7d48d3b169c90333139977854256bd26a1794 Mon Sep 17 00:00:00 2001 From: twhmitchell Date: Thu, 12 Oct 2017 20:38:16 -0400 Subject: [PATCH 08/12] Add files via upload Part 2, need to resolve plotting errors --- code | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 code diff --git a/code b/code new file mode 100644 index 0000000..a54bf3a --- /dev/null +++ b/code @@ -0,0 +1,10 @@ +import numpy +import pandas +from plotnine import * + +icecream=pandas.read_csv("icream_sales.txt",sep="\t",header=0) +icecream.shape +icecream.head(20) + +a=ggplot(icecream,aes(x="temp",y="sales"))+theme_classic()+geom_point() +a+xlab("Temperature (C)")+ylab("Sales In Dollars")+stat_smooth(method="lm") \ No newline at end of file From 829b99f7ae7ccea8d7e075661d5298c127b5ce1f Mon Sep 17 00:00:00 2001 From: twhmitchell Date: Thu, 12 Oct 2017 20:39:14 -0400 Subject: [PATCH 09/12] Add files via upload Dataset for Part 2 of the exercise --- icream_sales.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 icream_sales.txt diff --git a/icream_sales.txt b/icream_sales.txt new file mode 100644 index 0000000..7faa37b --- /dev/null +++ b/icream_sales.txt @@ -0,0 +1,2 @@ +“temp” “sales” +14.2 215 16.4 325 11.9 185 15.2 332 18.5 406 22.1 522 19.4 412 25.1 614 23.4 544 18.1 421 22.6 445 17.2 408 \ No newline at end of file From b6a8115354e722734aa36815f3ece124c32f4923 Mon Sep 17 00:00:00 2001 From: ayamasaki2011 Date: Thu, 12 Oct 2017 21:18:36 -0400 Subject: [PATCH 10/12] AEY: Remove Part 3 from exercise code --- Exercise7Part1.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100755 Exercise7Part1.py diff --git a/Exercise7Part1.py b/Exercise7Part1.py new file mode 100755 index 0000000..52b7988 --- /dev/null +++ b/Exercise7Part1.py @@ -0,0 +1,29 @@ +#Part 1 +import pandas +from plotnine import * + +InFile=open("Lecture11.fasta","r") #Open fasta file as read-only + +sequenceLength=[] #Set up variables to 
accept/store sequence data as it is calculated +percentGC=[] + +for line in InFile: #Loop through each line in fasta file + if '>' in line: #Check line for >, if present, skip to next line + continue + else: + seqLen=float(len(line)) #Calculate length of sequence + nG=line.count("G") #Count individual G and C contents + nC=line.count("C") + percGC=float(((nG+nC)/seqLen)*100) #Calculate % GC + + sequenceLength.append(seqLen) #Append length of individual sequences to list + percentGC.append(percGC) #Append %GC of individual sequences to list + +seqDF=pandas.DataFrame(list(zip(sequenceLength,percentGC)),columns=['sequenceLength','percentGC']) #combine lists into dataframe for easier plotting +a=ggplot(seqDF, aes(x="sequenceLength")) #Create plot of sequence lengths +a+geom_histogram()+theme_classic() #Plot as histogram + +b=ggplot(seqDF, aes(x="percentGC")) #Create plot of %GC +b+geom_histogram()+theme_classic() #Plot as histogram + +InFile.close() #Close file \ No newline at end of file From 05b8aca55ef77e7626beb2d7e272c0dabd3703d1 Mon Sep 17 00:00:00 2001 From: Balaji Sampathkumar Date: Fri, 13 Oct 2017 01:34:57 -0400 Subject: [PATCH 11/12] EX 7 Question 3 - Balaji --- EX_7_Script_Q3 | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 EX_7_Script_Q3 diff --git a/EX_7_Script_Q3 b/EX_7_Script_Q3 new file mode 100644 index 0000000..ccb5e69 --- /dev/null +++ b/EX_7_Script_Q3 @@ -0,0 +1,30 @@ +import numpy +import os +import matplotlib.pyplot as plt +from plotnine import * +os.listdir('.') +os.chdir('/Users/sampathkumarbalaji/EX_7/Intro_Biocom_ND_319_Tutorial7') +import pandas + +#to parse and read +data_txt = pandas.read_csv("data.txt") +directions = ['north','south','east','west'] + +#create dataframe +A=numpy.zeros((4,2)) +mean_DF=pandas.DataFrame(A,columns=['region', 'mean_dir']) + +#assigning values to data frame elements with mean of 4 regions +for i in range(0,4) : + mean_DF.mean_dir[i] = 
numpy.mean(data_txt[data_txt.region==directions[i]].observations) + mean_DF.region[i] = directions[i] + +a=ggplot(mean_DF)+theme_classic()+xlab("region")+ylab("mean_dir") +a+geom_bar(aes(x="region",y="mean_dir"),stat="summary") + +b=ggplot(data, aes(x="region", y="observations")) #Plot all observations on scatter plot +b+geom_jitter()+theme_classic() + +#The bar graph plotted the means across the regions, which had almost the same values (~15), so the bars looked almost the same. +#But on the scatter plot we see that north has points centered around 15 while east and west are equally spread and south +#has a bi-modal distribution. From 51f686937cb28f822d3b097ff5202201800e1a30 Mon Sep 17 00:00:00 2001 From: Balaji Sampathkumar Date: Fri, 13 Oct 2017 01:39:50 -0400 Subject: [PATCH 12/12] Combined all 3 scripts into 1 file - Balaji --- EX_7_Script_Final | 75 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 EX_7_Script_Final diff --git a/EX_7_Script_Final b/EX_7_Script_Final new file mode 100644 index 0000000..2ccf6cf --- /dev/null +++ b/EX_7_Script_Final @@ -0,0 +1,75 @@ +#Part 1 - Amanda +import pandas +from plotnine import * + +InFile=open("Lecture11.fasta","r") #Open fasta file as read-only + +sequenceLength=[] #Set up variables to accept/store sequence data as it is calculated +percentGC=[] + +for line in InFile: #Loop through each line in fasta file + if '>' in line: #Check line for >, if present, skip to next line + continue + else: + seqLen=float(len(line)) #Calculate length of sequence + nG=line.count("G") #Count individual G and C contents + nC=line.count("C") + percGC=float(((nG+nC)/seqLen)*100) #Calculate % GC + + sequenceLength.append(seqLen) #Append length of individual sequences to list + percentGC.append(percGC) #Append %GC of individual sequences to list + +seqDF=pandas.DataFrame(list(zip(sequenceLength,percentGC)),columns=['sequenceLength','percentGC']) #combine lists into dataframe for easier plotting +a=ggplot(seqDF, 
aes(x="sequenceLength")) #Create plot of sequence lengths +a+geom_histogram()+theme_classic() #Plot as histogram + +b=ggplot(seqDF, aes(x="percentGC")) #Create plot of %GC +b+geom_histogram()+theme_classic() #Plot as histogram + +InFile.close() #Close file +############################################################################# +#Part 2 - Thomas +import numpy +import pandas +from plotnine import * + +icecream=pandas.read_csv("icream_sales.txt",sep="\t",header=0) +icecream.shape +icecream.head(20) + +a=ggplot(icecream,aes(x="temp",y="sales"))+theme_classic()+geom_point() +a+xlab("Temperature (C)")+ylab("Sales In Dollars")+stat_smooth(method="lm") +############################################################################# +#Part 3 - Balaji +import numpy +import os +import matplotlib.pyplot as plt +from plotnine import * +os.listdir('.') +os.chdir('/Users/sampathkumarbalaji/EX_7/Intro_Biocom_ND_319_Tutorial7') +import pandas + +#to parse and read +data_txt = pandas.read_csv("data.txt") +directions = ['north','south','east','west'] + +#create dataframe +A=numpy.zeros((4,2)) +mean_DF=pandas.DataFrame(A,columns=['region', 'mean_dir']) + +#assigning values to data frame elements with mean of 4 regions +for i in range(0,4) : + mean_DF.mean_dir[i] = numpy.mean(data_txt[data_txt.region==directions[i]].observations) + mean_DF.region[i] = directions[i] + +a=ggplot(mean_DF)+theme_classic()+xlab("region")+ylab("mean_dir") +a+geom_bar(aes(x="region",y="mean_dir"),stat="summary") + +b=ggplot(data, aes(x="region", y="observations")) #Plot all observations on scatter plot +b+geom_jitter()+theme_classic() + +#The bar graph plotted the means across the regions, which had almost the same values (~15), so the bars looked almost the same. +#But on the scatter plot we see that north has points centered around 15 while east and west are equally spread and south +#has a bi-modal distribution. + +