From 13c2f8e1165d493604b08c2bca130f77b5563fe8 Mon Sep 17 00:00:00 2001 From: Brittni Bertolet Date: Fri, 6 Oct 2017 10:56:16 -0400 Subject: [PATCH 1/6] initial commit --- AnalysisTutorial7.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 AnalysisTutorial7.py diff --git a/AnalysisTutorial7.py b/AnalysisTutorial7.py new file mode 100644 index 0000000..e000d5c --- /dev/null +++ b/AnalysisTutorial7.py @@ -0,0 +1,32 @@ +# Analysis for Tutorial 7 + + +# Set working directory +os.chdir('/Users/brittnibertolet/Desktop/bcTutorials/Intro_Biocomp_ND_318_Tutorial7/') + +# Load packages +import numpy +import pandas +from plotnine import * + + +############################################ +################ Question 1 ################ +############################################ + +# Read in fasta +seq=numpy.loadtxt("Lecture11.fasta") + +# Plan for storing info +# Create for loop to do these things + +# sequenceID - use if/else statemtn + +# sequenceLength - count length of line +### use float(len(line)) + +# percentGC - count Gs, count Cs, calc %GC (G + C/length) + +# melthingTemp - if/else statement +### if length <= 14, calc melting point +### else, melting point = -9999 \ No newline at end of file From f4010088f95cff38762e1622ba2432325371fa44 Mon Sep 17 00:00:00 2001 From: Brittni Bertolet Date: Fri, 6 Oct 2017 11:14:35 -0400 Subject: [PATCH 2/6] Added the code to process data of number 1 --- AnalysisTutorial7.py | 45 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/AnalysisTutorial7.py b/AnalysisTutorial7.py index e000d5c..d56893e 100644 --- a/AnalysisTutorial7.py +++ b/AnalysisTutorial7.py @@ -15,18 +15,57 @@ ############################################ # Read in fasta -seq=numpy.loadtxt("Lecture11.fasta") +seq=open("Lecture11.fasta",'r') # Plan for storing info +sequenceID=[] +sequenceLength=[] +percentGC=[] +meltingTemp=[] + # Create for loop to do these things +for Line 
in seq: + # Remove newline character from file line + Line=Line.strip() + # Use if/else statement to figure out what line you're on + if '>' in Line: + # Append sequenceID to stored list + sequenceID.append(Line[1:]) + else: + # Count sequence length + seqLen=float(len(Line)) + # Count the number of G's and C's + nG=Line.count("G") + nC=Line.count("C") + + # Use if/else to figure out if the sequence is 14 or fewer bases + if seqLen<=14: + # Calculate melting temperature + Tm=2*(nG+nC)+2*seqLen + else: + # Return "-9999" + Tm=-9999 + + # Append values to the stored lists + sequenceLength.append(seqLen) + percentGC.append((nG+nC)/seqLen*100) + meltingTemp.append(Tm) + + -# sequenceID - use if/else statemtn +# sequenceID - use if/else statement # sequenceLength - count length of line ### use float(len(line)) # percentGC - count Gs, count Cs, calc %GC (G + C/length) +### use G_count=line.count("G") +### use C_count=line.count("C") # melthingTemp - if/else statement ### if length <= 14, calc melting point -### else, melting point = -9999 \ No newline at end of file +### else, melting point = -9999 + + + + From 46145c60db9828cf441605568987a7eb605be004 Mon Sep 17 00:00:00 2001 From: Brittni Bertolet Date: Tue, 10 Oct 2017 14:03:03 -0400 Subject: [PATCH 3/6] Added pseudocode and data to do number 2 --- AnalysisTutorial7.py | 37 +++++++++++++++++++++++++------------ Q2_lakeData.txt | 1 + 2 files changed, 26 insertions(+), 12 deletions(-) create mode 100644 Q2_lakeData.txt diff --git a/AnalysisTutorial7.py b/AnalysisTutorial7.py index d56893e..46558d6 100644 --- a/AnalysisTutorial7.py +++ b/AnalysisTutorial7.py @@ -1,4 +1,4 @@ -# Analysis for Tutorial 7 +#### Analysis for Tutorial 7 # Set working directory @@ -7,7 +7,7 @@ # Load packages import numpy import pandas -from plotnine import * +# from plotnine import * (not sure we need to do it this way) ############################################ @@ -50,22 +50,35 @@ sequenceLength.append(seqLen) 
percentGC.append((nG+nC)/seqLen*100) meltingTemp.append(Tm) - +# Combine lists into a dataframe +seqDF = pandas.DataFrame(list(zip(sequenceID,sequenceLength,percentGC,meltingTemp)),columns=['sequenceID','sequenceLength','percentGC','meltingTemp']) + +# Close file +seq.close() + +# Create histogram of sequence length +from plotnine import * +plot1=ggplot(seqDF,aes(x="sequenceLength")) +plot1+geom_histogram()+theme_classic() -# sequenceID - use if/else statement +# Create histogram of GC content +plot2=ggplot(seqDF,aes(x="percentGC")) +plot2+geom_histogram()+theme_classic() -# sequenceLength - count length of line -### use float(len(line)) +############################################ +################ Question 2 ################ +############################################ +# I put data in the GitHub titled "Q2_lakeData.txt". +# Read in Q2_lakeData.txt -# percentGC - count Gs, count Cs, calc %GC (G + C/length) -### use G_count=line.count("G") -### use C_count=line.count("C") +# Plot scatter plot of Prod vs chlA -# melthingTemp - if/else statement -### if length <= 14, calc melting point -### else, melting point = -9999 +# Add a trendline +############################################ +################ Question 3 ################ +############################################ diff --git a/Q2_lakeData.txt b/Q2_lakeData.txt new file mode 100644 index 0000000..9b8fab6 --- /dev/null +++ b/Q2_lakeData.txt @@ -0,0 +1 @@ +lakeID chlA pH DOC TP TN Prod BA 4.4 7 6.5 22.8 840 4493 BE 12.8 5.2 9.4 21.5 531 5329 BO 20.7 5.4 19.5 48.7 1389 9261 BR 40.2 8.3 6.6 86.9 1187 12649 CB 16.7 4.1 18.2 33 990 8137 CR 4 5.9 4.5 11.1 1111 2842 FO 6.3 5.5 11.1 52.6 696 3052 HB 13.8 5.3 23 30.7 1678 11418 MO 7.2 7.8 22.6 36.2 1450 8397 NG 32.3 4.7 23.4 17.6 2239 18701 PA 4.3 7.2 5.1 40.9 1024 9279 PE 3.4 7.7 6.4 12 1252 12256 TU 7 6.7 13.4 15.3 1122 8085 WL 8.1 6 7.4 22.1 803 13075 \ No newline at end of file From 91f08afcb6274fa4855de0bac99d598aee019afd Mon Sep 17 00:00:00 2001 From: Chloe 
Spurgat Date: Thu, 12 Oct 2017 17:03:22 -0400 Subject: [PATCH 4/6] added psuedocode for question 3 --- AnalysisTutorial7.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/AnalysisTutorial7.py b/AnalysisTutorial7.py index 46558d6..e22c035 100644 --- a/AnalysisTutorial7.py +++ b/AnalysisTutorial7.py @@ -2,7 +2,7 @@ # Set working directory -os.chdir('/Users/brittnibertolet/Desktop/bcTutorials/Intro_Biocomp_ND_318_Tutorial7/') +#os.chdir('/Users/brittnibertolet/Desktop/bcTutorials/Intro_Biocomp_ND_318_Tutorial7/') # Load packages import numpy @@ -81,4 +81,8 @@ ################ Question 3 ################ ############################################ +#read in data.txt +#bar plot of north, east, south, and west populations + +#scatter plot of observations From 5411531e7e1660de92dfb65aee4e1254cf3f83e8 Mon Sep 17 00:00:00 2001 From: Chloe Spurgat Date: Thu, 12 Oct 2017 17:20:41 -0400 Subject: [PATCH 5/6] code for question 2 --- AnalysisTutorial7.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/AnalysisTutorial7.py b/AnalysisTutorial7.py index e22c035..002bff3 100644 --- a/AnalysisTutorial7.py +++ b/AnalysisTutorial7.py @@ -69,12 +69,17 @@ ############################################ ################ Question 2 ################ ############################################ + # I put data in the GitHub titled "Q2_lakeData.txt". 
# Read in Q2_lakeData.txt -# Plot scatter plot of Prod vs chlA +lakeData=open("Q2_lakeData.txt", "r") + +# Plot scatter plot of Prod vs chlA with trendline + +a=ggplot(lakeData,aes(x="chlA",y="Prod"))+theme_classic()+geom_point() +a+xlab("Concentration of Chlorophyll A")+ylab("Methane Production")+stat_smooth(method="lm") -# Add a trendline ############################################ From edfa595a4db36919e410fbeda6f0fb7c88b36b58 Mon Sep 17 00:00:00 2001 From: Brittni Bertolet Date: Thu, 12 Oct 2017 19:17:46 -0400 Subject: [PATCH 6/6] changed one part of question 2 and finished question 3 --- AnalysisTutorial7.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/AnalysisTutorial7.py b/AnalysisTutorial7.py index 002bff3..d42b8ae 100644 --- a/AnalysisTutorial7.py +++ b/AnalysisTutorial7.py @@ -73,7 +73,7 @@ # I put data in the GitHub titled "Q2_lakeData.txt". # Read in Q2_lakeData.txt -lakeData=open("Q2_lakeData.txt", "r") +lakeData=pandas.read_csv("Q2_lakeData.txt", sep="\t") # Plot scatter plot of Prod vs chlA with trendline @@ -87,7 +87,20 @@ ############################################ #read in data.txt +data3=pandas.read_csv("data.txt",sep=",",header=0) #bar plot of north, east, south, and west populations +plot4=ggplot(data3)+xlab("Region")+ylab("Mean Observation") +plot4+geom_bar(aes(x="factor(region)",y="observations"),stat="summary",fun_y=numpy.mean)+theme_classic() + +# Plot a scatter plot of all the observations +plot5=ggplot(data3, aes(x="region", y="observations"))+xlab("Region")+ylab("Mean Observation") +plot5+geom_point()+geom_jitter()+theme_classic() + +### The two plots tell very different stories. While each region does have a very similar +### mean, the data are distributed differently around the mean. Both East and West have very +### large standard deviations, while the North region does not. The South region is more like a +### bimodal distribution with both maxima on opposite sides of the mean.
+ #scatter plot of observations