mdoellma · brittnibertolet · Oct 6, 2017 · Oct 6, 2017 · Oct 10, 2017 · Oct 12, 2017
diff --git a/AnalysisTutorial7.py b/AnalysisTutorial7.py
@@ -0,0 +1,106 @@
+#### Analysis for Tutorial 7
+
+
+# Set working directory
+#os.chdir('/Users/brittnibertolet/Desktop/bcTutorials/Intro_Biocomp_ND_318_Tutorial7/')
+
+# Load packages
+import numpy 
+import pandas
+# from plotnine import * (not sure we need to do it this way)
+
+
+############################################
+################ Question 1 ################
+############################################
+
+# Per-sequence results, one entry per FASTA record (filled in the loop below)
+sequenceID=[]
+sequenceLength=[]
+percentGC=[]
+meltingTemp=[]
+
+# Read the FASTA file; the with-block closes it even if parsing raises.
+# NOTE: assumes each sequence sits on a single line after its '>' header.
+with open("Lecture11.fasta",'r') as seq:
+    for Line in seq:
+        # Remove newline character (and surrounding whitespace) from the line
+        Line=Line.strip()
+        # Blank lines carry no sequence; skipping them avoids dividing by a
+        # zero length when computing percent GC below
+        if not Line:
+            continue
+        # Header lines start with '>'; everything after it is the sequence ID
+        if Line.startswith('>'):
+            sequenceID.append(Line[1:])
+            continue
+
+        # Sequence length as a float so the GC division is never integer division
+        seqLen=float(len(Line))
+        # Count the number of G's and C's
+        nG=Line.count("G")
+        nC=Line.count("C")
+
+        # Melting temperature is only estimated for sequences of 14 or fewer bases:
+        # 2*(G+C)+2*length equals 4*(G+C)+2*(A+T) when the line holds only A/C/G/T.
+        # Longer sequences get the sentinel value -9999.
+        if seqLen<=14:
+            Tm=2*(nG+nC)+2*seqLen
+        else:
+            Tm=-9999
+
+        # Append values to the stored lists
+        sequenceLength.append(seqLen)
+        percentGC.append((nG+nC)/seqLen*100)
+        meltingTemp.append(Tm)
+
+# Combine lists into a dataframe (one row per sequence)
+seqDF = pandas.DataFrame(list(zip(sequenceID,sequenceLength,percentGC,meltingTemp)),columns=['sequenceID','sequenceLength','percentGC','meltingTemp'])
+
+# Histogram of sequence lengths (plotnine is imported here, at first use)
+from plotnine import *
+plot1 = ggplot(data=seqDF, mapping=aes(x="sequenceLength"))
+plot1 + geom_histogram() + theme_classic()
+
+# Histogram of percent GC content
+plot2 = ggplot(data=seqDF, mapping=aes(x="percentGC"))
+plot2 + geom_histogram() + theme_classic()
+
+############################################
+################ Question 2 ################
+############################################
+
+# The lake data are stored in the repository as "Q2_lakeData.txt" (tab-delimited).
+# Load the file into a dataframe.
+
+lakeData = pandas.read_csv("Q2_lakeData.txt", delimiter="\t")
+
+# Scatter plot of Prod against chlA with a linear-model ("lm") trendline
+
+a = ggplot(data=lakeData, mapping=aes(x="chlA", y="Prod")) + theme_classic() + geom_point()
+a + xlab("Concentration of Chlorophyll A") + ylab("Methane Production") + stat_smooth(method="lm")
+
+
+
+############################################
+################ Question 3 ################
+############################################
+
+#read in data.txt (comma-separated; the first row holds the column names)
+data3=pandas.read_csv("data.txt",sep=",",header=0)
+
+#bar plot of the mean observation in the north, east, south, and west populations
+plot4=ggplot(data3)+xlab("Region")+ylab("Mean Observation")
+plot4+geom_bar(aes(x="factor(region)",y="observations"),stat="summary",fun_y=numpy.mean)+theme_classic()
+
+# Scatter plot of all the observations; geom_jitter alone draws each one once, slightly offset
+plot5=ggplot(data3, aes(x="region", y="observations"))+xlab("Region")+ylab("Observation")
+plot5+geom_jitter()+theme_classic()
+
+### The two plots tell very different stories. While the regions all have very similar
+### means, the data are distributed differently around those means. Both East and West have
+### very large standard deviations, while the North region does not. The South region looks
+### bimodal, with its two maxima on opposite sides of the mean.
+
+
+#scatter plot of observations
diff --git a/Q2_lakeData.txt b/Q2_lakeData.txt
@@ -0,0 +1 @@
+lakeID	chlA	pH	DOC	TP	TN	ProdBA	4.4	7	6.5	22.8	840	4493BE	12.8	5.2	9.4	21.5	531	5329BO	20.7	5.4	19.5	48.7	1389	9261BR	40.2	8.3	6.6	86.9	1187	12649CB	16.7	4.1	18.2	33	990	8137CR	4	5.9	4.5	11.1	1111	2842FO	6.3	5.5	11.1	52.6	696	3052HB	13.8	5.3	23	30.7	1678	11418MO	7.2	7.8	22.6	36.2	1450	8397NG	32.3	4.7	23.4	17.6	2239	18701PA	4.3	7.2	5.1	40.9	1024	9279PE	3.4	7.7	6.4	12	1252	12256TU	7	6.7	13.4	15.3	1122	8085WL	8.1	6	7.4	22.1	803	13075
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		lakeID chlA pH DOC TP TN ProdBA 4.4 7 6.5 22.8 840 4493BE 12.8 5.2 9.4 21.5 531 5329BO 20.7 5.4 19.5 48.7 1389 9261BR 40.2 8.3 6.6 86.9 1187 12649CB 16.7 4.1 18.2 33 990 8137CR 4 5.9 4.5 11.1 1111 2842FO 6.3 5.5 11.1 52.6 696 3052HB 13.8 5.3 23 30.7 1678 11418MO 7.2 7.8 22.6 36.2 1450 8397NG 32.3 4.7 23.4 17.6 2239 18701PA 4.3 7.2 5.1 40.9 1024 9279PE 3.4 7.7 6.4 12 1252 12256TU 7 6.7 13.4 15.3 1122 8085WL 8.1 6 7.4 22.1 803 13075
Expand Down