From 13c2f8e1165d493604b08c2bca130f77b5563fe8 Mon Sep 17 00:00:00 2001
From: Brittni Bertolet <bbertole@nd.edu>
Date: Fri, 6 Oct 2017 10:56:16 -0400
Subject: [PATCH 1/6] initial commit

---
 AnalysisTutorial7.py | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 AnalysisTutorial7.py

diff --git a/AnalysisTutorial7.py b/AnalysisTutorial7.py
new file mode 100644
index 0000000..e000d5c
--- /dev/null
+++ b/AnalysisTutorial7.py
@@ -0,0 +1,32 @@
+# Analysis for Tutorial 7
+
+
+# Set working directory
+os.chdir('/Users/brittnibertolet/Desktop/bcTutorials/Intro_Biocomp_ND_318_Tutorial7/')
+
+# Load packages
+import numpy 
+import pandas
+from plotnine import *
+
+
+############################################
+################ Question 1 ################
+############################################
+
+# Read in fasta 
+seq=numpy.loadtxt("Lecture11.fasta")
+
+# Plan for storing info
+# Create for loop to do these things 
+
+# sequenceID - use if/else statemtn
+
+# sequenceLength - count length of line 
+### use float(len(line))
+
+# percentGC - count Gs, count Cs, calc %GC (G + C/length)
+
+# melthingTemp - if/else statement
+### if length <= 14, calc melting point
+### else, melting point = -9999
\ No newline at end of file

From f4010088f95cff38762e1622ba2432325371fa44 Mon Sep 17 00:00:00 2001
From: Brittni Bertolet <bbertole@nd.edu>
Date: Fri, 6 Oct 2017 11:14:35 -0400
Subject: [PATCH 2/6] Added the code to process data of number 1

---
 AnalysisTutorial7.py | 45 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 42 insertions(+), 3 deletions(-)

diff --git a/AnalysisTutorial7.py b/AnalysisTutorial7.py
index e000d5c..d56893e 100644
--- a/AnalysisTutorial7.py
+++ b/AnalysisTutorial7.py
@@ -15,18 +15,57 @@
 ############################################
 
 # Read in fasta 
-seq=numpy.loadtxt("Lecture11.fasta")
+seq=open("Lecture11.fasta",'r')
 
 # Plan for storing info
+sequenceID=[]
+sequenceLength=[]
+percentGC=[]
+meltingTemp=[]
+
 # Create for loop to do these things 
+for Line in seq:
+    # Remove newline character from file line
+    Line=Line.strip()
+    # Use if/else statement to figure out what line you're on
+    if '>' in Line:
+        # Append sequenceID to stored list
+        sequenceID.append(Line[1:])
+    else:
+        # Count sequence length
+        seqLen=float(len(Line))
+        # Count the number of G's and C's
+        nG=Line.count("G")
+        nC=Line.count("C")
+        
+        # Use if/else to figure out if the sequence is 14 or fewer bases 
+        if seqLen<=14:
+            # Calculate melting temperature
+            Tm=2*(nG+nC)+2*seqLen
+        else:
+            # Return "-9999"
+            Tm=-9999
+        
+        # Append values to the stored lists
+        sequenceLength.append(seqLen)
+        percentGC.append((nG+nC)/seqLen*100)
+        meltingTemp.append(Tm)
+    
+
 
-# sequenceID - use if/else statemtn
+# sequenceID - use if/else statement
 
 # sequenceLength - count length of line 
 ### use float(len(line))
 
 # percentGC - count Gs, count Cs, calc %GC (G + C/length)
+### use G_count=line.count("G")
+### use C_count=line.count("C")
 
 # melthingTemp - if/else statement
 ### if length <= 14, calc melting point
-### else, melting point = -9999
\ No newline at end of file
+### else, melting point = -9999
+
+
+
+

From 46145c60db9828cf441605568987a7eb605be004 Mon Sep 17 00:00:00 2001
From: Brittni Bertolet <bbertole@nd.edu>
Date: Tue, 10 Oct 2017 14:03:03 -0400
Subject: [PATCH 3/6] Added pseudocode and data to do number 2

---
 AnalysisTutorial7.py | 37 +++++++++++++++++++++++++------------
 Q2_lakeData.txt      |  1 +
 2 files changed, 26 insertions(+), 12 deletions(-)
 create mode 100644 Q2_lakeData.txt

diff --git a/AnalysisTutorial7.py b/AnalysisTutorial7.py
index d56893e..46558d6 100644
--- a/AnalysisTutorial7.py
+++ b/AnalysisTutorial7.py
@@ -1,4 +1,4 @@
-# Analysis for Tutorial 7
+#### Analysis for Tutorial 7
 
 
 # Set working directory
@@ -7,7 +7,7 @@
 # Load packages
 import numpy 
 import pandas
-from plotnine import *
+# from plotnine import * (not sure we need to do it this way)
 
 
 ############################################
@@ -50,22 +50,35 @@
         sequenceLength.append(seqLen)
         percentGC.append((nG+nC)/seqLen*100)
         meltingTemp.append(Tm)
-    
 
+# Combine lists into a dataframe
+seqDF = pandas.DataFrame(list(zip(sequenceID,sequenceLength,percentGC,meltingTemp)),columns=['sequenceID','sequenceLength','percentGC','meltingTemp'])
+
+# Close file
+seq.close()
+
+# Create histogram of sequence length
+from plotnine import *
+plot1=ggplot(seqDF,aes(x="sequenceLength"))
+plot1+geom_histogram()+theme_classic()
 
-# sequenceID - use if/else statement
+# Create histogram of GC content
+plot2=ggplot(seqDF,aes(x="percentGC"))
+plot2+geom_histogram()+theme_classic()
 
-# sequenceLength - count length of line 
-### use float(len(line))
+############################################
+################ Question 2 ################
+############################################
+# I put data in the GitHub titled "Q2_lakeData.txt". 
+# Read in Q2_lakeData.txt
 
-# percentGC - count Gs, count Cs, calc %GC (G + C/length)
-### use G_count=line.count("G")
-### use C_count=line.count("C")
+# Plot scatter plot of Prod vs chlA 
 
-# melthingTemp - if/else statement
-### if length <= 14, calc melting point
-### else, melting point = -9999
+# Add a trendline
 
 
+############################################
+################ Question 3 ################
+############################################
 
 
diff --git a/Q2_lakeData.txt b/Q2_lakeData.txt
new file mode 100644
index 0000000..9b8fab6
--- /dev/null
+++ b/Q2_lakeData.txt
@@ -0,0 +1 @@
+lakeID	chlA	pH	DOC	TP	TN	ProdBA	4.4	7	6.5	22.8	840	4493BE	12.8	5.2	9.4	21.5	531	5329BO	20.7	5.4	19.5	48.7	1389	9261BR	40.2	8.3	6.6	86.9	1187	12649CB	16.7	4.1	18.2	33	990	8137CR	4	5.9	4.5	11.1	1111	2842FO	6.3	5.5	11.1	52.6	696	3052HB	13.8	5.3	23	30.7	1678	11418MO	7.2	7.8	22.6	36.2	1450	8397NG	32.3	4.7	23.4	17.6	2239	18701PA	4.3	7.2	5.1	40.9	1024	9279PE	3.4	7.7	6.4	12	1252	12256TU	7	6.7	13.4	15.3	1122	8085WL	8.1	6	7.4	22.1	803	13075
\ No newline at end of file

From 91f08afcb6274fa4855de0bac99d598aee019afd Mon Sep 17 00:00:00 2001
From: Chloe Spurgat <cspurgat@nd.edu>
Date: Thu, 12 Oct 2017 17:03:22 -0400
Subject: [PATCH 4/6] added pseudocode for question 3

---
 AnalysisTutorial7.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/AnalysisTutorial7.py b/AnalysisTutorial7.py
index 46558d6..e22c035 100644
--- a/AnalysisTutorial7.py
+++ b/AnalysisTutorial7.py
@@ -2,7 +2,7 @@
 
 
 # Set working directory
-os.chdir('/Users/brittnibertolet/Desktop/bcTutorials/Intro_Biocomp_ND_318_Tutorial7/')
+#os.chdir('/Users/brittnibertolet/Desktop/bcTutorials/Intro_Biocomp_ND_318_Tutorial7/')
 
 # Load packages
 import numpy 
@@ -81,4 +81,8 @@
 ################ Question 3 ################
 ############################################
 
+#read in data.txt
 
+#bar plot of north, east, south, and west populations
+
+#scatter plot of observations

From 5411531e7e1660de92dfb65aee4e1254cf3f83e8 Mon Sep 17 00:00:00 2001
From: Chloe Spurgat <cspurgat@nd.edu>
Date: Thu, 12 Oct 2017 17:20:41 -0400
Subject: [PATCH 5/6] code for question 2

---
 AnalysisTutorial7.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/AnalysisTutorial7.py b/AnalysisTutorial7.py
index e22c035..002bff3 100644
--- a/AnalysisTutorial7.py
+++ b/AnalysisTutorial7.py
@@ -69,12 +69,17 @@
 ############################################
 ################ Question 2 ################
 ############################################
+
 # I put data in the GitHub titled "Q2_lakeData.txt". 
 # Read in Q2_lakeData.txt
 
-# Plot scatter plot of Prod vs chlA 
+lakeData=open("Q2_lakeData.txt", "r")
+
+# Plot scatter plot of Prod vs chlA with trendline
+
+a=ggplot(lakeData,aes(x="chlA",y="Prod"))+theme_classic()+geom_point() 
+a+xlab("Concentration of Chlorophyll A")+ylab("Methane Production")+stat_smooth(method="lm")
 
-# Add a trendline
 
 
 ############################################

From edfa595a4db36919e410fbeda6f0fb7c88b36b58 Mon Sep 17 00:00:00 2001
From: Brittni Bertolet <bbertole@nd.edu>
Date: Thu, 12 Oct 2017 19:17:46 -0400
Subject: [PATCH 6/6] changed one part of question 2 and finished question 3

---
 AnalysisTutorial7.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/AnalysisTutorial7.py b/AnalysisTutorial7.py
index 002bff3..d42b8ae 100644
--- a/AnalysisTutorial7.py
+++ b/AnalysisTutorial7.py
@@ -73,7 +73,7 @@
 # I put data in the GitHub titled "Q2_lakeData.txt". 
 # Read in Q2_lakeData.txt
 
-lakeData=open("Q2_lakeData.txt", "r")
+lakeData=pandas.read_csv("Q2_lakeData.txt", sep="\t")
 
 # Plot scatter plot of Prod vs chlA with trendline
 
@@ -87,7 +87,20 @@
 ############################################
 
 #read in data.txt
+data3=pandas.read_csv("data.txt",sep=",",header=0)
 
 #bar plot of north, east, south, and west populations
+plot4=ggplot(data3)+xlab("Region")+ylab("Mean Observation")
+plot4+geom_bar(aes(x="factor(region)",y="observations"),stat="summary",fun_y=numpy.mean)+theme_classic()
+
+# Scatter plot of all individual observations by region (geom_jitter draws each point once, spread horizontally)
+plot5=ggplot(data3, aes(x="region", y="observations"))+xlab("Region")+ylab("Observation")
+plot5+geom_jitter()+theme_classic()
+
+### The two plots tell very different stories. While each region does have a very similar 
+### mean, the data are distributed differently around the mean. Both East and West have very 
+### large standard deviations, while the North region does not. The South region is more like a 
+### bimodal distribution, with its two maxima on opposite sides of the mean.
+
 
 #scatter plot of observations