From 84d6133185c6ad9ac40a36284f3ec16c313898de Mon Sep 17 00:00:00 2001
From: ayamasaki2011 <ayamasak@nd.edu>
Date: Fri, 6 Oct 2017 11:14:57 -0400
Subject: [PATCH 01/12] AEY: Script for calculating sequence lengths and %GC
 content from fasta file

---
 Exercise7.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100755 Exercise7.py

diff --git a/Exercise7.py b/Exercise7.py
new file mode 100755
index 0000000..4faeb6b
--- /dev/null
+++ b/Exercise7.py
@@ -0,0 +1,20 @@
+#Part 1
+import pandas
+
+InFile=open("Lecture11.fasta","r") #Open fasta file as read-only
+
+sequenceLength=[] #Set up variables to accept/store sequence data as it is calculated
+percentGC=[]
+
+for line in InFile: #Loop through each line in fasta file
+    seq=line.strip() #Drop the trailing newline so it is not counted as a base
+    if '>' in seq or not seq: #Skip header lines (>) and blank lines (avoids dividing by zero)
+        continue
+    else:
+        seqLen=float(len(seq)) #Length of sequence, as float so the division below is true division
+        nGC=seq.count("G")+seq.count("C") #Count G and C bases
+        percGC=float((nGC/seqLen)*100) #Calculate % GC
+    
+        sequenceLength.append(seqLen) #Append length of individual sequences to list
+        percentGC.append(percGC) #Append %GC of individual sequences to list
+

From 534e9ce7701ffe52f38c1f4777263c11e98edd3f Mon Sep 17 00:00:00 2001
From: ayamasaki2011 <ayamasak@nd.edu>
Date: Thu, 12 Oct 2017 11:45:12 -0400
Subject: [PATCH 02/12] AEY: Add plot of GC content to script

---
 Exercise7.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/Exercise7.py b/Exercise7.py
index 4faeb6b..54dcab3 100755
--- a/Exercise7.py
+++ b/Exercise7.py
@@ -1,5 +1,6 @@
 #Part 1
 import pandas
+from plotnine import *
 
 InFile=open("Lecture11.fasta","r") #Open fasta file as read-only
 
@@ -18,3 +19,9 @@
         sequenceLength.append(seqLen) #Append length of individual sequences to list
         percentGC.append(percGC) #Append %GC of individual sequences to list
 
+seqDF=pandas.DataFrame(list(zip(sequenceLength,percentGC)),columns=['sequenceLength','percentGC'])
+a=ggplot(seqDF, aes(x="sequenceLength"))
+a+geom_histogram()+theme_classic()
+
+b=ggplot(seqDF, aes(x="percentGC"))
+b+geom_histogram()+theme_classic()

From 90ecfc31c1b2d4b3c89a1803b9651c467f7056d5 Mon Sep 17 00:00:00 2001
From: ayamasaki2011 <ayamasak@nd.edu>
Date: Thu, 12 Oct 2017 11:46:45 -0400
Subject: [PATCH 03/12] AEY: Add line in script to close file

---
 Exercise7.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Exercise7.py b/Exercise7.py
index 54dcab3..e7e0caa 100755
--- a/Exercise7.py
+++ b/Exercise7.py
@@ -25,3 +25,5 @@
 
 b=ggplot(seqDF, aes(x="percentGC"))
 b+geom_histogram()+theme_classic()
+
+InFile.close()
\ No newline at end of file

From fbaf8a2b3d30b534dcfce33059eb415985d06969 Mon Sep 17 00:00:00 2001
From: ayamasaki2011 <ayamasak@nd.edu>
Date: Thu, 12 Oct 2017 11:48:29 -0400
Subject: [PATCH 04/12] AEY: Comment remaining lines of code

---
 Exercise7.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/Exercise7.py b/Exercise7.py
index e7e0caa..52b7988 100755
--- a/Exercise7.py
+++ b/Exercise7.py
@@ -19,11 +19,11 @@
         sequenceLength.append(seqLen) #Append length of individual sequences to list
         percentGC.append(percGC) #Append %GC of individual sequences to list
 
-seqDF=pandas.DataFrame(list(zip(sequenceLength,percentGC)),columns=['sequenceLength','percentGC'])
-a=ggplot(seqDF, aes(x="sequenceLength"))
-a+geom_histogram()+theme_classic()
+seqDF=pandas.DataFrame(list(zip(sequenceLength,percentGC)),columns=['sequenceLength','percentGC']) #combine lists into dataframe for easier plotting
+a=ggplot(seqDF, aes(x="sequenceLength")) #Create plot of sequence lengths
+a+geom_histogram()+theme_classic() #Plot as histogram
 
-b=ggplot(seqDF, aes(x="percentGC"))
-b+geom_histogram()+theme_classic()
+b=ggplot(seqDF, aes(x="percentGC")) #Create plot of %GC
+b+geom_histogram()+theme_classic() #Plot as histogram
 
-InFile.close()
\ No newline at end of file
+InFile.close() #Close file
\ No newline at end of file

From e507b14eeef5e619acdda219abf967f3ad127eb6 Mon Sep 17 00:00:00 2001
From: ayamasaki2011 <ayamasak@nd.edu>
Date: Thu, 12 Oct 2017 12:24:15 -0400
Subject: [PATCH 05/12] AEY: Start script for Part 3 - calculate mean of
 populations

---
 Exercise7.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/Exercise7.py b/Exercise7.py
index 52b7988..dd9a1ab 100755
--- a/Exercise7.py
+++ b/Exercise7.py
@@ -26,4 +26,14 @@
 b=ggplot(seqDF, aes(x="percentGC")) #Create plot of %GC
 b+geom_histogram()+theme_classic() #Plot as histogram
 
-InFile.close() #Close file
\ No newline at end of file
+InFile.close() #Close file
+
+#Part 2
+import numpy
+
+data=pandas.read_csv("data.txt", header=0, sep=",") #Open file as data frame
+
+nMean=numpy.mean(data[data.region=="north"]) #Calculate mean for each direction/population
+eMean=numpy.mean(data[data.region=="east"])
+wMean=numpy.mean(data[data.region=="west"])
+sMean=numpy.mean(data[data.region=="south"])
\ No newline at end of file

From 68e6535399046cf930dee47b7f711434856cf561 Mon Sep 17 00:00:00 2001
From: ayamasaki2011 <ayamasak@nd.edu>
Date: Thu, 12 Oct 2017 14:39:58 -0400
Subject: [PATCH 06/12] AEY: Plot means onto bar graph

---
 Exercise7.py | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/Exercise7.py b/Exercise7.py
index dd9a1ab..ba8d2b1 100755
--- a/Exercise7.py
+++ b/Exercise7.py
@@ -28,12 +28,29 @@
 
 InFile.close() #Close file
 
-#Part 2
+#Part 3
 import numpy
 
 data=pandas.read_csv("data.txt", header=0, sep=",") #Open file as data frame
 
-nMean=numpy.mean(data[data.region=="north"]) #Calculate mean for each direction/population
-eMean=numpy.mean(data[data.region=="east"])
-wMean=numpy.mean(data[data.region=="west"])
-sMean=numpy.mean(data[data.region=="south"])
\ No newline at end of file
+dataN=data[data.region=="north"] #Subset data frame & find mean for all populations
+nMean=numpy.mean(dataN.observations)
+
+dataE=data[data.region=="east"]
+eMean=numpy.mean(dataE.observations)
+
+dataW=data[data.region=="west"]
+wMean=numpy.mean(dataW.observations)
+
+dataS=data[data.region=="south"]
+sMean=numpy.mean(dataS.observations)
+
+means=pandas.DataFrame(columns=('region', 'mean')) #Combine means into new data frame
+means.region='north','south','east','west'
+means.iloc[0,1]=nMean
+means.iloc[1,1]=sMean
+means.iloc[2,1]=eMean
+means.iloc[3,1]=wMean
+
+c=ggplot(means, aes(x="region",y="mean")) #Plot means on bar graph
+c+geom_col()

From 5c2294b78091ac4c8526743c80c9be35cd1cfee2 Mon Sep 17 00:00:00 2001
From: ayamasaki2011 <ayamasak@nd.edu>
Date: Thu, 12 Oct 2017 14:45:16 -0400
Subject: [PATCH 07/12] AEY: Plot data observations on scatter plot; answer
 question in comments

---
 Exercise7.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/Exercise7.py b/Exercise7.py
index ba8d2b1..a8c4a28 100755
--- a/Exercise7.py
+++ b/Exercise7.py
@@ -53,4 +53,13 @@
 means.iloc[3,1]=wMean
 
 c=ggplot(means, aes(x="region",y="mean")) #Plot means on bar graph
-c+geom_col()
+c+geom_col()+theme_classic()
+
+d=ggplot(data, aes(x="region", y="observations")) #Plot all observations on scatter plot
+d+geom_jitter()+theme_classic()
+
+#Graphs tell different stories - only on the scatter plot does it become apparent that the observations
+#in the south region are two discrete populations, rather than a continuous spread like the others.
+#Additionally, the mean for the West region makes it look as though it has the smallest values, whereas
+#the scatterplot shows that it has both the lowest and the highest values, over a very large spread.
+#The mean barplot is really only an accurate representation for the North region.
\ No newline at end of file

From fbb7d48d3b169c90333139977854256bd26a1794 Mon Sep 17 00:00:00 2001
From: twhmitchell <tmitch10@nd.edu>
Date: Thu, 12 Oct 2017 20:38:16 -0400
Subject: [PATCH 08/12] Add files via upload

Part 2, need to resolve plotting errors
---
 code | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 code

diff --git a/code b/code
new file mode 100644
index 0000000..a54bf3a
--- /dev/null
+++ b/code
@@ -0,0 +1,10 @@
+import numpy
+import pandas
+from plotnine import *
+
+icecream=pandas.read_csv("icream_sales.txt",sep="\t",header=0) #Read tab-separated file; first row holds the column names
+icecream.shape #Inspect dimensions (result is not stored)
+icecream.head(20) #Preview up to the first 20 rows (result is not stored)
+
+a=ggplot(icecream,aes(x="temp",y="sales"))+theme_classic()+geom_point() #Scatter plot of sales against temperature
+a+xlab("Temperature (C)")+ylab("Sales In Dollars")+stat_smooth(method="lm") #Label axes and add a linear-model trend line
\ No newline at end of file

From 829b99f7ae7ccea8d7e075661d5298c127b5ce1f Mon Sep 17 00:00:00 2001
From: twhmitchell <tmitch10@nd.edu>
Date: Thu, 12 Oct 2017 20:39:14 -0400
Subject: [PATCH 09/12] Add files via upload

Dataset for Part 2 of the exercise
---
 icream_sales.txt | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 icream_sales.txt

diff --git a/icream_sales.txt b/icream_sales.txt
new file mode 100644
index 0000000..7faa37b
--- /dev/null
+++ b/icream_sales.txt
@@ -0,0 +1,13 @@
+"temp"	"sales"
+14.2	215
+16.4	325
+11.9	185
+15.2	332
+18.5	406
+22.1	522
+19.4	412
+25.1	614
+23.4	544
+18.1	421
+22.6	445
+17.2	408
\ No newline at end of file

From b6a8115354e722734aa36815f3ece124c32f4923 Mon Sep 17 00:00:00 2001
From: ayamasaki2011 <ayamasak@nd.edu>
Date: Thu, 12 Oct 2017 21:18:36 -0400
Subject: [PATCH 10/12] AEY: Remove Part 3 from exercise code

---
 Exercise7Part1.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100755 Exercise7Part1.py

diff --git a/Exercise7Part1.py b/Exercise7Part1.py
new file mode 100755
index 0000000..52b7988
--- /dev/null
+++ b/Exercise7Part1.py
@@ -0,0 +1,29 @@
+#Part 1
+#Compute the length and %GC of each sequence line in Lecture11.fasta
+#and plot a histogram of each.
+import pandas
+from plotnine import *
+
+sequenceLength=[] #Set up variables to accept/store sequence data as it is calculated
+percentGC=[]
+
+with open("Lecture11.fasta","r") as InFile: #Open fasta file as read-only; closed automatically after the loop
+    for line in InFile: #Loop through each line in fasta file
+        seq=line.strip() #Drop the trailing newline so it is not counted as a base
+        if '>' in seq or not seq: #Skip header lines (>) and blank lines (avoids dividing by zero)
+            continue
+
+        seqLen=float(len(seq)) #Length of sequence, as float so the division below is true division
+        nG=seq.count("G") #Count individual G and C contents
+        nC=seq.count("C")
+        percGC=((nG+nC)/seqLen)*100 #Calculate % GC
+
+        sequenceLength.append(seqLen) #Append length of individual sequences to list
+        percentGC.append(percGC) #Append %GC of individual sequences to list
+
+seqDF=pandas.DataFrame(list(zip(sequenceLength,percentGC)),columns=['sequenceLength','percentGC']) #combine lists into dataframe for easier plotting
+a=ggplot(seqDF, aes(x="sequenceLength")) #Create plot of sequence lengths
+a+geom_histogram()+theme_classic() #Plot as histogram
+
+b=ggplot(seqDF, aes(x="percentGC")) #Create plot of %GC
+b+geom_histogram()+theme_classic() #Plot as histogram
\ No newline at end of file

From 05b8aca55ef77e7626beb2d7e272c0dabd3703d1 Mon Sep 17 00:00:00 2001
From: Balaji Sampathkumar <bsampath@nd.edu>
Date: Fri, 13 Oct 2017 01:34:57 -0400
Subject: [PATCH 11/12] EX 7 Question 3 - Balaji

---
 EX_7_Script_Q3 | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 EX_7_Script_Q3

diff --git a/EX_7_Script_Q3 b/EX_7_Script_Q3
new file mode 100644
index 0000000..ccb5e69
--- /dev/null
+++ b/EX_7_Script_Q3
@@ -0,0 +1,30 @@
+#Compare the mean of "observations" per region (bar graph) with the
+#individual observations (jittered scatter plot) read from data.txt
+import numpy
+import os
+import matplotlib.pyplot as plt
+from plotnine import *
+os.listdir('.')
+dataDir='/Users/sampathkumarbalaji/EX_7/Intro_Biocom_ND_319_Tutorial7'
+if os.path.isdir(dataDir): #only change directory on a machine where this path exists
+    os.chdir(dataDir)
+import pandas
+
+#to parse and read
+data_txt = pandas.read_csv("data.txt")
+directions = ['north','south','east','west']
+
+#create dataframe with one row per region holding the mean of its observations
+#(built in one step: assigning through mean_DF.col[i] is chained indexing and may not update the frame)
+region_means = [numpy.mean(data_txt[data_txt.region==d].observations) for d in directions]
+mean_DF=pandas.DataFrame({'region': directions, 'mean_dir': region_means}, columns=['region', 'mean_dir'])
+
+a=ggplot(mean_DF)+theme_classic()+xlab("region")+ylab("mean_dir")
+a+geom_bar(aes(x="region",y="mean_dir"),stat="summary")
+
+b=ggplot(data_txt, aes(x="region", y="observations")) #Plot all observations on scatter plot
+b+geom_jitter()+theme_classic()
+
+#The bar graph plots the mean for each region; the means are almost the same (~15), so the bars look alike.
+#On the scatter plot, however, north has points centered around 15, while east and west are equally spread
+#and south has a bi-modal distribution.

From 51f686937cb28f822d3b097ff5202201800e1a30 Mon Sep 17 00:00:00 2001
From: Balaji Sampathkumar <bsampath@nd.edu>
Date: Fri, 13 Oct 2017 01:39:50 -0400
Subject: [PATCH 12/12] Combined all 3 scripts into 1 file - Balaji

---
 EX_7_Script_Final | 75 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 EX_7_Script_Final

diff --git a/EX_7_Script_Final b/EX_7_Script_Final
new file mode 100644
index 0000000..2ccf6cf
--- /dev/null
+++ b/EX_7_Script_Final
@@ -0,0 +1,75 @@
+#Part 1 - Amanda
+#Compute the length and %GC of each sequence line in Lecture11.fasta
+#and plot a histogram of each.
+import pandas
+from plotnine import *
+
+sequenceLength=[] #Set up variables to accept/store sequence data as it is calculated
+percentGC=[]
+
+with open("Lecture11.fasta","r") as InFile: #Open fasta file as read-only; closed automatically after the loop
+    for line in InFile: #Loop through each line in fasta file
+        seq=line.strip() #Drop the trailing newline so it is not counted as a base
+        if '>' in seq or not seq: #Skip header lines (>) and blank lines (avoids dividing by zero)
+            continue
+
+        seqLen=float(len(seq)) #Length of sequence, as float so the division below is true division
+        nG=seq.count("G") #Count individual G and C contents
+        nC=seq.count("C")
+        percGC=((nG+nC)/seqLen)*100 #Calculate % GC
+
+        sequenceLength.append(seqLen) #Append length of individual sequences to list
+        percentGC.append(percGC) #Append %GC of individual sequences to list
+
+seqDF=pandas.DataFrame(list(zip(sequenceLength,percentGC)),columns=['sequenceLength','percentGC']) #combine lists into dataframe for easier plotting
+a=ggplot(seqDF, aes(x="sequenceLength")) #Create plot of sequence lengths
+a+geom_histogram()+theme_classic() #Plot as histogram
+
+b=ggplot(seqDF, aes(x="percentGC")) #Create plot of %GC
+b+geom_histogram()+theme_classic() #Plot as histogram
+#############################################################################
+#Part 2 - Thomas
+import numpy
+import pandas
+from plotnine import *
+
+icecream=pandas.read_csv("icream_sales.txt",sep="\t",header=0) #Read tab-separated file; first row holds the column names
+icecream.shape #Inspect dimensions (result is not stored)
+icecream.head(20) #Preview up to the first 20 rows (result is not stored)
+
+a=ggplot(icecream,aes(x="temp",y="sales"))+theme_classic()+geom_point() #Scatter plot of sales against temperature
+a+xlab("Temperature (C)")+ylab("Sales In Dollars")+stat_smooth(method="lm") #Label axes and add a linear-model trend line
+#############################################################################
+#Part 3 - Balaji
+#Compare the mean of "observations" per region (bar graph) with the
+#individual observations (jittered scatter plot) read from data.txt
+import numpy
+import os
+import matplotlib.pyplot as plt
+from plotnine import *
+os.listdir('.')
+dataDir='/Users/sampathkumarbalaji/EX_7/Intro_Biocom_ND_319_Tutorial7'
+if os.path.isdir(dataDir): #only change directory on a machine where this path exists
+    os.chdir(dataDir)
+import pandas
+
+#to parse and read
+data_txt = pandas.read_csv("data.txt")
+directions = ['north','south','east','west']
+
+#create dataframe with one row per region holding the mean of its observations
+#(built in one step: assigning through mean_DF.col[i] is chained indexing and may not update the frame)
+region_means = [numpy.mean(data_txt[data_txt.region==d].observations) for d in directions]
+mean_DF=pandas.DataFrame({'region': directions, 'mean_dir': region_means}, columns=['region', 'mean_dir'])
+
+a=ggplot(mean_DF)+theme_classic()+xlab("region")+ylab("mean_dir")
+a+geom_bar(aes(x="region",y="mean_dir"),stat="summary")
+
+b=ggplot(data_txt, aes(x="region", y="observations")) #Plot all observations on scatter plot
+b+geom_jitter()+theme_classic()
+
+#The bar graph plots the mean for each region; the means are almost the same (~15), so the bars look alike.
+#On the scatter plot, however, north has points centered around 15, while east and west are equally spread
+#and south has a bi-modal distribution.
+
+