From 9b779de780ea1f904691723bb0ca9144b22538b6 Mon Sep 17 00:00:00 2001
From: Katherine <bookaholic214@gmail.com>
Date: Fri, 6 Oct 2017 10:54:14 -0400
Subject: [PATCH 01/16] created python script file

---
 exercise7 | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100755 exercise7

diff --git a/exercise7 b/exercise7
new file mode 100755
index 0000000..e69de29

From 43a71cf0e9a89477d489a602d081f2e06dc60140 Mon Sep 17 00:00:00 2001
From: Michelle Corley <mcorley1@nd.edu>
Date: Fri, 6 Oct 2017 11:19:15 -0400
Subject: [PATCH 02/16] initial commit of part 1 script, no plots yet

---
 part1script.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 part1script.py

diff --git a/part1script.py b/part1script.py
new file mode 100644
index 0000000..3ac5694
--- /dev/null
+++ b/part1script.py
@@ -0,0 +1,31 @@
+import pandas
+
+#Question 1
+InFile=open("Lecture11.fasta","r")
+
+#create lists for storing information about sequences
+sequenceID=[]
+sequenceLength=[]
+percentGC=[]
+meltingTemp=[]
+
+#loop through each line in fasta file to process sequences
+for Line in InFile:
+    Line=Line.strip()
+    if '>' in Line:
+        sequenceID.append(Line[1:])
+    else:
+        seqLen=float(len(Line))
+        nG=Line.count("G")
+        nC=Line.count("C")
+        
+    #append values to lists
+        sequenceLength.append(seqLen)
+        percentGC.append((nG+nC)/seqLen*100)
+        print(percentGC)
+    
+#combine lists into dataframe 
+seqDF = pandas.DataFrame(list(zip(sequenceID,sequenceLength,percentGC)),columns=['sequenceID','sequenceLength','percentGC'])
+
+#close file
+InFile.close()

From ceeed73691b2543642867e7d8ac411c9b9dbf90f Mon Sep 17 00:00:00 2001
From: Katherine <bookaholic214@gmail.com>
Date: Mon, 9 Oct 2017 12:08:53 -0400
Subject: [PATCH 03/16] Added to Q1, loaded Q3 dataset and made blank lists

---
 exercise7 | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/exercise7 b/exercise7
index e69de29..ca0f9fb 100755
--- a/exercise7
+++ b/exercise7
@@ -0,0 +1,59 @@
+####Exercise7####
+#Question 1
+#load dataset
+import pandas
+InFile=open("Lecture11.fasta","r")
+#create lists for storing information about sequences
+sequenceID=[]
+sequenceLength=[]
+percentGC=[]
+meltingTemp=[]
+
+for Line in InFile:
+# remove newline character from file line
+    Line=Line.strip()
+    if '>' in Line:
+        sequenceID.append(Line[1:])
+    else:
+        seqLen=float(len(Line))
+        # count the number of G's and C's
+        nG=Line.count("G")
+        nC=Line.count("C")
+# if the sequence is 14 or fewer bases calculate melting temperature
+        if seqLen<=14:
+            Tm=2*(nG+nC)+2*seqLen
+        else:
+            Tm=-9999
+#append values to list
+sequenceLength.append(seqLen)
+percentGC.append((nG+nC)/seqLen*100)
+meltingTemp.append(Tm)
+
+#Histogram of sequence lengths
+import plotnine
+from plotnine import *
+p=(ggplot(data=None) +
+    aes(x="sequenceID",y="seqLen") +
+    geom_histogram(binwidth=2))
+
+#Question 2
+
+
+#Question 3
+#load the dataset
+import pandas
+Data = pandas.read_csv("data.txt", sep='\t', lineterminator='\r')
+#create lists for storing information about sequences
+North=[]
+East=[]
+South=[]
+West=[]
+for Pop in Data:
+    if Data.region[Pop]=="north":
+        North.append(line[1:])
+    elif Data.region[Pop]=="east":
+        East.append(line[1:])
+    elif Data.region[Pop]=="south":
+        South.append(line[1:])
+    else:
+        West.append(line[1:])
\ No newline at end of file

From 493ca778e896a6b0811502072b2c6c18e449a751 Mon Sep 17 00:00:00 2001
From: Katherine <bookaholic214@gmail.com>
Date: Mon, 9 Oct 2017 16:32:20 -0400
Subject: [PATCH 04/16] Completed Questions 1 and 3

---
 exercise7 | 49 +++++++++++++++++++++++--------------------------
 1 file changed, 23 insertions(+), 26 deletions(-)

diff --git a/exercise7 b/exercise7
index ca0f9fb..9500604 100755
--- a/exercise7
+++ b/exercise7
@@ -3,6 +3,7 @@
 #load dataset
 import pandas
 InFile=open("Lecture11.fasta","r")
+InFile=close()
 #create lists for storing information about sequences
 sequenceID=[]
 sequenceLength=[]
@@ -12,29 +13,32 @@ meltingTemp=[]
 for Line in InFile:
 # remove newline character from file line
     Line=Line.strip()
+    print (Line)
     if '>' in Line:
         sequenceID.append(Line[1:])
     else:
-        seqLen=float(len(Line))
+        Seqlength = float(len(Line))
+        print (Seqlength)
+        sequenceLength.append(Seqlength)
         # count the number of G's and C's
         nG=Line.count("G")
+        print (nG)
         nC=Line.count("C")
-# if the sequence is 14 or fewer bases calculate melting temperature
-        if seqLen<=14:
-            Tm=2*(nG+nC)+2*seqLen
-        else:
-            Tm=-9999
+        print (nC)
 #append values to list
-sequenceLength.append(seqLen)
-percentGC.append((nG+nC)/seqLen*100)
-meltingTemp.append(Tm)
+        gcTotal = (nG+nC)/Seqlength*100
+        percentGC.append(gcTotal)
+
+#dataframe of resulting info
+seqDF = pandas.DataFrame(list(zip(sequenceID,sequenceLength,percentGC)),columns=['sequenceID','sequenceLength','percentGC'])
+InFile.close()
 
 #Histogram of sequence lengths
 import plotnine
 from plotnine import *
-p=(ggplot(data=None) +
-    aes(x="sequenceID",y="seqLen") +
-    geom_histogram(binwidth=2))
+p=(ggplot(data=seqDF) +
+    aes(x="sequenceLength") +
+    geom_histogram(binwidth=5))
 
 #Question 2
 
@@ -42,18 +46,11 @@ p=(ggplot(data=None) +
 #Question 3
 #load the dataset
 import pandas
-Data = pandas.read_csv("data.txt", sep='\t', lineterminator='\r')
+import numpy
+Data = pandas.read_csv("data.txt", sep=',')
+print (Data)
 #create lists for storing information about sequences
-North=[]
-East=[]
-South=[]
-West=[]
-for Pop in Data:
-    if Data.region[Pop]=="north":
-        North.append(line[1:])
-    elif Data.region[Pop]=="east":
-        East.append(line[1:])
-    elif Data.region[Pop]=="south":
-        South.append(line[1:])
-    else:
-        West.append(line[1:])
\ No newline at end of file
+import plotnine
+from plotnine import *
+d=ggplot(Data)+theme_classic()+xlab("region")+ylab("Average")
+d+geom_bar(aes(x="factor(region)",y="observations"),stat="summary",fun_y=numpy.mean)

From 7017b00e59961b83740195cb3e8be516006e592c Mon Sep 17 00:00:00 2001
From: Katherine <bookaholic214@gmail.com>
Date: Mon, 9 Oct 2017 18:28:18 -0400
Subject: [PATCH 05/16] Cleaned up code and added comments for Q1 and Q3

---
 exercise7 | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/exercise7 b/exercise7
index 9500604..090024d 100755
--- a/exercise7
+++ b/exercise7
@@ -9,14 +9,16 @@ sequenceID=[]
 sequenceLength=[]
 percentGC=[]
 meltingTemp=[]
-
+#for loop to sort out sequence lines and append their lengths
 for Line in InFile:
 # remove newline character from file line
     Line=Line.strip()
     print (Line)
+    # header lines (marked by '>') are separated from sequence lines
     if '>' in Line:
         sequenceID.append(Line[1:])
     else:
+        # compute the sequence length and append it to the sequenceLength list
         Seqlength = float(len(Line))
         print (Seqlength)
         sequenceLength.append(Seqlength)
@@ -25,12 +27,14 @@ for Line in InFile:
         print (nG)
         nC=Line.count("C")
         print (nC)
-#append values to list
+        # append values to list
         gcTotal = (nG+nC)/Seqlength*100
         percentGC.append(gcTotal)
 
 #dataframe of resulting info
 seqDF = pandas.DataFrame(list(zip(sequenceID,sequenceLength,percentGC)),columns=['sequenceID','sequenceLength','percentGC'])
+#to make infile management easier
+InFile=open("Lecture11.fasta","r")
 InFile.close()
 
 #Histogram of sequence lengths
@@ -38,7 +42,7 @@ import plotnine
 from plotnine import *
 p=(ggplot(data=seqDF) +
     aes(x="sequenceLength") +
-    geom_histogram(binwidth=5))
+    geom_histogram(binwidth=4))
 
 #Question 2
 
@@ -49,7 +53,8 @@ import pandas
 import numpy
 Data = pandas.read_csv("data.txt", sep=',')
 print (Data)
-#create lists for storing information about sequences
+
+#making bar graph with region as x and mean of observations as y
 import plotnine
 from plotnine import *
 d=ggplot(Data)+theme_classic()+xlab("region")+ylab("Average")

From d28b56599388b47dff5707081f7ffe84de3b641d Mon Sep 17 00:00:00 2001
From: Michelle Corley <mcorley1@nd.edu>
Date: Mon, 9 Oct 2017 22:13:29 -0400
Subject: [PATCH 06/16] histogram of sequence length

---
 part1script.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/part1script.py b/part1script.py
index 3ac5694..ef5ac0f 100644
--- a/part1script.py
+++ b/part1script.py
@@ -1,4 +1,7 @@
+import numpy
 import pandas
+from plotnine import *
+
 
 #Question 1
 InFile=open("Lecture11.fasta","r")
@@ -11,9 +14,10 @@
 
 #loop through each line in fasta file to process sequences
 for Line in InFile:
-    Line=Line.strip()
+    Line=Line.strip() #removes white space, tab, space, newline characters
     if '>' in Line:
         sequenceID.append(Line[1:])
+        #print(Line[1:])
     else:
         seqLen=float(len(Line))
         nG=Line.count("G")
@@ -22,10 +26,17 @@
     #append values to lists
         sequenceLength.append(seqLen)
         percentGC.append((nG+nC)/seqLen*100)
-        print(percentGC)
     
 #combine lists into dataframe 
 seqDF = pandas.DataFrame(list(zip(sequenceID,sequenceLength,percentGC)),columns=['sequenceID','sequenceLength','percentGC'])
+#min(seqDF.sequenceLength)
 
 #close file
 InFile.close()
+
+#plots histogram of sequence length
+b=ggplot(seqDF,aes(x="sequenceLength"))
+b+geom_histogram(binwidth=5)+theme_classic()
+
+
+

From 4755988e1700553f8b557e8fa1337ce39a3c1f08 Mon Sep 17 00:00:00 2001
From: Michelle Corley <mcorley1@nd.edu>
Date: Tue, 10 Oct 2017 11:54:25 -0400
Subject: [PATCH 07/16] percentGC histogram added

---
 part1script.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/part1script.py b/part1script.py
index ef5ac0f..eedaed5 100644
--- a/part1script.py
+++ b/part1script.py
@@ -38,5 +38,12 @@
 b=ggplot(seqDF,aes(x="sequenceLength"))
 b+geom_histogram(binwidth=5)+theme_classic()
 
+#plots histogram of sequence length
+b=ggplot(seqDF,aes(x="percentGC"))
+b+geom_histogram(binwidth=5)+theme_classic()
+
+
+
+
 
 

From 48664baf35f5235960ed550639a49c179d823311 Mon Sep 17 00:00:00 2001
From: Michelle Corley <mcorley1@nd.edu>
Date: Tue, 10 Oct 2017 13:52:37 -0400
Subject: [PATCH 08/16] part 1 script updated

---
 part1script.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/part1script.py b/part1script.py
index eedaed5..eceada6 100644
--- a/part1script.py
+++ b/part1script.py
@@ -38,7 +38,7 @@
 b=ggplot(seqDF,aes(x="sequenceLength"))
 b+geom_histogram(binwidth=5)+theme_classic()
 
-#plots histogram of sequence length
+#plots histogram of percent GC
 b=ggplot(seqDF,aes(x="percentGC"))
 b+geom_histogram(binwidth=5)+theme_classic()
 

From 0deb9a8942799885719766bbfe36d516110c7bb6 Mon Sep 17 00:00:00 2001
From: Michelle Corley <mcorley1@nd.edu>
Date: Tue, 10 Oct 2017 13:53:21 -0400
Subject: [PATCH 09/16] part 3 script with scatter plot

---
 part3script.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 part3script.py

diff --git a/part3script.py b/part3script.py
new file mode 100644
index 0000000..595e38c
--- /dev/null
+++ b/part3script.py
@@ -0,0 +1,16 @@
+#Question 3
+#load the dataset
+import pandas
+import numpy
+Data = pandas.read_csv("data.txt", sep=',')
+#print (Data)
+
+#making bar graph with region as x and mean of observations as y
+import plotnine
+from plotnine import *
+d=ggplot(Data)+theme_classic()+xlab("region")+ylab("Average")
+d+geom_bar(aes(x="factor(region)",y="observations"),stat="summary",fun_y=numpy.mean)
+
+#scatter plot of all observations
+a=ggplot(Data,aes(x="region",y="observations"))
+a+geom_jitter()+coord_cartesian()

From 36b5e959f437f2891b625dcf6b54d12bd21dfa1a Mon Sep 17 00:00:00 2001
From: Katherine <bookaholic214@gmail.com>
Date: Wed, 11 Oct 2017 19:51:16 -0400
Subject: [PATCH 10/16] Recommit because we were accidentally working in
 separate files. Making sure all code is in exercise7.py

---
 exercise7 | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/exercise7 b/exercise7
index 090024d..ac5f02e 100755
--- a/exercise7
+++ b/exercise7
@@ -44,6 +44,11 @@ p=(ggplot(data=seqDF) +
     aes(x="sequenceLength") +
     geom_histogram(binwidth=4))
 
+#Histogram of Percent GC
+g=(ggplot(data=seqDF) +
+    aes(x="percentGC") +
+    geom_histogram(binwidth=5))
+
 #Question 2
 
 
@@ -59,3 +64,7 @@ import plotnine
 from plotnine import *
 d=ggplot(Data)+theme_classic()+xlab("region")+ylab("Average")
 d+geom_bar(aes(x="factor(region)",y="observations"),stat="summary",fun_y=numpy.mean)
+
+#scatter plot of everything observed
+a=ggplot(Data,aes(x="region",y="observations"))
+a+geom_jitter()+coord_cartesian()

From b96c3cd1b09f6d78eea69194d43c8e9b5a1b9197 Mon Sep 17 00:00:00 2001
From: Michelle Corley <mcorley1@nd.edu>
Date: Thu, 12 Oct 2017 13:07:39 -0400
Subject: [PATCH 11/16] trouble with plotting

---
 part2script.py | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 part2script.py

diff --git a/part2script.py b/part2script.py
new file mode 100644
index 0000000..b647df3
--- /dev/null
+++ b/part2script.py
@@ -0,0 +1,11 @@
+import numpy
+import pandas
+import plotnine
+from plotnine import *
+
+Part2=pandas.read_csv("part2datacopy.txt", sep="",", header=0)
+#print(Part2)
+
+#plotting data in scatterplot with trendline
+a=ggplot(Part2,aes(x="oil changes per year",y="cost of repairs($)"))+theme_classic()+geom_point()
+a+xlab("oil changes per year")+ylab("cost of repairs($)")+stat_smooth(method="lm")

From b5b3c8bde263497e6acda5d09e8f6f2afcc932b1 Mon Sep 17 00:00:00 2001
From: Michelle Corley <mcorley1@nd.edu>
Date: Thu, 12 Oct 2017 13:09:11 -0400
Subject: [PATCH 12/16] data for part2

---
 part2datacopy.txt | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 part2datacopy.txt

diff --git a/part2datacopy.txt b/part2datacopy.txt
new file mode 100644
index 0000000..903b508
--- /dev/null
+++ b/part2datacopy.txt
@@ -0,0 +1,5 @@
+“oil changes per year”,”cost of repairs($)”“3”,300“5”,300“2”,500“3”,400“1”,700“4”,400“6”,100“4”,250“3”,450
+“2”,650
+“0”,600
+“10”,0
+“7”,150
\ No newline at end of file

From 1dd0597e32eafe32ada673607a6b753ea7ff1ff4 Mon Sep 17 00:00:00 2001
From: Katherine <bookaholic214@gmail.com>
Date: Thu, 12 Oct 2017 14:49:58 -0400
Subject: [PATCH 13/16] Removed quotation marks via command line

---
 part2datacopy.txt | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/part2datacopy.txt b/part2datacopy.txt
index 903b508..82b3374 100644
--- a/part2datacopy.txt
+++ b/part2datacopy.txt
@@ -1,5 +1 @@
-“oil changes per year”,”cost of repairs($)”“3”,300“5”,300“2”,500“3”,400“1”,700“4”,400“6”,100“4”,250“3”,450
-“2”,650
-“0”,600
-“10”,0
-“7”,150
\ No newline at end of file
+oil changes per year,cost of repairs($)3,3005,3002,5003,4001,7004,4006,1004,2503,4502,6500,60010,07,150
\ No newline at end of file

From 0834cbf5cea7caf2258d0a00b141eecfd461aaab Mon Sep 17 00:00:00 2001
From: Katherine <bookaholic214@gmail.com>
Date: Thu, 12 Oct 2017 14:50:49 -0400
Subject: [PATCH 14/16] Fixed importing file issue. It works now!

---
 part2script.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/part2script.py b/part2script.py
index b647df3..6f3d765 100644
--- a/part2script.py
+++ b/part2script.py
@@ -3,7 +3,7 @@
 import plotnine
 from plotnine import *
 
-Part2=pandas.read_csv("part2datacopy.txt", sep="",", header=0)
+Part2=pandas.read_csv("part2datacopy.txt", sep=",")
 #print(Part2)
 
 #plotting data in scatterplot with trendline

From c51647a3f40c0063235765de91318ea4dd81b1df Mon Sep 17 00:00:00 2001
From: Katherine <bookaholic214@gmail.com>
Date: Thu, 12 Oct 2017 14:51:18 -0400
Subject: [PATCH 15/16] Added Michelle's Question 2 script. This file contains
 all 3 Questions

---
 exercise7 | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/exercise7 b/exercise7
index ac5f02e..1d1c33c 100755
--- a/exercise7
+++ b/exercise7
@@ -50,7 +50,18 @@ g=(ggplot(data=seqDF) +
     geom_histogram(binwidth=5))
 
 #Question 2
+import numpy
+import pandas
+import plotnine
+from plotnine import *
+
+#read in file
+Part2=pandas.read_csv("part2datacopy.txt", sep=",")
+#print(Part2)
 
+#plotting data in scatterplot with trendline
+a=ggplot(Part2,aes(x="oil changes per year",y="cost of repairs($)"))+theme_classic()+geom_point()
+a+xlab("oil changes per year")+ylab("cost of repairs($)")+stat_smooth(method="lm")
 
 #Question 3
 #load the dataset

From 6088aaa62b133a797fc36216d560d35d4dde8e10 Mon Sep 17 00:00:00 2001
From: Katherine <bookaholic214@gmail.com>
Date: Thu, 12 Oct 2017 15:25:53 -0400
Subject: [PATCH 16/16] Added print commands to Q1

---
 exercise7 | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/exercise7 b/exercise7
index 1d1c33c..f471461 100755
--- a/exercise7
+++ b/exercise7
@@ -3,7 +3,7 @@
 #load dataset
 import pandas
 InFile=open("Lecture11.fasta","r")
-InFile=close()
+#InFile=close()
 #create lists for storing information about sequences
 sequenceID=[]
 sequenceLength=[]
@@ -34,7 +34,7 @@ for Line in InFile:
 #dataframe of resulting info
 seqDF = pandas.DataFrame(list(zip(sequenceID,sequenceLength,percentGC)),columns=['sequenceID','sequenceLength','percentGC'])
 #to make infile management easier
-InFile=open("Lecture11.fasta","r")
+#InFile=open("Lecture11.fasta","r")
 InFile.close()
 
 #Histogram of sequence lengths
@@ -43,11 +43,12 @@ from plotnine import *
 p=(ggplot(data=seqDF) +
     aes(x="sequenceLength") +
     geom_histogram(binwidth=4))
-
+p
 #Histogram of Percent GC
 g=(ggplot(data=seqDF) +
     aes(x="percentGC") +
     geom_histogram(binwidth=5))
+g
 
 #Question 2
 import numpy