From 66c1b135dc2cf8ec4ff7386f12d19b8356bf57c4 Mon Sep 17 00:00:00 2001
From: Tim Burton <tburton@nd.edu>
Date: Thu, 12 Oct 2017 20:31:27 -0400
Subject: [PATCH 1/2] completed exercise 7.1 and 7.2, added cars.csv data for
 ex 7.2

---
 Ex7.py   | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 cars.csv | 51 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 114 insertions(+)
 create mode 100644 Ex7.py
 create mode 100644 cars.csv

diff --git a/Ex7.py b/Ex7.py
new file mode 100644
index 0000000..f7532de
--- /dev/null
+++ b/Ex7.py
@@ -0,0 +1,63 @@
+# NOTE: run from the folder that holds Lecture11.fasta and cars.csv; the former "cd Desktop/Intro_Biocom_ND_319_Tutorial7/" line is not Python (SyntaxError when run as a script)
+
+import pandas as pd
+
+#create lists for storing information about sequences
+sequenceID=[]
+sequenceLength=[]
+percentGC=[]
+meltingTemp=[]
+
+#loop through each line of fasta file to process sequences
+# NOTE(review): assumes every sequence sits on one line after its ">" header - confirm for Lecture11.fasta
+# the with-block closes the file even if processing a line raises
+with open("Lecture11.fasta","r") as InFile:
+    for Line in InFile:
+        # remove newline character from file line
+        Line=Line.strip()
+        # skip blank lines; a zero-length sequence would divide by zero in the GC calculation
+        if not Line:
+            continue
+        # if a sequence record (the header line starts with ">")
+        if Line.startswith('>'):
+            # add the sequence ID (except the ">" character) to the sequenceID list
+            sequenceID.append(Line[1:])
+        # if a sequence line
+        else:
+            # get the number of characters in the sequence and convert to a float to avoid integer division
+            seqLen=float(len(Line))
+            # count the number of G's and C's
+            nG=Line.count("G")
+            nC=Line.count("C")
+
+            # if the sequence is 14 or fewer bases calculate melting temperature
+            if seqLen<=14:
+                Tm=2*(nG+nC)+2*seqLen
+            else:
+                # -9999 is the placeholder stored when no melting temperature is calculated
+                Tm=-9999
+
+            # append values to the lists
+            sequenceLength.append(seqLen)
+            percentGC.append((nG+nC)/seqLen*100)
+            meltingTemp.append(Tm)
+
+# combine lists into dataframe (pandas is imported as pd, so the bare name "pandas" is undefined)
+seqDF = pd.DataFrame(list(zip(sequenceID,sequenceLength,percentGC,meltingTemp)),columns=['sequenceID','sequenceLength','percentGC','meltingTemp'])
+
+import plotnine
+
+#ex 1.1, histogram of sequence length
+plot1=plotnine.ggplot(seqDF, plotnine.aes('sequenceLength'))
+plot1 + plotnine.geom_histogram(binwidth=5)
+
+#ex 1.2, histogram of GC content
+plot2=plotnine.ggplot(seqDF, plotnine.aes('percentGC'))
+plot2 + plotnine.geom_histogram(binwidth=5)
+
+#ex 2, data of speed and stopping distance of cars.csv
+cars=pd.read_csv("cars.csv")
+
+plot3=plotnine.ggplot(cars, plotnine.aes('speed','dist'))
+plot3 + plotnine.geom_point() + plotnine.geom_smooth()
+
diff --git a/cars.csv b/cars.csv
new file mode 100644
index 0000000..34b3d46
--- /dev/null
+++ b/cars.csv
@@ -0,0 +1,51 @@
+"","speed","dist"
+"1",4,2
+"2",4,10
+"3",7,4
+"4",7,22
+"5",8,16
+"6",9,10
+"7",10,18
+"8",10,26
+"9",10,34
+"10",11,17
+"11",11,28
+"12",12,14
+"13",12,20
+"14",12,24
+"15",12,28
+"16",13,26
+"17",13,34
+"18",13,34
+"19",13,46
+"20",14,26
+"21",14,36
+"22",14,60
+"23",14,80
+"24",15,20
+"25",15,26
+"26",15,54
+"27",16,32
+"28",16,40
+"29",17,32
+"30",17,40
+"31",17,50
+"32",18,42
+"33",18,56
+"34",18,76
+"35",18,84
+"36",19,36
+"37",19,46
+"38",19,68
+"39",20,32
+"40",20,48
+"41",20,52
+"42",20,56
+"43",20,64
+"44",22,66
+"45",23,54
+"46",24,70
+"47",24,92
+"48",24,93
+"49",24,120
+"50",25,85

From aae6d7dc5afedf16ace8d80021060909f73873ef Mon Sep 17 00:00:00 2001
From: Tim Burton <tburton@nd.edu>
Date: Fri, 13 Oct 2017 09:39:14 -0400
Subject: [PATCH 2/2] finished exercise 7.3

---
 Ex7.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/Ex7.py b/Ex7.py
index f7532de..bae1bce 100644
--- a/Ex7.py
+++ b/Ex7.py
@@ -61,3 +61,16 @@
 plot3=plotnine.ggplot(cars, plotnine.aes('speed','dist'))
 plot3 + plotnine.geom_point() + plotnine.geom_smooth()
 
+
+#ex 3: load the comma-separated table of observations
+data=pd.read_csv("data.txt", sep=",")
+
+#ex3 plot 1: bar plot of the mean observation per region
+plot4=plotnine.ggplot(data=data, mapping=plotnine.aes(x='region', y='observations'))
+plot4 + plotnine.geom_bar(stat='summary')
+
+#ex3 plot 2: jittered scatter plot of the individual observations per region
+plot5=plotnine.ggplot(data=data, mapping=plotnine.aes(x='region', y='observations'))
+plot5 + plotnine.geom_jitter()
+
+#The bar plot of the means suggests that the groups are all very similar, because the means are all very close. However, the scatter plot shows that the distribution of the data within each group is very different.