From e6031e0f14c2da3a20c618d7ca04e788d4881dfb Mon Sep 17 00:00:00 2001
From: Joe C <jchambe5@nd.edu>
Date: Sun, 8 Oct 2017 22:07:09 -0400
Subject: [PATCH 1/5] JC added opening the file, calculating seqLength,
 calculating GC percentage, completed first histogram, notes added

---
 Exercise7answers.py | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100755 Exercise7answers.py

diff --git a/Exercise7answers.py b/Exercise7answers.py
new file mode 100755
index 0000000..c70ab25
--- /dev/null
+++ b/Exercise7answers.py
@@ -0,0 +1,38 @@
+# open fasta file
+InFile=open("Lecture11.fasta","r")
+
+#create lists for storing information about sequences
+sequenceID=[]
+sequenceLength=[]
+percentGC=[]
+
+#loop through each line of fasta file to process sequences
+for Line in InFile:
+    # remove newline character from file line
+    Line=Line.strip()
+    # if a sequence record (FASTA header lines START with ">")
+    if Line.startswith('>'):
+        # add the sequence ID (except the ">" character) to the sequenceID list
+        sequenceID.append(Line[1:])
+    # if a non-empty sequence line (one line per sequence; blank lines would divide by zero below)
+    elif Line:
+        # get the number of characters in the sequence and convert to a float to avoid integer division
+        seqLen=float(len(Line))
+        # count the number of G's and C's, ignoring case so lowercase bases are counted too
+        seqUpper=Line.upper()
+        nG=seqUpper.count("G")
+        nC=seqUpper.count("C")
+        # append values to the lists
+        sequenceLength.append(seqLen)
+        percentGC.append((nG+nC)/seqLen*100)
+InFile.close()  # release the file handle once every line has been read
+import pandas
+# combine lists into dataframe
+seqDF = pandas.DataFrame(list(zip(sequenceID,sequenceLength,percentGC)),columns=['sequenceID','sequenceLength','percentGC'])
+
+from plotnine import *
+# histogram of sequence length
+histogram1=ggplot(seqDF,aes(x="sequenceLength"))
+histogram1+geom_histogram()+theme_classic()
+
+

From 430cf4e3d37fe3b676eb7f5d96224463b0093811 Mon Sep 17 00:00:00 2001
From: Joe C <jchambe5@nd.edu>
Date: Sun, 8 Oct 2017 22:17:49 -0400
Subject: [PATCH 2/5] JC added histogram2, changed colors and optimized bin
 size

---
 Exercise7answers.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/Exercise7answers.py b/Exercise7answers.py
index c70ab25..92bfab3 100755
--- a/Exercise7answers.py
+++ b/Exercise7answers.py
@@ -33,6 +33,12 @@
 from plotnine import *
 # histogram of sequence length
 histogram1=ggplot(seqDF,aes(x="sequenceLength"))
-histogram1+geom_histogram()+theme_classic()
+histogram1+geom_histogram(binwidth=20,fill='blue',color='black')+theme_classic()
+
+# histogram of percentGC
+histogram2=ggplot(seqDF,aes(x="percentGC"))
+histogram2+geom_histogram()+theme_classic()
+# changing colors and bins
+histogram2+geom_histogram(binwidth=15,fill='yellow',color='black')+theme_classic()
 
 

From 35a4c22bfd9e5e65cc578493e476642714a5ede0 Mon Sep 17 00:00:00 2001
From: Joe C <jchambe5@nd.edu>
Date: Thu, 12 Oct 2017 14:25:56 -0400
Subject: [PATCH 3/5] question 1 complete. no question 2. working on question 3
 part 1 with part 2 finished.JC

---
 Exercise7answers.py | 50 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/Exercise7answers.py b/Exercise7answers.py
index 92bfab3..fc3c110 100755
--- a/Exercise7answers.py
+++ b/Exercise7answers.py
@@ -1,3 +1,8 @@
+# Exercise 7
+
+
+#Question 1
+
 # open fasta file
 InFile=open("Lecture11.fasta","r")
 
@@ -41,4 +46,49 @@
 # changing colors and bins
 histogram2+geom_histogram(binwidth=15,fill='yellow',color='black')+theme_classic()
 
+#Question 2
+
+
+#Question 3
+
+# open data file
+q3north=0
+q3south=0
+q3west=0
+q3east=0
+
+
+for i in range(0,len(q3),1):
+    if q3.region[i]=="north":
+        q3north=q3north+q3.observations[i]
+    elif q3.region[i]=="south":
+        q3south=q3south+q3.observations[i]
+    elif q3.region[i]=="east":
+        q3east=q3east+q3.observations[i]
+    elif q3.region[i]=="west":
+        q3west=q3west+q3.observations[i]
+
+q3north
+q3south
+q3west
+q3east
+
+avgnorth=(sum(q3.region=="north")/len(q3north))
+sum(q3.region=="south")
+sum(q3.region=="east")
+sum(q3.region=="west")
+
+
+import pandas
+q3=pandas.read_csv("data.txt", sep=",", header=0)
+average=q3.groupby('region')['observations'].mean()
+
+regions=q3.groupby('region')['region']
+seqDF=pandas.DataFrame(list(zip(average)),columns=['regions','observations'] set='\t')
+df=average.to_frame(name=none)
+
+#scatter plot with jitter applied
+from plotnine import *
+q3sp=ggplot(q3,aes(x="region",y="observations"))
+q3sp+geom_jitter()+coord_cartesian()
 

From db9483d7f283d6443e561a204adff9e6044659bb Mon Sep 17 00:00:00 2001
From: Joe C <jchambe5@nd.edu>
Date: Thu, 12 Oct 2017 15:59:44 -0400
Subject: [PATCH 4/5] finished question 3. JC

---
 Exercise7answers.py | 43 ++++++++++++-------------------------------
 1 file changed, 12 insertions(+), 31 deletions(-)

diff --git a/Exercise7answers.py b/Exercise7answers.py
index fc3c110..8fa59fa 100755
--- a/Exercise7answers.py
+++ b/Exercise7answers.py
@@ -51,44 +51,25 @@
 
 #Question 3
 
-# open data file
-q3north=0
-q3south=0
-q3west=0
-q3east=0
-
-
-for i in range(0,len(q3),1):
-    if q3.region[i]=="north":
-        q3north=q3north+q3.observations[i]
-    elif q3.region[i]=="south":
-        q3south=q3south+q3.observations[i]
-    elif q3.region[i]=="east":
-        q3east=q3east+q3.observations[i]
-    elif q3.region[i]=="west":
-        q3west=q3west+q3.observations[i]
-
-q3north
-q3south
-q3west
-q3east
-
-avgnorth=(sum(q3.region=="north")/len(q3north))
-sum(q3.region=="south")
-sum(q3.region=="east")
-sum(q3.region=="west")
-
-
+#open file and import pandas
 import pandas
 q3=pandas.read_csv("data.txt", sep=",", header=0)
+
+#group by the region and find the mean of each region
 average=q3.groupby('region')['observations'].mean()
+#convert the grouped means to a dataframe; reset_index() turns the 'region'
+#index into an ordinary column, so each label always matches its mean and
+#the region names never have to be typed in by hand in a guessed order
+df=average.reset_index()
 
-regions=q3.groupby('region')['region']
-seqDF=pandas.DataFrame(list(zip(average)),columns=['regions','observations'] set='\t')
-df=average.to_frame(name=none)
+#bar graph of the mean per region; df is already aggregated, so plot its values as-is
+from plotnine import *
+q3bp=ggplot(df)+theme_classic()+xlab("region")+ylab("observations")
+q3bp+geom_bar(aes(x="region",y="observations"),stat="identity")
 
 #scatter plot with jitter applied
 from plotnine import *
 q3sp=ggplot(q3,aes(x="region",y="observations"))
 q3sp+geom_jitter()+coord_cartesian()
 
+

From f3ad056a999ed40db0b4aa00d672e104ec0d486b Mon Sep 17 00:00:00 2001
From: Patrick Doherty <pdoherty31@gmail.com>
Date: Thu, 12 Oct 2017 23:00:53 -0400
Subject: [PATCH 5/5] Scatter plot with trend line

---
 Ex7_02.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100755 Ex7_02.py

diff --git a/Ex7_02.py b/Ex7_02.py
new file mode 100755
index 0000000..4a711c1
--- /dev/null
+++ b/Ex7_02.py
@@ -0,0 +1,14 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+# load the data set; the Year and Percent columns are read below
+data=pd.read_csv("dataset.csv")
+plt.title('Percent of Planted Corn in USA that is GMO')
+plt.ylabel('Percent')
+plt.xlabel('Year')
+plt.xlim([2000,2018])
+plt.scatter(data.Year, data.Percent)
+# degree-1 least-squares fit of Percent against Year, drawn as a red dashed trend line
+trend = np.poly1d(np.polyfit(data.Year, data.Percent, 1))
+plt.plot(data.Year, trend(data.Year), "r--")
+plt.show()  # without this nothing is displayed when the file is run as a script