lyy005 · bruzecruise · Oct 6, 2017 · Oct 6, 2017 · Oct 10, 2017 · Oct 10, 2017
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
diff --git a/exercise 7.py b/exercise 7.py
@@ -0,0 +1,71 @@
+#exercise 7#
+#Dan Bruzzese and Zoe Loh
+
+
+
+
+
+# question 1
+import pandas
+from plotnine import *
+File=open("Lecture11.fasta","r")
+plotData = pandas.DataFrame(columns = ["Sequence Length" , "GC content"])
+
+for line in File:
+    line = line.strip()
+    if ">" in line:
+        continue
+    else:
+        #First the length of the sequence and the percent gc count is calculated
+        Length = (len(line)-1)
+        #Because it is integer division we must force python do divide as if it was real numbers by using float()
+        GCcount = (float((line.count("G"))+line.count("C"))/len(line))
+        #The values are inserted into a dataframe for plotting
+        row = pandas.DataFrame({"Sequence Length": Length, "GC content": GCcount}, index=[0])
+        plotData = plotData.append(row)
+#GC histogram plot
+a=ggplot(plotData,aes(x="GC content"))
+aa= a+geom_histogram()+theme_classic()
+print aa
+
+#sequence length histogram plot
+b=ggplot(plotData,aes(x="Sequence Length"))
+bb=b+geom_histogram()+theme_classic()
+print bb
+
+#question2
+
+import pandas
+from plotnine import *
+data=pandas.read_csv("heartrate.txt",sep=",",header=0)
+
+#Here I make the scatter plot showing how running speed and heart rate are related
+plot=ggplot(data,aes(x="Heart rate",y="Running speed"))
+p=plot+geom_point()+coord_cartesian()+stat_smooth(method="lm")
+print p
+
+#########question 3################
+from plotnine import *
+import pandas as pd
+dat = pd.read_csv("data.txt")
+
+#barplot  for mean observations in a region
+grouped= dat.groupby(["region"]).mean().reset_index() #mean observations by region
+print grouped
+grouped.columns = ['region', 'mean_observations']
+p= (ggplot(data=grouped)
+    + aes(x='region', y= 'mean_observations',fill= 'region')
+    + geom_bar(stat = "identity")
+    + theme_classic()
+    )
+print p
+
+#scatterplot
+d= (ggplot(data=dat)
+    + aes(y='observations', x='region', fill= 'region')
+    + geom_point(alpha= .1)
+    + theme_classic()
+    )
+print d
+ # why= the bar chart shows us the mean of observations from each region
+#while the scatter plot shows us the value of all  observations from each region
diff --git a/heartrate.txt b/heartrate.txt
@@ -0,0 +1,22 @@
+"Heart rate","Running speed"
+80,0
+85,2
+87,3
+90,3
+94,3
+97,4
+102,4
+60,-5
+110,5
+117,6
+120,6
+124,7
+130,7
+138,8
+143,8
+150,9
+157,10
+160,11
+165,12
+170,12.5
+185,14