"""Exercise 7 -- Dan Bruzzese and Zoe Loh.

Three small plotting exercises built on pandas and plotnine:

1. Histograms of sequence length and GC content for the records in
   ``Lecture11.fasta``.
2. Scatter plot with a linear fit of "Running speed" against "Heart rate"
   read from ``heartrate.txt``.
3. Bar chart of mean ``observations`` per ``region`` and a scatter plot of
   every observation per region, read from ``data.txt``.

The numeric work lives in small helper functions so it can be checked
without plotnine or the input files being present; plotnine is imported
lazily inside the functions that actually draw.
"""

import pandas


# ----------------------------------------------------------------------
# Question 1
# ----------------------------------------------------------------------

def read_fasta_sequences(lines):
    """Yield one sequence string per FASTA record found in *lines*.

    *lines* is any iterable of text lines (an open file works).  Header
    lines are those starting with ``>``; blank lines are ignored.  Sequence
    lines belonging to the same record are joined, so a wrapped record is
    reported once rather than once per physical line.  Records without any
    sequence data are skipped.
    """
    chunks = []
    for raw in lines:
        line = raw.strip()
        if not line:
            # Blank lines carry no sequence and would otherwise lead to a
            # division by zero when computing the GC fraction.
            continue
        if line.startswith(">"):
            if chunks:
                yield "".join(chunks)
                chunks = []
            continue
        chunks.append(line)
    if chunks:
        yield "".join(chunks)


def gc_fraction(sequence):
    """Return the fraction of G/C characters in *sequence* (0.0 if empty).

    Counting is case-insensitive so lower-case bases are included.
    """
    if not sequence:
        return 0.0
    upper = sequence.upper()
    gc_count = upper.count("G") + upper.count("C")
    # float() forces true division even where "/" is integer division.
    return float(gc_count) / len(upper)


def sequence_stats(sequences):
    """Build the plotting table for an iterable of sequence strings.

    Returns a DataFrame with the columns "Sequence Length" and
    "GC content", one row per sequence.  The length is the full length of
    the (already newline-stripped) sequence.
    """
    records = [
        {"Sequence Length": len(seq), "GC content": gc_fraction(seq)}
        for seq in sequences
    ]
    # Building the frame once avoids growing it row by row inside a loop.
    return pandas.DataFrame(records,
                            columns=["Sequence Length", "GC content"])


def question_1(fasta_path="Lecture11.fasta"):
    """Plot histograms of GC content and sequence length for *fasta_path*."""
    from plotnine import aes, geom_histogram, ggplot, theme_classic

    # The context manager closes the file even if parsing fails.
    with open(fasta_path, "r") as handle:
        plot_data = sequence_stats(read_fasta_sequences(handle))

    # GC histogram first, then the sequence length histogram.
    for column in ("GC content", "Sequence Length"):
        histogram = (ggplot(plot_data, aes(x=column))
                     + geom_histogram()
                     + theme_classic())
        # Printing the plot object is how this script renders it.
        print(histogram)


# ----------------------------------------------------------------------
# Question 2
# ----------------------------------------------------------------------

def question_2(path="heartrate.txt"):
    """Scatter plot of running speed against heart rate with a linear fit."""
    from plotnine import aes, coord_cartesian, geom_point, ggplot, stat_smooth

    data = pandas.read_csv(path, sep=",", header=0)
    scatter = (ggplot(data, aes(x="Heart rate", y="Running speed"))
               + geom_point()
               + coord_cartesian()
               + stat_smooth(method="lm"))
    print(scatter)


# ----------------------------------------------------------------------
# Question 3
# ----------------------------------------------------------------------

def mean_observations_by_region(dat):
    """Return the mean of ``observations`` for each ``region`` in *dat*.

    The result has exactly two columns, "region" and "mean_observations".
    Selecting the column by name keeps this working when *dat* carries
    additional (possibly non-numeric) columns.
    """
    means = dat.groupby("region", as_index=False)["observations"].mean()
    return means.rename(columns={"observations": "mean_observations"})


def question_3(path="data.txt"):
    """Bar chart of regional means and scatter plot of all observations."""
    from plotnine import aes, geom_bar, geom_point, ggplot, theme_classic

    dat = pandas.read_csv(path)

    # Bar plot of the mean observation in each region.
    grouped = mean_observations_by_region(dat)
    print(grouped)
    bars = (ggplot(data=grouped)
            + aes(x="region", y="mean_observations", fill="region")
            + geom_bar(stat="identity")
            + theme_classic())
    print(bars)

    # Scatter plot of every individual observation.
    points = (ggplot(data=dat)
              + aes(y="observations", x="region", fill="region")
              + geom_point(alpha=.1)
              + theme_classic())
    print(points)
    # Why both: the bar chart shows the mean of the observations from each
    # region, while the scatter plot shows the value of every observation
    # from each region.


def main():
    """Run the three exercises in order."""
    question_1()
    question_2()
    question_3()


if __name__ == "__main__":
    main()


# ----------------------------------------------------------------------
# Reference: contents of heartrate.txt as added alongside this script.
# ----------------------------------------------------------------------
# "Heart rate","Running speed"
# 80,0
# 85,2
# 87,3
# 90,3
# 94,3
# 97,4
# 102,4
# 60,-5
# 110,5
# 117,6
# 120,6
# 124,7
# 130,7
# 138,8
# 143,8
# 150,9
# 157,10
# 160,11
# 165,12
# 170,12.5
# 185,14