Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

71 changes: 71 additions & 0 deletions exercise 7.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#exercise 7#
#Dan Bruzzese and Zoe Loh





# question 1
import pandas
from plotnine import *
def sequence_stats(lines):
    """Return a (length, GC fraction) tuple for every sequence line.

    lines: an iterable of text lines from a FASTA file. Lines starting
    with ">" are treated as headers and skipped, as are blank lines.
    Each remaining line is treated as one complete sequence.

    Returns a list of (sequence_length, gc_fraction) tuples, in file order.
    """
    stats = []
    for raw in lines:
        seq = raw.strip()
        # Skip headers; also skip blank lines, which would otherwise
        # divide by zero below.
        if not seq or seq.startswith(">"):
            continue
        # strip() already removed the trailing newline, so len(seq) is the
        # true sequence length (no further "- 1" correction is needed).
        gc_total = seq.count("G") + seq.count("C")
        # float() keeps the division from truncating under Python 2.
        stats.append((len(seq), float(gc_total) / len(seq)))
    return stats


if __name__ == "__main__":
    # The with-block closes the FASTA file even if parsing fails.
    with open("Lecture11.fasta", "r") as fasta:
        # Build the frame once from all rows instead of appending row by row.
        plotData = pandas.DataFrame(
            sequence_stats(fasta),
            columns=["Sequence Length", "GC content"],
        )

    # GC histogram plot
    a = ggplot(plotData, aes(x="GC content"))
    aa = a + geom_histogram() + theme_classic()
    print(aa)

    # sequence length histogram plot
    b = ggplot(plotData, aes(x="Sequence Length"))
    bb = b + geom_histogram() + theme_classic()
    print(bb)

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good job

#question2

import pandas
from plotnine import *
# Load the heart-rate measurements; header=0 takes the column names
# ("Heart rate", "Running speed") from the first row of the file.
data = pandas.read_csv("heartrate.txt", sep=",", header=0)

# Scatter plot of running speed against heart rate, with a fitted
# linear-model ("lm") trend line drawn over the points.
plot = ggplot(data, aes(x="Heart rate", y="Running speed"))
p = plot + geom_point() + coord_cartesian() + stat_smooth(method="lm")
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print(p)

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good job

#########question 3################
from plotnine import *
import pandas as pd
def mean_observations_by_region(frame):
    """Return the mean of the "observations" column for each region.

    frame: a DataFrame with at least the columns "region" and
    "observations".

    Returns a DataFrame with one row per region and exactly two columns,
    "region" and "mean_observations".
    """
    # Select the column by name before averaging so that any additional
    # columns in the input cannot break or mislabel the result.
    means = frame.groupby("region")["observations"].mean().reset_index()
    return means.rename(columns={"observations": "mean_observations"})


if __name__ == "__main__":
    dat = pd.read_csv("data.txt")

    # barplot for mean observations in a region
    grouped = mean_observations_by_region(dat)
    print(grouped)
    p = (ggplot(data=grouped)
         + aes(x='region', y='mean_observations', fill='region')
         + geom_bar(stat="identity")
         + theme_classic()
         )
    print(p)

    # scatterplot of every individual observation, grouped by region
    # NOTE(review): points in a region share one x position, so they overlap
    # heavily even at alpha=.1; geom_jitter may be worth considering.
    d = (ggplot(data=dat)
         + aes(y='observations', x='region', fill='region')
         + geom_point(alpha=.1)
         + theme_classic()
         )
    print(d)

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good job. jitter plot?

# Why: the bar chart shows only the mean of the observations in each region,
# while the scatter plot shows every individual observation in each region.
22 changes: 22 additions & 0 deletions heartrate.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"Heart rate","Running speed"
80,0
85,2
87,3
90,3
94,3
97,4
102,4
60,-5
110,5
117,6
120,6
124,7
130,7
138,8
143,8
150,9
157,10
160,11
165,12
170,12.5
185,14