Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions Exercise07-3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#Part 3
import pandas
import numpy
from plotnine import *

# Load data: the plots below use the "region" and "observations" columns.
data = pandas.read_csv("data.txt")

# Barplot for the population means: stat="summary" with fun_y=numpy.mean
# draws one bar per region whose height is the mean of "observations".
means = (
    ggplot(data)
    + theme_classic()
    + xlab("Populations")
    + ylab("Mean Number of Observations")
)
# NOTE: the expression must start with the plot object itself. A line that
# begins with "+" is parsed as a unary plus on `means`, not as a
# continuation of the assignment above.
(
    means
    + geom_bar(
        aes(x="factor(region)", y="observations", fill="region"),
        stat="summary",
        fun_y=numpy.mean,
    )
    + ggtitle("Population Means")
)

# Barplot means calculated for check (not necessary).
# Printed so the check is visible when the file is run as a script, where a
# bare expression's value would be thrown away.
print(data.groupby(["region"])["observations"].mean())
# means are slightly different

# Scatterplot for the observations: every row is drawn (jittered so that
# overlapping points stay visible), coloured by region.
scatter = ggplot(data, aes("observations", "region"))
(
    scatter
    + geom_jitter(aes(color="factor(region)"))
    + theme_classic()
    + ggtitle("All Observations")
)
#could have used scatter+geom_jitter()+coord_cartesian() instead
#Scatterplot shows that although the average observations seem to be similar across the regions, the observation distributions are different.

#Why?
#Barplot shows the means of the regional observations, whilst the scatterplot shows the values of all of the observations from each region.
59 changes: 59 additions & 0 deletions Exercise07-Full.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#Part 1
import numpy
import pandas
from plotnine import *

# Load data: collect, for every sequence in the FASTA file, its ID, its
# length and its percent G+C.
sequenceID = []
sequenceLength = []
percentGC = []

# Determine G:C content.
# "with" closes the file even if parsing stops early with an error.
with open("Lecture11.fasta", "r") as Ex7:
    for line in Ex7:
        line = line.strip()
        if not line:
            # A blank line holds no sequence; skipping it also avoids a
            # division by zero in the percentage below.
            continue
        if line.startswith(">"):
            # Header line: keep everything after the ">" marker as the ID.
            sequenceID.append(line[1:])
        else:
            # NOTE(review): every non-header line is treated as one complete
            # sequence - assumes the file does not wrap sequences over
            # several lines; confirm against Lecture11.fasta.
            seqLen = float(len(line))
            G = line.count("G")
            C = line.count("C")
            sequenceLength.append(seqLen)
            percentGC.append((G + C) / seqLen * 100)

# Generate histogram of G:C content.
# ggplot needs a DataFrame and a column that exists in it, so the parsed
# lists are wrapped in one (both lists are appended together above, so they
# always have the same length).
gcData = pandas.DataFrame(
    {"sequenceLength": sequenceLength, "percentGC": percentGC}
)
b = ggplot(gcData, aes(x="percentGC"))
# theme_classic must be called: the plot takes a theme instance, not the
# class object.
b + geom_histogram() + theme_classic()

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good job

#Part 2
import pandas
from plotnine import *
# Read the tab-separated table of win totals; row 0 holds the column names.
football = pandas.read_csv(
    "NFL-graph-for-class.txt",
    sep='\t',
    header=0,
)
football.shape

# Scatter of the two running win totals with a fitted line (method='lm')
# drawn on top of the points.
win_totals = aes('Pats-Win-Tot', 'Lions-Win-Tot')
(
    ggplot(football)
    + win_totals
    + geom_point()
    + geom_smooth(method='lm')
)

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good job

#Part 3
import pandas
import numpy
from plotnine import *

# Load data: the plots below use the "region" and "observations" columns.
data = pandas.read_csv("data.txt")

# Barplot for the population means: stat="summary" with fun_y=numpy.mean
# draws one bar per region whose height is the mean of "observations".
means = (
    ggplot(data)
    + theme_classic()
    + xlab("Populations")
    + ylab("Mean Number of Observations")
)
# NOTE: the expression must start with the plot object itself. A line that
# begins with "+" is parsed as a unary plus on `means`, not as a
# continuation of the assignment above.
(
    means
    + geom_bar(
        aes(x="factor(region)", y="observations", fill="region"),
        stat="summary",
        fun_y=numpy.mean,
    )
    + ggtitle("Population Means")
)

# Barplot means calculated for check (not necessary).
# Printed so the check is visible when the file is run as a script, where a
# bare expression's value would be thrown away.
print(data.groupby(["region"])["observations"].mean())
# means are slightly different

# Scatterplot for the observations: every row is drawn (jittered so that
# overlapping points stay visible), coloured by region.
scatter = ggplot(data, aes("observations", "region"))
(
    scatter
    + geom_jitter(aes(color="factor(region)"))
    + theme_classic()
    + ggtitle("All Observations")
)
#could have used scatter+geom_jitter()+coord_cartesian() instead
#Scatterplot shows that although the average observations seem to be similar across the regions, the observation distributions are different.

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good job

#Why?
#Barplot shows the means of the regional observations, whilst the scatterplot shows the values of all of the observations from each region.
26 changes: 26 additions & 0 deletions Exercise7-1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import numpy
import pandas
from plotnine import *

# Load data: collect, for every sequence in the FASTA file, its ID, its
# length and its percent G+C.
sequenceID = []
sequenceLength = []
percentGC = []

# Determine G:C content.
# "with" closes the file even if parsing stops early with an error.
with open("Lecture11.fasta", "r") as Ex7:
    for line in Ex7:
        line = line.strip()
        if not line:
            # A blank line holds no sequence; skipping it also avoids a
            # division by zero in the percentage below.
            continue
        if line.startswith(">"):
            # Header line: keep everything after the ">" marker as the ID.
            sequenceID.append(line[1:])
        else:
            # NOTE(review): every non-header line is treated as one complete
            # sequence - assumes the file does not wrap sequences over
            # several lines; confirm against Lecture11.fasta.
            seqLen = float(len(line))
            G = line.count("G")
            C = line.count("C")
            sequenceLength.append(seqLen)
            percentGC.append((G + C) / seqLen * 100)

# Generate histogram of G:C content.
# ggplot needs a DataFrame and a column that exists in it, so the parsed
# lists are wrapped in one (both lists are appended together above, so they
# always have the same length).
gcData = pandas.DataFrame(
    {"sequenceLength": sequenceLength, "percentGC": percentGC}
)
b = ggplot(gcData, aes(x="percentGC"))
# theme_classic must be called: the plot takes a theme instance, not the
# class object.
b + geom_histogram() + theme_classic()
5 changes: 5 additions & 0 deletions Exercise7-2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
import pandas
from plotnine import *
# Read the tab-separated table of win totals; row 0 holds the column names.
football = pandas.read_csv(
    "NFL-graph-for-class.txt",
    sep='\t',
    header=0,
)
football.shape

# Scatter of the two running win totals with a fitted line (method='lm')
# drawn on top of the points.
win_totals = aes('Pats-Win-Tot', 'Lions-Win-Tot')
(
    ggplot(football)
    + win_totals
    + geom_point()
    + geom_smooth(method='lm')
)
58 changes: 58 additions & 0 deletions NFL-graph-for-class.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
Year,Pats-Wins,Pats-Win-Tot,Lions-Wins,Lions-Win-Tot
1960,5,5,7,7
1961,9,14,8,15
1962,9,23,11,26
1963,7,30,5,31
1964,10,40,7,38
1965,4,44,6,44
1966,8,52,4,48
1967,3,55,5,53
1968,4,59,4,57
1969,4,63,9,66
1970,2,65,10,76
1971,6,71,7,83
1972,3,74,8,91
1973,5,79,6,97
1974,7,86,7,104
1975,3,89,7,111
1976,11,100,6,117
1977,9,109,6,123
1978,11,120,7,130
1979,9,129,2,132
1980,10,139,9,141
1981,2,141,8,149
1982,5,146,4,153
1983,8,154,9,162
1984,9,163,4,166
1985,11,174,7,173
1986,11,185,5,178
1987,8,193,4,182
1988,9,202,4,186
1989,5,207,7,193
1990,1,208,6,199
1991,6,214,12,211
1992,2,216,5,216
1993,5,221,10,226
1994,10,231,9,235
1995,6,237,10,245
1996,11,248,5,250
1997,10,258,9,259
1998,9,267,5,264
1999,8,275,8,272
2000,5,280,9,281
2001,11,291,2,283
2002,9,300,3,286
2003,14,314,5,291
2004,14,328,6,297
2005,10,338,5,302
2006,12,350,3,305
2007,16,366,7,312
2008,11,377,0,312
2009,10,387,2,314
2010,14,401,6,320
2011,13,414,10,330
2012,12,426,4,334
2013,12,438,7,341
2014,12,450,11,352
2015,12,462,7,359
2016,14,476,9,368
58 changes: 58 additions & 0 deletions NFL-graph-for-class.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
Year Pats-Wins Pats-Win-Tot Lions-Wins Lions-Win-Tot
1960 5 5 7 7
1961 9 14 8 15
1962 9 23 11 26
1963 7 30 5 31
1964 10 40 7 38
1965 4 44 6 44
1966 8 52 4 48
1967 3 55 5 53
1968 4 59 4 57
1969 4 63 9 66
1970 2 65 10 76
1971 6 71 7 83
1972 3 74 8 91
1973 5 79 6 97
1974 7 86 7 104
1975 3 89 7 111
1976 11 100 6 117
1977 9 109 6 123
1978 11 120 7 130
1979 9 129 2 132
1980 10 139 9 141
1981 2 141 8 149
1982 5 146 4 153
1983 8 154 9 162
1984 9 163 4 166
1985 11 174 7 173
1986 11 185 5 178
1987 8 193 4 182
1988 9 202 4 186
1989 5 207 7 193
1990 1 208 6 199
1991 6 214 12 211
1992 2 216 5 216
1993 5 221 10 226
1994 10 231 9 235
1995 6 237 10 245
1996 11 248 5 250
1997 10 258 9 259
1998 9 267 5 264
1999 8 275 8 272
2000 5 280 9 281
2001 11 291 2 283
2002 9 300 3 286
2003 14 314 5 291
2004 14 328 6 297
2005 10 338 5 302
2006 12 350 3 305
2007 16 366 7 312
2008 11 377 0 312
2009 10 387 2 314
2010 14 401 6 320
2011 13 414 10 330
2012 12 426 4 334
2013 12 438 7 341
2014 12 450 11 352
2015 12 462 7 359
2016 14 476 9 368