diff --git a/Exercise07-3.py b/Exercise07-3.py
new file mode 100644
index 0000000..81c63eb
--- /dev/null
+++ b/Exercise07-3.py
@@ -0,0 +1,25 @@
+#Part 3
+import pandas
+import numpy
+from plotnine import *
+
+#load data
+data=pandas.read_csv("data.txt")
+
+#Barplot for the population means
+#NOTE(review): the plots below are bare expressions, so they presumably only render in an interactive session - confirm how this script is run
+means=ggplot(data)+theme_classic()+xlab("Populations")+ylab("Mean Number of Observations")
+means+geom_bar(aes(x="factor(region)",y="observations",fill="region"),stat="summary",fun_y=numpy.mean)+ggtitle("Population Means")
+
+#Numeric check of the bar heights (not necessary); printed so the values are visible when run as a script
+print(data.groupby(['region'])['observations'].mean())
+#means are slightly different
+
+#Scatterplot for the observations
+scatter=ggplot(data,aes('observations','region'))
+scatter+geom_jitter(aes(color='factor(region)'))+theme_classic()+ggtitle('All Observations')
+#could have used scatter+geom_jitter()+coord_cartesian() instead
+#Scatterplot shows that although the average observations seem to be similar across the regions, the observation distributions are different.
+
+#Why?
+#Barplot shows only the mean of the observations for each region, whilst the scatterplot shows every individual observation from each region.
diff --git a/Exercise07-Full.py b/Exercise07-Full.py
new file mode 100644
index 0000000..a5cda86
--- /dev/null
+++ b/Exercise07-Full.py
@@ -0,0 +1,70 @@
+#Part 1
+import numpy
+import pandas
+from plotnine import *
+
+#per-sequence results
+sequenceID=[]
+sequenceLength=[]
+percentGC=[]
+
+#load data and determine G:C content
+#the with-block closes the FASTA file once every line has been read
+with open("Lecture11.fasta", "r") as Ex7:
+    for line in Ex7:
+        line=line.strip()
+        if not line:
+            #skip blank lines: a zero-length sequence would divide by zero below
+            continue
+        if line.startswith('>'):
+            #header line: store the ID without the leading '>'
+            sequenceID.append(line[1:])
+        else:
+            #NOTE(review): treats every non-header line as one whole sequence - confirm the FASTA is not line-wrapped
+            seqLen=float(len(line))
+            G=line.count("G")
+            C=line.count("C")
+            sequenceLength.append(seqLen)
+            percentGC.append((G+C)/seqLen*100)
+
+#Generate histogram of G:C content
+#ggplot needs a DataFrame (not the last line read), so wrap the percentages in one
+gcContent=pandas.DataFrame({"percentGC":percentGC})
+b=ggplot(gcContent,aes(x="percentGC"))
+b+geom_histogram()+theme_classic()
+
+#Part 2
+import pandas
+from plotnine import *
+#load the tab-separated table; the first row holds the column names
+football=pandas.read_csv("NFL-graph-for-class.txt", sep='\t',header=0)
+#print the (rows, columns) shape as a quick check that the file parsed
+print(football.shape)
+#scatter of Pats-Win-Tot against Lions-Win-Tot with a linear-model (method='lm') smooth
+(ggplot(football) + aes('Pats-Win-Tot','Lions-Win-Tot') + geom_point() + geom_smooth(method='lm'))
+
+#Part 3
+import pandas
+import numpy
+from plotnine import *
+
+#load data
+data=pandas.read_csv("data.txt")
+
+#Barplot for the population means
+#NOTE(review): the plots below are bare expressions, so they presumably only render in an interactive session - confirm how this script is run
+means=ggplot(data)+theme_classic()+xlab("Populations")+ylab("Mean Number of Observations")
+means+geom_bar(aes(x="factor(region)",y="observations",fill="region"),stat="summary",fun_y=numpy.mean)+ggtitle("Population Means")
+
+#Numeric check of the bar heights (not necessary); printed so the values are visible when run as a script
+print(data.groupby(['region'])['observations'].mean())
+#means are slightly different
+
+#Scatterplot for the observations
+scatter=ggplot(data,aes('observations','region'))
+scatter+geom_jitter(aes(color='factor(region)'))+theme_classic()+ggtitle('All Observations')
+#could have used scatter+geom_jitter()+coord_cartesian() instead
+#Scatterplot shows that although the average observations seem to be similar across the regions, the observation distributions are different.
+
+#Why?
+#Barplot shows only the mean of the observations for each region, whilst the scatterplot shows every individual observation from each region.
diff --git a/Exercise7-1.py b/Exercise7-1.py
new file mode 100755
index 0000000..e2bcdeb
--- /dev/null
+++ b/Exercise7-1.py
@@ -0,0 +1,33 @@
+import numpy
+import pandas
+from plotnine import *
+
+#per-sequence results
+sequenceID=[]
+sequenceLength=[]
+percentGC=[]
+
+#load data and determine G:C content
+#the with-block closes the FASTA file once every line has been read
+with open("Lecture11.fasta", "r") as Ex7:
+    for line in Ex7:
+        line=line.strip()
+        if not line:
+            #skip blank lines: a zero-length sequence would divide by zero below
+            continue
+        if line.startswith('>'):
+            #header line: store the ID without the leading '>'
+            sequenceID.append(line[1:])
+        else:
+            #NOTE(review): treats every non-header line as one whole sequence - confirm the FASTA is not line-wrapped
+            seqLen=float(len(line))
+            G=line.count("G")
+            C=line.count("C")
+            sequenceLength.append(seqLen)
+            percentGC.append((G+C)/seqLen*100)
+
+#Generate histogram of G:C content
+#ggplot needs a DataFrame (not the last line read), so wrap the percentages in one
+gcContent=pandas.DataFrame({"percentGC":percentGC})
+b=ggplot(gcContent,aes(x="percentGC"))
+b+geom_histogram()+theme_classic()
diff --git a/Exercise7-2.py b/Exercise7-2.py
new file mode 100755
index 0000000..f7332a1
--- /dev/null
+++ b/Exercise7-2.py
@@ -0,0 +1,8 @@
+import pandas
+from plotnine import *
+#load the tab-separated table; the first row holds the column names
+football=pandas.read_csv("NFL-graph-for-class.txt", sep='\t',header=0)
+#print the (rows, columns) shape as a quick check that the file parsed
+print(football.shape)
+#scatter of Pats-Win-Tot against Lions-Win-Tot with a linear-model (method='lm') smooth
+(ggplot(football) + aes('Pats-Win-Tot','Lions-Win-Tot') + geom_point() + geom_smooth(method='lm'))
\ No newline at end of file
diff --git a/NFL-graph-for-class.csv b/NFL-graph-for-class.csv new file mode 100755 index 0000000..4028765 --- /dev/null +++ b/NFL-graph-for-class.csv @@ -0,0 +1,58 @@ +Year,Pats-Wins,Pats-Win-Tot,Lions-Wins,Lions-Win-Tot +1960,5,5,7,7 +1961,9,14,8,15 +1962,9,23,11,26 +1963,7,30,5,31 +1964,10,40,7,38 +1965,4,44,6,44 +1966,8,52,4,48 +1967,3,55,5,53 +1968,4,59,4,57 +1969,4,63,9,66 +1970,2,65,10,76 +1971,6,71,7,83 +1972,3,74,8,91 +1973,5,79,6,97 +1974,7,86,7,104 +1975,3,89,7,111 +1976,11,100,6,117 +1977,9,109,6,123 +1978,11,120,7,130 +1979,9,129,2,132 +1980,10,139,9,141 +1981,2,141,8,149 +1982,5,146,4,153 +1983,8,154,9,162 +1984,9,163,4,166 +1985,11,174,7,173 +1986,11,185,5,178 +1987,8,193,4,182 +1988,9,202,4,186 +1989,5,207,7,193 +1990,1,208,6,199 +1991,6,214,12,211 +1992,2,216,5,216 +1993,5,221,10,226 +1994,10,231,9,235 +1995,6,237,10,245 +1996,11,248,5,250 +1997,10,258,9,259 +1998,9,267,5,264 +1999,8,275,8,272 +2000,5,280,9,281 
+2001,11,291,2,283 +2002,9,300,3,286 +2003,14,314,5,291 +2004,14,328,6,297 +2005,10,338,5,302 +2006,12,350,3,305 +2007,16,366,7,312 +2008,11,377,0,312 +2009,10,387,2,314 +2010,14,401,6,320 +2011,13,414,10,330 +2012,12,426,4,334 +2013,12,438,7,341 +2014,12,450,11,352 +2015,12,462,7,359 +2016,14,476,9,368 diff --git a/NFL-graph-for-class.txt b/NFL-graph-for-class.txt new file mode 100755 index 0000000..0a8cad1 --- /dev/null +++ b/NFL-graph-for-class.txt @@ -0,0 +1,58 @@ +Year Pats-Wins Pats-Win-Tot Lions-Wins Lions-Win-Tot +1960 5 5 7 7 +1961 9 14 8 15 +1962 9 23 11 26 +1963 7 30 5 31 +1964 10 40 7 38 +1965 4 44 6 44 +1966 8 52 4 48 +1967 3 55 5 53 +1968 4 59 4 57 +1969 4 63 9 66 +1970 2 65 10 76 +1971 6 71 7 83 +1972 3 74 8 91 +1973 5 79 6 97 +1974 7 86 7 104 +1975 3 89 7 111 +1976 11 100 6 117 +1977 9 109 6 123 +1978 11 120 7 130 +1979 9 129 2 132 +1980 10 139 9 141 +1981 2 141 8 149 +1982 5 146 4 153 +1983 8 154 9 162 +1984 9 163 4 166 +1985 11 174 7 173 +1986 11 185 5 178 +1987 8 193 4 182 +1988 9 202 4 186 +1989 5 207 7 193 +1990 1 208 6 199 +1991 6 214 12 211 +1992 2 216 5 216 +1993 5 221 10 226 +1994 10 231 9 235 +1995 6 237 10 245 +1996 11 248 5 250 +1997 10 258 9 259 +1998 9 267 5 264 +1999 8 275 8 272 +2000 5 280 9 281 +2001 11 291 2 283 +2002 9 300 3 286 +2003 14 314 5 291 +2004 14 328 6 297 +2005 10 338 5 302 +2006 12 350 3 305 +2007 16 366 7 312 +2008 11 377 0 312 +2009 10 387 2 314 +2010 14 401 6 320 +2011 13 414 10 330 +2012 12 426 4 334 +2013 12 438 7 341 +2014 12 450 11 352 +2015 12 462 7 359 +2016 14 476 9 368