forked from lyy005/Intro_Biocom_ND_319_Tutorial7
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexercise7
More file actions
executable file
·82 lines (74 loc) · 2.22 KB
/
Copy pathexercise7
File metadata and controls
executable file
·82 lines (74 loc) · 2.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
####Exercise7####
#Question 1
#load dataset
import pandas
# Summarise the sequences in Lecture11.fasta.
# Header lines (those starting with '>') supply the sequence IDs; every other
# non-empty line is treated as one complete sequence, for which the length and
# the percentage of G + C bases are recorded.
# NOTE(review): a record whose sequence is wrapped over several lines would
# yield one entry per line here -- confirm the input keeps one line per sequence.
sequenceID = []
sequenceLength = []
percentGC = []
meltingTemp = []  # created here but not filled in by this section

# The context manager closes the file even if parsing fails part-way through.
with open("Lecture11.fasta", "r") as InFile:
    for Line in InFile:
        # Remove the trailing newline (and any surrounding whitespace).
        Line = Line.strip()
        print(Line)
        if not Line:
            # A blank line has length 0 and would cause a division by zero
            # in the GC calculation below, so skip it.
            continue
        if Line.startswith('>'):
            # Header line: store everything after the leading '>'.
            sequenceID.append(Line[1:])
        else:
            # Length is kept as a float so the percentage below uses true
            # division regardless of interpreter version.
            Seqlength = float(len(Line))
            print(Seqlength)
            sequenceLength.append(Seqlength)
            # Count upper-case G and C bases only.
            nG = Line.count("G")
            print(nG)
            nC = Line.count("C")
            print(nC)
            gcTotal = (nG + nC) / Seqlength * 100
            percentGC.append(gcTotal)

# One row per sequence; zip() pairs the three lists positionally.
seqDF = pandas.DataFrame(
    list(zip(sequenceID, sequenceLength, percentGC)),
    columns=['sequenceID', 'sequenceLength', 'percentGC'],
)
#Histogram of sequence lengths
import plotnine
from plotnine import *
# Histogram of sequence lengths, bins 4 units wide.
p = ggplot(data=seqDF) + aes(x="sequenceLength") + geom_histogram(binwidth=4)
# NOTE(review): a bare expression is only displayed where the environment
# echoes expression values (e.g. an interactive session) -- confirm run mode.
p

# Histogram of percent GC, bins 5 units wide.
g = ggplot(data=seqDF) + aes(x="percentGC") + geom_histogram(binwidth=5)
g
#Question 2
import numpy
import pandas
import plotnine
from plotnine import *
#read in file
# Load the comma-separated table for Question 2.
Part2 = pandas.read_csv("part2datacopy.txt", sep=",")

# Scatter plot of repair cost against oil changes per year (classic theme).
a = (
    ggplot(Part2, aes(x="oil changes per year", y="cost of repairs($)"))
    + theme_classic()
    + geom_point()
)

# Same plot with explicit axis labels and a linear-model trend line added.
# NOTE(review): the result is not assigned, so it is only shown where the
# environment echoes expression values -- confirm run mode.
(
    a
    + xlab("oil changes per year")
    + ylab("cost of repairs($)")
    + stat_smooth(method="lm")
)
#Question 3
#load the dataset
import pandas
import numpy
# Read the comma-separated observations table and echo it for inspection.
Data = pandas.read_csv(
    "data.txt",
    sep=',',
)
print(Data)
#making bar graph with region as x and ave as y
import plotnine
from plotnine import *
# Base plot for the bar chart: classic theme with fixed axis labels.
d = (
    ggplot(Data)
    + theme_classic()
    + xlab("region")
    + ylab("Average")
)
# One bar per region; bar height is the mean of "observations" (numpy.mean).
# NOTE(review): the result is not assigned, so it is only shown where the
# environment echoes expression values -- confirm run mode.
d + geom_bar(
    aes(x="factor(region)", y="observations"),
    stat="summary",
    fun_y=numpy.mean,
)

# Jittered scatter plot of every observation, by region.
a = ggplot(Data, aes(x="region", y="observations"))
a + geom_jitter() + coord_cartesian()