Intro_Biocom_ND_319_Tutorial7/exercise7 at master · kinskeep/Intro_Biocom_ND_319_Tutorial7 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
####Exercise7####
#Question 1
#load dataset
import pandas
# Parse Lecture11.fasta and collect, for every record, its ID, its sequence
# length and its GC percentage; the three lists are then combined into seqDF.
# NOTE(review): each sequence is assumed to sit on a single line. A record
# wrapped over several lines would yield one length/GC entry per line and
# misalign the columns when zipped below -- confirm against the input file.
sequenceID=[]
sequenceLength=[]
percentGC=[]
meltingTemp=[]  # not populated anywhere in this section; kept so the name stays defined

# "with" guarantees the file is closed even if parsing raises part-way through
with open("Lecture11.fasta","r") as InFile:
    for Line in InFile:
        # remove newline character (and surrounding whitespace) from file line
        Line=Line.strip()
        # Skip blank lines (e.g. a trailing empty line at the end of the file):
        # they would otherwise be treated as zero-length sequences and the
        # GC percentage below would divide by zero.
        if not Line:
            continue
        print (Line)
        # header lines start with '>'; everything after it is the sequence ID
        if Line.startswith('>'):
            sequenceID.append(Line[1:])
        else:
            # sequence length, stored as a float so the percentage below is
            # computed with true division
            Seqlength = float(len(Line))
            print (Seqlength)
            sequenceLength.append(Seqlength)
            # count the number of G's and C's; upper-casing first means
            # lowercase (soft-masked) bases are counted as well
            bases=Line.upper()
            nG=bases.count("G")
            print (nG)
            nC=bases.count("C")
            print (nC)
            # GC content as a percentage of the sequence length
            gcTotal = (nG+nC)/Seqlength*100
            percentGC.append(gcTotal)

# dataframe of resulting info: one row per (ID, length, %GC) triple.
# zip stops at the shortest list, so unmatched trailing entries are dropped.
seqDF = pandas.DataFrame(list(zip(sequenceID,sequenceLength,percentGC)),columns=['sequenceID','sequenceLength','percentGC'])

#Histogram of sequence lengths
import plotnine
from plotnine import *
# Histogram of sequence lengths, built up one component at a time:
# base plot from seqDF, then the x aesthetic, then the histogram layer.
p = ggplot(data=seqDF)
p = p + aes(x="sequenceLength")
p = p + geom_histogram(binwidth=4)
p
# Histogram of percent GC, assembled the same way with a bin width of 5
g = ggplot(data=seqDF)
g = g + aes(x="percentGC")
g = g + geom_histogram(binwidth=5)
g

#Question 2
import numpy
import pandas
import plotnine
from plotnine import *

#read in file
# Read the comma-separated data file for Question 2 into a DataFrame
Part2 = pandas.read_csv("part2datacopy.txt", sep=",")

# Scatterplot of the two columns named in aes(), drawn with the classic theme.
# The base plot is kept in `a`; labels and a linear-model ("lm") smoother
# are added on top of it in the final expression.
a = ggplot(Part2, aes(x="oil changes per year", y="cost of repairs($)"))
a = a + theme_classic() + geom_point()
(
    a
    + xlab("oil changes per year")
    + ylab("cost of repairs($)")
    + stat_smooth(method="lm")
)

#Question 3
#load the dataset
import pandas
import numpy
# Read the comma-separated data file for Question 3 and echo it to stdout
Data = pandas.read_csv("data.txt", sep=',')
print(Data)

# Bar graph: one bar per region, bar height = mean of "observations"
# (stat="summary" with numpy.mean as the y summary function)
import plotnine
from plotnine import *
d = ggplot(Data) + theme_classic()
d = d + xlab("region") + ylab("Average")
d + geom_bar(
    aes(x="factor(region)", y="observations"),
    stat="summary",
    fun_y=numpy.mean,
)

# Scatter plot of every individual observation, jittered to reduce overlap
a = ggplot(data=Data, mapping=aes(x="region", y="observations"))
a + geom_jitter() + coord_cartesian()