forked from lyy005/Intro_Biocom_ND_319_Tutorial7
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpart1script.py
More file actions
49 lines (34 loc) · 1.1 KB
/
Copy pathpart1script.py
File metadata and controls
49 lines (34 loc) · 1.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import numpy
import pandas
from plotnine import *
#Question 1
# Read Lecture11.fasta and tabulate, for every record, its ID, its length and
# its GC percentage.  The three result lists always grow together (one entry
# per header line), so the rows of seqDF stay aligned with their IDs.

#create lists for storing information about sequences
sequenceID = []
sequenceLength = []
percentGC = []
meltingTemp = []  # not filled anywhere in this section; kept so the name stays defined


def _appendSequenceStats(sequence):
    """Append the length and percent GC of one complete sequence.

    Parameters
    ----------
    sequence : str
        All sequence lines of a single FASTA record joined together.

    The length is stored as a float (as the original script did).  G and C
    are counted case-insensitively.  A record with no sequence at all gets a
    length of 0.0 and NaN for percent GC instead of dividing by zero.
    """
    seqLen = float(len(sequence))
    sequenceLength.append(seqLen)
    if seqLen == 0:
        percentGC.append(float("nan"))
        return
    bases = sequence.upper()
    nG = bases.count("G")
    nC = bases.count("C")
    percentGC.append((nG + nC) / seqLen * 100)


#loop through each line in fasta file to process sequences
# `with` closes the file even if an error is raised part-way through.
with open("Lecture11.fasta", "r") as InFile:
    seqParts = None  # sequence lines of the current record; None before the first header
    for Line in InFile:
        Line = Line.strip()  #removes white space, tab, space, newline characters
        if not Line:
            # blank lines carry no data (and used to cause a division by zero)
            continue
        if Line.startswith('>'):
            # a new header ends the previous record, if there was one
            if seqParts is not None:
                _appendSequenceStats("".join(seqParts))
            sequenceID.append(Line[1:])
            seqParts = []
        elif seqParts is not None:
            # records may be wrapped over several lines; collect them all
            seqParts.append(Line)
        # sequence text appearing before any header has no ID and is ignored
    # the last record is not followed by a header, so finish it here
    if seqParts is not None:
        _appendSequenceStats("".join(seqParts))

#combine lists into dataframe
seqDF = pandas.DataFrame({
    'sequenceID': sequenceID,
    'sequenceLength': sequenceLength,
    'percentGC': percentGC,
})
# Histogram of sequence lengths: bins 5 units wide, classic theme.
# NOTE(review): the composed plot on the second line of each pair is a bare
# expression; its value is not stored or saved by this script, so it is only
# echoed when the statements are run in an interactive session.
b = ggplot(data=seqDF, mapping=aes(x="sequenceLength"))
b + geom_histogram(binwidth=5) + theme_classic()

# Histogram of GC percentage: same bin width and theme, different column.
b = ggplot(data=seqDF, mapping=aes(x="percentGC"))
b + geom_histogram(binwidth=5) + theme_classic()