forked from lyy005/Intro_Biocom_ND_319_Tutorial7
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathEX_7_Script_Final
More file actions
75 lines (58 loc) · 2.69 KB
/
Copy pathEX_7_Script_Final
File metadata and controls
75 lines (58 loc) · 2.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#Part 1 - Amanda
import pandas
from plotnine import *

# Gather the sequence text of every FASTA record.  A record begins at a
# header line (leading '>') and its sequence may be wrapped over several
# lines, so the pieces are collected per record and joined afterwards.
records = []
with open("Lecture11.fasta", "r") as InFile:  #Open fasta file as read-only; closed automatically
    for line in InFile:  #Loop through each line in fasta file
        text = line.strip()  #Drop the newline so it is not counted as a base
        if not text:
            continue  #Blank lines carry no sequence data
        if text.startswith(">"):
            records.append([])  #Header line: begin a new record
        elif records:
            records[-1].append(text)  #Continuation of the current record
        else:
            records.append([text])  #Sequence text that precedes any header

sequenceLength = []  #Length of each sequence
percentGC = []  #%GC of each sequence
for pieces in records:
    sequence = "".join(pieces).upper()  #Upper-case so lower-case g/c are counted too
    if not sequence:
        continue  #Header without sequence: skip to avoid dividing by zero
    seqLen = float(len(sequence))  #Stored as float, as before
    nG = sequence.count("G")  #Count individual G and C contents
    nC = sequence.count("C")
    percGC = (nG + nC) / seqLen * 100  #Calculate % GC
    sequenceLength.append(seqLen)
    percentGC.append(percGC)

#Combine lists into dataframe for easier plotting
seqDF = pandas.DataFrame({
    'sequenceLength': sequenceLength,
    'percentGC': percentGC,
})

# NOTE(review): the two plot expressions below are bare expressions, so they
# presumably only render when evaluated in an interactive console/notebook.
# When run as a plain script, an explicit draw/show call is needed - confirm
# the right call for the installed plotnine version.
a = ggplot(seqDF, aes(x="sequenceLength"))  #Create plot of sequence lengths
a + geom_histogram() + theme_classic()  #Plot as histogram
b = ggplot(seqDF, aes(x="percentGC"))  #Create plot of %GC
b + geom_histogram() + theme_classic()  #Plot as histogram
#############################################################################
#Part 2 - Thomas
import numpy
import pandas
from plotnine import *

# Load the tab-delimited sales table; its first row holds the column names.
icecream = pandas.read_csv(
    "icream_sales.txt",
    sep="\t",
    header=0,
)

# Quick look at the table: its dimensions, then the first 20 rows.
icecream.shape
icecream.head(20)

# Base scatter plot of sales against temperature.
a = (
    ggplot(icecream, aes(x="temp", y="sales"))
    + theme_classic()
    + geom_point()
)

# Label both axes and overlay a smoothing line fitted with method "lm".
(
    a
    + xlab("Temperature (C)")
    + ylab("Sales In Dollars")
    + stat_smooth(method="lm")
)
#############################################################################
#Part 3 - Balaji
import numpy
import os
import matplotlib.pyplot as plt
from plotnine import *
import pandas

os.listdir('.')
# "data.txt" is read relative to the working directory.  Only switch to the
# original author's checkout when that directory actually exists, so the
# script does not crash with FileNotFoundError on other machines.
project_dir = '/Users/sampathkumarbalaji/EX_7/Intro_Biocom_ND_319_Tutorial7'
if os.path.isdir(project_dir):
    os.chdir(project_dir)

#to parse and read
data_txt = pandas.read_csv("data.txt")
directions = ['north', 'south', 'east', 'west']

# Build the summary frame in one step: one row per region holding the mean
# of that region's observations.  Constructing it directly avoids writing
# strings into a float column through chained assignment, which pandas does
# not guarantee will reach the frame.
mean_DF = pandas.DataFrame({
    'region': directions,
    'mean_dir': [
        data_txt.loc[data_txt.region == direction, 'observations'].mean()
        for direction in directions
    ],
})

# NOTE(review): the plot expressions below are bare expressions, so they
# presumably only render when evaluated in an interactive console/notebook;
# confirm the right draw/show call if this is run as a plain script.
a = ggplot(mean_DF) + theme_classic() + xlab("region") + ylab("mean_dir")
a + geom_bar(aes(x="region", y="mean_dir"), stat="summary")  #Bar plot of the regional means

#Plot all observations on scatter plot (uses the full table read above)
b = ggplot(data_txt, aes(x="region", y="observations"))
b + geom_jitter() + theme_classic()

#The bar graph shows the mean for each region; the means are almost the same (~15),
#so the bars look nearly identical.
#The scatter plot, however, shows that north has points centered around 15, while east
#and west are evenly spread and south has a bi-modal distribution.