Intro_Biocom_ND_319_Tutorial7/EX_7_Script_Final at master · twhmitchell/Intro_Biocom_ND_319_Tutorial7 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#Part 1 - Amanda
import pandas
from plotnine import *

# Per-sequence statistics gathered from the FASTA file.
sequenceLength=[] #Length (number of bases) of each sequence
percentGC=[] #Percent of G + C bases in each sequence

# NOTE(review): each non-header line is treated as one complete sequence, so
# records whose sequence is wrapped over several lines would be counted once
# per line - confirm Lecture11.fasta stores one sequence per line.
with open("Lecture11.fasta","r") as InFile: #Open fasta file as read-only; closed automatically
    for line in InFile: #Loop through each line in fasta file
        sequence=line.strip() #Drop the newline so it is not counted as a base
        if not sequence or sequence.startswith('>'): #Skip blank lines and header lines
            continue

        bases=sequence.upper() #Count lowercase g/c as well
        seqLen=float(len(bases)) #Calculate length of sequence
        nG=bases.count("G") #Count individual G and C contents
        nC=bases.count("C")
        percGC=float(((nG+nC)/seqLen)*100) #Calculate % GC

        sequenceLength.append(seqLen) #Append length of individual sequences to list
        percentGC.append(percGC) #Append %GC of individual sequences to list

#Combine lists into dataframe for easier plotting
seqDF=pandas.DataFrame({'sequenceLength':sequenceLength,'percentGC':percentGC},columns=['sequenceLength','percentGC'])

# The two plot expressions below are bare expression statements: their value is
# echoed only in an interactive session and is discarded when run as a script.
a=ggplot(seqDF, aes(x="sequenceLength")) #Create plot of sequence lengths
a+geom_histogram()+theme_classic() #Plot as histogram

b=ggplot(seqDF, aes(x="percentGC")) #Create plot of %GC
b+geom_histogram()+theme_classic() #Plot as histogram
#############################################################################
#Part 2 - Thomas
import numpy
import pandas
from plotnine import *

# Load the tab-separated sales table; row 0 supplies the column names.
icecream = pandas.read_csv(
    "icream_sales.txt",
    sep="\t",
    header=0,
)
# Quick look at the table. These bare expressions are echoed only in an
# interactive session.
icecream.shape
icecream.head(20)

# Base scatter plot of sales against temperature.
a = (
    ggplot(icecream, aes(x="temp", y="sales"))
    + theme_classic()
    + geom_point()
)
# Label the axes and overlay a linear-model ("lm") smoother.
(
    a
    + xlab("Temperature (C)")
    + ylab("Sales In Dollars")
    + stat_smooth(method="lm")
)
#############################################################################
#Part 3 - Balaji
import numpy
import os
import matplotlib.pyplot as plt
from plotnine import *
os.listdir('.') #Bare expression: the listing is echoed only in an interactive session

# Move into the hard-coded checkout directory only when it exists on this
# machine; otherwise stay in the current directory so the relative data paths
# below are resolved from wherever the script was started instead of raising.
_workdir='/Users/sampathkumarbalaji/EX_7/Intro_Biocom_ND_319_Tutorial7'
if os.path.isdir(_workdir):
    os.chdir(_workdir)
import pandas

#to parse and read
data_txt = pandas.read_csv("data.txt")
directions = ['north','south','east','west']

#create dataframe holding the mean of the observations for each of the 4 regions
# Built in one step rather than by writing into a zero-filled frame element by
# element: chained assignment (frame.column[i] = value) is not guaranteed to
# modify the frame, and the zero-filled 'region' column was numeric, not text.
mean_DF = pandas.DataFrame(
    {
        'region': directions,
        'mean_dir': [
            data_txt.loc[data_txt.region == direction, 'observations'].mean()
            for direction in directions
        ],
    },
    columns=['region', 'mean_dir'],
)

# Bar plot of the per-region means. The means are already computed, so the bar
# heights are taken as-is (stat="identity").
# The plot expressions below are echoed only in an interactive session.
a=ggplot(mean_DF)+theme_classic()+xlab("region")+ylab("mean_dir")
a+geom_bar(aes(x="region",y="mean_dir"),stat="identity")

# Uses data_txt, the frame loaded above (the name `data` is never defined).
b=ggplot(data_txt, aes(x="region", y="observations")) #Plot all observations on scatter plot
b+geom_jitter()+theme_classic()

#The bar graph plots the mean for each region; the means are nearly identical (~15), so the bars look almost the same.
#On the scatter plot, however, north has points centered around 15, while east and west are evenly spread and south
#has a bimodal distribution.