Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions EX_7_Script_Final
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#Part 1 - Amanda
import pandas
from plotnine import *

def sequence_stats(lines):
    """Compute the length and GC percentage of every sequence line.

    Every line that does not contain '>' is treated as one complete sequence.
    Lines containing '>' (FASTA headers) and blank lines are skipped.
    Surrounding whitespace, including the trailing newline, is stripped before
    measuring so that it is not counted as a base.

    Args:
        lines: iterable of strings, for example an open FASTA file.

    Returns:
        A tuple ``(lengths, gc_percents)`` of two equally long lists of
        floats: the number of characters in each sequence and the percentage
        of those characters that are 'G' or 'C'.
    """
    lengths = []
    gc_percents = []
    for raw_line in lines:
        sequence = raw_line.strip()  # drop the newline so it is not counted
        if not sequence or '>' in sequence:  # blank line or header: skip
            continue
        seq_len = float(len(sequence))  # float keeps the division below exact
        n_gc = sequence.count("G") + sequence.count("C")
        lengths.append(seq_len)
        gc_percents.append((n_gc / seq_len) * 100)
    return lengths, gc_percents


if __name__ == "__main__":
    # Review note from the pull request thread (lyy005, Oct 26, 2017), applied
    # inside sequence_stats: "line=line.strip() # remove the new line character"
    # The with-block closes the file on exit; calling InFile.close() again
    # afterwards is a harmless no-op.
    with open("Lecture11.fasta", "r") as InFile:  #Open fasta file as read-only
        sequenceLength, percentGC = sequence_stats(InFile)

    #combine lists into dataframe for easier plotting
    seqDF = pandas.DataFrame(
        {"sequenceLength": sequenceLength, "percentGC": percentGC},
        columns=['sequenceLength', 'percentGC'],
    )

    # The bare plot expressions below are only displayed when the lines are
    # evaluated in an interactive session.
    a = ggplot(seqDF, aes(x="sequenceLength"))  #Create plot of sequence lengths
    a + geom_histogram() + theme_classic()  #Plot as histogram

    b = ggplot(seqDF, aes(x="percentGC"))  #Create plot of %GC
    b + geom_histogram() + theme_classic()  #Plot as histogram

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good job

InFile.close() #Close file
#############################################################################
#Part 2 - Thomas
import numpy
import pandas
from plotnine import *

# Load the tab-delimited sales table; its first row supplies the column names.
icecream = pandas.read_csv(
    "icream_sales.txt",
    header=0,
    sep="\t",
)
icecream.shape  # table dimensions (value is only echoed in an interactive session)
icecream.head(20)  # first 20 rows (value is only echoed in an interactive session)

# Base plot: classic theme with one point per row (temp on x, sales on y).
a = (
    ggplot(icecream, aes(x="temp", y="sales"))
    + theme_classic()
    + geom_point()
)
# Label both axes and overlay a linear-model ("lm") smoother.
a + xlab("Temperature (C)") + ylab("Sales In Dollars") + stat_smooth(method="lm")

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good job

#############################################################################
#Part 3 - Balaji
import numpy
import os
import matplotlib.pyplot as plt
from plotnine import *
import pandas

# Folder the exercise data lived in on the original author's machine.
DATA_DIR = '/Users/sampathkumarbalaji/EX_7/Intro_Biocom_ND_319_Tutorial7'


def region_means(frame, regions, mean_column="mean_dir"):
    """Return the mean of ``observations`` for each requested region.

    Args:
        frame: DataFrame with a ``region`` column and an ``observations``
            column.
        regions: region labels to report, in the order the rows should appear.
        mean_column: name given to the column of means in the result.

    Returns:
        DataFrame with one row per entry of ``regions`` and two columns,
        ``region`` and ``mean_column``. A region with no rows in ``frame``
        gets NaN as its mean.
    """
    labels = list(regions)
    per_region = frame.groupby("region")["observations"].mean()
    ordered = per_region.reindex(labels)  # keep the caller's order
    return pandas.DataFrame(
        {"region": labels, mean_column: ordered.tolist()},
        columns=["region", mean_column],
    )


if __name__ == "__main__":
    # Only change directory when the original author's folder exists, so the
    # script also runs from any working directory that contains data.txt.
    if os.path.isdir(DATA_DIR):
        os.chdir(DATA_DIR)

    #to parse and read
    data_txt = pandas.read_csv("data.txt")
    directions = ['north', 'south', 'east', 'west']

    #data frame holding the mean of each of the 4 regions
    mean_DF = region_means(data_txt, directions)

    # Reviewer suggestion from the pull request thread (Owner), kept for reference:
    #   data_barplot=ggplot(data)
    #   data_barplot+geom_bar(aes(x="factor(region)",y="observations"),stat="summary",fun_y=numpy.mean)

    # The bare plot expressions below are only displayed when the lines are
    # evaluated in an interactive session.
    a = ggplot(mean_DF) + theme_classic() + xlab("region") + ylab("mean_dir")
    a + geom_bar(aes(x="region", y="mean_dir"), stat="summary")

    # The frame loaded above is named data_txt; the name `data` was never defined.
    b = ggplot(data_txt, aes(x="region", y="observations"))  #Plot all observations on scatter plot
    b + geom_jitter() + theme_classic()

#The bar graph plots the mean of each region; the means are nearly identical (~15), so the bars look almost the same.
#On the scatter plot, however, north has points centered around 15, while east and west are evenly spread and south
#has a bimodal distribution.

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good job


30 changes: 30 additions & 0 deletions EX_7_Script_Q3
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import numpy
import os
import matplotlib.pyplot as plt
from plotnine import *
import pandas

# Folder the exercise data lived in on the original author's machine.
DATA_DIR = '/Users/sampathkumarbalaji/EX_7/Intro_Biocom_ND_319_Tutorial7'


def region_means(frame, regions, mean_column="mean_dir"):
    """Return the mean of ``observations`` for each requested region.

    Args:
        frame: DataFrame with a ``region`` column and an ``observations``
            column.
        regions: region labels to report, in the order the rows should appear.
        mean_column: name given to the column of means in the result.

    Returns:
        DataFrame with one row per entry of ``regions`` and two columns,
        ``region`` and ``mean_column``. A region with no rows in ``frame``
        gets NaN as its mean.
    """
    labels = list(regions)
    per_region = frame.groupby("region")["observations"].mean()
    ordered = per_region.reindex(labels)  # keep the caller's order
    return pandas.DataFrame(
        {"region": labels, mean_column: ordered.tolist()},
        columns=["region", mean_column],
    )


if __name__ == "__main__":
    # Only change directory when the original author's folder exists, so the
    # script also runs from any working directory that contains data.txt.
    if os.path.isdir(DATA_DIR):
        os.chdir(DATA_DIR)

    #to parse and read
    data_txt = pandas.read_csv("data.txt")
    directions = ['north', 'south', 'east', 'west']

    #data frame holding the mean of each of the 4 regions
    mean_DF = region_means(data_txt, directions)

    # The bare plot expressions below are only displayed when the lines are
    # evaluated in an interactive session.
    a = ggplot(mean_DF) + theme_classic() + xlab("region") + ylab("mean_dir")
    a + geom_bar(aes(x="region", y="mean_dir"), stat="summary")

    # The frame loaded above is named data_txt; the name `data` was never defined.
    b = ggplot(data_txt, aes(x="region", y="observations"))  #Plot all observations on scatter plot
    b + geom_jitter() + theme_classic()

#The bar graph plots the mean of each region; the means are nearly identical (~15), so the bars look almost the same.
#On the scatter plot, however, north has points centered around 15, while east and west are evenly spread and south
#has a bimodal distribution.
65 changes: 65 additions & 0 deletions Exercise7.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#Part 1
import pandas
from plotnine import *

def sequence_stats(lines):
    """Compute the length and GC percentage of every sequence line.

    Every line that does not contain '>' is treated as one complete sequence.
    Lines containing '>' (FASTA headers) and blank lines are skipped.
    Surrounding whitespace, including the trailing newline, is stripped before
    measuring so that it is not counted as a base.

    Args:
        lines: iterable of strings, for example an open FASTA file.

    Returns:
        A tuple ``(lengths, gc_percents)`` of two equally long lists of
        floats: the number of characters in each sequence and the percentage
        of those characters that are 'G' or 'C'.
    """
    lengths = []
    gc_percents = []
    for raw_line in lines:
        sequence = raw_line.strip()  # drop the newline so it is not counted
        if not sequence or '>' in sequence:  # blank line or header: skip
            continue
        seq_len = float(len(sequence))  # float keeps the division below exact
        n_gc = sequence.count("G") + sequence.count("C")
        lengths.append(seq_len)
        gc_percents.append((n_gc / seq_len) * 100)
    return lengths, gc_percents


if __name__ == "__main__":
    # The with-block closes the file as soon as the sequences have been read.
    with open("Lecture11.fasta", "r") as InFile:  #Open fasta file as read-only
        sequenceLength, percentGC = sequence_stats(InFile)

    #combine lists into dataframe for easier plotting
    seqDF = pandas.DataFrame(
        {"sequenceLength": sequenceLength, "percentGC": percentGC},
        columns=['sequenceLength', 'percentGC'],
    )

    # The bare plot expressions below are only displayed when the lines are
    # evaluated in an interactive session.
    a = ggplot(seqDF, aes(x="sequenceLength"))  #Create plot of sequence lengths
    a + geom_histogram() + theme_classic()  #Plot as histogram

    b = ggplot(seqDF, aes(x="percentGC"))  #Create plot of %GC
    b + geom_histogram() + theme_classic()  #Plot as histogram

#Part 3
import numpy

def region_means(frame, regions, mean_column="mean"):
    """Return the mean of ``observations`` for each requested region.

    Args:
        frame: DataFrame with a ``region`` column and an ``observations``
            column.
        regions: region labels to report, in the order the rows should appear.
        mean_column: name given to the column of means in the result.

    Returns:
        DataFrame with one row per entry of ``regions`` and two columns,
        ``region`` and ``mean_column``. The means are stored as floats; a
        region with no rows in ``frame`` gets NaN as its mean.
    """
    labels = list(regions)
    per_region = frame.groupby("region")["observations"].mean()
    ordered = per_region.reindex(labels)  # keep the caller's order
    return pandas.DataFrame(
        {"region": labels, mean_column: ordered.tolist()},
        columns=["region", mean_column],
    )


if __name__ == "__main__":
    data = pandas.read_csv("data.txt", header=0, sep=",")  #Open file as data frame

    #Find mean for all populations and combine means into new data frame
    means = region_means(data, ['north', 'south', 'east', 'west'])

    # The bare plot expressions below are only displayed when the lines are
    # evaluated in an interactive session.
    c = ggplot(means, aes(x="region", y="mean"))  #Plot means on bar graph
    c + geom_col() + theme_classic()

    d = ggplot(data, aes(x="region", y="observations"))  #Plot all observations on scatter plot
    d + geom_jitter() + theme_classic()

#Graphs tell different stories - only on the scatter plot does it become apparent that the observations
#in the south region are two discrete populations, rather than a continuous spread like the others.
#Additionally, the mean for the West region makes it look as though it has the smallest values, whereas
#the scatterplot shows that it has both the lowest and the highest values, over a very large spread.
#The mean barplot is really only an accurate representation for the North region.
29 changes: 29 additions & 0 deletions Exercise7Part1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#Part 1
import pandas
from plotnine import *

def sequence_stats(lines):
    """Compute the length and GC percentage of every sequence line.

    Every line that does not contain '>' is treated as one complete sequence.
    Lines containing '>' (FASTA headers) and blank lines are skipped.
    Surrounding whitespace, including the trailing newline, is stripped before
    measuring so that it is not counted as a base.

    Args:
        lines: iterable of strings, for example an open FASTA file.

    Returns:
        A tuple ``(lengths, gc_percents)`` of two equally long lists of
        floats: the number of characters in each sequence and the percentage
        of those characters that are 'G' or 'C'.
    """
    lengths = []
    gc_percents = []
    for raw_line in lines:
        sequence = raw_line.strip()  # drop the newline so it is not counted
        if not sequence or '>' in sequence:  # blank line or header: skip
            continue
        seq_len = float(len(sequence))  # float keeps the division below exact
        n_gc = sequence.count("G") + sequence.count("C")
        lengths.append(seq_len)
        gc_percents.append((n_gc / seq_len) * 100)
    return lengths, gc_percents


if __name__ == "__main__":
    # The with-block closes the file as soon as the sequences have been read.
    with open("Lecture11.fasta", "r") as InFile:  #Open fasta file as read-only
        sequenceLength, percentGC = sequence_stats(InFile)

    #combine lists into dataframe for easier plotting
    seqDF = pandas.DataFrame(
        {"sequenceLength": sequenceLength, "percentGC": percentGC},
        columns=['sequenceLength', 'percentGC'],
    )

    # The bare plot expressions below are only displayed when the lines are
    # evaluated in an interactive session.
    a = ggplot(seqDF, aes(x="sequenceLength"))  #Create plot of sequence lengths
    a + geom_histogram() + theme_classic()  #Plot as histogram

    b = ggplot(seqDF, aes(x="percentGC"))  #Create plot of %GC
    b + geom_histogram() + theme_classic()  #Plot as histogram
10 changes: 10 additions & 0 deletions code
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import numpy
import pandas
from plotnine import *

# Load the tab-delimited sales table; its first row supplies the column names.
icecream = pandas.read_csv(
    "icream_sales.txt",
    header=0,
    sep="\t",
)
icecream.shape  # table dimensions (value is only echoed in an interactive session)
icecream.head(20)  # first 20 rows (value is only echoed in an interactive session)

# Base plot: classic theme with one point per row (temp on x, sales on y).
a = (
    ggplot(icecream, aes(x="temp", y="sales"))
    + theme_classic()
    + geom_point()
)
# Label both axes and overlay a linear-model ("lm") smoother.
a + xlab("Temperature (C)") + ylab("Sales In Dollars") + stat_smooth(method="lm")
2 changes: 2 additions & 0 deletions icream_sales.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
“temp” “sales”
14.2 21516.4 32511.9 18515.2 33218.5 40622.1 52219.4 41225.1 61423.4 54418.1 42122.6 44517.2 408
Expand Down