Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 82 additions & 0 deletions exercise7
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
####Exercise7####
#Question 1
#load dataset
import pandas
# Parse Lecture11.fasta into one summary row per record: ID, length, percent GC.
# A record is a '>' header line followed by one or more sequence lines. Wrapped
# (multi-line) sequences are joined before measuring, so every ID pairs with
# exactly one length / GC value when the lists are zipped into the data frame.

#create lists for storing information about sequences
sequenceID = []
sequenceLength = []
percentGC = []
meltingTemp = []  # declared alongside the others but never filled in this script
sequenceChunks = []  # one list of sequence-line fragments per header, parallel to sequenceID

# 'with' closes the file even if parsing raises part-way through
with open("Lecture11.fasta", "r") as InFile:
    for Line in InFile:
        # remove newline character (and surrounding whitespace) from file line
        Line = Line.strip()
        print (Line)
        if not Line:
            # a blank line is not a sequence; treating it as one would give a
            # zero length and a division by zero in the GC calculation
            continue
        # caret ('>') header lines are separated from sequence lines; only a
        # leading '>' marks a header, which is what Line[1:] relies on
        if Line.startswith('>'):
            sequenceID.append(Line[1:])
            sequenceChunks.append([])
        elif sequenceChunks:
            sequenceChunks[-1].append(Line)
        # sequence text appearing before the first header has no ID and is skipped

#summarise each record once its full sequence is known
for Chunks in sequenceChunks:
    # upper-case first so lower-case bases are counted as well
    Sequence = "".join(Chunks).upper()
    Seqlength = float(len(Sequence))
    print (Seqlength)
    sequenceLength.append(Seqlength)
    # count the number of G's and C's
    nG = Sequence.count("G")
    print (nG)
    nC = Sequence.count("C")
    print (nC)
    # percent GC is undefined for a header with no sequence; record NaN
    if Seqlength:
        gcTotal = (nG + nC) / Seqlength * 100
    else:
        gcTotal = float("nan")
    percentGC.append(gcTotal)

#dataframe of resulting info
seqDF = pandas.DataFrame(
    list(zip(sequenceID, sequenceLength, percentGC)),
    columns=['sequenceID', 'sequenceLength', 'percentGC'],
)

#Histogram of sequence lengths
import plotnine
from plotnine import *
# 4-unit bins over the sequenceLength column of seqDF.
# NOTE(review): the bare `p` / `g` lines only display a figure when this is run
# interactively (notebook / REPL) - confirm that is the intended workflow.
p = ggplot(data=seqDF, mapping=aes(x="sequenceLength")) + geom_histogram(binwidth=4)
p
#Histogram of Percent GC (5-unit bins over the percentGC column)
g = ggplot(data=seqDF, mapping=aes(x="percentGC")) + geom_histogram(binwidth=5)
g

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good job

#Question 2
import numpy
import pandas
import plotnine
from plotnine import *

#read in file
# Load the table; comma is already read_csv's default delimiter.
Part2 = pandas.read_csv("part2datacopy.txt")

# Base scatterplot of repair cost against oil changes, classic theme.
a = (
    ggplot(
        data=Part2,
        mapping=aes(x="oil changes per year", y="cost of repairs($)"),
    )
    + theme_classic()
    + geom_point()
)
# Add axis labels and a linear-model ("lm") trendline on top of the points.
# Left as a bare expression, so the figure is only shown in an interactive session.
(
    a
    + xlab("oil changes per year")
    + ylab("cost of repairs($)")
    + stat_smooth(method="lm")
)

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good job

#Question 3
#load the dataset
import pandas
import numpy
# Read the comma-separated table (comma is read_csv's default) and echo it.
Data = pandas.read_csv("data.txt")
print(Data)

#making bar graph with region as x and ave as y
import plotnine
from plotnine import *
# Bar chart: one bar per region, bar height = numpy.mean of that region's observations.
d = ggplot(data=Data) + theme_classic()
d = d + xlab("region") + ylab("Average")
meanBars = geom_bar(
    mapping=aes(x="factor(region)", y="observations"),
    stat="summary",
    fun_y=numpy.mean,
)
d + meanBars

# Jittered scatter of every individual observation, grouped by region.
a = ggplot(data=Data, mapping=aes(x="region", y="observations"))
a + geom_jitter() + coord_cartesian()

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good job

49 changes: 49 additions & 0 deletions part1script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import numpy
import pandas
from plotnine import *


#Question 1
# Parse Lecture11.fasta into one summary row per record: ID, length, percent GC.
# A record is a '>' header line followed by one or more sequence lines. Wrapped
# (multi-line) sequences are joined before measuring, so every ID pairs with
# exactly one length / GC value when the lists are zipped into the data frame.

#create lists for storing information about sequences
sequenceID = []
sequenceLength = []
percentGC = []
meltingTemp = []  # declared alongside the others but never filled in this script
sequenceChunks = []  # one list of sequence-line fragments per header, parallel to sequenceID

#loop through each line in fasta file to collect headers and sequence text;
#'with' closes the file even if parsing raises part-way through
with open("Lecture11.fasta", "r") as InFile:
    for Line in InFile:
        Line = Line.strip()  #removes white space, tab, space, newline characters
        if not Line:
            # a blank line is not a sequence; treating it as one would give a
            # zero length and a division by zero in the GC calculation
            continue
        # only a leading '>' marks a header, which is what Line[1:] relies on
        if Line.startswith('>'):
            sequenceID.append(Line[1:])
            sequenceChunks.append([])
        elif sequenceChunks:
            sequenceChunks[-1].append(Line)
        # sequence text appearing before the first header has no ID and is skipped

#summarise each record once its full sequence is known
for Chunks in sequenceChunks:
    # upper-case first so lower-case bases are counted as well
    Sequence = "".join(Chunks).upper()
    seqLen = float(len(Sequence))
    nG = Sequence.count("G")
    nC = Sequence.count("C")

    #append values to lists
    sequenceLength.append(seqLen)
    # percent GC is undefined for a header with no sequence; record NaN
    percentGC.append((nG + nC) / seqLen * 100 if seqLen else float("nan"))

#combine lists into dataframe
seqDF = pandas.DataFrame(
    list(zip(sequenceID, sequenceLength, percentGC)),
    columns=['sequenceID', 'sequenceLength', 'percentGC'],
)

# Histogram of the sequenceLength column: 5-unit bins, classic theme.
# Bare expressions like the one below only display when run interactively.
b = ggplot(data=seqDF, mapping=aes(x="sequenceLength"))
(
    b
    + geom_histogram(binwidth=5)
    + theme_classic()
)

# Histogram of the percentGC column: same bin width and theme.
b = ggplot(data=seqDF, mapping=aes(x="percentGC"))
(
    b
    + geom_histogram(binwidth=5)
    + theme_classic()
)






1 change: 1 addition & 0 deletions part2datacopy.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
oil changes per year,cost of repairs($)3,3005,3002,5003,4001,7004,4006,1004,2503,4502,6500,60010,07,150
Expand Down
11 changes: 11 additions & 0 deletions part2script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import numpy
import pandas
import plotnine
from plotnine import *

# Load the table; comma is already read_csv's default delimiter.
Part2 = pandas.read_csv("part2datacopy.txt")

# Base scatterplot of repair cost against oil changes, classic theme.
a = (
    ggplot(
        data=Part2,
        mapping=aes(x="oil changes per year", y="cost of repairs($)"),
    )
    + theme_classic()
    + geom_point()
)
# Add axis labels and a linear-model ("lm") trendline on top of the points.
# Left as a bare expression, so the figure is only shown in an interactive session.
(
    a
    + xlab("oil changes per year")
    + ylab("cost of repairs($)")
    + stat_smooth(method="lm")
)
16 changes: 16 additions & 0 deletions part3script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#Question 3
#load the dataset
import pandas
import numpy
# Read the comma-separated table; comma is read_csv's default delimiter.
Data = pandas.read_csv("data.txt")

#making bar graph with region as x and ave as y
import plotnine
from plotnine import *
# Bar chart: one bar per region, bar height = numpy.mean of that region's observations.
d = ggplot(data=Data) + theme_classic()
d = d + xlab("region") + ylab("Average")
meanBars = geom_bar(
    mapping=aes(x="factor(region)", y="observations"),
    stat="summary",
    fun_y=numpy.mean,
)
d + meanBars

# Jittered scatter of all individual observations, grouped by region.
a = ggplot(data=Data, mapping=aes(x="region", y="observations"))
a + geom_jitter() + coord_cartesian()