Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Antibodies to test
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
tsg101
actin
mhc1
89 changes: 89 additions & 0 deletions Exercise7answers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Exercise 7


#Question 1

# Parse "Lecture11.fasta" and collect, per record, the sequence ID, the
# sequence length and the percentage of G/C bases.
#
# Every non-header line is treated as one complete sequence, so the three
# lists line up one-to-one only when each record's sequence sits on a
# single line.

# lists for storing information about sequences
sequenceID = []
sequenceLength = []
percentGC = []

# the context manager closes the file even if parsing raises
with open("Lecture11.fasta", "r") as InFile:
    # loop through each line of the fasta file to process sequences
    for Line in InFile:
        # remove newline character (and surrounding whitespace) from the line
        Line = Line.strip()

        # blank lines carry no data and would otherwise divide by zero below
        if not Line:
            continue

        if Line.startswith(">"):
            # header line: store the sequence ID without the leading ">"
            sequenceID.append(Line[1:])
        else:
            # sequence line: length as a float so the division below is
            # true division on every Python version
            seqLen = float(len(Line))
            # count the number of G's and C's (upper-case only)
            nG = Line.count("G")
            nC = Line.count("C")

            # append values to the lists
            sequenceLength.append(seqLen)
            percentGC.append((nG + nC) / seqLen * 100)

import pandas

# Combine the three per-sequence lists into one table, one row per sequence.
# zip() stops at the shortest list, so any unmatched trailing entries are dropped.
column_names = ['sequenceID', 'sequenceLength', 'percentGC']
records = list(zip(sequenceID, sequenceLength, percentGC))
seqDF = pandas.DataFrame(records, columns=column_names)

from plotnine import *

# NOTE(review): the bare "plot + layer" expressions below only build plot
# objects; whether anything is displayed presumably depends on running in an
# interactive session or notebook -- confirm.

# histogram of sequence length
histogram1 = ggplot(seqDF, aes(x="sequenceLength"))
length_bars = geom_histogram(binwidth=20, fill='blue', color='black')
histogram1 + length_bars + theme_classic()

# histogram of percentGC, first with default bins
histogram2 = ggplot(seqDF, aes(x="percentGC"))
default_bars = geom_histogram()
histogram2 + default_bars + theme_classic()

# same data with custom colors and bin width
styled_bars = geom_histogram(binwidth=15, fill='yellow', color='black')
histogram2 + styled_bars + theme_classic()

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good job

#Question 2
# Scatter plot of the Percent column against Year from dataset.csv, with a
# dashed red least-squares straight line drawn over the points.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data = pd.read_csv("dataset.csv")

# bind the two columns once and reuse them for the scatter and the fit
x = data["Year"]
y = data["Percent"]

# raw observations
plt.scatter(x, y)

# title, axis labels and x-axis range
plt.title('Percent of Planted Corn in USA that is GMO')
plt.xlabel('Year')
plt.ylabel('Percent')
plt.xlim(2000, 2018)

# degree-1 polynomial fit; poly1d turns the coefficients into a callable
z = np.polyfit(x, y, 1)
p = np.poly1d(z)
trend_values = p(x)
plt.plot(x, trend_values, "r--")

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good job


#Question 3

# Read data.txt, compute the mean of 'observations' for each 'region', and
# build a bar chart of the means plus a jittered scatter plot of the raw data.
import pandas
from plotnine import *

# comma-separated file whose first row holds the column names
q3 = pandas.read_csv("data.txt", sep=",", header=0)

# group by the region and find the mean of each region
average = q3.groupby('region')['observations'].mean()

# reset_index() turns the group labels into an ordinary 'region' column, so
# every mean keeps its own label regardless of which regions the file holds
# or the order they sort in (no hand-typed label list to keep in sync)
df = average.reset_index()

# bar graph with the average of the corresponding regions
q3bp = ggplot(df) + theme_classic() + xlab("region") + ylab("observations")
q3bp + geom_bar(aes(x="region", y="observations"), stat="summary")

# scatter plot of every observation with jitter applied
q3sp = ggplot(q3, aes(x="region", y="observations"))
q3sp + geom_jitter() + coord_cartesian()

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good job


19 changes: 19 additions & 0 deletions dataset.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
Year,Percent
2000,25
2001,26
2002,34
2003,40
2004,47
2005,52
2006,61
2007,73
2008,80
2009,85
2010,86
2011,88
2012,88
2013,90
2014,93
2015,92
2016,92
2017,92