Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 106 additions & 0 deletions AnalysisTutorial7.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#### Analysis for Tutorial 7


# Set working directory
#os.chdir('/Users/brittnibertolet/Desktop/bcTutorials/Intro_Biocomp_ND_318_Tutorial7/')

# Load packages
import numpy
import pandas
# from plotnine import * (not sure we need to do it this way)


############################################
################ Question 1 ################
############################################

# Read in fasta and summarise every record.
# Each list receives one entry per record, in file order.
sequenceID = []
sequenceLength = []
percentGC = []
meltingTemp = []

# `with` closes the file even if a line fails part-way through the loop
with open("Lecture11.fasta", 'r') as seq:
    for Line in seq:
        # Remove newline character (and surrounding whitespace) from file line
        Line = Line.strip()

        # Blank lines hold no record; skipping them also avoids dividing by a
        # zero sequence length when computing percent GC below
        if not Line:
            continue

        # Header lines start with '>'; everything after it is the sequence ID
        if Line.startswith('>'):
            sequenceID.append(Line[1:])
            continue

        # Anything else is treated as a sequence line
        # Length is stored as a float so the percentage below is a true division
        seqLen = float(len(Line))
        # Count the number of G's and C's
        nG = Line.count("G")
        nC = Line.count("C")

        # Melting temperature is only calculated for sequences of 14 or fewer bases
        if seqLen <= 14:
            # 2 for every base plus a further 2 for each G or C
            Tm = 2 * (nG + nC) + 2 * seqLen
        else:
            # Sentinel value for sequences longer than 14 bases
            Tm = -9999

        # Append values to the stored lists
        sequenceLength.append(seqLen)
        percentGC.append((nG + nC) / seqLen * 100)
        meltingTemp.append(Tm)

# Combine lists into a dataframe, one row per record
seqDF = pandas.DataFrame(
    list(zip(sequenceID, sequenceLength, percentGC, meltingTemp)),
    columns=['sequenceID', 'sequenceLength', 'percentGC', 'meltingTemp'],
)

# Histograms of the per-sequence summaries held in seqDF.
# The star import stays here because the later sections use plotnine names too.
from plotnine import *

# Histogram of sequence length
plot1 = ggplot(data=seqDF, mapping=aes(x="sequenceLength"))
plot1 + geom_histogram() + theme_classic()

# Histogram of GC content
plot2 = ggplot(data=seqDF, mapping=aes(x="percentGC"))
plot2 + geom_histogram() + theme_classic()

############################################
################ Question 2 ################
############################################

# The lake data set is in the GitHub repository as "Q2_lakeData.txt".
# It is read as a tab-separated table.
lakeData = pandas.read_csv("Q2_lakeData.txt", sep="\t")

# Scatter plot of Prod against chlA, with a linear-model trendline on top
a = (
    ggplot(data=lakeData, mapping=aes(x="chlA", y="Prod"))
    + theme_classic()
    + geom_point()
)
(
    a
    + xlab("Concentration of Chlorophyll A")
    + ylab("Methane Production")
    + stat_smooth(method="lm")
)



############################################
################ Question 3 ################
############################################

# Read in data.txt (comma-separated; the first row holds the column names)
data3 = pandas.read_csv("data.txt", sep=",", header=0)

# Bar plot of north, east, south, and west populations:
# one bar per region, bar height = mean of that region's observations
plot4 = ggplot(data3) + xlab("Region") + ylab("Mean Observation")
plot4 + geom_bar(aes(x="factor(region)", y="observations"), stat="summary", fun_y=numpy.mean) + theme_classic()

# Scatter plot of all the individual observations.
# geom_jitter already draws the points (with a random offset), so a separate
# geom_point layer would draw every observation a second time, un-jittered.
# The y axis shows raw observations here, not means, and is labelled accordingly.
plot5 = ggplot(data3, aes(x="region", y="observations")) + xlab("Region") + ylab("Observation")
plot5 + geom_jitter() + theme_classic()

### The two plots tell very different stories. While the regions all have very similar
### means, the data are distributed differently around each mean. Both East and West have very
### large standard deviations, while the North region does not. The South region looks more like a
### bimodal distribution, with its two maxima on opposite sides of the mean.


#scatter plot of observations
1 change: 1 addition & 0 deletions Q2_lakeData.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
lakeID chlA pH DOC TP TN ProdBA 4.4 7 6.5 22.8 840 4493BE 12.8 5.2 9.4 21.5 531 5329BO 20.7 5.4 19.5 48.7 1389 9261BR 40.2 8.3 6.6 86.9 1187 12649CB 16.7 4.1 18.2 33 990 8137CR 4 5.9 4.5 11.1 1111 2842FO 6.3 5.5 11.1 52.6 696 3052HB 13.8 5.3 23 30.7 1678 11418MO 7.2 7.8 22.6 36.2 1450 8397NG 32.3 4.7 23.4 17.6 2239 18701PA 4.3 7.2 5.1 40.9 1024 9279PE 3.4 7.7 6.4 12 1252 12256TU 7 6.7 13.4 15.3 1122 8085WL 8.1 6 7.4 22.1 803 13075
Expand Down