Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 115 additions & 0 deletions Exercise7.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Question 1

#Uploading data
setwd("~/Desktop/Biocomputing/Week6")
fasta=scan(file = "Lecture11.fasta.txt",what=character())

# Using Stuart's code to get my matrix values becuase my code
# is still kind of weird

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the problem?


# pre-allocate matrix to store sequence information
summ=matrix(NA,length(fasta)/2,4)

# loop through lines of fasta and process each record
for(i in 1:length(fasta)){
# if-else to figure out if we have a recordID line or sequence line
if(grepl(">",fasta[i])){
summ[((i+1)/2),1]=substr(fasta[i],2,nchar(fasta[i])) # could use counter instead of complicated row calculation
}else{
seqLength=nchar(fasta[i])

numG=nchar(fasta[i])-nchar(gsub("G","",fasta[i]))
numC=nchar(fasta[i])-nchar(gsub("C","",fasta[i]))
if(seqLength<=14){
Tm=2*(numG+numC)+2*seqLength
}else{
Tm=-9999
}

summ[i/2,2]=seqLength
summ[i/2,3]=round((numG+numC)/seqLength*100,1)
summ[i/2,4]=Tm
}
}

seqSumm=data.frame(sequenceID=summ[,1],sequenceLength=as.numeric(summ[,2]),percentGC=as.numeric(summ[,3]),meltingTemp=as.numeric(summ[,4]))

#### Graph 1 - Histogram of sequence lengths

a = ggplot(data = seqSumm, aes(x = sequenceLength))
a + geom_histogram() + theme_classic() + stat_bin(binwidth = 1) +
xlab("Sequence Length") + ylab("Count") +
ggtitle("Histogram of Sequence Length")

#### Graph 2 - Histogram of %GC Content

b = ggplot(data = seqSumm, aes(x = percentGC))
b + geom_histogram() + theme_classic() + xlab("% GC Content") +
ylab("Count") + ggtitle("Histogram of %GC Content")



#Question 2
#Point-framing data and LAI data for a site in Alaska.
#Point-framing is just a measure of species richness from the number of times a pin dropped from the top of the canopy hits a certain spp.
#LAI measurements are from a leaf area index meter.

pfdata <- read.csv("/Users/elizabethfortin12/Documents/ND First Year/Meetings with Adrian/Fert Project/WorkingData/Point_Framing_Data_2016.csv", header = TRUE)
laidata <- read.csv("/Users/elizabethfortin12/Documents/ND First Year/Meetings with Adrian/Fert Project/WorkingData/LAI.output.csv", header = FALSE)
colnames(laidata) <- c("Site","Block","Treatment","LAI2016","LAI2017")

library(plyr)
pftot <- ddply(pfdata, c("SITE","Treatment"), summarize, Betula=sum(Betula), Tussock=sum(Tussock), Forb=sum(Forb), Total=sum(Total))
colnames(pftot) <- c("Site","Treatment", "Betula","Tussock","Forb","Total")
perc.cov <- ddply(pftot, c("Site", "Treatment"), summarize, Betula = (Betula/Total)*100, Tussock = (Tussock/Total)*100, Forb = (Forb/Total)*100)

laitot <- ddply(laidata, c("Site", "Treatment"), summarize, LAI2016=mean(LAI2016), LAI2017=mean(LAI2017))

pc.lai <- merge(perc.cov, laitot, by = c("Site","Treatment"))
write.table(pc.lai, file = "pc.lai.txt")

pclai <- read.table("/Users/elizabethfortin12/Documents/ND First Year/Biocomputing/R_Programming/pc.lai.txt")

library(ggplot2)
library(grid)
library(gridExtra)
library(reshape2)

pclai.melt <- melt(pclai, id.vars=c('Site','Treatment','LAI2016','LAI2017'))
colnames(pclai.melt) <- c("Site","Treatment","LAI2016","LAI2017","GrowthForm","PercentCover")

a <- ggplot(data = pclai.melt, aes(x=LAI2016, y=PercentCover)) + theme_classic() +
geom_point(aes(colour=as.factor(GrowthForm))) +
stat_smooth(method = 'lm') +
theme(legend.position = "none") +
scale_color_manual(values = c("red", "green", "blue")) +
labs(x="LAI 2016", y="Percent Cover %")
a

b <- ggplot(data = pclai.melt, aes(x=LAI2017, y=PercentCover)) + theme_classic() +
geom_point(aes(colour=as.factor(GrowthForm))) +
stat_smooth(method = 'lm') +
theme(axis.title.y=element_blank(), legend.title = element_blank(), legend.position = c(0.2,0.9)) +
scale_color_manual(values = c("red", "green", "blue")) +
xlab("LAI 2017")

b

grid.arrange(a, b, ncol = 2)


#Question 3
data <- read.csv("/Users/elizabethfortin12/Documents/ND First Year/Biocomputing/R_Programming/Intro_Biocomp_ND_317_Tutorial7/data.txt")

ddply(data, c("region"), summarize, mean=mean(observations))

d <- ggplot(data = data)
d + geom_bar(aes(x = as.factor(region), y = observations), stat = "summary",
fun.y = "mean", fill = "black", color = "black") + theme_classic() +
xlab("Region") + ylab("Mean Observation") + ggtitle("Mean Observations in Each Region")

a <- ggplot(data = data)
a + theme_classic() + geom_jitter(aes(x=as.factor(region),y=observations), alpha=0.1) + labs(x='Region', y='Observations') + ggtitle("Observations in Each Region")
#Yes because the bar plot compares the averages of each region,
#while the geom_jitter plot gives you an idea of the average and
#the spread because it shows each individual data point