Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
512 changes: 512 additions & 0 deletions .Rhistory

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
countryX/
countryY/

20,063 changes: 20,063 additions & 0 deletions CountryX.csv

Large diffs are not rendered by default.

19,883 changes: 19,883 additions & 0 deletions CountryY.csv

Large diffs are not rendered by default.

Binary file added InfectionsPlot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added MarkerPlot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
74 changes: 74 additions & 0 deletions SupportingFunctions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# This file contains the supporting functions used by analysis.R
# Convert a delimited text file into a comma-separated file.
#
# Arguments:
#   filename    - path of the delimited text file to read
#   delimeter   - field separator used in `filename` (passed to read.table's `sep`)
#   fileOutname - output path WITHOUT extension; ".csv" is appended
#   header      - TRUE if the first line of `filename` holds column names.
#                 Defaults to FALSE, which reproduces the previous behaviour
#                 (columns are written out as V1, V2, ...).
#                 NOTE(review): if the screening .txt files start with a header
#                 line, callers should pass header = TRUE, otherwise that line
#                 is written out as a data row - confirm against the input files.
# Returns: nothing useful (the invisible NULL from write.csv); called for its
#          side effect of writing "<fileOutname>.csv".
makeCSV <- function(filename, delimeter, fileOutname, header = FALSE) {
  tempTable <- read.table(file = filename, sep = delimeter, header = header)
  # row.names = FALSE keeps the synthetic 1..n row index out of the output file
  write.csv(tempTable, file = paste0(fileOutname, ".csv"), row.names = FALSE)
}

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+2

# Combine every screening .csv file found in a directory into one data frame,
# write that data frame to "<FileOutName>.csv" and return it.
#
# Arguments:
#   directoryname - directory holding the per-day .csv files (a trailing "/"
#                   is accepted but no longer required)
#   country       - label stored in the "country" column of every row
#   na_handling   - "none" (default): leave NA values alone
#                   "remove": drop every row containing an NA
#                   "warn":   report whether any NA is present
#                   any other value raises a warning and is treated as "none"
#   FileOutName   - output file name without the ".csv" extension
# Returns: a data frame with the columns gender, age, marker01..marker10,
#          country and dayofYear. dayofYear is the run of digits found in each
#          file name, kept as a character string.
combineCSVs <- function(directoryname, country, na_handling = "none", FileOutName) {
  # Marker names are generated instead of being typed out one by one.
  col_names <- c("gender", "age", sprintf("marker%02d", 1:10),
                 "country", "dayofYear")

  # A mistyped option used to be ignored silently; tell the user instead.
  na_modes <- c("none", "remove", "warn")
  if (!(is.character(na_handling) && length(na_handling) == 1 &&
        na_handling %in% na_modes)) {
    warning("Unrecognised na_handling value; expected \"none\", \"remove\" ",
            "or \"warn\". No NA handling applied.", call. = FALSE)
    na_handling <- "none"
  }

  # Escaped and anchored pattern: only names that really end in ".csv" match.
  csv_files <- list.files(directoryname, pattern = "\\.csv$")

  # Read each file once and tag its rows with the country and the day; the
  # pieces are bound together in one step instead of growing inside a loop.
  per_file <- lapply(csv_files, function(csv_file) {
    screen <- read.csv(file = file.path(directoryname, csv_file), header = TRUE)
    day <- gsub("[^0-9]", "", csv_file) # keep only the digits of the file name
    cbind(screen,
          country.vect = rep(country, times = nrow(screen)),
          day.vect = rep(day, times = nrow(screen)))
  })

  if (length(per_file) == 0) {
    # No input files: return an empty, correctly shaped data frame.
    combined <- as.data.frame(matrix(nrow = 0, ncol = length(col_names)))
  } else {
    combined <- do.call(rbind, per_file)
  }

  # Columns are renamed by position, so the shape must be what we expect.
  if (ncol(combined) != length(col_names)) {
    stop("Expected ", length(col_names) - 2, " columns in each screening ",
         "file but found ", ncol(combined) - 2, ".", call. = FALSE)
  }
  colnames(combined) <- col_names

  # Optional removal of, or report about, NA values in the combined data.
  if (na_handling == "remove") {
    combined <- na.omit(combined)
    print("Any NA Values in data removed")
  } else if (na_handling == "warn") {
    if (any(is.na(combined))) {
      print("Warning : Your Data Contains NA Values")
    } else {
      print("No NA Values Present")
    }
  }

  # Row names are written as the first column, exactly as before.
  write.csv(combined, file = paste0(as.character(FileOutName), ".csv"))
  combined
}

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+2

# Print a short text summary of a compiled screening data set.
#
# Arguments:
#   dfName - data frame with a "gender" column, an "age" column and the ten
#            marker columns in positions 3 to 12 (a marker counts as present
#            when its value equals "1")
# Returns: (invisibly, via print) the character vector of summary lines:
#          number of screens, share of male / female patients, share of
#          juveniles (< 18), adults (18-64) and seniors (>= 65), and the share
#          of patients with at least one marker present.
SummarizeCompiledTests <- function(dfName) {
  screen_number <- nrow(dfName)

  # Percentage of screened rows for which `flags` is TRUE; 0 for empty input
  # so that an empty data set does not produce NaN.
  as_percent <- function(flags) {
    if (screen_number == 0) {
      return(0)
    }
    sum(flags, na.rm = TRUE) / screen_number * 100
  }

  gender <- dfName[, "gender"]
  age <- dfName[, "age"]

  # A patient is infected when any of the ten markers is present. Computed
  # for all rows at once instead of looping over 1:nrow.
  marker_present <- dfName[, 3:12, drop = FALSE] == "1"
  percentage.infected <- as_percent(rowSums(marker_present, na.rm = TRUE) > 0)

  percentage.male <- as_percent(gender == "male")
  # Counted directly rather than assumed to be the remainder after males.
  percentage.female <- as_percent(gender == "female")
  percentage.juvenile <- as_percent(age < 18)
  percentage.adult <- as_percent(age >= 18 & age < 65)
  percentage.senior <- as_percent(age >= 65)

  summary_lines <- c(
    "Data Summary:",
    paste0("There were ", screen_number, " screening tests run"),
    paste0(round(percentage.male, 1), "% of patients were male"),
    paste0(round(percentage.female, 1), "% of patients were female"),
    paste0(round(percentage.juvenile, 1),
           "% of patients were juveniles (under 18)"),
    # Labels now match the cut-offs used above (65-year-olds are seniors).
    paste0(round(percentage.adult, 1), "% of patients were adults (18-64)"),
    paste0(round(percentage.senior, 1),
           "% of patients were seniors (65 and over)"),
    # Previously computed but never reported.
    paste0(round(percentage.infected, 1),
           "% of patients were infected (at least one marker present)")
  )
  print(summary_lines)
}

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+2

122 changes: 122 additions & 0 deletions analysis.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#this R file will be used to analyze the patient data
# with the help of the functions in SupportingFunctions.R

# Point R at the project folder (edit this path for your own machine) and
# load the helper functions.
# NOTE(review): a hard-coded absolute path ties the script to one computer.
setwd("/Users/kevinbuck/Desktop/Biocomputing/Exercises/Rproject")
source("SupportingFunctions.R")

#### Initial Data Setup ####
# Country Y ships its screens as space-delimited .txt files (days 120-175);
# write a .csv copy of each one next to the original.
invisible(lapply(120:175, function(day) {
  screen_path <- paste0("countryY/screen_", day)
  makeCSV(filename = paste0(screen_path, ".txt"), delimeter = " ",
          fileOutname = screen_path)
}))

# Build one data frame per country (rows containing NA are dropped), then
# stack the two into a single data set.
countryX.df <- combineCSVs(directoryname = "countryX/", country = "X",
                           na_handling = "remove", FileOutName = "CountryX")
countryY.df <- combineCSVs(directoryname = "countryY/", country = "Y",
                           na_handling = "remove", FileOutName = "CountryY")

allData.df <- rbind(countryX.df, countryY.df)

# Print the overall summary of the combined data
SummarizeCompiledTests(allData.df)


Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+2

# Report whether the screened patient in a given row is infected, i.e. whether
# at least one of the ten marker columns (positions 3 to 12) equals "1".
#
# Arguments:
#   row_index - row of `df` to inspect
#   df        - screening data frame
# Returns: TRUE or FALSE, never NA. Missing marker values are ignored, so the
#          result can always be used directly inside `if`.
check_infected <- function(row_index, df) {
  any(df[row_index, 3:12] == "1", na.rm = TRUE)
}

#### Tracking Cases Over Time ####

day.list <- as.character(seq(120, 175))

# A row is an infected patient when any marker column (positions 3 to 12)
# equals "1". Computed once for the whole data set; missing markers are ignored.
infected_rows <- rowSums(allData.df[, 3:12] == "1", na.rm = TRUE) > 0

# Number of infected patients per day for one country, in the order of
# day.list. Days without any case (or without any data) count as 0.
count_daily_cases <- function(country_code) {
  case_rows <- which(infected_rows & allData.df[, "country"] == country_code)
  case_days <- allData.df[case_rows, "dayofYear"]
  as.numeric(table(factor(case_days, levels = day.list)))
}

# One row per day, holding the number of infected patients in each country
Daily_Total_Cases <- data.frame(Day = as.numeric(day.list),
                                CountryX.Total = count_daily_cases("X"),
                                CountryY.Total = count_daily_cases("Y"))
row.names(Daily_Total_Cases) <- day.list

##### Plotting Daily Cases by Day #####

# Country X already shows cases on day 120, while the line for country Y stays
# at 0 cases until roughly day 138. This means the outbreak began in country X.

library(ggplot2)
# Both lines share the same data frame and x axis, so these are set once in
# ggplot(); the constant colour labels create the legend entries.
ggplot(data = Daily_Total_Cases, mapping = aes(x = Day)) +
  geom_line(mapping = aes(y = CountryX.Total, color = "CountryX")) +
  geom_line(mapping = aes(y = CountryY.Total, color = "CountryY")) +
  labs(x = "Day Number", y = "Daily Cases", title = "Infections Over Time") +
  scale_color_manual(name = "Country",
                     values = c("CountryX" = "blue", "CountryY" = "red"))

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For more concise code:
ggplot(data=Daily_Total_Cases,mapping=aes(x=Day))+
geom_line(mapping=aes(y=CountryX.Total,color="CountryX")) +
geom_line(mapping=aes(y=CountryY.Total,color="CountryY")) +
ylab("Daily Cases") + xlab("Day Number") + ggtitle("Infections Over Time")+
scale_color_manual(name = "Country", values = c("CountryX" = "blue", "CountryY" = "red"))


Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+4

# Figuring out which markers cause the disease

# Number of infected patients in a country: rows in which at least one of the
# marker columns (positions 3 to 12) equals "1".
count_infected <- function(df) {
  sum(rowSums(df[, 3:12] == "1", na.rm = TRUE) > 0)
}
total.cases.X <- count_infected(countryX.df)
total.cases.Y <- count_infected(countryY.df)

# For each of the ten markers: number of patients carrying the marker divided
# by the number of infected patients in that country. One vectorised call per
# country replaces the twenty hand-written statements.
marker_columns <- sprintf("marker%02d", 1:10)
marker_share <- function(df, total_cases) {
  carriers <- colSums(df[, marker_columns, drop = FALSE] == "1", na.rm = TRUE)
  unname(carriers) / total_cases
}
x.markers.pct <- marker_share(countryX.df, total.cases.X)
y.markers.pct <- marker_share(countryY.df, total.cases.Y)

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should make a for loop for this
-0.5 for code inefficiency

# Long-format table for plotting: one row per country/marker pair.
markers.list <- as.character(seq(1, 10))
# Built column by column so that `freq` stays numeric. Filling rows with
# c("X", "1", value) turned every column, including the frequency, into text.
# `marker` is a factor with explicit levels so the x axis reads 1, 2, ..., 10
# rather than the alphabetical 1, 10, 2, ...
markers.sum.table <- data.frame(
  country = rep(c("X", "Y"), each = length(markers.list)),
  marker = factor(rep(markers.list, times = 2), levels = markers.list),
  freq = c(x.markers.pct, y.markers.pct)
)
# Plot the markers that cause the disease.
# The marker distribution for country X is skewed towards markers 1-5, whereas
# country Y shows a relatively even distribution. A vaccine aimed at patients
# in country Y might therefore target a microsatellite that a patient in
# country X does not carry. Vaccines developed by country Y MIGHT work for
# country X, but it is not certain.
ggplot(data = markers.sum.table,
       mapping = aes(x = marker, y = freq, fill = country)) +
  # geom_col() draws bars whose height is the value itself (stat = "identity")
  geom_col(position = "dodge", alpha = .75) +
  labs(title = "Relative Frequency of MicroSatellites")

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+4




Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good commenting and efficient code that uses coding concepts that we covered in class – 3.25 points (max is 4)

Loading