Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions main.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#load packages and custom functions
library(ggplot2)
library(tidyverse)
source("supportingFunctions.R")

#set work directory
#please set your own work directory before use
project_dir <- "D:/Notre Dame/Class/2022 Fall/Introduction to Biocomputing/Tutorial/Rproject-main/Rproject-main"
#only switch directories when the folder exists, so the script keeps running
#from the current directory on machines that do not have this path
if (dir.exists(project_dir)) {
  setwd(project_dir)
}

#convert txt files into csv files
txt_convert("./countryY")

#compile all data
all_data <- compile(c("X", "Y"), NA_remove = FALSE, NA_warning = FALSE)
write.csv(all_data, file = "allData.csv", row.names = FALSE)

#answer the questions
#1.In which country (X or Y) did the disease outbreak likely begin?
#The function "find_first_positive" is defined in supportingFunctions.R
print(find_first_positive("X"))
print(find_first_positive("Y"))
#Answer: From the outputs above, the first patient in country X was found earlier than the first patient in country Y, so the disease began in country X.
#The same conclusion can be drawn from a graph.
#load the data in allData.csv
allData <- read.csv("allData.csv")
#columns 3 to 12 hold the ten marker columns; a row counts as infected when at least one marker is present
allData$infectedSums <- rowSums(allData[, 3:12])
allData$infected <- ifelse(allData$infectedSums > 0, 1, 0)
#which() drops rows whose infected flag is NA instead of turning them into all-NA rows
infectedData <- allData[which(allData$infected == 1), ]
#print() so the plot is also drawn when the script is run through source()
print(ggplot(infectedData, aes(x = dayofYear, fill = country)) +
  geom_bar(position = "dodge"))

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+4

#2.If Country Y develops a vaccine for the disease, is it likely to work for citizens of Country X?
#reshape to long format: every original row is repeated once per marker column,
#with the marker name in "marker" and its 0/1 reading in "value"
allData_long <- pivot_longer(
  allData,
  cols = marker01:marker10,
  names_to = "marker",
  values_to = "value"
)

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is not advised to use any functions that were not taught in class.

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

-0.5

#keep only the rows where the marker was detected; which() drops NA readings
#instead of turning them into all-NA rows
allData_long <- allData_long[which(allData_long$value == 1), ]
#print() so the plot is also drawn when the script is run through source()
print(ggplot(allData_long, aes(x = marker, fill = country)) +
  geom_bar(position = "dodge"))
#Answer: The markers detected in country X and country Y are not highly consistent, so a vaccine developed in country Y is not likely to work in country X.

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+4

#summarize all data
#The function "summarize" is defined in supportingFunctions.R; it prints the screening counts and infection rates and draws an age histogram
#NOTE(review): dplyr (attached through library(tidyverse)) also exports summarize(); the sourced definition in the global environment appears to be the one called here - confirm
summarize(allData)

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good commenting and efficient code that uses coding concepts that we covered in class – 3 points (max is 4)

118 changes: 118 additions & 0 deletions supportingFunctions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
##supporting functions for analysis

#Text file format conversion function
#This function converts all txt files in a given directory to csv files
#usage: txt_convert("./countryX")
#first define a function to convert a single txt file
# Convert one space-delimited text file into a csv file written next to it.
# file_name: path to the source file; its first line is read as the header.
# Returns (invisibly) the path of the csv file that was written.
single_txt_convert <- function(file_name) {
  data <- read.table(file_name, sep = " ", header = TRUE)
  # Swap only the trailing extension. Replacing every "txt" in the path would
  # also rewrite directory or base names that happen to contain "txt".
  new_file_name <- sub("\\.txt$", ".csv", file_name)
  if (identical(new_file_name, file_name)) {
    # No ".txt" suffix: append ".csv" so the source file is never overwritten.
    new_file_name <- paste0(file_name, ".csv")
  }
  write.csv(data, file = new_file_name, row.names = FALSE)
  invisible(new_file_name)
}
#then use the function above to define the function for converting multiple files
# Convert every .txt file found directly inside file_path to a csv file.
# file_path: directory to scan (not recursive).
# Returns (invisibly) the paths of the text files that were converted.
txt_convert <- function(file_path) {
  # "pattern" is a regular expression: escape the dot and anchor the end so
  # that only real ".txt" extensions match (".txt" alone would also match
  # names such as "mytxtfile.csv").
  txt_files <- list.files(path = file_path, pattern = "\\.txt$",
                          full.names = TRUE)
  for (txt_file in txt_files) {
    single_txt_convert(txt_file)
  }
  invisible(txt_files)
}

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+2

#Compilation function
#This function compiles all data of a given list of countries. Users should input a vector containing the name(s) of one or multiple countries.
#usage: all_data <- compile(c("X","Y"), NA_remove = FALSE, NA_warning = FALSE)
#requirements: the folders containing data from different countries should be put in the working directory, and they should be named like "countryX"
#define a function to add columns of country and date to a dataframe
# Append "country" and "dayofYear" columns to a data frame.
# dataframe: data frame to extend.
# country, date: single values repeated on every row.
# Returns the extended data frame.
# Review feedback (qtran4, Dec 16, 2022): the original built the two columns
# through temporary data frames and cbind(); assigning them directly is
# shorter (-0.5 was given for inefficient code).
add_col <- function(dataframe, country, date) {
  # rep() keeps the assignment valid for a zero-row data frame, where
  # assigning a length-one value would fail.
  n_rows <- nrow(dataframe)
  dataframe$country <- rep(country, times = n_rows)
  dataframe$dayofYear <- rep(date, times = n_rows)
  dataframe
}
#define a function to compile files of a single country
# Compile every csv file of one country into a single data frame.
# country_name: suffix of the data folder, e.g. "X" reads "./countryX".
# Returns the row-bound contents of all files, each extended with "country"
# and "dayofYear" columns; an empty 14-column data frame when no csv file is
# found.
compile_single_country <- function(country_name) {
  column_names <- c("gender", "age", sprintf("marker%02d", 1:10),
                    "country", "dayofYear")
  file_path <- paste0("./country", country_name)
  # Escaped and anchored so only names ending in ".csv" are picked up.
  file_name_list <- list.files(path = file_path, pattern = "\\.csv$")
  if (length(file_name_list) == 0) {
    single_country_data <- data.frame(matrix(ncol = length(column_names),
                                             nrow = 0))
    colnames(single_country_data) <- column_names
    return(single_country_data)
  }
  # Read every file first and bind once, instead of growing a data frame with
  # rbind() on each iteration.
  per_file <- lapply(file_name_list, function(file) {
    single_file_data <- read.csv(file.path(file_path, file), sep = ",",
                                 header = TRUE)
    # Characters 8 to 10 of the file name are used as the day of year.
    # NOTE(review): assumes names shaped like "screen_120.csv" - confirm.
    date <- substr(file, 8, 10)
    add_col(single_file_data, country_name, date)
  })
  do.call(rbind, per_file)
}
#define a function to compile all files
# Compile the data of several countries into one data frame.
# country_name_list: character vector of country suffixes, e.g. c("X", "Y").
# NA_remove: when TRUE, rows containing any NA are dropped from the result.
# NA_warning: when TRUE, the number of NA cells found is printed.
# Returns the combined data frame; an empty 14-column data frame when no
# country is given.
# Review feedback: generate the marker names instead of typing all ten.
# sprintf() is used because paste("marker0", 1:10, sep = "") would produce
# "marker010" for the tenth marker.
compile <- function(country_name_list, NA_remove = FALSE, NA_warning = FALSE) {
  column_names <- c("gender", "age", sprintf("marker%02d", 1:10),
                    "country", "dayofYear")
  all_data <- data.frame(matrix(ncol = length(column_names), nrow = 0))
  colnames(all_data) <- column_names
  # Compile each country, then bind everything in a single rbind() call
  # instead of growing the data frame inside a loop. unname() keeps any names
  # on the input vector from being passed on as rbind() argument names.
  per_country <- lapply(unname(country_name_list), compile_single_country)
  if (length(per_country) > 0) {
    all_data <- do.call(rbind, c(list(all_data), per_country))
  }
  #deal with NAs
  if (isTRUE(NA_warning)) {
    nNA <- sum(is.na(all_data))
    output <- paste(nNA, "NA(s) found in", nrow(all_data) * ncol(all_data),
                    "data", sep = " ")
    print(output)
  }
  if (isTRUE(NA_remove)) {
    all_data <- na.omit(all_data)
  }
  all_data
}

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+2

#function to find out the earliest date that at least one patient was found in a given country
# Find the earliest screening file of a country that contains a positive row.
# country_name: suffix of the data folder, e.g. "X" reads "./countryX".
# Returns characters 8 to 10 of the first file name (in list.files() order)
# in which at least one row has a non-zero marker sum, or NULL when no such
# file exists.
find_first_positive <- function(country_name) {
  file_path <- paste0("./country", country_name)
  # Escaped and anchored so only names ending in ".csv" are picked up.
  file_name_list <- list.files(path = file_path, pattern = "\\.csv$")
  for (file in file_name_list) {
    data_to_check <- read.csv(file.path(file_path, file), sep = ",",
                              header = TRUE)
    if (nrow(data_to_check) == 0) {
      # A header-only file cannot contain a patient.
      next
    }
    # Columns 3 to 12 are summed per row; one vectorised check replaces the
    # row-by-row loop. na.rm = TRUE skips rows whose sum is NA instead of
    # stopping with an error.
    marker_totals <- rowSums(data_to_check[, 3:12])
    if (any(marker_totals != 0, na.rm = TRUE)) {
      return(substr(file, 8, 10))
    }
  }
  NULL
}

#summarize function
# Print summary statistics for the compiled screening data and draw the age
# distribution of all screened rows with the infected rows overlaid.
# data: data frame with "age" and "gender" columns and the marker readings in
#       columns 3 to 12.
# Rows with age >= 100 (or a missing age) are left out of every statistic.
# A row counts as infected when the sum of its marker columns is above zero.
# Side effects: writes the statistics with cat() and prints the histogram.
summarize <- function(data) {
  # which() drops NA ages instead of turning them into all-NA rows.
  allData <- data[which(data$age < 100), ]
  screened <- nrow(allData)
  is_infected <- rowSums(allData[, 3:12]) > 0
  infectedData <- allData[which(is_infected), ]
  # The labels say "Percentage", so the ratios are scaled to 0-100.
  percentInf <- 100 * nrow(infectedData) / screened
  males <- sum(allData$gender == "male", na.rm = TRUE)
  infectedMales <- sum(infectedData$gender == "male", na.rm = TRUE)
  percentMalesInfected <- 100 * infectedMales / males
  females <- sum(allData$gender == "female", na.rm = TRUE)
  infectedFemales <- sum(infectedData$gender == "female", na.rm = TRUE)
  percentFemalesInfected <- 100 * infectedFemales / females
  # fill/alpha passed to ggplot() itself are ignored, so the base histogram
  # keeps the default grey that the printed legend refers to.
  ageHist <- ggplot(data = allData, aes(x = age)) +
    geom_histogram() +
    geom_histogram(data = infectedData, aes(x = age), fill = "red",
                   alpha = 0.2)
  # cat() writes directly and returns NULL; storing and printing its result
  # only added a stray "NULL" to the output.
  cat("Number Screened: ", screened,
      "\nPercentage Infected: ", percentInf,
      "\nNumber of Males: ", males,
      "\nPercentage of Males Infected: ", percentMalesInfected,
      "\nNumber of Females: ", females,
      "\nPercentage of Females Infected: ", percentFemalesInfected,
      "\nAge Distribution Graph (grey is total, red is infected): ", "\n")
  print(ageHist)
}

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+2