-
Notifications
You must be signed in to change notification settings - Fork 19
submission -- Jiang and Keenan #3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,40 @@ | ||
| #load packages and custom functions | ||
| library(ggplot2) | ||
| library(tidyverse) | ||
| source("supportingFunctions.R") | ||
|
|
||
#set work directory
#please set your own work directory before use
setwd("D:/Notre Dame/Class/2022 Fall/Introduction to Biocomputing/Tutorial/Rproject-main/Rproject-main")

#convert txt files into csv files
#only countryY is converted here; presumably countryX already ships as csv
#(TODO confirm against the data folders)
txt_convert("./countryY")

#compile all data
#NAs are kept and not reported, so allData.csv holds every screened row
all_data <- compile(c("X", "Y"), NA_remove = FALSE, NA_warning = FALSE)
#FALSE is spelled out because the shorthand F is an ordinary, reassignable name
write.csv(all_data, file = "allData.csv", row.names = FALSE)
|
|
||
#answer the questions
#1.In which country (X or Y) did the disease outbreak likely begin?
#find_first_positive (defined in supportingFunctions.R) gives, for one country,
#the day of the first file that contains a positive patient
invisible(lapply(c("X", "Y"), function(code) print(find_first_positive(code))))
#Answer: From the outputs above, we know that the first patient in country X was found earlier than that of country Y, so the disease began in country X.
#The same conclusion can be read off a graph of infected patients per day.
#load the data in allData.csv
allData <- read.csv("allData.csv")
#columns 3 to 12 hold the ten markers; a row sum above zero marks an infected patient
allData$infectedSums <- rowSums(allData[, 3:12])
allData$infected <- as.numeric(allData$infectedSums > 0)
infectedData <- allData[allData$infected == 1, ]
#one bar per day and country, drawn side by side
ggplot(data = infectedData, mapping = aes(x = dayofYear, fill = country)) +
  geom_bar(position = "dodge")
|
|
||
| #2.If Country Y develops a vaccine for the disease, is it likely to work for citizens of Country X? | ||
#generate plots
#reshape to one row per patient and marker so positives can be counted per marker
allData_long <- pivot_longer(
  allData,
  cols = marker01:marker10,
  names_to = "marker",
  values_to = "value"
)
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It is not advised to use any functions that were not taught in the class.
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. -0.5 |
||
#keep only the positive marker calls, then count them per marker and country
allData_long <- allData_long[allData_long$value == 1, ]
ggplot(data = allData_long, mapping = aes(x = marker, fill = country)) +
  geom_bar(position = "dodge")
| #Answer: The markers detected in country X and country Y are not highly consistent, so a vaccine developed in country Y is not likely to work in country X. | ||
|
|
||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +4 |
||
| #summarize all data | ||
| #The function "summarize" is defined in supportingFunctions.R | ||
| summarize(allData) | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good commenting and efficient code that uses coding concepts that we covered in class – 3 points (max is 4) |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,118 @@ | ||
| ##supporting functions for analysis | ||
|
|
||
| #Text file format conversion function | ||
| #This function converts all txt files in a given directory to csv files | ||
| #usage: txt_convert("./countryX") | ||
#first define a function to convert a single txt file
#file_name: path of a space-delimited text file with a header row
#The csv is written next to the input. Only the trailing ".txt" extension is
#replaced, so "txt" elsewhere in the path (e.g. in a folder name) is untouched.
#If the input has no ".txt" extension, ".csv" is appended instead, so the
#input file is never overwritten.
single_txt_convert <- function(file_name) {
  data <- read.table(file_name, sep = " ", header = TRUE)
  new_file_name <- sub("\\.txt$", ".csv", file_name)
  if (identical(new_file_name, file_name)) {
    new_file_name <- paste0(file_name, ".csv")
  }
  write.csv(data, file = new_file_name, row.names = FALSE)
}
#then use the function above to define the function for converting multiple files
#file_path: directory to scan (not recursive)
#Every file whose name ends in ".txt" is passed to single_txt_convert.
#The pattern is an anchored regular expression: the dot is escaped and "$"
#pins it to the end, so names that merely contain "txt" are skipped.
txt_convert <- function(file_path) {
  txt_files <- list.files(
    path = file_path,
    pattern = "\\.txt$",
    full.names = TRUE
  )
  for (txt_file in txt_files) {
    single_txt_convert(txt_file)
  }
}
|
|
||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +2 |
||
| #Compilation function | ||
| #This function compiles all data of a given list of countries. Users should input a vector containing the name(s) of one or multiple countries. | ||
| #usage: all_data <- compile(c("X","Y"), NA_remove = FALSE, NA_warning = FALSE) | ||
| #requirements: the folders containing data from different countries should be put in the working directory, and they should be named like "countryX" | ||
#define a function to add columns of country and date to a dataframe
#dataframe: the data to extend
#country, date: values repeated on every row
#Returns dataframe with two extra columns, "country" and "dayofYear", appended
#on the right. The two columns are built in one data.frame call, as the
#review of this function suggested, instead of several temporaries.
add_col <- function(dataframe, country, date) {
  n_rows <- nrow(dataframe)
  extra_cols <- data.frame(
    country = rep(country, times = n_rows),
    dayofYear = rep(date, times = n_rows)
  )
  cbind(dataframe, extra_cols)
}
#define a function to compile files of a single country
#country_name: suffix of the data folder, e.g. "X" reads from "./countryX"
#Returns one data frame holding the rows of every csv file in that folder,
#each extended by add_col with the country and the day taken from the file
#name. With no csv files it returns an empty 14-column data frame.
compile_single_country <- function(country_name) {
  #empty frame that fixes the column layout when there is nothing to read
  template <- data.frame(matrix(ncol = 14, nrow = 0))
  colnames(template) <- c(
    "gender", "age", sprintf("marker%02d", 1:10), "country", "dayofYear"
  )
  #get file names; the regex is anchored so only a real ".csv" suffix matches
  file_path <- paste0("./country", country_name)
  file_name_list <- list.files(path = file_path, pattern = "\\.csv$")
  #read every file first and bind once, rather than growing a frame per file
  per_file <- lapply(file_name_list, function(file) {
    single_file_data <- read.csv(
      file.path(file_path, file),
      sep = ",",
      header = TRUE
    )
    #characters 8-10 of the file name are used as the day of year
    #(assumes names shaped like "screen_120.csv" - TODO confirm)
    add_col(single_file_data, country_name, substr(file, 8, 10))
  })
  do.call(rbind, c(list(template), per_file))
}
#define a function to compile all files
#country_name_list: character vector of country suffixes, e.g. c("X", "Y")
#NA_remove: when TRUE, rows containing any NA are dropped from the result
#NA_warning: when TRUE, the number of NA cells is printed
#Returns one data frame with the rows of all listed countries.
compile <- function(country_name_list, NA_remove = FALSE, NA_warning = FALSE) {
  #empty frame that fixes the column layout when no country is given
  #sprintf zero-pads the marker number, so the tenth name is "marker10";
  #pasting "marker0" in front of 1:10 would give "marker010"
  all_data <- data.frame(matrix(ncol = 14, nrow = 0))
  colnames(all_data) <- c(
    "gender", "age", sprintf("marker%02d", 1:10), "country", "dayofYear"
  )
  #compile files: read each country, then bind everything in a single call
  #unname() keeps names on the input vector from leaking into the row names
  per_country <- unname(lapply(country_name_list, compile_single_country))
  all_data <- do.call(rbind, c(list(all_data), per_country))
  #deal with NAs; isTRUE() treats anything other than a single TRUE as "off"
  if (isTRUE(NA_warning)) {
    nNA <- sum(is.na(all_data))
    output <- paste(
      nNA, "NA(s) found in", nrow(all_data) * ncol(all_data), "data",
      sep = " "
    )
    print(output)
  }
  if (isTRUE(NA_remove)) {
    all_data <- na.omit(all_data)
  }
  #return the final result
  all_data
}
|
|
||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +2 |
||
#function to find out the earliest date that at least one patient was found in a given country
#country_name: suffix of the data folder, e.g. "X" reads from "./countryX"
#Files are visited in the order list.files returns them (sorted by name), and
#the day of the first file with a positive patient is returned as a string.
#This equals the earliest day as long as the day numbers in the file names
#sort alphabetically in date order (TODO confirm they are zero-padded).
#Returns NULL invisibly when no file contains a positive patient.
find_first_positive <- function(country_name) {
  file_path <- paste0("./country", country_name)
  #anchored regex: only names that really end in ".csv"
  file_name_list <- list.files(path = file_path, pattern = "\\.csv$")
  for (file in file_name_list) {
    data_to_check <- read.csv(
      file.path(file_path, file),
      sep = ",",
      header = TRUE
    )
    #columns 3 to 12 hold the markers; any non-zero row sum is a positive.
    #One vectorized test replaces the per-row loop, so a header-only file is
    #skipped instead of failing, and missing marker values count as zero.
    marker_totals <- rowSums(data_to_check[, 3:12, drop = FALSE], na.rm = TRUE)
    if (any(marker_totals != 0)) {
      #characters 8-10 of the file name are used as the day of year
      return(substr(file, 8, 10))
    }
  }
  invisible(NULL)
}
|
|
||
#summarize function
#data: compiled screening data with "age" and "gender" columns and the ten
#      marker columns in positions 3 to 12
#Prints the number screened, the share infected, and counts and infection
#shares by gender, then draws an age histogram of everyone screened with the
#infected patients overlaid in red.
#Only rows with a recorded age below 100 are summarised.
#The value of the final print(ageHist) call is returned.
summarize <- function(data) {
  #rows with a missing age are dropped explicitly; a bare logical index would
  #turn each of them into an all-NA row that still gets counted
  allData <- data[!is.na(data$age) & data$age < 100, ]
  screened <- nrow(allData)
  #a patient is infected when any of the ten markers is non-zero; which()
  #leaves out rows whose marker sum is NA
  infectedRows <- which(rowSums(allData[, 3:12]) > 0)
  infectedData <- allData[infectedRows, ]
  percentInf <- nrow(infectedData) / screened
  males <- sum(allData$gender == "male", na.rm = TRUE)
  infectedMales <- sum(infectedData$gender == "male", na.rm = TRUE)
  percentMalesInfected <- infectedMales / males
  females <- sum(allData$gender == "female", na.rm = TRUE)
  infectedFemales <- sum(infectedData$gender == "female", na.rm = TRUE)
  percentFemalesInfected <- infectedFemales / females
  #the base layer keeps the default grey fill that the printed legend text
  #refers to; fill and alpha given to ggplot() itself had no effect
  ageHist <- ggplot(data = allData, aes(x = age)) +
    geom_histogram() +
    geom_histogram(data = infectedData, aes(x = age), fill = "red", alpha = 0.2)
  #cat() writes the text itself and returns NULL, so its result is not printed
  #again; the last argument ends the line before the plot is drawn
  cat("Number Screened: ", screened, "\nPercentage Infected: ", percentInf, "\nNumber of Males: ", males, "\nPercentage of Males Infected: ", percentMalesInfected, "\nNumber of Females: ", females, "\nPercentage of Females Infected: ", percentFemalesInfected, "\nAge Distribution Graph (grey is total, red is infected): ", "\n")
  print(ageHist)
}
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +2 |
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+4