-
Notifications
You must be signed in to change notification settings - Fork 19
Buck_Neufell_Reisch_Submission #4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
a985bf3
9468f50
6e78481
fd6e5b8
827ee91
b8e3445
3434170
e645cab
30eecc4
fecdc7d
e4a0cb8
50ef654
7c20cea
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| countryX/ | ||
| countryY/ | ||
|
|
Large diffs are not rendered by default.
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,74 @@ | ||
| #This file will contain supporting functions | ||
# Convert a delimited text file into a comma-separated (.csv) file.
#
# Args:
#   filename:    path of the delimited text file to read.
#   delimeter:   field separator used in `filename` (passed to read.table's
#                `sep`); the spelling is kept for existing callers.
#   fileOutname: output path WITHOUT extension; ".csv" is appended.
#   header:      does the first line of `filename` hold column names?
#                Defaults to FALSE, which is what the original code passed
#                implicitly. NOTE(review): if the source .txt files carry a
#                header line, call with header = TRUE, otherwise that line is
#                written out as a data row.
#
# Returns: NULL, invisibly (the value of write.csv); called for its side
# effect of writing `<fileOutname>.csv` without row names.
makeCSV <- function(filename, delimeter, fileOutname, header = FALSE) {
  parsed <- read.table(file = filename, sep = delimeter, header = header)
  write.csv(parsed, file = paste0(fileOutname, ".csv"), row.names = FALSE)
}
|
|
||
# Combine every .csv file of a directory into one data frame, write that data
# frame to `<FileOutName>.csv`, and return it.
#
# Each input file is expected to hold 12 columns (gender, age and ten marker
# columns) with a header row. Two columns are appended: the country code and
# the day of the year, the latter taken from the digits in the file name
# (e.g. "screen_120.csv" -> "120").
#
# Args:
#   directoryname: directory holding the per-day .csv files.
#   country:       country label stored in the `country` column of every row.
#   na_handling:   "none" (default) leaves the data untouched, "remove" drops
#                  rows containing NA, "warn" only reports whether NA values
#                  are present. Any other value is an error.
#   FileOutName:   output path WITHOUT extension; ".csv" is appended.
#
# Returns: the combined data frame (empty, with the 14 expected columns, when
# the directory holds no .csv file).
combineCSVs <- function(directoryname, country, na_handling = "none",
                        FileOutName) {
  # Reject unknown options up front instead of silently doing nothing.
  na_handling <- match.arg(na_handling, c("none", "remove", "warn"))
  column_names <- c("gender", "age", sprintf("marker%02d", 1:10),
                    "country", "dayofYear")

  # Anchored pattern: only names that END in ".csv" are picked up.
  csv_paths <- list.files(directoryname, pattern = "\\.csv$",
                          full.names = TRUE)

  # Read each file once and collect the pieces; binding once at the end avoids
  # re-copying the growing result on every iteration.
  daily_frames <- lapply(csv_paths, function(csv_path) {
    screening <- read.csv(file = csv_path, header = TRUE)
    day <- gsub("[^0-9]", "", basename(csv_path))
    cbind(screening,
          country.vect = rep(country, nrow(screening)),
          day.vect = rep(day, nrow(screening)))
  })

  if (length(daily_frames) == 0) {
    combined <- as.data.frame(matrix(nrow = 0, ncol = length(column_names)))
  } else {
    combined <- do.call(rbind, daily_frames)
  }
  colnames(combined) <- column_names

  # Optional removal of, or report about, NA values in the combined data.
  if (na_handling == "remove") {
    combined <- na.omit(combined)
    print("Any NA Values in data removed")
  } else if (na_handling == "warn") {
    if (any(is.na(combined))) {
      print("Warning : Your Data Contains NA Values")
    } else {
      print("No NA Values Present")
    }
  }

  write.csv(combined, file = paste0(as.character(FileOutName), ".csv"))
  combined
}
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +2 |
||
# Print a short text summary of a compiled screening data set.
#
# Args:
#   dfName: data frame with a `gender` column, a numeric `age` column and the
#           ten marker columns in positions 3 to 12 (the layout produced by
#           combineCSVs). A marker value of 1 means the marker was detected.
#
# Returns: the character vector that was printed, invisibly. Elements 1-7 are
# the title, the number of screens, and the male / female / juvenile / adult /
# senior percentages; element 8 is the percentage of infected patients
# (at least one marker detected).
SummarizeCompiledTests <- function(dfName) {
  screen_number <- nrow(dfName)
  if (screen_number == 0) {
    # Nothing to summarise; avoids dividing by zero below.
    return(print(c("Data Summary:", "There were 0 screening tests run")))
  }

  # Percentage of screens for which a logical flag is TRUE.
  share <- function(flags) sum(flags) / screen_number * 100

  ages <- dfName[, "age"]
  # A patient counts as infected when any of the ten markers equals 1.
  infected_flags <- rowSums(dfName[, 3:12, drop = FALSE] == "1",
                            na.rm = TRUE) > 0

  percentage.male <- share(dfName[, "gender"] == "male")
  # Assumes gender is recorded as either "male" or "female".
  percentage.female <- 100 - percentage.male
  percentage.juvenile <- share(ages < 18)
  percentage.adult <- share(ages >= 18 & ages < 65)
  percentage.senior <- share(ages >= 65)
  infected <- share(infected_flags)

  print(c(
    "Data Summary:",
    paste0("There were ", screen_number, " screening tests run"),
    paste0(round(percentage.male, 1), "% of patients were male"),
    paste0(round(percentage.female, 1), "% of patients were female"),
    paste0(round(percentage.juvenile, 1),
           "% of patients were juveniles (under 18)"),
    paste0(round(percentage.adult, 1), "% of patients were adults (18-65)"),
    paste0(round(percentage.senior, 1),
           "% of patients were seniors (over 65)"),
    paste0(round(infected, 1),
           "% of patients were infected (at least one marker present)")
  ))
}
|
|
||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +2 |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,122 @@ | ||
| #this R file will be used to analyze the patient data | ||
| # with the help of the functions in SupportingFunctions.R | ||
|
|
||
# Project directory on the original author's machine; change for your system.
# The switch only happens when that directory exists, so on any other machine
# the script runs from the current working directory (expected to be the
# project root) instead of aborting in setwd().
project_dir <- "/Users/kevinbuck/Desktop/Biocomputing/Exercises/Rproject"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
source("SupportingFunctions.R")

#### Initial Data Setup ####
# Turn every space-delimited .txt screening file of country Y (days 120-175)
# into a .csv file stored next to it.
for (day in 120:175) {
  screen_base <- file.path("countryY", paste0("screen_", day))
  makeCSV(filename = paste0(screen_base, ".txt"), delimeter = " ",
          fileOutname = screen_base)
}
|
|
||
# Build one data frame per country from its directory of daily .csv files.
# Rows containing NA are dropped, and each call also writes the combined
# table to "CountryX.csv" / "CountryY.csv".
countryX.df <- combineCSVs("countryX/", "X", "remove", "CountryX")
countryY.df <- combineCSVs("countryY/", "Y", "remove", "CountryY")

# Stack both countries into a single data frame and print its summary.
allData.df <- rbind(countryX.df, countryY.df)
SummarizeCompiledTests(allData.df)
|
|
||
|
|
||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +2 |
||
# Check whether a screened patient is infected, i.e. has at least one marker
# present.
#
# Args:
#   row_index:   row of `df` to inspect.
#   df:          screening data frame.
#   marker_cols: columns holding the marker results; defaults to 3:12, the
#                ten marker columns of the combined screening layout.
#
# Returns: TRUE when any marker of the row equals 1, FALSE otherwise. Missing
# marker values are ignored, so the result is never NA and is always safe to
# use inside `if`.
check_infected <- function(row_index, df, marker_cols = 3:12) {
  any(df[row_index, marker_cols] == "1", na.rm = TRUE)
}
|
|
||
#### Tracking Cases Over Time ####

day.list <- as.character(seq(120, 175))

# Decide once per screening whether the patient is infected.
infected_rows <- vapply(seq_len(nrow(allData.df)), check_infected, logical(1),
                        df = allData.df)

# Number of infected patients of one country for every day in day.list.
# Days without any infected patient (or without any screening at all) count
# as 0 because the day is turned into a factor with all days as levels.
count_daily_cases <- function(country_code) {
  case_rows <- which(infected_rows & allData.df[, "country"] == country_code)
  case_days <- factor(allData.df[case_rows, "dayofYear"], levels = day.list)
  as.numeric(table(case_days))
}

# One row per day: the day number and the daily case totals of each country.
Daily_Total_Cases <- data.frame(Day = as.numeric(day.list),
                                CountryX.Total = count_daily_cases("X"),
                                CountryY.Total = count_daily_cases("Y"))
row.names(Daily_Total_Cases) <- day.list
|
|
||
##### Plotting Daily Cases by Day #####

# Country X already has cases at day 120, whereas the line for country Y stays
# at 0 cases until roughly day 138. This means the outbreak began in country X.

library(ggplot2)
ggplot(data = Daily_Total_Cases, mapping = aes(x = Day)) +
  geom_line(aes(y = CountryX.Total, color = "CountryX")) +
  geom_line(aes(y = CountryY.Total, color = "CountryY")) +
  scale_color_manual(name = "Country",
                     values = c("CountryX" = "blue", "CountryY" = "red")) +
  labs(title = "Infections Over Time", x = "Day Number", y = "Daily Cases")
|
|
||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For more concise code: |
||
|
|
||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +4 |
||
# Figuring out which markers cause the disease

# Number of infected patients (at least one marker present) in a country.
count_cases <- function(country_df) {
  sum(vapply(seq_len(nrow(country_df)), check_infected, logical(1),
             df = country_df))
}
total.cases.X <- count_cases(countryX.df)
total.cases.Y <- count_cases(countryY.df)

# For each of the ten markers: number of screens in which the marker was
# detected, divided by the number of infected patients of that country.
marker.columns <- sprintf("marker%02d", 1:10)
marker_frequency <- function(country_df, total_cases) {
  vapply(marker.columns,
         function(marker) sum(country_df[, marker] == "1") / total_cases,
         numeric(1), USE.NAMES = FALSE)
}
x.markers.pct <- marker_frequency(countryX.df, total.cases.X)
y.markers.pct <- marker_frequency(countryY.df, total.cases.Y)
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You should make a for loop for this |
||
markers.list <- as.character(seq(1, 10))

# Long-format table for plotting: rows 1-10 hold country X, rows 11-20 hold
# country Y. `freq` stays numeric so the bar heights are drawn on a continuous
# axis, and `marker` is a factor so the bars are ordered 1, 2, ..., 10 rather
# than alphabetically ("1", "10", "2", ...).
markers.sum.table <- data.frame(
  country = rep(c("X", "Y"), each = length(markers.list)),
  marker = factor(rep(markers.list, times = 2), levels = markers.list),
  freq = c(x.markers.pct, y.markers.pct)
)

# Plotting the markers that cause disease.
# Note how the distribution of markers for country X is skewed towards
# markers 1-5, whereas country Y has a relatively even distribution. A vaccine
# targeted at patients in country Y might therefore address a microsatellite
# that a patient in country X does not carry. Thus, vaccines developed by
# country Y MIGHT work for country X, but it is not certain.
ggplot(data = markers.sum.table, aes(x = marker, y = freq, fill = country)) +
  geom_bar(stat = "identity", position = "dodge", alpha = .75) +
  ggtitle("Relative Frequency of MicroSatellites")
|
|
||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +4 |
||
|
|
||
|
|
||
|
|
||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good commenting and efficient code that uses coding concepts that we covered in class – 3.25 points (max is 4) |
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+2