qtran4 · treisch1 · Dec 5, 2022 · Dec 5, 2022 · Dec 5, 2022 · Dec 7, 2022
diff --git a/.Rhistory b/.Rhistory
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+countryX/
+countryY/
+
diff --git a/CountryX.csv b/CountryX.csv
diff --git a/CountryY.csv b/CountryY.csv
diff --git a/InfectionsPlot.png b/InfectionsPlot.png
diff --git a/MarkerPlot.png b/MarkerPlot.png
diff --git a/SupportingFunctions.R b/SupportingFunctions.R
@@ -0,0 +1,74 @@
+# This file contains the supporting functions used by analysis.R
+# Convert a delimited text file into a CSV file.
+#
+# Arguments:
+#   filename    - path of the delimited text file to read
+#   delimeter   - field separator of `filename`, passed to read.table() as
+#                 `sep` (the misspelled name is kept: callers pass it by name)
+#   fileOutname - output path WITHOUT extension; ".csv" is appended
+#   ...         - further arguments forwarded to read.table(), e.g.
+#                 header = TRUE when the first line holds the column names
+#
+# Called for its side effect of writing "<fileOutname>.csv" without row names.
+makeCSV <- function(filename, delimeter, fileOutname, ...) {
+  tempTable <- read.table(file = filename, sep = delimeter, ...)
+  write.csv(tempTable, file = paste0(fileOutname, ".csv"), row.names = FALSE)
+}
+
+# Combine every screening CSV in a directory into one data frame, write the
+# combined data to "<FileOutName>.csv" and return the data frame.
+#
+# Arguments:
+#   directoryname - directory holding the CSV files (with or without a
+#                   trailing slash)
+#   country       - label stored in the "country" column of every row
+#   na_handling   - "remove" drops rows containing NA, "warn" prints whether
+#                   NA values are present, any other value leaves the data
+#                   untouched
+#   FileOutName   - output path WITHOUT extension; ".csv" is appended
+#
+# Every input file must contribute 12 columns (gender, age, marker01-10). The
+# day of year is taken from the digits found in the file name and is stored
+# as a character string in the "dayofYear" column.
+combineCSVs <- function(directoryname, country, na_handling = "none", FileOutName) {
+  colsList <- c("gender", "age", "marker01", "marker02", "marker03", "marker04",
+                "marker05", "marker06", "marker07", "marker08", "marker09",
+                "marker10", "country", "dayofYear")
+  # anchored, escaped pattern: an unescaped "." is a regex wildcard, so
+  # ".csv" would also match names such as "xcsv_notes.txt" or "data.csv.bak"
+  csv.files.list <- list.files(directoryname, pattern = "\\.csv$",
+                               full.names = TRUE)
+  if (length(csv.files.list) == 0) {
+    warning("No .csv files found in ", directoryname, call. = FALSE)
+    bigMatrix <- as.data.frame(matrix(nrow = 0, ncol = length(colsList)))
+  } else {
+    per.file <- lapply(csv.files.list, function(csv.file) {
+      tempDF <- read.csv(file = csv.file, header = TRUE)
+      # keeping only the digits of the file name leaves the day number
+      day <- gsub("[^0-9]", "", basename(csv.file))
+      # `times` makes the vectors match the row count, so files without any
+      # data rows are handled as well
+      tempDF$country.vect <- rep(country, times = nrow(tempDF))
+      tempDF$day.vect <- rep(day, times = nrow(tempDF))
+      tempDF
+    })
+    # a single rbind instead of growing the data frame inside a loop
+    bigMatrix <- do.call(rbind, per.file)
+  }
+  if (ncol(bigMatrix) != length(colsList)) {
+    stop("Expected ", length(colsList) - 2, " columns per file in ",
+         directoryname, " but found ", ncol(bigMatrix) - 2, call. = FALSE)
+  }
+  # renaming columns of the final df
+  colnames(bigMatrix) <- colsList
+  # optional removal of, or report about, NA values present in the data
+  if (na_handling == "remove") {
+    bigMatrix <- na.omit(bigMatrix)
+    print("Any NA Values in data removed")
+  }
+  if (na_handling == "warn") {
+    if (any(is.na(bigMatrix))) {
+      print("Warning : Your Data Contains NA Values")
+    } else {
+      print("No NA Values Present")
+    }
+  }
+  # row names are left out (as in makeCSV) so that the file holds only the
+  # 14 data columns
+  write.csv(bigMatrix, file = paste0(as.character(FileOutName), ".csv"),
+            row.names = FALSE)
+  return(bigMatrix)
+}
+# Print a summary of a compiled screening data set: number of screens, share
+# of infected patients, gender split and age-group split.
+#
+# Arguments:
+#   dfName - data frame with "gender" and "age" columns and the ten marker
+#            columns in positions 3 to 12 (the layout produced by combineCSVs)
+#
+# A patient counts as infected when at least one marker column equals 1.
+# Returns (invisibly, via print) the character vector of summary lines.
+SummarizeCompiledTests <- function(dfName) {
+  screen_number <- nrow(dfName)
+  if (screen_number == 0) {
+    stop("dfName contains no screening records to summarize", call. = FALSE)
+  }
+  # one logical row per patient, TRUE where a marker is present
+  marker.present <- dfName[, 3:12, drop = FALSE] == "1"
+  running_infected_total <- sum(rowSums(marker.present, na.rm = TRUE) > 0)
+  age <- dfName[, "age"]
+  percentage.male <- sum(dfName[, "gender"] == "male", na.rm = TRUE) /
+    screen_number * 100
+  # NOTE(review): treats every patient not recorded as "male" as female
+  percentage.female <- 100 - percentage.male
+  percentage.juvenile <- sum(age < 18, na.rm = TRUE) / screen_number * 100
+  percentage.adult <- sum(age >= 18 & age < 65, na.rm = TRUE) /
+    screen_number * 100
+  percentage.senior <- sum(age >= 65, na.rm = TRUE) / screen_number * 100
+  infected <- running_infected_total / screen_number * 100
+  print(c("Data Summary:",
+          paste0("There were ", screen_number, " screening tests run"),
+          # percentage of males out of all screened
+          paste0(round(percentage.male, 1), "% of patients were male"),
+          # percentage of females out of all screened
+          paste0(round(percentage.female, 1), "% of patients were female"),
+          # percentage of juveniles out of all screened
+          paste0(round(percentage.juvenile, 1),
+                 "% of patients were juveniles (under 18)"),
+          # percentage of adults out of all screened
+          paste0(round(percentage.adult, 1),
+                 "% of patients were adults (18-64)"),
+          # percentage of senior citizens out of all screened
+          paste0(round(percentage.senior, 1),
+                 "% of patients were seniors (65 and over)"),
+          # percentage of infected patients out of all screened
+          paste0(round(infected, 1),
+                 "% of patients were infected (at least one marker present)")))
+}
+
diff --git a/analysis.R b/analysis.R
@@ -0,0 +1,122 @@
+#this R file will be used to analyze the patient data
+# with the help of the functions in SupportingFunctions.R
+
+# Setting the working directory to the project folder. The path below only
+# exists on one machine, so it is applied only when present; otherwise the
+# script must be started from the project root (the folder that contains
+# SupportingFunctions.R and the countryX/ and countryY/ directories).
+project.dir <- "/Users/kevinbuck/Desktop/Biocomputing/Exercises/Rproject"
+if (dir.exists(project.dir)) {
+  setwd(project.dir)
+}
+if (!file.exists("SupportingFunctions.R")) {
+  stop("SupportingFunctions.R not found in ", getwd(),
+       "; run this script from the project root", call. = FALSE)
+}
+source("SupportingFunctions.R")
+#### Initial Data Setup ####
+# turning all the txt files for Country Y (days 120 to 175) into CSV files
+for (i in 120:175) {
+  makeCSV(filename = paste0("countryY/screen_", i, ".txt"), delimeter = " ",
+          fileOutname = paste0("countryY/screen_", i))
+}
+
+# making giant data frame of screening data from both countries; rows that
+# contain NA values are dropped
+countryX.df <- combineCSVs(directoryname = "countryX/", country = "X",
+                           na_handling = "remove", FileOutName = "CountryX")
+countryY.df <- combineCSVs(directoryname = "countryY/", country = "Y",
+                           na_handling = "remove", FileOutName = "CountryY")
+
+allData.df <- rbind(countryX.df, countryY.df)
+
+# summarize data
+SummarizeCompiledTests(allData.df)
+
+
+# Report whether the screened patient in row `row_index` of `df` is infected,
+# i.e. whether at least one of the marker columns (columns 3 to 12) equals 1.
+check_infected <- function(row_index, df) {
+  marker_hits <- as.numeric(df[row_index, 3:12] == "1")
+  positive_count <- sum(marker_hits)
+  return(positive_count != 0)
+}
+
+#### Tracking Cases Over Time ####
+
+day.list <- as.character(seq(120, 175))
+
+# a screened patient is infected when at least one marker column (3 to 12)
+# equals 1
+infected.rows <- rowSums(allData.df[, 3:12] == "1", na.rm = TRUE) > 0
+
+# Tabulating the infected patients by day and country in a single pass. The
+# fixed factor levels keep days without any infected (or any screened)
+# patient in the table with a count of 0, which a row-by-row loop over
+# 1:nrow() cannot handle.
+infected.day <- factor(allData.df[infected.rows, "dayofYear"],
+                       levels = day.list)
+infected.country <- factor(allData.df[infected.rows, "country"],
+                           levels = c("X", "Y"))
+cases.by.day <- table(infected.day, infected.country)
+
+# data frame storing the number of new cases per day for each country
+Daily_Total_Cases <- data.frame(
+  Day = as.numeric(day.list),
+  CountryX.Total = as.numeric(cases.by.day[, "X"]),
+  CountryY.Total = as.numeric(cases.by.day[, "Y"])
+)
+row.names(Daily_Total_Cases) <- day.list
+
+##### Plotting Daily Cases by Day #####
+
+# Country X reports cases from day 120 onwards, while the line for country Y
+# stays at 0 cases until roughly day 138. This means the outbreak began in
+# country X.
+
+library(ggplot2)
+ggplot(data = Daily_Total_Cases, mapping = aes(x = Day)) +
+  geom_line(mapping = aes(y = CountryX.Total, color = "CountryX")) +
+  geom_line(mapping = aes(y = CountryY.Total, color = "CountryY")) +
+  scale_color_manual(name = "Country",
+                     values = c("CountryX" = "blue", "CountryY" = "red")) +
+  labs(title = "Infections Over Time", x = "Day Number", y = "Daily Cases")
+
+
+#### Figuring out which markers cause the disease ####
+marker.cols <- sprintf("marker%02d", 1:10)
+
+# number of infected patients (at least one marker present) in each country
+total.cases.X <- sum(rowSums(countryX.df[, marker.cols] == "1") > 0)
+total.cases.Y <- sum(rowSums(countryY.df[, marker.cols] == "1") > 0)
+
+# for every marker: patients carrying it divided by the infected patients of
+# that country
+x.markers.pct <- unname(colSums(countryX.df[, marker.cols] == "1")) /
+  total.cases.X
+y.markers.pct <- unname(colSums(countryY.df[, marker.cols] == "1")) /
+  total.cases.Y
+
+markers.list <- as.character(seq(1, 10))
+# Long-format table for plotting. `freq` has to stay numeric and `marker` is
+# a factor with explicit levels: filling the table row by row with
+# c("X", "1", 0.5) turns every column into character, which gives a discrete
+# y axis and the marker order "1", "10", "2", ...
+markers.sum.table <- data.frame(
+  country = rep(c("X", "Y"), each = length(markers.list)),
+  marker = factor(rep(markers.list, times = 2), levels = markers.list),
+  freq = c(x.markers.pct, y.markers.pct)
+)
+
+# plotting the markers that cause disease
+# Note how the distribution of markers for country X is skewed towards
+# markers 1-5, whereas country Y has a relatively even distribution. This
+# means that a vaccine targeted at patients in country Y might be based upon
+# a microsatellite that a patient in country X might not have present. Thus,
+# vaccines developed by country Y MIGHT work for country X, but it is not
+# certain.
+ggplot(data = markers.sum.table, aes(x = marker, y = freq, fill = country)) +
+  geom_bar(stat = "identity", position = "dodge", alpha = .75) +
+  ggtitle("Relative Frequency of MicroSatellites")
+
+
+
+