SVRBOD/Script 1 at master · xcmarvin/SVRBOD · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
library(e1071)
library(caret)
library(hydroGOF)

# Data import and normalisation, and model tuning -------------------------
fullData <- read.csv(filename, header = TRUE)
minY <- min(fullData$y)
maxY <- max(fullData$y)
preFullData <- preProcess(fullData, method = "range")
newFullData <- predict(preFullData, fullData)
tuneResult <- tune(svm, y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 + x11 + x12 + x13 + x14 + x15 + x16 + x17 + x18 + x19 + x20 + x21 + x22 + x23 + x24 + x25 + x26 + x27 + x28 + x29 + x30 + x31 + x32 + x33 + x34 + x35 + x36 + x37 + x38 + x39 + x40, data = newFullData, scale = FALSE, ranges = list(epsilon = seq(0,1,0.01), cost = 2^(0:4), gamma = 2^(-5:-1)))
print(tuneResult) #the optimal model parameters are cost = 4, epsilon = 0.04, gamma = 0.03


# 10-fold cross validation ------------------------------------------------
cvData <- newFullData[sample(nrow(fullData)), ]
folds <- cut(seq(1, nrow(cvData)), breaks = 10, labels = FALSE)
for (i in 1:10) {
  testIndices <- which(folds == i, arr.ind = TRUE)
  testData <- cvData[testIndices, ]
  trainData <- cvData[-testIndices, ]
  model <- svm(y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 + x11 + x12 + x13 + x14 + x15 + x16 + x17 + x18 + x19 + x20 + x21 + x22 + x23 + x24 + x25 + x26 + x27 + x28 + x29 + x30 + x31 + x32 + x33 + x34 + x35 + x36 + x37 + x38 + x39 + x40, trainData, cost = 4, epsilon = 0.04, gamma = 0.03, scale = FALSE)
  predictY <- predict(model, testData)
  actualY <- testData$y * (maxY - minY) + minY
  predictY <- predictY * (maxY - minY) + minY
  error <- predictY - actualY
  if (i == 1) {
    cvActualY <- actualY
    cvPredictY <- predictY
    cvError <- error
  } else {
    cvActualY <- c(cvActualY, actualY)
    cvPredictY <- c(cvPredictY, predictY)
    cvError <- c(cvError, error)
  }
}
output <- data.frame(cvActualY, cvPredictY, cvError)

# Finding the extreme high BOD cutoff value -------------------------------
output <- output[order(output$cvActualY), ]
cvErrorCopy <- rep(NA, 658)
for (i in 1:length(output$cvError)) {
  if(output$cvError[[i]] > 0) {
    cvErrorCopy[i] <- 1
  } else {
    cvErrorCopy[i] <- -1
  }
}
diff <- 0
for (i in 1:657) {
  tempDiff <- mean(cvErrorCopy[1:i]) - mean(cvErrorCopy[(i + 1):658])
  if(tempDiff > diff) {
    diff <- tempDiff
    cutoff <- output$cvActualY[[i]] #the cutoff value is 150
  }
}

# Store 10-fold cross validation results in a csv file --------------------
lowOutput <- subset(output, cvActualY < 150)
highOutput <- subset(output, cvActualY >= 150)
mae <- c(mean(abs(output$cvError)), mean(abs(lowOutput$cvError)), mean(abs(highOutput$cvError)))
bias <- c(mean(output$cvError), mean(lowOutput$cvError), mean(highOutput$cvError))
r2 <- c((cor(output)[2,1])^2, (cor(lowOutput)[2,1])^2, (cor(highOutput)[2,1])^2)
nse <- c(NSE(output$cvPredictY, output$cvActualY), NSE(lowOutput$cvPredictY, lowOutput$cvActualY), NSE(highOutput$cvPredictY, highOutput$cvActualY))
cvResults <- data.frame(mae, bias, r2, nse)
rownames(cvResults) <- c("overall", "low", "high")
colnames(cvResults) <- c("mae", "bias", "r2", "nse")
write.table(cvResults, file = filename, sep = ",", row.names = FALSE)