d <- read.csv(file.choose()) d$target <- relevel(as.factor(d$target), "uncontacted") #http://rstudio-pubs-static.s3.amazonaws.com/145252_b241fc4c9cc640a185e721694648ad31.html library(caret); library(pROC); library(reshape2); library(ggplot2); library(randomForest) thresh_code <- getModelInfo("rf", regex = FALSE)[[1]] # rf model modified with following parameters # thresh_code$type, thresh_code$parameters, thresh_code$grid, thresh_code$loop # thresh_code$fit, thresh_code$predict, thresh_code$prob thresh_code$type <- c("Classification") #...1 ## Add the threshold as another tuning parameter thresh_code$parameters <- data.frame(parameter = c("mtry", "threshold"), #...2 class = c("numeric", "numeric"), label = c("#Randomly Selected Predictors", "Probability Cutoff")) ## The default tuning grid code: thresh_code$grid <- function(x, y, len = NULL, search = "grid") { #...3 p <- ncol(x) if(search == "grid") { grid <- expand.grid(mtry = floor(sqrt(p)), threshold = seq(.01, .99, length = len)) } else { grid <- expand.grid(mtry = sample(1:p, size = len), threshold = runif(1, 0, size = len)) } grid } ## Here we fit a single random forest model (with a fixed mtry) ## and loop over the threshold values to get predictions from the same ## randomForest model. thresh_code$loop = function(grid) { #...4 library(plyr) loop <- ddply(grid, c("mtry"), function(x) c(threshold = max(x$threshold))) submodels <- vector(mode = "list", length = nrow(loop)) for(i in seq(along = loop$threshold)) { index <- which(grid$mtry == loop$mtry[i]) cuts <- grid[index, "threshold"] submodels[[i]] <- data.frame(threshold = cuts[cuts != loop$threshold[i]]) } list(loop = loop, submodels = submodels) } ## Fit the model independent of the threshold parameter thresh_code$fit = function(x, y, wts, param, lev, last, classProbs, ...) { #...5 if(length(levels(y)) != 2) stop("This works only for 2-class problems") randomForest(x, y, mtry = param$mtry, ...) } ## Now get a probability prediction and use different thresholds to ## get the predicted class thresh_code$predict = function(modelFit, newdata, submodels = NULL) { #...6 class1Prob <- predict(modelFit, newdata, type = "prob")[, modelFit$obsLevels[1]] ## Raise the threshold for class #1 and a higher level of ## evidence is needed to call it class 1 so it should ## decrease sensitivity and increase specificity out <- ifelse(class1Prob >= modelFit$tuneValue$threshold, modelFit$obsLevels[1], modelFit$obsLevels[2]) if(!is.null(submodels)) { tmp2 <- out out <- vector(mode = "list", length = length(submodels$threshold)) out[[1]] <- tmp2 for(i in seq(along = submodels$threshold)) { out[[i+1]] <- ifelse(class1Prob >= submodels$threshold[[i]], modelFit$obsLevels[1], modelFit$obsLevels[2]) } } out } ## The probabilities are always the same but we have to create ## mulitple versions of the probs to evaluate the data across ## thresholds thresh_code$prob = function(modelFit, newdata, submodels = NULL) { #...7 out <- as.data.frame(predict(modelFit, newdata, type = "prob")) if(!is.null(submodels)) { probs <- out out <- vector(mode = "list", length = length(submodels$threshold)+1) out <- lapply(out, function(x) probs) } out } ### for summaryFunction in trControl fourStats <- function (data, lev = levels(data$obs), model = NULL) { ## This code will get use the area under the ROC curve and the ## sensitivity and specificity values using the current candidate ## value of the probability threshold. out <- c(twoClassSummary(data, lev = levels(data$obs), model = NULL)) ## The best possible model has sensitivity of 1 and specificity of 1. ## How far are we from that value? coords <- matrix(c(1, 1, out["Spec"], out["Sens"]), ncol = 2, byrow = TRUE) colnames(coords) <- c("Spec", "Sens") rownames(coords) <- c("Best", "Current") c(out, Dist = dist(coords)[1]) } ################## Modeling with customized RF model ################### set.seed(1) d <- na.omit(d) mod1 <- train(target ~ ., data = d, method = thresh_code, # modified model -- in lieu of "rf" ## Minimize the distance to the perfect model metric = "Dist", maximize = FALSE, tuneLength = 99, ntree = 1000, trControl = trainControl(method = "LOOCV", classProbs = TRUE, summaryFunction = fourStats)) mod1 #save(mod1,file="C:/users/walkerro/Desktop/uncontML/mod1") metrics <- mod1$results[, c(2, 4:6)] metrics <- melt(metrics, id.vars = "threshold", variable.name = "Resampled", value.name = "Data") cutoff <- ggplot(metrics, aes(x = threshold, y = Data, color = Resampled)) + geom_line(size=2) + ylab("") + xlab("cutoff") + theme(legend.title=element_blank()) + theme(legend.position = "right") +scale_color_manual(name="", labels=c("Sensitivity","Specificity","Distance"), values=c("red","green","blue")) cutoff #ggexport(cutoff, height=4, width=5, res=400, filename = "c:/users/walkerro/desktop/cutoff.pdf") varImp(mod1, scale = FALSE)