Merge branch 'boruta' into merging_tbrick_boruta

brandmaier · Sep 5, 2024 · a57768d · a57768d
2 parents 985740e + 09638cd
commit a57768d
Show file tree

Hide file tree

Showing 5 changed files with 193 additions and 83 deletions.
diff --git a/.gitignore b/.gitignore
@@ -8,4 +8,6 @@ Meta
 /doc/
 /Meta/
 cran_comments.md
-CRAN*
+CRAN*
+.DS_Store
+
diff --git a/R/boruta.R b/R/boruta.R
@@ -1,31 +1,41 @@
-#' BORUTA algorithm for SEM trees
+#' Run the Boruta algorithm on a sem tree
 #' 
-#' This is an experimental feature. Use cautiously.
+#' Grows a series of SEM Forests following the boruta algorithm to determine
+#'    feature importance as moderators of the underlying model.
 #' 
-#' @aliases boruta
-#' @param model A template model specification from \code{\link{OpenMx}} using
-#' the \code{\link{mxModel}} function (or a \code{\link[lavaan]{lavaan}} model
-#' using the \code{\link[lavaan]{lavaan}} function with option fit=FALSE).
-#' Model must be syntactically correct within the framework chosen, and
-#' converge to a solution.
-#' @param data Data.frame used in the model creation using
-#' \code{\link{mxModel}} or \code{\link[lavaan]{lavaan}} are input here. Order
-#' of modeled variables and predictors is not important when providing a
-#' dataset to \code{semtree}.
-#' @param control \code{\link{semtree}} model specifications from
-#' \code{\link{semtree.control}} are input here. Any changes from the default
-#' setting can be specified here.
-#' @param percentile_threshold Numeric.
-#' @param rounds Numeric. Number of rounds of the BORUTA algorithm.
-#'
-#' @export
 #' 
+#' @aliases boruta plot.boruta print.boruta
+#' @param model A template SEM. Same as in \code{semtree}.
+#' @param data A dataframe to boruta on. Same as in \code{semtree}.
+#' @param control A semforest control object to set forest parameters.
+#' @param predictors An optional list of covariates. See semtree code example.
+#' @param maxRuns Maximum number of boruta search cycles
+#' @param pAdjMethod A value from \link{stats::p.adjust.methods} defining a 
+#'          multiple testing correction method
+#' @param alpha p-value cutoff for decisionmaking. Default .05
+#' @param verbose Verbosity level for boruta processing 
+#'          similar to the same argument in \link{semtree.control} and 
+#'          \link{semforest.control}
+#' @param \dots Optional parameters to undefined subfunctions
+#' @return A vim object with several elements that need work. 
+#'          Of particular note, `$importance` carries mean importance; 
+#'          `$decision` denotes Accepted/Rejected/Tentative;
+#'          `$impHistory` has the entire varimp history; and
+#'          `$details` has exit values for each parameter.
+#' @author Priyanka Paul, Timothy R. Brick, Andreas Brandmaier
+#' @seealso \code{\link{semtree}} \code{\link{semforest}}
+#' 
+#' @keywords tree models multivariate
+#' @export
+
 boruta <- function(model,
                    data,
                    control = NULL,
                    predictors = NULL,
-                   percentile_threshold = 1,
-                   rounds = 1,
+                   maxRuns = 30,
+                   pAdjMethod = "none",
+                   alpha = .05,
+                   verbose=FALSE,
                    ...) {
 
   # initial checks
@@ -84,48 +94,113 @@ boruta <- function(model,
   } else {
     ui_stop("Unknown model type selected. Use OpenMx or lavaanified lavaan models!")
   }
-  model.ids <- tmp[[1]]
-  covariate.ids <- tmp[[2]]
 
-  # stage 1 - create shadow features
-
-  shadow.ids <- (ncol(data) + 1):(ncol(data) + length(covariate.ids))
-
-  for (cur_cov_id in covariate.ids) {
-    for (rep_id in 1:num_shadows) {
-    # pick column and shuffle
-    temp_column <- data[, cur_cov_id]
-    temp_column <- sample(temp_column, length(temp_column), replace = FALSE)
-    # add to dataset as shadow feature
-    temp_colname <- paste0("shadow_", names(data)[cur_cov_id], collapse = "")
-    if (num_shadows>1) temp_colname <- paste0(temp_colname, rep_id, collapse = "")
-    data[temp_colname] <- temp_column
-    if (!is.null(predictors)) predictors <- c(predictors, temp_colname)
-    }
+  # Checks on x & y from the boruta package
+  if(length(grep('^shadow',covariate.ids)>0)) 
+    stop('Attributes with names starting from "shadow" are reserved for internal use. Please rename them.')
+  if(maxRuns<11)
+    stop('maxRuns must be greater than 10.')
+  if(!pAdjMethod %in% stats::p.adjust.methods) 
+    stop(c('P-value adjustment method not found. Must be one of:', 
+           stats::p.adjust.methods))
+
+  # Might clash with some other semtrees stuff
+  if(is.null(predictors)) {
+    predictors <- names(data)[covariate.ids]
   }
+
+  # Initialize and then loop over runs:
+  impHistory <- data.frame(matrix(NA, nrow=0, ncol=length(predictors)+3))
+  names(impHistory) <- c(predictors, "shadowMin", "shadowMean", "shadowMax")
+  decisionList <- data.frame(predictor=predictors, decision = "Tentative",
+                             hitCount = 0, raw.p=NA, adjusted.p=NA)
 
-  # run the forest
-  forest <- semforest(model, data, control, predictors, ...)
-
-  # run variable importance
-  vim <- varimp(forest)
-
-  # get variable importance from shadow features
-  shadow_names <- names(data)[shadow.ids]
-  agvim <- aggregateVarimp(vim, aggregate = "mean")
+  # TODO: Parallelize the first five runs.
+  for(runNo in 1:maxRuns) {
+    if(verbose) {
+      message(paste("Beginning Run", runNo))
+    }
+
+    # stage 1 - create shadow features
+    rejected <- decisionList$predictor[decisionList$decision == "Rejected"]
+    current.predictors <- setdiff(predictors, rejected)
+    current.covariate.ids <- setdiff(covariate.ids, names(data) %in% rejected)
+    current.data <- data[, setdiff(names(data), rejected)]
+
+    shadow.ids <- (ncol(current.data)+1):(ncol(current.data)+length(current.covariate.ids))
+
+    for (cur_cov_id in current.covariate.ids) {
+      # pick column and shuffle
+      temp_column <- current.data[, cur_cov_id]
+      temp_column <- sample(temp_column, length(temp_column), replace=FALSE)
+      # add to dataset as shadow feature
+      temp_colname <- paste0("shadow_", names(current.data)[cur_cov_id],collapse="")
+      current.data[temp_colname] <- temp_column
+      if (!is.null(current.predictors)) current.predictors <- c(current.predictors, temp_colname)
+    }
+
+    # TODO: Pre-run model if needed.
+
+    # run the forest
+    forest <- semforest(model, current.data, control, current.predictors, ...)
 
-  vals <- agvim[names(agvim) %in% shadow_names]
-  #max_shadow_importance <- max(vals)
-  max_shadow_importance <- quantile(vals, percentile_threshold)
+    # run variable importance
+    vim <- varimp(forest)
 
-  agvim_filtered <- agvim[!(names(agvim) %in% shadow_names)]
-
-  df <- data.frame(importance = agvim_filtered, predictor = names(agvim_filtered))
+    # get variable importance from shadow features
+    shadow_names <- names(current.data)[shadow.ids]
+    agvim <- aggregateVarimp(vim, aggregate="mean")
+
+    # Compute shadow stats
+    shadow_importances <- agvim[names(agvim)%in%shadow_names]
+    impHistory[runNo, "shadowMax"] <- max_shadow_importance <- max(shadow_importances)  
+    impHistory[runNo, "shadowMin"] <- min(shadow_importances)
+    impHistory[runNo, "shadowMean"] <- mean(shadow_importances)
+    agvim_filtered <- agvim[!(names(agvim)%in%shadow_names)]
+    impHistory[runNo, names(agvim_filtered)] <- agvim_filtered
+
+    # Compute "hits" 
+    hits <- decisionList$predictor %in% names(agvim_filtered[agvim_filtered>max_shadow_importance])
+    decisionList$hitCount[hits] <- decisionList$hitCount[hits] + 1
+
+    # Run tests. 
+    # The biasing here means that there are no decisions without correction 
+    #   before 5 runs and no decisions with Bonferroni before 7 runs.
+
+    # Run confirmation tests (pulled from Boruta package)
+    newPs <- stats::pbinom(decisionList$hitCount-1,runNo,0.5,lower.tail=FALSE)
+    adjPs <- stats::p.adjust(newPs,method=pAdjMethod)
+    acceptable <- adjPs < alpha
+    updateList <- acceptable & decisionList$decision == "Tentative"
+    decisionList$raw.p[updateList] <- newPs[updateList]
+    decisionList$adjusted.p[updateList] <- adjPs[updateList]
+    decisionList$decision[updateList] <- "Confirmed"
+
+    # Run rejection tests (pulled from Boruta package)
+    newPs <- stats::pbinom(decisionList$hitCount,runNo,0.5,lower.tail=TRUE)
+    adjPs <- stats::p.adjust(newPs,method=pAdjMethod)
+    acceptable <- adjPs < alpha
+    updateList <- acceptable & decisionList$decision == "Tentative"
+    decisionList$raw.p[updateList] <- newPs[updateList]
+    decisionList$adjusted.p[updateList] <- adjPs[updateList]
+    decisionList$decision[updateList] <- "Rejected"
+
+    if(!any(decisionList$decision == "Tentative")) {
+      break
+    }
+
+  }
+
+  vim$importance <- colMeans(impHistory, na.rm = TRUE)
+  vim$impHistory <- impHistory
+  vim$decisions <- decisionList$decision
+  vim$details <- decisionList
 
-  vim$filter <- agvim_filtered > max_shadow_importance
+  vim$filter <- decisionList$decision == "Confirmed"  # Turns into hitreg
   vim$boruta <- TRUE
-  vim$boruta_threshold <- max_shadow_importance
-  vim$percentile_threshold <- percentile_threshold
-
+
+  # TODO: Loop ends here with some reporting.
+  
   return(vim)
 }
+
diff --git a/man/boruta.Rd b/man/boruta.Rd
diff --git a/semtree.Rproj b/semtree.Rproj
@@ -15,3 +15,4 @@ LaTeX: pdfLaTeX
 BuildType: Package
 PackageUseDevtools: Yes
 PackageInstallArgs: --no-multiarch --with-keep.source
+PackageRoxygenize: rd,collate,namespace
diff --git a/tests/borutaOneIndicator.R b/tests/borutaOneIndicator.R
@@ -0,0 +1,32 @@
+# Tests a simple boruta model on a basic one-indicator SEM
+
+library(OpenMx)
+library(semtree)
+
+N <- 1000
+influence <- c(1, 0)
+nPred <- length(influence)
+cutbreaks <- 3
+make_categorical <- TRUE
+
+
+genModel <- mxModel(type="RAM", manifestVars=c("Y", paste0("X", 1:nPred)),
+                    mxPath(paste0("X", 1:nPred), "Y", values=influence),
+                    mxPath(c("Y", paste0("X", 1:nPred)), arrows=2, values=1))
+simpleData <- mxGenerateData(genModel, N)
+if(make_categorical) {
+  for(i in paste0("X", 1:nPred)) {
+    simpleData[[i]] <- cut(simpleData[[i]], cutbreaks)
+    simpleData[[i]] <- mxFactor(simpleData[[i]], levels(simpleData[[i]]))
+  }
+}
+
+testModel <- mxModel("SimplisticModel", type="RAM", manifestVars="Y", 
+                     mxPath("Y", arrows=2, values=1, free=TRUE, labels=c("Var")),
+                     mxPath("one", "Y", values=0, free=TRUE, labels=c("mu")), 
+                     mxData(simpleData, type="raw"))
+
+control <- semforest.control(control=semtree.control(method="score", alpha=1))
+
+output <- boruta(testModel, simpleData, verbose=TRUE, control = control)
+output
-Original file line number
+Diff line change
@@ Expand Up / @@ -8,4 +8,6 @@ Meta @@
     /doc/
     /Meta/
     cran_comments.md
-    CRAN*
+    CRAN*
+    .DS_Store