Merge pull request #72 from trbrick/boruta

Boruta
brandmaier · Jul 4, 2024 · 09638cd · 09638cd
2 parents 5e632ee + 41af790
commit 09638cd
Show file tree

Hide file tree

Showing 8 changed files with 211 additions and 28 deletions.
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,4 @@
 inst/doc
 doc
 Meta
+.DS_Store
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -55,7 +55,7 @@ Encoding: UTF-8
 LazyLoad: yes
 Version: 0.9.19
 Date: 2023-03-03
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 VignetteBuilder: knitr
 BugReports: https://github.com/brandmaier/semtree/issues
 URL: https://github.com/brandmaier/semtree
diff --git a/NAMESPACE b/NAMESPACE
@@ -32,6 +32,7 @@ S3method(strip,semtree)
 S3method(summary,semforest)
 S3method(summary,semtree)
 export(biodiversity)
+export(boruta)
 export(diversityMatrix)
 export(evaluateTree)
 export(fitSubmodels)

diff --git a/R/boruta.R b/R/boruta.R
@@ -1,7 +1,41 @@
+
+#' Run the Boruta algorithm on a sem tree
+#' 
+#' Grows a series of SEM Forests following the boruta algorithm to determine
+#'    feature importance as moderators of the underlying model.
+#' 
+#' 
+#' @aliases boruta plot.boruta print.boruta
+#' @param model A template SEM. Same as in \code{semtree}.
+#' @param data A dataframe to boruta on. Same as in \code{semtree}.
+#' @param control A semforest control object to set forest parameters.
+#' @param predictors An optional list of covariates. See semtree code example.
+#' @param maxRuns Maximum number of boruta search cycles
+#' @param pAdjMethod A value from \link{stats::p.adjust.methods} defining a 
+#'          multiple testing correction method
+#' @param alpha p-value cutoff for decisionmaking. Default .05
+#' @param verbose Verbosity level for boruta processing 
+#'          similar to the same argument in \link{semtree.control} and 
+#'          \link{semforest.control}
+#' @param \dots Optional parameters to undefined subfunctions
+#' @return A vim object with several elements that need work. 
+#'          Of particular note, `$importance` carries mean importance; 
+#'          `$decision` denotes Accepted/Rejected/Tentative;
+#'          `$impHistory` has the entire varimp history; and
+#'          `$details` has exit values for each parameter.
+#' @author Priyanka Paul, Timothy R. Brick, Andreas Brandmaier
+#' @seealso \code{\link{semtree}} \code{\link{semforest}}
+#' 
+#' @keywords tree models multivariate
+#' @export
 boruta <- function(model,
                    data,
                    control = NULL,
                    predictors = NULL,
+                   maxRuns = 30,
+                   pAdjMethod = "none",
+                   alpha = .05,
+                   verbose=FALSE,
                    ...) {
 
 
@@ -20,37 +54,112 @@ boruta <- function(model,
     ui_stop("Unknown model type selected. Use OpenMx or lavaanified lavaan models!");
   }
 
-  # stage 1 - create shadow features
-
-  shadow.ids <- (ncol(data)+1):(ncol(data)+length(covariate.ids))
-
-  for (cur_cov_id in covariate.ids) {
-    # pick column and shuffle
-    temp_column <- data[, cur_cov_id]
-    temp_column <- sample(temp_column, length(temp_column), replace=FALSE)
-    # add to dataset as shadow feature
-    temp_colname <- paste0("shadow_", names(data)[cur_cov_id],collapse="")
-    data[temp_colname] <- temp_column
-    if (!is.null(predictors)) predictors <- c(predictors, temp_colname)
-  }
+  # Checks on x & y from the boruta package
+  if(length(grep('^shadow',covariate.ids)>0)) 
+    stop('Attributes with names starting from "shadow" are reserved for internal use. Please rename them.')
+  if(maxRuns<11)
+    stop('maxRuns must be greater than 10.')
+  if(!pAdjMethod %in% stats::p.adjust.methods) 
+    stop(c('P-value adjustment method not found. Must be one of:', 
+           stats::p.adjust.methods))
 
-  # run the forest
-  forest <- semforest(model, data, control, predictors, ...)
+  # Might clash with some other semtrees stuff
+  if(is.null(predictors)) {
+    predictors <- names(data)[covariate.ids]
+  }
 
-  # run variable importance
-  vim <- varimp(forest)
+  # Initialize and then loop over runs:
+  impHistory <- data.frame(matrix(NA, nrow=0, ncol=length(predictors)+3))
+  names(impHistory) <- c(predictors, "shadowMin", "shadowMean", "shadowMax")
+  decisionList <- data.frame(predictor=predictors, decision = "Tentative",
+                             hitCount = 0, raw.p=NA, adjusted.p=NA)
+
+  # TODO: Parallelize the first five runs.
+  for(runNo in 1:maxRuns) {
+    if(verbose) {
+      message(paste("Beginning Run", runNo))
+    }
+
+    # stage 1 - create shadow features
+    rejected <- decisionList$predictor[decisionList$decision == "Rejected"]
+    current.predictors <- setdiff(predictors, rejected)
+    current.covariate.ids <- setdiff(covariate.ids, names(data) %in% rejected)
+    current.data <- data[, setdiff(names(data), rejected)]
+
+    shadow.ids <- (ncol(current.data)+1):(ncol(current.data)+length(current.covariate.ids))
+
+    for (cur_cov_id in current.covariate.ids) {
+      # pick column and shuffle
+      temp_column <- current.data[, cur_cov_id]
+      temp_column <- sample(temp_column, length(temp_column), replace=FALSE)
+      # add to dataset as shadow feature
+      temp_colname <- paste0("shadow_", names(current.data)[cur_cov_id],collapse="")
+      current.data[temp_colname] <- temp_column
+      if (!is.null(current.predictors)) current.predictors <- c(current.predictors, temp_colname)
+    }
+
+    # TODO: Pre-run model if needed.
+
+    # run the forest
+    forest <- semforest(model, current.data, control, current.predictors, ...)
 
-  # get variable importance from shadow features
-  shadow_names <- names(data)[shadow.ids]
-  agvim <- aggregateVarimp(vim, aggregate="mean")
-  max_shadow_importance <- max(agvim[names(agvim)%in%shadow_names])  
-  agvim_filtered <- agvim[!(names(agvim)%in%shadow_names)]
+    # run variable importance
+    vim <- varimp(forest)
 
-  df<-data.frame(importance=agvim_filtered, predictor=names(agvim_filtered))
+    # get variable importance from shadow features
+    shadow_names <- names(current.data)[shadow.ids]
+    agvim <- aggregateVarimp(vim, aggregate="mean")
+
+    # Compute shadow stats
+    shadow_importances <- agvim[names(agvim)%in%shadow_names]
+    impHistory[runNo, "shadowMax"] <- max_shadow_importance <- max(shadow_importances)  
+    impHistory[runNo, "shadowMin"] <- min(shadow_importances)
+    impHistory[runNo, "shadowMean"] <- mean(shadow_importances)
+    agvim_filtered <- agvim[!(names(agvim)%in%shadow_names)]
+    impHistory[runNo, names(agvim_filtered)] <- agvim_filtered
+
+    # Compute "hits" 
+    hits <- decisionList$predictor %in% names(agvim_filtered[agvim_filtered>max_shadow_importance])
+    decisionList$hitCount[hits] <- decisionList$hitCount[hits] + 1
+
+    # Run tests. 
+    # The biasing here means that there are no decisions without correction 
+    #   before 5 runs and no decisions with Bonferroni before 7 runs.
+
+    # Run confirmation tests (pulled from Boruta package)
+    newPs <- stats::pbinom(decisionList$hitCount-1,runNo,0.5,lower.tail=FALSE)
+    adjPs <- stats::p.adjust(newPs,method=pAdjMethod)
+    acceptable <- adjPs < alpha
+    updateList <- acceptable & decisionList$decision == "Tentative"
+    decisionList$raw.p[updateList] <- newPs[updateList]
+    decisionList$adjusted.p[updateList] <- adjPs[updateList]
+    decisionList$decision[updateList] <- "Confirmed"
+
+    # Run rejection tests (pulled from Boruta package)
+    newPs <- stats::pbinom(decisionList$hitCount,runNo,0.5,lower.tail=TRUE)
+    adjPs <- stats::p.adjust(newPs,method=pAdjMethod)
+    acceptable <- adjPs < alpha
+    updateList <- acceptable & decisionList$decision == "Tentative"
+    decisionList$raw.p[updateList] <- newPs[updateList]
+    decisionList$adjusted.p[updateList] <- adjPs[updateList]
+    decisionList$decision[updateList] <- "Rejected"
+
+    if(!any(decisionList$decision == "Tentative")) {
+      break
+    }
+
+  }
 
-  vim$filter <- agvim_filtered>max_shadow_importance
+  vim$importance <- colMeans(impHistory, na.rm = TRUE)
+  vim$impHistory <- impHistory
+  vim$decisions <- decisionList$decision
+  vim$details <- decisionList
+
+  vim$filter <- decisionList$decision == "Confirmed"  # Turns into hitreg
   vim$boruta <- TRUE
-  vim$boruta_threshold = max_shadow_importance
+
+  # TODO: Loop ends here with some reporting.
 
   return(vim)
 }
+
diff --git a/man/boruta.Rd b/man/boruta.Rd
diff --git a/man/toTable.Rd b/man/toTable.Rd
diff --git a/semtree.Rproj b/semtree.Rproj
@@ -15,3 +15,4 @@ LaTeX: pdfLaTeX
 BuildType: Package
 PackageUseDevtools: Yes
 PackageInstallArgs: --no-multiarch --with-keep.source
+PackageRoxygenize: rd,collate,namespace
diff --git a/tests/borutaOneIndicator.R b/tests/borutaOneIndicator.R
@@ -0,0 +1,32 @@
+# Tests a simple boruta model on a basic one-indicator SEM
+
+library(OpenMx)
+library(semtree)
+
+N <- 1000
+influence <- c(1, 0)
+nPred <- length(influence)
+cutbreaks <- 3
+make_categorical <- TRUE
+
+
+genModel <- mxModel(type="RAM", manifestVars=c("Y", paste0("X", 1:nPred)),
+                    mxPath(paste0("X", 1:nPred), "Y", values=influence),
+                    mxPath(c("Y", paste0("X", 1:nPred)), arrows=2, values=1))
+simpleData <- mxGenerateData(genModel, N)
+if(make_categorical) {
+  for(i in paste0("X", 1:nPred)) {
+    simpleData[[i]] <- cut(simpleData[[i]], cutbreaks)
+    simpleData[[i]] <- mxFactor(simpleData[[i]], levels(simpleData[[i]]))
+  }
+}
+
+testModel <- mxModel("SimplisticModel", type="RAM", manifestVars="Y", 
+                     mxPath("Y", arrows=2, values=1, free=TRUE, labels=c("Var")),
+                     mxPath("one", "Y", values=0, free=TRUE, labels=c("mu")), 
+                     mxData(simpleData, type="raw"))
+
+control <- semforest.control(control=semtree.control(method="score", alpha=1))
+
+output <- boruta(testModel, simpleData, verbose=TRUE, control = control)
+output
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,3 +5,4 @@ @@
     inst/doc
     doc
     Meta
+    .DS_Store