Skip to content

Commit

Permalink
Merge branch 'boruta' into merging_tbrick_boruta
Browse files Browse the repository at this point in the history
  • Loading branch information
brandmaier authored Sep 5, 2024
2 parents 985740e + 09638cd commit a57768d
Show file tree
Hide file tree
Showing 5 changed files with 193 additions and 83 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,6 @@ Meta
/doc/
/Meta/
cran_comments.md
CRAN*
CRAN*
.DS_Store

189 changes: 132 additions & 57 deletions R/boruta.R
Original file line number Diff line number Diff line change
@@ -1,31 +1,41 @@
#' BORUTA algorithm for SEM trees
#' Run the Boruta algorithm on a sem tree
#'
#' This is an experimental feature. Use cautiously.
#' Grows a series of SEM Forests following the boruta algorithm to determine
#' feature importance as moderators of the underlying model.
#'
#' @aliases boruta
#' @param model A template model specification from \code{\link{OpenMx}} using
#' the \code{\link{mxModel}} function (or a \code{\link[lavaan]{lavaan}} model
#' using the \code{\link[lavaan]{lavaan}} function with option fit=FALSE).
#' Model must be syntactically correct within the framework chosen, and
#' converge to a solution.
#' @param data Data.frame used in the model creation using
#' \code{\link{mxModel}} or \code{\link[lavaan]{lavaan}} are input here. Order
#' of modeled variables and predictors is not important when providing a
#' dataset to \code{semtree}.
#' @param control \code{\link{semtree}} model specifications from
#' \code{\link{semtree.control}} are input here. Any changes from the default
#' setting can be specified here.
#' @param percentile_threshold Numeric.
#' @param rounds Numeric. Number of rounds of the BORUTA algorithm.
#'
#' @export
#'
#' @aliases boruta plot.boruta print.boruta
#' @param model A template SEM. Same as in \code{semtree}.
#' @param data A dataframe to boruta on. Same as in \code{semtree}.
#' @param control A semforest control object to set forest parameters.
#' @param predictors An optional list of covariates. See semtree code example.
#' @param maxRuns Maximum number of boruta search cycles
#' @param pAdjMethod A value from \link{stats::p.adjust.methods} defining a
#' multiple testing correction method
#' @param alpha p-value cutoff for decisionmaking. Default .05
#' @param verbose Verbosity level for boruta processing
#' similar to the same argument in \link{semtree.control} and
#' \link{semforest.control}
#' @param \dots Optional parameters to undefined subfunctions
#' @return A vim object with several elements that need work.
#' Of particular note, `$importance` carries mean importance;
#' `$decision` denotes Accepted/Rejected/Tentative;
#' `$impHistory` has the entire varimp history; and
#' `$details` has exit values for each parameter.
#' @author Priyanka Paul, Timothy R. Brick, Andreas Brandmaier
#' @seealso \code{\link{semtree}} \code{\link{semforest}}
#'
#' @keywords tree models multivariate
#' @export

boruta <- function(model,
data,
control = NULL,
predictors = NULL,
percentile_threshold = 1,
rounds = 1,
maxRuns = 30,
pAdjMethod = "none",
alpha = .05,
verbose=FALSE,
...) {

# initial checks
Expand Down Expand Up @@ -84,48 +94,113 @@ boruta <- function(model,
} else {
ui_stop("Unknown model type selected. Use OpenMx or lavaanified lavaan models!")
}
model.ids <- tmp[[1]]
covariate.ids <- tmp[[2]]

# stage 1 - create shadow features

shadow.ids <- (ncol(data) + 1):(ncol(data) + length(covariate.ids))

for (cur_cov_id in covariate.ids) {
for (rep_id in 1:num_shadows) {
# pick column and shuffle
temp_column <- data[, cur_cov_id]
temp_column <- sample(temp_column, length(temp_column), replace = FALSE)
# add to dataset as shadow feature
temp_colname <- paste0("shadow_", names(data)[cur_cov_id], collapse = "")
if (num_shadows>1) temp_colname <- paste0(temp_colname, rep_id, collapse = "")
data[temp_colname] <- temp_column
if (!is.null(predictors)) predictors <- c(predictors, temp_colname)
}
# Checks on x & y from the boruta package
if(length(grep('^shadow',covariate.ids)>0))
stop('Attributes with names starting from "shadow" are reserved for internal use. Please rename them.')
if(maxRuns<11)
stop('maxRuns must be greater than 10.')
if(!pAdjMethod %in% stats::p.adjust.methods)
stop(c('P-value adjustment method not found. Must be one of:',
stats::p.adjust.methods))

# Might clash with some other semtrees stuff
if(is.null(predictors)) {
predictors <- names(data)[covariate.ids]
}

# Initialize and then loop over runs:
impHistory <- data.frame(matrix(NA, nrow=0, ncol=length(predictors)+3))
names(impHistory) <- c(predictors, "shadowMin", "shadowMean", "shadowMax")
decisionList <- data.frame(predictor=predictors, decision = "Tentative",
hitCount = 0, raw.p=NA, adjusted.p=NA)

# run the forest
forest <- semforest(model, data, control, predictors, ...)

# run variable importance
vim <- varimp(forest)

# get variable importance from shadow features
shadow_names <- names(data)[shadow.ids]
agvim <- aggregateVarimp(vim, aggregate = "mean")
# TODO: Parallelize the first five runs.
for(runNo in 1:maxRuns) {
if(verbose) {
message(paste("Beginning Run", runNo))
}

# stage 1 - create shadow features
rejected <- decisionList$predictor[decisionList$decision == "Rejected"]
current.predictors <- setdiff(predictors, rejected)
current.covariate.ids <- setdiff(covariate.ids, names(data) %in% rejected)
current.data <- data[, setdiff(names(data), rejected)]

shadow.ids <- (ncol(current.data)+1):(ncol(current.data)+length(current.covariate.ids))

for (cur_cov_id in current.covariate.ids) {
# pick column and shuffle
temp_column <- current.data[, cur_cov_id]
temp_column <- sample(temp_column, length(temp_column), replace=FALSE)
# add to dataset as shadow feature
temp_colname <- paste0("shadow_", names(current.data)[cur_cov_id],collapse="")
current.data[temp_colname] <- temp_column
if (!is.null(current.predictors)) current.predictors <- c(current.predictors, temp_colname)
}

# TODO: Pre-run model if needed.

# run the forest
forest <- semforest(model, current.data, control, current.predictors, ...)

vals <- agvim[names(agvim) %in% shadow_names]
#max_shadow_importance <- max(vals)
max_shadow_importance <- quantile(vals, percentile_threshold)
# run variable importance
vim <- varimp(forest)

agvim_filtered <- agvim[!(names(agvim) %in% shadow_names)]

df <- data.frame(importance = agvim_filtered, predictor = names(agvim_filtered))
# get variable importance from shadow features
shadow_names <- names(current.data)[shadow.ids]
agvim <- aggregateVarimp(vim, aggregate="mean")

# Compute shadow stats
shadow_importances <- agvim[names(agvim)%in%shadow_names]
impHistory[runNo, "shadowMax"] <- max_shadow_importance <- max(shadow_importances)
impHistory[runNo, "shadowMin"] <- min(shadow_importances)
impHistory[runNo, "shadowMean"] <- mean(shadow_importances)
agvim_filtered <- agvim[!(names(agvim)%in%shadow_names)]
impHistory[runNo, names(agvim_filtered)] <- agvim_filtered

# Compute "hits"
hits <- decisionList$predictor %in% names(agvim_filtered[agvim_filtered>max_shadow_importance])
decisionList$hitCount[hits] <- decisionList$hitCount[hits] + 1

# Run tests.
# The biasing here means that there are no decisions without correction
# before 5 runs and no decisions with Bonferroni before 7 runs.

# Run confirmation tests (pulled from Boruta package)
newPs <- stats::pbinom(decisionList$hitCount-1,runNo,0.5,lower.tail=FALSE)
adjPs <- stats::p.adjust(newPs,method=pAdjMethod)
acceptable <- adjPs < alpha
updateList <- acceptable & decisionList$decision == "Tentative"
decisionList$raw.p[updateList] <- newPs[updateList]
decisionList$adjusted.p[updateList] <- adjPs[updateList]
decisionList$decision[updateList] <- "Confirmed"

# Run rejection tests (pulled from Boruta package)
newPs <- stats::pbinom(decisionList$hitCount,runNo,0.5,lower.tail=TRUE)
adjPs <- stats::p.adjust(newPs,method=pAdjMethod)
acceptable <- adjPs < alpha
updateList <- acceptable & decisionList$decision == "Tentative"
decisionList$raw.p[updateList] <- newPs[updateList]
decisionList$adjusted.p[updateList] <- adjPs[updateList]
decisionList$decision[updateList] <- "Rejected"

if(!any(decisionList$decision == "Tentative")) {
break
}

}

vim$importance <- colMeans(impHistory, na.rm = TRUE)
vim$impHistory <- impHistory
vim$decisions <- decisionList$decision
vim$details <- decisionList

vim$filter <- agvim_filtered > max_shadow_importance
vim$filter <- decisionList$decision == "Confirmed" # Turns into hitreg
vim$boruta <- TRUE
vim$boruta_threshold <- max_shadow_importance
vim$percentile_threshold <- percentile_threshold


# TODO: Loop ends here with some reporting.
return(vim)
}

50 changes: 25 additions & 25 deletions man/boruta.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions semtree.Rproj
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ LaTeX: pdfLaTeX
BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace
32 changes: 32 additions & 0 deletions tests/borutaOneIndicator.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Tests a simple boruta model on a basic one-indicator SEM

library(OpenMx)
library(semtree)

N <- 1000
influence <- c(1, 0)
nPred <- length(influence)
cutbreaks <- 3
make_categorical <- TRUE


genModel <- mxModel(type="RAM", manifestVars=c("Y", paste0("X", 1:nPred)),
mxPath(paste0("X", 1:nPred), "Y", values=influence),
mxPath(c("Y", paste0("X", 1:nPred)), arrows=2, values=1))
simpleData <- mxGenerateData(genModel, N)
if(make_categorical) {
for(i in paste0("X", 1:nPred)) {
simpleData[[i]] <- cut(simpleData[[i]], cutbreaks)
simpleData[[i]] <- mxFactor(simpleData[[i]], levels(simpleData[[i]]))
}
}

testModel <- mxModel("SimplisticModel", type="RAM", manifestVars="Y",
mxPath("Y", arrows=2, values=1, free=TRUE, labels=c("Var")),
mxPath("one", "Y", values=0, free=TRUE, labels=c("mu")),
mxData(simpleData, type="raw"))

control <- semforest.control(control=semtree.control(method="score", alpha=1))

output <- boruta(testModel, simpleData, verbose=TRUE, control = control)
output

0 comments on commit a57768d

Please sign in to comment.