Skip to content

Commit

Permalink
Merge pull request #72 from trbrick/boruta
Browse files Browse the repository at this point in the history
Boruta
  • Loading branch information
brandmaier authored Jul 4, 2024
2 parents 5e632ee + 41af790 commit 09638cd
Show file tree
Hide file tree
Showing 8 changed files with 211 additions and 28 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
inst/doc
doc
Meta
.DS_Store
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ Encoding: UTF-8
LazyLoad: yes
Version: 0.9.19
Date: 2023-03-03
RoxygenNote: 7.2.3
RoxygenNote: 7.3.1
VignetteBuilder: knitr
BugReports: https://github.com/brandmaier/semtree/issues
URL: https://github.com/brandmaier/semtree
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ S3method(strip,semtree)
S3method(summary,semforest)
S3method(summary,semtree)
export(biodiversity)
export(boruta)
export(diversityMatrix)
export(evaluateTree)
export(fitSubmodels)
Expand Down
159 changes: 134 additions & 25 deletions R/boruta.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,41 @@

#' Run the Boruta algorithm on a sem tree
#'
#' Grows a series of SEM Forests following the boruta algorithm to determine
#' feature importance as moderators of the underlying model.
#'
#'
#' @aliases boruta plot.boruta print.boruta
#' @param model A template SEM. Same as in \code{semtree}.
#' @param data A dataframe to boruta on. Same as in \code{semtree}.
#' @param control A semforest control object to set forest parameters.
#' @param predictors An optional list of covariates. See semtree code example.
#' @param maxRuns Maximum number of boruta search cycles
#' @param pAdjMethod A value from \link{stats::p.adjust.methods} defining a
#' multiple testing correction method
#' @param alpha p-value cutoff for decisionmaking. Default .05
#' @param verbose Verbosity level for boruta processing
#' similar to the same argument in \link{semtree.control} and
#' \link{semforest.control}
#' @param \dots Optional parameters to undefined subfunctions
#' @return A vim object with several elements that need work.
#' Of particular note, `$importance` carries mean importance;
#' `$decision` denotes Accepted/Rejected/Tentative;
#' `$impHistory` has the entire varimp history; and
#' `$details` has exit values for each parameter.
#' @author Priyanka Paul, Timothy R. Brick, Andreas Brandmaier
#' @seealso \code{\link{semtree}} \code{\link{semforest}}
#'
#' @keywords tree models multivariate
#' @export
boruta <- function(model,
data,
control = NULL,
predictors = NULL,
maxRuns = 30,
pAdjMethod = "none",
alpha = .05,
verbose=FALSE,
...) {


Expand All @@ -20,37 +54,112 @@ boruta <- function(model,
ui_stop("Unknown model type selected. Use OpenMx or lavaanified lavaan models!");
}

# stage 1 - create shadow features

shadow.ids <- (ncol(data)+1):(ncol(data)+length(covariate.ids))

for (cur_cov_id in covariate.ids) {
# pick column and shuffle
temp_column <- data[, cur_cov_id]
temp_column <- sample(temp_column, length(temp_column), replace=FALSE)
# add to dataset as shadow feature
temp_colname <- paste0("shadow_", names(data)[cur_cov_id],collapse="")
data[temp_colname] <- temp_column
if (!is.null(predictors)) predictors <- c(predictors, temp_colname)
}
# Checks on x & y from the boruta package
if(length(grep('^shadow',covariate.ids)>0))
stop('Attributes with names starting from "shadow" are reserved for internal use. Please rename them.')
if(maxRuns<11)
stop('maxRuns must be greater than 10.')
if(!pAdjMethod %in% stats::p.adjust.methods)
stop(c('P-value adjustment method not found. Must be one of:',
stats::p.adjust.methods))

# run the forest
forest <- semforest(model, data, control, predictors, ...)
# Might clash with some other semtrees stuff
if(is.null(predictors)) {
predictors <- names(data)[covariate.ids]
}

# run variable importance
vim <- varimp(forest)
# Initialize and then loop over runs:
impHistory <- data.frame(matrix(NA, nrow=0, ncol=length(predictors)+3))
names(impHistory) <- c(predictors, "shadowMin", "shadowMean", "shadowMax")
decisionList <- data.frame(predictor=predictors, decision = "Tentative",
hitCount = 0, raw.p=NA, adjusted.p=NA)

# TODO: Parallelize the first five runs.
for(runNo in 1:maxRuns) {
if(verbose) {
message(paste("Beginning Run", runNo))
}

# stage 1 - create shadow features
rejected <- decisionList$predictor[decisionList$decision == "Rejected"]
current.predictors <- setdiff(predictors, rejected)
current.covariate.ids <- setdiff(covariate.ids, names(data) %in% rejected)
current.data <- data[, setdiff(names(data), rejected)]

shadow.ids <- (ncol(current.data)+1):(ncol(current.data)+length(current.covariate.ids))

for (cur_cov_id in current.covariate.ids) {
# pick column and shuffle
temp_column <- current.data[, cur_cov_id]
temp_column <- sample(temp_column, length(temp_column), replace=FALSE)
# add to dataset as shadow feature
temp_colname <- paste0("shadow_", names(current.data)[cur_cov_id],collapse="")
current.data[temp_colname] <- temp_column
if (!is.null(current.predictors)) current.predictors <- c(current.predictors, temp_colname)
}

# TODO: Pre-run model if needed.

# run the forest
forest <- semforest(model, current.data, control, current.predictors, ...)

# get variable importance from shadow features
shadow_names <- names(data)[shadow.ids]
agvim <- aggregateVarimp(vim, aggregate="mean")
max_shadow_importance <- max(agvim[names(agvim)%in%shadow_names])
agvim_filtered <- agvim[!(names(agvim)%in%shadow_names)]
# run variable importance
vim <- varimp(forest)

df<-data.frame(importance=agvim_filtered, predictor=names(agvim_filtered))
# get variable importance from shadow features
shadow_names <- names(current.data)[shadow.ids]
agvim <- aggregateVarimp(vim, aggregate="mean")

# Compute shadow stats
shadow_importances <- agvim[names(agvim)%in%shadow_names]
impHistory[runNo, "shadowMax"] <- max_shadow_importance <- max(shadow_importances)
impHistory[runNo, "shadowMin"] <- min(shadow_importances)
impHistory[runNo, "shadowMean"] <- mean(shadow_importances)
agvim_filtered <- agvim[!(names(agvim)%in%shadow_names)]
impHistory[runNo, names(agvim_filtered)] <- agvim_filtered

# Compute "hits"
hits <- decisionList$predictor %in% names(agvim_filtered[agvim_filtered>max_shadow_importance])
decisionList$hitCount[hits] <- decisionList$hitCount[hits] + 1

# Run tests.
# The biasing here means that there are no decisions without correction
# before 5 runs and no decisions with Bonferroni before 7 runs.

# Run confirmation tests (pulled from Boruta package)
newPs <- stats::pbinom(decisionList$hitCount-1,runNo,0.5,lower.tail=FALSE)
adjPs <- stats::p.adjust(newPs,method=pAdjMethod)
acceptable <- adjPs < alpha
updateList <- acceptable & decisionList$decision == "Tentative"
decisionList$raw.p[updateList] <- newPs[updateList]
decisionList$adjusted.p[updateList] <- adjPs[updateList]
decisionList$decision[updateList] <- "Confirmed"

# Run rejection tests (pulled from Boruta package)
newPs <- stats::pbinom(decisionList$hitCount,runNo,0.5,lower.tail=TRUE)
adjPs <- stats::p.adjust(newPs,method=pAdjMethod)
acceptable <- adjPs < alpha
updateList <- acceptable & decisionList$decision == "Tentative"
decisionList$raw.p[updateList] <- newPs[updateList]
decisionList$adjusted.p[updateList] <- adjPs[updateList]
decisionList$decision[updateList] <- "Rejected"

if(!any(decisionList$decision == "Tentative")) {
break
}

}

vim$filter <- agvim_filtered>max_shadow_importance
vim$importance <- colMeans(impHistory, na.rm = TRUE)
vim$impHistory <- impHistory
vim$decisions <- decisionList$decision
vim$details <- decisionList

vim$filter <- decisionList$decision == "Confirmed" # Turns into hitreg
vim$boruta <- TRUE
vim$boruta_threshold = max_shadow_importance

# TODO: Loop ends here with some reporting.

return(vim)
}

39 changes: 39 additions & 0 deletions man/boruta.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions man/toTable.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions semtree.Rproj
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ LaTeX: pdfLaTeX
BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace
32 changes: 32 additions & 0 deletions tests/borutaOneIndicator.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Tests a simple boruta model on a basic one-indicator SEM

library(OpenMx)
library(semtree)

N <- 1000
influence <- c(1, 0)
nPred <- length(influence)
cutbreaks <- 3
make_categorical <- TRUE


genModel <- mxModel(type="RAM", manifestVars=c("Y", paste0("X", 1:nPred)),
mxPath(paste0("X", 1:nPred), "Y", values=influence),
mxPath(c("Y", paste0("X", 1:nPred)), arrows=2, values=1))
simpleData <- mxGenerateData(genModel, N)
if(make_categorical) {
for(i in paste0("X", 1:nPred)) {
simpleData[[i]] <- cut(simpleData[[i]], cutbreaks)
simpleData[[i]] <- mxFactor(simpleData[[i]], levels(simpleData[[i]]))
}
}

testModel <- mxModel("SimplisticModel", type="RAM", manifestVars="Y",
mxPath("Y", arrows=2, values=1, free=TRUE, labels=c("Var")),
mxPath("one", "Y", values=0, free=TRUE, labels=c("mu")),
mxData(simpleData, type="raw"))

control <- semforest.control(control=semtree.control(method="score", alpha=1))

output <- boruta(testModel, simpleData, verbose=TRUE, control = control)
output

0 comments on commit 09638cd

Please sign in to comment.