diff --git a/.gitignore b/.gitignore index 1fc4f6e..4c4c5fb 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,6 @@ Meta /doc/ /Meta/ cran_comments.md -CRAN* \ No newline at end of file +CRAN* +.DS_Store + diff --git a/R/boruta.R b/R/boruta.R index f3cfcca..0ba714b 100644 --- a/R/boruta.R +++ b/R/boruta.R @@ -1,31 +1,41 @@ -#' BORUTA algorithm for SEM trees +#' Run the Boruta algorithm on a sem tree #' -#' This is an experimental feature. Use cautiously. +#' Grows a series of SEM Forests following the boruta algorithm to determine +#' feature importance as moderators of the underlying model. #' -#' @aliases boruta -#' @param model A template model specification from \code{\link{OpenMx}} using -#' the \code{\link{mxModel}} function (or a \code{\link[lavaan]{lavaan}} model -#' using the \code{\link[lavaan]{lavaan}} function with option fit=FALSE). -#' Model must be syntactically correct within the framework chosen, and -#' converge to a solution. -#' @param data Data.frame used in the model creation using -#' \code{\link{mxModel}} or \code{\link[lavaan]{lavaan}} are input here. Order -#' of modeled variables and predictors is not important when providing a -#' dataset to \code{semtree}. -#' @param control \code{\link{semtree}} model specifications from -#' \code{\link{semtree.control}} are input here. Any changes from the default -#' setting can be specified here. -#' @param percentile_threshold Numeric. -#' @param rounds Numeric. Number of rounds of the BORUTA algorithm. -#' -#' @export #' +#' @aliases boruta plot.boruta print.boruta +#' @param model A template SEM. Same as in \code{semtree}. +#' @param data A dataframe to boruta on. Same as in \code{semtree}. +#' @param control A semforest control object to set forest parameters. +#' @param predictors An optional list of covariates. See semtree code example. +#' @param maxRuns Maximum number of boruta search cycles +#' @param pAdjMethod A value from \link{stats::p.adjust.methods} defining a +#' multiple testing correction method +#' @param alpha p-value cutoff for decisionmaking. Default .05 +#' @param verbose Verbosity level for boruta processing +#' similar to the same argument in \link{semtree.control} and +#' \link{semforest.control} +#' @param \dots Optional parameters to undefined subfunctions +#' @return A vim object with several elements that need work. +#' Of particular note, `$importance` carries mean importance; +#' `$decision` denotes Accepted/Rejected/Tentative; +#' `$impHistory` has the entire varimp history; and +#' `$details` has exit values for each parameter. +#' @author Priyanka Paul, Timothy R. Brick, Andreas Brandmaier +#' @seealso \code{\link{semtree}} \code{\link{semforest}} +#' +#' @keywords tree models multivariate +#' @export + boruta <- function(model, data, control = NULL, predictors = NULL, - percentile_threshold = 1, - rounds = 1, + maxRuns = 30, + pAdjMethod = "none", + alpha = .05, + verbose=FALSE, ...) { # initial checks @@ -84,48 +94,113 @@ boruta <- function(model, } else { ui_stop("Unknown model type selected. Use OpenMx or lavaanified lavaan models!") } - model.ids <- tmp[[1]] - covariate.ids <- tmp[[2]] - # stage 1 - create shadow features - - shadow.ids <- (ncol(data) + 1):(ncol(data) + length(covariate.ids)) - - for (cur_cov_id in covariate.ids) { - for (rep_id in 1:num_shadows) { - # pick column and shuffle - temp_column <- data[, cur_cov_id] - temp_column <- sample(temp_column, length(temp_column), replace = FALSE) - # add to dataset as shadow feature - temp_colname <- paste0("shadow_", names(data)[cur_cov_id], collapse = "") - if (num_shadows>1) temp_colname <- paste0(temp_colname, rep_id, collapse = "") - data[temp_colname] <- temp_column - if (!is.null(predictors)) predictors <- c(predictors, temp_colname) - } + # Checks on x & y from the boruta package + if(length(grep('^shadow',covariate.ids)>0)) + stop('Attributes with names starting from "shadow" are reserved for internal use. Please rename them.') + if(maxRuns<11) + stop('maxRuns must be greater than 10.') + if(!pAdjMethod %in% stats::p.adjust.methods) + stop(c('P-value adjustment method not found. Must be one of:', + stats::p.adjust.methods)) + + # Might clash with some other semtrees stuff + if(is.null(predictors)) { + predictors <- names(data)[covariate.ids] } + + # Initialize and then loop over runs: + impHistory <- data.frame(matrix(NA, nrow=0, ncol=length(predictors)+3)) + names(impHistory) <- c(predictors, "shadowMin", "shadowMean", "shadowMax") + decisionList <- data.frame(predictor=predictors, decision = "Tentative", + hitCount = 0, raw.p=NA, adjusted.p=NA) - # run the forest - forest <- semforest(model, data, control, predictors, ...) - - # run variable importance - vim <- varimp(forest) - - # get variable importance from shadow features - shadow_names <- names(data)[shadow.ids] - agvim <- aggregateVarimp(vim, aggregate = "mean") + # TODO: Parallelize the first five runs. + for(runNo in 1:maxRuns) { + if(verbose) { + message(paste("Beginning Run", runNo)) + } + + # stage 1 - create shadow features + rejected <- decisionList$predictor[decisionList$decision == "Rejected"] + current.predictors <- setdiff(predictors, rejected) + current.covariate.ids <- setdiff(covariate.ids, names(data) %in% rejected) + current.data <- data[, setdiff(names(data), rejected)] + + shadow.ids <- (ncol(current.data)+1):(ncol(current.data)+length(current.covariate.ids)) + + for (cur_cov_id in current.covariate.ids) { + # pick column and shuffle + temp_column <- current.data[, cur_cov_id] + temp_column <- sample(temp_column, length(temp_column), replace=FALSE) + # add to dataset as shadow feature + temp_colname <- paste0("shadow_", names(current.data)[cur_cov_id],collapse="") + current.data[temp_colname] <- temp_column + if (!is.null(current.predictors)) current.predictors <- c(current.predictors, temp_colname) + } + + # TODO: Pre-run model if needed. + + # run the forest + forest <- semforest(model, current.data, control, current.predictors, ...) - vals <- agvim[names(agvim) %in% shadow_names] - #max_shadow_importance <- max(vals) - max_shadow_importance <- quantile(vals, percentile_threshold) + # run variable importance + vim <- varimp(forest) - agvim_filtered <- agvim[!(names(agvim) %in% shadow_names)] - - df <- data.frame(importance = agvim_filtered, predictor = names(agvim_filtered)) + # get variable importance from shadow features + shadow_names <- names(current.data)[shadow.ids] + agvim <- aggregateVarimp(vim, aggregate="mean") + + # Compute shadow stats + shadow_importances <- agvim[names(agvim)%in%shadow_names] + impHistory[runNo, "shadowMax"] <- max_shadow_importance <- max(shadow_importances) + impHistory[runNo, "shadowMin"] <- min(shadow_importances) + impHistory[runNo, "shadowMean"] <- mean(shadow_importances) + agvim_filtered <- agvim[!(names(agvim)%in%shadow_names)] + impHistory[runNo, names(agvim_filtered)] <- agvim_filtered + + # Compute "hits" + hits <- decisionList$predictor %in% names(agvim_filtered[agvim_filtered>max_shadow_importance]) + decisionList$hitCount[hits] <- decisionList$hitCount[hits] + 1 + + # Run tests. + # The biasing here means that there are no decisions without correction + # before 5 runs and no decisions with Bonferroni before 7 runs. + + # Run confirmation tests (pulled from Boruta package) + newPs <- stats::pbinom(decisionList$hitCount-1,runNo,0.5,lower.tail=FALSE) + adjPs <- stats::p.adjust(newPs,method=pAdjMethod) + acceptable <- adjPs < alpha + updateList <- acceptable & decisionList$decision == "Tentative" + decisionList$raw.p[updateList] <- newPs[updateList] + decisionList$adjusted.p[updateList] <- adjPs[updateList] + decisionList$decision[updateList] <- "Confirmed" + + # Run rejection tests (pulled from Boruta package) + newPs <- stats::pbinom(decisionList$hitCount,runNo,0.5,lower.tail=TRUE) + adjPs <- stats::p.adjust(newPs,method=pAdjMethod) + acceptable <- adjPs < alpha + updateList <- acceptable & decisionList$decision == "Tentative" + decisionList$raw.p[updateList] <- newPs[updateList] + decisionList$adjusted.p[updateList] <- adjPs[updateList] + decisionList$decision[updateList] <- "Rejected" + + if(!any(decisionList$decision == "Tentative")) { + break + } + + } + + vim$importance <- colMeans(impHistory, na.rm = TRUE) + vim$impHistory <- impHistory + vim$decisions <- decisionList$decision + vim$details <- decisionList - vim$filter <- agvim_filtered > max_shadow_importance + vim$filter <- decisionList$decision == "Confirmed" # Turns into hitreg vim$boruta <- TRUE - vim$boruta_threshold <- max_shadow_importance - vim$percentile_threshold <- percentile_threshold - + + # TODO: Loop ends here with some reporting. + return(vim) } + diff --git a/man/boruta.Rd b/man/boruta.Rd index f0e8b85..4921d25 100644 --- a/man/boruta.Rd +++ b/man/boruta.Rd @@ -2,38 +2,38 @@ % Please edit documentation in R/boruta.R \name{boruta} \alias{boruta} -\title{BORUTA algorithm for SEM trees} +\alias{plot.boruta} +\alias{print.boruta} +\title{Run the Boruta algorithm on a sem tree} \usage{ -boruta( - model, - data, - control = NULL, - predictors = NULL, - percentile_threshold = 100, - rounds = 1, - ... -) +boruta(model, data, control = NULL, predictors = NULL, ...) } \arguments{ -\item{model}{A template model specification from \code{\link{OpenMx}} using -the \code{\link{mxModel}} function (or a \code{\link[lavaan]{lavaan}} model -using the \code{\link[lavaan]{lavaan}} function with option fit=FALSE). -Model must be syntactically correct within the framework chosen, and -converge to a solution.} +\item{model}{A template SEM. Same as in \code{semtree}.} -\item{data}{Data.frame used in the model creation using -\code{\link{mxModel}} or \code{\link[lavaan]{lavaan}} are input here. Order -of modeled variables and predictors is not important when providing a -dataset to \code{semtree}.} +\item{data}{A dataframe to boruta on. Same as in \code{semtree}.} -\item{control}{\code{\link{semtree}} model specifications from -\code{\link{semtree.control}} are input here. Any changes from the default -setting can be specified here.} +\item{control}{A semforest control object to set forest parameters.} -\item{percentile_threshold}{Numeric.} +\item{predictors}{An optional list of covariates. See semtree code example.} -\item{rounds}{Numeric. Number of rounds of the BORUTA algorithm.} +\item{\dots}{Optional parameters.} + +\item{constraints}{An optional list of covariates. See semtree code example.} +} +\value{ +A boruta object. } \description{ -BORUTA algorithm for SEM trees +Grows a series of SEM Forests following the boruta algorithm to determine + feature importance as moderators of the underlying model. +} +\seealso{ +\code{\link{semtree}} \code{\link{semforest}} +} +\author{ +Priyanka Paul, Timothy R. Brick, Andreas Brandmaier } +\keyword{models} +\keyword{multivariate} +\keyword{tree} diff --git a/semtree.Rproj b/semtree.Rproj index 21a4da0..eaa6b81 100644 --- a/semtree.Rproj +++ b/semtree.Rproj @@ -15,3 +15,4 @@ LaTeX: pdfLaTeX BuildType: Package PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source +PackageRoxygenize: rd,collate,namespace diff --git a/tests/borutaOneIndicator.R b/tests/borutaOneIndicator.R new file mode 100644 index 0000000..7e73b27 --- /dev/null +++ b/tests/borutaOneIndicator.R @@ -0,0 +1,32 @@ +# Tests a simple boruta model on a basic one-indicator SEM + +library(OpenMx) +library(semtree) + +N <- 1000 +influence <- c(1, 0) +nPred <- length(influence) +cutbreaks <- 3 +make_categorical <- TRUE + + +genModel <- mxModel(type="RAM", manifestVars=c("Y", paste0("X", 1:nPred)), + mxPath(paste0("X", 1:nPred), "Y", values=influence), + mxPath(c("Y", paste0("X", 1:nPred)), arrows=2, values=1)) +simpleData <- mxGenerateData(genModel, N) +if(make_categorical) { + for(i in paste0("X", 1:nPred)) { + simpleData[[i]] <- cut(simpleData[[i]], cutbreaks) + simpleData[[i]] <- mxFactor(simpleData[[i]], levels(simpleData[[i]])) + } +} + +testModel <- mxModel("SimplisticModel", type="RAM", manifestVars="Y", + mxPath("Y", arrows=2, values=1, free=TRUE, labels=c("Var")), + mxPath("one", "Y", values=0, free=TRUE, labels=c("mu")), + mxData(simpleData, type="raw")) + +control <- semforest.control(control=semtree.control(method="score", alpha=1)) + +output <- boruta(testModel, simpleData, verbose=TRUE, control = control) +output