diff --git a/.gitignore b/.gitignore
index 1fc4f6e..4c4c5fb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,6 @@ Meta
 /doc/
 /Meta/
 cran_comments.md
-CRAN*
\ No newline at end of file
+CRAN*
+.DS_Store
+
diff --git a/DESCRIPTION b/DESCRIPTION
index 2c97d4f..7ccbd7f 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -26,7 +26,8 @@ Imports:
    future.apply,
    data.table,
    expm,
-   gridBase
+   gridBase,
+   dplyr
 Suggests:
    knitr,
    rmarkdown,
diff --git a/NAMESPACE b/NAMESPACE
index 0158412..bf7c43e 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -8,6 +8,7 @@ S3method(merge,semforest)
 S3method(nobs,semtree)
 S3method(partialDependence,semforest)
 S3method(partialDependence,semforest_stripped)
+S3method(plot,boruta)
 S3method(plot,diversityMatrix)
 S3method(plot,partialDependence)
 S3method(plot,semforest.proximity)
diff --git a/R/boruta.R b/R/boruta.R
index f3cfcca..5831899 100644
--- a/R/boruta.R
+++ b/R/boruta.R
@@ -1,131 +1,255 @@
-#' BORUTA algorithm for SEM trees
-#'
-#' This is an experimental feature. Use cautiously.
-#'
-#' @aliases boruta
-#' @param model A template model specification from \code{\link{OpenMx}} using
-#' the \code{\link{mxModel}} function (or a \code{\link[lavaan]{lavaan}} model
-#' using the \code{\link[lavaan]{lavaan}} function with option fit=FALSE).
-#' Model must be syntactically correct within the framework chosen, and
-#' converge to a solution.
-#' @param data Data.frame used in the model creation using
-#' \code{\link{mxModel}} or \code{\link[lavaan]{lavaan}} are input here. Order
-#' of modeled variables and predictors is not important when providing a
-#' dataset to \code{semtree}.
-#' @param control \code{\link{semtree}} model specifications from
-#' \code{\link{semtree.control}} are input here. Any changes from the default
-#' setting can be specified here.
-#' @param percentile_threshold Numeric.
-#' @param rounds Numeric. Number of rounds of the BORUTA algorithm.
+#' Run the Boruta algorithm on a SEM tree
 #'
+#' Grows a series of SEM forests following the Boruta algorithm to determine
+#' the importance of features as moderators of the underlying model.
+#'
+#'
+#' @aliases boruta plot.boruta print.boruta
+#' @param model A template SEM. Same as in \code{semtree}.
+#' @param data A data.frame on which the Boruta search is run. Same as in
+#' \code{semtree}.
+#' @param control A semforest control object to set forest parameters.
+#' @param predictors An optional list of covariates. See the \code{semtree}
+#' code example.
+#' @param maxRuns Maximum number of Boruta search cycles; must be at least 11.
+#' @param pAdjMethod A value from \code{\link[stats]{p.adjust.methods}}
+#' defining a multiple-testing correction method.
+#' @param alpha p-value cutoff for decision making. Default: .05.
+#' @param verbose Verbosity level for Boruta processing,
+#' similar to the same argument in \link{semtree.control} and
+#' \link{semforest.control}.
+#' @param quant Quantile of the shadow-feature importances used as the
+#' inclusion threshold (1 = the maximum shadow importance).
+#' @param \dots Optional arguments passed on to \code{semforest}.
+#' @return A vim object with several elements.
+#' Of particular note, `$importance` carries the mean importance;
+#' `$decisions` denotes Confirmed/Rejected/Tentative;
+#' `$impHistory` has the entire variable-importance history; and
+#' `$details` has exit values for each predictor.
+#' @author Priyanka Paul, Timothy R. Brick, Andreas Brandmaier
+#' @seealso \code{\link{semtree}} \code{\link{semforest}}
+#'
+#' @keywords tree models multivariate
 #' @export
-#'
 boruta <- function(model,
                    data,
                    control = NULL,
                    predictors = NULL,
-                   percentile_threshold = 1,
-                   rounds = 1,
+                   maxRuns = 30,
+                   pAdjMethod = "none",
+                   alpha = .05,
+                   verbose = FALSE,
+                   quant = 1,
                    ...) {
+  # detect model (warning: duplicated code)
+  if (inherits(model, "MxModel") || inherits(model, "MxRAMModel")) {
+    tmp <-
+      getPredictorsOpenMx(mxmodel = model,
+                          dataset = data,
+                          covariates = predictors)
+    model.ids <- tmp[[1]]
+    covariate.ids <- tmp[[2]]
+  } else if (inherits(model, "lavaan")) {
+    tmp <-
+      getPredictorsLavaan(model, dataset = data, covariates = predictors)
+    model.ids <- tmp[[1]]
+    covariate.ids <- tmp[[2]]
+  } else {
+    ui_stop("Unknown model type selected. Use OpenMx or lavaanified lavaan models!")
+
+  }
   # initial checks
-  stopifnot(percentile_threshold>=0)
-  stopifnot(percentile_threshold<=1)
-  stopifnot(is.numeric(rounds))
-  stopifnot(rounds>0)
+  # Checks on x & y from the boruta package
+  if (any(grepl("^shadow", names(data)[covariate.ids])))
+    stop(
+      'Attributes with names starting from "shadow" are reserved for internal use. Please rename them.'
+    )
+  if (maxRuns < 11)
+    stop('maxRuns must be greater than 10.')
+  if (!pAdjMethod %in% stats::p.adjust.methods)
+    stop(c(
+      'P-value adjustment method not found. Must be one of:',
+      stats::p.adjust.methods
+    ))
-  preds_important <- c()
-  preds_unimportant <- c()
-
-  cur_round = 1
-  temp_vims <- list()
-
-  while(cur_round <= rounds) {
-    vim_boruta <- .boruta(model=model,
-                          data=data,
-                          control=control,
-                          predictors=predictors,
-                          percentile_threshold = percentile_threshold,
-                          ...)
-    browser()
-    # add predictors to list of unimportant variables
-    preds_unimportant <- c(preds_unimportant, names(vim_boruta$filter)[!vim_boruta$filter])
-    # remove them from the dataset
-    data <- data[, -c(preds_unimportant)]
-    temp_vims[[cur_round]] <-vim_boruta
+  # Might clash with some other semtrees stuff
+  if (is.null(predictors)) {
+    predictors <- names(data)[covariate.ids]
   }
-  result <- list(
-    preds_unimportant,
-    rounds = rounds
-  )
+  # Initialize and then loop over runs:
+  impHistory <-
+    data.frame(matrix(NA, nrow = 0, ncol = length(predictors) + 3))
+  names(impHistory) <-
+    c(predictors, "shadowMin", "shadowMean", "shadowMax")
+  decisionList <-
+    data.frame(
+      predictor = predictors,
+      decision = "Tentative",
+      hitCount = 0,
+      raw.p = NA,
+      adjusted.p = NA
+    )
-  return(result)
-}
-
-.boruta <- function(model,
-                    data,
-                    control = NULL,
-                    predictors = NULL,
-                    percentile_threshold = 1,
-                    num_shadows = 1,
-                    ...) {
-
-  # make sure that no column names start with "shadow_" prefix
-  stopifnot(all(sapply(names(data), function(x) {!startsWith(x, "shadow_")})))
-
-  # detect model (warning: duplicated code)
-  if (inherits(model, "MxModel") || inherits(model, "MxRAMModel")) {
-    tmp <- getPredictorsOpenMx(mxmodel = model, dataset = data, covariates = predictors)
-
-  } else if (inherits(model,"lavaan")){
+  # TODO: Parallelize the first five runs.
+  end_time <- NULL
+  for (runNo in 1:maxRuns) {
+    start_time <- Sys.time()
+    if (verbose) {
+      time_info <- ""
+      if (!is.null(end_time))  # run_duration is computed at the end of each run
+        time_info <- paste0("Last run took ", format(run_duration))
+      message(paste("Beginning Run", runNo, time_info))
+    }
-    tmp <- getPredictorsLavaan(model, data, predictors)
-  } else {
-    ui_stop("Unknown model type selected. Use OpenMx or lavaanified lavaan models!")
-  }
-  model.ids <- tmp[[1]]
-  covariate.ids <- tmp[[2]]
-
-  # stage 1 - create shadow features
-
-  shadow.ids <- (ncol(data) + 1):(ncol(data) + length(covariate.ids))
-
-  for (cur_cov_id in covariate.ids) {
-    for (rep_id in 1:num_shadows) {
-      # pick column and shuffle
-      temp_column <- data[, cur_cov_id]
-      temp_column <- sample(temp_column, length(temp_column), replace = FALSE)
-      # add to dataset as shadow feature
-      temp_colname <- paste0("shadow_", names(data)[cur_cov_id], collapse = "")
-      if (num_shadows>1) temp_colname <- paste0(temp_colname, rep_id, collapse = "")
-      data[temp_colname] <- temp_column
-      if (!is.null(predictors)) predictors <- c(predictors, temp_colname)
+    # stage 1 - create shadow features
+    rejected <-
+      decisionList$predictor[decisionList$decision == "Rejected"]
+    current.predictors <- setdiff(predictors, rejected)
+    current.data <- data[, setdiff(names(data), rejected)]
+    # covariate positions are recomputed relative to current.data
+    current.covariate.ids <-
+      which(names(current.data) %in% current.predictors)
+
+    shadow.ids <-
+      (ncol(current.data) + 1):(ncol(current.data) + length(current.covariate.ids))
+
+    for (cur_cov_id in current.covariate.ids) {
+      # pick column and shuffle
+      temp_column <- current.data[, cur_cov_id]
+      temp_column <-
+        sample(temp_column, length(temp_column), replace = FALSE)
+      # add to dataset as shadow feature
+      temp_colname <-
+        paste0("shadow_", names(current.data)[cur_cov_id], collapse = "")
+      current.data[temp_colname] <- temp_column
+      if (!is.null(current.predictors))
+        current.predictors <- c(current.predictors, temp_colname)
+    }
+
+    # TODO: Pre-run model if needed.
+
+    # run the forest
+    forest <-
+      semforest(model, current.data, control, current.predictors, ...)
+
+    # run variable importance
+    vim <- varimp(forest)
+
+    # get variable importance from shadow features
+    shadow_names <- names(current.data)[shadow.ids]
+    agvim <- aggregateVarimp(vim, aggregate = "mean")
+
+    # Compute shadow stats
+    shadow_importances <- agvim[names(agvim) %in% shadow_names]
+    impHistory[runNo, "shadowMax"] <- max(shadow_importances, na.rm = TRUE)
+
+    max_shadow_importance <- stats::quantile(shadow_importances,
+                                             probs = quant, na.rm = TRUE)
+
+    impHistory[runNo, "shadowMin"] <- min(shadow_importances, na.rm = TRUE)
+    impHistory[runNo, "shadowMean"] <- mean(shadow_importances, na.rm = TRUE)
+    agvim_filtered <- agvim[!(names(agvim) %in% shadow_names)]
+    impHistory[runNo, names(agvim_filtered)] <- agvim_filtered
+
+    # Compute "hits"
+    hits <-
+      decisionList$predictor %in% names(agvim_filtered[agvim_filtered > max_shadow_importance])
+    decisionList$hitCount[hits] <- decisionList$hitCount[hits] + 1
+
+    # Run tests.
+    # The biasing here means that there are no decisions without correction
+    # before 5 runs and no decisions with Bonferroni before 7 runs.
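+    # For intuition: pbinom(hits - 1, n, 0.5, lower.tail = FALSE) is the
+    # probability of at least `hits` successes in n fair coin flips. A
+    # predictor that beats the shadow threshold in every one of the first n
+    # runs therefore has p = 0.5^n, which first drops below the default
+    # alpha of .05 at n = 5 (0.5^5 = 0.03125); the rejection test is the
+    # mirror image (zero hits in n runs gives the same p), and corrected
+    # cutoffs arrive later as the number of predictors grows.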
+
+    # Run confirmation tests (pulled from Boruta package)
+    newPs <-
+      stats::pbinom(decisionList$hitCount - 1, runNo, 0.5, lower.tail = FALSE)
+    adjPs <- stats::p.adjust(newPs, method = pAdjMethod)
+    acceptable <- adjPs < alpha
+    updateList <- acceptable & decisionList$decision == "Tentative"
+    decisionList$raw.p[updateList] <- newPs[updateList]
+    decisionList$adjusted.p[updateList] <- adjPs[updateList]
+    decisionList$decision[updateList] <- "Confirmed"
+
+    # Run rejection tests (pulled from Boruta package)
+    newPs <-
+      stats::pbinom(decisionList$hitCount, runNo, 0.5, lower.tail = TRUE)
+    adjPs <- stats::p.adjust(newPs, method = pAdjMethod)
+    acceptable <- adjPs < alpha
+    updateList <- acceptable & decisionList$decision == "Tentative"
+    decisionList$raw.p[updateList] <- newPs[updateList]
+    decisionList$adjusted.p[updateList] <- adjPs[updateList]
+    decisionList$decision[updateList] <- "Rejected"
+
+    if (!any(decisionList$decision == "Tentative")) {
+      break
    }
+
+    end_time <- Sys.time()
+    run_duration <- difftime(end_time, start_time)  # reported when verbose
+  }
-
-  # run the forest
-  forest <- semforest(model, data, control, predictors, ...)
-
-  # run variable importance
-  vim <- varimp(forest)
-
-  # get variable importance from shadow features
-  shadow_names <- names(data)[shadow.ids]
-  agvim <- aggregateVarimp(vim, aggregate = "mean")
-  vals <- agvim[names(agvim) %in% shadow_names]
-  #max_shadow_importance <- max(vals)
-  max_shadow_importance <- quantile(vals, percentile_threshold)
+  vim$importance <- colMeans(impHistory, na.rm = TRUE)
+  vim$impHistory <- impHistory
+  vim$decisions <- decisionList$decision
+  vim$details <- decisionList
-  agvim_filtered <- agvim[!(names(agvim) %in% shadow_names)]
-
-  df <- data.frame(importance = agvim_filtered, predictor = names(agvim_filtered))
-
-  vim$filter <- agvim_filtered > max_shadow_importance
+  vim$filter <-
+    decisionList$decision == "Confirmed" # Turns into hitreg
   vim$boruta <- TRUE
-  vim$boruta_threshold <- max_shadow_importance
-  vim$percentile_threshold <- percentile_threshold
-
+
+  class(vim) <- "boruta"
+
+  # TODO: Loop ends here with some reporting.
+  return(vim)
 }
+
+
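+# Reading the returned object (a sketch; `res` stands for a boruta() result):
+#   res$decisions   # "Confirmed" / "Rejected" / "Tentative", one per predictor
+#   res$impHistory  # importance of each predictor plus shadow summaries, run by run
+#   res$filter      # logical vector, TRUE for confirmed predictors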
+#' @exportS3Method plot boruta
+plot.boruta <- function(x, type = 0, ...) {
+  decisionList <- x$details
+  impHistory <- x$impHistory
+  impHistory <- impHistory |>
+    dplyr::mutate(rnd = 1:nrow(impHistory)) |>
+    tidyr::pivot_longer(cols = -last_col()) |> #everything()) |>
+    dplyr::left_join(data.frame(decisionList),
+                     by = dplyr::join_by("name" == "predictor")) |>
+    dplyr::mutate(decision =
+                    dplyr::case_when(is.na(decision) ~ "Shadow", .default = decision)) |>
+    dplyr::group_by(name) |>
+    dplyr::mutate(median_value = stats::median(value, na.rm = TRUE))
+
+  if (type == 0) {
+
+    # sort Inf values to the left
+    #impHistory[is.na(impHistory$median_value)] <- -Inf
+    impHistory$sort_value <- impHistory$median_value
+    # mv <- min(impHistory$sort_value,na.rm=TRUE)
+    # impHistory$sort_value[impHistory$decision=="Rejected"]<- (mv-1)
+
+    ggplot2::ggplot(impHistory,
+                    ggplot2::aes(
+                      x = stats::reorder(name, sort_value),
+                      y = value,
+                      color = decision
+                    )) +
+      ggplot2::geom_boxplot() +
+      ggplot2::xlab("") +
+      ggplot2::ylab("Importance") +
+      ggplot2::scale_color_discrete(name = "Decision") +
+      ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))
+  } else if (type == 1) {
+    ggplot2::ggplot(impHistory,
+                    ggplot2::aes(
+                      x = rnd,
+                      y = value,
+                      group = name,
+                      col = name
+                    )) +
+      ggplot2::geom_line() +
+      ggplot2::geom_hline(ggplot2::aes(yintercept = median_value, col = name), lwd = 2) +
+      ggplot2::xlab("Round") +
+      ggplot2::ylab("Importance") +
+      ggplot2::scale_color_discrete(name = "Predictor")
+  } else {
+    stop("Unknown graph type. Please choose 0 or 1.")
+  }
+
+}
\ No newline at end of file
diff --git a/R/toTable.R b/R/toTable.R
index 337c235..2be2bf7 100644
--- a/R/toTable.R
+++ b/R/toTable.R
@@ -66,7 +66,6 @@ alls <- unique(alls)
 
 # create table
 
-#covariate.names <-simplify2array(tree$result$btn.matrix[2,])
 covariate.names <- getCovariatesFromTree(tree)
 
 # default is to display all parameters
@@ -79,10 +78,12 @@ all.names <- c(covariate.names, added.param.cols)
 
 str.matrix <- matrix(NA, nrow = length(rowdata),ncol=length(all.names))
 
+# convert to a data frame to avoid coercion to string
+str.matrix <- data.frame(str.matrix)
+
 colnames(str.matrix) <- all.names
 
 for (i in 1:length(rowdata)) {
-# result.row <- rep(" ",length(covariate.names))
   myrow <- rowdata[[i]][[1]]
   for (j in 1:length(myrow)) {
     myitem <- myrow[[j]]
@@ -126,7 +127,6 @@ for (i in 1:length(rowdata)) {
     }
   }
 
-# result.string <- paste(result.string,paste(result.row,collapse="\t"),"\n")
 }
 
 ## prune empty columns?
@@ -139,7 +139,7 @@ if (length(is.col.empty)>0) {
 sortby <- apply(str.matrix,2,function(x){sum(!is.na(x))})
 if (!is.null(added.param.cols)) {
   remids <- (dim(str.matrix)[2]-length(added.param.cols)+1):(dim(str.matrix)[2])
-  sortby[remids] <- sortby[remids]-999999
+  sortby[remids] <- sortby[remids]-999999 # bad style ^^ #TODO
 }
 sort.ix <- sort(sortby,index.return=TRUE,decreasing = TRUE)$ix
 str.matrix <- str.matrix[, sort.ix]
@@ -156,7 +156,6 @@ str.matrix[is.na(str.matrix)]<-""
 
 ## and display
 #cat(result.string)
-#require("openxlsx")
 
 
 return(str.matrix)
diff --git a/man/boruta.Rd b/man/boruta.Rd
index f0e8b85..6a8cc8a 100644
--- a/man/boruta.Rd
+++ b/man/boruta.Rd
@@ -2,38 +2,61 @@
 % Please edit documentation in R/boruta.R
 \name{boruta}
 \alias{boruta}
-\title{BORUTA algorithm for SEM trees}
+\alias{plot.boruta}
+\alias{print.boruta}
+\title{Run the Boruta algorithm on a SEM tree}
 \usage{
 boruta(
   model,
   data,
   control = NULL,
   predictors = NULL,
-  percentile_threshold = 100,
-  rounds = 1,
+  maxRuns = 30,
+  pAdjMethod = "none",
+  alpha = 0.05,
+  verbose = FALSE,
+  quant = 1,
   ...
 )
 }
 \arguments{
-\item{model}{A template model specification from \code{\link{OpenMx}} using
-the \code{\link{mxModel}} function (or a \code{\link[lavaan]{lavaan}} model
-using the \code{\link[lavaan]{lavaan}} function with option fit=FALSE).
-Model must be syntactically correct within the framework chosen, and
-converge to a solution.}
+\item{model}{A template SEM. Same as in \code{semtree}.}
 
-\item{data}{Data.frame used in the model creation using
-\code{\link{mxModel}} or \code{\link[lavaan]{lavaan}} are input here. Order
-of modeled variables and predictors is not important when providing a
-dataset to \code{semtree}.}
+\item{data}{A data.frame on which the Boruta search is run. Same as in
+\code{semtree}.}
 
-\item{control}{\code{\link{semtree}} model specifications from
-\code{\link{semtree.control}} are input here. Any changes from the default
-setting can be specified here.}
+\item{control}{A semforest control object to set forest parameters.}
 
-\item{percentile_threshold}{Numeric.}
+\item{predictors}{An optional list of covariates. See the \code{semtree} code
+example.}
 
-\item{rounds}{Numeric. Number of rounds of the BORUTA algorithm.}
+\item{maxRuns}{Maximum number of Boruta search cycles; must be at least 11.}
+
+\item{pAdjMethod}{A value from \code{\link[stats]{p.adjust.methods}} defining a
+multiple-testing correction method.}
+
+\item{alpha}{p-value cutoff for decision making. Default: .05.}
+
+\item{verbose}{Verbosity level for Boruta processing,
+similar to the same argument in \link{semtree.control} and
+\link{semforest.control}.}
+
+\item{quant}{Quantile of the shadow-feature importances used as the
+inclusion threshold (1 = the maximum shadow importance).}
+
+\item{\dots}{Optional arguments passed on to \code{semforest}.}
+}
+\value{
+A vim object with several elements.
+  Of particular note, `$importance` carries the mean importance;
+  `$decisions` denotes Confirmed/Rejected/Tentative;
+  `$impHistory` has the entire variable-importance history; and
+  `$details` has exit values for each predictor.
 }
 \description{
-BORUTA algorithm for SEM trees
+Grows a series of SEM forests following the Boruta algorithm to determine
+  the importance of features as moderators of the underlying model.
+}
+\seealso{
+\code{\link{semtree}} \code{\link{semforest}}
+}
+\author{
+Priyanka Paul, Timothy R. Brick, Andreas Brandmaier
 }
 \keyword{models}
 \keyword{multivariate}
 \keyword{tree}
diff --git a/man/semtree.control.Rd b/man/semtree.control.Rd
index 7ce8847..df9d886 100644
--- a/man/semtree.control.Rd
+++ b/man/semtree.control.Rd
@@ -8,7 +8,7 @@
 \usage{
 semtree.control(
   method = c("naive", "score", "fair", "fair3"),
-  min.N = 20,
+  min.N = NULL,
   max.depth = NA,
   alpha = 0.05,
   alpha.invariance = NA,
@@ -24,7 +24,7 @@ semtree.control(
   report.level = 0,
   exclude.code = NA,
   linear = TRUE,
-  min.bucket = 10,
+  min.bucket = NULL,
   naive.bonferroni.type = 0,
   missing = "ignore",
   use.maxlm = FALSE,
diff --git a/semtree.Rproj b/semtree.Rproj
index 21a4da0..eaa6b81 100644
--- a/semtree.Rproj
+++ b/semtree.Rproj
@@ -15,3 +15,4 @@ LaTeX: pdfLaTeX
 BuildType: Package
 PackageUseDevtools: Yes
 PackageInstallArgs: --no-multiarch --with-keep.source
+PackageRoxygenize: rd,collate,namespace
diff --git a/tests/borutaOneIndicator.R b/tests/borutaOneIndicator.R
new file mode 100644
index 0000000..7e73b27
--- /dev/null
+++ b/tests/borutaOneIndicator.R
@@ -0,0 +1,32 @@
+# Tests a simple boruta model on a basic one-indicator SEM
+
+library(OpenMx)
+library(semtree)
+
+N <- 1000
+influence <- c(1, 0)
+nPred <- length(influence)
+cutbreaks <- 3
+make_categorical <- TRUE
+
+
+genModel <- mxModel(type="RAM", manifestVars=c("Y", paste0("X", 1:nPred)),
+                    mxPath(paste0("X", 1:nPred), "Y", values=influence),
+                    mxPath(c("Y", paste0("X", 1:nPred)), arrows=2, values=1))
+simpleData <- mxGenerateData(genModel, N)
+if(make_categorical) {
+  for(i in paste0("X", 1:nPred)) {
+    simpleData[[i]] <- cut(simpleData[[i]], cutbreaks)
+    simpleData[[i]] <- mxFactor(simpleData[[i]], levels(simpleData[[i]]))
+  }
+}
+
+testModel <- mxModel("SimplisticModel", type="RAM", manifestVars="Y",
+                     mxPath("Y", arrows=2, values=1, free=TRUE, labels=c("Var")),
+                     mxPath("one", "Y", values=0, free=TRUE, labels=c("mu")),
+                     mxData(simpleData, type="raw"))
+
+control <- semforest.control(control=semtree.control(method="score", alpha=1))
+
+output <- boruta(testModel, simpleData, verbose=TRUE, control = control)
+output
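For reference, a minimal usage sketch of the new interface, continuing from the objects created in tests/borutaOneIndicator.R above (testModel, simpleData, control); the choice of pAdjMethod = "holm" is only an illustrative value from stats::p.adjust.methods:

library(OpenMx)
library(semtree)

# Run the Boruta search on the one-indicator example and inspect the outcome
res <- boruta(testModel, simpleData, control = control,
              pAdjMethod = "holm", verbose = TRUE)

res$details          # per-predictor decision, hit count, raw and adjusted p-values
res$importance       # mean importance across runs (shadow summaries included)

plot(res)            # type = 0: importance boxplots, predictors vs. shadow features
plot(res, type = 1)  # type = 1: importance trajectories across runs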