From ee252611e13067003cdd2492b048151d164277b0 Mon Sep 17 00:00:00 2001 From: trbrick Date: Mon, 24 Jun 2024 21:21:29 +0200 Subject: [PATCH 01/16] Updated boruta documentation --- DESCRIPTION | 2 +- NAMESPACE | 1 + R/boruta.R | 20 ++++++++++++++++++++ man/boruta.Rd | 39 +++++++++++++++++++++++++++++++++++++++ man/toTable.Rd | 4 ++-- semtree.Rproj | 1 + 6 files changed, 64 insertions(+), 3 deletions(-) create mode 100644 man/boruta.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 97dcece..3486b46 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -55,7 +55,7 @@ Encoding: UTF-8 LazyLoad: yes Version: 0.9.19 Date: 2023-03-03 -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 VignetteBuilder: knitr BugReports: https://github.com/brandmaier/semtree/issues URL: https://github.com/brandmaier/semtree diff --git a/NAMESPACE b/NAMESPACE index bc263c8..db0d57e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -32,6 +32,7 @@ S3method(strip,semtree) S3method(summary,semforest) S3method(summary,semtree) export(biodiversity) +export(boruta) export(diversityMatrix) export(evaluateTree) export(fitSubmodels) diff --git a/R/boruta.R b/R/boruta.R index 8ba9da3..5b6cb03 100644 --- a/R/boruta.R +++ b/R/boruta.R @@ -1,3 +1,23 @@ + +#' Run the Boruta algorithm on a sem tree +#' +#' Grows a series of SEM Forests following the boruta algorithm to determine +#' feature importance as moderators of the underlying model. +#' +#' +#' @aliases boruta plot.boruta print.boruta +#' @param model A template SEM. Same as in \code{semtree}. +#' @param data A dataframe to boruta on. Same as in \code{semtree}. +#' @param control A semforest control object to set forest parameters. +#' @param predictors An optional list of covariates. See semtree code example. +#' @param constraints An optional list of covariates. See semtree code example. +#' @param \dots Optional parameters. +#' @return A boruta object. +#' @author Priyanka Paul, Timothy R. Brick, Andreas Brandmaier +#' @seealso \code{\link{semtree}} \code{\link{semforest}} +#' +#' @keywords tree models multivariate +#' @export boruta <- function(model, data, control = NULL, diff --git a/man/boruta.Rd b/man/boruta.Rd new file mode 100644 index 0000000..4921d25 --- /dev/null +++ b/man/boruta.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/boruta.R +\name{boruta} +\alias{boruta} +\alias{plot.boruta} +\alias{print.boruta} +\title{Run the Boruta algorithm on a sem tree} +\usage{ +boruta(model, data, control = NULL, predictors = NULL, ...) +} +\arguments{ +\item{model}{A template SEM. Same as in \code{semtree}.} + +\item{data}{A dataframe to boruta on. Same as in \code{semtree}.} + +\item{control}{A semforest control object to set forest parameters.} + +\item{predictors}{An optional list of covariates. See semtree code example.} + +\item{\dots}{Optional parameters.} + +\item{constraints}{An optional list of covariates. See semtree code example.} +} +\value{ +A boruta object. +} +\description{ +Grows a series of SEM Forests following the boruta algorithm to determine + feature importance as moderators of the underlying model. +} +\seealso{ +\code{\link{semtree}} \code{\link{semforest}} +} +\author{ +Priyanka Paul, Timothy R. Brick, Andreas Brandmaier +} +\keyword{models} +\keyword{multivariate} +\keyword{tree} diff --git a/man/toTable.Rd b/man/toTable.Rd index 4fa608e..bd92d95 100644 --- a/man/toTable.Rd +++ b/man/toTable.Rd @@ -9,9 +9,9 @@ toTable(tree, added.param.cols = NULL, round.param = NULL) \arguments{ \item{tree}{A SEM Tree object.} -\item{added.param.cols}{Add extra columns with parameter estimates.} +\item{added.param.cols}{String. Add extra columns with parameter estimates. Pass a vector with the names of the parameters that should be rendered in the table.} -\item{round.param}{Number of digits to round parameter estiamtes} +\item{round.param}{Integer. Number of digits to round parameter estimates. Default is no rounding (NULL)} } \description{ Converts a tree into a tabular representation. This may be useful as a diff --git a/semtree.Rproj b/semtree.Rproj index 21a4da0..eaa6b81 100644 --- a/semtree.Rproj +++ b/semtree.Rproj @@ -15,3 +15,4 @@ LaTeX: pdfLaTeX BuildType: Package PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source +PackageRoxygenize: rd,collate,namespace From da0cfe4ac32646a126bc6132614896fa351274d8 Mon Sep 17 00:00:00 2001 From: trbrick Date: Mon, 24 Jun 2024 21:22:58 +0200 Subject: [PATCH 02/16] Added simple boruta test --- tests/borutaOneIndicator.R | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 tests/borutaOneIndicator.R diff --git a/tests/borutaOneIndicator.R b/tests/borutaOneIndicator.R new file mode 100644 index 0000000..be365a3 --- /dev/null +++ b/tests/borutaOneIndicator.R @@ -0,0 +1,20 @@ +# Tests a simple boruta model on a basic one-indicator SEM + +library(OpenMx) + +N <- 1000 +influence <- c(1, 0, 0, 0) +nPred <- length(influence) + + +genModel <- mxModel(type="RAM", manifestVars=c("Y", paste0("X", 1:nPred)), + mxPath(paste0("X", 1:nPred), "Y", values=influence), + mxPath(c("Y", paste0("X", 1:nPred)), arrows=2, values=1)) +simpleData <- mxGenerateData(genModel, 1000) + +testModel <- mxModel("SimplisticModel", type="RAM", manifestVars="Y", + mxPath("Y", arrows=2, values=1, free=TRUE), + mxPath("one", "Y", values=0, free=TRUE), + mxData(simpleData, type="raw")) + +boruta(testModel, simpleData, verbose=TRUE) \ No newline at end of file From 56695bb311ac3712b96509d60c3ad685dfbc3d2a Mon Sep 17 00:00:00 2001 From: trbrick Date: Mon, 24 Jun 2024 21:58:19 +0200 Subject: [PATCH 03/16] Minor updates to simplify the boruta test. --- tests/borutaOneIndicator.R | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/borutaOneIndicator.R b/tests/borutaOneIndicator.R index be365a3..b159120 100644 --- a/tests/borutaOneIndicator.R +++ b/tests/borutaOneIndicator.R @@ -3,18 +3,23 @@ library(OpenMx) N <- 1000 -influence <- c(1, 0, 0, 0) +influence <- c(1, 0) nPred <- length(influence) +cutbreaks <- 3 genModel <- mxModel(type="RAM", manifestVars=c("Y", paste0("X", 1:nPred)), mxPath(paste0("X", 1:nPred), "Y", values=influence), mxPath(c("Y", paste0("X", 1:nPred)), arrows=2, values=1)) simpleData <- mxGenerateData(genModel, 1000) +for(i in paste0("X", 1:nPred)) { + simpleData[[i]] <- cut(simpleData[[i]], cutbreaks) +} testModel <- mxModel("SimplisticModel", type="RAM", manifestVars="Y", mxPath("Y", arrows=2, values=1, free=TRUE), mxPath("one", "Y", values=0, free=TRUE), mxData(simpleData, type="raw")) -boruta(testModel, simpleData, verbose=TRUE) \ No newline at end of file +output <- boruta(testModel, simpleData, verbose=TRUE) +output \ No newline at end of file From 47f18e11d5c000b5d65260c79819e7362c009e91 Mon Sep 17 00:00:00 2001 From: trbrick Date: Mon, 24 Jun 2024 22:00:52 +0200 Subject: [PATCH 04/16] Fixed a bug in borutaOneIndicator test --- tests/borutaOneIndicator.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/borutaOneIndicator.R b/tests/borutaOneIndicator.R index b159120..52b73ca 100644 --- a/tests/borutaOneIndicator.R +++ b/tests/borutaOneIndicator.R @@ -11,7 +11,7 @@ cutbreaks <- 3 genModel <- mxModel(type="RAM", manifestVars=c("Y", paste0("X", 1:nPred)), mxPath(paste0("X", 1:nPred), "Y", values=influence), mxPath(c("Y", paste0("X", 1:nPred)), arrows=2, values=1)) -simpleData <- mxGenerateData(genModel, 1000) +simpleData <- mxGenerateData(genModel, N) for(i in paste0("X", 1:nPred)) { simpleData[[i]] <- cut(simpleData[[i]], cutbreaks) } From 6a705eaa475362146b6843954bbe3074c735c43d Mon Sep 17 00:00:00 2001 From: trbrick Date: Mon, 1 Jul 2024 15:52:33 +0200 Subject: [PATCH 05/16] TODO notes on the meeting --- R/boruta.R | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/R/boruta.R b/R/boruta.R index 5b6cb03..a2d44df 100644 --- a/R/boruta.R +++ b/R/boruta.R @@ -40,6 +40,8 @@ boruta <- function(model, ui_stop("Unknown model type selected. Use OpenMx or lavaanified lavaan models!"); } + # TODO: Loop over runs has to start here + # stage 1 - create shadow features shadow.ids <- (ncol(data)+1):(ncol(data)+length(covariate.ids)) @@ -66,11 +68,31 @@ boruta <- function(model, max_shadow_importance <- max(agvim[names(agvim)%in%shadow_names]) agvim_filtered <- agvim[!(names(agvim)%in%shadow_names)] + # TODO: Apply binomial test across feature importance values: + # Code below from the Boruta package: + # decReg is the decision registry: one of "Tentative" "Accept" or "Reject" + # hitReg is a list the length of all features. + # hitReg starts at all zeros, and increments each run that a given feature has + # an importance greater than shadowmax. + # toAccept<-stats::p.adjust(stats::pbinom(hitReg-1,runs,0.5,lower.tail=FALSE),method=pAdjMethod)toAccept + # toReject<-stats::p.adjust(stats::pbinom(hitReg,runs,0.5,lower.tail=TRUE),method=pAdjMethod)toReject + # + # The biasing here means that there are no decisions without correction before 5 runs + # and no decisions with Bonferroni before 7 runs. + + # TODO: Parallelize the first five runs. + df<-data.frame(importance=agvim_filtered, predictor=names(agvim_filtered)) - vim$filter <- agvim_filtered>max_shadow_importance + # TODO: track importance history + # vim$importanceHistory <- + vim$filter <- agvim_filtered>max_shadow_importance # Turns into hitreg vim$boruta <- TRUE vim$boruta_threshold = max_shadow_importance + # TODO: Loop ends here with some reporting. + return(vim) } From f7bfe6d981fc7c810adff9bfec8863d6df83e981 Mon Sep 17 00:00:00 2001 From: trbrick Date: Wed, 3 Jul 2024 15:40:20 +0200 Subject: [PATCH 06/16] .gitignore .DS_Store (MacOS search index) files. --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 47281dc..7836fb7 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ inst/doc doc Meta +.DS_Store From 259814ba371f3466e99e692c9e12c363fbc9d8c5 Mon Sep 17 00:00:00 2001 From: trbrick Date: Wed, 3 Jul 2024 15:53:13 +0200 Subject: [PATCH 07/16] Converted to score-test splitting. --- tests/borutaOneIndicator.R | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/tests/borutaOneIndicator.R b/tests/borutaOneIndicator.R index 52b73ca..6979bfc 100644 --- a/tests/borutaOneIndicator.R +++ b/tests/borutaOneIndicator.R @@ -1,25 +1,32 @@ # Tests a simple boruta model on a basic one-indicator SEM library(OpenMx) +library(semtree) N <- 1000 influence <- c(1, 0) nPred <- length(influence) cutbreaks <- 3 +categorical <- FALSE genModel <- mxModel(type="RAM", manifestVars=c("Y", paste0("X", 1:nPred)), mxPath(paste0("X", 1:nPred), "Y", values=influence), mxPath(c("Y", paste0("X", 1:nPred)), arrows=2, values=1)) simpleData <- mxGenerateData(genModel, N) -for(i in paste0("X", 1:nPred)) { - simpleData[[i]] <- cut(simpleData[[i]], cutbreaks) +if(make_categorical) { + for(i in paste0("X", 1:nPred)) { + simpleData[[i]] <- cut(simpleData[[i]], cutbreaks) + simpleData[[i]] <- mxFactor(simpleData[[i]], levels(simpleData[[i]])) + } } testModel <- mxModel("SimplisticModel", type="RAM", manifestVars="Y", - mxPath("Y", arrows=2, values=1, free=TRUE), - mxPath("one", "Y", values=0, free=TRUE), + mxPath("Y", arrows=2, values=1, free=TRUE, labels=c("Var")), + mxPath("one", "Y", values=0, free=TRUE, labels=c("mu")), mxData(simpleData, type="raw")) -output <- boruta(testModel, simpleData, verbose=TRUE) -output \ No newline at end of file +control <- semforest.control(control=semtree.control(method="score", alpha=1, verbose=T)) + +output <- boruta(testModel, simpleData, verbose=TRUE, control = control) +output From ae525a2c93aacc2e8687ff2db2fd8a5e0fa30ae1 Mon Sep 17 00:00:00 2001 From: trbrick Date: Wed, 3 Jul 2024 15:53:13 +0200 Subject: [PATCH 08/16] Converted to score-test splitting. --- tests/borutaOneIndicator.R | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/tests/borutaOneIndicator.R b/tests/borutaOneIndicator.R index 52b73ca..f8c03b1 100644 --- a/tests/borutaOneIndicator.R +++ b/tests/borutaOneIndicator.R @@ -1,25 +1,32 @@ # Tests a simple boruta model on a basic one-indicator SEM library(OpenMx) +library(semtree) N <- 1000 influence <- c(1, 0) nPred <- length(influence) cutbreaks <- 3 +make_categorical <- FALSE genModel <- mxModel(type="RAM", manifestVars=c("Y", paste0("X", 1:nPred)), mxPath(paste0("X", 1:nPred), "Y", values=influence), mxPath(c("Y", paste0("X", 1:nPred)), arrows=2, values=1)) simpleData <- mxGenerateData(genModel, N) -for(i in paste0("X", 1:nPred)) { - simpleData[[i]] <- cut(simpleData[[i]], cutbreaks) +if(make_categorical) { + for(i in paste0("X", 1:nPred)) { + simpleData[[i]] <- cut(simpleData[[i]], cutbreaks) + simpleData[[i]] <- mxFactor(simpleData[[i]], levels(simpleData[[i]])) + } } testModel <- mxModel("SimplisticModel", type="RAM", manifestVars="Y", - mxPath("Y", arrows=2, values=1, free=TRUE), - mxPath("one", "Y", values=0, free=TRUE), + mxPath("Y", arrows=2, values=1, free=TRUE, labels=c("Var")), + mxPath("one", "Y", values=0, free=TRUE, labels=c("mu")), mxData(simpleData, type="raw")) -output <- boruta(testModel, simpleData, verbose=TRUE) -output \ No newline at end of file +control <- semforest.control(control=semtree.control(method="score", alpha=1, verbose=T)) + +output <- boruta(testModel, simpleData, verbose=TRUE, control = control) +output From 708541c5fbedf6c4ddfda972162580a5dfb07d02 Mon Sep 17 00:00:00 2001 From: trbrick Date: Wed, 3 Jul 2024 23:59:05 +0200 Subject: [PATCH 09/16] Boruta test update: default to categorical; disable verbosity. --- tests/borutaOneIndicator.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/borutaOneIndicator.R b/tests/borutaOneIndicator.R index f8c03b1..7e73b27 100644 --- a/tests/borutaOneIndicator.R +++ b/tests/borutaOneIndicator.R @@ -7,7 +7,7 @@ N <- 1000 influence <- c(1, 0) nPred <- length(influence) cutbreaks <- 3 -make_categorical <- FALSE +make_categorical <- TRUE genModel <- mxModel(type="RAM", manifestVars=c("Y", paste0("X", 1:nPred)), @@ -26,7 +26,7 @@ testModel <- mxModel("SimplisticModel", type="RAM", manifestVars="Y", mxPath("one", "Y", values=0, free=TRUE, labels=c("mu")), mxData(simpleData, type="raw")) -control <- semforest.control(control=semtree.control(method="score", alpha=1, verbose=T)) +control <- semforest.control(control=semtree.control(method="score", alpha=1)) output <- boruta(testModel, simpleData, verbose=TRUE, control = control) output From 120e69f837c8b1eb9cb8a69ba5394cfba9c0e85f Mon Sep 17 00:00:00 2001 From: trbrick Date: Wed, 3 Jul 2024 23:59:35 +0200 Subject: [PATCH 10/16] Added looping and binomial tests. --- R/boruta.R | 145 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 101 insertions(+), 44 deletions(-) diff --git a/R/boruta.R b/R/boruta.R index a2d44df..7ced65c 100644 --- a/R/boruta.R +++ b/R/boruta.R @@ -22,6 +22,10 @@ boruta <- function(model, data, control = NULL, predictors = NULL, + maxRuns = 30, + pAdjMethod = "none", + alpha = .05, + verbose=FALSE, ...) { @@ -40,59 +44,112 @@ boruta <- function(model, ui_stop("Unknown model type selected. Use OpenMx or lavaanified lavaan models!"); } - # TODO: Loop over runs has to start here + # Checks on x & y from the boruta package + if(length(grep('^shadow',covariate.ids)>0)) + stop('Attributes with names starting from "shadow" are reserved for internal use. Please rename them.') + if(maxRuns<11) + stop('maxRuns must be greater than 10.') + if(!pAdjMethod %in% stats::p.adjust.methods) + stop(c('P-value adjustment method not found. Must be one of:', + stats::p.adjust.methods)) - # stage 1 - create shadow features - - shadow.ids <- (ncol(data)+1):(ncol(data)+length(covariate.ids)) - - for (cur_cov_id in covariate.ids) { - # pick column and shuffle - temp_column <- data[, cur_cov_id] - temp_column <- sample(temp_column, length(temp_column), replace=FALSE) - # add to dataset as shadow feature - temp_colname <- paste0("shadow_", names(data)[cur_cov_id],collapse="") - data[temp_colname] <- temp_column - if (!is.null(predictors)) predictors <- c(predictors, temp_colname) + # Might clash with some other semtrees stuff + if(is.null(predictors)) { + predictors <- names(data)[covariate.ids] } - # run the forest - forest <- semforest(model, data, control, predictors, ...) - - # run variable importance - vim <- varimp(forest) - - # get variable importance from shadow features - shadow_names <- names(data)[shadow.ids] - agvim <- aggregateVarimp(vim, aggregate="mean") - max_shadow_importance <- max(agvim[names(agvim)%in%shadow_names]) - agvim_filtered <- agvim[!(names(agvim)%in%shadow_names)] - - # TODO: Apply binomial test across feature importance values: - # Code below from the Boruta package: - # decReg is the decision registry: one of "Tentative" "Accept" or "Reject" - # hitReg is a list the length of all features. - # hitReg starts at all zeros, and increments each run that a given feature has - # an importance greater than shadowmax. - # toAccept<-stats::p.adjust(stats::pbinom(hitReg-1,runs,0.5,lower.tail=FALSE),method=pAdjMethod)toAccept - # toReject<-stats::p.adjust(stats::pbinom(hitReg,runs,0.5,lower.tail=TRUE),method=pAdjMethod)toReject - # - # The biasing here means that there are no decisions without correction before 5 runs - # and no decisions with Bonferroni before 7 runs. - + # Initialize and then loop over runs: + impHistory <- data.frame(matrix(NA, nrow=0, ncol=length(predictors)+3)) + names(impHistory) <- c(predictors, "shadowMin", "shadowMean", "shadowMax") + decisionList <- data.frame(predictor=predictors, decision = "Tentative", + importance = NA, hitCount = 0, raw.p=NA, adjusted.p=NA) + # TODO: Parallelize the first five runs. + for(runNo in 1:maxRuns) { + if(verbose) { + message(paste("Beginning Run", runNo)) + } + + # stage 1 - create shadow features + rejected <- decisionList$predictor[decisionList$decision == "Rejected"] + current.predictors <- setdiff(predictors, rejected) + current.covariate.ids <- setdiff(covariate.ids, names(data) %in% rejected) + current.data <- data[, setdiff(names(data), rejected)] + + shadow.ids <- (ncol(current.data)+1):(ncol(current.data)+length(current.covariate.ids)) + + for (cur_cov_id in current.covariate.ids) { + # pick column and shuffle + temp_column <- current.data[, cur_cov_id] + temp_column <- sample(temp_column, length(temp_column), replace=FALSE) + # add to dataset as shadow feature + temp_colname <- paste0("shadow_", names(current.data)[cur_cov_id],collapse="") + current.data[temp_colname] <- temp_column + if (!is.null(current.predictors)) current.predictors <- c(current.predictors, temp_colname) + } + + # TODO: Pre-run model if needed. + + # run the forest + forest <- semforest(model, current.data, control, current.predictors, ...) + + # run variable importance + vim <- varimp(forest) - df<-data.frame(importance=agvim_filtered, predictor=names(agvim_filtered)) + # get variable importance from shadow features + shadow_names <- names(current.data)[shadow.ids] + agvim <- aggregateVarimp(vim, aggregate="mean") + + # Compute shadow stats + shadow_importances <- agvim[names(agvim)%in%shadow_names] + impHistory[runNo, "shadowMax"] <- max_shadow_importance <- max(shadow_importances) + impHistory[runNo, "shadowMin"] <- min(shadow_importances) + impHistory[runNo, "shadowMean"] <- mean(shadow_importances) + agvim_filtered <- agvim[!(names(agvim)%in%shadow_names)] + impHistory[runNo, names(agvim_filtered)] <- agvim_filtered + + # Compute "hits" + hits <- decisionList$predictor %in% names(agvim_filtered[agvim_filtered>max_shadow_importance]) + decisionList$hitCount[hits] <- decisionList$hitCount[hits] + 1 + + # Run tests. + # The biasing here means that there are no decisions without correction + # before 5 runs and no decisions with Bonferroni before 7 runs. + + # Run confirmation tests (pulled from Boruta package) + newPs <- stats::pbinom(decisionList$hitCount-1,runNo,0.5,lower.tail=FALSE) + adjPs <- stats::p.adjust(newPs,method=pAdjMethod) + acceptable <- adjPs < alpha + updateList <- acceptable & decisionList$decision == "Tentative" + decisionList$raw.p[updateList] <- newPs[updateList] + decisionList$adjusted.p[updateList] <- adjPs[updateList] + decisionList$decision[updateList] <- "Confirmed" + + # Run rejection tests (pulled from Boruta package) + newPs <- stats::pbinom(decisionList$hitCount,runNo,0.5,lower.tail=TRUE) + adjPs <- stats::p.adjust(newPs,method=pAdjMethod) + acceptable <- adjPs < alpha + updateList <- acceptable & decisionList$decision == "Tentative" + decisionList$raw.p[updateList] <- newPs[updateList] + decisionList$adjusted.p[updateList] <- adjPs[updateList] + decisionList$decision[updateList] <- "Rejected" + + if(!any(decisionList$decision == "Tentative")) { + break + } + + } - # TODO: track importance history - # vim$importanceHistory <- - vim$filter <- agvim_filtered>max_shadow_importance # Turns into hitreg + vim$importance <- colMeans(impHistory, na.rm = TRUE) + vim$impHistory <- impHistory + vim$decisions <- decisionList$decision + vim$details <- decisionList + + vim$filter <- decisionList$decision == "Confirmed" # Turns into hitreg vim$boruta <- TRUE - vim$boruta_threshold = max_shadow_importance # TODO: Loop ends here with some reporting. return(vim) } + From 41af79067b6ff73a3bbb7dac6e4a7bc56c80ca7b Mon Sep 17 00:00:00 2001 From: trbrick Date: Thu, 4 Jul 2024 00:12:24 +0200 Subject: [PATCH 11/16] Documentation improvements. --- R/boruta.R | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/R/boruta.R b/R/boruta.R index 7ced65c..cfe0ff5 100644 --- a/R/boruta.R +++ b/R/boruta.R @@ -10,9 +10,19 @@ #' @param data A dataframe to boruta on. Same as in \code{semtree}. #' @param control A semforest control object to set forest parameters. #' @param predictors An optional list of covariates. See semtree code example. -#' @param constraints An optional list of covariates. See semtree code example. -#' @param \dots Optional parameters. -#' @return A boruta object. +#' @param maxRuns Maximum number of boruta search cycles +#' @param pAdjMethod A value from \link{stats::p.adjust.methods} defining a +#' multiple testing correction method +#' @param alpha p-value cutoff for decisionmaking. Default .05 +#' @param verbose Verbosity level for boruta processing +#' similar to the same argument in \link{semtree.control} and +#' \link{semforest.control} +#' @param \dots Optional parameters to undefined subfunctions +#' @return A vim object with several elements that need work. +#' Of particular note, `$importance` carries mean importance; +#' `$decision` denotes Accepted/Rejected/Tentative; +#' `$impHistory` has the entire varimp history; and +#' `$details` has exit values for each parameter. #' @author Priyanka Paul, Timothy R. Brick, Andreas Brandmaier #' @seealso \code{\link{semtree}} \code{\link{semforest}} #' @@ -62,7 +72,7 @@ boruta <- function(model, impHistory <- data.frame(matrix(NA, nrow=0, ncol=length(predictors)+3)) names(impHistory) <- c(predictors, "shadowMin", "shadowMean", "shadowMax") decisionList <- data.frame(predictor=predictors, decision = "Tentative", - importance = NA, hitCount = 0, raw.p=NA, adjusted.p=NA) + hitCount = 0, raw.p=NA, adjusted.p=NA) # TODO: Parallelize the first five runs. for(runNo in 1:maxRuns) { From 731bebbaaf6fcb080e256e01e94838975c847fc0 Mon Sep 17 00:00:00 2001 From: LeonieHagitte Date: Thu, 5 Sep 2024 14:35:20 +0200 Subject: [PATCH 12/16] improved plotting functionality for BORUTA fixed coercion problem in tables --- DESCRIPTION | 3 +- R/boruta.R | 102 ++++++++++++++++++----------------------- R/toTable.R | 9 ++-- man/boruta.Rd | 31 +++++++++++-- man/semtree.control.Rd | 4 +- 5 files changed, 80 insertions(+), 69 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2c97d4f..7ccbd7f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -26,7 +26,8 @@ Imports: future.apply, data.table, expm, - gridBase + gridBase, + dplyr Suggests: knitr, rmarkdown, diff --git a/R/boruta.R b/R/boruta.R index 0ba714b..ec87a56 100644 --- a/R/boruta.R +++ b/R/boruta.R @@ -27,7 +27,6 @@ #' #' @keywords tree models multivariate #' @export - boruta <- function(model, data, control = NULL, @@ -38,63 +37,21 @@ boruta <- function(model, verbose=FALSE, ...) { - # initial checks - stopifnot(percentile_threshold>=0) - stopifnot(percentile_threshold<=1) - stopifnot(is.numeric(rounds)) - stopifnot(rounds>0) - - preds_important <- c() - preds_unimportant <- c() - - cur_round = 1 - temp_vims <- list() - - while(cur_round <= rounds) { - vim_boruta <- .boruta(model=model, - data=data, - control=control, - predictors=predictors, - percentile_threshold = percentile_threshold, - ...) - browser() - # add predictors to list of unimportant variables - preds_unimportant <- c(preds_unimportant, names(vim_boruta$filter)[!vim_boruta$filter]) - # remove them from the dataset - data <- data[, -c(preds_unimportant)] - temp_vims[[cur_round]] <-vim_boruta - } - - result <- list( - preds_unimportant, - rounds = rounds - ) - return(result) -} - -.boruta <- function(model, - data, - control = NULL, - predictors = NULL, - percentile_threshold = 1, - num_shadows = 1, - ...) { - - # make sure that no column names start with "shadow_" prefix - stopifnot(all(sapply(names(data), function(x) {!startsWith(x, "shadow_")}))) - # detect model (warning: duplicated code) - if (inherits(model, "MxModel") || inherits(model, "MxRAMModel")) { - tmp <- getPredictorsOpenMx(mxmodel = model, dataset = data, covariates = predictors) - - } else if (inherits(model,"lavaan")){ + if (inherits(model,"MxModel") || inherits(model,"MxRAMModel")) { + tmp <- getPredictorsOpenMx(mxmodel=model, dataset=data, covariates=predictors) + model.ids <- tmp[[1]] + covariate.ids <- tmp[[2]] + # } else if (inherits(model,"lavaan")){ - tmp <- getPredictorsLavaan(model, data, predictors) + # } else if ((inherits(model,"ctsemFit")) || (inherits(model,"ctsemInit"))) { + # } else { - ui_stop("Unknown model type selected. Use OpenMx or lavaanified lavaan models!") + ui_stop("Unknown model type selected. Use OpenMx or lavaanified lavaan models!"); } - + + # initial checks # Checks on x & y from the boruta package if(length(grep('^shadow',covariate.ids)>0)) stop('Attributes with names starting from "shadow" are reserved for internal use. Please rename them.') @@ -114,7 +71,7 @@ boruta <- function(model, names(impHistory) <- c(predictors, "shadowMin", "shadowMean", "shadowMax") decisionList <- data.frame(predictor=predictors, decision = "Tentative", hitCount = 0, raw.p=NA, adjusted.p=NA) - + # TODO: Parallelize the first five runs. for(runNo in 1:maxRuns) { if(verbose) { @@ -143,10 +100,10 @@ boruta <- function(model, # run the forest forest <- semforest(model, current.data, control, current.predictors, ...) - + # run variable importance vim <- varimp(forest) - + # get variable importance from shadow features shadow_names <- names(current.data)[shadow.ids] agvim <- aggregateVarimp(vim, aggregate="mean") @@ -195,12 +152,43 @@ boruta <- function(model, vim$impHistory <- impHistory vim$decisions <- decisionList$decision vim$details <- decisionList - + vim$filter <- decisionList$decision == "Confirmed" # Turns into hitreg vim$boruta <- TRUE + class(vim) <- "boruta" + # TODO: Loop ends here with some reporting. return(vim) } +#' @exportS3Method plot boruta +plot.boruta = function(vim, type=0, ...) { + decisionList = vim$details + impHistory = vim$impHistory + impHistory <- impHistory |> + dplyr::mutate(rnd=1:nrow(impHistory)) |> + tidyr::pivot_longer(cols = -last_col()) |> #everything()) |> + dplyr::left_join(data.frame(decisionList), by=join_by("name"=="predictor")) |> + dplyr::mutate(decision = case_when(is.na(decision)~"Shadow", .default=decision)) |> + dplyr::group_by(name) |> mutate(median_value = median(value,na.rm=TRUE)) + + if (type==0) { + ggplot2::ggplot(impHistory, + aes(x=stats::reorder(name, median_value), + y=value, color=decision)) + + ggplot2::geom_boxplot()+ + ggplot2::xlab("")+ + ggplot2::ylab("Importance")+ + scale_color_discrete(name = "Predictor")+ + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + } else if (type==1) { + ggplot2::ggplot(impHistory, + aes(x=rnd, y=value,group=name,col=name))+geom_line()+ geom_hline(aes(yintercept=median_value,col=name),lwd=2)+ + xlab("Round")+ylab("Importance")+scale_color_discrete(name = "Predictor") + } else { + stop("Unknown graph type. Please choose 0 or 1.") + } + +} \ No newline at end of file diff --git a/R/toTable.R b/R/toTable.R index 337c235..2be2bf7 100644 --- a/R/toTable.R +++ b/R/toTable.R @@ -66,7 +66,6 @@ alls <- unique(alls) # create table -#covariate.names <-simplify2array(tree$result$btn.matrix[2,]) covariate.names <- getCovariatesFromTree(tree) # default is to display all parameters @@ -79,10 +78,12 @@ all.names <- c(covariate.names, added.param.cols) str.matrix <- matrix(NA, nrow = length(rowdata),ncol=length(all.names)) +# convert to a data frame to avoid coercion to string +str.matrix <- data.frame(str.matrix) + colnames(str.matrix) <- all.names for (i in 1:length(rowdata)) { -# result.row <- rep(" ",length(covariate.names)) myrow <- rowdata[[i]][[1]] for (j in 1:length(myrow)) { myitem <- myrow[[j]] @@ -126,7 +127,6 @@ for (i in 1:length(rowdata)) { } } -# result.string <- paste(result.string,paste(result.row,collapse="\t"),"\n") } ## prune empty columns? @@ -139,7 +139,7 @@ if (length(is.col.empty)>0) { sortby <- apply(str.matrix,2,function(x){sum(!is.na(x))}) if (!is.null(added.param.cols)) { remids <- (dim(str.matrix)[2]-length(added.param.cols)+1):(dim(str.matrix)[2]) - sortby[remids] <- sortby[remids]-999999 + sortby[remids] <- sortby[remids]-999999 # bad style ^^ #TODO } sort.ix <- sort(sortby,index.return=TRUE,decreasing = TRUE)$ix str.matrix <- str.matrix[, sort.ix] @@ -156,7 +156,6 @@ str.matrix[is.na(str.matrix)]<-"" ## and display #cat(result.string) -#require("openxlsx") return(str.matrix) diff --git a/man/boruta.Rd b/man/boruta.Rd index 4921d25..6a8cc8a 100644 --- a/man/boruta.Rd +++ b/man/boruta.Rd @@ -6,7 +6,17 @@ \alias{print.boruta} \title{Run the Boruta algorithm on a sem tree} \usage{ -boruta(model, data, control = NULL, predictors = NULL, ...) +boruta( + model, + data, + control = NULL, + predictors = NULL, + maxRuns = 30, + pAdjMethod = "none", + alpha = 0.05, + verbose = FALSE, + ... +) } \arguments{ \item{model}{A template SEM. Same as in \code{semtree}.} @@ -17,12 +27,25 @@ boruta(model, data, control = NULL, predictors = NULL, ...) \item{predictors}{An optional list of covariates. See semtree code example.} -\item{\dots}{Optional parameters.} +\item{maxRuns}{Maximum number of boruta search cycles} -\item{constraints}{An optional list of covariates. See semtree code example.} +\item{pAdjMethod}{A value from \link{stats::p.adjust.methods} defining a +multiple testing correction method} + +\item{alpha}{p-value cutoff for decisionmaking. Default .05} + +\item{verbose}{Verbosity level for boruta processing +similar to the same argument in \link{semtree.control} and +\link{semforest.control}} + +\item{\dots}{Optional parameters to undefined subfunctions} } \value{ -A boruta object. +A vim object with several elements that need work. + Of particular note, `$importance` carries mean importance; + `$decision` denotes Accepted/Rejected/Tentative; + `$impHistory` has the entire varimp history; and + `$details` has exit values for each parameter. } \description{ Grows a series of SEM Forests following the boruta algorithm to determine diff --git a/man/semtree.control.Rd b/man/semtree.control.Rd index 7ce8847..df9d886 100644 --- a/man/semtree.control.Rd +++ b/man/semtree.control.Rd @@ -8,7 +8,7 @@ \usage{ semtree.control( method = c("naive", "score", "fair", "fair3"), - min.N = 20, + min.N = NULL, max.depth = NA, alpha = 0.05, alpha.invariance = NA, @@ -24,7 +24,7 @@ semtree.control( report.level = 0, exclude.code = NA, linear = TRUE, - min.bucket = 10, + min.bucket = NULL, naive.bonferroni.type = 0, missing = "ignore", use.maxlm = FALSE, From eb271f431024275a01b151bb1665594f1d3e744c Mon Sep 17 00:00:00 2001 From: LeonieHagitte Date: Fri, 6 Sep 2024 10:27:55 +0200 Subject: [PATCH 13/16] extension of BORUTA --- NAMESPACE | 1 + R/boruta.R | 22 +++++++++++++--------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 0158412..bf7c43e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,6 +8,7 @@ S3method(merge,semforest) S3method(nobs,semtree) S3method(partialDependence,semforest) S3method(partialDependence,semforest_stripped) +S3method(plot,boruta) S3method(plot,diversityMatrix) S3method(plot,partialDependence) S3method(plot,semforest.proximity) diff --git a/R/boruta.R b/R/boruta.R index ec87a56..71059c1 100644 --- a/R/boruta.R +++ b/R/boruta.R @@ -43,10 +43,10 @@ boruta <- function(model, tmp <- getPredictorsOpenMx(mxmodel=model, dataset=data, covariates=predictors) model.ids <- tmp[[1]] covariate.ids <- tmp[[2]] - # } else if (inherits(model,"lavaan")){ - - # } else if ((inherits(model,"ctsemFit")) || (inherits(model,"ctsemInit"))) { - # + } else if (inherits(model,"lavaan")){ + tmp <- getPredictorsLavaan(model, dataset=data, covariates=predictors) + model.ids <- tmp[[1]] + covariate.ids <- tmp[[2]] } else { ui_stop("Unknown model type selected. Use OpenMx or lavaanified lavaan models!"); } @@ -172,7 +172,7 @@ plot.boruta = function(vim, type=0, ...) { tidyr::pivot_longer(cols = -last_col()) |> #everything()) |> dplyr::left_join(data.frame(decisionList), by=join_by("name"=="predictor")) |> dplyr::mutate(decision = case_when(is.na(decision)~"Shadow", .default=decision)) |> - dplyr::group_by(name) |> mutate(median_value = median(value,na.rm=TRUE)) + dplyr::group_by(name) |> dplyr::mutate(median_value = median(value,na.rm=TRUE)) if (type==0) { ggplot2::ggplot(impHistory, @@ -181,12 +181,16 @@ plot.boruta = function(vim, type=0, ...) { ggplot2::geom_boxplot()+ ggplot2::xlab("")+ ggplot2::ylab("Importance")+ - scale_color_discrete(name = "Predictor")+ - theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggplot2::scale_color_discrete(name = "Decision")+ + ggplot2::theme(axis.text.x = element_text(angle = 45, hjust = 1)) } else if (type==1) { ggplot2::ggplot(impHistory, - aes(x=rnd, y=value,group=name,col=name))+geom_line()+ geom_hline(aes(yintercept=median_value,col=name),lwd=2)+ - xlab("Round")+ylab("Importance")+scale_color_discrete(name = "Predictor") + ggplot2::aes(x=rnd, y=value,group=name,col=name))+ + ggplot2::geom_line()+ + ggplot2::geom_hline(aes(yintercept=median_value,col=name),lwd=2)+ + ggplot2::xlab("Round")+ + ggplot2::ylab("Importance")+ + ggplot2::scale_color_discrete(name = "Predictor") } else { stop("Unknown graph type. Please choose 0 or 1.") } From 222cb97f91c6444af8afce603d47517a01331c13 Mon Sep 17 00:00:00 2001 From: LeonieHagitte Date: Fri, 6 Sep 2024 10:42:38 +0200 Subject: [PATCH 14/16] added imports reformatting --- R/boruta.R | 186 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 113 insertions(+), 73 deletions(-) diff --git a/R/boruta.R b/R/boruta.R index 71059c1..066de45 100644 --- a/R/boruta.R +++ b/R/boruta.R @@ -1,30 +1,30 @@ #' Run the Boruta algorithm on a sem tree -#' +#' #' Grows a series of SEM Forests following the boruta algorithm to determine #' feature importance as moderators of the underlying model. -#' -#' +#' +#' #' @aliases boruta plot.boruta print.boruta #' @param model A template SEM. Same as in \code{semtree}. #' @param data A dataframe to boruta on. Same as in \code{semtree}. #' @param control A semforest control object to set forest parameters. #' @param predictors An optional list of covariates. See semtree code example. #' @param maxRuns Maximum number of boruta search cycles -#' @param pAdjMethod A value from \link{stats::p.adjust.methods} defining a +#' @param pAdjMethod A value from \link{stats::p.adjust.methods} defining a #' multiple testing correction method #' @param alpha p-value cutoff for decisionmaking. Default .05 -#' @param verbose Verbosity level for boruta processing -#' similar to the same argument in \link{semtree.control} and +#' @param verbose Verbosity level for boruta processing +#' similar to the same argument in \link{semtree.control} and #' \link{semforest.control} #' @param \dots Optional parameters to undefined subfunctions -#' @return A vim object with several elements that need work. -#' Of particular note, `$importance` carries mean importance; +#' @return A vim object with several elements that need work. +#' Of particular note, `$importance` carries mean importance; #' `$decision` denotes Accepted/Rejected/Tentative; #' `$impHistory` has the entire varimp history; and #' `$details` has exit values for each parameter. #' @author Priyanka Paul, Timothy R. Brick, Andreas Brandmaier #' @seealso \code{\link{semtree}} \code{\link{semforest}} -#' +#' #' @keywords tree models multivariate #' @export boruta <- function(model, @@ -34,99 +34,124 @@ boruta <- function(model, maxRuns = 30, pAdjMethod = "none", alpha = .05, - verbose=FALSE, + verbose = FALSE, ...) { - - # detect model (warning: duplicated code) - if (inherits(model,"MxModel") || inherits(model,"MxRAMModel")) { - tmp <- getPredictorsOpenMx(mxmodel=model, dataset=data, covariates=predictors) + if (inherits(model, "MxModel") || inherits(model, "MxRAMModel")) { + tmp <- + getPredictorsOpenMx(mxmodel = model, + dataset = data, + covariates = predictors) model.ids <- tmp[[1]] covariate.ids <- tmp[[2]] - } else if (inherits(model,"lavaan")){ - tmp <- getPredictorsLavaan(model, dataset=data, covariates=predictors) + } else if (inherits(model, "lavaan")) { + tmp <- + getPredictorsLavaan(model, dataset = data, covariates = predictors) model.ids <- tmp[[1]] covariate.ids <- tmp[[2]] } else { - ui_stop("Unknown model type selected. Use OpenMx or lavaanified lavaan models!"); + ui_stop("Unknown model type selected. Use OpenMx or lavaanified lavaan models!") + } # initial checks # Checks on x & y from the boruta package - if(length(grep('^shadow',covariate.ids)>0)) - stop('Attributes with names starting from "shadow" are reserved for internal use. Please rename them.') - if(maxRuns<11) + if (length(grep('^shadow', covariate.ids) > 0)) + stop( + 'Attributes with names starting from "shadow" are reserved for internal use. Please rename them.' + ) + if (maxRuns < 11) stop('maxRuns must be greater than 10.') - if(!pAdjMethod %in% stats::p.adjust.methods) - stop(c('P-value adjustment method not found. Must be one of:', - stats::p.adjust.methods)) + if (!pAdjMethod %in% stats::p.adjust.methods) + stop(c( + 'P-value adjustment method not found. Must be one of:', + stats::p.adjust.methods + )) # Might clash with some other semtrees stuff - if(is.null(predictors)) { + if (is.null(predictors)) { predictors <- names(data)[covariate.ids] } # Initialize and then loop over runs: - impHistory <- data.frame(matrix(NA, nrow=0, ncol=length(predictors)+3)) - names(impHistory) <- c(predictors, "shadowMin", "shadowMean", "shadowMax") - decisionList <- data.frame(predictor=predictors, decision = "Tentative", - hitCount = 0, raw.p=NA, adjusted.p=NA) + impHistory <- + data.frame(matrix(NA, nrow = 0, ncol = length(predictors) + 3)) + names(impHistory) <- + c(predictors, "shadowMin", "shadowMean", "shadowMax") + decisionList <- + data.frame( + predictor = predictors, + decision = "Tentative", + hitCount = 0, + raw.p = NA, + adjusted.p = NA + ) # TODO: Parallelize the first five runs. - for(runNo in 1:maxRuns) { - if(verbose) { + for (runNo in 1:maxRuns) { + if (verbose) { message(paste("Beginning Run", runNo)) } # stage 1 - create shadow features - rejected <- decisionList$predictor[decisionList$decision == "Rejected"] + rejected <- + decisionList$predictor[decisionList$decision == "Rejected"] current.predictors <- setdiff(predictors, rejected) - current.covariate.ids <- setdiff(covariate.ids, names(data) %in% rejected) + current.covariate.ids <- + setdiff(covariate.ids, names(data) %in% rejected) current.data <- data[, setdiff(names(data), rejected)] - shadow.ids <- (ncol(current.data)+1):(ncol(current.data)+length(current.covariate.ids)) + shadow.ids <- + (ncol(current.data) + 1):(ncol(current.data) + length(current.covariate.ids)) for (cur_cov_id in current.covariate.ids) { # pick column and shuffle temp_column <- current.data[, cur_cov_id] - temp_column <- sample(temp_column, length(temp_column), replace=FALSE) + temp_column <- + sample(temp_column, length(temp_column), replace = FALSE) # add to dataset as shadow feature - temp_colname <- paste0("shadow_", names(current.data)[cur_cov_id],collapse="") + temp_colname <- + paste0("shadow_", names(current.data)[cur_cov_id], collapse = "") current.data[temp_colname] <- temp_column - if (!is.null(current.predictors)) current.predictors <- c(current.predictors, temp_colname) + if (!is.null(current.predictors)) + current.predictors <- c(current.predictors, temp_colname) } # TODO: Pre-run model if needed. # run the forest - forest <- semforest(model, current.data, control, current.predictors, ...) + forest <- + semforest(model, current.data, control, current.predictors, ...) # run variable importance vim <- varimp(forest) # get variable importance from shadow features shadow_names <- names(current.data)[shadow.ids] - agvim <- aggregateVarimp(vim, aggregate="mean") + agvim <- aggregateVarimp(vim, aggregate = "mean") # Compute shadow stats - shadow_importances <- agvim[names(agvim)%in%shadow_names] - impHistory[runNo, "shadowMax"] <- max_shadow_importance <- max(shadow_importances) + shadow_importances <- agvim[names(agvim) %in% shadow_names] + impHistory[runNo, "shadowMax"] <- + max_shadow_importance <- max(shadow_importances) impHistory[runNo, "shadowMin"] <- min(shadow_importances) impHistory[runNo, "shadowMean"] <- mean(shadow_importances) - agvim_filtered <- agvim[!(names(agvim)%in%shadow_names)] + agvim_filtered <- agvim[!(names(agvim) %in% shadow_names)] impHistory[runNo, names(agvim_filtered)] <- agvim_filtered - # Compute "hits" - hits <- decisionList$predictor %in% names(agvim_filtered[agvim_filtered>max_shadow_importance]) + # Compute "hits" + hits <- + decisionList$predictor %in% names(agvim_filtered[agvim_filtered > max_shadow_importance]) decisionList$hitCount[hits] <- decisionList$hitCount[hits] + 1 - # Run tests. - # The biasing here means that there are no decisions without correction + # Run tests. + # The biasing here means that there are no decisions without correction # before 5 runs and no decisions with Bonferroni before 7 runs. # Run confirmation tests (pulled from Boruta package) - newPs <- stats::pbinom(decisionList$hitCount-1,runNo,0.5,lower.tail=FALSE) - adjPs <- stats::p.adjust(newPs,method=pAdjMethod) + newPs <- + stats::pbinom(decisionList$hitCount - 1, runNo, 0.5, lower.tail = FALSE) + adjPs <- stats::p.adjust(newPs, method = pAdjMethod) acceptable <- adjPs < alpha updateList <- acceptable & decisionList$decision == "Tentative" decisionList$raw.p[updateList] <- newPs[updateList] @@ -134,15 +159,16 @@ boruta <- function(model, decisionList$decision[updateList] <- "Confirmed" # Run rejection tests (pulled from Boruta package) - newPs <- stats::pbinom(decisionList$hitCount,runNo,0.5,lower.tail=TRUE) - adjPs <- stats::p.adjust(newPs,method=pAdjMethod) + newPs <- + stats::pbinom(decisionList$hitCount, runNo, 0.5, lower.tail = TRUE) + adjPs <- stats::p.adjust(newPs, method = pAdjMethod) acceptable <- adjPs < alpha updateList <- acceptable & decisionList$decision == "Tentative" decisionList$raw.p[updateList] <- newPs[updateList] decisionList$adjusted.p[updateList] <- adjPs[updateList] decisionList$decision[updateList] <- "Rejected" - if(!any(decisionList$decision == "Tentative")) { + if (!any(decisionList$decision == "Tentative")) { break } @@ -153,7 +179,8 @@ boruta <- function(model, vim$decisions <- decisionList$decision vim$details <- decisionList - vim$filter <- decisionList$decision == "Confirmed" # Turns into hitreg + vim$filter <- + decisionList$decision == "Confirmed" # Turns into hitreg vim$boruta <- TRUE class(vim) <- "boruta" @@ -163,33 +190,46 @@ boruta <- function(model, return(vim) } + #' @exportS3Method plot boruta -plot.boruta = function(vim, type=0, ...) { +plot.boruta = function(vim, type = 0, ...) { decisionList = vim$details impHistory = vim$impHistory impHistory <- impHistory |> - dplyr::mutate(rnd=1:nrow(impHistory)) |> - tidyr::pivot_longer(cols = -last_col()) |> #everything()) |> - dplyr::left_join(data.frame(decisionList), by=join_by("name"=="predictor")) |> - dplyr::mutate(decision = case_when(is.na(decision)~"Shadow", .default=decision)) |> - dplyr::group_by(name) |> dplyr::mutate(median_value = median(value,na.rm=TRUE)) - - if (type==0) { - ggplot2::ggplot(impHistory, - aes(x=stats::reorder(name, median_value), - y=value, color=decision)) + - ggplot2::geom_boxplot()+ - ggplot2::xlab("")+ - ggplot2::ylab("Importance")+ - ggplot2::scale_color_discrete(name = "Decision")+ - ggplot2::theme(axis.text.x = element_text(angle = 45, hjust = 1)) - } else if (type==1) { + dplyr::mutate(rnd = 1:nrow(impHistory)) |> + tidyr::pivot_longer(cols = -last_col()) |> #everything()) |> + dplyr::left_join(data.frame(decisionList), + by = dplyr::join_by("name" == "predictor")) |> + dplyr::mutate(decision = + case_when(is.na(decision) ~ "Shadow", .default = decision)) |> + dplyr::group_by(name) |> + dplyr::mutate(median_value = median(value, na.rm = TRUE)) + + if (type == 0) { + ggplot2::ggplot(impHistory, + ggplot2::aes( + x = stats::reorder(name, median_value), + y = value, + color = decision + )) + + ggplot2::geom_boxplot() + + ggplot2::xlab("") + + ggplot2::ylab("Importance") + + ggplot2::scale_color_discrete(name = "Decision") + + ggplot2::theme(axis.text.x = element_text(angle = 45, hjust = 1)) + } else if (type == 1) { ggplot2::ggplot(impHistory, - ggplot2::aes(x=rnd, y=value,group=name,col=name))+ - ggplot2::geom_line()+ - ggplot2::geom_hline(aes(yintercept=median_value,col=name),lwd=2)+ - ggplot2::xlab("Round")+ - ggplot2::ylab("Importance")+ + ggplot2::aes( + x = rnd, + y = value, + group = name, + col = name + )) + + ggplot2::geom_line() + + ggplot2::geom_hline(aes(yintercept = median_value, col = name), lwd = + 2) + + ggplot2::xlab("Round") + + ggplot2::ylab("Importance") + ggplot2::scale_color_discrete(name = "Predictor") } else { stop("Unknown graph type. Please choose 0 or 1.") From b9d6ba430e5913df81c93cf136c6a0c5e85cbf21 Mon Sep 17 00:00:00 2001 From: brandmaier Date: Wed, 18 Sep 2024 15:09:46 +0200 Subject: [PATCH 15/16] added quantiles --- R/boruta.R | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/R/boruta.R b/R/boruta.R index 066de45..a765f14 100644 --- a/R/boruta.R +++ b/R/boruta.R @@ -88,9 +88,13 @@ boruta <- function(model, ) # TODO: Parallelize the first five runs. + end_time <- NULL for (runNo in 1:maxRuns) { + start_time <- Sys.time() if (verbose) { - message(paste("Beginning Run", runNo)) + time_info <- "" + if (!is.null(end_time)) time_info <- paste0("Last run took", (end_time-start_time)) + message(paste("Beginning Run", runNo," ",time_info)) } # stage 1 - create shadow features @@ -133,9 +137,9 @@ boruta <- function(model, # Compute shadow stats shadow_importances <- agvim[names(agvim) %in% shadow_names] impHistory[runNo, "shadowMax"] <- - max_shadow_importance <- max(shadow_importances) - impHistory[runNo, "shadowMin"] <- min(shadow_importances) - impHistory[runNo, "shadowMean"] <- mean(shadow_importances) + max_shadow_importance <- max(shadow_importances, na.rm=TRUE) + impHistory[runNo, "shadowMin"] <- min(shadow_importances, na.rm=TRUE) + impHistory[runNo, "shadowMean"] <- mean(shadow_importances, na.rm=TRUE) agvim_filtered <- agvim[!(names(agvim) %in% shadow_names)] impHistory[runNo, names(agvim_filtered)] <- agvim_filtered @@ -172,6 +176,8 @@ boruta <- function(model, break } + end_time <- Sys.time() + } vim$importance <- colMeans(impHistory, na.rm = TRUE) @@ -201,14 +207,21 @@ plot.boruta = function(vim, type = 0, ...) { dplyr::left_join(data.frame(decisionList), by = dplyr::join_by("name" == "predictor")) |> dplyr::mutate(decision = - case_when(is.na(decision) ~ "Shadow", .default = decision)) |> + dplyr::case_when(is.na(decision) ~ "Shadow", .default = decision)) |> dplyr::group_by(name) |> dplyr::mutate(median_value = median(value, na.rm = TRUE)) if (type == 0) { + + # sort Inf values to the left + #impHistory[is.na(impHistory$median_value)] <- -Inf + impHistory$sort_value <- impHistory$median_value + # mv <- min(impHistory$sort_value,na.rm=TRUE) + # impHistory$sort_value[impHistory$decision=="Rejected"]<- (mv-1) + ggplot2::ggplot(impHistory, ggplot2::aes( - x = stats::reorder(name, median_value), + x = stats::reorder(name, sort_value), y = value, color = decision )) + @@ -216,7 +229,7 @@ plot.boruta = function(vim, type = 0, ...) { ggplot2::xlab("") + ggplot2::ylab("Importance") + ggplot2::scale_color_discrete(name = "Decision") + - ggplot2::theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) } else if (type == 1) { ggplot2::ggplot(impHistory, ggplot2::aes( @@ -226,7 +239,7 @@ plot.boruta = function(vim, type = 0, ...) { col = name )) + ggplot2::geom_line() + - ggplot2::geom_hline(aes(yintercept = median_value, col = name), lwd = + ggplot2::geom_hline(ggplot2::aes(yintercept = median_value, col = name), lwd = 2) + ggplot2::xlab("Round") + ggplot2::ylab("Importance") + From fd4116f2a22b7340e381211f00570fbcae5c7099 Mon Sep 17 00:00:00 2001 From: brandmaier Date: Mon, 30 Sep 2024 08:50:43 +0200 Subject: [PATCH 16/16] add quantile function --- R/boruta.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/R/boruta.R b/R/boruta.R index a765f14..5831899 100644 --- a/R/boruta.R +++ b/R/boruta.R @@ -35,6 +35,7 @@ boruta <- function(model, pAdjMethod = "none", alpha = .05, verbose = FALSE, + quant = 1, ...) { # detect model (warning: duplicated code) if (inherits(model, "MxModel") || inherits(model, "MxRAMModel")) { @@ -136,8 +137,11 @@ boruta <- function(model, # Compute shadow stats shadow_importances <- agvim[names(agvim) %in% shadow_names] - impHistory[runNo, "shadowMax"] <- - max_shadow_importance <- max(shadow_importances, na.rm=TRUE) + impHistory[runNo, "shadowMax"] <- max(shadow_importances, na.rm=TRUE) + + max_shadow_importance <- stats::quantile(shadow_importances, + probs=quant,na.rm=TRUE) + impHistory[runNo, "shadowMin"] <- min(shadow_importances, na.rm=TRUE) impHistory[runNo, "shadowMean"] <- mean(shadow_importances, na.rm=TRUE) agvim_filtered <- agvim[!(names(agvim) %in% shadow_names)]