From e6e5a63b29797b6212e709fd0dad18587e75e30a Mon Sep 17 00:00:00 2001 From: "Andreas M. Brandmaier" Date: Sun, 24 Mar 2024 21:40:28 +0100 Subject: [PATCH] added styling using styler package --- R/evaluateTree.R | 25 +- R/evaluateTreeFocus.R | 49 ++-- R/fairSplit.R | 536 ++++++++++++++++++++++-------------------- R/getLikelihood.R | 12 +- R/getNumNodes.R | 47 ++-- R/growTree.R | 530 +++++++++++++++++++++-------------------- R/naiveSplit.R | 205 ++++++++-------- R/print.semforest.R | 32 +-- R/print.semtree.R | 52 ++-- R/semtree.R | 379 +++++++++++++++-------------- 10 files changed, 950 insertions(+), 917 deletions(-) diff --git a/R/evaluateTree.R b/R/evaluateTree.R index 181489d..13e685c 100644 --- a/R/evaluateTree.R +++ b/R/evaluateTree.R @@ -1,11 +1,11 @@ #' Evaluate Tree -2LL -#' +#' #' A helper function to evaluate the negative two log-likelihood (-2LL) of leaf (terminal) nodes for a #' dataset. When given a \code{\link{semtree}} and a unique dataset, the model #' estimates -2LL for the tree parameters and data subsets that fit the tree #' branching criteria. -#' -#' +#' +#' #' @param tree A fitted \code{\link{semtree}} object #' @param test_set Dataset to fit to a fitted \code{\link{semtree}} object #' @param data_type type of data ("raw", "cov", "cor") @@ -26,32 +26,29 @@ evaluateTree <- function(tree, test_set, data_type = "raw", - leaf_ids = NULL) - { + leaf_ids = NULL) { # get a mapping of dataset rows to leaf ids if (is.null(leaf_ids)) { leaf_ids <- traverse(tree, test_set) } - + # for each leaf, calculate deviance of each data row dev <- 0 for (leaf_id in unique(leaf_ids)) { - temp_set <- test_set[leaf_ids == leaf_id,] - - + temp_set <- test_set[leaf_ids == leaf_id, ] + + leaf <- getNodeById(tree, leaf_id) - + # add up log-likelihoods dev <- dev + evaluateDataLikelihood(leaf$model, temp_set[, , drop = F], data_type) } - + result <- list() result$deviance <- dev result$num_models <- length(unique(leaf_ids)) - + return(result) - - } diff --git a/R/evaluateTreeFocus.R b/R/evaluateTreeFocus.R index 9160d87..23a15bd 100644 --- a/R/evaluateTreeFocus.R +++ b/R/evaluateTreeFocus.R @@ -4,60 +4,59 @@ # computes the difference in log-likelihoods considering focus parameters # evaluateTreeFocus <- - function(tree, test_set, data_type="raw", leaf_ids=NULL) - { - + function(tree, test_set, data_type = "raw", leaf_ids = NULL) { # get a mapping of dataset rows to leaf ids if (is.null(leaf_ids)) { leaf_ids <- traverse(tree, test_set) } - + # for each leaf, calculate deviance of each data row dev <- 0 for (leaf_id in unique(leaf_ids)) { # get all data rows from current leaf - temp_set <- test_set[leaf_ids==leaf_id, ]; - + temp_set <- test_set[leaf_ids == leaf_id, ] + # get the leaf object - leaf <- getNodeById( tree, leaf_id) - + leaf <- getNodeById(tree, leaf_id) + # test if node has a focus model if (is.null(leaf$focus.model)) { ui_warn("No focus model available!") return(NA) } - + # evaluate log-likelihood from baseline and focus model - #baseline = evaluateDataLikelihood(leaf$model, temp_set[,,drop=F], data_type ) - ll.focus = evaluateDataLikelihood(leaf$focus.model, - temp_set[,,drop=F], data_type ) - + # baseline = evaluateDataLikelihood(leaf$model, temp_set[,,drop=F], data_type ) + ll.focus <- evaluateDataLikelihood( + leaf$focus.model, + temp_set[, , drop = F], data_type + ) + # evaluate log-likelihood after permutation - - + + # add up log-likelihoods dev <- dev + ll.focus } - + result <- list() result$deviance <- dev result$num_models <- length(unique(leaf_ids)) - - return(result); - + + return(result) } # TODO: finish this block # TODO: remove earlier computation of baseline ll # compute influence of focus parameter before permutation -#ll.baseline <- eval.fun(tree, oob.data)$deviance -#ll.baseline <- fitSubmodels(tree$model, subset1, subset2, +# ll.baseline <- eval.fun(tree, oob.data)$deviance +# ll.baseline <- fitSubmodels(tree$model, subset1, subset2, # control, invariance=constraints$focus.parameters) # compute misfit of focus parameter after permutation -#ll.permuted <- eval.fun(tree, oob.data.permuted)$deviance -#ll.diff <- -ll.baseline + ll.permuted -#ui_warn("Unfinished implementation!") +# ll.permuted <- eval.fun(tree, oob.data.permuted)$deviance +# ll.diff <- -ll.baseline + ll.permuted +# ui_warn("Unfinished implementation!") -#} \ No newline at end of file +# } diff --git a/R/fairSplit.R b/R/fairSplit.R index 181445d..12333ca 100644 --- a/R/fairSplit.R +++ b/R/fairSplit.R @@ -6,43 +6,43 @@ fairSplit <- meta = NULL, constraints = NULL, ...) { - - # add a column of uniform random numbers to the data + # add a column of uniform random numbers to the data # to split into two halfes # TODO: replace with sample() of row ids n <- nrow(mydata) random <- runif(n, 0, 1) mydata <- cbind(mydata, random) - cross1 <- subset (mydata, as.numeric(mydata[, ncol(mydata)]) > 0.50) + cross1 <- subset(mydata, as.numeric(mydata[, ncol(mydata)]) > 0.50) cross2 <- - subset (mydata, as.numeric(mydata[, ncol(mydata)]) <= 0.50) + subset(mydata, as.numeric(mydata[, ncol(mydata)]) <= 0.50) cross1 <- cross1[-ncol(cross1)] cross2 <- cross2[-ncol(cross2)] mydata <- mydata[-ncol(mydata)] - + LL.btn <- c() split.btn <- c() cov.btn.names <- c() cov.btn.cols <- c() cov.type <- c() - - #Baseline model to compare subgroup fits to + + # Baseline model to compare subgroup fits to modelnew <- mxAddNewModelData(model, cross1, name = "BASE MODEL C1") LL.overall <- safeRunAndEvaluate(modelnew) - suppressWarnings(if (is.na(LL.overall)) - return(NULL)) + suppressWarnings(if (is.na(LL.overall)) { + return(NULL) + }) n.comp <- 0 - + if (control$report.level > 2) { report("Phase I - Select within variables", 1) } - + # - + # Step I - use cross validation fold 1 to evaluate all splits and # select best split # - + # iterate over all variables for (cur_col in meta$covariate.ids) { LL.baseline <- LL.overall @@ -53,51 +53,54 @@ fairSplit <- if (control$report.level > 10) { report(paste("Estimating baseline likelihood: ", LL.baseline), 2) } - - + + if (control$report.level >= 1 || control$verbose) { - ui_message(paste( - "Testing predictor", - colnames(cross1)[cur_col], - " (#", - cur_col, - "/", - ncol(cross1), - ")" - ), - 2) + ui_message( + paste( + "Testing predictor", + colnames(cross1)[cur_col], + " (#", + cur_col, + "/", + ncol(cross1), + ")" + ), + 2 + ) } - + LL.within <- base::c() within.split <- base::c() - - #case for factored covariates############################## + + # case for factored covariates############################## if (is.factor(cross1[, cur_col])) { - #unordered factors##################################### + # unordered factors##################################### if (!is.ordered(cross1[, cur_col])) { - var.type = .SCALE_CATEGORICAL - + var.type <- .SCALE_CATEGORICAL + v <- cross1[, cur_col] val.sets <- sort(unique(v)) - - + + if (length(val.sets) > 1) { - #create binaries for comparison of all combinations + # create binaries for comparison of all combinations result <- - recodeAllSubsets(cross1[, cur_col], colnames(cross1)[cur_col], use.levels = - T) - test1 <- rep(0, length(cross1[, cur_col]))#base::c() + recodeAllSubsets(cross1[, cur_col], colnames(cross1)[cur_col], + use.levels = + T + ) + test1 <- rep(0, length(cross1[, cur_col])) # base::c() test2 <- rep(NA, length(cross1[, cur_col])) - + for (j in 1:ncol(result$columns)) { - #cat("RUN",j,"\n") + # cat("RUN",j,"\n") test1 <- rep(0, length(cross1[, cur_col])) for (i in 1:length(cross1[, cur_col])) { if (isTRUE(result$columns[i, j])) { test1[i] <- 1 - } - else { + } else { test1[i] <- 0 } } @@ -107,25 +110,25 @@ fairSplit <- test2 <- test2[, -1] for (i in 1:(result$num_sets)) { LL.temp <- base::c() - #subset data for chosen value and store LL + # subset data for chosen value and store LL if (result$num_sets == 1) { - vec = test2 + vec <- test2 + } else { + vec <- test2[[i]] } - else { - vec = test2[[i]] - } - subset1 <- subset (cross1, as.numeric(vec) == 2) - subset2 <- subset (cross1, as.numeric(vec) == 1) - + subset1 <- subset(cross1, as.numeric(vec) == 2) + subset2 <- subset(cross1, as.numeric(vec) == 1) + # refit baseline model with focus parameters @TAGX # for each new potential split if (!is.null(constraints) & - (!is.null(constraints$focus.parameters))) { + (!is.null(constraints$focus.parameters))) { LL.baseline <- fitSubmodels(model, - subset1, - subset2, - control, - invariance = constraints$focus.parameters) + subset1, + subset2, + control, + invariance = constraints$focus.parameters + ) if (control$report.level > 10) { report( paste( @@ -136,17 +139,17 @@ fairSplit <- ) } } - - #catch LLR for each comparison, only if valid + + # catch LLR for each comparison, only if valid LL.return <- fitSubmodels(model, subset1, subset2, control, invariance = NULL) - + if (nrow(subset1) + nrow(subset2) != nrow(cross1)) { message("INCONSISTENCY ERROR DETECTED") - + LL.return <- NA } - + if (!is.na(LL.return)) { LL.within <- cbind(LL.within, (LL.baseline - LL.return)) within.split <- cbind(within.split, i) @@ -154,34 +157,35 @@ fairSplit <- } } } - - - #ordered factors######################################### + + + # ordered factors######################################### if (is.ordered(cross1[, cur_col])) { - var.type = .SCALE_ORDINAL - + var.type <- .SCALE_ORDINAL + v <- cross1[, cur_col] val.sets <- sort(unique(v)) if (length(val.sets) > 1) { for (i in 2:(length(val.sets))) { LL.temp <- base::c() - #subset data for chosen value and store LL + # subset data for chosen value and store LL cond1 <- - cross1[, cur_col] > val.sets[i-1] + cross1[, cur_col] > val.sets[i - 1] cond2 <- - cross1[, cur_col] <= val.sets[i-1] - - subset1 <- subset (cross1, cond1) - subset2 <- subset (cross1, cond2) - + cross1[, cur_col] <= val.sets[i - 1] + + subset1 <- subset(cross1, cond1) + subset2 <- subset(cross1, cond2) + # refit baseline model with focus parameters @TAGX if (!is.null(constraints) & - (!is.null(constraints$focus.parameters))) { + (!is.null(constraints$focus.parameters))) { LL.baseline <- fitSubmodels(model, - subset1, - subset2, - control, - invariance = constraints$focus.parameters) + subset1, + subset2, + control, + invariance = constraints$focus.parameters + ) if (control$report.level > 10) { report( paste( @@ -192,46 +196,47 @@ fairSplit <- ) } } - - #catch LLR for each comparison + + # catch LLR for each comparison LL.return <- fitSubmodels(model, subset1, subset2, control, invariance = NULL) if (!is.na(LL.return)) { LL.within <- cbind(LL.within, (LL.baseline - LL.return)) within.split <- - # cbind(within.split, (val.sets[i] + val.sets[(i - 1)]) / 2) - c(within.split, as.character(val.sets[i-1])) + # cbind(within.split, (val.sets[i] + val.sets[(i - 1)]) / 2) + c(within.split, as.character(val.sets[i - 1])) } } } } } - - #numeric (continuous) covariates################################ + + # numeric (continuous) covariates################################ if (is.numeric(cross1[, cur_col])) { - var.type = .SCALE_METRIC + var.type <- .SCALE_METRIC v <- as.numeric(cross1[, cur_col]) val.sets <- sort(unique(v)) - #if(length(val.sets) < 30|!isTRUE(control$shortcut)){ + # if(length(val.sets) < 30|!isTRUE(control$shortcut)){ if (length(val.sets) > 1) { for (i in 2:(length(val.sets))) { LL.temp <- base::c() - #subset data for chosen value and store LL + # subset data for chosen value and store LL cond1 <- as.numeric(cross1[, cur_col]) > (val.sets[i] + val.sets[(i - 1)]) / 2 cond2 <- as.numeric(cross1[, cur_col]) < (val.sets[i] + val.sets[(i - 1)]) / 2 - subset1 <- subset (cross1, cond1) - subset2 <- subset (cross1, cond2) - + subset1 <- subset(cross1, cond1) + subset2 <- subset(cross1, cond2) + # refit baseline model with focus parameters @TAGX if (!is.null(constraints) & - (!is.null(constraints$focus.parameters))) { + (!is.null(constraints$focus.parameters))) { LL.baseline <- fitSubmodels(model, - subset1, - subset2, - control, - invariance = constraints$focus.parameters) + subset1, + subset2, + control, + invariance = constraints$focus.parameters + ) if (control$report.level > 10) { report( paste( @@ -242,8 +247,8 @@ fairSplit <- ) } } - - #catch LLR for each comparison + + # catch LLR for each comparison LL.return <- fitSubmodels(model, subset1, subset2, control, invariance = NULL) if (!is.na(LL.return)) { @@ -251,45 +256,49 @@ fairSplit <- within.split <- cbind(within.split, (val.sets[i] + val.sets[(i - 1)]) / 2) } else { - if (control$verbose) + if (control$verbose) { ui_fail("LL was NA when fitting submodels!") + } if (control$report.level > 2) { - report(paste("Could not estimate split at value ", val.sets[i]), - 2) + report( + paste("Could not estimate split at value ", val.sets[i]), + 2 + ) } } } } - #} + # } } - + if (control$report.level > 10) { if (!is.null(LL.within)) { report(paste("Within Likelihoods ", paste(round( LL.within, 2 ), collapse = " ")), 2) - } - else{ + } else { message("Within LLs NULL") } } - + if (control$report.level > 0) { if (is.null(LL.within)) { - report("No valid estimate found for any split value during the first round.", - 2) + report( + "No valid estimate found for any split value during the first round.", + 2 + ) } } - + max.LL.within <- base::c() max.within.split <- base::c() - - #store the LL, split value and variable number for each cov that makes a possible split + + # store the LL, split value and variable number for each cov that makes a possible split if (!is.null(LL.within)) { max.LL.within <- LL.within[1] max.within.split <- within.split[1] max.within.cov <- cur_col - + if (length(LL.within) > 1) { for (i in 2:length(LL.within)) { if (!is.na(LL.within[i]) | !is.null(LL.within[i])) { @@ -299,52 +308,51 @@ fairSplit <- } } } - } - + LL.btn <- cbind(LL.btn, max.LL.within) split.btn <- cbind(split.btn, max.within.split) - #cov.btn.names <- cbind(cov.btn.names, colnames(mydata[cur_col])) + # cov.btn.names <- cbind(cov.btn.names, colnames(mydata[cur_col])) cov.btn.cols <- cbind(cov.btn.cols, max.within.cov) cov.type <- cbind(cov.type, var.type) - + if (control$report.level >= 3) { - report(paste( - "Best split at ", - max.within.split, - " with LL", - max.LL.within - ), - 2) + report( + paste( + "Best split at ", + max.within.split, + " with LL", + max.LL.within + ), + 2 + ) } - } - } - + # # Phase II - select between variables using their best split # use cross validation fold 2 for evaluation # - #cat("PHASE II") + # cat("PHASE II") if (control$report.level > 2) { report("Phase II - Select between variables", 1) } - - #Baseline model to compare subgroup fits to + + # Baseline model to compare subgroup fits to modelnew <- mxAddNewModelData(model, cross2, name = "BASE MODEL C2") LL.overall <- safeRunAndEvaluate(modelnew) suppressWarnings(if (is.na(LL.overall)) { warning("Baseline likelihood is N/A; Aborting!") - + return(NULL) }) n.comp <- ncol(LL.btn) - + LL.max <- NULL - + if (!is.null(LL.btn)) { for (cur_col in 1:length(LL.btn)) { LL.temp <- base::c() @@ -356,12 +364,14 @@ fairSplit <- LL.baseline <- safeRunAndEvaluate(missingModel) num.rows <- dim(missingModel@data@observed)[1] } - + if (cov.type[cur_col] == .SCALE_CATEGORICAL) { if (!is.ordered(cross2[, cov.btn.cols[cur_col]])) { result <- - recodeAllSubsets(cross2[, (cov.btn.cols[cur_col])], colnames(cross2)[(cov.btn.cols[cur_col])], use.levels = - T) + recodeAllSubsets(cross2[, (cov.btn.cols[cur_col])], colnames(cross2)[(cov.btn.cols[cur_col])], + use.levels = + T + ) test1 <- base::c() clen <- dim(cross2)[1] test2 <- rep(NA, clen) @@ -370,8 +380,7 @@ fairSplit <- for (i in 1:clen) { if (isTRUE(result$columns[i, j])) { test1[i] <- 1 - } - else { + } else { test1[i] <- 0 } } @@ -379,23 +388,21 @@ fairSplit <- test2 <- data.frame(test2, test1) } test2 <- test2[, -1] - - + + if (result$num_sets == 1) { vec <- test2 - } - else { + } else { vec <- test2[[split.btn[cur_col]]] } - - - subset1 <- subset (cross2, as.numeric(vec) == 2) - subset2 <- subset (cross2, as.numeric(vec) == 1) + + + subset1 <- subset(cross2, as.numeric(vec) == 2) + subset2 <- subset(cross2, as.numeric(vec) == 1) } - } - else if (cov.type[cur_col] == .SCALE_ORDINAL) { + } else if (cov.type[cur_col] == .SCALE_ORDINAL) { cond1 <- cross2[, cov.btn.cols[cur_col]] > split.btn[cur_col] cond2 <- @@ -408,12 +415,12 @@ fairSplit <- cond2 <- cross2[, cov.btn.cols[cur_col]] < split.btn[cur_col] subset1 <- subset(cross2, cond1) - subset2 <- subset(cross2, cond2) + subset2 <- subset(cross2, cond2) } - + # refit baseline model with focus parameters @TAGX if (!is.null(constraints) & - (!is.null(constraints$focus.parameters))) { + (!is.null(constraints$focus.parameters))) { result <- fitSubmodels( model, subset1, @@ -437,10 +444,10 @@ fairSplit <- nrow(subset1) + nrow(subset2) # THIS DEFINITELY IS A HACK. SHOULD BE # DEFINED BY THE RETURNED MODEL } else { - #warning("LL.sum is NA after reestimation with focus parameter") + # warning("LL.sum is NA after reestimation with focus parameter") ui_warn( "Could not estimate constrained/focus model for variable ", -# colnames(cross2)[cur_col], + # colnames(cross2)[cur_col], names(mydata[cov.btn.cols[cur_col]]), ". Possibly choosing a suboptimal split." ) @@ -448,16 +455,18 @@ fairSplit <- num.rows <- nrow(subset1) + nrow(subset2) } } - + # evaluate split LL.cur <- - LL.baseline - fitSubmodels(model, subset1, subset2, control, invariance = - NULL) - + LL.baseline - fitSubmodels(model, subset1, subset2, control, + invariance = + NULL + ) + # cross2 == subset of data for Phase II - + if (nrow(subset1) + nrow(subset2) != num.rows) { - #browser() + # browser() ui_fail( paste( "SERIOUS INCONSISTENCY ERROR. Numbers of rows do not match. Type=" @@ -472,11 +481,12 @@ fairSplit <- ) LL.cur <- NA } - - if (control$verbose) + + if (control$verbose) { ui_message(paste("Testing", cur_col, ":", names(mydata[cov.btn.cols[cur_col]]), " ", LL.cur, "\n")) - - + } + + if (cur_col == 1) { LL.max <- LL.cur LL.btn <- LL.cur @@ -484,8 +494,7 @@ fairSplit <- name.max <- names(mydata[cov.btn.cols[1]]) col.max <- cov.btn.cols[1] type.max <- cov.type[1] - } - else { + } else { LL.btn <- cbind(LL.btn, LL.cur) if (!is.na(LL.cur) & !is.null(LL.cur)) { if (is.na(LL.max) || is.na(LL.cur) || LL.max < LL.cur) { @@ -497,15 +506,12 @@ fairSplit <- } } } - + if (control$report.level > 2) { report(paste("Name", names(mydata[cov.btn.cols[cur_col]]), "LL:", LL.cur), 2) } - } - - } - else { + } else { return(NULL) } ## @@ -516,22 +522,22 @@ fairSplit <- if (control$report.level > 2) { report("Phase III - Select Split in Best Variable", 1) } - + if (!is.null(constraints$focus.parameters)) { stop("Method 'fair3' does not yet support focus parameters in tree.constraints.") } - + # run full data on model for overall model fit modelnew <- mxAddNewModelData(model, mydata, name = "BASE MODEL FULL") LL.overall <- safeRunAndEvaluate(modelnew) suppressWarnings(if (is.na(LL.overall)) { warning("Overall likelihood is N/A; Aborting!") - + return(NULL) }) n.comp <- 0 - + # iterate over all splits in col.max cur_col <- col.max LL.baseline <- LL.overall @@ -539,47 +545,51 @@ fairSplit <- if (!is.null(missingModel)) { LL.baseline <- safeRunAndEvaluate(missingModel) } - - if (control$verbose) - ui_message("Testing ", - cur_col, - "/", - ncol(mydata), - " (", - colnames(mydata)[cur_col], - ")") - + + if (control$verbose) { + ui_message( + "Testing ", + cur_col, + "/", + ncol(mydata), + " (", + colnames(mydata)[cur_col], + ")" + ) + } + if (control$report.level >= 1) { report(paste("Testing predictor", colnames(mydata)[cur_col]), 1) } - + LL.within <- base::c() within.split <- base::c() - - #case for factored covariates############################## + + # case for factored covariates############################## if (is.factor(mydata[, cur_col])) { - #unordered factors##################################### + # unordered factors##################################### if (!is.ordered(mydata[, cur_col])) { - var.type = 1 + var.type <- 1 v <- as.numeric(mydata[, cur_col]) val.sets <- sort(union(v, v)) - #cat("Length", length(val.sets),":",paste(v),"\n") + # cat("Length", length(val.sets),":",paste(v),"\n") if (length(val.sets) > 1) { - #create binaries for comparison of all combinations + # create binaries for comparison of all combinations result <- - recodeAllSubsets(mydata[, cur_col], colnames(mydata)[cur_col], use.levels = - T) - test1 <- rep(0, length(mydata[, cur_col]))#base::c() + recodeAllSubsets(mydata[, cur_col], colnames(mydata)[cur_col], + use.levels = + T + ) + test1 <- rep(0, length(mydata[, cur_col])) # base::c() test2 <- rep(NA, length(mydata[, cur_col])) - + for (j in 1:ncol(result$columns)) { - #cat("RUN",j,"\n") + # cat("RUN",j,"\n") test1 <- rep(0, length(mydata[, cur_col])) for (i in 1:length(mydata[, cur_col])) { if (isTRUE(result$columns[i, j])) { test1[i] <- 1 - } - else { + } else { test1[i] <- 0 } } @@ -589,26 +599,25 @@ fairSplit <- test2 <- test2[, -1] for (i in 1:(result$num_sets)) { LL.temp <- base::c() - #subset data for chosen value and store LL + # subset data for chosen value and store LL if (result$num_sets == 1) { - vec = test2 - } - else { - vec = test2[[i]] + vec <- test2 + } else { + vec <- test2[[i]] } - subset1 <- subset (mydata, as.numeric(vec) == 2) - subset2 <- subset (mydata, as.numeric(vec) == 1) - - #catch LLR for each comparison, only if valid + subset1 <- subset(mydata, as.numeric(vec) == 2) + subset2 <- subset(mydata, as.numeric(vec) == 1) + + # catch LLR for each comparison, only if valid LL.return <- fitSubmodels(model, subset1, subset2, control, invariance = NULL) - + if (nrow(subset1) + nrow(subset2) != nrow(mydata)) { message("INCONSISTENCY ERROR DETECTED") - + LL.return <- NA } - + if (!is.na(LL.return)) { LL.within <- cbind(LL.within, (LL.baseline - LL.return)) within.split <- cbind(within.split, i) @@ -616,26 +625,26 @@ fairSplit <- } } } - - - #ordered factors######################################### + + + # ordered factors######################################### if (is.ordered(mydata[, cur_col])) { - var.type = .SCALE_ORDINAL + var.type <- .SCALE_ORDINAL v <- as.numeric(as.character(mydata[, cur_col])) val.sets <- sort(union(v, v)) if (length(val.sets) > 1) { for (i in 2:(length(val.sets))) { LL.temp <- base::c() - #subset data for chosen value and store LL + # subset data for chosen value and store LL cond1 <- as.numeric(as.character(mydata[, cur_col])) > (val.sets[i] + val.sets[(i - - 1)]) / 2 + 1)]) / 2 cond2 <- as.numeric(as.character(mydata[, cur_col])) < (val.sets[i] + val.sets[(i - - 1)]) / 2 - subset1 <- subset (mydata, cond1) - subset2 <- subset (mydata, cond2) - #catch LLR for each comparison + 1)]) / 2 + subset1 <- subset(mydata, cond1) + subset2 <- subset(mydata, cond2) + # catch LLR for each comparison LL.return <- fitSubmodels(model, subset1, subset2, control, invariance = NULL) if (!is.na(LL.return)) { @@ -647,23 +656,23 @@ fairSplit <- } } } - - #numeric (continuous) covariates################################ + + # numeric (continuous) covariates################################ if (is.numeric(mydata[, cur_col])) { - var.type = 2 + var.type <- 2 v <- as.numeric(mydata[, cur_col]) val.sets <- sort(union(v, v)) if (length(val.sets) > 1) { for (i in 2:(length(val.sets))) { LL.temp <- base::c() - #subset data for chosen value and store LL + # subset data for chosen value and store LL cond1 <- as.numeric(mydata[, cur_col]) > (val.sets[i] + val.sets[(i - 1)]) / 2 cond2 <- as.numeric(mydata[, cur_col]) < (val.sets[i] + val.sets[(i - 1)]) / 2 - subset1 <- subset (mydata, cond1) - subset2 <- subset (mydata, cond2) - #catch LLR for each comparison + subset1 <- subset(mydata, cond1) + subset2 <- subset(mydata, cond2) + # catch LLR for each comparison LL.return <- fitSubmodels(model, subset1, subset2, control, invariance = NULL) if (!is.na(LL.return)) { @@ -671,42 +680,46 @@ fairSplit <- within.split <- cbind(within.split, (val.sets[i] + val.sets[(i - 1)]) / 2) } else { - if (control$verbose) + if (control$verbose) { ui_fail("LL was NA when fitting submodels!") + } if (control$report.level > 2) { - report(paste("Could not estimate split at value ", val.sets[i]), - 2) + report( + paste("Could not estimate split at value ", val.sets[i]), + 2 + ) } } } } } - + if (control$report > 10) { if (!is.null(LL.within)) { message("Within LLs ", paste(round(LL.within, 2), collapse = " ")) - } - else{ + } else { message("Within LLs NULL") } } - + if (control$report.level > 0) { if (is.null(LL.within)) { - report("No valid estimate found for any split value during the first round.", - 2) + report( + "No valid estimate found for any split value during the first round.", + 2 + ) } } - + max.LL.within <- base::c() max.within.split <- base::c() - - #store the LL, split value and variable number for each cov that makes a possible split + + # store the LL, split value and variable number for each cov that makes a possible split if (!is.null(LL.within)) { max.LL.within <- LL.within[1] max.within.split <- within.split[1] max.within.cov <- cur_col - + if (length(LL.within) > 1) { for (i in 2:length(LL.within)) { if (!is.na(LL.within[i]) | !is.null(LL.within[i])) { @@ -716,46 +729,49 @@ fairSplit <- } } } - } - #max.LL.within - #max.within.split + # max.LL.within + # max.within.split split.max <- max.within.split LL.max <- max.LL.within - + if (control$report.level >= 3) { - report(paste( - "Best split at ", - max.within.split, - " with LL", - max.LL.within - ), - 2) + report( + paste( + "Best split at ", + max.within.split, + " with LL", + max.LL.within + ), + 2 + ) } } } - + btn.matrix <- rbind(LL.btn, names(mydata[c(cov.btn.cols)]), cov.btn.cols, split.btn) colnames(btn.matrix) <- c(paste("var", seq(1, ncol(btn.matrix)), sep = "")) rownames(btn.matrix) <- c("LR", "variable", "column", "split val") - + if (control$report.level >= 2) { report("_____________________________", 1) - report(paste( - "Best predictor", - name.max, - "split at", - split.max, - "with LL", - LL.max - ), - 1) + report( + paste( + "Best predictor", + name.max, + "split at", + split.max, + "with LL", + LL.max + ), + 1 + ) report("", 1) } - - #cat("Best ",LL.max, " ",split.max," ",name.max," ",col.max,"\n") + + # cat("Best ",LL.max, " ",split.max," ",name.max," ",col.max,"\n") return( list( LL.max = LL.max, diff --git a/R/getLikelihood.R b/R/getLikelihood.R index ae17908..17afe9f 100644 --- a/R/getLikelihood.R +++ b/R/getLikelihood.R @@ -1,15 +1,13 @@ -getLikelihood<-function(model) -{ +getLikelihood <- function(model) { # this seems to be not generic enough # return(OpenMx::mxEval(objective, model)); - - msm <- getS3method("summary","MxModel") + + msm <- getS3method("summary", "MxModel") # alternative: if (is.null(model)) { warning("NULL Model in getLikelihood()-call") return(NULL) } - + return(msm(model)$Minus2LogLikelihood) - - } \ No newline at end of file +} diff --git a/R/getNumNodes.R b/R/getNumNodes.R index 212b4c0..06642cb 100644 --- a/R/getNumNodes.R +++ b/R/getNumNodes.R @@ -1,34 +1,31 @@ #' Tree Size -#' +#' #' Counts the number of nodes in a tree. -#' -#' +#' +#' #' @param tree A SEM tree object. #' @author Andreas M. Brandmaier, John J. Prindle #' @references Brandmaier, A.M., Oertzen, T. v., McArdle, J.J., & Lindenberger, #' U. (2013). Structural equation model trees. \emph{Psychological Methods}, #' 18(1), 71-86. getNumNodes <- -function(tree) -{ - if ((is.null(tree$left_child)) && (is.null(tree$right_child))) - { - return(1); - } + function(tree) { + if ((is.null(tree$left_child)) && (is.null(tree$right_child))) { + return(1) + } - count <- 1 - - if (tree$left_child$caption != "TERMINAL") { - count <- count + getNumNodes(tree$left_child) - } else { - count <- count + 1 - } - if (tree$right_child$caption != "TERMINAL") { - count <- count + getNumNodes(tree$right_child) - } else { - count <- count + 1 - } - - return(count) - -} + count <- 1 + + if (tree$left_child$caption != "TERMINAL") { + count <- count + getNumNodes(tree$left_child) + } else { + count <- count + 1 + } + if (tree$right_child$caption != "TERMINAL") { + count <- count + getNumNodes(tree$right_child) + } else { + count <- count + 1 + } + + return(count) + } diff --git a/R/growTree.R b/R/growTree.R index 50e2e78..fb26334 100644 --- a/R/growTree.R +++ b/R/growTree.R @@ -11,30 +11,28 @@ # # # -growTree <- function(model=NULL, mydata=NULL, - control=NULL, invariance=NULL, meta=NULL, - edgelabel=NULL, depth=0, constraints=NULL, ...) -{ - - if(is.null(mydata)) { +growTree <- function(model = NULL, mydata = NULL, + control = NULL, invariance = NULL, meta = NULL, + edgelabel = NULL, depth = 0, constraints = NULL, ...) { + if (is.null(mydata)) { stop("There was no data for growing the tree") } - + if (is.null(meta)) { - warning("SEM tree could not determine model variables and covariates in the dataset."); - return(NULL); + warning("SEM tree could not determine model variables and covariates in the dataset.") + return(NULL) } - + if (control$verbose) { - ui_message("Growing level ",depth," n = ",nrow(mydata)); + ui_message("Growing level ", depth, " n = ", nrow(mydata)) } - - if (control$report.level>0) { - report(paste("Growing tree level",depth), 0) + + if (control$report.level > 0) { + report(paste("Growing tree level", depth), 0) } - - - + + + # Node null settings in testing for significant splitting node <- list() node$left_child <- NULL @@ -42,147 +40,162 @@ growTree <- function(model=NULL, mydata=NULL, node$caption <- "TERMINAL" node$N <- dim(mydata)[1] class(node) <- "semtree" - + # -- sample columns in case of SEM forest usage -- fulldata <- mydata fullmeta <- meta if (control$mtry > 0) { - # get names of model variables before sampling model.names <- names(mydata)[meta$model.ids] covariate.names <- names(mydata)[meta$covariate.ids] - #perform sampling + # perform sampling mydata <- sampleColumns(mydata, names(mydata)[meta$covariate.ids], control$mtry) # get new model ids after sampling by name - meta$model.ids <- sapply(model.names, function(x) {which(x==names(mydata))}) + meta$model.ids <- sapply(model.names, function(x) { + which(x == names(mydata)) + }) names(meta$model.ids) <- NULL - meta$covariate.ids <- unlist(lapply(covariate.names, function(x) {which(x==names(mydata))})) - + meta$covariate.ids <- unlist(lapply(covariate.names, function(x) { + which(x == names(mydata)) + })) + node$colnames <- colnames(mydata) if (control$verbose) { - ui_message("Subsampled predictors: ",paste(node$colnames[meta$covariate.ids])) + ui_message("Subsampled predictors: ", paste(node$colnames[meta$covariate.ids])) } } - + # determine whether split evaluation can be done on p values - node$p.values.valid <- control$method != "cv" - + node$p.values.valid <- control$method != "cv" + # set some default values for the node object node$lr <- NA node$edge_label <- edgelabel - + # Estimate model on current data set ## 31.08.2022: Model in root is not refitted if refit is set to FALSE if (depth == 0 & !control$refit & length(constraints) == 0) { # do not fit model node$model <- model } else { # fit model # OpenMx - if(control$sem.prog == 'OpenMx'){ - full.model <- mxAddNewModelData(model = model, data = mydata, - name = "BASE MODEL") + if (control$sem.prog == "OpenMx") { + full.model <- mxAddNewModelData( + model = model, data = mydata, + name = "BASE MODEL" + ) node$model <- try( - suppressMessages(OpenMx::mxTryHard(model = full.model, paste = FALSE, - silent = TRUE,bestInitsOutput = FALSE)), - silent = TRUE) + suppressMessages(OpenMx::mxTryHard( + model = full.model, paste = FALSE, + silent = TRUE, bestInitsOutput = FALSE + )), + silent = TRUE + ) } # lavaan - if(control$sem.prog == 'lavaan'){ + if (control$sem.prog == "lavaan") { ## 11.08.2022 Note: fits lavaan model on mydata node$model <- try(suppressWarnings(eval( - parse(text=paste("lavaan::",model@Options$model.type, - '(lavaan::parTable(model),data=mydata,missing=\'', - model@Options$missing,'\')',sep="")))),silent=T) + parse(text = paste("lavaan::", model@Options$model.type, + "(lavaan::parTable(model),data=mydata,missing='", + model@Options$missing, "')", + sep = "" + )) + )), silent = T) } ## 26.06.2022: Added code for ctsem models - if(control$sem.prog == 'ctsem'){ + if (control$sem.prog == "ctsem") { full.model <- suppressMessages(try( - ctsemOMX::ctFit(dat = mydata[, -meta$covariate.ids], - ctmodelobj = model$ctmodelobj, - dataform = "wide", - objective = model$ctfitargs$objective, - stationary = model$ctfitargs$stationary, - optimizer = model$ctfitargs$optimizer, - retryattempts = ifelse(model$ctfitargs$retryattempts >= 20, - yes = model$ctfitargs$retryattempts, - no = 20), - carefulFit = model$ctfitargs$carefulFit, - showInits = model$ctfitargs$showInits, - asymptotes = model$ctfitargs$asymptotes, - meanIntervals = model$ctfitargs$meanIntervals, - discreteTime = model$ctfitargs$discreteTime, - verbose = model$ctfitargs$verbose, - transformedParams = model$ctfitargs$transformedParams) + ctsemOMX::ctFit( + dat = mydata[, -meta$covariate.ids], + ctmodelobj = model$ctmodelobj, + dataform = "wide", + objective = model$ctfitargs$objective, + stationary = model$ctfitargs$stationary, + optimizer = model$ctfitargs$optimizer, + retryattempts = ifelse(model$ctfitargs$retryattempts >= 20, + yes = model$ctfitargs$retryattempts, + no = 20 + ), + carefulFit = model$ctfitargs$carefulFit, + showInits = model$ctfitargs$showInits, + asymptotes = model$ctfitargs$asymptotes, + meanIntervals = model$ctfitargs$meanIntervals, + discreteTime = model$ctfitargs$discreteTime, + verbose = model$ctfitargs$verbose, + transformedParams = model$ctfitargs$transformedParams + ) )) full.model$mxobj@name <- "BASE MODEL" node$model <- full.model } } - - - if (is(node$model,"try-error")) - { + + + if (is(node$model, "try-error")) { ui_fail("Model had a run error.") - node$term.reason <- node$model[[1]] - node$model <- NULL; - return(node); + node$term.reason <- node$model[[1]] + node$model <- NULL + return(node) } - + if (is.null(node$model)) { - node$term.reason <- "Model was NULL! Model could not be estimated."; - return(node); + node$term.reason <- "Model was NULL! Model could not be estimated." + return(node) } - - + + ########################################################### ### OPENMX USED HERE ### ########################################################### - if(control$sem.prog == 'OpenMx'){ - + if (control$sem.prog == "OpenMx") { # some export/namespace problem here with the generic # getS3method("summary","MxModel") gets me the right fun - S3summary <- getS3method("summary","MxModel") - + S3summary <- getS3method("summary", "MxModel") + # list of point estimates, std.dev, and names of all freely estimated parameters - node$params <- S3summary(node$model)$parameters[,5]; - names(node$params) <- S3summary(node$model)$parameters[,1]; - node$params_sd <- S3summary(node$model)$parameters[,6]; - node$param_names <- S3summary(node$model)$parameters[,1]; + node$params <- S3summary(node$model)$parameters[, 5] + names(node$params) <- S3summary(node$model)$parameters[, 1] + node$params_sd <- S3summary(node$model)$parameters[, 6] + node$param_names <- S3summary(node$model)$parameters[, 1] } ########################################################### ### lavaan USED HERE ### ########################################################### - if(control$sem.prog == 'lavaan'){ - - node$params <- lavaan::coef(node$model) # put parameters into node + if (control$sem.prog == "lavaan") { + node$params <- lavaan::coef(node$model) # put parameters into node names(node$params) <- names(lavaan::coef(node$model)) # parameter names are stored as well - - #read in estimated parameters (take only those that have non-NA z values) - #parameters <- data.frame( + + # read in estimated parameters (take only those that have non-NA z values) + # parameters <- data.frame( # lavaan::parameterEstimates(node$model))[!is.na( # data.frame(lavaan::parameterEstimates(node$model))[,"z"]),] - + parameters <- lavaan::parameterEstimates(node$model) - + # if any labels are missing (some labels provided), then put default labels in the label col. - for(i in 1:nrow(parameters)){ - if(!is.null(parameters$label)){ - if(parameters$label[i]==""){parameters$label[i]<-paste(parameters$lhs[i],parameters$op[i],parameters$rhs[i],sep="")} + for (i in 1:nrow(parameters)) { + if (!is.null(parameters$label)) { + if (parameters$label[i] == "") { + parameters$label[i] <- paste(parameters$lhs[i], parameters$op[i], parameters$rhs[i], sep = "") + } } - } + } # if all labels are missing make a label column - if(is.null(parameters$label)){ - label <- paste(parameters$lhs,parameters$op,parameters$rhs,sep="") - parameters<- cbind(parameters,label) - } - + if (is.null(parameters$label)) { + label <- paste(parameters$lhs, parameters$op, parameters$rhs, sep = "") + parameters <- cbind(parameters, label) + } + # store the SE of the estimates - se <- rep(NA,length(unique(parameters$se))) - for(i in 1:length(unique(parameters$label))){ - for(j in 1:nrow(parameters)){ - if(unique(parameters$label)[i]==parameters$label[j]){se[i]<-parameters$se[j]} + se <- rep(NA, length(unique(parameters$se))) + for (i in 1:length(unique(parameters$label))) { + for (j in 1:nrow(parameters)) { + if (unique(parameters$label)[i] == parameters$label[j]) { + se[i] <- parameters$se[j] + } } } - + # list of point estimates, std.dev, and names of all freely estimated parameters node$params_sd <- se node$param_names <- names(lavaan::coef(node$model)) @@ -190,8 +203,7 @@ growTree <- function(model=NULL, mydata=NULL, ########################################################### ### ctsemOMX USED HERE ### ########################################################### - if(control$sem.prog == 'ctsem'){ - + if (control$sem.prog == "ctsem") { # list of point estimates, std.dev, and names of all freely estimated parameters if (control$ctsem_sd) { # slower ctsem_summary <- summary(node$model) # this is very slow @@ -206,9 +218,9 @@ growTree <- function(model=NULL, mydata=NULL, node$param_names <- names(ctsem_coef) } } - + # df - + if (!is.null(constraints$focus.parameters)) { # df's are equal to number of focus parameters node$df <- length(constraints$focus.parameters) @@ -217,82 +229,92 @@ growTree <- function(model=NULL, mydata=NULL, # df == num. parameters node$df <- length(node$param_names) } - - - + + + # add unique node id via global variable node$node_id <- getGlobal("global.node.id") - setGlobal("global.node.id", node$node_id+1) - + setGlobal("global.node.id", node$node_id + 1) + # determine whether we should skip splitting # 1. minimum cases in node if (!is.na(control$min.N)) { if (node$N <= control$min.N) { - if(control$verbose){ + if (control$verbose) { ui_message("Minimum user defined N for leaf node.") } - node$term.reason <- "Minimum number of cases in leaf node" - return(node); + node$term.reason <- "Minimum number of cases in leaf node" + return(node) } } # 2. maximum depth for tree reached - if (!is.na(control$max.depth)){ + if (!is.na(control$max.depth)) { if (depth >= control$max.depth) { - if(control$verbose){ + if (control$verbose) { ui_message("Max user defined tree depth reached.") } - node$term.reason <- "Maximum depth reached in leaf node" - return(node); + node$term.reason <- "Maximum depth reached in leaf node" + return(node) } } - + # determine best split based in chosen method (ml or score) and (naive, cv, fair) result <- NULL # 1. unbalanced selection method if (control$method == "naive") { - # if (control$test.type=="ml") { - + result <- tryCatch( ################################################ - naiveSplit(node$model, mydata, control, invariance, meta, constraints=constraints, ...) + naiveSplit(node$model, mydata, control, invariance, meta, constraints = constraints, ...) ################################################ , - error = function(e) { cat(paste("Error occured!",e,sep="\n")); traceback(); return(NULL); } - ); - - } else if (control$method=="score") { - + error = function(e) { + cat(paste("Error occured!", e, sep = "\n")) + traceback() + return(NULL) + } + ) + } else if (control$method == "score") { result <- tryCatch( ################################################ - result <- ScoreSplit(node$model, mydata, control, invariance, meta, constraints=constraints, ...) + result <- ScoreSplit(node$model, mydata, control, invariance, meta, constraints = constraints, ...) ################################################ , - error = function(e) { cat(paste("Error occured!",e,sep="\n")); traceback(); return(NULL); } - ); - - } + error = function(e) { + cat(paste("Error occured!", e, sep = "\n")) + traceback() + return(NULL) + } + ) + } # 2a. split half data to determine best split then use hold out set to compare one split per covariate else if (control$method == "fair") { control$fair3Step <- FALSE result <- tryCatch( ################################################ - fairSplit(node$model, mydata, control, invariance, meta, constraints=constraints, ...) + fairSplit(node$model, mydata, control, invariance, meta, constraints = constraints, ...) ################################################ , - error = function(e) { cat(paste("Error occured!",e,sep="\n")); return(NULL); } - ); + error = function(e) { + cat(paste("Error occured!", e, sep = "\n")) + return(NULL) + } + ) } # 2b. split half data to determine best split then use hold out set to compare one split per covariate, with step 3 all splits retest else if (control$method == "fair3") { control$fair3Step <- TRUE result <- tryCatch( - ################################################ - fairSplit(node$model, mydata, control, invariance, meta, constraints=constraints, ...) + ################################################ + fairSplit(node$model, mydata, control, invariance, meta, constraints = constraints, ...) ################################################ , - error = function(e) { cat(paste("Error occured!",e,sep="\n")); return(NULL); } - ); + error = function(e) { + cat(paste("Error occured!", e, sep = "\n")) + return(NULL) + } + ) } # 3. Traditional cross validation for covariate split selection else if (control$method == "cv") { @@ -301,10 +323,10 @@ growTree <- function(model=NULL, mydata=NULL, ui_fail("Error. Unknown split method selected") stop() } - + # return values in result are: # LL.max : numeric, log likelihood ratio of best split - # split.max : numeric, split point; value to split best column on + # split.max : numeric, split point; value to split best column on # (for metric variables) # col.max : index of best column # name.max : name of best candidate @@ -314,70 +336,74 @@ growTree <- function(model=NULL, mydata=NULL, # the various split points evaluated # n.comp : the (implicit) number of multiple tests evaluated for # determining the best split - + # store the value of the selected test statistic node$lr <- NA if (!is.null(result)) { node$lr <- result$LL.max node$result <- result } - + # if no split found, exit node without continuing growth if (is.null(result) || is.null(result$LL.max)) { if (control$verbose) { ui_message("Best Likelihood Ratio was NULL. Stop splitting") } - return(node); + return(node) } - + # provide verbose output to the user about best split if (control$verbose) { - ui_ok("Best split is ",result$name.max," with statistic = ",round(node$lr,2)); + ui_ok("Best split is ", result$name.max, " with statistic = ", round(node$lr, 2)) } - + # compute p value from chi2 if (!is.null(result$p.max)) { node$p <- result$p.max } else { - node$p <- pchisq(node$lr,df=node$df, lower.tail=F) - + node$p <- pchisq(node$lr, df = node$df, lower.tail = F) + if (control$use.maxlm) { - # Borders for continuous covariates if (!is.factor(mydata[, result$name.max])) { - props <- cumsum(table(mydata[, result$name.max])) / node$N split_val_lhs <- as.numeric(names(which(props >= control$strucchange.from)[1])) split_val_rhs <- as.numeric(names(which(props >= control$strucchange.to)[1])) - + btn_matrix_max <- result$btn.matrix[, result$btn.matrix["variable", ] == - result$name.max, drop = FALSE] - + result$name.max, drop = FALSE] + num_split_val <- as.numeric(btn_matrix_max["split val", ]) - + n1 <- which(num_split_val <= split_val_lhs) n1 <- n1[length(n1)] n2 <- which(num_split_val >= split_val_rhs)[1] - - if (length(n1) == 0) {n1 <- 1} - if (is.na(n2)) {n2 <- length(num_split_val)} - + + if (length(n1) == 0) { + n1 <- 1 + } + if (is.na(n2)) { + n2 <- length(num_split_val) + } + LR <- as.numeric(btn_matrix_max["LR", n1:n2]) - + max_pos <- which.max(LR) + n1 - 1 node$result$LL.max <- node$lr <- as.numeric(btn_matrix_max["LR", max_pos]) node$result$split.max <- as.numeric(btn_matrix_max["split val", max_pos]) } - - node$p <- computePval_maxLR(maxLR = node$lr, q = node$df, - covariate = mydata[,result$col.max], from = control$strucchange.from, - to = control$strucchange.to, nrep = control$strucchange.nrep) + + node$p <- computePval_maxLR( + maxLR = node$lr, q = node$df, + covariate = mydata[, result$col.max], from = control$strucchange.from, + to = control$strucchange.to, nrep = control$strucchange.nrep + ) } } - - + + # --------- determine whether to continue splitting -------------- - if (is(control$custom.stopping.rule,"function")) { + if (is(control$custom.stopping.rule, "function")) { stopping.rule <- control$custom.stopping.rule } else { stopping.rule <- stoppingRuleDefault @@ -385,172 +411,172 @@ growTree <- function(model=NULL, mydata=NULL, # stoppingRuleDefault() is a function that gets inputs node, result, control # this function can be replaced by a user-specified function srule <- stopping.rule(node, result, control) - + # determine whether splitting should be continued depending on return state # of the function - if (is(srule,"list")) { + if (is(srule, "list")) { node <- srule$node cont.split <- !(srule$stop.rule) } else { - cont.split <- !srule + cont.split <- !srule node$p.values.valid <- FALSE } - + # restore mydata here if (mtry was > 0) -- for semforests if (control$mtry > 0) { - # also need to remap col.max to original data! if (!is.null(result$col.max) && !is.na(result$col.max)) { col.max.name <- names(mydata)[result$col.max] - result$col.max <- which(names(fulldata)==col.max.name) + result$col.max <- which(names(fulldata) == col.max.name) } else { col.max.name <- NULL } - + # restore data mydata <- fulldata meta <- fullmeta } - - if ((!is.null(cont.split)) && (!is.na(cont.split)) && (cont.split)) { + + if ((!is.null(cont.split)) && (!is.na(cont.split)) && (cont.split)) { if (control$report.level > 10) { report("Stop splitting based on stopping rule.", 1) } - - - + + + # store the split name (covariate name and split value) RHS is yes branch - if(result$type.max==.SCALE_CATEGORICAL) { + if (result$type.max == .SCALE_CATEGORICAL) { # unordered factor collating and splitting lvl <- (control$method == "fair") - result1 <- recodeAllSubsets(mydata[,result$col.max],colnames(mydata)[result$col.max], - growbool=T, use.levels=lvl) - - + result1 <- recodeAllSubsets(mydata[, result$col.max], colnames(mydata)[result$col.max], + growbool = T, use.levels = lvl + ) + + test2 <- rep(NA, nrow(mydata)) - if(!is.na(result1$num_sets) & !is.null(result1$num_sets)){ - for(j in 1:result1$num_sets) { + if (!is.na(result1$num_sets) & !is.null(result1$num_sets)) { + for (j in 1:result1$num_sets) { test1 <- rep(NA, nrow(mydata)) - for(i in 1:nrow(mydata)) { - if(isTRUE(result1$columns[i,j])) {test1[i] <- 1} - else if(!is.na(result1$columns[i,j])){test1[i] <- 0} - else{test1[i]<-NA} + for (i in 1:nrow(mydata)) { + if (isTRUE(result1$columns[i, j])) { + test1[i] <- 1 + } else if (!is.na(result1$columns[i, j])) { + test1[i] <- 0 + } else { + test1[i] <- NA + } } test1 <- as.factor(test1) test2 <- data.frame(test2, test1) } } - test2 <- test2[,-1] - + test2 <- test2[, -1] + # if level is categorical, then split.max corresponds to the index of # the best column in the matrix that represents all subsets # make sure that this is not casted to a string if there # are predictors of other types (esp., factors) - #browser() + # browser() if (!all(is.na(result$btn.matrix))) { - result$split.max <- as.integer(result$split.max) - - #named <- colnames(result1$columns)[result$split.max] - # node$caption <- paste(colnames(result1$columns)[result$split.max]) - best_subset_col_id = result$split.max - best_values = result1$expressions[ (best_subset_col_id-1)*3 +1]$value + result$split.max <- as.integer(result$split.max) + + # named <- colnames(result1$columns)[result$split.max] + # node$caption <- paste(colnames(result1$columns)[result$split.max]) + best_subset_col_id <- result$split.max + best_values <- result1$expressions[(best_subset_col_id - 1) * 3 + 1]$value } else { best_values <- result$split.max } - node$rule = list(variable=result$col.max, relation="%in%", - value=best_values, - name = result$name.max) + node$rule <- list( + variable = result$col.max, relation = "%in%", + value = best_values, + name = result$name.max + ) node$caption <- paste(result$name.max, " in [", paste0(best_values, - collapse=" ")," ]") - - if(result1$num_sets==1) { - sub1 <- subset (mydata, as.numeric(test2) == 2) - sub2 <- subset (mydata, as.numeric(test2) == 1) - } - else { - sub1 <- subset (mydata, as.numeric(test2[[result$split.max]]) == 2) - sub2 <- subset (mydata, as.numeric(test2[[result$split.max]]) == 1) + collapse = " " + ), " ]") + + if (result1$num_sets == 1) { + sub1 <- subset(mydata, as.numeric(test2) == 2) + sub2 <- subset(mydata, as.numeric(test2) == 1) + } else { + sub1 <- subset(mydata, as.numeric(test2[[result$split.max]]) == 2) + sub2 <- subset(mydata, as.numeric(test2[[result$split.max]]) == 1) } - - } - else if (result$type.max==.SCALE_METRIC){ - + } else if (result$type.max == .SCALE_METRIC) { # if var.type==2, then split.max corresponds to the split point value # make sure that this is not casted to a string if there # are predictors of other types (esp., factors) result$split.max <- as.numeric(result$split.max) - + # ordered factor splitting of data - node$caption <- paste(result$name.max,">=", signif(result$split.max,6),sep=" ") - node$rule = list(variable=result$col.max, relation=">=", value=c(result$split.max), name = result$name.max) - sub1 <- subset( mydata, as.numeric(as.character(mydata[, (result$col.max)])) >result$split.max) - sub2 <- subset( mydata, as.numeric(as.character(mydata[, (result$col.max)]))<=result$split.max) - } - else if (result$type.max==.SCALE_ORDINAL) { - - node$caption <- paste(result$name.max,">", result$split.max,sep=" ") - node$rule = list(variable=result$col.max, relation=">", value=c(result$split.max), name = result$name.max) - sub1 <- subset( mydata, mydata[, (result$col.max)] >result$split.max) - sub2 <- subset( mydata, mydata[, (result$col.max)]<=result$split.max) - - } - else if (result$type.max == 99) { + node$caption <- paste(result$name.max, ">=", signif(result$split.max, 6), sep = " ") + node$rule <- list(variable = result$col.max, relation = ">=", value = c(result$split.max), name = result$name.max) + sub1 <- subset(mydata, as.numeric(as.character(mydata[, (result$col.max)])) > result$split.max) + sub2 <- subset(mydata, as.numeric(as.character(mydata[, (result$col.max)])) <= result$split.max) + } else if (result$type.max == .SCALE_ORDINAL) { + node$caption <- paste(result$name.max, ">", result$split.max, sep = " ") + node$rule <- list(variable = result$col.max, relation = ">", value = c(result$split.max), name = result$name.max) + sub1 <- subset(mydata, mydata[, (result$col.max)] > result$split.max) + sub2 <- subset(mydata, mydata[, (result$col.max)] <= result$split.max) + } else if (result$type.max == 99) { # this is an error code by score test implementation # return node and stop splitting # TODO (MA): Do we need to issue a warning? return(node) - } - else { + } else { # TODO: remove this bc this condition should be captured earlier in any case # continuous variables splitting - #node$caption <- paste(result$name.max,">=", signif(result$split.max,3),sep=" ") - #node$rule = list(variable=result$col.max, relation=">=", value=c(result$split.max), name = result$name.max) - #sub1 <- subset( mydata, as.numeric(mydata[, (result$col.max)]) >result$split.max) - #sub2 <- subset( mydata, as.numeric(mydata[, (result$col.max)])<=result$split.max) + # node$caption <- paste(result$name.max,">=", signif(result$split.max,3),sep=" ") + # node$rule = list(variable=result$col.max, relation=">=", value=c(result$split.max), name = result$name.max) + # sub1 <- subset( mydata, as.numeric(mydata[, (result$col.max)]) >result$split.max) + # sub2 <- subset( mydata, as.numeric(mydata[, (result$col.max)])<=result$split.max) stop("An error occured!") } - + flush.console() - + ########################################################## ## NEW CODE TO INCLUDE CASES MISSING ON SPLITTING VARIABLE class(node) <- "semtree" - if(control$use.all& (nrow(mydata)>(nrow(sub1)+nrow(sub2)))){ - if(control$verbose){message("Missing on splitting variable: ",result$name.max)} + if (control$use.all & (nrow(mydata) > (nrow(sub1) + nrow(sub2)))) { + if (control$verbose) { + message("Missing on splitting variable: ", result$name.max) + } completeSplits <- calculateBestSubsets(model, mydata, sub1, sub2, result) sub1 <- completeSplits$sub1 sub2 <- completeSplits$sub2 } ########################################################## - + # build a model for missing data if (control$missing == "ctree") { ui_warn("Missing data treatment with ctree is not yet implemented.") - #temp = mydata[!is.na(mydata[,result$name.max]),] - #node$missing.model = party::ctree( + # temp = mydata[!is.na(mydata[,result$name.max]),] + # node$missing.model = party::ctree( # data = temp, # formula = as.formula(paste0(result$name.max,"~."))) } else if (control$missing == "rpart") { - temp = mydata[!is.na(mydata[,result$name.max]),] - node$missing.model = rpart::rpart( + temp <- mydata[!is.na(mydata[, result$name.max]), ] + node$missing.model <- rpart::rpart( data = temp, - formula = as.formula(paste0(result$name.max,"~."))) + formula = as.formula(paste0(result$name.max, "~.")) + ) } - - + + # recursively continue splitting # result1 - RHS; result2 - LHS - result2 <- growTree( node$model, sub2, control, invariance, meta, edgelabel=0, depth=depth+1, constraints) - result1 <- growTree( node$model, sub1, control, invariance, meta, edgelabel=1, depth=depth+1, constraints) - + result2 <- growTree(node$model, sub2, control, invariance, meta, edgelabel = 0, depth = depth + 1, constraints) + result1 <- growTree(node$model, sub1, control, invariance, meta, edgelabel = 1, depth = depth + 1, constraints) + # store results in recursive list structure node$left_child <- result2 node$right_child <- result1 - - return(node); - + + return(node) } else { # if cont.split is F or NA or NULL then return node without further splitting - return(node); - } -} + return(node) + } +} diff --git a/R/naiveSplit.R b/R/naiveSplit.R index 3ae75f3..4046d76 100644 --- a/R/naiveSplit.R +++ b/R/naiveSplit.R @@ -15,44 +15,45 @@ naiveSplit <- cov.type <- c() cov.col <- c() cov.name <- c() - + LL.within <- c() within.split <- c() - #Baseline model to compare subgroup fits to + # Baseline model to compare subgroup fits to ########################################################### ### OPENMX USED HERE ### ########################################################### - if (control$sem.prog == 'OpenMx') { + if (control$sem.prog == "OpenMx") { modelnew <- mxAddNewModelData(model, mydata, name = "BASE MODEL") LL.overall <- safeRunAndEvaluate(modelnew) - suppressWarnings(if (is.na(LL.overall)) - return(NULL)) + suppressWarnings(if (is.na(LL.overall)) { + return(NULL) + }) } ########################################################### ### lavaan USED HERE ### ########################################################### - if (control$sem.prog == 'lavaan') { - #if (control$verbose) {message("Assessing overall model")} + if (control$sem.prog == "lavaan") { + # if (control$verbose) {message("Assessing overall model")} modelnew <- eval(parse( text = paste( - "lavaan::",model@Options$model.type, - '(lavaan::parTable(model),data=mydata,missing=\'', + "lavaan::", model@Options$model.type, + "(lavaan::parTable(model),data=mydata,missing='", model@Options$missing, - '\',do.fit=F)', + "',do.fit=F)", sep = "" ) )) - #modelnew <- lavaan(parTable(model),data=mydata,model.type=model@Options$model.type,do.fit=FALSE) + # modelnew <- lavaan(parTable(model),data=mydata,model.type=model@Options$model.type,do.fit=FALSE) LL.overall <- safeRunAndEvaluate(modelnew) - suppressWarnings(if (is.na(LL.overall)) - return(NULL)) + suppressWarnings(if (is.na(LL.overall)) { + return(NULL) + }) } if (pp) { comparedData <- max(meta$model.ids + 1) - } - else { + } else { comparedData <- meta$covariate.ids } for (cur_col in comparedData) { @@ -66,37 +67,37 @@ naiveSplit <- if (control$report.level > 10) { report(paste("Estimating baseline likelihood: ", LL.baseline), 1) } - - + + # tell the user a little bit about where we are right now if (control$verbose) { - ui_message("Testing Predictor: ", - colnames(mydata)[cur_col]) + ui_message( + "Testing Predictor: ", + colnames(mydata)[cur_col] + ) } ############################################################ - #case for factored covariates############################## + # case for factored covariates############################## if (is.factor(mydata[, cur_col])) { - #unordered factors##################################### + # unordered factors##################################### if (!is.ordered(mydata[, cur_col])) { - var.type = .SCALE_CATEGORICAL - + var.type <- .SCALE_CATEGORICAL + val.sets <- unique(mydata[, cur_col]) if (length(val.sets) > 1) { - #create binaries for comparison of all combinations + # create binaries for comparison of all combinations result <- recodeAllSubsets(mydata[, cur_col], colnames(mydata)[cur_col]) test1 <- c() test2 <- rep(NA, length(mydata[, cur_col])) - + for (j in 1:ncol(result$columns)) { for (i in 1:length(mydata[, cur_col])) { if (isTRUE(result$columns[i, j])) { test1[i] <- 1 - } - else if (!is.na(result$columns[i, j])) { + } else if (!is.na(result$columns[i, j])) { test1[i] <- 0 - } - else{ + } else { test1[i] <- NA } } @@ -104,27 +105,27 @@ naiveSplit <- test2 <- data.frame(test2, test1) } test2 <- test2[, -1] - + for (i in 1:(result$num_sets)) { LL.temp <- c() - #subset data for chosen value and store LL + # subset data for chosen value and store LL if (result$num_sets == 1) { - subset1 <- subset (mydata, as.numeric(test2) == 2) - subset2 <- subset (mydata, as.numeric(test2) == 1) - } - else { - subset1 <- subset (mydata, as.numeric(test2[[i]]) == 2) - subset2 <- subset (mydata, as.numeric(test2[[i]]) == 1) + subset1 <- subset(mydata, as.numeric(test2) == 2) + subset2 <- subset(mydata, as.numeric(test2) == 1) + } else { + subset1 <- subset(mydata, as.numeric(test2[[i]]) == 2) + subset2 <- subset(mydata, as.numeric(test2[[i]]) == 1) } - + # refit baseline model with focus parameters @TAGX if (!is.null(constraints) & - (!is.null(constraints$focus.parameters))) { + (!is.null(constraints$focus.parameters))) { LL.baseline <- fitSubmodels(model, - subset1, - subset2, - control, - invariance = constraints$focus.parameters) + subset1, + subset2, + control, + invariance = constraints$focus.parameters + ) if (control$report.level > 10) { report( paste( @@ -135,7 +136,7 @@ naiveSplit <- ) } } - #browser() + # browser() LL.return <- fitSubmodels(model, subset1, subset2, control, invariance = NULL) if (!is.na(LL.return)) { @@ -149,32 +150,33 @@ naiveSplit <- } } } - #ordered factors######################################### + # ordered factors######################################### if (is.ordered(mydata[, cur_col])) { - var.type = .SCALE_ORDINAL - + var.type <- .SCALE_ORDINAL + val.sets <- sort(unique(mydata[, cur_col])) - + if (length(val.sets) > 1) { for (i in 2:(length(val.sets))) { LL.temp <- c() - #subset data for chosen value and store LL - #cond1 <- as.numeric(as.character(mydata[,cur_col])) > (val.sets[i]+val.sets[(i-1)])/2 - #cond2 <- as.numeric(as.character(mydata[,cur_col])) < (val.sets[i]+val.sets[(i-1)])/2 - + # subset data for chosen value and store LL + # cond1 <- as.numeric(as.character(mydata[,cur_col])) > (val.sets[i]+val.sets[(i-1)])/2 + # cond2 <- as.numeric(as.character(mydata[,cur_col])) < (val.sets[i]+val.sets[(i-1)])/2 + cond1 <- mydata[, cur_col] > val.sets[i - 1] cond2 <- mydata[, cur_col] <= val.sets[i - 1] - subset1 <- subset (mydata, cond1) - subset2 <- subset (mydata, cond2) - + subset1 <- subset(mydata, cond1) + subset2 <- subset(mydata, cond2) + # refit baseline model with focus parameters @TAGX if (!is.null(constraints) & - (!is.null(constraints$focus.parameters))) { + (!is.null(constraints$focus.parameters))) { LL.baseline <- fitSubmodels(model, - subset1, - subset2, - control, - invariance = constraints$focus.parameters) + subset1, + subset2, + control, + invariance = constraints$focus.parameters + ) if (control$report.level > 10) { report( paste( @@ -185,12 +187,12 @@ naiveSplit <- ) } } - + LL.return <- fitSubmodels(model, subset1, subset2, control, invariance = NULL) if (!is.na(LL.return)) { LL.within <- cbind(LL.within, (LL.baseline - LL.return)) - #within.split <- cbind(within.split, (val.sets[i]+val.sets[(i-1)])/2) + # within.split <- cbind(within.split, (val.sets[i]+val.sets[(i-1)])/2) within.split <- cbind(within.split, as.character(val.sets[i - 1])) cov.col <- cbind(cov.col, cur_col) @@ -203,33 +205,34 @@ naiveSplit <- } } - #numeric (continuous) covariates################################ + # numeric (continuous) covariates################################ if (is.numeric(mydata[, cur_col])) { - var.type = .SCALE_METRIC + var.type <- .SCALE_METRIC v <- as.numeric(mydata[, cur_col]) val.sets <- sort(unique(v)) - + if (length(val.sets) > 1) { for (i in 2:(length(val.sets))) { LL.temp <- c() - #subset data for chosen value and store LL + # subset data for chosen value and store LL cond1 <- as.numeric(mydata[, cur_col]) > (val.sets[i] + val.sets[(i - 1)]) / 2 cond2 <- as.numeric(mydata[, cur_col]) < (val.sets[i] + val.sets[(i - 1)]) / 2 - subset1 <- subset (mydata, cond1) - subset2 <- subset (mydata, cond2) - - #catch LLR for each comparison - + subset1 <- subset(mydata, cond1) + subset2 <- subset(mydata, cond2) + + # catch LLR for each comparison + # refit baseline model with focus parameters @TAGX if (!is.null(constraints) & - (!is.null(constraints$focus.parameters))) { + (!is.null(constraints$focus.parameters))) { LL.baseline <- fitSubmodels(model, - subset1, - subset2, - control, - invariance = constraints$focus.parameters) + subset1, + subset2, + control, + invariance = constraints$focus.parameters + ) if (control$report.level > 10) { report( paste( @@ -240,7 +243,7 @@ naiveSplit <- ) } } - + LL.return <- fitSubmodels(model, subset1, subset2, control, invariance = NULL) # cat("LLreturn:",LL.return," and value split at:",(val.sets[i]+val.sets[(i-1)])/2,"\n") @@ -253,23 +256,21 @@ naiveSplit <- cov.type <- cbind(cov.type, var.type) n.comp <- n.comp + 1 } - } } - } } - - + + if (is.null(LL.within)) { return(NULL) } - + btn.matrix <- rbind(LL.within, cov.name, cov.col, within.split) colnames(btn.matrix) <- c(paste("var", seq(1, ncol(btn.matrix)), sep = "")) rownames(btn.matrix) <- c("LR", "variable", "column", "split val") - + filter <- c() if (!is.null(invariance)) { if (control$verbose) { @@ -278,9 +279,9 @@ naiveSplit <- filter <- invarianceFilter(model, mydata, btn.matrix, LL.baseline, invariance, control) } - + # find best - + LL.max <- NA split.max <- NA name.max <- NA @@ -295,8 +296,7 @@ naiveSplit <- name.max <- cov.name[cur_col] col.max <- cov.col[cur_col] type.max <- cov.type[cur_col] - } - else if (LL.within[cur_col] > LL.max) { + } else if (LL.within[cur_col] > LL.max) { LL.max <- LL.within[cur_col] split.max <- within.split[cur_col] name.max <- cov.name[cur_col] @@ -304,8 +304,7 @@ naiveSplit <- type.max <- cov.type[cur_col] } } - } - else { + } else { if (!is.na(LL.within[cur_col])) { if (is.na(LL.max)) { LL.max <- LL.within[cur_col] @@ -313,8 +312,7 @@ naiveSplit <- name.max <- cov.name[cur_col] col.max <- cov.col[cur_col] type.max <- cov.type[cur_col] - } - else if (LL.within[cur_col] > LL.max) { + } else if (LL.within[cur_col] > LL.max) { LL.max <- LL.within[cur_col] split.max <- within.split[cur_col] name.max <- cov.name[cur_col] @@ -324,26 +322,24 @@ naiveSplit <- } } } - - if (control$verbose & control$report.level==99) { - - cat("LL.within:",paste0(LL.within,collapse=","),"\n") - cat("LL.max: ", paste0(LL.max,collapse=","),"\n") - cat("within.split: ",paste0(within.split,collapse=","),"\n" ) - cat("split max",split.max,"\n") + + if (control$verbose & control$report.level == 99) { + cat("LL.within:", paste0(LL.within, collapse = ","), "\n") + cat("LL.max: ", paste0(LL.max, collapse = ","), "\n") + cat("within.split: ", paste0(within.split, collapse = ","), "\n") + cat("split max", split.max, "\n") } - + # alternative way of counting the number of comparisons # count the number of variables instead of tests if (control$naive.bonferroni.type == 1) { n.comp <- length(comparedData) } - - + + if (is.na(LL.max)) { return(NULL) - } - else + } else { ( return( list( @@ -358,4 +354,5 @@ naiveSplit <- ) ) ) - } \ No newline at end of file + } + } diff --git a/R/print.semforest.R b/R/print.semforest.R index 123ceb7..29395c3 100644 --- a/R/print.semforest.R +++ b/R/print.semforest.R @@ -1,27 +1,21 @@ #' @exportS3Method print semforest -print.semforest <- function(x, ...) -{ - invalid.trees <- sum(sapply(x$forest,FUN=is.null)) +print.semforest <- function(x, ...) { + invalid.trees <- sum(sapply(x$forest, FUN = is.null)) - cat(paste("SEM forest with ",length(x$forest)," trees.","\n")) - if (invalid.trees > 0) { - cat(paste("Of these trees, ",invalid.trees," trees are invalid due to errors.\n")) - } - - + cat(paste("SEM forest with ", length(x$forest), " trees.", "\n")) + if (invalid.trees > 0) { + cat(paste("Of these trees, ", invalid.trees, " trees are invalid due to errors.\n")) + } } #' @exportS3Method print semforest_stripped -print.semforest_stripped <- function(x, ...) -{ - invalid.trees <- sum(sapply(x$forest,FUN=is.null)) - - - cat(paste("SEM forest [stripped] with ",length(x$forest)," trees.","\n")) +print.semforest_stripped <- function(x, ...) { + invalid.trees <- sum(sapply(x$forest, FUN = is.null)) + + + cat(paste("SEM forest [stripped] with ", length(x$forest), " trees.", "\n")) if (invalid.trees > 0) { - cat(paste("Of these trees, ",invalid.trees," trees are invalid due to errors.\n")) + cat(paste("Of these trees, ", invalid.trees, " trees are invalid due to errors.\n")) } - - -} \ No newline at end of file +} diff --git a/R/print.semtree.R b/R/print.semtree.R index 89ba674..2a3791f 100644 --- a/R/print.semtree.R +++ b/R/print.semtree.R @@ -3,38 +3,33 @@ print.semtree <- function(x, level = 0, p.values.valid = NULL, - ...) - { + ...) { tree <- x - + indent <- paste(rep("| ", level), collapse = "", sep = "") - + if (level > 0) { edge_label <- tree$edge_label } else { edge_label <- "ROOT" - } - + caption <- tree$caption - - if (caption == "TERMINAL") - { - #caption <- paste(caption,"(ID=",tree$node_id,")",sep="") + + if (caption == "TERMINAL") { + # caption <- paste(caption,"(ID=",tree$node_id,")",sep="") output <- paste(indent, - "|-[", - tree$node_id, - "] ", - caption, - " [N=", - tree$N, - "]\n", - sep = "") - - } - - else { + "|-[", + tree$node_id, + "] ", + caption, + " [N=", + tree$N, + "]\n", + sep = "" + ) + } else { output <- paste( indent, @@ -52,28 +47,23 @@ print.semtree <- "]\n", sep = "" ) - } - - if (tree$caption != "TERMINAL") - { + + if (tree$caption != "TERMINAL") { output <- paste( output, print.semtree(tree$left_child, level + 1, p.values.valid), print.semtree(tree$right_child, level + 1, p.values.valid) ) - } - + if (level == 0) { output <- paste("SEMtree with numbered nodes\n", output) - + cat(output) - } else { return(output) - } } diff --git a/R/semtree.R b/R/semtree.R index 8ad329e..c44c18d 100644 --- a/R/semtree.R +++ b/R/semtree.R @@ -1,31 +1,31 @@ #' SEM Tree: Recursive Partitioning for Structural Equation Models -#' +#' #' Structural equation model (SEM) trees are a combination of SEM and decision #' trees (also known as classification and regression trees or recursive #' partitioning). SEM trees hierarchically split empirical data into #' homogeneous groups sharing similar data patterns with respect to a SEM by #' recursively selecting optimal predictors of these differences from a #' potentially large set of predictors. -#' +#' #' Calling \code{semtree} with an \code{\link{OpenMx}} or #' \code{\link[lavaan]{lavaan}} model creates a tree that recursively #' partitions a dataset such that the partitions maximally differ with respect #' to the model-predicted distributions. Each resulting subgroup (represented #' as a leaf in the tree) is represented by a SEM with a distinct set of #' parameter estimates. -#' +#' #' Predictors (yet unmodeled variables) can take on any form for the splitting #' algorithm to function (categorical, ordered categories, continuous). Care #' must be taken in choosing how many predictors to include in analyses because #' as the number of categories grows for unordered categorical variables, the #' number of multigroup comparisons increases exponentially for unordered #' categories. -#' +#' #' Currently available evaluation methods for assessing partitions: -#' +#' #' 1. "naive" selection method compares all possible split values to one #' another over all predictors included in the dataset. -#' +#' #' 2. "fair" selection uses a two step procedure for analyzing split values on #' predictors at each node of the tree. The first phase uses half of the sample #' to examine the model improvement for each split value on each predictor, and @@ -34,14 +34,14 @@ #' predictor on the second half of the sample. The best improvement for the c #' splits tested on c predictors is selected for the node and the dataset is #' split from this node for further testing. -#' +#' #' 3. "score" uses score-based test statistics. These statistics are much #' faster than the classic SEM tree approach while having favorable -#' statistical properties. -#' +#' statistical properties. +#' #' All other parameters controlling the tree growing process are available #' through a separate \code{\link{semtree.control}} object. -#' +#' #' @aliases semtree plot.semtree print.semtree summary.semtree toLatex.semtree #' nodeFunSemtree #' @param model A template model specification from \code{\link{OpenMx}} using @@ -75,20 +75,19 @@ #' \code{\link{parameters}}, \code{\link{se}}, \code{\link{prune.semtree}}, #' \code{\link{subtree}}, \code{\link[OpenMx]{OpenMx}}, #' \code{\link[lavaan]{lavaan}} -#' @references +#' @references #' Brandmaier, A.M., Oertzen, T. v., McArdle, J.J., & Lindenberger, U. (2013). Structural equation model trees. \emph{Psychological Methods}, 18(1), 71-86. -#' @references +#' @references #' Arnold, M., Voelkle, M. C., & Brandmaier, A. M. (2021). Score-guided structural equation model trees. \emph{Frontiers in Psychology}, 11, Article 564403. https://doi.org/10.3389/fpsyg.2020.564403 #' #' @keywords tree models multivariate -#' +#' #' @export -semtree <- function(model, data=NULL, control=NULL, constraints=NULL, - predictors = NULL, ...) { - +semtree <- function(model, data = NULL, control = NULL, constraints = NULL, + predictors = NULL, ...) { # TODO: change this throughout dataset <- data - + # obtain dots arguments and test for deprecated use of arguments arguments <- list(...) if ("global.constraints" %in% names(arguments)) { @@ -97,267 +96,282 @@ semtree <- function(model, data=NULL, control=NULL, constraints=NULL, if ("invariance" %in% names(arguments)) { stop("Deprecated use of argument 'invariance'. Please use constraints object with property 'local.invariance'") } - + if (is.null(constraints)) { constraints <- semtree.constraints() } - + covariates <- predictors - + # backwards-compatibility - if ( (!is.null(arguments)) & ("covariates" %in% names(arguments)) ) { + if ((!is.null(arguments)) & ("covariates" %in% names(arguments))) { if (is.null(predictors)) { - #report(paste("Setting arguments to ",paste(arguments$covariates)),1) + # report(paste("Setting arguments to ",paste(arguments$covariates)),1) covariates <- arguments$covariates } else { stop("Cannot have both arguments 'predictors' and 'covariates' in SEM Tree model.") } } - - - + + + invariance <- constraints$local.invariance global.constraints <- constraints$global.invariance - - - + + + # create default control object, if not specified if (is.null(control)) { control <- semtree.control() - if (control$verbose) + if (control$verbose) { ui_message("Default SEMtree settings established since no Controls provided.") + } } else { - if (checkControl(control)!=TRUE) {stop( "Unknown options in semtree.control object!");} + if (checkControl(control) != TRUE) { + stop("Unknown options in semtree.control object!") + } } - + # this is a really dumb heuristic # please can someone replace this with something more useful # this based on (Bentler & Chou, 1987; see also Bollen, 1989) if (is.null(control$min.N)) { control$min.N <- 5 * npar(model) } - + # set min.bucket and min.N heuristically if (is.null(control$min.bucket)) { - control$min.bucket = control$min.N / 2 + control$min.bucket <- control$min.N / 2 } - - if (control$method=="cv") { + + if (control$method == "cv") { ui_stop("This method ceased to exist. Please see modern score-based tests.") } - + # check whether data is complete for score-tests # this probably should be a more fine-grained check some day # that tests only model variables and selected predictors if (control$method == "score") { - check_complete = all(stats::complete.cases(data)) - if (!check_complete) + check_complete <- all(stats::complete.cases(data)) + if (!check_complete) { ui_stop("If score tests are used, data must not contain N/A in either the predictors or model variables.") + } } - + # check for correct model entry - if (inherits(model,"MxModel") || inherits(model,"MxRAMModel")) { - if (control$verbose) { message("Detected OpenMx model.") } - control$sem.prog = "OpenMx" - } else if (inherits(model,"lavaan")){ - #if (control$verbose) { ui_message("Detected lavaan model.") } - control$sem.prog = "lavaan" - } else if ((inherits(model,"ctsemFit")) || (inherits(model,"ctsemInit"))) { - #if (control$verbose) { ui_message("Detected ctsem model.") } - control$sem.prog = "ctsem" + if (inherits(model, "MxModel") || inherits(model, "MxRAMModel")) { + if (control$verbose) { + message("Detected OpenMx model.") + } + control$sem.prog <- "OpenMx" + } else if (inherits(model, "lavaan")) { + # if (control$verbose) { ui_message("Detected lavaan model.") } + control$sem.prog <- "lavaan" + } else if ((inherits(model, "ctsemFit")) || (inherits(model, "ctsemInit"))) { + # if (control$verbose) { ui_message("Detected ctsem model.") } + control$sem.prog <- "ctsem" } else { - ui_stop("Unknown model type selected. Use OpenMx or lavaanified lavaan models!"); + ui_stop("Unknown model type selected. Use OpenMx or lavaanified lavaan models!") } - + # set the mtry value to default=0 if not set if (is.na(control$mtry)) control$mtry <- 0 - - + + # some checks if (!is.null(constraints$focus.parameters)) { - if (control$sem.prog != "OpenMx") { ui_stop("Focus parameters are only supported with OpenMx!") } - - num.match <- length(constraints$focus.parameters %in% - OpenMx::omxGetParameters(model)) + + num.match <- length(constraints$focus.parameters %in% + OpenMx::omxGetParameters(model)) if (num.match != length(constraints$focus.parameters)) { ui_stop("Error! Not all focus parameters are free parameters in the model!") } } - + # add data to model if not already done and sort covariates from model variables ########################################################### ### OPENMX USED HERE ### ########################################################### - if((control$sem.prog=='OpenMx') || (control$sem.prog=='ctsem')){ - - if ((control$sem.prog=='ctsem')) { + if ((control$sem.prog == "OpenMx") || (control$sem.prog == "ctsem")) { + if ((control$sem.prog == "ctsem")) { ## 11.08.2022: check data format. Currently, only wide format is supported. - if (all(is.na(match(paste0(model$ctmodelobj$manifestNames, "_T0"), - colnames(dataset))))) { + if (all(is.na(match( + paste0(model$ctmodelobj$manifestNames, "_T0"), + colnames(dataset) + )))) { stop("Long format data detected. Data need to be in wide format.") # Check if the model unsupported components # to be done - } - - model$mxobj@manifestVars <- paste0(model$ctmodelobj$manifestNames, "_T", - rep(0:(model$ctmodelobj$Tpoints - 1), - each = model$ctmodelobj$n.manifest)) + } + + model$mxobj@manifestVars <- paste0( + model$ctmodelobj$manifestNames, "_T", + rep(0:(model$ctmodelobj$Tpoints - 1), + each = model$ctmodelobj$n.manifest + ) + ) mxmodel <- model$mxobj } else { mxmodel <- model } - - if(is.null(dataset)) { + + if (is.null(dataset)) { if (is.null(mxmodel@data)) { stop("MxModel has no data associated!") } dataset <- mxmodel@data@observed } - + # sanity check if (any(!(covariates %in% names(dataset)))) { stop( - paste("Some of the specified predictors are not in the dataset provided: ", - paste(covariates[ (!(covariates %in% names(dataset)))],sep="",collapse = ",") - )) + paste( + "Some of the specified predictors are not in the dataset provided: ", + paste(covariates[(!(covariates %in% names(dataset)))], sep = "", collapse = ",") + ) + ) } - + tmp <- getPredictorsOpenMx(mxmodel, dataset, covariates) model.ids <- tmp[[1]] covariate.ids <- tmp[[2]] - + # check whether character columns are given as predictors for (i in covariate.ids) { - if (!is.factor(dataset[,i]) && !is.numeric(dataset[,i])) { + if (!is.factor(dataset[, i]) && !is.numeric(dataset[, i])) { # this column is neither numeric or a factor, thus cannot be handled # probably a vector of strings - ui_stop("Predictor '", colnames(dataset)[i],"' is neither a factor nor numeric. This is likely causing trouble. Please remove or specify as factor or ordered.") + ui_stop("Predictor '", colnames(dataset)[i], "' is neither a factor nor numeric. This is likely causing trouble. Please remove or specify as factor or ordered.") } } - + # check whether numeric covariates have more than 9 observed values # if score-tests are used, otherwise score statistics can become # unstable - if (control$method=="score") { - for (i in covariate.ids) { - if (!is.factor(dataset[,i]) && is.numeric(dataset[,i])) { - # this column is numeric, should have more than 9 unique values! - check_9levels = length(unique(dataset[,i]))>9 - if (!check_9levels) - ui_warn("Predictor '", colnames(dataset)[i],"' has 9 or fewer unique values. Consider coding as ordinal to avoid instability with score-based tests.") + if (control$method == "score") { + for (i in covariate.ids) { + if (!is.factor(dataset[, i]) && is.numeric(dataset[, i])) { + # this column is numeric, should have more than 9 unique values! + check_9levels <- length(unique(dataset[, i])) > 9 + if (!check_9levels) { + ui_warn("Predictor '", colnames(dataset)[i], "' has 9 or fewer unique values. Consider coding as ordinal to avoid instability with score-based tests.") + } + } } } - } - + # 15.08.2022: all OpenMx models are estimated here if not already estimated ## ctsem are already estimated once - if (control$sem.prog == 'OpenMx' && !summary(model)$wasRun) { - ui_message("Model was not run. Estimating parameters now.") - suppressMessages(model <- OpenMx::mxTryHard(model = model, paste=FALSE, silent = TRUE)) + if (control$sem.prog == "OpenMx" && !summary(model)$wasRun) { + ui_message("Model was not run. Estimating parameters now.") + suppressMessages(model <- OpenMx::mxTryHard(model = model, paste = FALSE, silent = TRUE)) } - - + + # Prepare objects for fast score calculation - ## Only for linear models (semtree$linear == TRUE) or for models with definition variables + ## Only for linear models (semtree$linear == TRUE) or for models with definition variables # Note: model must be run - this is assured by previous code block that performs mxTryHard() - if (control$method == "score" & control$sem.prog == 'OpenMx') { - control <- c(control, - list(scores_info = OpenMx_scores_input(x = model, - control = control))) - } - - - + if (control$method == "score" & control$sem.prog == "OpenMx") { + control <- c( + control, + list(scores_info = OpenMx_scores_input( + x = model, + control = control + )) + ) + } } - + ########################################################### ### lavaan USED HERE ### ########################################################### - if(control$sem.prog=='lavaan'){ - if(is.null(dataset)) { + if (control$sem.prog == "lavaan") { + if (is.null(dataset)) { ui_stop("Must include data for analysis!") } - + tmp <- getPredictorsLavaan(model, dataset, covariates) model.ids <- tmp[[1]] covariate.ids <- tmp[[2]] - } - + meta <- list() meta$model.ids <- model.ids meta$covariate.ids <- covariate.ids - + # init unique node counter - # assign("global.node.id",1,envir = getSemtreeNamespace()) + # assign("global.node.id",1,envir = getSemtreeNamespace()) # TODO: is there a better way to assign ids? - setGlobal("global.node.id",1) - - #create default constraints if none specified for invariance testing of nested models + setGlobal("global.node.id", 1) + + # create default constraints if none specified for invariance testing of nested models if (is.null(invariance)) { - invariance <- NULL - } - else { + invariance <- NULL + } else { if (control$method != "naive") { ui_message("Invariance is only implemented for naive variable selection.") return(NULL) } - if(is.na(control$alpha.invariance)){ + if (is.na(control$alpha.invariance)) { ui_message("No Invariance alpha selected. alpha.invariance set to:", control$alpha) - control$alpha.invariance<-control$alpha} - - if(is(invariance, "character")) { - invariance <- list(invariance) - } else { - if (!is(invariance, "list")) { - ui_stop("Invariance must contain an array of parameter names or a list of such arrays.") - } - } + control$alpha.invariance <- control$alpha + } + if (is(invariance, "character")) { + invariance <- list(invariance) + } else { + if (!is(invariance, "list")) { + ui_stop("Invariance must contain an array of parameter names or a list of such arrays.") + } + } } - + # heuristic checks whether variables are correctly coded # to avoid problems in the computation of test statistics for (cid in covariate.ids) { - column <- dataset[, cid] + column <- dataset[, cid] if (is.numeric(column)) { - if (length(unique(column))<=10) { ui_warn("Variable ",names(dataset)[cid]," is numeric but has only few unique values. Consider recoding as ordered factor." )} + if (length(unique(column)) <= 10) { + ui_warn("Variable ", names(dataset)[cid], " is numeric but has only few unique values. Consider recoding as ordered factor.") + } } } - + # check for no missing data in covariates if score statistics are used if (control$method == "score") { for (cid in covariate.ids) { - column <- dataset[, cid] - if (sum(is.na(column))>0) { ui_stop("Variable ",names(dataset)[cid]," has missing values. Computation of score statistic not possible."); return(NULL); } - } + column <- dataset[, cid] + if (sum(is.na(column)) > 0) { + ui_stop("Variable ", names(dataset)[cid], " has missing values. Computation of score statistic not possible.") + return(NULL) + } + } } - - + + # correct method selection check - method.int <- pmatch(control$method, c("cv","naive","fair","fair3","score")) + method.int <- pmatch(control$method, c("cv", "naive", "fair", "fair3", "score")) if (is.na(method.int)) { ui_stop("Unknown method in control object! Try either 'naive', 'fair', 'fair3', 'score', or 'cv'.") - } - + } + # if this is still null, no data was given if (is.null(dataset)) { ui_stop("No data were provided!") } - + # sanity checks, duplicated col names? - if (any(duplicated(names(dataset)))) - { + if (any(duplicated(names(dataset)))) { ui_stop("Dataset contains duplicated columns names!") } - + # set a seed for user to repeat results if no seed provided - if (!is.null(control$seed)&!is.na(control$seed)){ + if (!is.null(control$seed) & !is.na(control$seed)) { set.seed(control$seed) } ########################################################### @@ -365,57 +379,61 @@ semtree <- function(model, data=NULL, control=NULL, constraints=NULL, ########################################################### # global constraints - estimate once and then regarded fixed in the tree if (!is.null(global.constraints)) { - if (control$sem.prog != "OpenMx") { ui_stop("Global constraints are not yet supported!") } - - run.global <- OpenMx::mxRun(model, silent=TRUE, useOptimizer=TRUE, suppressWarnings=TRUE); + + run.global <- OpenMx::mxRun(model, silent = TRUE, useOptimizer = TRUE, suppressWarnings = TRUE) labels <- names(OpenMx::omxGetParameters(model)) eqids <- which(labels %in% global.constraints) neqids <- which(!labels %in% global.constraints) values <- OpenMx::omxGetParameters(run.global)[eqids] - model <- OpenMx::omxSetParameters(model, - labels=global.constraints,free=FALSE, values=values) + model <- OpenMx::omxSetParameters(model, + labels = global.constraints, free = FALSE, values = values + ) # FIX THIS LINE HERE - + # Read Global Constraints and New model Parameters Here. - ui_message("Global Constraints:\n",paste(global.constraints,collapse=" ")) - ui_message("Freely Estimated Parameters:\n",paste(names(OpenMx::omxGetParameters(model)),collapse=" ")) + ui_message("Global Constraints:\n", paste(global.constraints, collapse = " ")) + ui_message("Freely Estimated Parameters:\n", paste(names(OpenMx::omxGetParameters(model)), collapse = " ")) } - - + + # grow tree - if(control$sem.prog == 'OpenMx'){ - if (control$verbose){message('OpenMx model estimation selected!')} - } - else if(control$sem.prog == 'lavaan'){ - if (control$verbose){message('lavaan model estimation selected!')} - - } - else if(control$sem.prog == 'ctsem'){ - if (control$verbose){message('ctsem model estimation selected!')} - } - else { + if (control$sem.prog == "OpenMx") { + if (control$verbose) { + message("OpenMx model estimation selected!") + } + } else if (control$sem.prog == "lavaan") { + if (control$verbose) { + message("lavaan model estimation selected!") + } + } else if (control$sem.prog == "ctsem") { + if (control$verbose) { + message("ctsem model estimation selected!") + } + } else { stop("Unknown model type. Use OpenMx or lavaans models only!") } - - + + # save time before starting the actual tree growing start.time <- proc.time() - - # start the recursive growTree() function to do the + + # start the recursive growTree() function to do the # actual heavy lifting - tree <- growTree(model=model, mydata=dataset, control=control, - invariance=invariance, meta=meta, - constraints=constraints, ...) - - + tree <- growTree( + model = model, mydata = dataset, control = control, + invariance = invariance, meta = meta, + constraints = constraints, ... + ) + + # determine time elapsed - elapsed <- proc.time()-start.time - - + elapsed <- proc.time() - start.time + + # save various information in the result object and # assign it class 'semtree' tree$elapsed <- elapsed @@ -423,11 +441,12 @@ semtree <- function(model, data=NULL, control=NULL, constraints=NULL, tree$constraints <- constraints tree$version <- tryCatch(sessionInfo()$otherPkgs$semtree$Version) class(tree) <- "semtree" - + # tell the user that everything is OK - ui_ok("Tree construction finished [took ", - human_readable_time(elapsed[3]),"].") - + ui_ok( + "Tree construction finished [took ", + human_readable_time(elapsed[3]), "]." + ) + return(tree) - }