diff --git a/.Rbuildignore b/.Rbuildignore
index 3202962..d93aac8 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -13,3 +13,7 @@ Makefile
^pkgdown$
^LICENSE\.md$
^\.travis\.yml$
+^misc$
+^cran-comments\.md$
+^CRAN-SUBMISSION$
+^\.github$
diff --git a/.github/.gitignore b/.github/.gitignore
new file mode 100644
index 0000000..2d19fc7
--- /dev/null
+++ b/.github/.gitignore
@@ -0,0 +1 @@
+*.html
diff --git a/.github/workflows/R-CMD-check-windows.yaml b/.github/workflows/R-CMD-check-windows.yaml
new file mode 100644
index 0000000..1d68a31
--- /dev/null
+++ b/.github/workflows/R-CMD-check-windows.yaml
@@ -0,0 +1,29 @@
+# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
+# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
+on:
+ push:
+ branches: [main, master]
+ pull_request:
+ branches: [main, master]
+
+name: R-CMD-check-Win
+
+jobs:
+ R-CMD-check:
+ runs-on: windows-latest
+ env:
+ GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+ R_KEEP_PKG_SOURCE: yes
+ steps:
+ - uses: actions/checkout@v3
+
+ - uses: r-lib/actions/setup-r@v2
+ with:
+ use-public-rspm: true
+
+ - uses: r-lib/actions/setup-r-dependencies@v2
+ with:
+ extra-packages: any::rcmdcheck
+ needs: check
+
+ - uses: r-lib/actions/check-r-package@v2
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
new file mode 100644
index 0000000..f4b17a4
--- /dev/null
+++ b/.github/workflows/R-CMD-check.yaml
@@ -0,0 +1,29 @@
+# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
+# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
+on:
+ push:
+ branches: [main, master]
+ pull_request:
+ branches: [main, master]
+
+name: R-CMD-check
+
+jobs:
+ R-CMD-check:
+ runs-on: ubuntu-latest
+ env:
+ GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+ R_KEEP_PKG_SOURCE: yes
+ steps:
+ - uses: actions/checkout@v3
+
+ - uses: r-lib/actions/setup-r@v2
+ with:
+ use-public-rspm: true
+
+ - uses: r-lib/actions/setup-r-dependencies@v2
+ with:
+ extra-packages: any::rcmdcheck
+ needs: check
+
+ - uses: r-lib/actions/check-r-package@v2
diff --git a/.github/workflows/R-devel-check.yml b/.github/workflows/R-devel-check.yml
new file mode 100644
index 0000000..ca17a96
--- /dev/null
+++ b/.github/workflows/R-devel-check.yml
@@ -0,0 +1,17 @@
+# check package on latest R devel version
+#
+on: workflow_dispatch
+
+name: R-devel-check
+
+jobs:
+ devel-check:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ - uses: r-lib/actions/setup-r@v2
+ with:
+ r-version: 'devel'
+      # install package dependencies before building and checking
+      - uses: r-lib/actions/setup-r-dependencies@v2
+ - run: 'R CMD build ./'
+ - run: 'R CMD check semtree*.tar.gz'
diff --git a/.github/workflows/R-tests.yml b/.github/workflows/R-tests.yml
new file mode 100644
index 0000000..e4b95ea
--- /dev/null
+++ b/.github/workflows/R-tests.yml
@@ -0,0 +1,31 @@
+on:
+ workflow_dispatch
+
+name: Tests
+
+jobs:
+ document:
+ name: run tests
+ runs-on: ubuntu-latest
+ env:
+ GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+ steps:
+ - uses: actions/checkout@v3
+ - uses: r-lib/actions/setup-r@v2
+ with:
+ use-public-rspm: true
+
+ - uses: r-lib/actions/setup-r-dependencies@v2
+ with:
+ cache-version: 2
+ extra-packages: |
+ any::testthat
+ any::devtools
+ - name: install package
+ run: devtools::install_github("brandmaier/semtree")
+ shell: Rscript {0}
+
+ - name: run test
+ run: testthat::test_dir("tests/testthat/")
+ shell: Rscript {0}
diff --git a/.github/workflows/test-coverage.yml b/.github/workflows/test-coverage.yml
new file mode 100644
index 0000000..2c5bb50
--- /dev/null
+++ b/.github/workflows/test-coverage.yml
@@ -0,0 +1,50 @@
+# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
+# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
+on:
+ push:
+ branches: [main, master]
+ pull_request:
+ branches: [main, master]
+
+name: test-coverage
+
+jobs:
+ test-coverage:
+ runs-on: ubuntu-latest
+ env:
+ GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+
+ steps:
+ - uses: actions/checkout@v3
+
+ - uses: r-lib/actions/setup-r@v2
+ with:
+ use-public-rspm: true
+
+ - uses: r-lib/actions/setup-r-dependencies@v2
+ with:
+ extra-packages: any::covr
+ needs: coverage
+
+ - name: Test coverage
+ run: |
+ covr::codecov(
+ quiet = FALSE,
+ clean = FALSE,
+ install_path = file.path(Sys.getenv("RUNNER_TEMP"), "package")
+ )
+ shell: Rscript {0}
+
+ - name: Show testthat output
+ if: always()
+ run: |
+ ## --------------------------------------------------------------------
+ find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true
+ shell: bash
+
+ - name: Upload test results
+ if: failure()
+ uses: actions/upload-artifact@v3
+ with:
+ name: coverage-test-failures
+ path: ${{ runner.temp }}/package
diff --git a/.gitignore b/.gitignore
index 7836fb7..4c4c5fb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,4 +5,9 @@
inst/doc
doc
Meta
+/doc/
+/Meta/
+cran-comments.md
+CRAN*
.DS_Store
+
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index d71d611..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,46 +0,0 @@
-# R for travis: see documentation at https://docs.travis-ci.com/user/languages/r
-
-language: R
-cache: packages
-dist: focal
-r:
- - release
- - devel
-r_binary_packages:
- - bitops
- - sets
- - digest
- - rpart
- - rpart.plot
- - plotrix
- - cluster
- - stringr
- - lavaan
- - expm
- - lavaan
- - tidyr
- - viridis
- - ggplot2
- - strucchange
- - sandwich
- - zoo
- - crayon
- - testthat
- - Rcpp
- - RcppEigen
- - MASS
- - Matrix
- - StanHeaders
- - mvtnorm
- - xfun
- - mime
- - RcppParallel
- - rmarkdown
- - htmltools
- - knitr
- - rpf
- - BH
- - psych
- - openmx
-
-
\ No newline at end of file
diff --git a/DESCRIPTION b/DESCRIPTION
index 3486b46..2c97d4f 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -11,15 +11,10 @@ Depends:
R (>= 2.10),
OpenMx (>= 2.6.9),
Imports:
- bitops,
- sets,
- digest,
rpart,
rpart.plot (>= 3.0.6),
- plotrix,
- cluster,
- stringr,
lavaan,
+ cluster,
ggplot2,
tidyr,
methods,
@@ -30,7 +25,6 @@ Imports:
clisymbols,
future.apply,
data.table,
- ctsemOMX,
expm,
gridBase
Suggests:
@@ -39,7 +33,9 @@ Suggests:
viridis,
MASS,
psychTools,
- testthat
+ testthat,
+ future,
+ ctsemOMX
Description: SEM Trees and SEM Forests -- an extension of model-based decision
trees and forests to Structural Equation Models (SEM). SEM trees hierarchically
split empirical data into homogeneous groups each sharing similar data patterns
@@ -53,9 +49,10 @@ Description: SEM Trees and SEM Forests -- an extension of model-based decision
License: GPL-3
Encoding: UTF-8
LazyLoad: yes
-Version: 0.9.19
-Date: 2023-03-03
-RoxygenNote: 7.3.1
+Version: 0.9.20
+Date: 2024-03-25
+RoxygenNote: 7.2.3
VignetteBuilder: knitr
BugReports: https://github.com/brandmaier/semtree/issues
URL: https://github.com/brandmaier/semtree
+Language: en-US
diff --git a/NAMESPACE b/NAMESPACE
index db0d57e..0158412 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -26,11 +26,13 @@ S3method(print,summary.semtree)
S3method(proximity,semforest)
S3method(proximity,semforest_node_id)
S3method(proximity,semforest_stripped)
+S3method(prune,semforest)
S3method(prune,semtree)
S3method(strip,semforest)
S3method(strip,semtree)
S3method(summary,semforest)
S3method(summary,semtree)
+S3method(toLatex,semtree)
export(biodiversity)
export(boruta)
export(diversityMatrix)
@@ -59,10 +61,12 @@ export(prune)
export(se)
export(semforest)
export(semforest.control)
+export(semforest_control)
export(semforest_score_control)
export(semtree)
export(semtree.constraints)
export(semtree.control)
+export(semtree_control)
export(strip)
export(subforest)
export(subtree)
@@ -72,7 +76,6 @@ export(varimpConvergencePlot)
import(OpenMx)
import(data.table)
import(rpart)
-importFrom(bitops,bitAnd)
importFrom(data.table,data.table)
importFrom(grDevices,heat.colors)
importFrom(graphics,barplot)
@@ -92,7 +95,6 @@ importFrom(methods,is)
importFrom(parallel,clusterMap)
importFrom(parallel,parLapply)
importFrom(sandwich,bread)
-importFrom(sets,as.set)
importFrom(stats,as.dist)
importFrom(stats,as.formula)
importFrom(stats,cmdscale)
diff --git a/NEWS.md b/NEWS.md
index 0a11373..54cfdae 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,12 +1,20 @@
-# semtree 0.9.20 (2023)
+# semtree 0.9.20 (2024)
-- changed default behavior of print function of varimp, such that na.omit=TRUE,
- which is consistent with other packages like party or partykit
+- added an error handler for score-based tests when the vcov matrix cannot be computed (e.g., models with Heywood cases)
+- leaner package imports: removed dependencies on the bitops and stringr packages
+- prefer `semforest_control()` over `semforest.control()` and `semtree_control()` over `semtree.control()` (see the sketch below)
+- added heuristics for choosing `mtry` in forests (if `NULL`) and for choosing `min.N` and `min.bucket` (if `NULL`)
+- moved dependency on `ctsemOMX` to suggested package
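+
+A minimal sketch of the renamed control constructors (argument names are
+assumed from the existing `semtree.control()`/`semforest.control()`
+interfaces; values are illustrative):
+
+```r
+tree_ctrl <- semtree_control(min.bucket = NULL) # NULL triggers the new heuristic
+forest_ctrl <- semforest_control(mtry = NULL)   # mtry likewise chosen heuristically
+```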
-# semtree 0.9.19 (2022)
+# semtree 0.9.19 (2023)
+- changed the default behavior of the print function of `varimp` so that na.omit=TRUE, which is consistent with other packages such as party or partykit
+- fixed issues with the `toTable()` command: by default, all parameters are now shown; also fixed a bug affecting `toTable()` with score-based tests
+- fixed problem with focus-parameters and variable importance
- bugfix in score-based tests that sometimes did not respect min.N constraints
- new functionality for parameter contribution evaluation
+- more verbose vignettes
+- removed dependencies on the sets, plotrix and digest packages to make package imports leaner
# semtree 0.9.18 (2022)
@@ -65,6 +73,6 @@
- deprecated partialDependencePlot and introduced partialDependence() function with S3 plotting method
- added parallel computation option to partialDependence
- added new demo scripts
-- added extra.legend paramter to varimpConvergencePlot
+- added extra.legend parameter to varimpConvergencePlot
- bugfix in traverse() that led to underestimations of variable importance in some cases
- added error message when trying to use lavaan and global constraints
diff --git a/R/OpenMx_scores_input.R b/R/OpenMx_scores_input.R
index 48aa1c9..7a28fdc 100644
--- a/R/OpenMx_scores_input.R
+++ b/R/OpenMx_scores_input.R
@@ -5,6 +5,25 @@ OpenMx_scores_input <- function(x, control) {
p_star <- p * (p + 1) / 2
p_star_means <- p * (p + 3) / 2
+  # AB: assign pseudo-labels to matrix cells whose
+  # free parameters were left unlabelled
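+  # (for unlabelled free parameters, OpenMx auto-generates names of the
+  # form "<modelname>.A[2,1]"; the code below parses the matrix letter
+  # and the row/column indices back out of that string)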
+  candidate_param_id <- which(startsWith(x = names(x$output$estimate), prefix = x$name))
+  if (length(candidate_param_id) > 0) {
+    for (k in candidate_param_id) {
+      candidate_param_name <- names(x$output$estimate)[k]
+      cplen <- nchar(x$name)
+      candidate_matrix <- substr(candidate_param_name, cplen + 2, cplen + 2)
+      candidate_pos <- as.integer(strsplit(substr(candidate_param_name, cplen + 4, nchar(candidate_param_name) - 1), ",")[[1]])
+      if (candidate_matrix == "A") {
+        x$A$labels[candidate_pos[1], candidate_pos[2]] <- candidate_param_name
+      } else if (candidate_matrix == "S") {
+        x$S$labels[candidate_pos[1], candidate_pos[2]] <- candidate_param_name
+      } else if (candidate_matrix == "M") {
+        # use row/column indexing here as well; indexing with the whole
+        # vector would address two separate cells of the 1 x p means matrix
+        x$M$labels[candidate_pos[1], candidate_pos[2]] <- candidate_param_name
+      }
+ }
+ }
+
if (control$linear | imxHasDefinitionVariable(x)) {
param_names <- names(x$output$estimate)
diff --git a/R/aggregateVarimp.R b/R/aggregateVarimp.R
index b63969e..9a409de 100644
--- a/R/aggregateVarimp.R
+++ b/R/aggregateVarimp.R
@@ -1,4 +1,3 @@
-
aggregateVarimp <-
function(vimp,
aggregate = "mean",
diff --git a/R/bootstrap.R b/R/bootstrap.R
index 773d087..b1e7b5b 100644
--- a/R/bootstrap.R
+++ b/R/bootstrap.R
@@ -5,15 +5,15 @@ forest.sample <-
return.oob = F,
type = "bootstrap") {
if (mtry > 0) {
- #cov.ids <- which(names(dataset) %in% covariates)
+ # cov.ids <- which(names(dataset) %in% covariates)
# sample length(cov.ids)-mtry of those to exclude
- #rem.ids <- sample(cov.ids, length(cov.ids)-mtry)
- #dataset <- dataset[, -rem.ids]
+ # rem.ids <- sample(cov.ids, length(cov.ids)-mtry)
+ # dataset <- dataset[, -rem.ids]
dataset <- sampleColumns(dataset, covariates, mtry)
}
-
+
N <- dim(dataset)[1]
-
+
if (type == "bootstrap") {
indices <- sample(1:N, N, replace = T)
} else if (type == "subsample") {
@@ -23,12 +23,11 @@ forest.sample <-
}
bootstrap.data <- dataset[indices, ]
oob.indices <- setdiff(1:N, unique(indices))
- oob.data <- dataset[oob.indices,]
+ oob.data <- dataset[oob.indices, ]
if (return.oob) {
return(list(bootstrap.data = bootstrap.data, oob.data = oob.data))
} else {
return(bootstrap.data)
}
-
- }
\ No newline at end of file
+ }
diff --git a/R/boruta.R b/R/boruta.R
index cfe0ff5..0ba714b 100644
--- a/R/boruta.R
+++ b/R/boruta.R
@@ -1,4 +1,3 @@
-
#' Run the Boruta algorithm on a sem tree
#'
#' Grows a series of SEM Forests following the boruta algorithm to determine
@@ -28,6 +27,7 @@
#'
#' @keywords tree models multivariate
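+#' @examples
+#' # Hypothetical sketch (not run): 'my_model' stands for any fitted OpenMx
+#' # (or lavaanified lavaan) model and 'my_data' for a data.frame containing
+#' # the model variables plus candidate predictor columns.
+#' \dontrun{
+#' res <- boruta(model = my_model, data = my_data, rounds = 1)
+#' }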
#' @export
boruta <- function(model,
data,
control = NULL,
@@ -38,22 +38,63 @@ boruta <- function(model,
verbose=FALSE,
...) {
-
- # TODO: make sure that no column names start with "shadow_" prefix
+  # initial checks
+  stopifnot(percentile_threshold >= 0)
+  stopifnot(percentile_threshold <= 1)
+  stopifnot(is.numeric(rounds))
+  stopifnot(rounds > 0)
+
+ preds_important <- c()
+ preds_unimportant <- c()
+
+  cur_round <- 1
+ temp_vims <- list()
+
+  while (cur_round <= rounds) {
+    vim_boruta <- .boruta(
+      model = model,
+      data = data,
+      control = control,
+      predictors = predictors,
+      percentile_threshold = percentile_threshold,
+      ...
+    )
+    # add predictors to the list of unimportant variables
+    preds_unimportant <- c(preds_unimportant, names(vim_boruta$filter)[!vim_boruta$filter])
+    # remove them from the dataset (drop by name; negative indexing
+    # does not work with character column names)
+    data <- data[, !(names(data) %in% preds_unimportant)]
+    temp_vims[[cur_round]] <- vim_boruta
+    cur_round <- cur_round + 1
+  }
+  result <- list(
+    preds_unimportant = preds_unimportant,
+    rounds = rounds
+  )
+
+ return(result)
+}
+
+.boruta <- function(model,
+ data,
+ control = NULL,
+ predictors = NULL,
+ percentile_threshold = 1,
+ num_shadows = 1,
+ ...) {
+
+ # make sure that no column names start with "shadow_" prefix
+  stopifnot(!any(startsWith(names(data), "shadow_")))
+
# detect model (warning: duplicated code)
- if (inherits(model,"MxModel") || inherits(model,"MxRAMModel")) {
- tmp <- getPredictorsOpenMx(mxmodel=model, dataset=data, covariates=predictors)
- model.ids <- tmp[[1]]
- covariate.ids <- tmp[[2]]
-# } else if (inherits(model,"lavaan")){
+ if (inherits(model, "MxModel") || inherits(model, "MxRAMModel")) {
+ tmp <- getPredictorsOpenMx(mxmodel = model, dataset = data, covariates = predictors)
- # } else if ((inherits(model,"ctsemFit")) || (inherits(model,"ctsemInit"))) {
-#
+  } else if (inherits(model, "lavaan")) {
+    tmp <- getPredictorsLavaan(model, data, predictors)
} else {
- ui_stop("Unknown model type selected. Use OpenMx or lavaanified lavaan models!");
+ ui_stop("Unknown model type selected. Use OpenMx or lavaanified lavaan models!")
}
-
+
+  model.ids <- tmp[[1]]
+  covariate.ids <- tmp[[2]]
+
# Checks on x & y from the boruta package
   if (length(grep("^shadow", names(data)[covariate.ids])) > 0)
     stop('Attributes with names starting with "shadow" are reserved for internal use. Please rename them.')
diff --git a/R/checkControl.R b/R/checkControl.R
index f098454..59f9d5d 100644
--- a/R/checkControl.R
+++ b/R/checkControl.R
@@ -8,12 +8,16 @@ checkControl <- function(control, fail = TRUE) {
return(fail)
}
-check.semtree.control <- function(control, fail = TRUE)
-{
+check.semtree.control <- function(control, fail = TRUE) {
attr <- attributes(control)$names
def.attr <- attributes(semtree.control())$names
- if ((length(intersect(attr, def.attr)) != length(attr)))
- {
+
+ # add NULL-defaults
+ null_def <- c("min.N","min.bucket","strucchange.to")
+ attr <- unique(c(attr, null_def))
+ def.attr <- unique(c(def.attr, null_def))
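+  # (these settings may legitimately remain NULL -- they are then chosen
+  # heuristically at fit time -- so they must not be rejected here as
+  # unknown control parameters)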
+
+ if ((length(intersect(attr, def.attr)) != length(attr))) {
unknown <- setdiff(attr, def.attr)
msg <-
paste("Control object contains unknown parameters:", unknown)
@@ -22,9 +26,8 @@ check.semtree.control <- function(control, fail = TRUE)
stop()
} else {
ui_warn(msg)
-
+
return(FALSE)
-
}
} else {
temp <- semtree.control()
@@ -36,23 +39,20 @@ check.semtree.control <- function(control, fail = TRUE)
}
}
} # end for
-
- return (TRUE)
+
+ return(TRUE)
}
-
-
-
- return (length(intersect(attr, def.attr)) == length(attr))
-
}
-check.semforest.control <- function(control, fail = TRUE)
-{
+check.semforest.control <- function(control, fail = TRUE) {
attr <- attributes(control)$names
def.attr <- attributes(semforest.control())$names
-
- if ((length(intersect(attr, def.attr)) != length(attr)))
- {
+
+ if ((length(intersect(attr, def.attr)) != length(attr))) {
unknown <- setdiff(attr, def.attr)
msg <-
paste("Control object contains unknown parameters:", unknown)
@@ -61,12 +61,10 @@ check.semforest.control <- function(control, fail = TRUE)
stop()
} else {
ui_warn(msg)
-
+
return(FALSE)
-
}
} else {
- return (TRUE)
-
+ return(TRUE)
}
-}
\ No newline at end of file
+}
diff --git a/R/checkModel.R b/R/checkModel.R
index bd2ded8..2c11479 100644
--- a/R/checkModel.R
+++ b/R/checkModel.R
@@ -24,6 +24,3 @@ checkModel <- function(model, control)
return(TRUE);
}
-
-#inherits(model1,"lavaan")
-#model1@Fit@converged
diff --git a/R/coef_ctsem.R b/R/coef_ctsem.R
index ac5d1ae..06c35a8 100644
--- a/R/coef_ctsem.R
+++ b/R/coef_ctsem.R
@@ -1,31 +1,31 @@
-# Quick and dirty function to get untramsformed parameter estimates from a
+# Quick and dirty function to get untransformed parameter estimates from a
 # ctsemFit object. This probably does not work for all types of CTSEMs.
-coef.ctsemFit <- function(x) {
+coef.ctsemFit <- function(object, ...) {
- res <- x$mxobj$output$estimate
+ res <- object$mxobj$output$estimate
- if (any(c(x$mxobj$MANIFESTVARbase$free))) {
- values <- x$mxobj$MANIFESTVAR$result[x$mxobj$MANIFESTVARbase$free]
- labels <- x$mxobj$MANIFESTVARbase$labels[!is.na(x$mxobj$MANIFESTVARbase$labels)]
+ if (any(c(object$mxobj$MANIFESTVARbase$free))) {
+ values <- object$mxobj$MANIFESTVAR$result[object$mxobj$MANIFESTVARbase$free]
+ labels <- object$mxobj$MANIFESTVARbase$labels[!is.na(object$mxobj$MANIFESTVARbase$labels)]
res[labels] <- values
}
- if (any(c(x$mxobj$DIFFUSIONbase$free))) {
- values <- x$mxobj$DIFFUSION$result[x$mxobj$DIFFUSIONbase$free]
- labels <- x$mxobj$DIFFUSIONbase$labels[!is.na(x$mxobj$DIFFUSIONbase$labels)]
+ if (any(c(object$mxobj$DIFFUSIONbase$free))) {
+ values <- object$mxobj$DIFFUSION$result[object$mxobj$DIFFUSIONbase$free]
+ labels <- object$mxobj$DIFFUSIONbase$labels[!is.na(object$mxobj$DIFFUSIONbase$labels)]
res[labels] <- values
}
- if (any(c(x$mxobj$T0VARbase$free))) {
- values <- x$mxobj$T0VAR$result[x$mxobj$T0VARbase$free]
- labels <- x$mxobj$T0VARbase$labels[!is.na(x$mxobj$T0VARbase$labels)]
+ if (any(c(object$mxobj$T0VARbase$free))) {
+ values <- object$mxobj$T0VAR$result[object$mxobj$T0VARbase$free]
+ labels <- object$mxobj$T0VARbase$labels[!is.na(object$mxobj$T0VARbase$labels)]
res[labels] <- values
}
- if (any(c(x$mxobj$TRAITVARbase$free))) {
- values <- x$mxobj$TRAITVAR$result[x$mxobj$TRAITVARbase$free]
- labels <- x$mxobj$TRAITVARbase$labels[!is.na(x$mxobj$TRAITVARbase$labels)]
+ if (any(c(object$mxobj$TRAITVARbase$free))) {
+ values <- object$mxobj$TRAITVAR$result[object$mxobj$TRAITVARbase$free]
+ labels <- object$mxobj$TRAITVARbase$labels[!is.na(object$mxobj$TRAITVARbase$labels)]
res[labels] <- values
}
diff --git a/R/computePval_maxLR.R b/R/computePval_maxLR.R
index 1b6a114..c3e0438 100644
--- a/R/computePval_maxLR.R
+++ b/R/computePval_maxLR.R
@@ -1,18 +1,18 @@
#' Wrapper function for computing the maxLR corrected p value
#' from strucchange
-#'
+#'
#' @param maxLR maximum of the LR test statistics
#' @param q number of free SEM parameters / degrees of freedom
#' @param covariate covariate under evaluation. This is important to get the level of
#' measurement from the covariate and the bin size for ordinal and
#' categorical covariates.
#' @param from numeric from interval (0, 1) specifying start of trimmed
-#' sample period. With the default
+#' sample period. With the default
#' from = 0.15 the first and last 15 percent of observations are
#' trimmed. This is only needed for continuous covariates.
#' @param to numeric from interval (0, 1) specifying end of trimmed
#' sample period. By default, to is 1.
-#' @param nrep numeric. Number of replications used for simulating from the asymptotic
+#' @param nrep numeric. Number of replications used for simulating from the asymptotic
#' distribution (passed to efpFunctional). Only needed for ordinal
#' covariates.
#'
@@ -20,23 +20,22 @@
#' @author Manuel Arnold
#' @return Numeric. p value for maximally selected LR statistic
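+#' @examples
+#' # Hedged sketch (not run): p value of a maximally selected LR statistic of
+#' # 10 with q = 3 free parameters, ordered along a continuous covariate
+#' \dontrun{
+#' computePval_maxLR(maxLR = 10, q = 3, covariate = rnorm(200),
+#'                   from = 0.15, to = 1, nrep = 50000)
+#' }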
- computePval_maxLR <- function(maxLR, q, covariate, from, to, nrep) {
-
- # Level of measurement
- if (!is.factor(covariate)) { # metric
- pval <- strucchange::supLM(from = from, to = to)$computePval(x = maxLR, nproc = q)
- } else {
- covariate <- sort(covariate) # sort covariate
- covariate <- droplevels(covariate)
- if (is.ordered(covariate)) { # ordinal
- pval <- strucchange::ordL2BB(freq = covariate, nproc = q,
- nrep = nrep)$computePval(x = maxLR, nproc = q)
- } else { # categorical
- pval <- strucchange::catL2BB(covariate)$computePval(x = maxLR, nproc = q)
- }
+computePval_maxLR <- function(maxLR, q, covariate, from, to, nrep) {
+ # Level of measurement
+ if (!is.factor(covariate)) { # metric
+ pval <- strucchange::supLM(from = from, to = to)$computePval(x = maxLR, nproc = q)
+ } else {
+ covariate <- sort(covariate) # sort covariate
+ covariate <- droplevels(covariate)
+ if (is.ordered(covariate)) { # ordinal
+ pval <- strucchange::ordL2BB(
+ freq = covariate, nproc = q,
+ nrep = nrep
+ )$computePval(x = maxLR, nproc = q)
+ } else { # categorical
+ pval <- strucchange::catL2BB(covariate)$computePval(x = maxLR, nproc = q)
}
-
- pval
-
}
-
+
+ pval
+}
diff --git a/R/conditional.R b/R/conditional.R
index bd69e42..08da777 100644
--- a/R/conditional.R
+++ b/R/conditional.R
@@ -1,74 +1,78 @@
# collect all cut-points
collect.rules <- function(x, exclude) {
- if (x$caption=="TERMINAL") {
+ if (x$caption == "TERMINAL") {
return(NULL)
}
-
+
if (!x$rule$variable %in% exclude) {
- c(list(x$rule),
- collect.rules(x$left_child, exclude),
- collect.rules(x$right_child, exclude))
+ c(
+ list(x$rule),
+ collect.rules(x$left_child, exclude),
+ collect.rules(x$right_child, exclude)
+ )
} else {
- c(collect.rules(x$left_child, exclude),
- collect.rules(x$right_child, exclude))
+ c(
+ collect.rules(x$left_child, exclude),
+ collect.rules(x$right_child, exclude)
+ )
}
}
-conditionalSample <- function(tree, dataset, id)
-{
-permutation.idx <- id
-rules <- collect.rules(tree, exclude=id)
+conditionalSample <- function(tree, dataset, id) {
+ permutation.idx <- id
+ rules <- collect.rules(tree, exclude = id)
-num.rules <- length(rules)
+ num.rules <- length(rules)
-# if there are too many rules, sample from them
-if (num.rules > 10) {
- rules <- sample(x = rules,size=10,replace = FALSE)
-}
+ # if there are too many rules, sample from them
+  if (num.rules > 10) {
+    rules <- sample(x = rules, size = 10, replace = FALSE)
+    # keep num.rules in sync with the (possibly reduced) rule set; the
+    # bit masks below index into logicalS, which has one entry per rule
+    num.rules <- length(rules)
+  }
-# obtain logicals from rules
-logicalS <- list()
-for (i in 1:length(rules)) {
-rule <- rules[[i]]
-if (rule$relation=="%in%")
- logicalS[[i]] <- dataset[,rule$variable] %in% rule$value
-else if (rule$relation==">=")
- logicalS[[i]] <- dataset[,rule$variable] >= rule$value
-else
- stop("Not Implemented!")
-}
+ # obtain logicals from rules
+ logicalS <- list()
+ for (i in 1:length(rules)) {
+ rule <- rules[[i]]
+ if (rule$relation == "%in%") {
+ logicalS[[i]] <- dataset[, rule$variable] %in% rule$value
+ } else if (rule$relation == ">=") {
+ logicalS[[i]] <- dataset[, rule$variable] >= rule$value
+ } else {
+ stop("Not Implemented!")
+ }
+ }
-if (length(logicalS)<=0) {
- stop("ERROR in generating rules for resampling!")
-}
+ if (length(logicalS) <= 0) {
+ stop("ERROR in generating rules for resampling!")
+ }
-initialS <- rep(TRUE, length(logicalS[[1]]))
+ initialS <- rep(TRUE, length(logicalS[[1]]))
-# generate all combinations and roll!
-allcombs <- 2^length(rules)
-for (i in 1:allcombs) {
- bit.filter <- as.logical(intToBits(i))[1:num.rules]
+ # generate all combinations and roll!
+ allcombs <- 2^length(rules)
+ for (i in 1:allcombs) {
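+    # intToBits(i) yields the binary expansion of i; its first num.rules
+    # bits decide, per rule, whether the rule or its negation is applied,
+    # so the loop visits every cell of the partition induced by the rules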
+ bit.filter <- as.logical(intToBits(i))[1:num.rules]
-
- tempS <- initialS
- for (j in 1:length(bit.filter)) {
- if (bit.filter[j])
- tempS <- tempS & logicalS[[j]]
- else
- tempS <- tempS & !logicalS[[j]]
- }
- # permute only temp
- rowids <- 1:nrow(dataset)
- selected.rowids <- rowids[tempS]
-
- if (length(selected.rowids) > 0)
- {
- dataset[selected.rowids,permutation.idx] <- dataset[sample(selected.rowids),permutation.idx]
+
+ tempS <- initialS
+ for (j in 1:length(bit.filter)) {
+ if (bit.filter[j]) {
+ tempS <- tempS & logicalS[[j]]
+ } else {
+ tempS <- tempS & !logicalS[[j]]
+ }
+ }
+ # permute only temp
+ rowids <- 1:nrow(dataset)
+ selected.rowids <- rowids[tempS]
+
+ if (length(selected.rowids) > 0) {
+ dataset[selected.rowids, permutation.idx] <- dataset[sample(selected.rowids), permutation.idx]
+ }
+
+
+ cat(bit.filter, " : ", length(selected.rowids), "\n")
}
-
-
- cat(bit.filter," : ", length(selected.rowids),"\n")
-}
-return(dataset)
-}
\ No newline at end of file
+ return(dataset)
+}
diff --git a/R/evaluateTree.R b/R/evaluateTree.R
index 181489d..13e685c 100644
--- a/R/evaluateTree.R
+++ b/R/evaluateTree.R
@@ -1,11 +1,11 @@
#' Evaluate Tree -2LL
-#'
+#'
#' A helper function to evaluate the negative two log-likelihood (-2LL) of leaf (terminal) nodes for a
#' dataset. When given a \code{\link{semtree}} and a unique dataset, the model
#' estimates -2LL for the tree parameters and data subsets that fit the tree
#' branching criteria.
-#'
-#'
+#'
+#'
#' @param tree A fitted \code{\link{semtree}} object
#' @param test_set Dataset to fit to a fitted \code{\link{semtree}} object
#' @param data_type type of data ("raw", "cov", "cor")
@@ -26,32 +26,29 @@ evaluateTree <-
function(tree,
test_set,
data_type = "raw",
- leaf_ids = NULL)
- {
+ leaf_ids = NULL) {
# get a mapping of dataset rows to leaf ids
if (is.null(leaf_ids)) {
leaf_ids <- traverse(tree, test_set)
}
-
+
# for each leaf, calculate deviance of each data row
dev <- 0
for (leaf_id in unique(leaf_ids))
{
- temp_set <- test_set[leaf_ids == leaf_id,]
-
-
+ temp_set <- test_set[leaf_ids == leaf_id, ]
+
+
leaf <- getNodeById(tree, leaf_id)
-
+
# add up log-likelihoods
dev <-
dev + evaluateDataLikelihood(leaf$model, temp_set[, , drop = F], data_type)
}
-
+
result <- list()
result$deviance <- dev
result$num_models <- length(unique(leaf_ids))
-
+
return(result)
-
-
}
diff --git a/R/evaluateTreeFocus.R b/R/evaluateTreeFocus.R
index 9160d87..23a15bd 100644
--- a/R/evaluateTreeFocus.R
+++ b/R/evaluateTreeFocus.R
@@ -4,60 +4,59 @@
# computes the difference in log-likelihoods considering focus parameters
#
evaluateTreeFocus <-
- function(tree, test_set, data_type="raw", leaf_ids=NULL)
- {
-
+ function(tree, test_set, data_type = "raw", leaf_ids = NULL) {
# get a mapping of dataset rows to leaf ids
if (is.null(leaf_ids)) {
leaf_ids <- traverse(tree, test_set)
}
-
+
# for each leaf, calculate deviance of each data row
dev <- 0
for (leaf_id in unique(leaf_ids))
{
# get all data rows from current leaf
- temp_set <- test_set[leaf_ids==leaf_id, ];
-
+ temp_set <- test_set[leaf_ids == leaf_id, ]
+
# get the leaf object
- leaf <- getNodeById( tree, leaf_id)
-
+ leaf <- getNodeById(tree, leaf_id)
+
# test if node has a focus model
if (is.null(leaf$focus.model)) {
ui_warn("No focus model available!")
return(NA)
}
-
+
# evaluate log-likelihood from baseline and focus model
- #baseline = evaluateDataLikelihood(leaf$model, temp_set[,,drop=F], data_type )
- ll.focus = evaluateDataLikelihood(leaf$focus.model,
- temp_set[,,drop=F], data_type )
-
+ # baseline = evaluateDataLikelihood(leaf$model, temp_set[,,drop=F], data_type )
+ ll.focus <- evaluateDataLikelihood(
+ leaf$focus.model,
+ temp_set[, , drop = F], data_type
+ )
+
# evaluate log-likelihood after permutation
-
-
+
+
# add up log-likelihoods
dev <- dev + ll.focus
}
-
+
result <- list()
result$deviance <- dev
result$num_models <- length(unique(leaf_ids))
-
- return(result);
-
+
+ return(result)
}
# TODO: finish this block
# TODO: remove earlier computation of baseline ll
# compute influence of focus parameter before permutation
-#ll.baseline <- eval.fun(tree, oob.data)$deviance
-#ll.baseline <- fitSubmodels(tree$model, subset1, subset2,
+# ll.baseline <- eval.fun(tree, oob.data)$deviance
+# ll.baseline <- fitSubmodels(tree$model, subset1, subset2,
# control, invariance=constraints$focus.parameters)
# compute misfit of focus parameter after permutation
-#ll.permuted <- eval.fun(tree, oob.data.permuted)$deviance
-#ll.diff <- -ll.baseline + ll.permuted
-#ui_warn("Unfinished implementation!")
+# ll.permuted <- eval.fun(tree, oob.data.permuted)$deviance
+# ll.diff <- -ll.baseline + ll.permuted
+# ui_warn("Unfinished implementation!")
-#}
\ No newline at end of file
+# }
diff --git a/R/fairSplit.R b/R/fairSplit.R
index 181445d..12333ca 100644
--- a/R/fairSplit.R
+++ b/R/fairSplit.R
@@ -6,43 +6,43 @@ fairSplit <-
meta = NULL,
constraints = NULL,
...) {
-
- # add a column of uniform random numbers to the data
+ # add a column of uniform random numbers to the data
   # to split into two halves
# TODO: replace with sample() of row ids
n <- nrow(mydata)
random <- runif(n, 0, 1)
mydata <- cbind(mydata, random)
- cross1 <- subset (mydata, as.numeric(mydata[, ncol(mydata)]) > 0.50)
+ cross1 <- subset(mydata, as.numeric(mydata[, ncol(mydata)]) > 0.50)
cross2 <-
- subset (mydata, as.numeric(mydata[, ncol(mydata)]) <= 0.50)
+ subset(mydata, as.numeric(mydata[, ncol(mydata)]) <= 0.50)
cross1 <- cross1[-ncol(cross1)]
cross2 <- cross2[-ncol(cross2)]
mydata <- mydata[-ncol(mydata)]
-
+
LL.btn <- c()
split.btn <- c()
cov.btn.names <- c()
cov.btn.cols <- c()
cov.type <- c()
-
- #Baseline model to compare subgroup fits to
+
+ # Baseline model to compare subgroup fits to
modelnew <- mxAddNewModelData(model, cross1, name = "BASE MODEL C1")
LL.overall <- safeRunAndEvaluate(modelnew)
- suppressWarnings(if (is.na(LL.overall))
- return(NULL))
+ suppressWarnings(if (is.na(LL.overall)) {
+ return(NULL)
+ })
n.comp <- 0
-
+
if (control$report.level > 2) {
report("Phase I - Select within variables", 1)
}
-
+
#
-
+
# Step I - use cross validation fold 1 to evaluate all splits and
# select best split
#
-
+
# iterate over all variables
for (cur_col in meta$covariate.ids) {
LL.baseline <- LL.overall
@@ -53,51 +53,54 @@ fairSplit <-
if (control$report.level > 10) {
report(paste("Estimating baseline likelihood: ", LL.baseline), 2)
}
-
-
+
+
if (control$report.level >= 1 || control$verbose) {
- ui_message(paste(
- "Testing predictor",
- colnames(cross1)[cur_col],
- " (#",
- cur_col,
- "/",
- ncol(cross1),
- ")"
- ),
- 2)
+ ui_message(
+ paste(
+ "Testing predictor",
+ colnames(cross1)[cur_col],
+ " (#",
+ cur_col,
+ "/",
+ ncol(cross1),
+ ")"
+ ),
+ 2
+ )
}
-
+
LL.within <- base::c()
within.split <- base::c()
-
- #case for factored covariates##############################
+
+ # case for factored covariates##############################
if (is.factor(cross1[, cur_col])) {
- #unordered factors#####################################
+ # unordered factors#####################################
if (!is.ordered(cross1[, cur_col])) {
- var.type = .SCALE_CATEGORICAL
-
+ var.type <- .SCALE_CATEGORICAL
+
v <- cross1[, cur_col]
val.sets <- sort(unique(v))
-
-
+
+
if (length(val.sets) > 1) {
- #create binaries for comparison of all combinations
+ # create binaries for comparison of all combinations
result <-
- recodeAllSubsets(cross1[, cur_col], colnames(cross1)[cur_col], use.levels =
- T)
- test1 <- rep(0, length(cross1[, cur_col]))#base::c()
+ recodeAllSubsets(cross1[, cur_col], colnames(cross1)[cur_col],
+ use.levels =
+ T
+ )
+ test1 <- rep(0, length(cross1[, cur_col])) # base::c()
test2 <- rep(NA, length(cross1[, cur_col]))
-
+
for (j in 1:ncol(result$columns)) {
- #cat("RUN",j,"\n")
+ # cat("RUN",j,"\n")
test1 <- rep(0, length(cross1[, cur_col]))
for (i in 1:length(cross1[, cur_col])) {
if (isTRUE(result$columns[i, j])) {
test1[i] <- 1
- }
- else {
+ } else {
test1[i] <- 0
}
}
@@ -107,25 +110,25 @@ fairSplit <-
test2 <- test2[, -1]
for (i in 1:(result$num_sets)) {
LL.temp <- base::c()
- #subset data for chosen value and store LL
+ # subset data for chosen value and store LL
if (result$num_sets == 1) {
- vec = test2
+ vec <- test2
+ } else {
+ vec <- test2[[i]]
}
- else {
- vec = test2[[i]]
- }
- subset1 <- subset (cross1, as.numeric(vec) == 2)
- subset2 <- subset (cross1, as.numeric(vec) == 1)
-
+ subset1 <- subset(cross1, as.numeric(vec) == 2)
+ subset2 <- subset(cross1, as.numeric(vec) == 1)
+
# refit baseline model with focus parameters @TAGX
# for each new potential split
if (!is.null(constraints) &
- (!is.null(constraints$focus.parameters))) {
+ (!is.null(constraints$focus.parameters))) {
LL.baseline <- fitSubmodels(model,
- subset1,
- subset2,
- control,
- invariance = constraints$focus.parameters)
+ subset1,
+ subset2,
+ control,
+ invariance = constraints$focus.parameters
+ )
if (control$report.level > 10) {
report(
paste(
@@ -136,17 +139,17 @@ fairSplit <-
)
}
}
-
- #catch LLR for each comparison, only if valid
+
+ # catch LLR for each comparison, only if valid
LL.return <-
fitSubmodels(model, subset1, subset2, control, invariance = NULL)
-
+
if (nrow(subset1) + nrow(subset2) != nrow(cross1)) {
message("INCONSISTENCY ERROR DETECTED")
-
+
LL.return <- NA
}
-
+
if (!is.na(LL.return)) {
LL.within <- cbind(LL.within, (LL.baseline - LL.return))
within.split <- cbind(within.split, i)
@@ -154,34 +157,35 @@ fairSplit <-
}
}
}
-
-
- #ordered factors#########################################
+
+
+ # ordered factors#########################################
if (is.ordered(cross1[, cur_col])) {
- var.type = .SCALE_ORDINAL
-
+ var.type <- .SCALE_ORDINAL
+
v <- cross1[, cur_col]
val.sets <- sort(unique(v))
if (length(val.sets) > 1) {
for (i in 2:(length(val.sets))) {
LL.temp <- base::c()
- #subset data for chosen value and store LL
+ # subset data for chosen value and store LL
cond1 <-
- cross1[, cur_col] > val.sets[i-1]
+ cross1[, cur_col] > val.sets[i - 1]
cond2 <-
- cross1[, cur_col] <= val.sets[i-1]
-
- subset1 <- subset (cross1, cond1)
- subset2 <- subset (cross1, cond2)
-
+ cross1[, cur_col] <= val.sets[i - 1]
+
+ subset1 <- subset(cross1, cond1)
+ subset2 <- subset(cross1, cond2)
+
# refit baseline model with focus parameters @TAGX
if (!is.null(constraints) &
- (!is.null(constraints$focus.parameters))) {
+ (!is.null(constraints$focus.parameters))) {
LL.baseline <- fitSubmodels(model,
- subset1,
- subset2,
- control,
- invariance = constraints$focus.parameters)
+ subset1,
+ subset2,
+ control,
+ invariance = constraints$focus.parameters
+ )
if (control$report.level > 10) {
report(
paste(
@@ -192,46 +196,47 @@ fairSplit <-
)
}
}
-
- #catch LLR for each comparison
+
+ # catch LLR for each comparison
LL.return <-
fitSubmodels(model, subset1, subset2, control, invariance = NULL)
if (!is.na(LL.return)) {
LL.within <- cbind(LL.within, (LL.baseline - LL.return))
within.split <-
- # cbind(within.split, (val.sets[i] + val.sets[(i - 1)]) / 2)
- c(within.split, as.character(val.sets[i-1]))
+ # cbind(within.split, (val.sets[i] + val.sets[(i - 1)]) / 2)
+ c(within.split, as.character(val.sets[i - 1]))
}
}
}
}
}
-
- #numeric (continuous) covariates################################
+
+ # numeric (continuous) covariates################################
if (is.numeric(cross1[, cur_col])) {
- var.type = .SCALE_METRIC
+ var.type <- .SCALE_METRIC
v <- as.numeric(cross1[, cur_col])
val.sets <- sort(unique(v))
- #if(length(val.sets) < 30|!isTRUE(control$shortcut)){
+ # if(length(val.sets) < 30|!isTRUE(control$shortcut)){
if (length(val.sets) > 1) {
for (i in 2:(length(val.sets))) {
LL.temp <- base::c()
- #subset data for chosen value and store LL
+ # subset data for chosen value and store LL
cond1 <-
as.numeric(cross1[, cur_col]) > (val.sets[i] + val.sets[(i - 1)]) / 2
cond2 <-
as.numeric(cross1[, cur_col]) < (val.sets[i] + val.sets[(i - 1)]) / 2
- subset1 <- subset (cross1, cond1)
- subset2 <- subset (cross1, cond2)
-
+ subset1 <- subset(cross1, cond1)
+ subset2 <- subset(cross1, cond2)
+
# refit baseline model with focus parameters @TAGX
if (!is.null(constraints) &
- (!is.null(constraints$focus.parameters))) {
+ (!is.null(constraints$focus.parameters))) {
LL.baseline <- fitSubmodels(model,
- subset1,
- subset2,
- control,
- invariance = constraints$focus.parameters)
+ subset1,
+ subset2,
+ control,
+ invariance = constraints$focus.parameters
+ )
if (control$report.level > 10) {
report(
paste(
@@ -242,8 +247,8 @@ fairSplit <-
)
}
}
-
- #catch LLR for each comparison
+
+ # catch LLR for each comparison
LL.return <-
fitSubmodels(model, subset1, subset2, control, invariance = NULL)
if (!is.na(LL.return)) {
@@ -251,45 +256,49 @@ fairSplit <-
within.split <-
cbind(within.split, (val.sets[i] + val.sets[(i - 1)]) / 2)
} else {
- if (control$verbose)
+ if (control$verbose) {
ui_fail("LL was NA when fitting submodels!")
+ }
if (control$report.level > 2) {
- report(paste("Could not estimate split at value ", val.sets[i]),
- 2)
+ report(
+ paste("Could not estimate split at value ", val.sets[i]),
+ 2
+ )
}
}
}
}
- #}
+ # }
}
-
+
if (control$report.level > 10) {
if (!is.null(LL.within)) {
report(paste("Within Likelihoods ", paste(round(
LL.within, 2
), collapse = " ")), 2)
- }
- else{
+ } else {
message("Within LLs NULL")
}
}
-
+
if (control$report.level > 0) {
if (is.null(LL.within)) {
- report("No valid estimate found for any split value during the first round.",
- 2)
+ report(
+ "No valid estimate found for any split value during the first round.",
+ 2
+ )
}
}
-
+
max.LL.within <- base::c()
max.within.split <- base::c()
-
- #store the LL, split value and variable number for each cov that makes a possible split
+
+ # store the LL, split value and variable number for each cov that makes a possible split
if (!is.null(LL.within)) {
max.LL.within <- LL.within[1]
max.within.split <- within.split[1]
max.within.cov <- cur_col
-
+
if (length(LL.within) > 1) {
for (i in 2:length(LL.within)) {
if (!is.na(LL.within[i]) | !is.null(LL.within[i])) {
@@ -299,52 +308,51 @@ fairSplit <-
}
}
}
-
}
-
+
LL.btn <- cbind(LL.btn, max.LL.within)
split.btn <- cbind(split.btn, max.within.split)
- #cov.btn.names <- cbind(cov.btn.names, colnames(mydata[cur_col]))
+ # cov.btn.names <- cbind(cov.btn.names, colnames(mydata[cur_col]))
cov.btn.cols <- cbind(cov.btn.cols, max.within.cov)
cov.type <- cbind(cov.type, var.type)
-
+
if (control$report.level >= 3) {
- report(paste(
- "Best split at ",
- max.within.split,
- " with LL",
- max.LL.within
- ),
- 2)
+ report(
+ paste(
+ "Best split at ",
+ max.within.split,
+ " with LL",
+ max.LL.within
+ ),
+ 2
+ )
}
-
}
-
}
-
+
#
# Phase II - select between variables using their best split
# use cross validation fold 2 for evaluation
#
- #cat("PHASE II")
+ # cat("PHASE II")
if (control$report.level > 2) {
report("Phase II - Select between variables", 1)
}
-
- #Baseline model to compare subgroup fits to
+
+ # Baseline model to compare subgroup fits to
modelnew <- mxAddNewModelData(model, cross2, name = "BASE MODEL C2")
LL.overall <- safeRunAndEvaluate(modelnew)
suppressWarnings(if (is.na(LL.overall)) {
warning("Baseline likelihood is N/A; Aborting!")
-
+
return(NULL)
})
n.comp <- ncol(LL.btn)
-
+
LL.max <- NULL
-
+
if (!is.null(LL.btn)) {
for (cur_col in 1:length(LL.btn)) {
LL.temp <- base::c()
@@ -356,12 +364,14 @@ fairSplit <-
LL.baseline <- safeRunAndEvaluate(missingModel)
num.rows <- dim(missingModel@data@observed)[1]
}
-
+
if (cov.type[cur_col] == .SCALE_CATEGORICAL) {
if (!is.ordered(cross2[, cov.btn.cols[cur_col]])) {
result <-
- recodeAllSubsets(cross2[, (cov.btn.cols[cur_col])], colnames(cross2)[(cov.btn.cols[cur_col])], use.levels =
- T)
+ recodeAllSubsets(cross2[, (cov.btn.cols[cur_col])], colnames(cross2)[(cov.btn.cols[cur_col])],
+ use.levels =
+ T
+ )
test1 <- base::c()
clen <- dim(cross2)[1]
test2 <- rep(NA, clen)
@@ -370,8 +380,7 @@ fairSplit <-
for (i in 1:clen) {
if (isTRUE(result$columns[i, j])) {
test1[i] <- 1
- }
- else {
+ } else {
test1[i] <- 0
}
}
@@ -379,23 +388,21 @@ fairSplit <-
test2 <- data.frame(test2, test1)
}
test2 <- test2[, -1]
-
-
+
+
if (result$num_sets == 1) {
vec <- test2
- }
- else {
+ } else {
vec <- test2[[split.btn[cur_col]]]
}
-
-
- subset1 <- subset (cross2, as.numeric(vec) == 2)
- subset2 <- subset (cross2, as.numeric(vec) == 1)
+
+
+ subset1 <- subset(cross2, as.numeric(vec) == 2)
+ subset2 <- subset(cross2, as.numeric(vec) == 1)
}
- }
- else if (cov.type[cur_col] == .SCALE_ORDINAL) {
+ } else if (cov.type[cur_col] == .SCALE_ORDINAL) {
cond1 <-
cross2[, cov.btn.cols[cur_col]] > split.btn[cur_col]
cond2 <-
@@ -408,12 +415,12 @@ fairSplit <-
cond2 <-
cross2[, cov.btn.cols[cur_col]] < split.btn[cur_col]
subset1 <- subset(cross2, cond1)
- subset2 <- subset(cross2, cond2)
+ subset2 <- subset(cross2, cond2)
}
-
+
# refit baseline model with focus parameters @TAGX
if (!is.null(constraints) &
- (!is.null(constraints$focus.parameters))) {
+ (!is.null(constraints$focus.parameters))) {
result <- fitSubmodels(
model,
subset1,
@@ -437,10 +444,10 @@ fairSplit <-
nrow(subset1) + nrow(subset2) # THIS DEFINITELY IS A HACK. SHOULD BE
# DEFINED BY THE RETURNED MODEL
} else {
- #warning("LL.sum is NA after reestimation with focus parameter")
+ # warning("LL.sum is NA after reestimation with focus parameter")
ui_warn(
"Could not estimate constrained/focus model for variable ",
-# colnames(cross2)[cur_col],
+ # colnames(cross2)[cur_col],
names(mydata[cov.btn.cols[cur_col]]),
". Possibly choosing a suboptimal split."
)
@@ -448,16 +455,18 @@ fairSplit <-
num.rows <- nrow(subset1) + nrow(subset2)
}
}
-
+
# evaluate split
LL.cur <-
- LL.baseline - fitSubmodels(model, subset1, subset2, control, invariance =
- NULL)
-
+ LL.baseline - fitSubmodels(model, subset1, subset2, control,
+ invariance =
+ NULL
+ )
+
# cross2 == subset of data for Phase II
-
+
if (nrow(subset1) + nrow(subset2) != num.rows) {
- #browser()
+ # browser()
ui_fail(
paste(
"SERIOUS INCONSISTENCY ERROR. Numbers of rows do not match. Type="
@@ -472,11 +481,12 @@ fairSplit <-
)
LL.cur <- NA
}
-
- if (control$verbose)
+
+ if (control$verbose) {
ui_message(paste("Testing", cur_col, ":", names(mydata[cov.btn.cols[cur_col]]), " ", LL.cur, "\n"))
-
-
+ }
+
+
if (cur_col == 1) {
LL.max <- LL.cur
LL.btn <- LL.cur
@@ -484,8 +494,7 @@ fairSplit <-
name.max <- names(mydata[cov.btn.cols[1]])
col.max <- cov.btn.cols[1]
type.max <- cov.type[1]
- }
- else {
+ } else {
LL.btn <- cbind(LL.btn, LL.cur)
if (!is.na(LL.cur) & !is.null(LL.cur)) {
if (is.na(LL.max) || is.na(LL.cur) || LL.max < LL.cur) {
@@ -497,15 +506,12 @@ fairSplit <-
}
}
}
-
+
if (control$report.level > 2) {
report(paste("Name", names(mydata[cov.btn.cols[cur_col]]), "LL:", LL.cur), 2)
}
-
}
-
- }
- else {
+ } else {
return(NULL)
}
##
@@ -516,22 +522,22 @@ fairSplit <-
if (control$report.level > 2) {
report("Phase III - Select Split in Best Variable", 1)
}
-
+
if (!is.null(constraints$focus.parameters)) {
stop("Method 'fair3' does not yet support focus parameters in tree.constraints.")
}
-
+
# run full data on model for overall model fit
modelnew <-
mxAddNewModelData(model, mydata, name = "BASE MODEL FULL")
LL.overall <- safeRunAndEvaluate(modelnew)
suppressWarnings(if (is.na(LL.overall)) {
warning("Overall likelihood is N/A; Aborting!")
-
+
return(NULL)
})
n.comp <- 0
-
+
# iterate over all splits in col.max
cur_col <- col.max
LL.baseline <- LL.overall
@@ -539,47 +545,51 @@ fairSplit <-
if (!is.null(missingModel)) {
LL.baseline <- safeRunAndEvaluate(missingModel)
}
-
- if (control$verbose)
- ui_message("Testing ",
- cur_col,
- "/",
- ncol(mydata),
- " (",
- colnames(mydata)[cur_col],
- ")")
-
+
+ if (control$verbose) {
+ ui_message(
+ "Testing ",
+ cur_col,
+ "/",
+ ncol(mydata),
+ " (",
+ colnames(mydata)[cur_col],
+ ")"
+ )
+ }
+
if (control$report.level >= 1) {
report(paste("Testing predictor", colnames(mydata)[cur_col]), 1)
}
-
+
LL.within <- base::c()
within.split <- base::c()
-
- #case for factored covariates##############################
+
+ # case for factored covariates##############################
if (is.factor(mydata[, cur_col])) {
- #unordered factors#####################################
+ # unordered factors#####################################
if (!is.ordered(mydata[, cur_col])) {
- var.type = 1
+ var.type <- 1
v <- as.numeric(mydata[, cur_col])
val.sets <- sort(union(v, v))
- #cat("Length", length(val.sets),":",paste(v),"\n")
+ # cat("Length", length(val.sets),":",paste(v),"\n")
if (length(val.sets) > 1) {
- #create binaries for comparison of all combinations
+ # create binaries for comparison of all combinations
result <-
- recodeAllSubsets(mydata[, cur_col], colnames(mydata)[cur_col], use.levels =
- T)
- test1 <- rep(0, length(mydata[, cur_col]))#base::c()
+ recodeAllSubsets(mydata[, cur_col], colnames(mydata)[cur_col],
+ use.levels =
+ T
+ )
+ test1 <- rep(0, length(mydata[, cur_col])) # base::c()
test2 <- rep(NA, length(mydata[, cur_col]))
-
+
for (j in 1:ncol(result$columns)) {
- #cat("RUN",j,"\n")
+ # cat("RUN",j,"\n")
test1 <- rep(0, length(mydata[, cur_col]))
for (i in 1:length(mydata[, cur_col])) {
if (isTRUE(result$columns[i, j])) {
test1[i] <- 1
- }
- else {
+ } else {
test1[i] <- 0
}
}
@@ -589,26 +599,25 @@ fairSplit <-
test2 <- test2[, -1]
for (i in 1:(result$num_sets)) {
LL.temp <- base::c()
- #subset data for chosen value and store LL
+ # subset data for chosen value and store LL
if (result$num_sets == 1) {
- vec = test2
- }
- else {
- vec = test2[[i]]
+ vec <- test2
+ } else {
+ vec <- test2[[i]]
}
- subset1 <- subset (mydata, as.numeric(vec) == 2)
- subset2 <- subset (mydata, as.numeric(vec) == 1)
-
- #catch LLR for each comparison, only if valid
+ subset1 <- subset(mydata, as.numeric(vec) == 2)
+ subset2 <- subset(mydata, as.numeric(vec) == 1)
+
+ # catch LLR for each comparison, only if valid
LL.return <-
fitSubmodels(model, subset1, subset2, control, invariance = NULL)
-
+
if (nrow(subset1) + nrow(subset2) != nrow(mydata)) {
message("INCONSISTENCY ERROR DETECTED")
-
+
LL.return <- NA
}
-
+
if (!is.na(LL.return)) {
LL.within <- cbind(LL.within, (LL.baseline - LL.return))
within.split <- cbind(within.split, i)
@@ -616,26 +625,26 @@ fairSplit <-
}
}
}
-
-
- #ordered factors#########################################
+
+
+ # ordered factors#########################################
if (is.ordered(mydata[, cur_col])) {
- var.type = .SCALE_ORDINAL
+ var.type <- .SCALE_ORDINAL
v <- as.numeric(as.character(mydata[, cur_col]))
val.sets <- sort(union(v, v))
if (length(val.sets) > 1) {
for (i in 2:(length(val.sets))) {
LL.temp <- base::c()
- #subset data for chosen value and store LL
+ # subset data for chosen value and store LL
cond1 <-
as.numeric(as.character(mydata[, cur_col])) > (val.sets[i] + val.sets[(i -
- 1)]) / 2
+ 1)]) / 2
cond2 <-
as.numeric(as.character(mydata[, cur_col])) < (val.sets[i] + val.sets[(i -
- 1)]) / 2
- subset1 <- subset (mydata, cond1)
- subset2 <- subset (mydata, cond2)
- #catch LLR for each comparison
+ 1)]) / 2
+ subset1 <- subset(mydata, cond1)
+ subset2 <- subset(mydata, cond2)
+ # catch LLR for each comparison
LL.return <-
fitSubmodels(model, subset1, subset2, control, invariance = NULL)
if (!is.na(LL.return)) {
@@ -647,23 +656,23 @@ fairSplit <-
}
}
}
-
- #numeric (continuous) covariates################################
+
+ # numeric (continuous) covariates################################
if (is.numeric(mydata[, cur_col])) {
- var.type = 2
+ var.type <- 2
v <- as.numeric(mydata[, cur_col])
val.sets <- sort(union(v, v))
if (length(val.sets) > 1) {
for (i in 2:(length(val.sets))) {
LL.temp <- base::c()
- #subset data for chosen value and store LL
+ # subset data for chosen value and store LL
cond1 <-
as.numeric(mydata[, cur_col]) > (val.sets[i] + val.sets[(i - 1)]) / 2
cond2 <-
as.numeric(mydata[, cur_col]) < (val.sets[i] + val.sets[(i - 1)]) / 2
- subset1 <- subset (mydata, cond1)
- subset2 <- subset (mydata, cond2)
- #catch LLR for each comparison
+ subset1 <- subset(mydata, cond1)
+ subset2 <- subset(mydata, cond2)
+ # catch LLR for each comparison
LL.return <-
fitSubmodels(model, subset1, subset2, control, invariance = NULL)
if (!is.na(LL.return)) {
@@ -671,42 +680,46 @@ fairSplit <-
within.split <-
cbind(within.split, (val.sets[i] + val.sets[(i - 1)]) / 2)
} else {
- if (control$verbose)
+ if (control$verbose) {
ui_fail("LL was NA when fitting submodels!")
+ }
if (control$report.level > 2) {
- report(paste("Could not estimate split at value ", val.sets[i]),
- 2)
+ report(
+ paste("Could not estimate split at value ", val.sets[i]),
+ 2
+ )
}
}
}
}
}
-
+
   if (control$report.level > 10) {
if (!is.null(LL.within)) {
message("Within LLs ", paste(round(LL.within, 2), collapse = " "))
- }
- else{
+ } else {
message("Within LLs NULL")
}
}
-
+
if (control$report.level > 0) {
if (is.null(LL.within)) {
- report("No valid estimate found for any split value during the first round.",
- 2)
+ report(
+ "No valid estimate found for any split value during the first round.",
+ 2
+ )
}
}
-
+
max.LL.within <- base::c()
max.within.split <- base::c()
-
- #store the LL, split value and variable number for each cov that makes a possible split
+
+ # store the LL, split value and variable number for each cov that makes a possible split
if (!is.null(LL.within)) {
max.LL.within <- LL.within[1]
max.within.split <- within.split[1]
max.within.cov <- cur_col
-
+
if (length(LL.within) > 1) {
for (i in 2:length(LL.within)) {
if (!is.na(LL.within[i]) | !is.null(LL.within[i])) {
@@ -716,46 +729,49 @@ fairSplit <-
}
}
}
-
}
- #max.LL.within
- #max.within.split
+ # max.LL.within
+ # max.within.split
split.max <- max.within.split
LL.max <- max.LL.within
-
+
if (control$report.level >= 3) {
- report(paste(
- "Best split at ",
- max.within.split,
- " with LL",
- max.LL.within
- ),
- 2)
+ report(
+ paste(
+ "Best split at ",
+ max.within.split,
+ " with LL",
+ max.LL.within
+ ),
+ 2
+ )
}
}
}
-
+
btn.matrix <-
rbind(LL.btn, names(mydata[c(cov.btn.cols)]), cov.btn.cols, split.btn)
colnames(btn.matrix) <-
c(paste("var", seq(1, ncol(btn.matrix)), sep = ""))
rownames(btn.matrix) <- c("LR", "variable", "column", "split val")
-
+
if (control$report.level >= 2) {
report("_____________________________", 1)
- report(paste(
- "Best predictor",
- name.max,
- "split at",
- split.max,
- "with LL",
- LL.max
- ),
- 1)
+ report(
+ paste(
+ "Best predictor",
+ name.max,
+ "split at",
+ split.max,
+ "with LL",
+ LL.max
+ ),
+ 1
+ )
report("", 1)
}
-
- #cat("Best ",LL.max, " ",split.max," ",name.max," ",col.max,"\n")
+
+ # cat("Best ",LL.max, " ",split.max," ",name.max," ",col.max,"\n")
return(
list(
LL.max = LL.max,
diff --git a/R/findDefinitionVariables.R b/R/findDefinitionVariables.R
index 991a5f9..fde409f 100644
--- a/R/findDefinitionVariables.R
+++ b/R/findDefinitionVariables.R
@@ -1,6 +1,6 @@
findDefinitionVariables <- function(model) {
def_vars <- c()
- for (mid in 1:length(m@matrices)) {
+ for (mid in 1:length(model@matrices)) {
mt <- model@matrices[[mid]]
idx <- startsWith(mt$labels,"data.")
if (any(idx,na.rm = TRUE)) {
diff --git a/R/fitSubmodels.R b/R/fitSubmodels.R
index 8fbc30d..aec7f81 100644
--- a/R/fitSubmodels.R
+++ b/R/fitSubmodels.R
@@ -237,10 +237,13 @@ fitSubmodels <- function(model,
newlabels1 <- names(omxGetParameters(model1))
newlabels2 <- names(omxGetParameters(model2))
+ # make safe names for variables (should probably be
+ # all non-alpha-numeric?)
+ # replace square brackets, dots, commas by underscores
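+  # e.g., "model.A[2,1]" becomes "model_A_2_1_"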
newlabels1 <-
- stringr::str_replace_all(newlabels1, "\\[|\\]|,|\\.", "_")
+ gsub("\\[|\\]|\\.|,", "_", newlabels1)
newlabels2 <-
- stringr::str_replace_all(newlabels2, "\\[|\\]|,|\\.", "_")
+ gsub("\\[|\\]|\\.|,", "_", newlabels2)
# replace labels
eqids <- which(newlabels1 %in% invariance)
@@ -299,7 +302,7 @@ fitSubmodels <- function(model,
mxFitFunctionAlgebra('h12')
)
- sharedRun <- mxRun(sharedModel, silent = T)
+ sharedRun <- mxRun(sharedModel, silent = TRUE)
LL.sum <- mxEval(h12, sharedRun)
diff --git a/R/gefp_semtree.R b/R/gefp_semtree.R
index 802c3c7..feea5c4 100644
--- a/R/gefp_semtree.R
+++ b/R/gefp_semtree.R
@@ -1,6 +1,6 @@
-gefp_semtree <- function (..., fit = NULL, scores, vcov = NULL,
- decorrelate = TRUE, sandwich = TRUE, order.by = NULL,
- fitArgs = NULL, parm = NULL, data = list()) {
+gefp_semtree <- function(..., fit = NULL, scores, vcov = NULL,
+ decorrelate = TRUE, sandwich = TRUE, order.by = NULL,
+ fitArgs = NULL, parm = NULL, data = list()) {
vcov. <- vcov
fm <- list(...)
fm <- fm[[1]]
@@ -8,47 +8,56 @@ gefp_semtree <- function (..., fit = NULL, scores, vcov = NULL,
k <- NCOL(scores)
z <- order.by
order.name <- deparse(substitute(order.by))
- if (is.factor(z))
+ if (is.factor(z)) {
z <- as.numeric(z)
+ }
scores <- as.matrix(scores)
- if (inherits(z, "POSIXt"))
- z <- suppressWarnings(c(z[1] + as.numeric(difftime(z[1],
- z[2], units = "secs")), z))
- else z <- suppressWarnings(c(z[1] - as.numeric(diff(z[1:2])),
- z))
- process <- scores/sqrt(n)
+ if (inherits(z, "POSIXt")) {
+ z <- suppressWarnings(c(z[1] + as.numeric(difftime(z[1],
+ z[2],
+ units = "secs"
+ )), z))
+ } else {
+ z <- suppressWarnings(c(
+ z[1] - as.numeric(diff(z[1:2])),
+ z
+ ))
+ }
+ process <- scores / sqrt(n)
if (is.null(vcov.)) {
J <- crossprod(process)
J12 <- strucchange::root.matrix(J)
- }
- else {
+ } else {
if (sandwich) {
- Q <- chol2inv(chol(bread(fm)/n))
- J <- (Q %*% vcov.(fm, order.by = order.by, data = data) %*%
- Q)/n
+ Q <- chol2inv(chol(bread(fm) / n))
+ J <- (Q %*% vcov.(fm, order.by = order.by, data = data) %*%
+ Q) / n
J12 <- strucchange::root.matrix(J)
- }
- else {
+ } else {
J12 <- vcov.
}
}
process <- rbind(0, process)
process <- apply(process, 2, cumsum)
- if (decorrelate)
+ if (decorrelate) {
process <- t(chol2inv(chol(J12)) %*% t(process))
- else {
- process <- t(1/sqrt(diag(J)) * t(process))
- if (length(parm) > 1)
+ } else {
+ process <- t(1 / sqrt(diag(J)) * t(process))
+ if (length(parm) > 1) {
stop("limiting process is not a Brownian bridge")
+ }
}
colnames(process) <- colnames(scores)
- if (!is.null(parm))
+ if (!is.null(parm)) {
process <- process[, parm]
- retval <- list(process = suppressWarnings(zoo::zoo(process, z)),
- nreg = k, nobs = n, call = match.call(), fit = fit, scores = scores,
- fitted.model = fm, par = NULL, lim.process = "Brownian bridge",
- type.name = "M-fluctuation test", order.name = order.name,
- J12 = J12)
+ }
+ retval <- list(
+ process = suppressWarnings(zoo::zoo(process, z)),
+ nreg = k, nobs = n, call = match.call(), fit = fit, scores = scores,
+ fitted.model = fm, par = NULL, lim.process = "Brownian bridge",
+ type.name = "M-fluctuation test", order.name = order.name,
+ J12 = J12
+ )
class(retval) <- "gefp"
return(retval)
}
diff --git a/R/getLikelihood.R b/R/getLikelihood.R
index ae17908..17afe9f 100644
--- a/R/getLikelihood.R
+++ b/R/getLikelihood.R
@@ -1,15 +1,13 @@
-getLikelihood<-function(model)
-{
+getLikelihood <- function(model) {
# this seems to be not generic enough
# return(OpenMx::mxEval(objective, model));
-
- msm <- getS3method("summary","MxModel")
+
+ msm <- getS3method("summary", "MxModel")
# alternative:
if (is.null(model)) {
warning("NULL Model in getLikelihood()-call")
return(NULL)
}
-
+
return(msm(model)$Minus2LogLikelihood)
-
- }
\ No newline at end of file
+}
diff --git a/R/getNumNodes.R b/R/getNumNodes.R
index 212b4c0..06642cb 100644
--- a/R/getNumNodes.R
+++ b/R/getNumNodes.R
@@ -1,34 +1,31 @@
#' Tree Size
-#'
+#'
#' Counts the number of nodes in a tree.
-#'
-#'
+#'
+#'
#' @param tree A SEM tree object.
#' @author Andreas M. Brandmaier, John J. Prindle
#' @references Brandmaier, A.M., Oertzen, T. v., McArdle, J.J., & Lindenberger,
#' U. (2013). Structural equation model trees. \emph{Psychological Methods},
#' 18(1), 71-86.
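+#' @examples
+#' # Hedged sketch (not run): 'tree' stands for any fitted semtree object
+#' \dontrun{
+#' getNumNodes(tree)
+#' }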
getNumNodes <-
-function(tree)
-{
- if ((is.null(tree$left_child)) && (is.null(tree$right_child)))
- {
- return(1);
- }
+ function(tree) {
+ if ((is.null(tree$left_child)) && (is.null(tree$right_child))) {
+ return(1)
+ }
- count <- 1
-
- if (tree$left_child$caption != "TERMINAL") {
- count <- count + getNumNodes(tree$left_child)
- } else {
- count <- count + 1
- }
- if (tree$right_child$caption != "TERMINAL") {
- count <- count + getNumNodes(tree$right_child)
- } else {
- count <- count + 1
- }
-
- return(count)
-
-}
+ count <- 1
+
+ if (tree$left_child$caption != "TERMINAL") {
+ count <- count + getNumNodes(tree$left_child)
+ } else {
+ count <- count + 1
+ }
+ if (tree$right_child$caption != "TERMINAL") {
+ count <- count + getNumNodes(tree$right_child)
+ } else {
+ count <- count + 1
+ }
+
+ return(count)
+ }
diff --git a/R/getParDiffForest.R b/R/getParDiffForest.R
index 5785a98..85d2f1a 100644
--- a/R/getParDiffForest.R
+++ b/R/getParDiffForest.R
@@ -3,7 +3,7 @@
#' differences between post-split nodes.
#' @param forest a semforest object.
#' @param measure a character. "wald" (default) gives the squared parameter
-#' differences devided by their pooled standard errors. test" gives the
+#' differences divided by their pooled standard errors. "test" gives the
#' contributions of the parameters to the test statistic. "raw" gives the
#' absolute values of the parameter differences.
#' @param normalize logical value; if TRUE parameter differences of each split
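
For readers of the corrected docs: a sketch of what two of the measures mean for a single parameter, under the assumption that "wald" squares the post-split difference and scales it by the pooled squared standard errors (the exact pooling is defined in the package internals; the numbers here are made up):

```r
# hypothetical estimates of one parameter in the two post-split nodes
est_left <- 0.80; se_left <- 0.10
est_right <- 0.55; se_right <- 0.12

(est_left - est_right)^2 / (se_left^2 + se_right^2)  # "wald"-style measure (assumed pooling)
abs(est_left - est_right)                            # the "raw" measure
```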
diff --git a/R/getParDiffTree.R b/R/getParDiffTree.R
index f9b2791..264df55 100644
--- a/R/getParDiffTree.R
+++ b/R/getParDiffTree.R
@@ -3,7 +3,7 @@
#' differences between post-split nodes.
#' @param tree a semtree object.
#' @param measure a character. "wald" (default) gives the squared parameter
-#' differences devided by their pooled standard errors. "test" gives the
+#' differences divided by their pooled standard errors. "test" gives the
#' contributions of the parameters to the test statistic. "raw" gives the
#' absolute values of the parameter differences.
#' @param normalize logical value; if TRUE parameter differences of each split
diff --git a/R/getPredictorsLavaan.R b/R/getPredictorsLavaan.R
index 06ce472..f6c6288 100644
--- a/R/getPredictorsLavaan.R
+++ b/R/getPredictorsLavaan.R
@@ -7,7 +7,7 @@ getPredictorsLavaan <- function(model, dataset, covariates)
model.ids[i] <- which(model@Data@ov.names[[1]][i] == names(dataset));
}
all.ids <- 1:length(names(dataset))
- cvid <- sets::as.set(all.ids)-sets::as.set(model.ids)
+ cvid <- all.ids[!all.ids %in% model.ids]
if (length(cvid)==0) {
ui_stop("No covariates contained in dataset!")
}
@@ -18,7 +18,7 @@ getPredictorsLavaan <- function(model, dataset, covariates)
all.ids <- 1:length(names(dataset))
covariate.ids <- sapply(covariates, function(cv) { which(cv==names(dataset))} )
- modid <- sets::as.set(all.ids)-sets::as.set(covariate.ids)
+ modid <- all.ids[!all.ids %in% covariate.ids]
if (length(modid)==0) {
ui_stop("No covariates available to split on!")
}
diff --git a/R/getPredictorsOpenMx.R b/R/getPredictorsOpenMx.R
index addb34d..3df10e5 100644
--- a/R/getPredictorsOpenMx.R
+++ b/R/getPredictorsOpenMx.R
@@ -12,7 +12,7 @@ getPredictorsOpenMx <- function(mxmodel, dataset, covariates)
model.ids[i] <- which(cmp);
}
all.ids <- 1:length(names(dataset))
- cvid <- sets::as.set(all.ids)-sets::as.set(model.ids)
+ cvid <- all.ids[!all.ids %in% model.ids]
if (length(cvid)==0) {
ui_stop("Error. No predictors contained in dataset!")
}
@@ -23,7 +23,7 @@ getPredictorsOpenMx <- function(mxmodel, dataset, covariates)
all.ids <- 1:length(names(dataset))
covariate.ids <- sapply(covariates, function(cv) { which(cv==names(dataset))} )
- modid <- sets::as.set(all.ids)-sets::as.set(covariate.ids)
+ modid <- all.ids[!all.ids %in% covariate.ids]
if (length(modid)==0) {
ui_stop("No covariates contained in dataset!")
}
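
Both predictor helpers now use the same base-R complement instead of sets::as.set() arithmetic, which drops the sets dependency. The idiom is a plain set difference over index vectors:

```r
all.ids   <- 1:6
model.ids <- c(2, 4)

all.ids[!all.ids %in% model.ids]  # new base-R form
#> [1] 1 3 5 6
setdiff(all.ids, model.ids)       # same result here
```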
diff --git a/R/growTree.R b/R/growTree.R
index e6fb930..630426b 100644
--- a/R/growTree.R
+++ b/R/growTree.R
@@ -11,30 +11,28 @@
#
#
#
-growTree <- function(model=NULL, mydata=NULL,
- control=NULL, invariance=NULL, meta=NULL,
- edgelabel=NULL, depth=0, constraints=NULL, ...)
-{
-
- if(is.null(mydata)) {
+growTree <- function(model = NULL, mydata = NULL,
+ control = NULL, invariance = NULL, meta = NULL,
+ edgelabel = NULL, depth = 0, constraints = NULL, ...) {
+ if (is.null(mydata)) {
stop("There was no data for growing the tree")
}
-
+
if (is.null(meta)) {
- warning("SEM tree could not determine model variables and covariates in the dataset.");
- return(NULL);
+ warning("SEM tree could not determine model variables and covariates in the dataset.")
+ return(NULL)
}
-
+
if (control$verbose) {
- ui_message("Growing level ",depth," n = ",nrow(mydata));
+ ui_message("Growing level ", depth, " n = ", nrow(mydata))
}
-
- if (control$report.level>0) {
- report(paste("Growing tree level",depth), 0)
+
+ if (control$report.level > 0) {
+ report(paste("Growing tree level", depth), 0)
}
-
-
-
+
+
+
# Node null settings in testing for significant splitting
node <- list()
node$left_child <- NULL
@@ -42,147 +40,195 @@ growTree <- function(model=NULL, mydata=NULL,
node$caption <- "TERMINAL"
node$N <- dim(mydata)[1]
class(node) <- "semtree"
-
+
# -- sample columns in case of SEM forest usage --
fulldata <- mydata
fullmeta <- meta
if (control$mtry > 0) {
-
# get names of model variables before sampling
model.names <- names(mydata)[meta$model.ids]
covariate.names <- names(mydata)[meta$covariate.ids]
- #perform sampling
+ # perform sampling
mydata <- sampleColumns(mydata, names(mydata)[meta$covariate.ids], control$mtry)
# get new model ids after sampling by name
- meta$model.ids <- sapply(model.names, function(x) {which(x==names(mydata))})
+ meta$model.ids <- sapply(model.names, function(x) {
+ which(x == names(mydata))
+ })
names(meta$model.ids) <- NULL
- meta$covariate.ids <- unlist(lapply(covariate.names, function(x) {which(x==names(mydata))}))
-
+ meta$covariate.ids <- unlist(lapply(covariate.names, function(x) {
+ which(x == names(mydata))
+ }))
+
node$colnames <- colnames(mydata)
if (control$verbose) {
- ui_message("Subsampled predictors: ",paste(node$colnames[meta$covariate.ids]))
+ ui_message("Subsampled predictors: ", paste(node$colnames[meta$covariate.ids]))
}
}
- # determine whether split evaluation can be done on p values
- node$p.values.valid <- control$method != "cv"
+ # override forced split?
+ arguments <- list(...)
+ if ("forced_splits" %in% names(arguments) && !is.null(arguments$forced_splits)) {
+ forced_splits <- arguments$forced_splits
+
+ # get names of model variables before forcing
+ model.names <- names(mydata)[meta$model.ids]
+ covariate.names <- names(mydata)[meta$covariate.ids]
+
+ # select subset with model variables and single, forced predictor
+ forcedsplit.name <- forced_splits[1]
+
+ if (control$verbose) {
+      cat("FORCED split: ", forcedsplit.name, "\n")
+    }
+
+
+    mydata <- fulldata[, c(model.names, forcedsplit.name)]
+ node$colnames <- colnames(mydata)
+
+ # get new model ids after sampling by name
+ meta$model.ids <- sapply(model.names, function(x) {
+ which(x == names(mydata))
+ })
+ names(meta$model.ids) <- NULL
+ meta$covariate.ids <- unlist(lapply(covariate.names, function(x) {
+ which(x == names(mydata))
+ }))
+
+ } else {
+ forced_splits <- NULL
+ }
+ # determine whether split evaluation can be done on p values
+ node$p.values.valid <- control$method != "cv"
+
# set some default values for the node object
node$lr <- NA
node$edge_label <- edgelabel
-
+
# Estimate model on current data set
## 31.08.2022: Model in root is not refitted if refit is set to FALSE
if (depth == 0 & !control$refit & length(constraints) == 0) { # do not fit model
node$model <- model
} else { # fit model
# OpenMx
- if(control$sem.prog == 'OpenMx'){
- full.model <- mxAddNewModelData(model = model, data = mydata,
- name = "BASE MODEL")
+ if (control$sem.prog == "OpenMx") {
+ full.model <- mxAddNewModelData(
+ model = model, data = mydata,
+ name = "BASE MODEL"
+ )
node$model <- try(
- suppressMessages(OpenMx::mxTryHard(model = full.model, paste = FALSE,
- silent = TRUE)),
- silent = TRUE)
+ suppressMessages(OpenMx::mxTryHard(
+ model = full.model, paste = FALSE,
+ silent = TRUE, bestInitsOutput = FALSE
+ )),
+ silent = TRUE
+ )
}
# lavaan
- if(control$sem.prog == 'lavaan'){
+ if (control$sem.prog == "lavaan") {
## 11.08.2022 Note: fits lavaan model on mydata
node$model <- try(suppressWarnings(eval(
- parse(text=paste("lavaan::",model@Options$model.type,
- '(lavaan::parTable(model),data=mydata,missing=\'',
- model@Options$missing,'\')',sep="")))),silent=T)
+ parse(text = paste("lavaan::", model@Options$model.type,
+ "(lavaan::parTable(model),data=mydata,missing='",
+ model@Options$missing, "')",
+ sep = ""
+ ))
+ )), silent = T)
}
## 26.06.2022: Added code for ctsem models
- if(control$sem.prog == 'ctsem'){
+ if (control$sem.prog == "ctsem") {
full.model <- suppressMessages(try(
- ctsemOMX::ctFit(dat = mydata[, -meta$covariate.ids],
- ctmodelobj = model$ctmodelobj,
- dataform = "wide",
- objective = model$ctfitargs$objective,
- stationary = model$ctfitargs$stationary,
- optimizer = model$ctfitargs$optimizer,
- retryattempts = ifelse(model$ctfitargs$retryattempts >= 20,
- yes = model$ctfitargs$retryattempts,
- no = 20),
- carefulFit = model$ctfitargs$carefulFit,
- showInits = model$ctfitargs$showInits,
- asymptotes = model$ctfitargs$asymptotes,
- meanIntervals = model$ctfitargs$meanIntervals,
- discreteTime = model$ctfitargs$discreteTime,
- verbose = model$ctfitargs$verbose,
- transformedParams = model$ctfitargs$transformedParams)
+ ctsemOMX::ctFit(
+ dat = mydata[, -meta$covariate.ids],
+ ctmodelobj = model$ctmodelobj,
+ dataform = "wide",
+ objective = model$ctfitargs$objective,
+ stationary = model$ctfitargs$stationary,
+ optimizer = model$ctfitargs$optimizer,
+ retryattempts = ifelse(model$ctfitargs$retryattempts >= 20,
+ yes = model$ctfitargs$retryattempts,
+ no = 20
+ ),
+ carefulFit = model$ctfitargs$carefulFit,
+ showInits = model$ctfitargs$showInits,
+ asymptotes = model$ctfitargs$asymptotes,
+ meanIntervals = model$ctfitargs$meanIntervals,
+ discreteTime = model$ctfitargs$discreteTime,
+ verbose = model$ctfitargs$verbose,
+ transformedParams = model$ctfitargs$transformedParams
+ )
))
full.model$mxobj@name <- "BASE MODEL"
node$model <- full.model
}
}
-
-
- if (is(node$model,"try-error"))
- {
+
+
+ if (is(node$model, "try-error")) {
ui_fail("Model had a run error.")
- node$term.reason <- node$model[[1]]
- node$model <- NULL;
- return(node);
+ node$term.reason <- node$model[[1]]
+ node$model <- NULL
+ return(node)
}
-
+
if (is.null(node$model)) {
- node$term.reason <- "Model was NULL! Model could not be estimated.";
- return(node);
+ node$term.reason <- "Model was NULL! Model could not be estimated."
+ return(node)
}
-
-
+
+
###########################################################
### OPENMX USED HERE ###
###########################################################
- if(control$sem.prog == 'OpenMx'){
-
+ if (control$sem.prog == "OpenMx") {
# some export/namespace problem here with the generic
# getS3method("summary","MxModel") gets me the right fun
- S3summary <- getS3method("summary","MxModel")
-
+ S3summary <- getS3method("summary", "MxModel")
+
# list of point estimates, std.dev, and names of all freely estimated parameters
- node$params <- S3summary(node$model)$parameters[,5];
- names(node$params) <- S3summary(node$model)$parameters[,1];
- node$params_sd <- S3summary(node$model)$parameters[,6];
- node$param_names <- S3summary(node$model)$parameters[,1];
+ node$params <- S3summary(node$model)$parameters[, 5]
+ names(node$params) <- S3summary(node$model)$parameters[, 1]
+ node$params_sd <- S3summary(node$model)$parameters[, 6]
+ node$param_names <- S3summary(node$model)$parameters[, 1]
}
###########################################################
### lavaan USED HERE ###
###########################################################
- if(control$sem.prog == 'lavaan'){
-
- node$params <- lavaan::coef(node$model) # put parameters into node
+ if (control$sem.prog == "lavaan") {
+ node$params <- lavaan::coef(node$model) # put parameters into node
names(node$params) <- names(lavaan::coef(node$model)) # parameter names are stored as well
-
- #read in estimated parameters (take only those that have non-NA z values)
- #parameters <- data.frame(
+
+ # read in estimated parameters (take only those that have non-NA z values)
+ # parameters <- data.frame(
# lavaan::parameterEstimates(node$model))[!is.na(
# data.frame(lavaan::parameterEstimates(node$model))[,"z"]),]
-
+
parameters <- lavaan::parameterEstimates(node$model)
-
+
# if any labels are missing (some labels provided), then put default labels in the label col.
- for(i in 1:nrow(parameters)){
- if(!is.null(parameters$label)){
- if(parameters$label[i]==""){parameters$label[i]<-paste(parameters$lhs[i],parameters$op[i],parameters$rhs[i],sep="")}
+ for (i in 1:nrow(parameters)) {
+ if (!is.null(parameters$label)) {
+ if (parameters$label[i] == "") {
+ parameters$label[i] <- paste(parameters$lhs[i], parameters$op[i], parameters$rhs[i], sep = "")
+ }
}
- }
+ }
# if all labels are missing make a label column
- if(is.null(parameters$label)){
- label <- paste(parameters$lhs,parameters$op,parameters$rhs,sep="")
- parameters<- cbind(parameters,label)
- }
-
+ if (is.null(parameters$label)) {
+ label <- paste(parameters$lhs, parameters$op, parameters$rhs, sep = "")
+ parameters <- cbind(parameters, label)
+ }
+
# store the SE of the estimates
- se <- rep(NA,length(unique(parameters$se)))
- for(i in 1:length(unique(parameters$label))){
- for(j in 1:nrow(parameters)){
- if(unique(parameters$label)[i]==parameters$label[j]){se[i]<-parameters$se[j]}
+ se <- rep(NA, length(unique(parameters$se)))
+ for (i in 1:length(unique(parameters$label))) {
+ for (j in 1:nrow(parameters)) {
+ if (unique(parameters$label)[i] == parameters$label[j]) {
+ se[i] <- parameters$se[j]
+ }
}
}
-
+
# list of point estimates, std.dev, and names of all freely estimated parameters
node$params_sd <- se
node$param_names <- names(lavaan::coef(node$model))
@@ -190,8 +236,7 @@ growTree <- function(model=NULL, mydata=NULL,
###########################################################
### ctsemOMX USED HERE ###
###########################################################
- if(control$sem.prog == 'ctsem'){
-
+ if (control$sem.prog == "ctsem") {
# list of point estimates, std.dev, and names of all freely estimated parameters
if (control$ctsem_sd) { # slower
ctsem_summary <- summary(node$model) # this is very slow
@@ -206,9 +251,9 @@ growTree <- function(model=NULL, mydata=NULL,
node$param_names <- names(ctsem_coef)
}
}
-
+
# df
-
+
if (!is.null(constraints$focus.parameters)) {
# df's are equal to number of focus parameters
node$df <- length(constraints$focus.parameters)
@@ -217,160 +262,177 @@ growTree <- function(model=NULL, mydata=NULL,
# df == num. parameters
node$df <- length(node$param_names)
}
-
-
-
+
+
+
# add unique node id via global variable
node$node_id <- getGlobal("global.node.id")
- setGlobal("global.node.id", node$node_id+1)
-
+ setGlobal("global.node.id", node$node_id + 1)
+
# determine whether we should skip splitting
# 1. minimum cases in node
if (!is.na(control$min.N)) {
if (node$N <= control$min.N) {
- if(control$verbose){
+ if (control$verbose) {
ui_message("Minimum user defined N for leaf node.")
}
- node$term.reason <- "Minimum number of cases in leaf node"
- return(node);
+ node$term.reason <- "Minimum number of cases in leaf node"
+ return(node)
}
}
# 2. maximum depth for tree reached
- if (!is.na(control$max.depth)){
+ if (!is.na(control$max.depth)) {
if (depth >= control$max.depth) {
- if(control$verbose){
+ if (control$verbose) {
ui_message("Max user defined tree depth reached.")
}
- node$term.reason <- "Maximum depth reached in leaf node"
- return(node);
+ node$term.reason <- "Maximum depth reached in leaf node"
+ return(node)
}
}
-
+
  # determine best split based on chosen method (ml or score) and (naive, cv, fair)
result <- NULL
# 1. unbalanced selection method
if (control$method == "naive") {
-
# if (control$test.type=="ml") {
-
+
result <- tryCatch(
################################################
- naiveSplit(node$model, mydata, control, invariance, meta, constraints=constraints, ...)
+ naiveSplit(node$model, mydata, control, invariance, meta, constraints = constraints, ...)
################################################
,
- error = function(e) { cat(paste("Error occured!",e,sep="\n")); traceback(); return(NULL); }
- );
-
- } else if (control$method=="score") {
-
+ error = function(e) {
+      cat(paste("Error occurred!", e, sep = "\n"))
+ traceback()
+ return(NULL)
+ }
+ )
+ } else if (control$method == "score") {
result <- tryCatch(
################################################
- result <- ScoreSplit(node$model, mydata, control, invariance, meta, constraints=constraints, ...)
+ result <- ScoreSplit(node$model, mydata, control, invariance, meta, constraints = constraints, ...)
################################################
,
- error = function(e) { cat(paste("Error occured!",e,sep="\n")); traceback(); return(NULL); }
- );
-
- }
+ error = function(e) {
+      cat(paste("Error occurred!", e, sep = "\n"))
+ traceback()
+ return(NULL)
+ }
+ )
+ }
# 2a. split half data to determine best split then use hold out set to compare one split per covariate
else if (control$method == "fair") {
control$fair3Step <- FALSE
result <- tryCatch(
################################################
- fairSplit(node$model, mydata, control, invariance, meta, constraints=constraints, ...)
+ fairSplit(node$model, mydata, control, invariance, meta, constraints = constraints, ...)
################################################
,
- error = function(e) { cat(paste("Error occured!",e,sep="\n")); return(NULL); }
- );
+ error = function(e) {
+      cat(paste("Error occurred!", e, sep = "\n"))
+ return(NULL)
+ }
+ )
}
# 2b. split half data to determine best split then use hold out set to compare one split per covariate, with step 3 all splits retest
else if (control$method == "fair3") {
control$fair3Step <- TRUE
result <- tryCatch(
- ################################################
- fairSplit(node$model, mydata, control, invariance, meta, constraints=constraints, ...)
+ ################################################
+ fairSplit(node$model, mydata, control, invariance, meta, constraints = constraints, ...)
################################################
,
- error = function(e) { cat(paste("Error occured!",e,sep="\n")); return(NULL); }
- );
- }
- # 3. Traditional cross validation for covariate split selection
- else if (control$method == "cv") {
- stop("This split selection procedure is not supported anymore. Please see the new score-based tests for split selection.")
+ error = function(e) {
+      cat(paste("Error occurred!", e, sep = "\n"))
+ return(NULL)
+ }
+ )
} else {
ui_fail("Error. Unknown split method selected")
stop()
}
-
+
# return values in result are:
# LL.max : numeric, log likelihood ratio of best split
- # split.max : numeric, value to split best column on
+ # split.max : numeric, split point; value to split best column on
+ # (for metric variables)
# col.max : index of best column
- # cov.name : name of best candidate
-
+ # name.max : name of best candidate
+  # type.max  : scale type of the best candidate (categorical, ordinal, or metric)
+ # btn.matrix : a matrix, which contains test statistics and
+ # more information for
+ # the various split points evaluated
+ # n.comp : the (implicit) number of multiple tests evaluated for
+ # determining the best split
+
# store the value of the selected test statistic
node$lr <- NA
if (!is.null(result)) {
node$lr <- result$LL.max
node$result <- result
}
-
+
# if no split found, exit node without continuing growth
if (is.null(result) || is.null(result$LL.max)) {
if (control$verbose) {
ui_message("Best Likelihood Ratio was NULL. Stop splitting")
}
- return(node);
+ return(node)
}
-
+
# provide verbose output to the user about best split
if (control$verbose) {
- ui_ok("Best split is ",result$name.max," with statistic = ",round(node$lr,2));
+ ui_ok("Best split is ", result$name.max, " with statistic = ", round(node$lr, 2))
}
-
+
# compute p value from chi2
if (!is.null(result$p.max)) {
node$p <- result$p.max
} else {
- node$p <- pchisq(node$lr,df=node$df, lower.tail=F)
-
+ node$p <- pchisq(node$lr, df = node$df, lower.tail = F)
+
if (control$use.maxlm) {
-
# Borders for continuous covariates
if (!is.factor(mydata[, result$name.max])) {
-
props <- cumsum(table(mydata[, result$name.max])) / node$N
split_val_lhs <- as.numeric(names(which(props >= control$strucchange.from)[1]))
split_val_rhs <- as.numeric(names(which(props >= control$strucchange.to)[1]))
-
+
btn_matrix_max <- result$btn.matrix[, result$btn.matrix["variable", ] ==
- result$name.max, drop = FALSE]
-
+ result$name.max, drop = FALSE]
+
num_split_val <- as.numeric(btn_matrix_max["split val", ])
-
+
n1 <- which(num_split_val <= split_val_lhs)
n1 <- n1[length(n1)]
n2 <- which(num_split_val >= split_val_rhs)[1]
-
- if (length(n1) == 0) {n1 <- 1}
- if (is.na(n2)) {n2 <- length(num_split_val)}
-
+
+ if (length(n1) == 0) {
+ n1 <- 1
+ }
+ if (is.na(n2)) {
+ n2 <- length(num_split_val)
+ }
+
LR <- as.numeric(btn_matrix_max["LR", n1:n2])
-
+
max_pos <- which.max(LR) + n1 - 1
node$result$LL.max <- node$lr <- as.numeric(btn_matrix_max["LR", max_pos])
node$result$split.max <- as.numeric(btn_matrix_max["split val", max_pos])
}
-
- node$p <- computePval_maxLR(maxLR = node$lr, q = node$df,
- covariate = mydata[,result$col.max], from = control$strucchange.from,
- to = control$strucchange.to, nrep = control$strucchange.nrep)
+
+ node$p <- computePval_maxLR(
+ maxLR = node$lr, q = node$df,
+ covariate = mydata[, result$col.max], from = control$strucchange.from,
+ to = control$strucchange.to, nrep = control$strucchange.nrep
+ )
}
}
-
-
+
+
# --------- determine whether to continue splitting --------------
- if (is(control$custom.stopping.rule,"function")) {
+ if (is(control$custom.stopping.rule, "function")) {
stopping.rule <- control$custom.stopping.rule
} else {
stopping.rule <- stoppingRuleDefault
@@ -378,169 +440,197 @@ growTree <- function(model=NULL, mydata=NULL,
# stoppingRuleDefault() is a function that gets inputs node, result, control
# this function can be replaced by a user-specified function
srule <- stopping.rule(node, result, control)
-
+
# determine whether splitting should be continued depending on return state
# of the function
- if (is(srule,"list")) {
+ if (is(srule, "list")) {
node <- srule$node
cont.split <- !(srule$stop.rule)
} else {
- cont.split <- !srule
+ cont.split <- !srule
node$p.values.valid <- FALSE
}
-
+
# restore mydata here if (mtry was > 0) -- for semforests
if (control$mtry > 0) {
-
# also need to remap col.max to original data!
if (!is.null(result$col.max) && !is.na(result$col.max)) {
col.max.name <- names(mydata)[result$col.max]
- result$col.max <- which(names(fulldata)==col.max.name)
+ result$col.max <- which(names(fulldata) == col.max.name)
} else {
col.max.name <- NULL
}
-
+
# restore data
mydata <- fulldata
meta <- fullmeta
}
- if ((!is.null(cont.split)) && (!is.na(cont.split)) && (cont.split)) {
- if (control$report.level > 10) {
- report("Stop splitting based on stopping rule.", 1)
+ # restore mydata if forced split was true
+ # and (potentially) force continuation of splitting
+ if (!is.null(forced_splits)) {
+
+
+ # also need to remap col.max to original data!
+ if (!is.null(result$col.max) && !is.na(result$col.max)) {
+ col.max.name <- names(mydata)[result$col.max]
+ result$col.max <- which(names(fulldata) == col.max.name)
+ } else {
+ col.max.name <- NULL
}
+ mydata <- fulldata
+ meta <- fullmeta
+ # pop first element
+ forced_splits <- forced_splits[-1]
+ # set to NULL if no splits left
+    if (length(forced_splits) == 0) forced_splits <- NULL
+ # force continuation of splitting ?
+ cont.split <- TRUE
+ }
+
+ if ((!is.null(cont.split)) && (!is.na(cont.split)) && (cont.split)) {
+ if (control$report.level > 10) {
+      report("Continuing to split; stopping rule not triggered.", 1)
+ }
+
+
+
# store the split name (covariate name and split value) RHS is yes branch
- if(result$type.max==.SCALE_CATEGORICAL) {
+ if (result$type.max == .SCALE_CATEGORICAL) {
# unordered factor collating and splitting
lvl <- (control$method == "fair")
- result1 <- recodeAllSubsets(mydata[,result$col.max],colnames(mydata)[result$col.max],
- growbool=T, use.levels=lvl)
-
-
+ result1 <- recodeAllSubsets(mydata[, result$col.max], colnames(mydata)[result$col.max],
+ growbool = T, use.levels = lvl
+ )
+
+
test2 <- rep(NA, nrow(mydata))
- if(!is.na(result1$num_sets) & !is.null(result1$num_sets)){
- for(j in 1:result1$num_sets) {
+ if (!is.na(result1$num_sets) & !is.null(result1$num_sets)) {
+ for (j in 1:result1$num_sets) {
test1 <- rep(NA, nrow(mydata))
- for(i in 1:nrow(mydata)) {
- if(isTRUE(result1$columns[i,j])) {test1[i] <- 1}
- else if(!is.na(result1$columns[i,j])){test1[i] <- 0}
- else{test1[i]<-NA}
+ for (i in 1:nrow(mydata)) {
+ if (isTRUE(result1$columns[i, j])) {
+ test1[i] <- 1
+ } else if (!is.na(result1$columns[i, j])) {
+ test1[i] <- 0
+ } else {
+ test1[i] <- NA
+ }
}
test1 <- as.factor(test1)
test2 <- data.frame(test2, test1)
}
}
- test2 <- test2[,-1]
-
- # if var.type==1, then split.max corresponds to the index of
+ test2 <- test2[, -1]
+
+ # if level is categorical, then split.max corresponds to the index of
# the best column in the matrix that represents all subsets
# make sure that this is not casted to a string if there
# are predictors of other types (esp., factors)
# browser()
- result$split.max <- as.integer(result$split.max)
-
- #named <- colnames(result1$columns)[result$split.max]
- # node$caption <- paste(colnames(result1$columns)[result$split.max])
- best_subset_col_id = result$split.max
- best_values = result1$expressions[ (best_subset_col_id-1)*3 +1]$value
-
- node$rule = list(variable=result$col.max, relation="%in%",
- value=best_values,
- name = result$name.max)
- node$caption <- paste(result$name.max, " in [", paste0(best_values,
- collapse=" ")," ]")
-
- if(result1$num_sets==1) {
- sub1 <- subset (mydata, as.numeric(test2) == 2)
- sub2 <- subset (mydata, as.numeric(test2) == 1)
+ if (!all(is.na(result$btn.matrix))) {
+ result$split.max <- as.integer(result$split.max)
+
+ # named <- colnames(result1$columns)[result$split.max]
+ # node$caption <- paste(colnames(result1$columns)[result$split.max])
+ best_subset_col_id <- result$split.max
+ best_values <- result1$expressions[(best_subset_col_id - 1) * 3 + 1]$value
+ } else {
+ best_values <- result$split.max
}
- else {
- sub1 <- subset (mydata, as.numeric(test2[[result$split.max]]) == 2)
- sub2 <- subset (mydata, as.numeric(test2[[result$split.max]]) == 1)
+ node$rule <- list(
+ variable = result$col.max, relation = "%in%",
+ value = best_values,
+ name = result$name.max
+ )
+ node$caption <- paste(result$name.max, " in [", paste0(best_values,
+ collapse = " "
+ ), " ]")
+
+ if (result1$num_sets == 1) {
+ sub1 <- subset(mydata, as.numeric(test2) == 2)
+ sub2 <- subset(mydata, as.numeric(test2) == 1)
+ } else {
+ sub1 <- subset(mydata, as.numeric(test2[[result$split.max]]) == 2)
+ sub2 <- subset(mydata, as.numeric(test2[[result$split.max]]) == 1)
}
-
- }
- else if (result$type.max==.SCALE_METRIC){
-
+ } else if (result$type.max == .SCALE_METRIC) {
# if var.type==2, then split.max corresponds to the split point value
# make sure that this is not casted to a string if there
# are predictors of other types (esp., factors)
result$split.max <- as.numeric(result$split.max)
-
+
# ordered factor splitting of data
- node$caption <- paste(result$name.max,">=", signif(result$split.max,6),sep=" ")
- node$rule = list(variable=result$col.max, relation=">=", value=c(result$split.max), name = result$name.max)
- sub1 <- subset( mydata, as.numeric(as.character(mydata[, (result$col.max)])) >result$split.max)
- sub2 <- subset( mydata, as.numeric(as.character(mydata[, (result$col.max)]))<=result$split.max)
- }
- else if (result$type.max==.SCALE_ORDINAL) {
-
- node$caption <- paste(result$name.max,">", result$split.max,sep=" ")
- node$rule = list(variable=result$col.max, relation=">", value=c(result$split.max), name = result$name.max)
- sub1 <- subset( mydata, mydata[, (result$col.max)] >result$split.max)
- sub2 <- subset( mydata, mydata[, (result$col.max)]<=result$split.max)
-
- }
- else if (result$type.max == 99) {
+ node$caption <- paste(result$name.max, ">=", signif(result$split.max, 6), sep = " ")
+ node$rule <- list(variable = result$col.max, relation = ">=", value = c(result$split.max), name = result$name.max)
+ sub1 <- subset(mydata, as.numeric(as.character(mydata[, (result$col.max)])) > result$split.max)
+ sub2 <- subset(mydata, as.numeric(as.character(mydata[, (result$col.max)])) <= result$split.max)
+ } else if (result$type.max == .SCALE_ORDINAL) {
+ node$caption <- paste(result$name.max, ">", result$split.max, sep = " ")
+ node$rule <- list(variable = result$col.max, relation = ">", value = c(result$split.max), name = result$name.max)
+ sub1 <- subset(mydata, mydata[, (result$col.max)] > result$split.max)
+ sub2 <- subset(mydata, mydata[, (result$col.max)] <= result$split.max)
+ } else if (result$type.max == 99) {
# this is an error code by score test implementation
# return node and stop splitting
# TODO (MA): Do we need to issue a warning?
return(node)
- }
- else {
+ } else {
      # TODO: remove this because this condition should be captured earlier in any case
# continuous variables splitting
- #node$caption <- paste(result$name.max,">=", signif(result$split.max,3),sep=" ")
- #node$rule = list(variable=result$col.max, relation=">=", value=c(result$split.max), name = result$name.max)
- #sub1 <- subset( mydata, as.numeric(mydata[, (result$col.max)]) >result$split.max)
- #sub2 <- subset( mydata, as.numeric(mydata[, (result$col.max)])<=result$split.max)
+ # node$caption <- paste(result$name.max,">=", signif(result$split.max,3),sep=" ")
+ # node$rule = list(variable=result$col.max, relation=">=", value=c(result$split.max), name = result$name.max)
+ # sub1 <- subset( mydata, as.numeric(mydata[, (result$col.max)]) >result$split.max)
+ # sub2 <- subset( mydata, as.numeric(mydata[, (result$col.max)])<=result$split.max)
stop("An error occured!")
}
-
+
flush.console()
-
+
##########################################################
## NEW CODE TO INCLUDE CASES MISSING ON SPLITTING VARIABLE
class(node) <- "semtree"
- if(control$use.all& (nrow(mydata)>(nrow(sub1)+nrow(sub2)))){
- if(control$verbose){message("Missing on splitting variable: ",result$name.max)}
+ if (control$use.all & (nrow(mydata) > (nrow(sub1) + nrow(sub2)))) {
+ if (control$verbose) {
+ message("Missing on splitting variable: ", result$name.max)
+ }
completeSplits <- calculateBestSubsets(model, mydata, sub1, sub2, result)
sub1 <- completeSplits$sub1
sub2 <- completeSplits$sub2
}
##########################################################
-
+
# build a model for missing data
if (control$missing == "ctree") {
ui_warn("Missing data treatment with ctree is not yet implemented.")
- #temp = mydata[!is.na(mydata[,result$name.max]),]
- #node$missing.model = party::ctree(
+ # temp = mydata[!is.na(mydata[,result$name.max]),]
+ # node$missing.model = party::ctree(
# data = temp,
# formula = as.formula(paste0(result$name.max,"~.")))
} else if (control$missing == "rpart") {
- temp = mydata[!is.na(mydata[,result$name.max]),]
- node$missing.model = rpart::rpart(
+ temp <- mydata[!is.na(mydata[, result$name.max]), ]
+ node$missing.model <- rpart::rpart(
data = temp,
- formula = as.formula(paste0(result$name.max,"~.")))
+ formula = as.formula(paste0(result$name.max, "~."))
+ )
}
-
-
+
+
# recursively continue splitting
# result1 - RHS; result2 - LHS
- result2 <- growTree( node$model, sub2, control, invariance, meta, edgelabel=0, depth=depth+1, constraints)
- result1 <- growTree( node$model, sub1, control, invariance, meta, edgelabel=1, depth=depth+1, constraints)
-
+ result2 <- growTree(node$model, sub2, control, invariance, meta, edgelabel = 0, depth = depth + 1, constraints, forced_splits = forced_splits)
+ result1 <- growTree(node$model, sub1, control, invariance, meta, edgelabel = 1, depth = depth + 1, constraints, forced_splits = forced_splits)
+
# store results in recursive list structure
node$left_child <- result2
node$right_child <- result1
-
- return(node);
-
+
+ return(node)
} else {
# if cont.split is F or NA or NULL then return node without further splitting
- return(node);
- }
-}
+ return(node)
+ }
+}
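
The new forced_splits mechanism is picked up from growTree()'s ... argument: the first name in the vector overrides the split search at the current level, is popped before recursing, and splitting is forced to continue while names remain. Assuming the top-level semtree() call forwards ... down to growTree(), usage would look roughly like this (the covariate names are hypothetical):

```r
# hypothetical: force the first two levels to split on "age", then "gender",
# before the regular data-driven split search takes over
tree <- semtree(model.cfa, data = cfa.sim,
                forced_splits = c("age", "gender"))
```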
diff --git a/R/mergeForests.R b/R/mergeForests.R
index 68edb61..223f0ce 100644
--- a/R/mergeForests.R
+++ b/R/mergeForests.R
@@ -4,6 +4,7 @@
#'
#'
#' @aliases merge.semforest
+#'
#' @param x A SEM Forest
#' @param y A second SEM Forest
#' @param \dots Extra arguments. Currently unused.
@@ -12,14 +13,15 @@
#' @references Brandmaier, A.M., Oertzen, T. v., McArdle, J.J., & Lindenberger,
#' U. (2013). Structural equation model trees. \emph{Psychological Methods},
#' 18(1), 71-86.
+#'
#' @exportS3Method merge semforest
merge.semforest <- function(x, y, ...)
{
- return(merge.internal(list(x,y)))
+ return(merge_internal(list(x,y)))
}
-merge.internal <- function(forest.list){
+merge_internal <- function(forest.list){
# determine number of forests to merge
num.forests <- length(forest.list)
@@ -33,7 +35,7 @@ merge.internal <- function(forest.list){
if (getModelType(m1) != getModelType(m2)) stop("Incompatible models")
if (getModelType(m1)=="OpenMx") {
# for OpenMx models, we compare whether a selected set of
- # attributes instead of the entire object because eg.
+  # attributes are identical instead of the entire object because, e.g.,
# the output-attribute may differ on time stamps or
# the compute-attribute may differ for the optimizer used or
# the number of iterations
@@ -41,14 +43,14 @@ merge.internal <- function(forest.list){
for (at in list("matrices","algebras","constraints","latentVars","manifestVars",
"data","data means","data type","submodels","expectation","fitfunction",
"independent")) {
- c1_temp <- digest::digest(attr(m1,at))==digest::digest(attr(m2,at))
+ c1_temp <- identical(attr(m1,at), attr(m2,at))
if (!c1_temp) { ui_warn("Models differ on attribute '",at,"'.") }
c1 <- c1 & c1_temp
}
} else if (getModelType(m1)=="lavaan") {
- c1 <- digest::digest(m1)==digest::digest(m2)
+ c1 <- identical(m1, m2)
} else {
- c1 <- digest::digest(m1)==digest::digest(m2)
+ c1 <- identical(m1, m2)
}
# some checks
@@ -56,7 +58,7 @@ merge.internal <- function(forest.list){
tmp1$num.trees <- NA
tmp2 <- forest.list[[i]]$control
tmp2$num.trees <- NA
- c2 <- digest::digest(tmp1)==digest::digest(tmp2)
+ c2 <- identical(tmp1, tmp2)
if (!c1) {
stop("Cannot merge forests! Models differ.");
}
diff --git a/R/naiveSplit.R b/R/naiveSplit.R
index 3ae75f3..4046d76 100644
--- a/R/naiveSplit.R
+++ b/R/naiveSplit.R
@@ -15,44 +15,45 @@ naiveSplit <-
cov.type <- c()
cov.col <- c()
cov.name <- c()
-
+
LL.within <- c()
within.split <- c()
- #Baseline model to compare subgroup fits to
+ # Baseline model to compare subgroup fits to
###########################################################
### OPENMX USED HERE ###
###########################################################
- if (control$sem.prog == 'OpenMx') {
+ if (control$sem.prog == "OpenMx") {
modelnew <- mxAddNewModelData(model, mydata, name = "BASE MODEL")
LL.overall <- safeRunAndEvaluate(modelnew)
- suppressWarnings(if (is.na(LL.overall))
- return(NULL))
+ suppressWarnings(if (is.na(LL.overall)) {
+ return(NULL)
+ })
}
###########################################################
### lavaan USED HERE ###
###########################################################
- if (control$sem.prog == 'lavaan') {
- #if (control$verbose) {message("Assessing overall model")}
+ if (control$sem.prog == "lavaan") {
+ # if (control$verbose) {message("Assessing overall model")}
modelnew <-
eval(parse(
text = paste(
- "lavaan::",model@Options$model.type,
- '(lavaan::parTable(model),data=mydata,missing=\'',
+ "lavaan::", model@Options$model.type,
+ "(lavaan::parTable(model),data=mydata,missing='",
model@Options$missing,
- '\',do.fit=F)',
+ "',do.fit=F)",
sep = ""
)
))
- #modelnew <- lavaan(parTable(model),data=mydata,model.type=model@Options$model.type,do.fit=FALSE)
+ # modelnew <- lavaan(parTable(model),data=mydata,model.type=model@Options$model.type,do.fit=FALSE)
LL.overall <- safeRunAndEvaluate(modelnew)
- suppressWarnings(if (is.na(LL.overall))
- return(NULL))
+ suppressWarnings(if (is.na(LL.overall)) {
+ return(NULL)
+ })
}
if (pp) {
comparedData <- max(meta$model.ids + 1)
- }
- else {
+ } else {
comparedData <- meta$covariate.ids
}
for (cur_col in comparedData) {
@@ -66,37 +67,37 @@ naiveSplit <-
if (control$report.level > 10) {
report(paste("Estimating baseline likelihood: ", LL.baseline), 1)
}
-
-
+
+
# tell the user a little bit about where we are right now
if (control$verbose) {
- ui_message("Testing Predictor: ",
- colnames(mydata)[cur_col])
+ ui_message(
+ "Testing Predictor: ",
+ colnames(mydata)[cur_col]
+ )
}
############################################################
- #case for factored covariates##############################
+ # case for factored covariates##############################
if (is.factor(mydata[, cur_col])) {
- #unordered factors#####################################
+ # unordered factors#####################################
if (!is.ordered(mydata[, cur_col])) {
- var.type = .SCALE_CATEGORICAL
-
+ var.type <- .SCALE_CATEGORICAL
+
val.sets <- unique(mydata[, cur_col])
if (length(val.sets) > 1) {
- #create binaries for comparison of all combinations
+ # create binaries for comparison of all combinations
result <-
recodeAllSubsets(mydata[, cur_col], colnames(mydata)[cur_col])
test1 <- c()
test2 <- rep(NA, length(mydata[, cur_col]))
-
+
for (j in 1:ncol(result$columns)) {
for (i in 1:length(mydata[, cur_col])) {
if (isTRUE(result$columns[i, j])) {
test1[i] <- 1
- }
- else if (!is.na(result$columns[i, j])) {
+ } else if (!is.na(result$columns[i, j])) {
test1[i] <- 0
- }
- else{
+ } else {
test1[i] <- NA
}
}
@@ -104,27 +105,27 @@ naiveSplit <-
test2 <- data.frame(test2, test1)
}
test2 <- test2[, -1]
-
+
for (i in 1:(result$num_sets)) {
LL.temp <- c()
- #subset data for chosen value and store LL
+ # subset data for chosen value and store LL
if (result$num_sets == 1) {
- subset1 <- subset (mydata, as.numeric(test2) == 2)
- subset2 <- subset (mydata, as.numeric(test2) == 1)
- }
- else {
- subset1 <- subset (mydata, as.numeric(test2[[i]]) == 2)
- subset2 <- subset (mydata, as.numeric(test2[[i]]) == 1)
+ subset1 <- subset(mydata, as.numeric(test2) == 2)
+ subset2 <- subset(mydata, as.numeric(test2) == 1)
+ } else {
+ subset1 <- subset(mydata, as.numeric(test2[[i]]) == 2)
+ subset2 <- subset(mydata, as.numeric(test2[[i]]) == 1)
}
-
+
# refit baseline model with focus parameters @TAGX
if (!is.null(constraints) &
- (!is.null(constraints$focus.parameters))) {
+ (!is.null(constraints$focus.parameters))) {
LL.baseline <- fitSubmodels(model,
- subset1,
- subset2,
- control,
- invariance = constraints$focus.parameters)
+ subset1,
+ subset2,
+ control,
+ invariance = constraints$focus.parameters
+ )
if (control$report.level > 10) {
report(
paste(
@@ -135,7 +136,7 @@ naiveSplit <-
)
}
}
- #browser()
+ # browser()
LL.return <-
fitSubmodels(model, subset1, subset2, control, invariance = NULL)
if (!is.na(LL.return)) {
@@ -149,32 +150,33 @@ naiveSplit <-
}
}
}
- #ordered factors#########################################
+ # ordered factors#########################################
if (is.ordered(mydata[, cur_col])) {
- var.type = .SCALE_ORDINAL
-
+ var.type <- .SCALE_ORDINAL
+
val.sets <- sort(unique(mydata[, cur_col]))
-
+
if (length(val.sets) > 1) {
for (i in 2:(length(val.sets))) {
LL.temp <- c()
- #subset data for chosen value and store LL
- #cond1 <- as.numeric(as.character(mydata[,cur_col])) > (val.sets[i]+val.sets[(i-1)])/2
- #cond2 <- as.numeric(as.character(mydata[,cur_col])) < (val.sets[i]+val.sets[(i-1)])/2
-
+ # subset data for chosen value and store LL
+ # cond1 <- as.numeric(as.character(mydata[,cur_col])) > (val.sets[i]+val.sets[(i-1)])/2
+ # cond2 <- as.numeric(as.character(mydata[,cur_col])) < (val.sets[i]+val.sets[(i-1)])/2
+
cond1 <- mydata[, cur_col] > val.sets[i - 1]
cond2 <- mydata[, cur_col] <= val.sets[i - 1]
- subset1 <- subset (mydata, cond1)
- subset2 <- subset (mydata, cond2)
-
+ subset1 <- subset(mydata, cond1)
+ subset2 <- subset(mydata, cond2)
+
# refit baseline model with focus parameters @TAGX
if (!is.null(constraints) &
- (!is.null(constraints$focus.parameters))) {
+ (!is.null(constraints$focus.parameters))) {
LL.baseline <- fitSubmodels(model,
- subset1,
- subset2,
- control,
- invariance = constraints$focus.parameters)
+ subset1,
+ subset2,
+ control,
+ invariance = constraints$focus.parameters
+ )
if (control$report.level > 10) {
report(
paste(
@@ -185,12 +187,12 @@ naiveSplit <-
)
}
}
-
+
LL.return <-
fitSubmodels(model, subset1, subset2, control, invariance = NULL)
if (!is.na(LL.return)) {
LL.within <- cbind(LL.within, (LL.baseline - LL.return))
- #within.split <- cbind(within.split, (val.sets[i]+val.sets[(i-1)])/2)
+ # within.split <- cbind(within.split, (val.sets[i]+val.sets[(i-1)])/2)
within.split <- cbind(within.split, as.character(val.sets[i - 1]))
cov.col <- cbind(cov.col, cur_col)
@@ -203,33 +205,34 @@ naiveSplit <-
}
}
- #numeric (continuous) covariates################################
+ # numeric (continuous) covariates################################
if (is.numeric(mydata[, cur_col])) {
- var.type = .SCALE_METRIC
+ var.type <- .SCALE_METRIC
v <- as.numeric(mydata[, cur_col])
val.sets <- sort(unique(v))
-
+
if (length(val.sets) > 1) {
for (i in 2:(length(val.sets))) {
LL.temp <- c()
- #subset data for chosen value and store LL
+ # subset data for chosen value and store LL
cond1 <-
as.numeric(mydata[, cur_col]) > (val.sets[i] + val.sets[(i - 1)]) / 2
cond2 <-
as.numeric(mydata[, cur_col]) < (val.sets[i] + val.sets[(i - 1)]) / 2
- subset1 <- subset (mydata, cond1)
- subset2 <- subset (mydata, cond2)
-
- #catch LLR for each comparison
-
+ subset1 <- subset(mydata, cond1)
+ subset2 <- subset(mydata, cond2)
+
+ # catch LLR for each comparison
+
# refit baseline model with focus parameters @TAGX
if (!is.null(constraints) &
- (!is.null(constraints$focus.parameters))) {
+ (!is.null(constraints$focus.parameters))) {
LL.baseline <- fitSubmodels(model,
- subset1,
- subset2,
- control,
- invariance = constraints$focus.parameters)
+ subset1,
+ subset2,
+ control,
+ invariance = constraints$focus.parameters
+ )
if (control$report.level > 10) {
report(
paste(
@@ -240,7 +243,7 @@ naiveSplit <-
)
}
}
-
+
LL.return <-
fitSubmodels(model, subset1, subset2, control, invariance = NULL)
# cat("LLreturn:",LL.return," and value split at:",(val.sets[i]+val.sets[(i-1)])/2,"\n")
@@ -253,23 +256,21 @@ naiveSplit <-
cov.type <- cbind(cov.type, var.type)
n.comp <- n.comp + 1
}
-
}
}
-
}
}
-
-
+
+
if (is.null(LL.within)) {
return(NULL)
}
-
+
btn.matrix <- rbind(LL.within, cov.name, cov.col, within.split)
colnames(btn.matrix) <-
c(paste("var", seq(1, ncol(btn.matrix)), sep = ""))
rownames(btn.matrix) <- c("LR", "variable", "column", "split val")
-
+
filter <- c()
if (!is.null(invariance)) {
if (control$verbose) {
@@ -278,9 +279,9 @@ naiveSplit <-
filter <-
invarianceFilter(model, mydata, btn.matrix, LL.baseline, invariance, control)
}
-
+
# find best
-
+
LL.max <- NA
split.max <- NA
name.max <- NA
@@ -295,8 +296,7 @@ naiveSplit <-
name.max <- cov.name[cur_col]
col.max <- cov.col[cur_col]
type.max <- cov.type[cur_col]
- }
- else if (LL.within[cur_col] > LL.max) {
+ } else if (LL.within[cur_col] > LL.max) {
LL.max <- LL.within[cur_col]
split.max <- within.split[cur_col]
name.max <- cov.name[cur_col]
@@ -304,8 +304,7 @@ naiveSplit <-
type.max <- cov.type[cur_col]
}
}
- }
- else {
+ } else {
if (!is.na(LL.within[cur_col])) {
if (is.na(LL.max)) {
LL.max <- LL.within[cur_col]
@@ -313,8 +312,7 @@ naiveSplit <-
name.max <- cov.name[cur_col]
col.max <- cov.col[cur_col]
type.max <- cov.type[cur_col]
- }
- else if (LL.within[cur_col] > LL.max) {
+ } else if (LL.within[cur_col] > LL.max) {
LL.max <- LL.within[cur_col]
split.max <- within.split[cur_col]
name.max <- cov.name[cur_col]
@@ -324,26 +322,24 @@ naiveSplit <-
}
}
}
-
- if (control$verbose & control$report.level==99) {
-
- cat("LL.within:",paste0(LL.within,collapse=","),"\n")
- cat("LL.max: ", paste0(LL.max,collapse=","),"\n")
- cat("within.split: ",paste0(within.split,collapse=","),"\n" )
- cat("split max",split.max,"\n")
+
+ if (control$verbose & control$report.level == 99) {
+ cat("LL.within:", paste0(LL.within, collapse = ","), "\n")
+ cat("LL.max: ", paste0(LL.max, collapse = ","), "\n")
+ cat("within.split: ", paste0(within.split, collapse = ","), "\n")
+ cat("split max", split.max, "\n")
}
-
+
# alternative way of counting the number of comparisons
# count the number of variables instead of tests
if (control$naive.bonferroni.type == 1) {
n.comp <- length(comparedData)
}
-
-
+
+
if (is.na(LL.max)) {
return(NULL)
- }
- else
+ } else {
(
return(
list(
@@ -358,4 +354,5 @@ naiveSplit <-
)
)
)
- }
\ No newline at end of file
+ }
+ }
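
A note on the naive.bonferroni.type branch near the end of naiveSplit(): with type 1 the correction counts candidate variables rather than every evaluated split point, which can loosen the implicit threshold considerably. A toy comparison:

```r
alpha   <- 0.05
n.tests <- 37  # e.g., every candidate split point across all predictors
n.vars  <- 4   # naive.bonferroni.type == 1: one unit per predictor

alpha / n.tests  # per-comparison threshold when counting split points
alpha / n.vars   # looser threshold when counting variables only
```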
diff --git a/R/nodeFunSemtree.R b/R/nodeFunSemtree.R
index f83794f..2c35753 100644
--- a/R/nodeFunSemtree.R
+++ b/R/nodeFunSemtree.R
@@ -1,7 +1,6 @@
-
-nodeFunSemtree<-function(x, labs, digits, varlen)
-{
-  paste(ifelse(x$frame$var==
[rest of this hunk and the following rendered-docs diff (apparently docs/404.html) are extraction-garbled pkgdown page text: the site version string changed from pkgdown 2.0.6 to 2.0.7, plus GPL "How to Apply These Terms" boilerplate and the "Page not found (404)" title; no further code is recoverable]
diff --git a/docs/CONTRIBUTE.html b/docs/CONTRIBUTE.html
index cb60979..bd16911 100644
--- a/docs/CONTRIBUTE.html
+++ b/docs/CONTRIBUTE.html
@@ -17,7 +17,7 @@
@@ -41,7 +41,7 @@
[extraction-garbled pkgdown page body omitted: navigation residue and re-rendered code chunks (devtools::test("."), devtools::check("."), checkBinSize(), getExpectedMean())]
diff --git a/docs/LICENSE.html b/docs/LICENSE.html
index b6fc3fe..8de4495 100644
--- a/docs/LICENSE.html
+++ b/docs/LICENSE.html
@@ -17,7 +17,7 @@
@@ -41,7 +41,7 @@
[extraction-garbled pkgdown license page omitted: the hunks only reflow the GPL-3 "How to Apply These Terms" boilerplate and the interactive-session notice text]
diff --git a/docs/articles/constraints.html b/docs/articles/constraints.html
index e6fd4c4..7fa85f9 100644
--- a/docs/articles/constraints.html
+++ b/docs/articles/constraints.html
@@ -33,7 +33,7 @@
@@ -59,7 +59,7 @@
[extraction-garbled pkgdown vignette page omitted: the constraints vignette was re-rendered (date bumped 2023-04-06 to 2024-04-15), prompt glyphs changed from ">" to "❯", and the captured OpenMx "Beginning initial fit attempt" / "Fit attempt ..., new current best!" logs were regenerated with new fit values; the surrounding prose on global invariance, local invariance, and focus parameters (mu1, mu2) appears unchanged]
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=3566.5692080098, new current best! (was 4167.0405063558)
- Fit attempt
-
[32m✔
[39m Tree construction finished [took 1s].
+tree.biv3 <- semtree(model.biv, df.biv, constraints=
+ semtree.constraints(focus.parameters = "mu2"))
+#> ❯ Model was not run. Estimating parameters now.
+#>
+Beginning initial fit attempt
+Fit attempt 0, fit=8233.92582585158, new current best! (was 14528.4141425595)
+
+
+Beginning initial fit attempt
+Fit attempt 0, fit=8233.92582585143, new current best! (was 8233.92582585158)
+
+
+Beginning initial fit attempt
+Fit attempt 0, fit=3454.12434636158, new current best! (was 4066.88531930229)
+
+
+Beginning initial fit attempt
+Fit attempt 0, fit=3566.5692080098, new current best! (was 4167.04050654914)
+
+
[32m✔
[39m Tree construction finished [took less than a second].
And, indeed, we see only grp2
as predictor whereas
grp1
was not selected this time.
@@ -511,21 +532,21 @@Focus Parameters -
+-<- semtree(model.biv, df.biv, constraints= - tree.biv4 semtree.constraints(focus.parameters = "VAR_x2")) - #> > Model was not run. Estimating parameters now. -#> - - Beginning initial fit attempt0, fit=8233.92582585158, new current best! (was 14528.4141425595) - Fit attempt - - - Beginning initial fit attempt0, fit=8233.92582585143, new current best! (was 8233.92582585158) - Fit attempt - - [32m✔ [39m Tree construction finished [took less than a second]. -plot(tree.biv4)
+tree.biv4 <- semtree(model.biv, df.biv, constraints= + semtree.constraints(focus.parameters = "VAR_x2")) +#> ❯ Model was not run. Estimating parameters now. +#> +Beginning initial fit attempt +Fit attempt 0, fit=8233.92582585158, new current best! (was 14528.4141425595) + + +Beginning initial fit attempt +Fit attempt 0, fit=8233.92582585143, new current best! (was 8233.92582585158) + + [32m✔ [39m Tree construction finished [took less than a second]. + +plot(tree.biv4)
Site built with pkgdown 2.0.7.
+Site built with pkgdown 2.0.6.
diff --git a/docs/articles/constraints_files/figure-html/plbv2-1.png b/docs/articles/constraints_files/figure-html/plbv2-1.png index 9f97b31..2441608 100644 Binary files a/docs/articles/constraints_files/figure-html/plbv2-1.png and b/docs/articles/constraints_files/figure-html/plbv2-1.png differ diff --git a/docs/articles/constraints_files/figure-html/unnamed-chunk-10-1.png b/docs/articles/constraints_files/figure-html/unnamed-chunk-10-1.png index aaf7f14..ad1756f 100644 Binary files a/docs/articles/constraints_files/figure-html/unnamed-chunk-10-1.png and b/docs/articles/constraints_files/figure-html/unnamed-chunk-10-1.png differ diff --git a/docs/articles/constraints_files/figure-html/unnamed-chunk-11-1.png b/docs/articles/constraints_files/figure-html/unnamed-chunk-11-1.png index 9b4d438..e3d44d2 100644 Binary files a/docs/articles/constraints_files/figure-html/unnamed-chunk-11-1.png and b/docs/articles/constraints_files/figure-html/unnamed-chunk-11-1.png differ diff --git a/docs/articles/constraints_files/figure-html/unnamed-chunk-12-1.png b/docs/articles/constraints_files/figure-html/unnamed-chunk-12-1.png index 10cdbd1..7585b2c 100644 Binary files a/docs/articles/constraints_files/figure-html/unnamed-chunk-12-1.png and b/docs/articles/constraints_files/figure-html/unnamed-chunk-12-1.png differ diff --git a/docs/articles/constraints_files/figure-html/unnamed-chunk-14-1.png b/docs/articles/constraints_files/figure-html/unnamed-chunk-14-1.png index 26296be..88d33d5 100644 Binary files a/docs/articles/constraints_files/figure-html/unnamed-chunk-14-1.png and b/docs/articles/constraints_files/figure-html/unnamed-chunk-14-1.png differ diff --git a/docs/articles/constraints_files/figure-html/unnamed-chunk-15-1.png b/docs/articles/constraints_files/figure-html/unnamed-chunk-15-1.png index fea4170..41ba7dc 100644 Binary files a/docs/articles/constraints_files/figure-html/unnamed-chunk-15-1.png and b/docs/articles/constraints_files/figure-html/unnamed-chunk-15-1.png differ diff --git a/docs/articles/constraints_files/figure-html/unnamed-chunk-4-1.png b/docs/articles/constraints_files/figure-html/unnamed-chunk-4-1.png index 0b2fcc1..66505ec 100644 Binary files a/docs/articles/constraints_files/figure-html/unnamed-chunk-4-1.png and b/docs/articles/constraints_files/figure-html/unnamed-chunk-4-1.png differ diff --git a/docs/articles/constraints_files/figure-html/unnamed-chunk-6-1.png b/docs/articles/constraints_files/figure-html/unnamed-chunk-6-1.png index 4a33c18..2a78766 100644 Binary files a/docs/articles/constraints_files/figure-html/unnamed-chunk-6-1.png and b/docs/articles/constraints_files/figure-html/unnamed-chunk-6-1.png differ diff --git a/docs/articles/forests.html b/docs/articles/forests.html index a6cd0ae..a3063da 100644 --- a/docs/articles/forests.html +++ b/docs/articles/forests.html @@ -33,7 +33,7 @@ @@ -59,7 +59,7 @@ Getting Started with the semtree packagevignettes/forests.Rmd
forests.Rmd
This example demonstrates how SEM forests can be grown. SEM forests +are ensembles of typically hundreds to thousands of SEM trees. Using +permutation-based variable importance estimates, we can aggregate the +importance of each predictor for improving model fit.
+Here, we use the affect
dataset and a simple SEM with
+only a single observed variable and no latent variables.
Create a forest control object that stores all tuning parameters of -the forest. Note that we use only 5 trees for demo purposes. Please -increase the number in real applications.
+the forest. Note that we use only 5 trees for illustration. Please +increase the number in real applications to several hundreds. To speed +up computation time, consider score-based test for variable selection in +the trees.
-control <- semforest.control(num.trees = 5)
+control <- semforest_control(num.trees = 5)
print(control)
#> SEM-Forest control:
#> -----------------
@@ -389,243 +397,26 @@ Forest
#> ● Progress Bar: TRUE
#> ● Seed: NA
Now, run the forest using the control
object:
<- semforest( model=model,
- forest data = affect,
- control = control,
- covariates = c("Study","Film", "state1",
- "PA2","NA2","TA2"))
- #>
-
- Beginning initial fit attempt0, fit=1289.15758570645, new current best! (was 1387.78413290756)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=818.25448472129, new current best! (was 819.184832331613)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=540.657188936563, new current best! (was 554.537456022498)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=540.657188936539, new current best! (was 540.657188936563)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=253.262932464789, new current best! (was 265.845563494021)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=267.917686855653, new current best! (was 274.811625442518)
- Fit attempt
-
-
- Beginning initial fit attempt
-
-
- Beginning initial fit attempt0, fit=230.436210492665, new current best! (was 263.717028698793)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=91.8058189525733, new current best! (was 95.4201985173459)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=130.239485916693, new current best! (was 135.016011975319)
- Fit attempt
-
-
[32m✔
[39m Tree construction finished [took 11s].#>
-
- Beginning initial fit attempt0, fit=788.643601362753, new current best! (was 789.364982717159)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=356.921622008996, new current best! (was 375.924543648624)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=274.070041829913, new current best! (was 275.659388744346)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=76.224093068116, new current best! (was 81.2622332646489)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=399.837330598193, new current best! (was 412.719057714128)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=97.1496598369175, new current best! (was 116.06664283684)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=272.800917814333, new current best! (was 283.770687761354)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=123.620225509635, new current best! (was 126.437967074978)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=144.858996740466, new current best! (was 146.362950739354)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=54.3867250107566, new current best! (was 63.3116865213297)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=77.7820652871679, new current best! (was 81.5473102191363)
- Fit attempt
-
-
[32m✔
[39m Tree construction finished [took 10s].#>
-
- Beginning initial fit attempt0, fit=834.887957588376, new current best! (was 835.097735354963)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=484.442995019425, new current best! (was 504.824969995514)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=327.747061041181, new current best! (was 335.605482887242)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=226.487492378292, new current best! (was 232.62717028389)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=122.51145092926, new current best! (was 124.519247808258)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=99.0044984589434, new current best! (was 101.968244570034)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=82.7340026651454, new current best! (was 95.1198907572906)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=132.40038301966, new current best! (was 148.837512132183)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=305.43653582216, new current best! (was 330.062987592862)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=167.668643402405, new current best! (was 170.078100401807)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=131.357409966922, new current best! (was 135.358435420353)
- Fit attempt
-
-
[32m✔
[39m Tree construction finished [took 9s].#>
-
- Beginning initial fit attempt0, fit=848.46463702424, new current best! (was 848.840697274375)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=328.187465026306, new current best! (was 366.760803980649)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=150.789034618148, new current best! (was 163.442241865474)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=152.956425048368, new current best! (was 164.745223160832)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=457.506356717906, new current best! (was 481.703833043592)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=321.91996897439, new current best! (was 326.091979281769)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=158.055244073882, new current best! (was 164.875520523358)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=149.539343979152, new current best! (was 157.044448451032)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=73.5643765034496, new current best! (was 76.0629347964262)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=72.0937976825281, new current best! (was 73.4764091827261)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=118.046439931572, new current best! (was 131.414377436137)
- Fit attempt
-
-
[32m✔
[39m Tree construction finished [took 9s].#>
-
- Beginning initial fit attempt0, fit=816.768764245928, new current best! (was 818.827214348166)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=383.803628624992, new current best! (was 395.149459331774)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=185.014060679185, new current best! (was 198.367577946491)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=171.297462750686, new current best! (was 185.436050678501)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=415.322910009232, new current best! (was 421.619304914154)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=268.217878591544, new current best! (was 284.494189283796)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=80.8254886521707, new current best! (was 86.5883323542989)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=178.831836528103, new current best! (was 181.629546237244)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=88.0410808835288, new current best! (was 97.6984858211589)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=75.6021407498517, new current best! (was 81.1333507069438)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=111.301359647879, new current best! (was 130.828720725437)
- Fit attempt
-
-
[32m✔
[39m Tree construction finished [took 12s].#>
[32m✔
[39m Forest completed [took 50s]
+forest <- semforest( model=model,
+ data = affect,
+ control = control,
+ covariates = c("Study","Film", "state1",
+ "PA2","NA2","TA2"))
Next, we compute permutation-based variable importance.
+Next, we compute permutation-based variable importance. This may take +some time.
vim <- varimp(forest)
print(vim, sort.values=TRUE)
#> Variable Importance
-#> Film Study state1 PA2 TA2 NA2
-#> -0.3848008 2.0517864 9.8412123 25.5626097 31.7029700 84.7036831
+#> Study PA2 state1 TA2 Film
+#> -9.659311e-08 9.418006e+00 1.218599e+01 2.066494e+01 2.537268e+01
+#> NA2
+#> 3.658740e+01
plot(vim)
From this, we can learn that variables such as NA2
@@ -654,7 +445,7 @@
Site built with pkgdown 2.0.7.
+Site built with pkgdown 2.0.6.
Now, we simulate some data from a linear latent growth curve model
+(that is, a random intercept and random slope over time). The dataset
+will be called growth.data
. The dataset contains five
+observations for each individual (X1
to X5
)
+and one predictor P1
. The predictor is dichotomous and
+predicts a (quite large) difference in mean slope.
Now, we specify a linear latent growth curve model using OpenMx’s +path specification. The model has five observed variables. Residual +variances are assumed to be identical over time.
manifests <- names(growth.data)[1:5]
growthCurveModel <- mxModel("Linear Growth Curve Model Path Specification",
@@ -186,35 +198,30 @@ Specify an OpenMx model values=c(1, 1),
labels=c("meani", "means")
)
-) # close model
<- semtree(model = growthCurveModel, data = growth.data)
- tree #> > Model was not run. Estimating parameters now.
-#>
-
- Beginning initial fit attempt0, fit=7638.98979615046, new current best! (was 84974.7019994723)
- Fit attempt
-
-
[33m✖
[39m Variable P1 is numeric but has only few unique values. Consider recoding as ordered factor.#>
-
- Beginning initial fit attempt0, fit=7638.98979614189, new current best! (was 7638.98979615046)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=2780.73090202519, new current best! (was 3625.89178697863)
- Fit attempt
-
-
- Beginning initial fit attempt0, fit=3175.11831426993, new current best! (was 4013.09800926999)
- Fit attempt
-
[32m✔
[39m Tree construction finished [took less than a second].
Now, we grow a SEM tree using the semtree
function,
+which takes the model and the dataset as input. If not specified
+otherwise, SEM tree will assume that all variables in the dataset, which
+are not observed variables in the dataset are potential predictors.
+tree <- semtree(model = growthCurveModel,
+ data = growth.data)
Once the tree is grown, we can plot it:
plot(tree)
Site built with pkgdown 2.0.7.
+Site built with pkgdown 2.0.6.
Dear CRAN team,
+this is a maintenance update with a couple of bug fixes users reported over the last year.
+Thanks & best wishes, Andreas
+ + +An R package for estimating Structural Equation Model Trees and Forests.
+An R package for estimating Structural Equation Model (SEM) Trees and Forests. They are a fusion of SEM and decision trees, or SEM and random forests respectively. While SEM is a confirmatory modeling technique, SEM trees and forests allow to explore whether there are predictors that provide further information about an initial, theory-based model. Potential use cases are the search for potential predictors that explain individual differences, finding omitted variables in a model, or exploring measurement invariance over a large set of predictors. A recent overview is in our latest book chapter in the SEM handbook (Brandmaier & Jacobucci, 2023).
Install the latest stable version from CRAN:
-install.packages("semtree")
To install the latest semtree package directly from GitHub, copy the following line into R:
-library(devtools)
-::install_github("semtree/brandmaier")
- devtools
-# even better: install with package vignette (extra documentation)
-::install_github("brandmaier/semtree",force=TRUE, build_opts = c()) devtools
Package documentation and use-cases with runnable R code can be found on our github pages: https://brandmaier.github.io/semtree/.
-You may also want to visit the semtree website: https://brandmaier.de/semtree
Package vignettes (shipped with the package) contain documentation on how to use the package. Simply type this in R once you have loaded the package:
-browseVignettes("semtree")
Brandmaier, A. M., Driver, C., & Voelkle, M. C. (2019). Recursive partitioning in continuous time analysis. In K. van Montfort, J. Oud, & M. C. Voelkle (Eds.), Continuous time modeling in the behavioral and related sciences. New York: Springer.
Brandmaier, A. M., Prindle, J. J., McArdle, J. J., & Lindenberger, U. (2016). Theory-guided exploration with structural equation model forests. Psychological Methods, 21, 566-582.
Brandmaier, A. M., Prindle, J. J., McArdle, J. J., & Lindenberger, U. (2016). Theory-guided exploration with structural equation model forests. Psychological Methods, 21, 566-582.
Brandmaier, A. M., von Oertzen, T., McArdle, J. J., & Lindenberger, U. (2014). Exploratory data mining with structural equation model trees. In J. J. McArdle & G. Ritschard (Eds.), Contemporary issues in exploratory data mining in the behavioral sciences (pp. 96-127). New York: Routledge.
Brandmaier, A. M., von Oertzen, T., McArdle, J. J., & Lindenberger, U. (2013). Structural equation model trees. Psychological Methods, 18, 71-86.
Brandmaier, A. M., von Oertzen, T., McArdle, J. J., & Lindenberger, U. (2013). Structural equation model trees. Psychological Methods, 18, 71-86.
Applied examples (there are many more):
Brandmaier, A. M., Ram, N., Wagner, G. G., & Gerstorf, D. (2017). Terminal decline in well-being: The role of multi-indicator constellations of physical health and psychosocial correlates. Developmental Psychology.
@@ -181,7 +185,16 @@semforest_control()
over semforest.control()
and semtree_control()
over semtree.control()
+mtry
in forests (if NULL
) and for choosing min.N
and min.bucket
(if NULL
)ctsemOMX
to suggested packagevarimp
, such that na.omit=TRUE, which is consistent with other packages like party or partykittoTable()
-command, by default, all parameters are shown now, also fixed a bug with score-based tests and toTable()BORUTA algorithm for SEM trees
+boruta(
+ model,
+ data,
+ control = NULL,
+ predictors = NULL,
+ percentile_threshold = 100,
+ rounds = 1,
+ ...
+)
A template model specification from OpenMx
using
+the mxModel
function (or a lavaan
model
+using the lavaan
function with option fit=FALSE).
+Model must be syntactically correct within the framework chosen, and
+converge to a solution.
Data.frame used in the model creation using
+mxModel
or lavaan
are input here. Order
+of modeled variables and predictors is not important when providing a
+dataset to semtree
.
semtree
model specifications from
+semtree.control
are input here. Any changes from the default
+setting can be specified here.
Numeric.
Numeric. Number of rounds of the BORUTA algorithm.
numeric from interval (0, 1) specifying start of trimmed -sample period. With the default +sample period. With the default from = 0.15 the first and last 15 percent of observations are trimmed. This is only needed for continuous covariates.
numeric. Number of replications used for simulating from the asymptotic +
numeric. Number of replications used for simulating from the asymptotic distribution (passed to efpFunctional). Only needed for ordinal covariates.
a character. "wald" (default) gives the squared parameter -differences devided by their pooled standard errors. test" gives the +differences divided by their pooled standard errors. test" gives the contributions of the parameters to the test statistic. "raw" gives the absolute values of the parameter differences.
a character. "wald" (default) gives the squared parameter -differences devided by their pooled standard errors. "test" gives the +differences divided by their pooled standard errors. "test" gives the contributions of the parameters to the test statistic."raw" gives the absolute values of the parameter differences.
biodiversity()
Quantify bio diversity of a SEM Forest
BORUTA algorithm for SEM trees
se()
SEMtrees Parameter Estimates Standard Error Table
Create a SEM Forest
SEM Forest Control Object
Create a SEM Forest
SEM Tree Package
SEM Tree: Recursive Partitioning for Structural Equation Models
semtree.control()
SEM Tree Control Object
SEM Tree: Recursive Partitioning for Structural Equation Models
Integer. If mc
is not NULL
, the function will sample
-mc
number of rows from data
with replacement, to estimate
+mc
number of rows from data
with replacement, to estimate
marginal dependency using Monte Carlo integration. This is less
computationally expensive.
Integer. If mc
is not NULL
, the function will sample
-mc
number of rows from data
with replacement, to estimate
+mc
number of rows from data
with replacement, to estimate
marginal dependency using Monte Carlo integration. This is less
computationally expensive.
a character. "wald" (default) gives the squared parameter -differences devided by their pooled standard errors. "test" gives the +differences divided by their pooled standard errors. "test" gives the contributions of the parameters to the test statistic. "raw" gives the absolute values of the parameter differences.
a character. "wald" (default) gives the squared parameter -differences devided by their pooled standard errors. "test" gives the +differences divided by their pooled standard errors. "test" gives the contributions of the parameters to the test statistic. "raw" gives the absolute values of the parameter differences.
SEM Tree Package
+.SCALE_METRIC
An object of class numeric
of length 1.
semtree.control(
- method = "naive",
+ method = c("naive", "score", "fair", "fair3"),
min.N = 20,
max.depth = NA,
alpha = 0.05,
@@ -128,7 +128,8 @@ SEM Tree Control Object
strucchange.from = 0.15,
strucchange.to = NULL,
strucchange.nrep = 50000,
- refit = TRUE
+ refit = TRUE,
+ ctsem_sd = FALSE
)
Use MaxLm statistic
Use MaxLR statistic for split point selection (as proposed by Arnold et al., 2021)
If TRUE (default) the initial model is fitted on the data
provided to semtree
.
If FALSE (default) no standard errors of CT model parameters +are computed. Requesting standard errors increases runtime.
Brandmaier, A.M., Oertzen, T. v., McArdle, J.J., & Lindenberger, U. (2013). Structural equation model trees. Psychological Methods, 18(1), 71-86.
+Arnold, M., Voelkle, M. C., & Brandmaier, A. M. (2021). Score-guided structural equation model trees. Frontiers in Psychology, 11, Article 564403. https://doi.org/10.3389/fpsyg.2020.564403
Add extra columns with parameter estimates.
String. Add extra columns with parameter estimates. Pass a vector with the names of the parameters that should be rendered in the table.
Number of digits to round parameter estiamtes
Integer. Number of digits to round parameter estimates. Default is no rounding (NULL)