Skip to content

Commit

Permalink
Merge pull request #50 from filipamc19/gatai-integration
Browse files Browse the repository at this point in the history
Integrate GATAI functionality into myTAI
  • Loading branch information
HajkD authored Nov 20, 2024
2 parents 04f2c7c + ce203fd commit dcd5610
Show file tree
Hide file tree
Showing 4 changed files with 382 additions and 0 deletions.
129 changes: 129 additions & 0 deletions R/GATAI.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
#' @title Low level interface function with GATAI
#' @description
#' This function interfaces with the GATAI software which removes the genes that create the \code{\link{PlotSignature}} pattern.
#' \code{GATAI} can be found at \url{https://github.com/drostlab/gatai}.
#' @param ExpressionSet a standard PhyloExpressionSet, DivergenceExpressionSet or PolymorphismsExpressionSet object.
#' @param singlecell_data a logical value indicating whether or not the input \code{ExpressionSet} is a single cell dataset.
#' @param gatai_results_path where shall GATAI results be stored. Default \code{gatai_results_path = tempdir()}.
#' @param result_overwrite Logical value indicating whether local results of previous GATAI run should be over-written and re-computed?
#' @param condaenv The conda environment to use. For \code{\link[reticulate]{use_condaenv}}, this can be the name, the absolute prefix path, or the absolute path to the python binary. If the name is ambiguous, the first environment is used and a warning is issued. For \code{\link[reticulate]{use_miniconda}}, the only conda installation searched is the one installed by \code{\link[reticulate]{install_miniconda}}.
#' @author Filipa Martins Costa and Hajk-Georg Drost
#' @export
GATAI <- function(ExpressionSet,
singlecell_data = FALSE,
gatai_results_path = tempdir(),
result_overwrite = FALSE,
condaenv = NULL) {

# use specific conda environment if selected
if (!is.null(condaenv)) {
reticulate::use_condaenv(condaenv = condaenv, required = TRUE)
}

# check if GATAI is properly installed
is_installed_gatai()
message("Starting GATAI optimisation ...")

# generate a tempdir version of the input ExpressionSet to run GATAI with random id attached
export_expressionset_for_gatai <- file.path(tempdir(), "expressionset.tsv")

# file path to extracted genes output file
extracted_genes_path <- file.path(gatai_results_path, "removed_genes", "extracted_genes.txt")

# check if a local version of GATAI result is already present
if(fs::file_exists(extracted_genes_path) && !result_overwrite){
message("It seems like results from a previous GATAI run were still present at '", extracted_genes_path, "'.")
message("This previous GATAI result was now imported. Please select argument 'result_overwrite = TRUE' in case you would like to re-calculate eveything with GATAI.")
removed_genes <- readr::read_tsv(
file.path(gatai_results_path, "removed_genes", "extracted_genes.txt"),
col_names = F,
show_col_types = FALSE
)

colnames(removed_genes) <- "removed_genes"
} else {
message(
"A local version of the input ExpressionSet was stored for GATAI runs at '",
export_expressionset_for_gatai,
"'."
)

# store input ExpressionSet in tempdir()
readr::write_delim(
ExpressionSet,
file = export_expressionset_for_gatai,
delim = "\t",
col_names = TRUE
)

# run GATAI
gatai <- reticulate::import("gatai")

if (!singlecell_data) {
# here we need to specify a path to tempdir()/gatai_results_path where removed genes are stored
gatai_command <- paste(
"gatai run_minimizer",
export_expressionset_for_gatai,
file.path(gatai_results_path, "removed_genes"),
"--save_stats"
)

} else {
# here we need to specify a path to tempdir()/gatai_results_path where removed genes are stored
gatai_command <- paste(
"gatai run_minimizer",
export_expressionset_for_gatai,
file.path(gatai_results_path,
"removed_genes")
, "--single_cell", "--save_stats")
}

# systems call with tryCatch
tryCatch(
{
system(gatai_command)
message("GATAI Optimization Complete.")
},
warning = function(w) {
message("Warning during GATAI optimization: ", conditionMessage(w))
},
error = function(e) {
message("Error during GATAI optimization: ", conditionMessage(e))
}
)

# print the path to GATAI results as message
message(
"GATAI results are stored at '",
file.path(gatai_results_path, "removed_genes"),
"'."
)

# read the extracted_genes.txt file
removed_genes <- readr::read_tsv(
file.path(gatai_results_path, "removed_genes", "extracted_genes.txt"),
col_names = F,
show_col_types = FALSE
)

colnames(removed_genes) <- "removed_genes"

}

# read the summary statistics file
statistics <- readr::read_tsv(
file.path(gatai_results_path, "removed_genes", "summary.txt"),
col_names = F,
show_col_types = FALSE
)

# print summary statistics as messages
colnames(statistics) <- "statistics"
message("Summary statistics:")
for (summary in statistics$statistics) {
message(summary)
}


return(removed_genes)
}
80 changes: 80 additions & 0 deletions R/PlotSignatureGATAIGeneRemoval.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#' @title Plot evolutionary signatures across transcriptomes after GATAI gene removal
#' @description Main function to visualize transcriptome indices after GATAI gene removal .
#' @param ExpressionSet a standard PhyloExpressionSet, DivergenceExpressionSet or PolymorphismsExpressionSet object.
#' @param measure type of transcriptome index that shall be computed. E.g.
#' \itemize{
#' \item \code{measure = "TAI"} (Transcriptome Age Index)
#' \item \code{measure = "TDI"} (Transcriptome Divergence Index)
#' \item \code{measure = "TPI"} (Transcriptome Polymorphism Index)
#' }
#' @param TestStatistic a string defining the type of test statistics to be used to quantify the statistical significance the present phylotranscriptomics pattern.
#' Possible values can be:
#' \itemize{
#' \item \code{TestStatistic} = \code{"FlatLineTest"} : Statistical test for the deviation from a flat line
#' \item \code{TestStatistic} = \code{"ReductiveHourglassTest"} : Statistical test for the existence of a hourglass shape (high-low-high pattern)
#' \item \code{TestStatistic} = \code{"EarlyConservationTest"} : Statistical test for the existence of a early conservation pattern (low-high-high pattern)
#' \item \code{TestStatistic} = \code{"LateConservationTest"} : Statistical test for the existence of a late conservation pattern (high-high-low pattern)
#' \item \code{TestStatistic} = \code{"ReverseHourglassTest"} : Statistical test for the existence of a reverse hourglass pattern (low-high-low pattern)
#' }
#' @param modules a list storing three elements for the \code{\link{ReductiveHourglassTest}}, \code{\link{EarlyConservationTest}}, \code{\link{LateConservationTest}},
#' or \code{\link{ReverseHourglassTest}}: early, mid, and late.
#' Each element expects a numeric vector specifying the developmental stages
#' or experiments that correspond to each module. For example:
#' \itemize{
#' \item \code{module} = \code{list(early = 1:2, mid = 3:5, late = 6:7)} divides a dataset storing seven developmental stages into 3 modules.
#' }
#' @param permutations a numeric value specifying the number of permutations to be performed for the \code{\link{FlatLineTest}}, \code{\link{EarlyConservationTest}}, \code{\link{LateConservationTest}}, \code{\link{ReductiveHourglassTest}} or \code{\link{ReverseHourglassTest}}.
#' @param lillie.test a boolean value specifying whether the Lilliefors Kolmogorov-Smirnov Test shall be performed.
#' @param p.value a boolean value specifying whether the p-value of the test statistic shall be printed as a subtitle.
#' @param shaded.area a boolean value specifying whether a shaded area shall
#' be drawn for the developmental stages defined to be the presumptive phylotypic period.
#' @param custom.perm.matrix a custom \code{\link{bootMatrix}} (permutation matrix) to perform the underlying test statistic visualized by \code{PlotSignature}. Default is \code{custom.perm.matrix = NULL}.
#' @param xlab label of x-axis.
#' @param ylab label of y-axis.
#' @param main figure title.
#' @param lwd line width.
#' @param alpha transparency of the shaded area (between [0,1]). Default is \code{alpha = 0.1}.
#' @param y.ticks number of ticks on the y-axis. Default is \code{ticks = 10}.
#' @param \dots parameters passed on to \code{\link{GATAI}}.
#' @author Filipa Martins Costa and Hajk-Georg Drost
#' @export
PlotSignatureGATAIGeneRemoval <- function(ExpressionSet,
measure = "TAI",
TestStatistic = "FlatLineTest",
modules = NULL,
permutations = 1000,
lillie.test = FALSE,
p.value = TRUE,
shaded.area = FALSE,
custom.perm.matrix = NULL,
xlab = "Ontogeny",
ylab = "Transcriptome Index",
main = "",
lwd = 4,
alpha = 0.1,
y.ticks = 10, ...) {

removed_gene_list <- GATAI(ExpressionSet, ...)

p_final <- myTAI::PlotSignature(dplyr::filter(ExpressionSet, !GeneID %in% as.character(removed_gene_list[[1]])),
measure = measure,
TestStatistic = TestStatistic,
modules = modules,
permutations = permutations,
lillie.test = lillie.test,
p.value = p.value,
shaded.area = shaded.area,
custom.perm.matrix = custom.perm.matrix,
xlab = xlab,
ylab = ylab,
main = paste(nrow(removed_gene_list), "Removed genes"),
lwd = lwd,
alpha = alpha,
y.ticks = y.ticks)

# plot_final <- cowplot::plot_grid(p1, p2, labels = c('A', 'B'), label_size = 18)

print(p_final)

return(p_final)
}
160 changes: 160 additions & 0 deletions R/SignatureGATAIGeneRemoval.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
#' @title Removed genes after GATAI gene removal and plots of evolutionary signatures across transcriptomes before, after GATAI gene removal and with randomly removed genes
#' @description Main function to return the removed genes after GATAI gene removal and plots transcriptome indices pattern before, after GATAI gene removal and by removing random genes (by default same number as GATAI removed).
#' @param ExpressionSet a standard PhyloExpressionSet, DivergenceExpressionSet or PolymorphismsExpressionSet object.
#' @param measure type of transcriptome index that shall be computed. E.g.
#' \itemize{
#' \item \code{measure = "TAI"} (Transcriptome Age Index)
#' \item \code{measure = "TDI"} (Transcriptome Divergence Index)
#' \item \code{measure = "TPI"} (Transcriptome Polymorphism Index)
#' }
#' @param TestStatistic a string defining the type of test statistics to be used to quantify the statistical significance the present phylotranscriptomics pattern.
#' Possible values can be:
#' \itemize{
#' \item \code{TestStatistic} = \code{"FlatLineTest"} : Statistical test for the deviation from a flat line
#' \item \code{TestStatistic} = \code{"ReductiveHourglassTest"} : Statistical test for the existence of a hourglass shape (high-low-high pattern)
#' \item \code{TestStatistic} = \code{"EarlyConservationTest"} : Statistical test for the existence of a early conservation pattern (low-high-high pattern)
#' \item \code{TestStatistic} = \code{"LateConservationTest"} : Statistical test for the existence of a late conservation pattern (high-high-low pattern)
#' \item \code{TestStatistic} = \code{"ReverseHourglassTest"} : Statistical test for the existence of a reverse hourglass pattern (low-high-low pattern)
#' }
#' @param modules a list storing three elements for the \code{\link{ReductiveHourglassTest}}, \code{\link{EarlyConservationTest}}, \code{\link{LateConservationTest}},
#' or \code{\link{ReverseHourglassTest}}: early, mid, and late.
#' Each element expects a numeric vector specifying the developmental stages
#' or experiments that correspond to each module. For example:
#' \itemize{
#' \item \code{module} = \code{list(early = 1:2, mid = 3:5, late = 6:7)} divides a dataset storing seven developmental stages into 3 modules.
#' }
#' @param permutations a numeric value specifying the number of permutations to be performed for the \code{\link{FlatLineTest}}, \code{\link{EarlyConservationTest}}, \code{\link{LateConservationTest}}, \code{\link{ReductiveHourglassTest}} or \code{\link{ReverseHourglassTest}}.
#' @param lillie.test a boolean value specifying whether the Lilliefors Kolmogorov-Smirnov Test shall be performed.
#' @param p.value a boolean value specifying whether the p-value of the test statistic shall be printed as a subtitle.
#' @param shaded.area a boolean value specifying whether a shaded area shall
#' be drawn for the developmental stages defined to be the presumptive phylotypic period.
#' @param custom.perm.matrix a custom \code{\link{bootMatrix}} (permutation matrix) to perform the underlying test statistic visualized by \code{PlotSignature}. Default is \code{custom.perm.matrix = NULL}.
#' @param xlab label of x-axis.
#' @param ylab label of y-axis.
#' @param main figure title.
#' @param lwd line width.
#' @param alpha transparency of the shaded area (between [0,1]). Default is \code{alpha = 0.1}.
#' @param y.ticks number of ticks on the y-axis. Default is \code{ticks = 10}.
#' @param n_random_removal number of randomly removed genes for the third plot. Default is \code{gatai} (same number of genes removed by GATAI).
#' @param \dots parameters passed on to \code{\link{GATAI}}.
#' @author Filipa Martins Costa and Hajk-Georg Drost
#' @export
SignatureGATAIGeneRemoval <- function(ExpressionSet,
measure = "TAI",
TestStatistic = "FlatLineTest",
modules = NULL,
permutations = 1000,
lillie.test = FALSE,
p.value = TRUE,
shaded.area = FALSE,
custom.perm.matrix = NULL,
xlab = "Ontogeny",
ylab = "Transcriptome Index",
main = "",
lwd = 4,
alpha = 0.1,
y.ticks = 10,
n_random_removal = "gatai",
...) {

removed_gene_list <- GATAI(ExpressionSet, ...)

# original sample of genes
P1 <- myTAI::PlotSignature(ExpressionSet,
measure = measure,
TestStatistic = TestStatistic,
modules = modules,
permutations = permutations,
lillie.test = lillie.test,
p.value = p.value,
shaded.area = shaded.area,
custom.perm.matrix = custom.perm.matrix,
xlab = xlab,
ylab = ylab,
main = main,
lwd = lwd,
alpha = alpha,
y.ticks = y.ticks)

# removed genes
P2 <- myTAI::PlotSignature(dplyr::filter(ExpressionSet, !GeneID %in% as.character(removed_gene_list[[1]])),
measure = measure,
TestStatistic = TestStatistic,
modules = modules,
permutations = permutations,
lillie.test = lillie.test,
p.value = p.value,
shaded.area = shaded.area,
custom.perm.matrix = custom.perm.matrix,
xlab = xlab,
ylab = ylab,
main = main,
lwd = lwd,
alpha = alpha,
y.ticks = y.ticks)

if (is.character(n_random_removal) && n_random_removal == "gatai") {
n_random_genes <- length(removed_gene_list[[1]])
} else if (is.numeric(n_random_removal)) {
n_random_genes <- n_random_removal
} else {
message("Invalid value for 'n_random_removal'. It should be 'gatai' or a numeric value.")
message("Removing the same number as GATAI.")
n_random_genes <- length(removed_gene_list[[1]])
}

random_removed_genes <- sample(ExpressionSet$GeneID, n_random_genes)

P3 <- myTAI::PlotSignature(dplyr::filter(ExpressionSet, !GeneID %in% random_removed_genes),
measure = measure,
TestStatistic = TestStatistic,
modules = modules,
permutations = permutations,
lillie.test = lillie.test,
p.value = p.value,
shaded.area = shaded.area,
custom.perm.matrix = custom.perm.matrix,
xlab = xlab,
ylab = ylab,
lwd = lwd,
alpha = alpha,
y.ticks = y.ticks)

# save the data from both plots
data_p1 <- ggplot2::ggplot_build(P1)$data[[1]]
data_p2 <- ggplot2::ggplot_build(P2)$data[[1]]
data_p3 <- ggplot2::ggplot_build(P3)$data[[1]]

# deciding the y index range by looking at the minimum and maximum values of the line and shade from both plots
min_y_p1 <- min(data_p1$y)
max_y_p1 <- max(data_p1$y)
min_y_p2 <- min(data_p2$y)
max_y_p2 <- max(data_p2$y)
min_y_p3 <- min(data_p3$y)
max_y_p3 <- max(data_p3$y)
min_y_p1_shading <- min(data_p1$ymin)
max_y_p1_shading <- max(data_p1$ymax)
min_y_p2_shading <- min(data_p2$ymin)
max_y_p2_shading <- max(data_p2$ymax)
min_y_p3_shading <- min(data_p3$ymin)
max_y_p3_shading <- max(data_p3$ymax)
min_y <- min(min_y_p1, min_y_p2, min_y_p3, min_y_p1_shading, min_y_p2_shading, min_y_p3_shading)
max_y <- max(max_y_p1, max_y_p2, max_y_p3, max_y_p1_shading, max_y_p2_shading, max_y_p3_shading)


P1 <- P1 + ggplot2::scale_y_continuous(limits = c(min_y-0.05, ymax = max_y+0.05),
breaks = scales::breaks_pretty(n = y.ticks))
P2 <- P2 + ggplot2::scale_y_continuous(limits = c(min_y-0.05, ymax = max_y+0.05),
breaks = scales::breaks_pretty(n = y.ticks))

P3 <- P3 + ggplot2::scale_y_continuous(limits = c(min_y-0.05, ymax = max_y+0.05),
breaks = scales::breaks_pretty(n = y.ticks))


plots <- cowplot::plot_grid(P1, P2, P3, labels = c(paste("Original sample of genes:", nrow(ExpressionSet), "genes"),
paste("GATAI:", nrow(removed_gene_list), "Removed genes"), paste(n_random_genes, "Randomly Removed Genes")))
print(plots)

return(removed_gene_list)

}
Loading

0 comments on commit dcd5610

Please sign in to comment.