-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #50 from filipamc19/gatai-integration
Integrate GATAI functionality into myTAI
- Loading branch information
Showing
4 changed files
with
382 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
#' @title Low level interface function with GATAI | ||
#' @description | ||
#' This function interfaces with the GATAI software which removes the genes that create the \code{\link{PlotSignature}} pattern. | ||
#' \code{GATAI} can be found at \url{https://github.com/drostlab/gatai}. | ||
#' @param ExpressionSet a standard PhyloExpressionSet, DivergenceExpressionSet or PolymorphismsExpressionSet object. | ||
#' @param singlecell_data a logical value indicating whether or not the input \code{ExpressionSet} is a single cell dataset. | ||
#' @param gatai_results_path where shall GATAI results be stored. Default \code{gatai_results_path = tempdir()}. | ||
#' @param result_overwrite Logical value indicating whether local results of previous GATAI run should be over-written and re-computed? | ||
#' @param condaenv The conda environment to use. For \code{\link[reticulate]{use_condaenv}}, this can be the name, the absolute prefix path, or the absolute path to the python binary. If the name is ambiguous, the first environment is used and a warning is issued. For \code{\link[reticulate]{use_miniconda}}, the only conda installation searched is the one installed by \code{\link[reticulate]{install_miniconda}}. | ||
#' @author Filipa Martins Costa and Hajk-Georg Drost | ||
#' @export | ||
GATAI <- function(ExpressionSet, | ||
singlecell_data = FALSE, | ||
gatai_results_path = tempdir(), | ||
result_overwrite = FALSE, | ||
condaenv = NULL) { | ||
|
||
# use specific conda environment if selected | ||
if (!is.null(condaenv)) { | ||
reticulate::use_condaenv(condaenv = condaenv, required = TRUE) | ||
} | ||
|
||
# check if GATAI is properly installed | ||
is_installed_gatai() | ||
message("Starting GATAI optimisation ...") | ||
|
||
# generate a tempdir version of the input ExpressionSet to run GATAI with random id attached | ||
export_expressionset_for_gatai <- file.path(tempdir(), "expressionset.tsv") | ||
|
||
# file path to extracted genes output file | ||
extracted_genes_path <- file.path(gatai_results_path, "removed_genes", "extracted_genes.txt") | ||
|
||
# check if a local version of GATAI result is already present | ||
if(fs::file_exists(extracted_genes_path) && !result_overwrite){ | ||
message("It seems like results from a previous GATAI run were still present at '", extracted_genes_path, "'.") | ||
message("This previous GATAI result was now imported. Please select argument 'result_overwrite = TRUE' in case you would like to re-calculate eveything with GATAI.") | ||
removed_genes <- readr::read_tsv( | ||
file.path(gatai_results_path, "removed_genes", "extracted_genes.txt"), | ||
col_names = F, | ||
show_col_types = FALSE | ||
) | ||
|
||
colnames(removed_genes) <- "removed_genes" | ||
} else { | ||
message( | ||
"A local version of the input ExpressionSet was stored for GATAI runs at '", | ||
export_expressionset_for_gatai, | ||
"'." | ||
) | ||
|
||
# store input ExpressionSet in tempdir() | ||
readr::write_delim( | ||
ExpressionSet, | ||
file = export_expressionset_for_gatai, | ||
delim = "\t", | ||
col_names = TRUE | ||
) | ||
|
||
# run GATAI | ||
gatai <- reticulate::import("gatai") | ||
|
||
if (!singlecell_data) { | ||
# here we need to specify a path to tempdir()/gatai_results_path where removed genes are stored | ||
gatai_command <- paste( | ||
"gatai run_minimizer", | ||
export_expressionset_for_gatai, | ||
file.path(gatai_results_path, "removed_genes"), | ||
"--save_stats" | ||
) | ||
|
||
} else { | ||
# here we need to specify a path to tempdir()/gatai_results_path where removed genes are stored | ||
gatai_command <- paste( | ||
"gatai run_minimizer", | ||
export_expressionset_for_gatai, | ||
file.path(gatai_results_path, | ||
"removed_genes") | ||
, "--single_cell", "--save_stats") | ||
} | ||
|
||
# systems call with tryCatch | ||
tryCatch( | ||
{ | ||
system(gatai_command) | ||
message("GATAI Optimization Complete.") | ||
}, | ||
warning = function(w) { | ||
message("Warning during GATAI optimization: ", conditionMessage(w)) | ||
}, | ||
error = function(e) { | ||
message("Error during GATAI optimization: ", conditionMessage(e)) | ||
} | ||
) | ||
|
||
# print the path to GATAI results as message | ||
message( | ||
"GATAI results are stored at '", | ||
file.path(gatai_results_path, "removed_genes"), | ||
"'." | ||
) | ||
|
||
# read the extracted_genes.txt file | ||
removed_genes <- readr::read_tsv( | ||
file.path(gatai_results_path, "removed_genes", "extracted_genes.txt"), | ||
col_names = F, | ||
show_col_types = FALSE | ||
) | ||
|
||
colnames(removed_genes) <- "removed_genes" | ||
|
||
} | ||
|
||
# read the summary statistics file | ||
statistics <- readr::read_tsv( | ||
file.path(gatai_results_path, "removed_genes", "summary.txt"), | ||
col_names = F, | ||
show_col_types = FALSE | ||
) | ||
|
||
# print summary statistics as messages | ||
colnames(statistics) <- "statistics" | ||
message("Summary statistics:") | ||
for (summary in statistics$statistics) { | ||
message(summary) | ||
} | ||
|
||
|
||
return(removed_genes) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
#' @title Plot evolutionary signatures across transcriptomes after GATAI gene removal | ||
#' @description Main function to visualize transcriptome indices after GATAI gene removal . | ||
#' @param ExpressionSet a standard PhyloExpressionSet, DivergenceExpressionSet or PolymorphismsExpressionSet object. | ||
#' @param measure type of transcriptome index that shall be computed. E.g. | ||
#' \itemize{ | ||
#' \item \code{measure = "TAI"} (Transcriptome Age Index) | ||
#' \item \code{measure = "TDI"} (Transcriptome Divergence Index) | ||
#' \item \code{measure = "TPI"} (Transcriptome Polymorphism Index) | ||
#' } | ||
#' @param TestStatistic a string defining the type of test statistics to be used to quantify the statistical significance the present phylotranscriptomics pattern. | ||
#' Possible values can be: | ||
#' \itemize{ | ||
#' \item \code{TestStatistic} = \code{"FlatLineTest"} : Statistical test for the deviation from a flat line | ||
#' \item \code{TestStatistic} = \code{"ReductiveHourglassTest"} : Statistical test for the existence of a hourglass shape (high-low-high pattern) | ||
#' \item \code{TestStatistic} = \code{"EarlyConservationTest"} : Statistical test for the existence of a early conservation pattern (low-high-high pattern) | ||
#' \item \code{TestStatistic} = \code{"LateConservationTest"} : Statistical test for the existence of a late conservation pattern (high-high-low pattern) | ||
#' \item \code{TestStatistic} = \code{"ReverseHourglassTest"} : Statistical test for the existence of a reverse hourglass pattern (low-high-low pattern) | ||
#' } | ||
#' @param modules a list storing three elements for the \code{\link{ReductiveHourglassTest}}, \code{\link{EarlyConservationTest}}, \code{\link{LateConservationTest}}, | ||
#' or \code{\link{ReverseHourglassTest}}: early, mid, and late. | ||
#' Each element expects a numeric vector specifying the developmental stages | ||
#' or experiments that correspond to each module. For example: | ||
#' \itemize{ | ||
#' \item \code{module} = \code{list(early = 1:2, mid = 3:5, late = 6:7)} divides a dataset storing seven developmental stages into 3 modules. | ||
#' } | ||
#' @param permutations a numeric value specifying the number of permutations to be performed for the \code{\link{FlatLineTest}}, \code{\link{EarlyConservationTest}}, \code{\link{LateConservationTest}}, \code{\link{ReductiveHourglassTest}} or \code{\link{ReverseHourglassTest}}. | ||
#' @param lillie.test a boolean value specifying whether the Lilliefors Kolmogorov-Smirnov Test shall be performed. | ||
#' @param p.value a boolean value specifying whether the p-value of the test statistic shall be printed as a subtitle. | ||
#' @param shaded.area a boolean value specifying whether a shaded area shall | ||
#' be drawn for the developmental stages defined to be the presumptive phylotypic period. | ||
#' @param custom.perm.matrix a custom \code{\link{bootMatrix}} (permutation matrix) to perform the underlying test statistic visualized by \code{PlotSignature}. Default is \code{custom.perm.matrix = NULL}. | ||
#' @param xlab label of x-axis. | ||
#' @param ylab label of y-axis. | ||
#' @param main figure title. | ||
#' @param lwd line width. | ||
#' @param alpha transparency of the shaded area (between [0,1]). Default is \code{alpha = 0.1}. | ||
#' @param y.ticks number of ticks on the y-axis. Default is \code{ticks = 10}. | ||
#' @param \dots parameters passed on to \code{\link{GATAI}}. | ||
#' @author Filipa Martins Costa and Hajk-Georg Drost | ||
#' @export | ||
PlotSignatureGATAIGeneRemoval <- function(ExpressionSet, | ||
measure = "TAI", | ||
TestStatistic = "FlatLineTest", | ||
modules = NULL, | ||
permutations = 1000, | ||
lillie.test = FALSE, | ||
p.value = TRUE, | ||
shaded.area = FALSE, | ||
custom.perm.matrix = NULL, | ||
xlab = "Ontogeny", | ||
ylab = "Transcriptome Index", | ||
main = "", | ||
lwd = 4, | ||
alpha = 0.1, | ||
y.ticks = 10, ...) { | ||
|
||
removed_gene_list <- GATAI(ExpressionSet, ...) | ||
|
||
p_final <- myTAI::PlotSignature(dplyr::filter(ExpressionSet, !GeneID %in% as.character(removed_gene_list[[1]])), | ||
measure = measure, | ||
TestStatistic = TestStatistic, | ||
modules = modules, | ||
permutations = permutations, | ||
lillie.test = lillie.test, | ||
p.value = p.value, | ||
shaded.area = shaded.area, | ||
custom.perm.matrix = custom.perm.matrix, | ||
xlab = xlab, | ||
ylab = ylab, | ||
main = paste(nrow(removed_gene_list), "Removed genes"), | ||
lwd = lwd, | ||
alpha = alpha, | ||
y.ticks = y.ticks) | ||
|
||
# plot_final <- cowplot::plot_grid(p1, p2, labels = c('A', 'B'), label_size = 18) | ||
|
||
print(p_final) | ||
|
||
return(p_final) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,160 @@ | ||
#' @title Removed genes after GATAI gene removal and plots of evolutionary signatures across transcriptomes before, after GATAI gene removal and with randomly removed genes | ||
#' @description Main function to return the removed genes after GATAI gene removal and plots transcriptome indices pattern before, after GATAI gene removal and by removing random genes (by default same number as GATAI removed). | ||
#' @param ExpressionSet a standard PhyloExpressionSet, DivergenceExpressionSet or PolymorphismsExpressionSet object. | ||
#' @param measure type of transcriptome index that shall be computed. E.g. | ||
#' \itemize{ | ||
#' \item \code{measure = "TAI"} (Transcriptome Age Index) | ||
#' \item \code{measure = "TDI"} (Transcriptome Divergence Index) | ||
#' \item \code{measure = "TPI"} (Transcriptome Polymorphism Index) | ||
#' } | ||
#' @param TestStatistic a string defining the type of test statistics to be used to quantify the statistical significance the present phylotranscriptomics pattern. | ||
#' Possible values can be: | ||
#' \itemize{ | ||
#' \item \code{TestStatistic} = \code{"FlatLineTest"} : Statistical test for the deviation from a flat line | ||
#' \item \code{TestStatistic} = \code{"ReductiveHourglassTest"} : Statistical test for the existence of a hourglass shape (high-low-high pattern) | ||
#' \item \code{TestStatistic} = \code{"EarlyConservationTest"} : Statistical test for the existence of a early conservation pattern (low-high-high pattern) | ||
#' \item \code{TestStatistic} = \code{"LateConservationTest"} : Statistical test for the existence of a late conservation pattern (high-high-low pattern) | ||
#' \item \code{TestStatistic} = \code{"ReverseHourglassTest"} : Statistical test for the existence of a reverse hourglass pattern (low-high-low pattern) | ||
#' } | ||
#' @param modules a list storing three elements for the \code{\link{ReductiveHourglassTest}}, \code{\link{EarlyConservationTest}}, \code{\link{LateConservationTest}}, | ||
#' or \code{\link{ReverseHourglassTest}}: early, mid, and late. | ||
#' Each element expects a numeric vector specifying the developmental stages | ||
#' or experiments that correspond to each module. For example: | ||
#' \itemize{ | ||
#' \item \code{module} = \code{list(early = 1:2, mid = 3:5, late = 6:7)} divides a dataset storing seven developmental stages into 3 modules. | ||
#' } | ||
#' @param permutations a numeric value specifying the number of permutations to be performed for the \code{\link{FlatLineTest}}, \code{\link{EarlyConservationTest}}, \code{\link{LateConservationTest}}, \code{\link{ReductiveHourglassTest}} or \code{\link{ReverseHourglassTest}}. | ||
#' @param lillie.test a boolean value specifying whether the Lilliefors Kolmogorov-Smirnov Test shall be performed. | ||
#' @param p.value a boolean value specifying whether the p-value of the test statistic shall be printed as a subtitle. | ||
#' @param shaded.area a boolean value specifying whether a shaded area shall | ||
#' be drawn for the developmental stages defined to be the presumptive phylotypic period. | ||
#' @param custom.perm.matrix a custom \code{\link{bootMatrix}} (permutation matrix) to perform the underlying test statistic visualized by \code{PlotSignature}. Default is \code{custom.perm.matrix = NULL}. | ||
#' @param xlab label of x-axis. | ||
#' @param ylab label of y-axis. | ||
#' @param main figure title. | ||
#' @param lwd line width. | ||
#' @param alpha transparency of the shaded area (between [0,1]). Default is \code{alpha = 0.1}. | ||
#' @param y.ticks number of ticks on the y-axis. Default is \code{ticks = 10}. | ||
#' @param n_random_removal number of randomly removed genes for the third plot. Default is \code{gatai} (same number of genes removed by GATAI). | ||
#' @param \dots parameters passed on to \code{\link{GATAI}}. | ||
#' @author Filipa Martins Costa and Hajk-Georg Drost | ||
#' @export | ||
SignatureGATAIGeneRemoval <- function(ExpressionSet, | ||
measure = "TAI", | ||
TestStatistic = "FlatLineTest", | ||
modules = NULL, | ||
permutations = 1000, | ||
lillie.test = FALSE, | ||
p.value = TRUE, | ||
shaded.area = FALSE, | ||
custom.perm.matrix = NULL, | ||
xlab = "Ontogeny", | ||
ylab = "Transcriptome Index", | ||
main = "", | ||
lwd = 4, | ||
alpha = 0.1, | ||
y.ticks = 10, | ||
n_random_removal = "gatai", | ||
...) { | ||
|
||
removed_gene_list <- GATAI(ExpressionSet, ...) | ||
|
||
# original sample of genes | ||
P1 <- myTAI::PlotSignature(ExpressionSet, | ||
measure = measure, | ||
TestStatistic = TestStatistic, | ||
modules = modules, | ||
permutations = permutations, | ||
lillie.test = lillie.test, | ||
p.value = p.value, | ||
shaded.area = shaded.area, | ||
custom.perm.matrix = custom.perm.matrix, | ||
xlab = xlab, | ||
ylab = ylab, | ||
main = main, | ||
lwd = lwd, | ||
alpha = alpha, | ||
y.ticks = y.ticks) | ||
|
||
# removed genes | ||
P2 <- myTAI::PlotSignature(dplyr::filter(ExpressionSet, !GeneID %in% as.character(removed_gene_list[[1]])), | ||
measure = measure, | ||
TestStatistic = TestStatistic, | ||
modules = modules, | ||
permutations = permutations, | ||
lillie.test = lillie.test, | ||
p.value = p.value, | ||
shaded.area = shaded.area, | ||
custom.perm.matrix = custom.perm.matrix, | ||
xlab = xlab, | ||
ylab = ylab, | ||
main = main, | ||
lwd = lwd, | ||
alpha = alpha, | ||
y.ticks = y.ticks) | ||
|
||
if (is.character(n_random_removal) && n_random_removal == "gatai") { | ||
n_random_genes <- length(removed_gene_list[[1]]) | ||
} else if (is.numeric(n_random_removal)) { | ||
n_random_genes <- n_random_removal | ||
} else { | ||
message("Invalid value for 'n_random_removal'. It should be 'gatai' or a numeric value.") | ||
message("Removing the same number as GATAI.") | ||
n_random_genes <- length(removed_gene_list[[1]]) | ||
} | ||
|
||
random_removed_genes <- sample(ExpressionSet$GeneID, n_random_genes) | ||
|
||
P3 <- myTAI::PlotSignature(dplyr::filter(ExpressionSet, !GeneID %in% random_removed_genes), | ||
measure = measure, | ||
TestStatistic = TestStatistic, | ||
modules = modules, | ||
permutations = permutations, | ||
lillie.test = lillie.test, | ||
p.value = p.value, | ||
shaded.area = shaded.area, | ||
custom.perm.matrix = custom.perm.matrix, | ||
xlab = xlab, | ||
ylab = ylab, | ||
lwd = lwd, | ||
alpha = alpha, | ||
y.ticks = y.ticks) | ||
|
||
# save the data from both plots | ||
data_p1 <- ggplot2::ggplot_build(P1)$data[[1]] | ||
data_p2 <- ggplot2::ggplot_build(P2)$data[[1]] | ||
data_p3 <- ggplot2::ggplot_build(P3)$data[[1]] | ||
|
||
# deciding the y index range by looking at the minimum and maximum values of the line and shade from both plots | ||
min_y_p1 <- min(data_p1$y) | ||
max_y_p1 <- max(data_p1$y) | ||
min_y_p2 <- min(data_p2$y) | ||
max_y_p2 <- max(data_p2$y) | ||
min_y_p3 <- min(data_p3$y) | ||
max_y_p3 <- max(data_p3$y) | ||
min_y_p1_shading <- min(data_p1$ymin) | ||
max_y_p1_shading <- max(data_p1$ymax) | ||
min_y_p2_shading <- min(data_p2$ymin) | ||
max_y_p2_shading <- max(data_p2$ymax) | ||
min_y_p3_shading <- min(data_p3$ymin) | ||
max_y_p3_shading <- max(data_p3$ymax) | ||
min_y <- min(min_y_p1, min_y_p2, min_y_p3, min_y_p1_shading, min_y_p2_shading, min_y_p3_shading) | ||
max_y <- max(max_y_p1, max_y_p2, max_y_p3, max_y_p1_shading, max_y_p2_shading, max_y_p3_shading) | ||
|
||
|
||
P1 <- P1 + ggplot2::scale_y_continuous(limits = c(min_y-0.05, ymax = max_y+0.05), | ||
breaks = scales::breaks_pretty(n = y.ticks)) | ||
P2 <- P2 + ggplot2::scale_y_continuous(limits = c(min_y-0.05, ymax = max_y+0.05), | ||
breaks = scales::breaks_pretty(n = y.ticks)) | ||
|
||
P3 <- P3 + ggplot2::scale_y_continuous(limits = c(min_y-0.05, ymax = max_y+0.05), | ||
breaks = scales::breaks_pretty(n = y.ticks)) | ||
|
||
|
||
plots <- cowplot::plot_grid(P1, P2, P3, labels = c(paste("Original sample of genes:", nrow(ExpressionSet), "genes"), | ||
paste("GATAI:", nrow(removed_gene_list), "Removed genes"), paste(n_random_genes, "Randomly Removed Genes"))) | ||
print(plots) | ||
|
||
return(removed_gene_list) | ||
|
||
} |
Oops, something went wrong.