diff --git a/.Rbuildignore b/.Rbuildignore index 35b12209..cc6e0d44 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -9,6 +9,8 @@ ^fixVis$ ^experiment data$ ^docker$ +^Dockerfile$ +^.dockerignore$ ^private$ ^.gitlab-ci.yml$ ^CONTRIBUTION.md$ @@ -18,3 +20,5 @@ ^LICENSE$ ^immunarch-citation.xml$ ^.github$ +^.RDataFiles$ +^.idea$ diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..0f4a2b7d --- /dev/null +++ b/.dockerignore @@ -0,0 +1,4 @@ +.git* +.RDataFiles +.idea +*.Rproj diff --git a/.github/workflows/dockerhub.yml b/.github/workflows/dockerhub.yml new file mode 100644 index 00000000..a0cc6019 --- /dev/null +++ b/.github/workflows/dockerhub.yml @@ -0,0 +1,45 @@ +name: DockerHub Image + +on: + push: + branches: + - master + - dev + tags: + - "*" + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: Get Image Tag + shell: bash + run: echo "##[set-output name=tag;]$([ "${GITHUB_REF}" == "refs/heads/master" ] && echo base || echo ${GITHUB_REF##*/})" + id: get_tag + + - name: Check Out Repo + uses: actions/checkout@v2 + + - name: Login to Docker Hub + uses: docker/login-action@v1 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v1 + + - name: Build and push + id: docker_build + uses: docker/build-push-action@v2 + with: + context: ./ + file: ./Dockerfile + builder: ${{ steps.buildx.outputs.name }} + push: true + tags: ${{ secrets.DOCKERHUB_USERNAME }}/immunarch-docker:${{ steps.get_tag.outputs.tag }} + + - name: Image digest + run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/.gitignore b/.gitignore index 235b745a..6db83b5d 100644 --- a/.gitignore +++ b/.gitignore @@ -2,9 +2,10 @@ .Rhistory .RData .Ruserdata +.RDataFiles/* src/*.o src/*.so src/*.dll *.DS_Store docs/* - +.idea/* diff --git a/DESCRIPTION b/DESCRIPTION index 6c43c51c..f222e143 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,23 +1,25 @@ Package: immunarch Type: Package Title: Bioinformatics Analysis of T-Cell and B-Cell Immune Repertoires -Version: 0.6.5 +Version: 0.6.7 Authors@R: c( - person("Vadim I.", "Nazarov", , "vdm.nazarov@gmail.com", c("aut", "cre")), + person("Vadim I.", "Nazarov", , "support@immunomind.io", c("aut", "cre")), person("Vasily O.", "Tsvetkov", , role = "aut"), person("Eugene", "Rumynskiy", , role = "aut"), + person("Aleksandr A.", "Popov", , role = "aut"), + person("Ivan", "Balashov", , role = "aut"), person("Anna", "Lorenc", , role = "ctb"), person("Daniel J.", "Moore", , role = "ctb"), person("Victor", "Greiff", , role = "ctb"), person("ImmunoMind", role = c("cph", "fnd")) ) Contact: support@immunomind.io -Description: A comprehensive framework for bioinformatics exploratory analysis of bulk and single-cell - T-cell receptor and antibody repertoires. It provides seamless data loading, analysis and +Description: A comprehensive framework for bioinformatics exploratory analysis of bulk and single-cell + T-cell receptor and antibody repertoires. It provides seamless data loading, analysis and visualisation for AIRR (Adaptive Immune Receptor Repertoire) data, both bulk immunosequencing (RepSeq) - and single-cell sequencing (scRNAseq). It implements most of the widely used AIRR analysis methods, - such as: clonality analysis, estimation of repertoire similarities in distribution of clonotypes - and gene segments, repertoire diversity analysis, annotation of clonotypes using external immune receptor + and single-cell sequencing (scRNAseq). It implements most of the widely used AIRR analysis methods, + such as: clonality analysis, estimation of repertoire similarities in distribution of clonotypes + and gene segments, repertoire diversity analysis, annotation of clonotypes using external immune receptor databases and clonotype tracking in vaccination and cancer studies. A successor to our previously published 'tcR' immunoinformatics package (Nazarov 2015) . License: AGPL-3 @@ -33,25 +35,28 @@ Imports: circlize, MASS (>= 7.3), Rtsne (>= 0.15), - readr (>= 1.3.1), readxl (>= 1.3.1), shiny (>= 1.4.0), shinythemes, airr, ggseqlogo, - stringr (>= 1.4.0), ggalluvial (>= 0.10.0), Rcpp (>= 1.0), magrittr, - tibble (>= 2.0), methods, scales, ggpubr (>= 0.2), rlang (>= 0.4), plyr, - dbplyr (>= 1.4.0) + dbplyr (>= 1.4.0), + jsonlite, + readr, + stringr, + tibble, + tidyselect, + purrr Depends: - R (>= 3.5.0), + R (>= 4.0.0), ggplot2 (>= 3.1.0), dplyr (>= 0.8.0), dtplyr (>= 1.0.0), @@ -63,8 +68,9 @@ Suggests: roxygen2 (>= 3.0.0), testthat (>= 2.1.0), pkgdown (>= 0.1.0), - assertthat + assertthat, + rmarkdown VignetteBuilder: knitr Encoding: UTF-8 -RoxygenNote: 7.1.0 +RoxygenNote: 7.1.2 LazyData: true diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..cd07466d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,19 @@ +FROM r-base + +# Install apt dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential file libcurl4-openssl-dev libcairo2-dev libxml2-dev libssl-dev \ + libharfbuzz-dev libfribidi-dev libfreetype6-dev libpng-dev libtiff5-dev libjpeg-dev \ + && rm -rf /var/lib/apt/lists/* + +# Install R dependencies +RUN R -e "install.packages(c('remotes', 'svglite'))" + +# Copy source files to the image +COPY . /immunarch-src/ + +# Install Immunarch from source +RUN R -e "remotes::install_local('/immunarch-src', dependencies=TRUE)" + +# Delete Immunarch source from the image +RUN rm -rf /immunarch-src diff --git a/NAMESPACE b/NAMESPACE index f9df9f02..80ac3c2d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -43,6 +43,7 @@ export(cross_entropy) export(dbAnnotate) export(dbLoad) export(entropy) +export(exclude) export(fixVis) export(geneUsage) export(geneUsageAnalysis) @@ -56,10 +57,14 @@ export(immunr_mds) export(immunr_pca) export(immunr_tsne) export(inc_overlap) +export(include) export(inframes) +export(interval) export(js_div) export(kl_div) export(kmer_profile) +export(lessthan) +export(morethan) export(noncoding) export(outofframes) export(pubRep) @@ -73,6 +78,7 @@ export(public_matrix) export(repClonality) export(repDiversity) export(repExplore) +export(repFilter) export(repLoad) export(repOverlap) export(repOverlapAnalysis) @@ -127,7 +133,6 @@ importFrom(dplyr,rename) importFrom(dplyr,select) importFrom(dplyr,summarise) importFrom(dplyr,tally) -importFrom(dplyr,tbl_df) importFrom(dplyr,top_n) importFrom(dplyr,ungroup) importFrom(dtplyr,lazy_dt) @@ -151,12 +156,15 @@ importFrom(grDevices,colorRampPalette) importFrom(graphics,plot) importFrom(grid,gpar) importFrom(grid,rectGrob) +importFrom(jsonlite,read_json) +importFrom(magrittr,"%<>%") importFrom(magrittr,"%>%") importFrom(methods,as) importFrom(patchwork,plot_annotation) importFrom(patchwork,wrap_plots) importFrom(pheatmap,pheatmap) importFrom(plyr,mapvalues) +importFrom(purrr,map) importFrom(readr,col_character) importFrom(readr,col_double) importFrom(readr,col_guess) @@ -214,7 +222,9 @@ importFrom(stringr,str_order) importFrom(stringr,str_replace_all) importFrom(stringr,str_sort) importFrom(stringr,str_split) +importFrom(stringr,str_trim) importFrom(tibble,tibble) +importFrom(tidyselect,starts_with) importFrom(utils,packageVersion) importFrom(utils,read.table) importFrom(utils,setTxtProgressBar) diff --git a/R/RcppExports.R b/R/RcppExports.R index 44287e4d..a4ba46b8 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -2,10 +2,9 @@ # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 fill_vec <- function(read_vec, read_indices) { - .Call(`_immunarch_fill_vec`, read_vec, read_indices) + .Call(`_immunarch_fill_vec`, read_vec, read_indices) } fill_reads <- function(new_reads, new_counts) { - .Call(`_immunarch_fill_reads`, new_reads, new_counts) + .Call(`_immunarch_fill_reads`, new_reads, new_counts) } - diff --git a/R/diversity.R b/R/diversity.R index 62f69d39..460ea61e 100644 --- a/R/diversity.R +++ b/R/diversity.R @@ -184,8 +184,7 @@ repDiversity <- function(.data, .method = "chao1", .col = "aa", .max.q = 6, .min res <- add_class(res, new_class) return(res) - } - else if (.method == "raref") { + } else if (.method == "raref") { if (!has_class(.data, "list")) { .data <- list(Sample = .data) } @@ -203,8 +202,7 @@ repDiversity <- function(.data, .method = "chao1", .col = "aa", .max.q = 6, .min summarise(Div.count = sum(!!sym(IMMCOL$count))) %>% pull(Div.count) }) - } - else { + } else { .col <- process_col_argument(.col) if (has_class(.data, "data.table")) { @@ -406,7 +404,7 @@ rarefaction <- function(.data, .step = NA, .quantile = c(.025, .975), lo <- Sind hi <- Sind } - res <- c(sz, Sind, lo, hi) + res <- c(sz, lo, Sind, hi) names(res) <- c("Size", paste0("Q", .quantile[1]), "Mean", paste0("Q", .quantile[2])) if (.verbose) add_pb(pb) res diff --git a/R/dynamics.R b/R/dynamics.R index d3a9bdc1..59d4d03f 100644 --- a/R/dynamics.R +++ b/R/dynamics.R @@ -97,9 +97,7 @@ #' vis(tc, .order = sample_order) #' @export trackClonotypes trackClonotypes <- function(.data, .which = list(1, 15), .col = "aa", .norm = TRUE) { - if (!has_class(.data, "list")) { - stop("Error: please pass a list with immune repertoires to track clonotypes.") - } + .validate_repertoires_data(.data) if (length(.data) < 2) { stop("Error: please pass a list with 2 or more immune repertoires to track clonotypes.") } @@ -147,8 +145,8 @@ trackClonotypes <- function(.data, .which = list(1, 15), .col = "aa", .norm = TR result_df <- NULL for (i_df in 1:length(.data)) { temp_df <- .data[[i_df]] %>% - select(.col, Count = count_col) - setDT(temp_df) + select(.col, Count = count_col) %>% + as.data.table() if (.norm) { temp_df$Count <- temp_df$Count / sum(temp_df$Count) diff --git a/R/explore.R b/R/explore.R index 7ea4ba9b..3a6b7afc 100644 --- a/R/explore.R +++ b/R/explore.R @@ -101,7 +101,11 @@ repExplore <- function(.data, .method = c("volume", "count", "len", "clones"), . res <- do.call(rbind, res) res <- add_class(res, "immunr_exp_count") } else if (.method[1] %in% c("len", "lens", "length")) { - seq_col <- switch(.col[1], nt = IMMCOL$cdr3nt, aa = IMMCOL$cdr3aa, stop("Unknown sequence column: ", .col, ". Please provide either 'nt' or 'aa'")) + seq_col <- switch(.col[1], + nt = IMMCOL$cdr3nt, + aa = IMMCOL$cdr3aa, + stop("Unknown sequence column: ", .col, ". Please provide either 'nt' or 'aa'") + ) res <- lapply(.data, function(df) { if (has_class(df, "data.table")) { df <- df %>% lazy_dt() diff --git a/R/filters.R b/R/filters.R new file mode 100644 index 00000000..5c5cf8f4 --- /dev/null +++ b/R/filters.R @@ -0,0 +1,281 @@ +#' Main function for data filtering +#' +#' @concept filters +#' +#' @aliases repFilter filter_by_meta filter_by_repertoire filter_by_clonotype filter_table startswith_rows substring_rows validate_input_data include exclude lessthan morethan interval +#' +#' @importFrom magrittr "%>%" "%<>%" +#' @importFrom tidyselect starts_with +#' +#' @param .data The data to be processed. Must be the list of 2 elements: +#' data table and metadata table. +#' @param .method Method of filtering. Implemented methods: +#' by.meta, by.repertoire (by.rep), by.clonotype (by.cl) +#' Default value: 'by.clonotype'. +#' @param .query Filtering query. It's a named list of filters that will be applied +#' to data. +#' Possible values for names in this list are dependent on filter methods: +#' - by.meta: filter by metadata. Names in the named list are metadata column headers. +#' - by.repertoire: filter by number of clonotypes or total number of clones in sample. +#' Possible names in the named list are "n_clonotypes" and "n_clones". +#' - by.clonotype: filter by data in all samples. Names in the named list are +#' data column headers. +#' Elements of the named list for each of the filters are filtering options. +#' Possible values for filtering options: +#' - include("STR1", "STR2", ...): keep only rows with matching values. +#' Available for methods: "by.meta", "by.clonotype". +#' - exclude("STR1", "STR2", ...): remove rows with matching values. +#' Available for methods: "by.meta", "by.clonotype". +#' - lessthan(value): keep rows/samples with numeric values less than specified. +#' Available for methods: "by.meta", "by.repertoire", "by.clonotype". +#' - morethan(value): keep rows/samples with numeric values more than specified. +#' Available for methods: "by.meta", "by.repertoire", "by.clonotype". +#' - interval(from, to): keep rows/samples with numeric values that fits in this interval. +#' from is inclusive, to is exclusive. +#' Available for methods: "by.meta", "by.repertoire", "by.clonotype". +#' Default value: 'list(CDR3.aa = exclude("partial", "out_of_frame"))'. +#' @param .match Matching method for "include" and "exclude" options in query. +#' Possible values: +#' - exact: match only the exact specified string; +#' - startswith: match all strings starting with the specified substring; +#' - substring: match all strings containing the specified substring. +#' Default value: 'exact'. +#' +#' @examples +#' data(immdata) +#' +#' # Select samples with status "MS" +#' repFilter(immdata, "by.meta", list(Status = include("MS"))) +#' +#' # Select samples without status "MS" +#' repFilter(immdata, "by.meta", list(Status = exclude("MS"))) +#' +#' # Select samples from lanes "A" and "B" with age > 15 +#' repFilter(immdata, "by.meta", list(Lane = include("A", "B"), Age = morethan(15))) +#' +#' # Select samples that are not from lanes "A" and "B" +#' repFilter(immdata, "by.meta", list(Lane = exclude("A", "B"))) +#' +#' # Select samples with a number of clonotypes from 1000 to 5000 +#' repFilter(immdata, "by.repertoire", list(n_clonotypes = interval(1000, 5000))) +#' +#' # Select clonotypes in all samples with alpha chains +#' repFilter(immdata, "by.clonotype", +#' list(V.name = include("AV"), J.name = include("AJ")), +#' .match = "substring" +#' ) +#' @export repFilter include exclude lessthan morethan interval +repFilter <- function(.data, .method = "by.clonotype", + .query = list(CDR3.aa = exclude("partial", "out_of_frame")), + .match = "exact") { + .validate_immdata(.data) + + if (length(names(.query)) == 0) { + stop("Unnamed list could not be passed as query, please provide a named list!") + } + for (name in names(.query)) { + if (name == "") { + stop("Query list contains unnamed item!") + } + } + + if (!(.match %in% c("exact", "startswith", "substring"))) { + stop(paste0( + "Unknown matching method \"", .match, + "\"! Supported matching methods are: ", + "\"exact\", \"startswith\", \"substring\"." + )) + } + + filtered_data <- switch(tolower(.method), + by.meta = filter_by_meta(.data, .query, .match), + by.repertoire = filter_by_repertoire(.data, .query), + by.rep = filter_by_repertoire(.data, .query), + by.clonotype = filter_by_clonotype(.data, .query, .match), + by.cl = filter_by_clonotype(.data, .query, .match), + stop(paste0( + "You entered wrong method \"", .method, "\"! Supported methods are: ", + "\"by.meta\", \"by.repertoire\", \"by.clonotype\"." + )) + ) + # keep intact all .data elements except "data" and "meta" + filtered_data_full <- .data + filtered_data_full$data <- filtered_data$data + filtered_data_full$meta <- filtered_data$meta + return(filtered_data_full) +} + +filter_by_meta <- function(.data, .query, .match) { + filtered_meta <- .data$meta + for (i in seq_along(names(.query))) { + name <- names(.query)[i] + if (!(name %in% names(.data$meta))) { + stop(paste0("Column \"", name, "\" not found in metadata.")) + } + column_query <- .query[[name]] + query_type <- column_query[1] + query_args <- column_query[-1] + if (nrow(filtered_meta) > 0) { + filtered_meta %<>% filter_table(name, query_type, query_args, .match) + if (nrow(filtered_meta) == 0) { + warning(paste0("Filter by column \"", name, "\" removed all remaining samples!")) + } + } + } + filtered_data <- .data$data[filtered_meta$Sample] + + return(list(data = filtered_data, meta = filtered_meta)) +} + +filter_by_repertoire <- function(.data, .query) { + filtered_data <- .data$data + for (i in seq_along(names(.query))) { + name <- names(.query)[i] + key_query <- .query[[name]] + query_type <- key_query[[1]] + query_args <- key_query[-1] + if (length(filtered_data) > 0) { + counts <- switch(name, + n_clonotypes = lapply(filtered_data, nrow), + n_clones = lapply(filtered_data, function(sample) { + sum(sample$Clones) + }), + stop(paste0( + "Wrong filter key for \"by.repertoire\": \"", name, + "\"! Available keys: \"n_clonotypes\", \"n_clones\"." + )) + ) + + if (query_type == "lessthan") { + filtered_data <- filtered_data[ + counts < as_numeric_or_fail(query_args) + ] + } else if (query_type == "morethan") { + filtered_data <- filtered_data[ + counts > as_numeric_or_fail(query_args) + ] + } else if (query_type == "interval") { + from <- as_numeric_or_fail(query_args[[1]]) + to <- as_numeric_or_fail(query_args[[2]]) + filtered_data <- filtered_data[ + counts >= from & counts < to + ] + } + + if (length(filtered_data) == 0) { + warning(paste0("Filter by key \"", name, "\" removed all remaining samples!")) + } + } + } + filtered_meta <- subset(.data$meta, Sample %in% names(filtered_data)) + + return(list(data = filtered_data, meta = filtered_meta)) +} + +filter_by_clonotype <- function(.data, .query, .match) { + filtered_data <- .data$data + for (i in seq_along(names(.query))) { + name <- names(.query)[i] + column_query <- .query[[name]] + query_type <- column_query[[1]] + query_args <- column_query[-1] + for (j in seq_along(names(.data$data))) { + sample_name <- names(.data$data)[j] + if (sample_name %in% names(filtered_data)) { + sample <- filtered_data[[sample_name]] + if (!(name %in% names(sample))) { + stop(paste0("Column \"", name, "\" not found in sample \"", sample_name, "\".")) + } + if (nrow(sample) == 0) { + warning(paste0("Sample \"", sample_name, "\" was empty, removed!")) + } else { + sample %<>% filter_table(name, query_type, query_args, .match) + if (nrow(sample) == 0) { + warning(paste0( + "Filter by column \"", name, + "\" removed all remaining clonotypes from sample \"", + sample_name, "\"; sample was removed!" + )) + } + } + + if (nrow(sample) == 0) { + # removing empty sample + filtered_data <- filtered_data[names(filtered_data) != sample_name] + } else { + # updating sample in filtered_data + filtered_data[[sample_name]] <- sample + } + } + } + } + filtered_meta <- subset(.data$meta, Sample %in% names(filtered_data)) + + return(list(data = filtered_data, meta = filtered_meta)) +} + +filter_table <- function(.table, .column_name, .query_type, .query_args, .match) { + if (.query_type == "include") { + if (.match == "exact") { + .table %<>% subset(get(.column_name) %in% .query_args) + } else if (.match == "startswith") { + .table <- .table[startswith_rows(.table, .column_name, .query_args), ] + } else if (.match == "substring") { + .table <- .table[substring_rows(.table, .column_name, .query_args), ] + } + } else if (.query_type == "exclude") { + if (.match == "exact") { + .table %<>% subset(!get(.column_name) %in% .query_args) + } else if (.match == "startswith") { + .table <- .table[-startswith_rows(.table, .column_name, .query_args), ] + } else if (.match == "substring") { + .table <- .table[-substring_rows(.table, .column_name, .query_args), ] + } + } else if (.query_type == "lessthan") { + .table %<>% subset(get(.column_name) < as_numeric_or_fail(.query_args)) + } else if (.query_type == "morethan") { + .table %<>% subset(get(.column_name) > as_numeric_or_fail(.query_args)) + } else if (.query_type == "interval") { + .table %<>% subset(get(.column_name) >= as_numeric_or_fail(.query_args[[1]])) + .table %<>% subset(get(.column_name) < as_numeric_or_fail(.query_args[[2]])) + } + return(.table) +} + +# return indices for rows with "startswith" match +startswith_rows <- function(.table, .column_name, .query_args) { + starts_with(match = .query_args, vars = .table[[.column_name]], ignore.case = FALSE) +} + +# return indices for rows with "substring" match +substring_rows <- function(.table, .column_name, .query_args) { + unique(unlist(lapply(.query_args, grep, .table[[.column_name]], fixed = TRUE))) +} + +include <- function(...) { + args <- unlist(unname(list(...))) + if (length(args) == 0) { + stop("include() expects at least 1 argument!") + } + return(c("include", args)) +} + +exclude <- function(...) { + args <- unlist(unname(list(...))) + if (length(args) == 0) { + stop("exclude() expects at least 1 argument!") + } + return(c("exclude", args)) +} + +lessthan <- function(value) { + return(c("lessthan", unname(value))) +} + +morethan <- function(value) { + return(c("morethan", unname(value))) +} + +interval <- function(from, to) { + return(c("interval", unname(from), unname(to))) +} diff --git a/R/gene_usage.R b/R/gene_usage.R index 8eda15eb..df1b3c96 100644 --- a/R/gene_usage.R +++ b/R/gene_usage.R @@ -32,7 +32,15 @@ if (getRversion() >= "2.15.1") { #' @param .quant Select the column with data to evaluate. #' Pass NA if you want to compute gene statistics at the clonotype level without re-weighting. #' Pass "count" to use the "Clones" column to weight genes by abundance of their corresponding clonotypes. -#' @param .ambig An option to handle ambiguous data. We recommend to turn in on by passing "inc" (turned on by default). +#' @param .ambig An option to handle ambiguous gene assigments, e.g., "TRAV1,TRAV2". +#' +#' +#' - Pass "inc" to include all possible gene segments, so "TRAV1,TRAV2" is counted as a different gene segment. +#' +#' - Pass "exc" to exclude all ambiguous gene assignments, so "TRAV1,TRAV2" is excluded from the resultant gene table. +#' +#' +#' We recommend to turn in on by passing "inc" (turned on by default). #' You can exclude data for the cases where #' there is no clear match for gene, include it for every supplied gene, #' or pick only first from the set. Set it to "exc", "inc" or "maj", correspondingly. @@ -149,8 +157,7 @@ geneUsage <- function(.data, # df, list, MonetDB Names = names(chosen), Count = unlist(chosen), stringsAsFactors = FALSE ) - } - else if (.ambig == "maj") { + } else if (.ambig == "maj") { maj_counts <- data.frame(Names = sapply(gene_col_split, "[[", 1), Counts = counts, stringsAsFactors = FALSE) row.names(maj_counts) <- NULL res <- maj_counts %>% diff --git a/R/immunr_data_format.R b/R/immunr_data_format.R index 1b3f5662..b3a29c6f 100644 --- a/R/immunr_data_format.R +++ b/R/immunr_data_format.R @@ -42,7 +42,7 @@ #' NULL -# Immunarch column names +# Immunarch basic column names IMMCOL <- new.env() IMMCOL$count <- "Clones" @@ -73,9 +73,30 @@ IMMCOL$type <- c( "integer", "integer", "integer", "character" ) +# TODO: move V/D/J coordinates here +# Immunarch extended column names +IMMCOL_EXT <- new.env() + +IMMCOL_EXT$cdr1nt <- "CDR1.nt" +IMMCOL_EXT$cdr1aa <- "CDR1.aa" +IMMCOL_EXT$cdr2nt <- "CDR2.nt" +IMMCOL_EXT$cdr2aa <- "CDR2.aa" +IMMCOL_EXT$fr1nt <- "FR1.nt" +IMMCOL_EXT$fr1aa <- "FR1.aa" +IMMCOL_EXT$fr2nt <- "FR2.nt" +IMMCOL_EXT$fr2aa <- "FR2.aa" +IMMCOL_EXT$fr3nt <- "FR3.nt" +IMMCOL_EXT$fr3aa <- "FR3.aa" +IMMCOL_EXT$fr4nt <- "FR4.nt" +IMMCOL_EXT$fr4aa <- "FR4.aa" +IMMCOL_EXT$c <- "C.name" +IMMCOL_EXT$order <- c() + # Additional information IMMCOL_ADD <- new.env() -# separator for paired chain data +# Separator for values inside columns, e.g., V assignments. +IMMCOL_ADD$valsep <- "," +# Separator for paired chain data IMMCOL_ADD$scsep <- ";" IMMUNR_ERROR_NOT_IMPL <- "ERROR: not implemented yet. Please contact us via support@immunomind.io if you need it in your research." diff --git a/R/io-parsers.R b/R/io-parsers.R new file mode 100644 index 00000000..455f9b6d --- /dev/null +++ b/R/io-parsers.R @@ -0,0 +1,1197 @@ +parse_repertoire <- function(.filename, .mode, .nuc.seq, .aa.seq, .count, + .vgenes, .jgenes, .dgenes, + .vend, .jstart, .dstart, .dend, + .vd.insertions, .dj.insertions, .total.insertions, + .skip = 0, .sep = "\t", .add = NA) { + .nuc.seq <- .make_names(.nuc.seq) + .aa.seq <- .make_names(.aa.seq) + .count <- .make_names(.count) + .vgenes <- .make_names(.vgenes) + .jgenes <- .make_names(.jgenes) + .dgenes <- .make_names(.dgenes) + .vend <- .make_names(.vend) + .jstart <- .make_names(.jstart) + .vd.insertions <- .make_names(.vd.insertions) + .dj.insertions <- .make_names(.dj.insertions) + .total.insertions <- .make_names(.total.insertions) + .dstart <- .make_names(.dstart) + .dend <- .make_names(.dend) + .add <- .make_names(.add) + + col.classes <- .get_coltypes(.filename, .nuc.seq, .aa.seq, .count, + .vgenes, .jgenes, .dgenes, + .vend, .jstart, .dstart, .dend, + .vd.insertions, .dj.insertions, .total.insertions, + .skip = .skip, .sep = "\t", .add + ) + + # IO_REFACTOR + suppressMessages(df <- readr::read_delim(.filename, + col_names = TRUE, + col_types = col.classes, delim = .sep, + quote = "", escape_double = FALSE, + comment = "", trim_ws = TRUE, + skip = .skip, na = c("", "NA", ".") + )) + # suppressMessages(df <- fread(.filename, skip = .skip, data.table = FALSE, na.strings = c("", "NA", "."))) + + names(df) <- tolower(names(df)) + recomb_type <- .which_recomb_type(df[[.vgenes]]) + + table.colnames <- names(col.classes) + + df[[.nuc.seq]] <- toupper(df[[.nuc.seq]]) + + if (is.na(.aa.seq)) { + df$CDR3.amino.acid.sequence <- bunch_translate(df[[.nuc.seq]]) + .aa.seq <- "CDR3.amino.acid.sequence" + } + + if (is.na(.count)) { + .count <- "Count" + df$Count <- 1 + } + + df$Proportion <- df[[.count]] / sum(df[[.count]]) + .prop <- "Proportion" + + ins_ok <- FALSE + if (is.na(.vd.insertions)) { + .vd.insertions <- "VD.insertions" + df$VD.insertions <- NA + } + + if (!(.vd.insertions %in% table.colnames)) { + .vd.insertions <- "VD.insertions" + df$VD.insertions <- NA + + if (!is.na(.vend) && !is.na(.dstart)) { + if (!is.na(recomb_type) && recomb_type == "VDJ") { + df$VD.insertions <- df[[.dstart]] - df[[.vend]] - 1 + df$VD.insertions[is.na(df[[.dstart]])] <- NA + df$VD.insertions[is.na(df[[.vend]])] <- NA + + ins_ok <- TRUE + } + } + } + + if (!ins_ok) { + df$V.end <- NA + df$D.start <- NA + df$D.end <- NA + .vend <- "V.end" + .dstart <- "D.start" + .dend <- "D.end" + } + + ins_ok <- FALSE + if (is.na(.dj.insertions)) { + .dj.insertions <- "DJ.insertions" + df$DJ.insertions <- NA + } + + if (!(.dj.insertions %in% table.colnames)) { + .dj.insertions <- "DJ.insertions" + df$DJ.insertions <- NA + + if (!is.na(.jstart) && !is.na(.dend)) { + if (!is.na(recomb_type) && recomb_type == "VDJ") { + df$DJ.insertions <- df[[.jstart]] - df[[.dend]] - 1 + df$DJ.insertions[is.na(df[[.dend]])] <- NA + df$DJ.insertions[is.na(df[[.jstart]])] <- NA + + ins_ok <- TRUE + } + } + } + if (!ins_ok) { + df$J.start <- NA + df$D.start <- NA + df$D.end <- NA + .jstart <- "J.start" + .dstart <- "D.start" + .dend <- "D.end" + } + + ins_ok <- FALSE + if (is.na(.total.insertions)) { + .total.insertions <- "Total.insertions" + df$Total.insertions <- NA + } + + if (!(.total.insertions %in% table.colnames)) { + .total.insertions <- "Total.insertions" + df$Total.insertions <- -1 + if (!is.na(recomb_type)) { + if (recomb_type == "VJ") { + df$Total.insertions <- df[[.jstart]] - df[[.vend]] - 1 + df$Total.insertions[df$Total.insertions < 0] <- 0 + } else if (recomb_type == "VDJ") { + df$Total.insertions <- df[[.vd.insertions]] + df[[.dj.insertions]] + } + } else { + df$Total.insertions <- NA + } + } + + vdj <- c(.vgenes, .dgenes, .jgenes) + names(vdj) <- c(".vgenes", ".dgenes", ".jgenes") + # if V, D or J columns are missing in the data, add empty columns + for (i in length(vdj)) { + if (is.na(vdj[[i]])) { + # if genes header argument is NA, use ".vgenes", ".dgenes" or ".jgenes" as column name + genes_header <- names(vdj)[i] + vdj[[i]] <- genes_header + # add empty column with this name to parsed dataframe + df[, genes_header] <- NA + } + } + + vec_names <- c( + .count, .prop, .nuc.seq, .aa.seq, + vdj[[".vgenes"]], vdj[[".dgenes"]], vdj[[".jgenes"]], + .vend, .dstart, .dend, .jstart, + .total.insertions, .vd.insertions, .dj.insertions + ) + if (!is.na(.add[1])) { + vec_names <- c(vec_names, .add) + } + + df <- df[, vec_names] + + colnames(df)[1] <- IMMCOL$count + colnames(df)[2] <- IMMCOL$prop + colnames(df)[3] <- IMMCOL$cdr3nt + colnames(df)[4] <- IMMCOL$cdr3aa + colnames(df)[5] <- IMMCOL$v + colnames(df)[6] <- IMMCOL$d + colnames(df)[7] <- IMMCOL$j + colnames(df)[8] <- IMMCOL$ve + colnames(df)[9] <- IMMCOL$ds + colnames(df)[10] <- IMMCOL$de + colnames(df)[11] <- IMMCOL$js + colnames(df)[12] <- IMMCOL$vnj + colnames(df)[13] <- IMMCOL$vnd + colnames(df)[14] <- IMMCOL$dnj + + .postprocess(df, .mode) +} + +parse_immunoseq <- function(.filename, .mode, .wash.alleles = TRUE) { + .fix.immunoseq.genes <- function(.col) { + # fix "," + .col <- gsub(",", ", ", .col, fixed = TRUE, useBytes = TRUE) + # fix forward zeros + .col <- gsub("-([0])([0-9])", "-\\2", .col, useBytes = TRUE) + .col <- gsub("([VDJ])([0])([0-9])", "\\1\\3", .col, useBytes = TRUE) + # fix gene names + .col <- gsub("TCR", "TR", .col, fixed = TRUE, useBytes = TRUE) + .col + } + + filename <- .filename + file_cols <- list() + file_cols[[IMMCOL$count]] <- "templates" + file_cols[[IMMCOL$cdr3nt]] <- "rearrangement" + file_cols[[IMMCOL$cdr3aa]] <- "amino_acid" + file_cols[[IMMCOL$v]] <- "v_resolved" + file_cols[[IMMCOL$d]] <- "d_resolved" + file_cols[[IMMCOL$j]] <- "j_resolved" + file_cols[[IMMCOL$vnj]] <- "n1_insertions" + file_cols[[IMMCOL$vnd]] <- "n1_insertions" + file_cols[[IMMCOL$dnj]] <- "n2_insertions" + + v_index_col_name <- "v_index" + d_index_col_name <- "d_index" + j_index_col_name <- "j_index" + n1_index_col_name <- "n1_index" + n2_index_col_name <- "n2_index" + + # + # Check for the version of ImmunoSEQ files + # + f <- file(.filename, "r") + l <- readLines(f, 2) + close(f) + if (str_detect(l[[1]], "v_gene") && !str_detect(l[[1]], "v_resolved")) { + file_cols[[IMMCOL$v]] <- "v_gene" + file_cols[[IMMCOL$d]] <- "d_gene" + file_cols[[IMMCOL$j]] <- "j_gene" + } else if (str_detect(l[[1]], "MaxResolved")) { + file_cols[[IMMCOL$v]] <- "vMaxResolved" + file_cols[[IMMCOL$d]] <- "dMaxResolved" + file_cols[[IMMCOL$j]] <- "jMaxResolved" + + file_cols[[IMMCOL$vnj]] <- "n1insertion" + file_cols[[IMMCOL$vnd]] <- "n1insertion" + file_cols[[IMMCOL$dnj]] <- "n2insertion" + } + + + l_split <- strsplit(l, "\t") + if (str_detect(l[[1]], "templates")) { + if (str_detect(l[[1]], "templates/reads")) { + file_cols[[IMMCOL$count]] <- "count (templates/reads)" + file_cols[[IMMCOL$cdr3nt]] <- "nucleotide" + file_cols[[IMMCOL$cdr3aa]] <- "aminoAcid" + } else if (l_split[[2]][match("templates", l_split[[1]])] == "null") { + file_cols[[IMMCOL$count]] <- "reads" + } + } + + if (!str_detect(l[[1]], "v_index")) { + v_index_col_name <- "vindex" + d_index_col_name <- "dindex" + j_index_col_name <- "jindex" + n1_index_col_name <- "n1index" + n2_index_col_name <- "n2index" + } + + for (col_name in names(file_cols)) { + file_cols[[col_name]] <- .make_names(file_cols[[col_name]]) + } + + file_cols[[IMMCOL$prop]] <- IMMCOL$prop + file_cols[[IMMCOL$ve]] <- IMMCOL$ve + file_cols[[IMMCOL$ds]] <- IMMCOL$ds + file_cols[[IMMCOL$de]] <- IMMCOL$de + file_cols[[IMMCOL$js]] <- IMMCOL$js + file_cols[[IMMCOL$seq]] <- IMMCOL$seq + + # IO_REFACTOR + suppressMessages(df <- readr::read_delim(.filename, + col_names = TRUE, col_types = cols(), + delim = "\t", quote = "", + escape_double = FALSE, comment = "", + trim_ws = TRUE, skip = 0 + )) + # suppressMessages(df <- fread(.filename, data.table = FALSE)) + + names(df) <- tolower(names(df)) + + df[[file_cols[[IMMCOL$prop]]]] <- df[[file_cols[[IMMCOL$count]]]] / sum(df[[file_cols[[IMMCOL$count]]]]) + + # Save full nuc sequences and cut them down to CDR3 + df[[IMMCOL$seq]] <- df[[file_cols[[IMMCOL$cdr3nt]]]] + + # TODO: what if df[["v_index]] has "-1" or something like that? + df[[file_cols[[IMMCOL$cdr3nt]]]] <- stringr::str_sub(df[[IMMCOL$seq]], df[[v_index_col_name]] + 1, nchar(df[[IMMCOL$seq]])) + + df[[file_cols[[IMMCOL$v]]]] <- .fix.immunoseq.genes(df[[file_cols[[IMMCOL$v]]]]) + df[[file_cols[[IMMCOL$d]]]] <- .fix.immunoseq.genes(df[[file_cols[[IMMCOL$d]]]]) + df[[file_cols[[IMMCOL$j]]]] <- .fix.immunoseq.genes(df[[file_cols[[IMMCOL$j]]]]) + + recomb_type <- "VDJ" + if (recomb_type == "VDJ") { + df[[file_cols[[IMMCOL$ve]]]] <- df[[n1_index_col_name]] - df[[v_index_col_name]] + df[[file_cols[[IMMCOL$ds]]]] <- df[[d_index_col_name]] - df[[v_index_col_name]] + df[[file_cols[[IMMCOL$de]]]] <- df[[n2_index_col_name]] - df[[v_index_col_name]] + df[[file_cols[[IMMCOL$js]]]] <- df[[j_index_col_name]] - df[[v_index_col_name]] + file_cols[[IMMCOL$vnj]] <- IMMCOL$vnj + df[[IMMCOL$vnj]] <- -1 + } + + sample_name_vec <- NA + if ("sample_name" %in% colnames(df)) { + if (length(unique(df[["sample_name"]])) > 1) { + sample_name_vec <- df[["sample_name"]] + } + } + + df <- df[unlist(file_cols[IMMCOL$order])] + names(df) <- IMMCOL$order + + if (.wash.alleles) { + df <- .remove.alleles(df) + df[[IMMCOL$v]] <- gsub("([VDJ][0-9]*)$", "\\1-1", df[[IMMCOL$v]], useBytes = TRUE) + df[[IMMCOL$j]] <- gsub("([VDJ][0-9]*)$", "\\1-1", df[[IMMCOL$j]], useBytes = TRUE) + } + + if (nrow(df) > 0) { + if (has_class(df[[IMMCOL$vnj]], "character")) { + df[[IMMCOL$vnj]][df[[IMMCOL$vnj]] == "no data"] <- NA + } + if (has_class(df[[IMMCOL$vnd]], "character")) { + df[[IMMCOL$vnd]][df[[IMMCOL$vnd]] == "no data"] <- NA + } + if (has_class(df[[IMMCOL$dnj]], "character")) { + df[[IMMCOL$dnj]][df[[IMMCOL$dnj]] == "no data"] <- NA + } + + df[[IMMCOL$vnj]] <- as.integer(df[[IMMCOL$vnj]]) + df[[IMMCOL$vnd]] <- as.integer(df[[IMMCOL$vnd]]) + df[[IMMCOL$dnj]] <- as.integer(df[[IMMCOL$dnj]]) + } + + df[[IMMCOL$v]][df[[IMMCOL$v]] == "unresolved"] <- NA + df[[IMMCOL$d]][df[[IMMCOL$d]] == "unresolved"] <- NA + df[[IMMCOL$j]][df[[IMMCOL$j]] == "unresolved"] <- NA + + if (!is.na(sample_name_vec[1])) { + df <- lapply(split(df, sample_name_vec), .postprocess) + } else { + .postprocess(df) + } +} + +parse_mitcr <- function(.filename, .mode) { + .skip <- 0 + f <- file(.filename, "r") + l <- readLines(f, 1) + # Check for different levels of the MiTCR output + if (any(stringr::str_detect(l, c("MiTCRFullExport", "mitcr")))) { + .skip <- 1 + } + mitcr_format <- 1 + if (stringr::str_detect(l, "MiTCRFullExport") || .skip == 0) { + mitcr_format <- 2 + } + close(f) + + if (mitcr_format == 1) { + filename <- .filename + .count <- "count" + nuc.seq <- "cdr3nt" + aa.seq <- "cdr3aa" + vgenes <- "v" + jgenes <- "j" + dgenes <- "d" + vend <- "VEnd" + jstart <- "JStart" + dstart <- "DStart" + dend <- "DEnd" + vd.insertions <- NA + dj.insertions <- NA + total.insertions <- NA + .sep <- "\t" + } else { + # Check if there are barcodes + f <- file(.filename, "r") + l <- readLines(f, 1 + .skip)[.skip + 1] + barcodes <- NA + .count <- "Read count" + if ("NNNs" %in% strsplit(l, "\t", TRUE)[[1]]) { + .count <- "NNNs" + } + close(f) + + filename <- .filename + nuc.seq <- "CDR3 nucleotide sequence" + aa.seq <- "CDR3 amino acid sequence" + vgenes <- "V segments" + jgenes <- "J segments" + dgenes <- "D segments" + vend <- "Last V nucleotide position" + jstart <- "First J nucleotide position" + dstart <- "First D nucleotide position" + dend <- "Last D nucleotide position" + vd.insertions <- "VD insertions" + dj.insertions <- "DJ insertions" + total.insertions <- "Total insertions" + .sep <- "\t" + } + + parse_repertoire( + .filename = filename, .mode = .mode, .nuc.seq = nuc.seq, .aa.seq = aa.seq, .count = .count, .vgenes = vgenes, .jgenes = jgenes, .dgenes = dgenes, + .vend = vend, .jstart = jstart, .dstart = dstart, .dend = dend, + .vd.insertions = vd.insertions, .dj.insertions = dj.insertions, + .total.insertions = total.insertions, .skip = .skip, .sep = .sep + ) +} + +parse_mixcr <- function(.filename, .mode) { + fix.alleles <- function(.data) { + .data[[IMMCOL$v]] <- gsub("[*][[:digit:]]*", "", .data[[IMMCOL$v]]) + .data[[IMMCOL$d]] <- gsub("[*][[:digit:]]*", "", .data[[IMMCOL$d]]) + .data[[IMMCOL$j]] <- gsub("[*][[:digit:]]*", "", .data[[IMMCOL$j]]) + .data + } + + .filename <- .filename + .count <- "clonecount" + .sep <- "\t" + .vend <- "allvalignments" + .jstart <- "alljalignments" + .dalignments <- "alldalignments" + .vd.insertions <- "VD.insertions" + .dj.insertions <- "DJ.insertions" + .total.insertions <- "Total.insertions" + + table.colnames <- tolower(make.names(read.table(.filename, sep = .sep, skip = 0, nrows = 1, stringsAsFactors = FALSE, strip.white = TRUE, comment.char = "", quote = "")[1, ])) + table.colnames <- gsub(".", "", table.colnames, fixed = TRUE) + + # Columns of different MiXCR formats + # Clone count - Clonal sequence(s) - N. Seq. CDR3 + # cloneCount - clonalSequence - nSeqCDR3 + # cloneCount - targetSequences - nSeqImputedCDR3 + # cloneCount - targetSequences - nSeqCDR3 + + # TODO: when refactoring, CDR headers can be implemented as objects of data class + # that contains headers for nucleotide sequence and amino acid sequence (such as "nseqcdr3"), + # and IDs for these headers (such as ".nuc.seq.cdr3") + SEQ_NUM <- 3 + nuc_headers <- rep(NA, SEQ_NUM) + aa_headers <- rep(NA, SEQ_NUM) + names(nuc_headers) <- c(".nuc.seq.cdr1", ".nuc.seq.cdr2", ".nuc.seq.cdr3") + names(aa_headers) <- c(".aa.seq.cdr1", ".aa.seq.cdr2", ".aa.seq.cdr3") + # configure headers for nucleotide sequences for CDR1, CDR2, CDR3 + for (i in 1:SEQ_NUM) { + cdr <- paste0("cdr", i) + if ("targetsequences" %in% table.colnames) { + if (paste0("nseqimputed", cdr) %in% table.colnames) { + nuc_headers[[i]] <- paste0("nseqimputed", cdr) + } else { + nuc_headers[[i]] <- paste0("nseq", cdr) + } + } else { + nuc_headers[[i]] <- paste0("nseq", cdr) + } + } + + if ("targetsequences" %in% table.colnames) { + .big.seq <- "targetsequences" + } else { + if ("clonalsequences" %in% table.colnames) { + .big.seq <- "clonalsequences" + } else if ("clonalsequence" %in% table.colnames) { + .big.seq <- "clonalsequence" + } else { + .big.seq <- NA + } + } + + if (!("allvalignments" %in% table.colnames)) { + if ("allvalignment" %in% table.colnames) { + .vend <- "allvalignment" + } else { + .vend <- NA + } + } + if (!("alldalignments" %in% table.colnames)) { + if ("alldalignment" %in% table.colnames) { + .dalignments <- "alldalignment" + } else { + .dalignments <- NA + } + } + if (!("alljalignments" %in% table.colnames)) { + if ("alljalignment" %in% table.colnames) { + .jstart <- "alljalignment" + } else { + .jstart <- NA + } + } + + if ("bestvhit" %in% table.colnames) { + .vgenes <- "bestvhit" + } else if ("allvhits" %in% table.colnames) { + .vgenes <- "allvhits" + } else if ("vhits" %in% table.colnames) { + .vgenes <- "vhits" + } else if ("allvhitswithscore" %in% table.colnames) { + .vgenes <- "allvhitswithscore" + } else if ("bestvgene" %in% table.colnames) { + .vgenes <- "bestvgene" + } else { + message("Error: can't find a column with V genes") + } + + if ("bestjhit" %in% table.colnames) { + .jgenes <- "bestjhit" + } else if ("alljhits" %in% table.colnames) { + .jgenes <- "alljhits" + } else if ("jhits" %in% table.colnames) { + .jgenes <- "jhits" + } else if ("alljhitswithscore" %in% table.colnames) { + .jgenes <- "alljhitswithscore" + } else if ("bestjgene" %in% table.colnames) { + .jgenes <- "bestjgene" + } else { + message("Error: can't find a column with J genes") + } + + if ("bestdhit" %in% table.colnames) { + .dgenes <- "bestdhit" + } else if ("alldhits" %in% table.colnames) { + .dgenes <- "alldhits" + } else if ("dhits" %in% table.colnames) { + .dgenes <- "dhits" + } else if ("alldhitswithscore" %in% table.colnames) { + .dgenes <- "alldhitswithscore" + } else if ("bestdgene" %in% table.colnames) { + .dgenes <- "bestdgene" + } else { + message("Error: can't find a column with D genes") + } + + + # IO_REFACTOR + df <- read_delim( + file = .filename, col_types = cols(), + delim = .sep, skip = 0, comment = "", + quote = "", escape_double = FALSE, trim_ws = TRUE + ) + # df <- fread(.filename, data.table = FALSE) + + # + # return NULL if there is no clonotypes in the data frame + # + if (nrow(df) == 0) { + return(NULL) + } + + names(df) <- make.names(names(df)) + names(df) <- tolower(gsub(".", "", names(df), fixed = TRUE)) + names(df) <- str_replace_all(names(df), " ", "") + + # check for VJ or VDJ recombination + # VJ / VDJ / Undeterm + recomb_type <- "Undeterm" + if (sum(substr(head(df)[[.vgenes]], 1, 4) %in% c("TCRA", "TRAV", "TRGV", "IGKV", "IGLV"))) { + recomb_type <- "VJ" + } else if (sum(substr(head(df)[[.vgenes]], 1, 4) %in% c("TCRB", "TRBV", "TRDV", "IGHV"))) { + recomb_type <- "VDJ" + } + + if (!is.na(.vend) && !is.na(.jstart)) { + .vd.insertions <- "VD.insertions" + df$VD.insertions <- -1 + if (recomb_type == "VJ") { + df$VD.insertions <- -1 + } else if (recomb_type == "VDJ") { + logic <- sapply(strsplit(df[[.dalignments]], "|", TRUE, FALSE, TRUE), length) >= 4 & + sapply(strsplit(df[[.vend]], "|", TRUE, FALSE, TRUE), length) >= 5 + df$VD.insertions[logic] <- + as.numeric(sapply(strsplit(df[[.dalignments]][logic], "|", TRUE, FALSE, TRUE), "[[", 4)) - + as.numeric(sapply(strsplit(df[[.vend]][logic], "|", TRUE, FALSE, TRUE), "[[", 5)) - 1 + } + + .dj.insertions <- "DJ.insertions" + df$DJ.insertions <- -1 + if (recomb_type == "VJ") { + df$DJ.insertions <- -1 + } else if (recomb_type == "VDJ") { + logic <- sapply(strsplit(df[[.jstart]], "|", TRUE, FALSE, TRUE), length) >= 4 & + sapply(strsplit(df[[.dalignments]], "|", TRUE, FALSE, TRUE), length) >= 5 + df$DJ.insertions[logic] <- + as.numeric(sapply(strsplit(df[[.jstart]][logic], "|", TRUE, FALSE, TRUE), "[[", 4)) - + as.numeric(sapply(strsplit(df[[.dalignments]][logic], "|", TRUE, FALSE, TRUE), "[[", 5)) - 1 + } + + # VJ.insertions + logic <- (sapply(strsplit(df[[.vend]], "|", TRUE, FALSE, TRUE), length) > 4) & (sapply(strsplit(df[[.jstart]], "|", TRUE, FALSE, TRUE), length) >= 4) + .total.insertions <- "Total.insertions" + if (recomb_type == "VJ") { + df$Total.insertions <- NA + if (length(which(logic)) > 0) { + df$Total.insertions[logic] <- + as.numeric(sapply(strsplit(df[[.jstart]][logic], "|", TRUE, FALSE, TRUE), "[[", 4)) - as.numeric(sapply(strsplit(df[[.vend]][logic], "|", TRUE, FALSE, TRUE), "[[", 5)) - 1 + } + } else if (recomb_type == "VDJ") { + df$Total.insertions <- df[[.vd.insertions]] + df[[.dj.insertions]] + } else { + df$Total.insertions <- NA + } + df$Total.insertions[df$Total.insertions < 0] <- -1 + + df$V.end <- -1 + df$J.start <- -1 + df[[.vend]] <- gsub(";", "", df[[.vend]], fixed = TRUE) + logic <- sapply(strsplit(df[[.vend]], "|", TRUE, FALSE, TRUE), length) >= 5 + df$V.end[logic] <- sapply(strsplit(df[[.vend]][logic], "|", TRUE, FALSE, TRUE), "[[", 5) + logic <- sapply(strsplit(df[[.jstart]], "|", TRUE, FALSE, TRUE), length) >= 4 + df$J.start[logic] <- sapply(strsplit(df[[.jstart]][logic], "|", TRUE, FALSE, TRUE), "[[", 4) + } else { + df$V.end <- -1 + df$J.start <- -1 + df$Total.insertions <- -1 + df$VD.insertions <- -1 + df$DJ.insertions <- -1 + + .dj.insertions <- "DJ.insertions" + .vd.insertions <- "VD.insertions" + } + + .vend <- "V.end" + .jstart <- "J.start" + + if (!is.na(.dalignments)) { + logic <- sapply(str_split(df[[.dalignments]], "|"), length) >= 5 + df$D5.end <- -1 + df$D3.end <- -1 + df$D5.end[logic] <- sapply(str_split(df[[.dalignments]][logic], "|"), "[[", 4) + df$D3.end[logic] <- sapply(str_split(df[[.dalignments]][logic], "|"), "[[", 5) + .dalignments <- c("D5.end", "D3.end") + } else { + df$D5.end <- -1 + df$D3.end <- -1 + } + + .dalignments <- c("D5.end", "D3.end") + + if (!(.count %in% table.colnames)) { + warn_msg <- c(" [!] Warning: can't find a column with clonal counts. Setting all clonal counts to 1.") + warn_msg <- c(warn_msg, "\n Did you apply repLoad to MiXCR file *_alignments.txt?") + warn_msg <- c(warn_msg, " If so please consider moving all *.clonotypes.*.txt MiXCR files to") + warn_msg <- c(warn_msg, " a separate folder and apply repLoad to the folder.") + warn_msg <- c(warn_msg, "\n Note: The *_alignments.txt file IS NOT a repertoire file suitable for any analysis.") + message(warn_msg) + + df[[.count]] <- 1 + } + .freq <- "Proportion" + df$Proportion <- df[[.count]] / sum(df[[.count]], na.rm = TRUE) + + aa_headers[[".aa.seq.cdr1"]] <- IMMCOL_EXT$cdr1aa + aa_headers[[".aa.seq.cdr2"]] <- IMMCOL_EXT$cdr2aa + aa_headers[[".aa.seq.cdr3"]] <- IMMCOL$cdr3aa + for (i in 1:SEQ_NUM) { + df[[aa_headers[[i]]]] <- bunch_translate(df[[nuc_headers[[i]]]]) + } + + if (is.na(.big.seq)) { + .big.seq <- "BigSeq" + df$BigSeq <- df[[nuc_headers[[".nuc.seq.cdr3"]]]] + } + + df <- df[, make.names(c( + .count, .freq, + nuc_headers[[".nuc.seq.cdr3"]], aa_headers[[".aa.seq.cdr3"]], + .vgenes, .dgenes, .jgenes, + .vend, .dalignments, .jstart, + .total.insertions, .vd.insertions, .dj.insertions, .big.seq, + nuc_headers[[".nuc.seq.cdr1"]], aa_headers[[".aa.seq.cdr1"]], + nuc_headers[[".nuc.seq.cdr2"]], aa_headers[[".aa.seq.cdr2"]] + ))] + + colnames(df) <- c( + IMMCOL$order, + IMMCOL_EXT$cdr1nt, IMMCOL_EXT$cdr1aa, IMMCOL_EXT$cdr2nt, IMMCOL_EXT$cdr2aa + ) + + df[[IMMCOL$v]] <- gsub("([*][[:digit:]]*)([(][[:digit:]]*[.,]*[[:digit:]]*[)])", "", df[[IMMCOL$v]]) + df[[IMMCOL$v]] <- gsub(",", ", ", df[[IMMCOL$v]]) + df[[IMMCOL$v]] <- str_replace_all(df[[IMMCOL$v]], '"', "") + + # Remove sorting because MiXCR outputs segments in a specific order + df[[IMMCOL$v]] <- sapply( + strsplit(df[[IMMCOL$v]], ", ", useBytes = TRUE), + # function(x) paste0(sort(unique(x)), collapse = ", ") + function(x) paste0(unique(x), collapse = ", ") + ) + + df[[IMMCOL$d]] <- gsub("([*][[:digit:]]*)([(][[:digit:]]*[.,]*[[:digit:]]*[)])", "", df[[IMMCOL$d]]) + df[[IMMCOL$d]] <- gsub(",", ", ", df[[IMMCOL$d]]) + df[[IMMCOL$d]] <- str_replace_all(df[[IMMCOL$d]], '"', "") + df[[IMMCOL$d]] <- sapply( + strsplit(df[[IMMCOL$d]], ", ", useBytes = TRUE), + # function(x) paste0(sort(unique(x)), collapse = ", ") + function(x) paste0(unique(x), collapse = ", ") + ) + + df[[IMMCOL$j]] <- gsub("([*][[:digit:]]*)([(][[:digit:]]*[.,]*[[:digit:]]*[)])", "", df[[IMMCOL$j]]) + df[[IMMCOL$j]] <- gsub(",", ", ", df[[IMMCOL$j]]) + df[[IMMCOL$j]] <- str_replace_all(df[[IMMCOL$j]], '"', "") + df[[IMMCOL$j]] <- sapply( + strsplit(df[[IMMCOL$j]], ", ", useBytes = TRUE), + # function(x) paste0(sort(unique(x)), collapse = ", ") + function(x) paste0(unique(x), collapse = ", ") + ) + + .postprocess(fix.alleles(df)) +} + +parse_migec <- function(.filename, .mode) { + filename <- .filename + nuc.seq <- "CDR3 nucleotide sequence" + aa.seq <- "CDR3 amino acid sequence" + .count <- "Good events" + vgenes <- "V segments" + jgenes <- "J segments" + dgenes <- "D segments" + vend <- "Last V nucleotide position" + jstart <- "First J nucleotide position" + dstart <- "First D nucleotide position" + dend <- "Last D nucleotide position" + vd.insertions <- "VD insertions" + dj.insertions <- "DJ insertions" + total.insertions <- "Total insertions" + .skip <- 0 + .sep <- "\t" + + parse_repertoire( + .filename = filename, .mode = .mode, .nuc.seq = nuc.seq, .aa.seq = aa.seq, .count = .count, + .vgenes = vgenes, .jgenes = jgenes, .dgenes = dgenes, + .vend = vend, .jstart = jstart, .dstart = dstart, .dend = dend, + .vd.insertions = vd.insertions, .dj.insertions = dj.insertions, + .total.insertions = total.insertions, .skip = .skip, .sep = .sep + ) +} + +parse_migmap <- function(.filename, .mode) { + filename <- .filename + nuc.seq <- "cdr3nt" + aa.seq <- "cdr3aa" + .count <- "count" + vgenes <- "v" + jgenes <- "j" + dgenes <- "d" + vend <- "v.end.in.cdr3" + jstart <- "j.start.in.cdr3" + dstart <- "d.start.in.cdr3" + dend <- "d.end.in.cdr3" + vd.insertions <- NA + dj.insertions <- NA + total.insertions <- NA + .skip <- 0 + .sep <- "\t" + + parse_repertoire( + .filename = filename, .mode = .mode, .nuc.seq = nuc.seq, .aa.seq = aa.seq, .count = .count, .vgenes = vgenes, .jgenes = jgenes, .dgenes = dgenes, + .vend = vend, .jstart = jstart, .dstart = dstart, .dend = dend, + .vd.insertions = vd.insertions, .dj.insertions = dj.insertions, + .total.insertions = total.insertions, .skip = .skip, .sep = .sep + ) +} + +parse_tcr <- function(.filename, .mode) { + f <- file(.filename, "r") + l <- readLines(f, 2)[2] + close(f) + + nuc.seq <- "CDR3.nucleotide.sequence" + aa.seq <- "CDR3.amino.acid.sequence" + .count <- "Read.count" + vgenes <- "V.gene" + jgenes <- "J.gene" + dgenes <- "D.gene" + vend <- "V.end" + jstart <- "J.start" + dstart <- "D5.end" + dend <- "D3.end" + vd.insertions <- "VD.insertions" + dj.insertions <- "DJ.insertions" + total.insertions <- "Total.insertions" + .skip <- 0 + .sep <- "\t" + + if (substr(l, 1, 2) != "NA") { + .count <- "Umi.count" + } + + parse_repertoire( + .filename = .filename, .mode = .mode, .nuc.seq = nuc.seq, .aa.seq = aa.seq, .count = .count, .vgenes = vgenes, .jgenes = jgenes, .dgenes = dgenes, + .vend = vend, .jstart = jstart, .dstart = dstart, .dend = dend, + .vd.insertions = vd.insertions, .dj.insertions = dj.insertions, + .total.insertions = total.insertions, .skip = .skip, .sep = .sep + ) +} + +parse_vdjtools <- function(.filename, .mode) { + skip <- 0 + + # Check for different VDJtools outputs + f <- file(.filename, "r") + l <- readLines(f, 1) + close(f) + + .skip <- 0 + .count <- "count" + filename <- .filename + nuc.seq <- "cdr3nt" + aa.seq <- "CDR3aa" + vgenes <- "V" + jgenes <- "J" + dgenes <- "D" + vend <- "Vend" + jstart <- "Jstart" + dstart <- "Dstart" + dend <- "Dend" + vd.insertions <- NA + dj.insertions <- NA + total.insertions <- NA + .sep <- "\t" + + if (length(strsplit(l, "-", TRUE)) > 0) { + if (length(strsplit(l, "-", TRUE)[[1]]) == 3) { + if (strsplit(l, "-", TRUE)[[1]][2] == "header") { + .count <- "count" + .skip <- 1 + } + } else if (tolower(substr(l, 1, 2)) == "#s") { + .count <- "#Seq. count" + nuc.seq <- "N Sequence" + aa.seq <- "AA Sequence" + vgenes <- "V segments" + jgenes <- "J segments" + dgenes <- "D segment" + vend <- NA + jstart <- NA + dstart <- NA + dend <- NA + } else if (tolower(substr(l, 1, 2)) == "#c") { + .count <- "#count" + nuc.seq <- "CDR3nt" + aa.seq <- "CDR3aa" + vgenes <- "V" + jgenes <- "J" + dgenes <- "D" + vend <- NA + jstart <- NA + dstart <- NA + dend <- NA + } else if (stringr::str_detect(l, "#")) { + .count <- "X.count" + } else { + .count <- "count" + } + } + + parse_repertoire( + .filename = filename, .mode = .mode, .nuc.seq = nuc.seq, .aa.seq = aa.seq, .count = .count, .vgenes = vgenes, .jgenes = jgenes, .dgenes = dgenes, + .vend = vend, .jstart = jstart, .dstart = dstart, .dend = dend, + .vd.insertions = vd.insertions, .dj.insertions = dj.insertions, + .total.insertions = total.insertions, .skip = .skip, .sep = .sep + ) +} + +parse_imgt <- function(.filename, .mode) { + .fix.imgt.alleles <- function(.col) { + sapply(strsplit(.col, " "), function(x) { + if (length(x) > 1) { + x[[2]] + } else { + NA + } + }) + } + + f <- file(.filename, "r") + readLines(f, 2)[2] + close(f) + + nuc.seq <- "JUNCTION" + aa.seq <- NA + .count <- NA + vgenes <- "V-GENE and allele" + jgenes <- "J-GENE and allele" + dgenes <- "D-GENE and allele" + vend <- "3'V-REGION end" + jstart <- "5'J-REGION start" + dstart <- "D-REGION start" + dend <- "D-REGION end" + vd.insertions <- NA + dj.insertions <- NA + total.insertions <- NA + .skip <- 0 + .sep <- "\t" + junc_start <- .make_names("JUNCTION start") + + df <- parse_repertoire( + .filename = .filename, .mode = .mode, .nuc.seq = nuc.seq, .aa.seq = aa.seq, .count = .count, .vgenes = vgenes, .jgenes = jgenes, .dgenes = dgenes, + .vend = vend, .jstart = jstart, .dstart = dstart, .dend = dend, + .vd.insertions = vd.insertions, .dj.insertions = dj.insertions, + .total.insertions = total.insertions, .skip = .skip, .sep = .sep, .add = junc_start + ) + + df[[IMMCOL$ve]] <- df[[IMMCOL$ve]] - df[[junc_start]] + df[[IMMCOL$ds]] <- df[[IMMCOL$ds]] - df[[junc_start]] + df[[IMMCOL$de]] <- df[[IMMCOL$de]] - df[[junc_start]] + df[[IMMCOL$js]] <- df[[IMMCOL$js]] - df[[junc_start]] + + df[[IMMCOL$ve]][df[[IMMCOL$ve]] < 0] <- NA + df[[IMMCOL$ds]][df[[IMMCOL$ds]] < 0] <- NA + df[[IMMCOL$de]][df[[IMMCOL$de]] < 0] <- NA + df[[IMMCOL$js]][df[[IMMCOL$js]] < 0] <- NA + + df[[IMMCOL$v]] <- .fix.imgt.alleles(df[[IMMCOL$v]]) + df[[IMMCOL$d]] <- .fix.imgt.alleles(df[[IMMCOL$d]]) + df[[IMMCOL$j]] <- .fix.imgt.alleles(df[[IMMCOL$j]]) + + df[[junc_start]] <- NULL + + df +} + +parse_airr <- function(.filename, .mode) { + df <- .filename %>% + .as_tsv() %>% + airr::read_rearrangement() + + df <- df %>% + select( + sequence, v_call, d_call, j_call, junction, junction_aa, + contains("v_germline_end"), contains("d_germline_start"), contains("d_germline_end"), + contains("j_germline_start"), contains("np1_length"), contains("np2_length"), + contains("duplicate_count") + ) + + namekey <- c( + duplicate_count = IMMCOL$count, junction = IMMCOL$cdr3nt, junction_aa = IMMCOL$cdr3aa, + v_call = IMMCOL$v, d_call = IMMCOL$d, j_call = IMMCOL$j, v_germline_end = IMMCOL$ve, + d_germline_start = IMMCOL$ds, d_germline_end = IMMCOL$de, j_germline_start = IMMCOL$js, + np1_length = "unidins", np2_length = IMMCOL$dnj, sequence = IMMCOL$seq + ) + + names(df) <- namekey[names(df)] + + if (!("unidins" %in% colnames(df))) { + df["unidins"] <- NA + } + + recomb_type <- .which_recomb_type(df[[IMMCOL$v]]) + + if (!is.na(recomb_type)) { + if (recomb_type == "VJ") { + df[IMMCOL$vnj] <- df["unidins"] + df[IMMCOL$vnd] <- NA + df[IMMCOL$dnj] <- NA + } else if (recomb_type == "VDJ") { + df[IMMCOL$vnj] <- NA + df[IMMCOL$vnd] <- df["unidins"] + } + } + + for (column in IMMCOL$order) { + if (!(column %in% colnames(df))) { + df[column] <- NA + } + } + + df <- df[IMMCOL$order] + total <- sum(df$Clones) + df[IMMCOL$prop] <- df[IMMCOL$count] / total + df[IMMCOL$seq] <- stringr::str_remove_all(df[[IMMCOL$seq]], "N") + df <- .postprocess(df) + df +} + +parse_immunarch <- function(.filename, .mode) { + df <- readr::read_tsv(.filename, col_types = cols(), comment = "#") + if (ncol(df) == 1) { + # "," in the files, parse differently then + df <- readr::read_csv(.filename, col_types = cols(), comment = "#") + } + df <- .postprocess(df) + df +} + +parse_10x_consensus <- function(.filename, .mode) { + df <- parse_repertoire(.filename, + .mode = .mode, + .nuc.seq = "cdr3_nt", .aa.seq = NA, .count = "umis", + .vgenes = "v_gene", .jgenes = "j_gene", .dgenes = "d_gene", + .vend = NA, .jstart = NA, .dstart = NA, .dend = NA, + .vd.insertions = NA, .dj.insertions = NA, .total.insertions = NA, + .skip = 0, .sep = ",", .add = c("chain", "clonotype_id", "consensus_id") + ) + setnames(df, "clonotype_id", "ClonotypeID") + setnames(df, "consensus_id", "ConsensusID") + df +} + +parse_10x_filt_contigs <- function(.filename, .mode) { + df <- parse_repertoire(.filename, + .mode = .mode, + .nuc.seq = "cdr3_nt", .aa.seq = NA, .count = "umis", + .vgenes = "v_gene", .jgenes = "j_gene", .dgenes = "d_gene", + .vend = NA, .jstart = NA, .dstart = NA, .dend = NA, + .vd.insertions = NA, .dj.insertions = NA, .total.insertions = NA, + .skip = 0, .sep = ",", # .add = c("chain", "raw_clonotype_id", "raw_consensus_id", "barcode", "contig_id") + .add = c("chain", "barcode", "raw_clonotype_id", "contig_id") + ) + # setnames(df, "raw_clonotype_id", "RawClonotypeID") + # setnames(df, "raw_consensus_id", "RawConsensusID") + + # Process 10xGenomics filtered contigs files - count barcodes, merge consensues ids, clonotype ids and contig ids + df <- df[order(df$chain), ] + setDT(df) + + if (.mode == "paired") { + df <- df %>% + lazy_dt() %>% + group_by(barcode, raw_clonotype_id) %>% + summarise( + CDR3.nt = paste0(CDR3.nt, collapse = IMMCOL_ADD$scsep), + CDR3.aa = paste0(CDR3.aa, collapse = IMMCOL_ADD$scsep), + V.name = paste0(V.name, collapse = IMMCOL_ADD$scsep), + J.name = paste0(J.name, collapse = IMMCOL_ADD$scsep), + D.name = paste0(D.name, collapse = IMMCOL_ADD$scsep), + chain = paste0(chain, collapse = IMMCOL_ADD$scsep), + # raw_clonotype_id = gsub("clonotype", "", paste0(raw_clonotype_id, collapse = IMMCOL_ADD$scsep)), + # raw_consensus_id = gsub("clonotype|consensus", "", paste0(raw_consensus_id, collapse = IMMCOL_ADD$scsep)), + contig_id = gsub("_contig_", "", paste0(contig_id, collapse = IMMCOL_ADD$scsep)) + ) %>% + as.data.table() + } + df <- df %>% + lazy_dt() %>% + group_by(CDR3.nt, V.name, J.name) %>% + summarise( + Clones = length(unique(barcode)), + CDR3.aa = first(CDR3.aa), + D.name = first(D.name), + chain = first(chain), + barcode = paste0(unique(barcode), collapse = IMMCOL_ADD$scsep), + raw_clonotype_id = gsub("clonotype|None", "", paste0(unique(raw_clonotype_id), collapse = IMMCOL_ADD$scsep)), + # raw_clonotype_id = gsub("clonotype", "", paste0(raw_clonotype_id, collapse = IMMCOL_ADD$scsep)), + # raw_consensus_id = gsub("clonotype|consensus", "", paste0(raw_consensus_id, collapse = IMMCOL_ADD$scsep)), + contig_id = paste0(contig_id, collapse = IMMCOL_ADD$scsep) + ) %>% + as.data.table() + + df$V.end <- NA + df$J.start <- NA + df$D.end <- NA + df$D.start <- NA + df$VD.ins <- NA + df$DJ.ins <- NA + df$VJ.ins <- NA + df$Sequence <- df$CDR3.nt + + setnames(df, "contig_id", "ContigID") + setnames(df, "barcode", "Barcode") + + df[[IMMCOL$prop]] <- df[[IMMCOL$count]] / sum(df[[IMMCOL$count]]) + setcolorder(df, IMMCOL$order) + + setDF(df) + + .postprocess(df) +} + +parse_archer <- function(.filename, .mode) { + parse_repertoire(.filename, + .mode = .mode, + .nuc.seq = "Clonotype Sequence", .aa.seq = NA, .count = "Clone Abundance", + .vgenes = "Predicted V Region", .jgenes = "Predicted J Region", .dgenes = "Predicted D Region", + .vend = NA, .jstart = NA, .dstart = NA, .dend = NA, + .vd.insertions = NA, .dj.insertions = NA, .total.insertions = NA, + .skip = 0, .sep = "\t" + ) +} + +parse_catt <- function(.filename, .mode) { + filename <- .filename + nuc.seq <- "NNseq" + aa.seq <- "AAseq" + .count <- "Frequency" + vgenes <- "Vregion" + jgenes <- "Jregion" + dgenes <- "Dregion" + vend <- NA + jstart <- NA + dstart <- NA + dend <- NA + vd.insertions <- NA + dj.insertions <- NA + total.insertions <- NA + .skip <- 0 + .sep <- "," + + parse_repertoire( + .filename = filename, .mode = .mode, .nuc.seq = nuc.seq, .aa.seq = aa.seq, .count = .count, + .vgenes = vgenes, .jgenes = jgenes, .dgenes = dgenes, + .vend = vend, .jstart = jstart, .dstart = dstart, .dend = dend, + .vd.insertions = vd.insertions, .dj.insertions = dj.insertions, + .total.insertions = total.insertions, .skip = .skip, .sep = .sep + ) +} + +parse_rtcr <- function(.filename, .mode) { + filename <- .filename + nuc.seq <- "Junction nucleotide sequence" + aa.seq <- "Amino acid sequence" + .count <- "Number of reads" + vgenes <- "V gene" + jgenes <- "J gene" + dgenes <- NA + vend <- "V gene end position" + jstart <- "J gene start position" + dstart <- NA + dend <- NA + vd.insertions <- NA + dj.insertions <- NA + total.insertions <- NA + .skip <- 0 + .sep <- "\t" + + parse_repertoire( + .filename = filename, .mode = .mode, .nuc.seq = nuc.seq, .aa.seq = aa.seq, .count = .count, + .vgenes = vgenes, .jgenes = jgenes, .dgenes = dgenes, + .vend = vend, .jstart = jstart, .dstart = dstart, .dend = dend, + .vd.insertions = vd.insertions, .dj.insertions = dj.insertions, + .total.insertions = total.insertions, .skip = .skip, .sep = .sep + ) +} + +parse_imseq <- function(.filename, .mode) { + filename <- .filename + nuc.seq <- "cdrNucSeq" + aa.seq <- "cdrAASeq" + .count <- NA + vgenes <- "leftMatches" + jgenes <- "rightMatches" + dgenes <- NA + vend <- NA + jstart <- NA + dstart <- NA + dend <- NA + vd.insertions <- NA + dj.insertions <- NA + total.insertions <- NA + .skip <- 0 + .sep <- "\t" + + parse_repertoire( + .filename = filename, .mode = .mode, .nuc.seq = nuc.seq, .aa.seq = aa.seq, .count = .count, + .vgenes = vgenes, .jgenes = jgenes, .dgenes = dgenes, + .vend = vend, .jstart = jstart, .dstart = dstart, .dend = dend, + .vd.insertions = vd.insertions, .dj.insertions = dj.insertions, + .total.insertions = total.insertions, .skip = .skip, .sep = .sep + ) +} + +parse_vidjil <- function(.filename, .mode) { + json_data <- read_json(.filename, simplifyVector = TRUE) + clones <- json_data[["clones"]] + + count <- as.vector(clones[["reads"]], mode = "numeric") + proportion <- count / sum(count) + cdr3nt <- NA + cdr3aa <- as.vector(clones[["seg"]][["cdr3"]][["aa"]]) + vgenes <- as.vector(clones[["seg"]][["5"]]) + dgenes <- NA + jgenes <- as.vector(clones[["seg"]][["3"]]) + vend <- as.vector(clones[["seg"]][["5end"]], mode = "numeric") + dstart <- NA + dend <- NA + jstart <- as.vector(clones[["seg"]][["3start"]], mode = "numeric") + vj.insertions <- NA + vd.insertions <- NA + dj.insertions <- NA + + df <- data.frame( + count, proportion, cdr3nt, cdr3aa, vgenes, dgenes, jgenes, + vend, dstart, dend, jstart, vj.insertions, vd.insertions, dj.insertions + ) + + colnames(df)[1] <- IMMCOL$count + colnames(df)[2] <- IMMCOL$prop + colnames(df)[3] <- IMMCOL$cdr3nt + colnames(df)[4] <- IMMCOL$cdr3aa + colnames(df)[5] <- IMMCOL$v + colnames(df)[6] <- IMMCOL$d + colnames(df)[7] <- IMMCOL$j + colnames(df)[8] <- IMMCOL$ve + colnames(df)[9] <- IMMCOL$ds + colnames(df)[10] <- IMMCOL$de + colnames(df)[11] <- IMMCOL$js + colnames(df)[12] <- IMMCOL$vnj + colnames(df)[13] <- IMMCOL$vnd + colnames(df)[14] <- IMMCOL$dnj + + df <- .postprocess(df) + df +} diff --git a/R/io-savers.R b/R/io-savers.R new file mode 100644 index 00000000..e5f3703a --- /dev/null +++ b/R/io-savers.R @@ -0,0 +1,47 @@ +save_immunarch <- function(.data, .path, .compress = TRUE) { + if (.compress) { + filepath <- gzfile(paste0(.path, ".tsv.gz"), compression = 9) + } else { + filepath <- paste0(.path, ".tsv") + } + readr::write_lines(paste0("# Exported from immunarch ", packageVersion("immunarch"), " https://immunarch.com"), + path = filepath + ) + filepath <- gzfile(paste0(.path, ".tsv.gz"), compression = 9) + readr::write_tsv(x = .data, path = filepath, append = TRUE, col_names = TRUE) +} + +save_vdjtools <- function(.data, .path, .compress = TRUE) { + if (.compress) { + filepath <- gzfile(paste0(.path, ".tsv.gz"), compression = 9) + } else { + filepath <- paste0(.path, ".tsv") + } + + old <- c( + IMMCOL$count, + IMMCOL$prop, + IMMCOL$cdr3nt, + IMMCOL$cdr3aa, + IMMCOL$v, + IMMCOL$d, + IMMCOL$j + ) + + new <- c( + "#Seq. Count", + "Percent", + "N Sequence", + "AA Sequence", + "V Segments", + "D Segment", + "J Segments" + ) + + names(.data) <- plyr::mapvalues(names(.data), + from = old, + to = as.character(new) + ) + + readr::write_tsv(x = .data, path = filepath) +} diff --git a/R/io-utility.R b/R/io-utility.R new file mode 100644 index 00000000..a2781559 --- /dev/null +++ b/R/io-utility.R @@ -0,0 +1,274 @@ +.remove.ext <- function(.str) { + # gsub(pattern = '.*/|[.].*$', replacement = '', x = .str) + gsub(pattern = ".*/|[.](txt|tsv|csv)$|([.](txt|tsv|csv))?[.](gz|bzip|bzip2|bz2)$", replacement = "", x = .str) +} + + +.detect_format <- function(.filename) { + res_format <- NA + + f <- file(.filename, "r") + l <- readLines(f, 1) + # use 2nd line of file for JSON formats + l2 <- ifelse(str_trim(l) == "{", readLines(f, 1), NA) + close(f) + + if (any(str_detect(l, c("MiTCRFullExport", "mitcr")))) { + res_format <- "mitcr" + } else if (str_detect(l, "CDR3 amino acid sequence") && str_detect(l, "V segment") && !str_detect(l, "Good events")) { + res_format <- "mitcr" + } else if (str_detect(l, "CDR3 amino acid sequence") && str_detect(l, "V segment") && str_detect(l, "Good events")) { + res_format <- "migec" + } else if (str_detect(l, "v.end.in.cdr3") && str_detect(l, "cdr3aa")) { + res_format <- "migmap" + } else if (str_detect(l, "CDR3.amino.acid.sequence") && str_detect(l, "Umi.count")) { + res_format <- "tcr" + } else if (str_detect(tolower(l), "frequency") && str_detect(tolower(l), "cdr3nt") && str_detect(tolower(l), "v")) { + res_format <- "vdjtools" + } else if (str_detect(tolower(l), "count") && str_detect(tolower(l), "sequence") && str_detect(tolower(l), "d segment")) { + res_format <- "vdjtools" + } else if (str_detect(tolower(l), "junction start") && str_detect(tolower(l), "v-d-j-region end") && str_detect(tolower(l), "v-region")) { + res_format <- "imgt" + } else if (str_detect(tolower(l), "v_resolved") && str_detect(tolower(l), "amino_acid")) { + res_format <- "immunoseq" + } else if (str_detect(tolower(l), "maxresolved")) { + res_format <- "immunoseq" + } else if (str_detect(tolower(l), "v_gene") && str_detect(tolower(l), "templates") && str_detect(tolower(l), "amino_acid")) { + res_format <- "immunoseq" + } else if (str_detect(tolower(l), "allvalignment") && str_detect(tolower(l), "vhit")) { + res_format <- "mixcr" + } else if (str_detect(tolower(l), "bestvhit") && str_detect(tolower(l), "bestjhit")) { + res_format <- "mixcr" + } else if (str_detect(tolower(l), "clonal sequence")) { + res_format <- "mixcr" + } else if (str_detect(tolower(l), "clonalsequence")) { + res_format <- "mixcr" + } else if (str_detect(tolower(l), "targetsequences")) { + res_format <- "mixcr" + } else if (str_detect(tolower(l), "junction_aa") && str_detect(tolower(l), "cigar")) { + res_format <- "airr" + } else if (str_detect(tolower(l), "raw_clonotype_id") && str_detect(tolower(l), "barcode") && str_detect(tolower(l), "v_gene")) { + res_format <- "10x (filt.contigs)" + } else if (str_detect(tolower(l), "clonotype_id") && str_detect(tolower(l), "v_gene")) { + res_format <- "10x (consensus)" + } else if (str_detect(tolower(l), "clonotype sequence") && str_detect(tolower(l), "v regions")) { + res_format <- "archer" + } else if (str_detect(tolower(l), "exported from immunarch")) { + res_format <- "immunarch" + } else if (str_detect(tolower(l), "clones") && str_detect(tolower(l), "v.name") && str_detect(tolower(l), "proportion")) { + res_format <- "immunarch" + } else if (str_detect(l, "AAseq") && str_detect(l, "Vregion") && str_detect(l, "Frequency")) { + res_format <- "catt" + } else if (str_detect(l, "Number of reads") && str_detect(l, "Amino acid sequence") && str_detect(l, "V gene")) { + res_format <- "rtcr" + } else if (str_detect(l, "seqId") && str_detect(l, "cdrNucSeq") && str_detect(l, "cdrAASeq")) { + res_format <- "imseq" + } else if (!is.na(l2)) { + if (str_trim(l2) == "\"clones\": [") { + res_format <- "vidjil" + } + } + + res_format +} + + +.make_names <- function(.char) { + if (is.na(.char[1])) { + NA + } + # else { tolower(make.names(.char)) } + else { + tolower(.char) + } +} + + +.which_recomb_type <- function(.name) { + recomb_type <- NA + + i <- 1 + + while (is.na(recomb_type) && i < 100 && !is.na(.name[i])) { + if (any(str_detect(.name[i], c("TCRA", "TRAV", "TCRG", "TRGV", "IGKV", "IGLV")))) { + recomb_type <- "VJ" + } else if (any(str_detect(.name[i], c("TCRB", "TRBV", "TCRD", "TRDV", "IGHV")))) { + recomb_type <- "VDJ" + } + i <- i + 1 + } + if (is.na(recomb_type)) { + warning("Can't determine the type of V(D)J recombination. No insertions will be presented in the resulting data table.") + } + + recomb_type +} + + +.get_coltypes <- function(.filename, .nuc.seq, .aa.seq, .count, + .vgenes, .jgenes, .dgenes, + .vend, .jstart, .dstart, .dend, + .vd.insertions, .dj.insertions, .total.insertions, + .skip, .sep, .add = NA) { + table.colnames <- colnames(readr::read_delim(.filename, + col_types = cols(), + delim = .sep, + quote = "", + escape_double = FALSE, + comment = "", + n_max = 1, + trim_ws = TRUE, + skip = .skip + )) + + swlist <- list( + col_character(), col_character(), + col_integer(), + col_character(), col_character(), col_character(), + col_integer(), col_integer(), col_integer(), col_integer(), + col_integer(), col_integer(), col_integer() + ) + + names(swlist) <- tolower(c( + .nuc.seq, ifelse(is.na(.aa.seq), "NA", .aa.seq), + .count, + .vgenes, .jgenes, .dgenes, + .vend, .jstart, .dstart, .dend, + .vd.insertions, .dj.insertions, .total.insertions + )) + if (!is.na(.add[1])) { + swlist <- c(swlist, rep(col_guess(), length(.add))) + names(swlist)[tail(1:length(swlist), length(.add))] <- .add + } + + swlist <- c(swlist, "_") + + if (is.na(.aa.seq)) { + swlist <- swlist[-2] + } + + col.classes <- list(sapply(tolower(table.colnames), function(x) { + do.call(switch, c(x, swlist)) + }, USE.NAMES = FALSE))[[1]] + names(col.classes) <- table.colnames + + col.classes +} + +.remove.alleles <- function(.data) { + if (has_class(.data, "list")) { + lapply(.data, .remove.alleles) + } else { + .data[[IMMCOL$v]] <- return_segments(.data[[IMMCOL$v]]) + .data[[IMMCOL$j]] <- return_segments(.data[[IMMCOL$j]]) + .data + } +} + +.postprocess <- function(.data, .mode) { + .data[[IMMCOL$cdr3nt]][.data[[IMMCOL$cdr3nt]] == "NONE"] <- NA + logic <- is.na(.data[[IMMCOL$cdr3aa]]) & !is.na(.data[[IMMCOL$cdr3nt]]) + if (any(logic)) { + .data[[IMMCOL$cdr3aa]][logic] <- bunch_translate(.data[[IMMCOL$cdr3nt]][logic]) + } + + logic <- is.na(.data[[IMMCOL$cdr3aa]]) & is.na(.data[[IMMCOL$cdr3nt]]) + if (any(logic)) { + warn_msg <- c(" [!] Removed ", sum(logic)) + warn_msg <- c(warn_msg, " clonotypes with no nucleotide and amino acid CDR3 sequence.") + message(warn_msg) + } + .data <- .data[!logic, ] + + if (nrow(.data)) { + for (colname in c(IMMCOL$ve, IMMCOL$ds, IMMCOL$de, IMMCOL$js, IMMCOL$vnj, IMMCOL$vnd, IMMCOL$dnj)) { + if (colname %in% colnames(.data)) { + logic <- is.na(.data[[colname]]) + .data[[colname]][logic] <- -1 + + logic <- .data[[colname]] < 0 + .data[[colname]][logic] <- NA + } + } + + for (col_i in 1:length(IMMCOL$order)) { + colname <- IMMCOL$order[col_i] + if (colname %in% colnames(.data)) { + if (!has_class(.data[[colname]], IMMCOL$type[col_i])) { + .data[[colname]] <- as(.data[[colname]], IMMCOL$type[col_i]) + } + } + } + + logic <- is.na(.data[[IMMCOL$count]]) + if (any(logic)) { + message(" [!] Warning: found NAs in clonal counts. Setting them to 1's.") + .data[[IMMCOL$count]][logic] <- 1 + } + .data <- .data[order(.data[[IMMCOL$count]], decreasing = TRUE), ] + } else { + .data <- NULL + } + + .data +} + +.as_tsv <- function(.delim_file) { + df <- readr::read_tsv(.delim_file, comment = "#") + if (ncol(df) == 1) { + # treat file as csv and convert it to temporary tsv + df <- readr::read_delim(.delim_file, comment = "#") + tsv_file <- tempfile() + readr::write_tsv(df, tsv_file) + return(tsv_file) + } else { + return(.delim_file) + } +} + +.check_empty_repertoires <- function(.data) { + empty_reps <- .data %>% + sapply(function(repertoire) { + nrow(repertoire) == 0 + }) %>% + sum() + if (empty_reps > 0) { + warning("Input data contains ", empty_reps, " empty repertoire(s)!") + } +} + +.validate_repertoires_data <- function(.data) { + if (!inherits(.data, "list")) { + stop("Wrong input data format: expected list of immune repertoires!") + } else if (length(.data) == 0) { + stop("Input list of immune repertoires is empty!") + } + .check_empty_repertoires(.data) +} + +.validate_immdata <- function(.immdata) { + if (!inherits(.immdata, "list")) { + stop("Input data is not a list; please pass Immunarch dataset object as input.") + } else if (length(.immdata) < 2 | + !("data" %in% names(.immdata)) | + !("meta" %in% names(.immdata))) { + stop( + "Input list must contain \"data\" and \"meta\" elements;\n", + "please pass Immunarch dataset object as input." + ) + } else if (!inherits(.immdata$data, "list") | !is.data.frame(.immdata$meta)) { + stop( + "Wrong input data format: expected list with \"data\" as list ", + "and \"meta\" as dataframe;\n", + "please pass Immunarch dataset object as input." + ) + } else if (length(.immdata$data) == 0) { + stop("Input list of immune repertoires in \"data\" is empty!") + } else if (length(.immdata$data) != nrow(.immdata$meta)) { + stop( + "Number of samples is different in data (", length(.immdata$data), + ") and metadata (", nrow(.immdata$meta), ")!" + ) + } + .check_empty_repertoires(.immdata$data) +} diff --git a/R/io.R b/R/io.R index d5eb3210..15ad9529 100644 --- a/R/io.R +++ b/R/io.R @@ -17,7 +17,8 @@ if (getRversion() >= "2.15.1") { #' @concept io #' #' @importFrom readr read_delim read_tsv read_csv col_integer col_character col_double col_logical col_guess cols write_lines -#' @importFrom stringr str_split str_detect str_replace_all +#' @importFrom jsonlite read_json +#' @importFrom stringr str_split str_detect str_replace_all str_trim #' @importFrom methods as #' @importFrom dplyr contains first #' @importFrom utils read.table @@ -165,8 +166,7 @@ repLoad <- function(.path, .format = NA, .mode = "paired", .coding = TRUE) { # Parse the file if (is.na(cur_format)) { message("unsupported format, skipping") - } - else { + } else { message(cur_format) parse_fun <- switch(cur_format, @@ -183,13 +183,16 @@ repLoad <- function(.path, .format = NA, .mode = "paired", .coding = TRUE) { `10x (filt.contigs)` = parse_10x_filt_contigs, archer = parse_archer, immunarch = parse_immunarch, + catt = parse_catt, + rtcr = parse_rtcr, + imseq = parse_imseq, + vidjil = parse_vidjil, NA ) if (suppressWarnings(is.na(parse_fun))) { message("unknown format, skipping") - } - else { + } else { parse_res <- parse_fun(.path, .mode) if (is.null(parse_res)) { @@ -221,11 +224,10 @@ repLoad <- function(.path, .format = NA, .mode = "paired", .coding = TRUE) { metadata <- tibble() for (.filepath in .files) { - message(' -- Parsing "', .filepath, '" -- ', appendLF = FALSE) + message(" -- [", match(.filepath, .files), "/", length(.files), '] Parsing "', .filepath, '" -- ', appendLF = FALSE) if (!file.exists(.filepath)) { message('Can\'t find\t"', .filepath, '", skipping') - } - else { + } else { # Check for the type: repertoire, metadata or barcodes if (stringr::str_detect(.filepath, "metadata")) { message("metadata") @@ -236,11 +238,9 @@ repLoad <- function(.path, .format = NA, .mode = "paired", .coding = TRUE) { if (!("Sample" %in% colnames(metadata))) { stop('No "Sample" column found in the metadata file. The "Sample" column with the names of samples without extensions (e.g., ".txt", ".tsv") is required. Please provide it and run the parsing again.') } - } - else if (stringr::str_detect(.filepath, "barcode")) { + } else if (stringr::str_detect(.filepath, "barcode")) { # TODO: add the barcode processing subroutine to split by samples - } - else { + } else { repertoire <- .read_repertoire(.filepath, .format, .mode, .coding) if (length(repertoire) != 0) { parsed_batch <- c(parsed_batch, repertoire) @@ -433,6 +433,7 @@ repLoad <- function(.path, .format = NA, .mode = "paired", .coding = TRUE) { #' #' @importFrom utils packageVersion #' @importFrom plyr mapvalues +#' @importFrom purrr map #' #' @description The \code{repSave} function is deigned to save your data to the disk #' in desirable format. Currently supports "immunarch" and "vdjtools" file formats. @@ -453,6 +454,8 @@ repLoad <- function(.path, .format = NA, .mode = "paired", .coding = TRUE) { #' #' @examples #' data(immdata) +#' # Reduce data to save time on examples +#' immdata$data <- purrr::map(immdata$data, ~ .x %>% head(10)) #' dirpath <- tempdir() #' # Save the list of repertoires #' repSave(immdata, dirpath) @@ -514,1289 +517,3 @@ repSave <- function(.data, .path, .format = c("immunarch", "vdjtools"), } } } - - - - - -##### IO utility functions ##### -.remove.ext <- function(.str) { - # gsub(pattern = '.*/|[.].*$', replacement = '', x = .str) - gsub(pattern = ".*/|[.](txt|tsv|csv)$|([.](txt|tsv|csv))?[.](gz|bzip|bzip2|bz2)$", replacement = "", x = .str) -} - - -.detect_format <- function(.filename) { - res_format <- NA - - f <- file(.filename, "r") - l <- readLines(f, 1) - close(f) - - if (any(str_detect(l, c("MiTCRFullExport", "mitcr")))) { - res_format <- "mitcr" - } else if (str_detect(l, "CDR3 amino acid sequence") && str_detect(l, "V segment") && !str_detect(l, "Good events")) { - res_format <- "mitcr" - } else if (str_detect(l, "CDR3 amino acid sequence") && str_detect(l, "V segment") && str_detect(l, "Good events")) { - res_format <- "migec" - } else if (str_detect(l, "v.end.in.cdr3") && str_detect(l, "cdr3aa")) { - res_format <- "migmap" - } else if (str_detect(l, "CDR3.amino.acid.sequence") && str_detect(l, "Umi.count")) { - res_format <- "tcr" - } else if (str_detect(tolower(l), "cdr3nt") && str_detect(tolower(l), "vend") && str_detect(tolower(l), "v")) { - res_format <- "vdjtools" - } else if (str_detect(tolower(l), "count") && str_detect(tolower(l), "sequence") && str_detect(tolower(l), "d segment")) { - res_format <- "vdjtools" - } else if (str_detect(tolower(l), "junction start") && str_detect(tolower(l), "v-d-j-region end") && str_detect(tolower(l), "v-region")) { - res_format <- "imgt" - } else if (str_detect(tolower(l), "v_resolved") && str_detect(tolower(l), "amino_acid")) { - res_format <- "immunoseq" - } else if (str_detect(tolower(l), "maxresolved")) { - res_format <- "immunoseq" - } else if (str_detect(tolower(l), "v_gene") && str_detect(tolower(l), "templates") && str_detect(tolower(l), "amino_acid")) { - res_format <- "immunoseq" - } else if (str_detect(tolower(l), "allvalignment") && str_detect(tolower(l), "vhit")) { - res_format <- "mixcr" - } else if (str_detect(tolower(l), "bestvhit") && str_detect(tolower(l), "bestjhit")) { - res_format <- "mixcr" - } else if (str_detect(tolower(l), "clonal sequence")) { - res_format <- "mixcr" - } else if (str_detect(tolower(l), "clonalsequence")) { - res_format <- "mixcr" - } else if (str_detect(tolower(l), "targetsequences")) { - res_format <- "mixcr" - } else if (str_detect(tolower(l), "junction_aa") && str_detect(tolower(l), "cigar")) { - res_format <- "airr" - } else if (str_detect(tolower(l), "raw_clonotype_id") && str_detect(tolower(l), "barcode") && str_detect(tolower(l), "v_gene")) { - res_format <- "10x (filt.contigs)" - } else if (str_detect(tolower(l), "clonotype_id") && str_detect(tolower(l), "v_gene")) { - res_format <- "10x (consensus)" - } else if (str_detect(tolower(l), "clonotype sequence") && str_detect(tolower(l), "v regions")) { - res_format <- "archer" - } else if (str_detect(tolower(l), "exported from immunarch")) { - res_format <- "immunarch" - } else if (str_detect(tolower(l), "clones") && str_detect(tolower(l), "v.name") && str_detect(tolower(l), "proportion")) { - res_format <- "immunarch" - } - - res_format -} - - -.make_names <- function(.char) { - if (is.na(.char[1])) { - NA - } - # else { tolower(make.names(.char)) } - else { - tolower(.char) - } -} - - -.which_recomb_type <- function(.name) { - recomb_type <- NA - - i <- 1 - - while (is.na(recomb_type) && i < 100) { - if (any(str_detect(.name[i], c("TCRA", "TRAV", "TCRG", "TRGV", "IGKV", "IGLV")))) { - recomb_type <- "VJ" - } else if (any(str_detect(.name[i], c("TCRB", "TRBV", "TCRD", "TRDV", "IGHV")))) { - recomb_type <- "VDJ" - } - i <- i + 1 - } - if (is.na(recomb_type)) { - warning("Can't determine the type of V(D)J recombination. No insertions will be presented in the resulting data table.") - } - - recomb_type -} - - -.get_coltypes <- function(.filename, .nuc.seq, .aa.seq, .count, - .vgenes, .jgenes, .dgenes, - .vend, .jstart, .dstart, .dend, - .vd.insertions, .dj.insertions, .total.insertions, - .skip, .sep, .add = NA) { - table.colnames <- colnames(readr::read_delim(.filename, - col_types = cols(), - delim = .sep, - quote = "", - escape_double = FALSE, - comment = "", - n_max = 1, - trim_ws = TRUE, - skip = .skip - )) - - swlist <- list( - col_character(), col_character(), - col_integer(), - col_character(), col_character(), col_character(), - col_integer(), col_integer(), col_integer(), col_integer(), - col_integer(), col_integer(), col_integer() - ) - - names(swlist) <- tolower(c( - .nuc.seq, ifelse(is.na(.aa.seq), "NA", .aa.seq), - .count, - .vgenes, .jgenes, .dgenes, - .vend, .jstart, .dstart, .dend, - .vd.insertions, .dj.insertions, .total.insertions - )) - if (!is.na(.add[1])) { - swlist <- c(swlist, rep(col_guess(), length(.add))) - names(swlist)[tail(1:length(swlist), length(.add))] <- .add - } - - swlist <- c(swlist, "_") - - if (is.na(.aa.seq)) { - swlist <- swlist[-2] - } - - col.classes <- list(sapply(tolower(table.colnames), function(x) { - do.call(switch, c(x, swlist)) - }, USE.NAMES = FALSE))[[1]] - names(col.classes) <- table.colnames - - col.classes -} - -.remove.alleles <- function(.data) { - if (has_class(.data, "list")) { - lapply(.data, .remove.alleles) - } else { - .data[[IMMCOL$v]] <- return_segments(.data[[IMMCOL$v]]) - .data[[IMMCOL$j]] <- return_segments(.data[[IMMCOL$j]]) - .data - } -} - -.postprocess <- function(.data, .mode) { - .data[[IMMCOL$cdr3nt]][.data[[IMMCOL$cdr3nt]] == "NONE"] <- NA - logic <- is.na(.data[[IMMCOL$cdr3aa]]) & !is.na(.data[[IMMCOL$cdr3nt]]) - if (any(logic)) { - .data[[IMMCOL$cdr3aa]][logic] <- bunch_translate(.data[[IMMCOL$cdr3nt]][logic]) - } - - logic <- is.na(.data[[IMMCOL$cdr3aa]]) & is.na(.data[[IMMCOL$cdr3nt]]) - if (any(logic)) { - warn_msg <- c(" [!] Removed ", sum(logic)) - warn_msg <- c(warn_msg, " clonotypes with no nucleotide and amino acid CDR3 sequence.") - message(warn_msg) - } - .data <- .data[!logic, ] - - if (nrow(.data)) { - for (colname in c(IMMCOL$ve, IMMCOL$ds, IMMCOL$de, IMMCOL$js, IMMCOL$vnj, IMMCOL$vnd, IMMCOL$dnj)) { - if (colname %in% colnames(.data)) { - logic <- is.na(.data[[colname]]) - .data[[colname]][logic] <- -1 - - logic <- .data[[colname]] < 0 - .data[[colname]][logic] <- NA - } - } - - for (col_i in 1:length(IMMCOL$order)) { - colname <- IMMCOL$order[col_i] - if (colname %in% colnames(.data)) { - if (!has_class(.data[[colname]], IMMCOL$type[col_i])) { - .data[[colname]] <- as(.data[[colname]], IMMCOL$type[col_i]) - } - } - } - - logic <- is.na(.data[[IMMCOL$count]]) - if (any(logic)) { - message(" [!] Warning: found NAs in clonal counts. Setting them to 1's.") - .data[[IMMCOL$count]][logic] <- 1 - } - .data <- .data[order(.data[[IMMCOL$count]], decreasing = TRUE), ] - } else { - .data <- NULL - } - - .data -} - - -##### Parsers ##### - -parse_repertoire <- function(.filename, .mode, .nuc.seq, .aa.seq, .count, - .vgenes, .jgenes, .dgenes, - .vend, .jstart, .dstart, .dend, - .vd.insertions, .dj.insertions, .total.insertions, - .skip = 0, .sep = "\t", .add = NA) { - .nuc.seq <- .make_names(.nuc.seq) - .aa.seq <- .make_names(.aa.seq) - .count <- .make_names(.count) - .vgenes <- .make_names(.vgenes) - .jgenes <- .make_names(.jgenes) - .dgenes <- .make_names(.dgenes) - .vend <- .make_names(.vend) - .jstart <- .make_names(.jstart) - .vd.insertions <- .make_names(.vd.insertions) - .dj.insertions <- .make_names(.dj.insertions) - .total.insertions <- .make_names(.total.insertions) - .dstart <- .make_names(.dstart) - .dend <- .make_names(.dend) - .add <- .make_names(.add) - - col.classes <- .get_coltypes(.filename, .nuc.seq, .aa.seq, .count, - .vgenes, .jgenes, .dgenes, - .vend, .jstart, .dstart, .dend, - .vd.insertions, .dj.insertions, .total.insertions, - .skip = .skip, .sep = "\t", .add - ) - - # IO_REFACTOR - suppressMessages(df <- readr::read_delim(.filename, - col_names = TRUE, - col_types = col.classes, delim = .sep, - quote = "", escape_double = FALSE, - comment = "", trim_ws = TRUE, - skip = .skip, na = c("", "NA", ".") - )) - # suppressMessages(df <- fread(.filename, skip = .skip, data.table = FALSE, na.strings = c("", "NA", "."))) - - names(df) <- tolower(names(df)) - recomb_type <- .which_recomb_type(df[[.vgenes]]) - - table.colnames <- names(col.classes) - - df[[.nuc.seq]] <- toupper(df[[.nuc.seq]]) - - if (is.na(.aa.seq)) { - df$CDR3.amino.acid.sequence <- bunch_translate(df[[.nuc.seq]]) - .aa.seq <- "CDR3.amino.acid.sequence" - } - - if (is.na(.count)) { - .count <- "Count" - df$Count <- 1 - } - - df$Proportion <- df[[.count]] / sum(df[[.count]]) - .prop <- "Proportion" - - ins_ok <- FALSE - if (is.na(.vd.insertions)) { - .vd.insertions <- "VD.insertions" - df$VD.insertions <- NA - } - - if (!(.vd.insertions %in% table.colnames)) { - .vd.insertions <- "VD.insertions" - df$VD.insertions <- NA - - if (!is.na(.vend) && !is.na(.dstart)) { - if (!is.na(recomb_type) && recomb_type == "VDJ") { - df$VD.insertions <- df[[.dstart]] - df[[.vend]] - 1 - df$VD.insertions[is.na(df[[.dstart]])] <- NA - df$VD.insertions[is.na(df[[.vend]])] <- NA - - ins_ok <- TRUE - } - } - } - - if (!ins_ok) { - df$V.end <- NA - df$D.start <- NA - df$D.end <- NA - .vend <- "V.end" - .dstart <- "D.start" - .dend <- "D.end" - } - - ins_ok <- FALSE - if (is.na(.dj.insertions)) { - .dj.insertions <- "DJ.insertions" - df$DJ.insertions <- NA - } - - if (!(.dj.insertions %in% table.colnames)) { - .dj.insertions <- "DJ.insertions" - df$DJ.insertions <- NA - - if (!is.na(.jstart) && !is.na(.dend)) { - if (!is.na(recomb_type) && recomb_type == "VDJ") { - df$DJ.insertions <- df[[.jstart]] - df[[.dend]] - 1 - df$DJ.insertions[is.na(df[[.dend]])] <- NA - df$DJ.insertions[is.na(df[[.jstart]])] <- NA - - ins_ok <- TRUE - } - } - } - if (!ins_ok) { - df$J.start <- NA - df$D.start <- NA - df$D.end <- NA - .jstart <- "J.start" - .dstart <- "D.start" - .dend <- "D.end" - } - - ins_ok <- FALSE - if (is.na(.total.insertions)) { - .total.insertions <- "Total.insertions" - df$Total.insertions <- NA - } - - if (!(.total.insertions %in% table.colnames)) { - .total.insertions <- "Total.insertions" - df$Total.insertions <- -1 - if (!is.na(recomb_type)) { - if (recomb_type == "VJ") { - df$Total.insertions <- df[[.jstart]] - df[[.vend]] - 1 - df$Total.insertions[df$Total.insertions < 0] <- 0 - } else if (recomb_type == "VDJ") { - df$Total.insertions <- df[[.vd.insertions]] + df[[.dj.insertions]] - } - } else { - df$Total.insertions <- NA - } - } - - vec_names <- c( - .count, .prop, .nuc.seq, .aa.seq, - .vgenes, .dgenes, .jgenes, - .vend, .dstart, .dend, .jstart, - .total.insertions, .vd.insertions, .dj.insertions - ) - if (!is.na(.add[1])) { - vec_names <- c(vec_names, .add) - } - - df <- df[, vec_names] - - colnames(df)[1] <- IMMCOL$count - colnames(df)[2] <- IMMCOL$prop - colnames(df)[3] <- IMMCOL$cdr3nt - colnames(df)[4] <- IMMCOL$cdr3aa - colnames(df)[5] <- IMMCOL$v - colnames(df)[6] <- IMMCOL$d - colnames(df)[7] <- IMMCOL$j - colnames(df)[8] <- IMMCOL$ve - colnames(df)[9] <- IMMCOL$ds - colnames(df)[10] <- IMMCOL$de - colnames(df)[11] <- IMMCOL$js - colnames(df)[12] <- IMMCOL$vnj - colnames(df)[13] <- IMMCOL$vnd - colnames(df)[14] <- IMMCOL$dnj - - .postprocess(df, .mode) -} - -parse_immunoseq <- function(.filename, .mode, .wash.alleles = TRUE) { - .fix.immunoseq.genes <- function(.col) { - # fix "," - .col <- gsub(",", ", ", .col, fixed = TRUE, useBytes = TRUE) - # fix forward zeros - .col <- gsub("-([0])([0-9])", "-\\2", .col, useBytes = TRUE) - .col <- gsub("([VDJ])([0])([0-9])", "\\1\\3", .col, useBytes = TRUE) - # fix gene names - .col <- gsub("TCR", "TR", .col, fixed = TRUE, useBytes = TRUE) - .col - } - - filename <- .filename - file_cols <- list() - file_cols[[IMMCOL$count]] <- "templates" - file_cols[[IMMCOL$cdr3nt]] <- "rearrangement" - file_cols[[IMMCOL$cdr3aa]] <- "amino_acid" - file_cols[[IMMCOL$v]] <- "v_resolved" - file_cols[[IMMCOL$d]] <- "d_resolved" - file_cols[[IMMCOL$j]] <- "j_resolved" - file_cols[[IMMCOL$vnj]] <- "n1_insertions" - file_cols[[IMMCOL$vnd]] <- "n1_insertions" - file_cols[[IMMCOL$dnj]] <- "n2_insertions" - - v_index_col_name <- "v_index" - d_index_col_name <- "d_index" - j_index_col_name <- "j_index" - n1_index_col_name <- "n1_index" - n2_index_col_name <- "n2_index" - - # - # Check for the version of ImmunoSEQ files - # - f <- file(.filename, "r") - l <- readLines(f, 2) - close(f) - if (str_detect(l[[1]], "v_gene") && !str_detect(l[[1]], "v_resolved")) { - file_cols[[IMMCOL$v]] <- "v_gene" - file_cols[[IMMCOL$d]] <- "d_gene" - file_cols[[IMMCOL$j]] <- "j_gene" - } else if (str_detect(l[[1]], "MaxResolved")) { - file_cols[[IMMCOL$v]] <- "vMaxResolved" - file_cols[[IMMCOL$d]] <- "dMaxResolved" - file_cols[[IMMCOL$j]] <- "jMaxResolved" - - file_cols[[IMMCOL$vnj]] <- "n1insertion" - file_cols[[IMMCOL$vnd]] <- "n1insertion" - file_cols[[IMMCOL$dnj]] <- "n2insertion" - } - - - l_split <- strsplit(l, "\t") - if (str_detect(l[[1]], "templates")) { - if (str_detect(l[[1]], "templates/reads")) { - file_cols[[IMMCOL$count]] <- "count (templates/reads)" - file_cols[[IMMCOL$cdr3nt]] <- "nucleotide" - file_cols[[IMMCOL$cdr3aa]] <- "aminoAcid" - } else if (l_split[[2]][match("templates", l_split[[1]])] == "null") { - file_cols[[IMMCOL$count]] <- "reads" - } - } - - if (!str_detect(l[[1]], "v_index")) { - v_index_col_name <- "vindex" - d_index_col_name <- "dindex" - j_index_col_name <- "jindex" - n1_index_col_name <- "n1index" - n2_index_col_name <- "n2index" - } - - for (col_name in names(file_cols)) { - file_cols[[col_name]] <- .make_names(file_cols[[col_name]]) - } - - file_cols[[IMMCOL$prop]] <- IMMCOL$prop - file_cols[[IMMCOL$ve]] <- IMMCOL$ve - file_cols[[IMMCOL$ds]] <- IMMCOL$ds - file_cols[[IMMCOL$de]] <- IMMCOL$de - file_cols[[IMMCOL$js]] <- IMMCOL$js - file_cols[[IMMCOL$seq]] <- IMMCOL$seq - - # IO_REFACTOR - suppressMessages(df <- readr::read_delim(.filename, - col_names = TRUE, col_types = cols(), - delim = "\t", quote = "", - escape_double = FALSE, comment = "", - trim_ws = TRUE, skip = 0 - )) - # suppressMessages(df <- fread(.filename, data.table = FALSE)) - - names(df) <- tolower(names(df)) - - df[[file_cols[[IMMCOL$prop]]]] <- df[[file_cols[[IMMCOL$count]]]] / sum(df[[file_cols[[IMMCOL$count]]]]) - - # Save full nuc sequences and cut them down to CDR3 - df[[IMMCOL$seq]] <- df[[file_cols[[IMMCOL$cdr3nt]]]] - - # TODO: what if df[["v_index]] has "-1" or something like that? - df[[file_cols[[IMMCOL$cdr3nt]]]] <- stringr::str_sub(df[[IMMCOL$seq]], df[[v_index_col_name]] + 1, nchar(df[[IMMCOL$seq]])) - - df[[file_cols[[IMMCOL$v]]]] <- .fix.immunoseq.genes(df[[file_cols[[IMMCOL$v]]]]) - df[[file_cols[[IMMCOL$d]]]] <- .fix.immunoseq.genes(df[[file_cols[[IMMCOL$d]]]]) - df[[file_cols[[IMMCOL$j]]]] <- .fix.immunoseq.genes(df[[file_cols[[IMMCOL$j]]]]) - - recomb_type <- "VDJ" - if (recomb_type == "VDJ") { - df[[file_cols[[IMMCOL$ve]]]] <- df[[n1_index_col_name]] - df[[v_index_col_name]] - df[[file_cols[[IMMCOL$ds]]]] <- df[[d_index_col_name]] - df[[v_index_col_name]] - df[[file_cols[[IMMCOL$de]]]] <- df[[n2_index_col_name]] - df[[v_index_col_name]] - df[[file_cols[[IMMCOL$js]]]] <- df[[j_index_col_name]] - df[[v_index_col_name]] - file_cols[[IMMCOL$vnj]] <- IMMCOL$vnj - df[[IMMCOL$vnj]] <- -1 - } - - sample_name_vec <- NA - if ("sample_name" %in% colnames(df)) { - if (length(unique(df[["sample_name"]])) > 1) { - sample_name_vec <- df[["sample_name"]] - } - } - - df <- df[unlist(file_cols[IMMCOL$order])] - names(df) <- IMMCOL$order - - if (.wash.alleles) { - df <- .remove.alleles(df) - df[[IMMCOL$v]] <- gsub("([VDJ][0-9]*)$", "\\1-1", df[[IMMCOL$v]], useBytes = TRUE) - df[[IMMCOL$j]] <- gsub("([VDJ][0-9]*)$", "\\1-1", df[[IMMCOL$j]], useBytes = TRUE) - } - - if (nrow(df) > 0) { - if (has_class(df[[IMMCOL$vnj]], "character")) { - df[[IMMCOL$vnj]][df[[IMMCOL$vnj]] == "no data"] <- NA - } - if (has_class(df[[IMMCOL$vnd]], "character")) { - df[[IMMCOL$vnd]][df[[IMMCOL$vnd]] == "no data"] <- NA - } - if (has_class(df[[IMMCOL$dnj]], "character")) { - df[[IMMCOL$dnj]][df[[IMMCOL$dnj]] == "no data"] <- NA - } - - df[[IMMCOL$vnj]] <- as.integer(df[[IMMCOL$vnj]]) - df[[IMMCOL$vnd]] <- as.integer(df[[IMMCOL$vnd]]) - df[[IMMCOL$dnj]] <- as.integer(df[[IMMCOL$dnj]]) - } - - df[[IMMCOL$v]][df[[IMMCOL$v]] == "unresolved"] <- NA - df[[IMMCOL$d]][df[[IMMCOL$d]] == "unresolved"] <- NA - df[[IMMCOL$j]][df[[IMMCOL$j]] == "unresolved"] <- NA - - if (!is.na(sample_name_vec[1])) { - df <- lapply(split(df, sample_name_vec), .postprocess) - } else { - .postprocess(df) - } -} - -parse_mitcr <- function(.filename, .mode) { - .skip <- 0 - f <- file(.filename, "r") - l <- readLines(f, 1) - # Check for different levels of the MiTCR output - if (any(stringr::str_detect(l, c("MiTCRFullExport", "mitcr")))) { - .skip <- 1 - } - mitcr_format <- 1 - if (stringr::str_detect(l, "MiTCRFullExport") || .skip == 0) { - mitcr_format <- 2 - } - close(f) - - if (mitcr_format == 1) { - filename <- .filename - .count <- "count" - nuc.seq <- "cdr3nt" - aa.seq <- "cdr3aa" - vgenes <- "v" - jgenes <- "j" - dgenes <- "d" - vend <- "VEnd" - jstart <- "JStart" - dstart <- "DStart" - dend <- "DEnd" - vd.insertions <- NA - dj.insertions <- NA - total.insertions <- NA - .sep <- "\t" - } else { - # Check if there are barcodes - f <- file(.filename, "r") - l <- readLines(f, 1 + .skip)[.skip + 1] - barcodes <- NA - .count <- "Read count" - if ("NNNs" %in% strsplit(l, "\t", TRUE)[[1]]) { - .count <- "NNNs" - } - close(f) - - filename <- .filename - nuc.seq <- "CDR3 nucleotide sequence" - aa.seq <- "CDR3 amino acid sequence" - vgenes <- "V segments" - jgenes <- "J segments" - dgenes <- "D segments" - vend <- "Last V nucleotide position" - jstart <- "First J nucleotide position" - dstart <- "First D nucleotide position" - dend <- "Last D nucleotide position" - vd.insertions <- "VD insertions" - dj.insertions <- "DJ insertions" - total.insertions <- "Total insertions" - .sep <- "\t" - } - - parse_repertoire( - .filename = filename, .mode = .mode, .nuc.seq = nuc.seq, .aa.seq = aa.seq, .count = .count, .vgenes = vgenes, .jgenes = jgenes, .dgenes = dgenes, - .vend = vend, .jstart = jstart, .dstart = dstart, .dend = dend, - .vd.insertions = vd.insertions, .dj.insertions = dj.insertions, - .total.insertions = total.insertions, .skip = .skip, .sep = .sep - ) -} - -parse_mixcr <- function(.filename, .mode) { - fix.alleles <- function(.data) { - .data[[IMMCOL$v]] <- gsub("[*][[:digit:]]*", "", .data[[IMMCOL$v]]) - .data[[IMMCOL$d]] <- gsub("[*][[:digit:]]*", "", .data[[IMMCOL$d]]) - .data[[IMMCOL$j]] <- gsub("[*][[:digit:]]*", "", .data[[IMMCOL$j]]) - .data - } - - .filename <- .filename - .count <- "clonecount" - .sep <- "\t" - .vend <- "allvalignments" - .jstart <- "alljalignments" - .dalignments <- "alldalignments" - .vd.insertions <- "VD.insertions" - .dj.insertions <- "DJ.insertions" - .total.insertions <- "Total.insertions" - - table.colnames <- tolower(make.names(read.table(.filename, sep = .sep, skip = 0, nrows = 1, stringsAsFactors = FALSE, strip.white = TRUE, comment.char = "", quote = "")[1, ])) - table.colnames <- gsub(".", "", table.colnames, fixed = TRUE) - - # Columns of different MiXCR formats - # Clone count - Clonal sequence(s) - N. Seq. CDR3 - # cloneCount - clonalSequence - nSeqCDR3 - # cloneCount - targetSequences - nSeqImputedCDR3 - # cloneCount - targetSequences - nSeqCDR3 - if ("targetsequences" %in% table.colnames) { - if ("nseqimputedcdr3" %in% table.colnames) { - .nuc.seq <- "nseqimputedcdr3" - } else { - .nuc.seq <- "nseqcdr3" - } - - .big.seq <- "targetsequences" - } else { - .nuc.seq <- "nseqcdr3" - - if ("clonalsequences" %in% table.colnames) { - .big.seq <- "clonalsequences" - } else if ("clonalsequence" %in% table.colnames) { - .big.seq <- "clonalsequence" - } else { - .big.seq <- NA - } - } - - if (!("allvalignments" %in% table.colnames)) { - if ("allvalignment" %in% table.colnames) { - .vend <- "allvalignment" - } else { - .vend <- NA - } - } - if (!("alldalignments" %in% table.colnames)) { - if ("alldalignment" %in% table.colnames) { - .dalignments <- "alldalignment" - } else { - .dalignments <- NA - } - } - if (!("alljalignments" %in% table.colnames)) { - if ("alljalignment" %in% table.colnames) { - .jstart <- "alljalignment" - } else { - .jstart <- NA - } - } - - if ("bestvhit" %in% table.colnames) { - .vgenes <- "bestvhit" - } else if ("allvhits" %in% table.colnames) { - .vgenes <- "allvhits" - } else if ("vhits" %in% table.colnames) { - .vgenes <- "vhits" - } else if ("allvhitswithscore" %in% table.colnames) { - .vgenes <- "allvhitswithscore" - } else { - message("Error: can't find a column with V genes") - } - - if ("bestjhit" %in% table.colnames) { - .jgenes <- "bestjhit" - } else if ("alljhits" %in% table.colnames) { - .jgenes <- "alljhits" - } else if ("jhits" %in% table.colnames) { - .jgenes <- "jhits" - } else if ("alljhitswithscore" %in% table.colnames) { - .jgenes <- "alljhitswithscore" - } else { - message("Error: can't find a column with J genes") - } - - if ("bestdhit" %in% table.colnames) { - .dgenes <- "bestdhit" - } else if ("alldhits" %in% table.colnames) { - .dgenes <- "alldhits" - } else if ("dhits" %in% table.colnames) { - .dgenes <- "dhits" - } else if ("alldhitswithscore" %in% table.colnames) { - .dgenes <- "alldhitswithscore" - } else { - message("Error: can't find a column with D genes") - } - - - # IO_REFACTOR - df <- read_delim( - file = .filename, col_types = cols(), - delim = .sep, skip = 0, comment = "", - quote = "", escape_double = FALSE, trim_ws = TRUE - ) - # df <- fread(.filename, data.table = FALSE) - - # - # return NULL if there is no clonotypes in the data frame - # - if (nrow(df) == 0) { - return(NULL) - } - - names(df) <- make.names(names(df)) - names(df) <- tolower(gsub(".", "", names(df), fixed = TRUE)) - names(df) <- str_replace_all(names(df), " ", "") - - # check for VJ or VDJ recombination - # VJ / VDJ / Undeterm - recomb_type <- "Undeterm" - if (sum(substr(head(df)[[.vgenes]], 1, 4) %in% c("TCRA", "TRAV", "TRGV", "IGKV", "IGLV"))) { - recomb_type <- "VJ" - } else if (sum(substr(head(df)[[.vgenes]], 1, 4) %in% c("TCRB", "TRBV", "TRDV", "IGHV"))) { - recomb_type <- "VDJ" - } - - if (!is.na(.vend) && !is.na(.jstart)) { - .vd.insertions <- "VD.insertions" - df$VD.insertions <- -1 - if (recomb_type == "VJ") { - df$VD.insertions <- -1 - } else if (recomb_type == "VDJ") { - logic <- sapply(strsplit(df[[.dalignments]], "|", TRUE, FALSE, TRUE), length) >= 4 & - sapply(strsplit(df[[.vend]], "|", TRUE, FALSE, TRUE), length) >= 5 - df$VD.insertions[logic] <- - as.numeric(sapply(strsplit(df[[.dalignments]][logic], "|", TRUE, FALSE, TRUE), "[[", 4)) - - as.numeric(sapply(strsplit(df[[.vend]][logic], "|", TRUE, FALSE, TRUE), "[[", 5)) - 1 - } - - .dj.insertions <- "DJ.insertions" - df$DJ.insertions <- -1 - if (recomb_type == "VJ") { - df$DJ.insertions <- -1 - } else if (recomb_type == "VDJ") { - logic <- sapply(strsplit(df[[.jstart]], "|", TRUE, FALSE, TRUE), length) >= 4 & - sapply(strsplit(df[[.dalignments]], "|", TRUE, FALSE, TRUE), length) >= 5 - df$DJ.insertions[logic] <- - as.numeric(sapply(strsplit(df[[.jstart]][logic], "|", TRUE, FALSE, TRUE), "[[", 4)) - - as.numeric(sapply(strsplit(df[[.dalignments]][logic], "|", TRUE, FALSE, TRUE), "[[", 5)) - 1 - } - - # VJ.insertions - logic <- (sapply(strsplit(df[[.vend]], "|", TRUE, FALSE, TRUE), length) > 4) & (sapply(strsplit(df[[.jstart]], "|", TRUE, FALSE, TRUE), length) >= 4) - .total.insertions <- "Total.insertions" - if (recomb_type == "VJ") { - df$Total.insertions <- NA - if (length(which(logic)) > 0) { - df$Total.insertions[logic] <- - as.numeric(sapply(strsplit(df[[.jstart]][logic], "|", TRUE, FALSE, TRUE), "[[", 4)) - as.numeric(sapply(strsplit(df[[.vend]][logic], "|", TRUE, FALSE, TRUE), "[[", 5)) - 1 - } - } else if (recomb_type == "VDJ") { - df$Total.insertions <- df[[.vd.insertions]] + df[[.dj.insertions]] - } else { - df$Total.insertions <- NA - } - df$Total.insertions[df$Total.insertions < 0] <- -1 - - df$V.end <- -1 - df$J.start <- -1 - df[[.vend]] <- gsub(";", "", df[[.vend]], fixed = TRUE) - logic <- sapply(strsplit(df[[.vend]], "|", TRUE, FALSE, TRUE), length) >= 5 - df$V.end[logic] <- sapply(strsplit(df[[.vend]][logic], "|", TRUE, FALSE, TRUE), "[[", 5) - logic <- sapply(strsplit(df[[.jstart]], "|", TRUE, FALSE, TRUE), length) >= 4 - df$J.start[logic] <- sapply(strsplit(df[[.jstart]][logic], "|", TRUE, FALSE, TRUE), "[[", 4) - } else { - df$V.end <- -1 - df$J.start <- -1 - df$Total.insertions <- -1 - df$VD.insertions <- -1 - df$DJ.insertions <- -1 - - .dj.insertions <- "DJ.insertions" - .vd.insertions <- "VD.insertions" - } - - .vend <- "V.end" - .jstart <- "J.start" - - if (!is.na(.dalignments)) { - logic <- sapply(str_split(df[[.dalignments]], "|"), length) >= 5 - df$D5.end <- -1 - df$D3.end <- -1 - df$D5.end[logic] <- sapply(str_split(df[[.dalignments]][logic], "|"), "[[", 4) - df$D3.end[logic] <- sapply(str_split(df[[.dalignments]][logic], "|"), "[[", 5) - .dalignments <- c("D5.end", "D3.end") - } else { - df$D5.end <- -1 - df$D3.end <- -1 - } - - .dalignments <- c("D5.end", "D3.end") - - if (!(.count %in% table.colnames)) { - warn_msg <- c(" [!] Warning: can't find a column with clonal counts. Setting all clonal counts to 1.") - warn_msg <- c(warn_msg, "\n Did you apply repLoad to MiXCR file *_alignments.txt?") - warn_msg <- c(warn_msg, " If so please consider moving all *.clonotypes.*.txt MiXCR files to") - warn_msg <- c(warn_msg, " a separate folder and apply repLoad to the folder.") - warn_msg <- c(warn_msg, "\n Note: The *_alignments.txt file IS NOT a repertoire file suitable for any analysis.") - message(warn_msg) - - df[[.count]] <- 1 - } - .freq <- "Proportion" - df$Proportion <- df[[.count]] / sum(df[[.count]], na.rm = TRUE) - - .aa.seq <- IMMCOL$cdr3aa - df[[.aa.seq]] <- bunch_translate(df[[.nuc.seq]]) - - if (is.na(.big.seq)) { - .big.seq <- "BigSeq" - df$BigSeq <- df[[.nuc.seq]] - } - - df <- df[, make.names(c( - .count, .freq, - .nuc.seq, .aa.seq, - .vgenes, .dgenes, .jgenes, - .vend, .dalignments, .jstart, - .total.insertions, .vd.insertions, .dj.insertions, .big.seq - ))] - - colnames(df) <- IMMCOL$order - - df[[IMMCOL$v]] <- gsub("([*][[:digit:]]*)([(][[:digit:]]*[.,]*[[:digit:]]*[)])", "", df[[IMMCOL$v]]) - df[[IMMCOL$v]] <- gsub(",", ", ", df[[IMMCOL$v]]) - df[[IMMCOL$v]] <- str_replace_all(df[[IMMCOL$v]], '"', "") - df[[IMMCOL$v]] <- sapply( - strsplit(df[[IMMCOL$v]], ", ", useBytes = TRUE), - function(x) paste0(sort(unique(x)), collapse = ", ") - ) - - df[[IMMCOL$d]] <- gsub("([*][[:digit:]]*)([(][[:digit:]]*[.,]*[[:digit:]]*[)])", "", df[[IMMCOL$d]]) - df[[IMMCOL$d]] <- gsub(",", ", ", df[[IMMCOL$d]]) - df[[IMMCOL$d]] <- str_replace_all(df[[IMMCOL$d]], '"', "") - df[[IMMCOL$d]] <- sapply( - strsplit(df[[IMMCOL$d]], ", ", useBytes = TRUE), - function(x) paste0(sort(unique(x)), collapse = ", ") - ) - - df[[IMMCOL$j]] <- gsub("([*][[:digit:]]*)([(][[:digit:]]*[.,]*[[:digit:]]*[)])", "", df[[IMMCOL$j]]) - df[[IMMCOL$j]] <- gsub(",", ", ", df[[IMMCOL$j]]) - df[[IMMCOL$j]] <- str_replace_all(df[[IMMCOL$j]], '"', "") - df[[IMMCOL$j]] <- sapply( - strsplit(df[[IMMCOL$j]], ", ", useBytes = TRUE), - function(x) paste0(sort(unique(x)), collapse = ", ") - ) - - .postprocess(fix.alleles(df)) -} - -parse_migec <- function(.filename, .mode) { - filename <- .filename - nuc.seq <- "CDR3 nucleotide sequence" - aa.seq <- "CDR3 amino acid sequence" - .count <- "Good events" - vgenes <- "V segments" - jgenes <- "J segments" - dgenes <- "D segments" - vend <- "Last V nucleotide position" - jstart <- "First J nucleotide position" - dstart <- "First D nucleotide position" - dend <- "Last D nucleotide position" - vd.insertions <- "VD insertions" - dj.insertions <- "DJ insertions" - total.insertions <- "Total insertions" - .skip <- 0 - .sep <- "\t" - - parse_repertoire( - .filename = filename, .mode = .mode, .nuc.seq = nuc.seq, .aa.seq = aa.seq, .count = .count, - .vgenes = vgenes, .jgenes = jgenes, .dgenes = dgenes, - .vend = vend, .jstart = jstart, .dstart = dstart, .dend = dend, - .vd.insertions = vd.insertions, .dj.insertions = dj.insertions, - .total.insertions = total.insertions, .skip = .skip, .sep = .sep - ) -} - -parse_migmap <- function(.filename, .mode) { - filename <- .filename - nuc.seq <- "cdr3nt" - aa.seq <- "cdr3aa" - .count <- "count" - vgenes <- "v" - jgenes <- "j" - dgenes <- "d" - vend <- "v.end.in.cdr3" - jstart <- "j.start.in.cdr3" - dstart <- "d.start.in.cdr3" - dend <- "d.end.in.cdr3" - vd.insertions <- NA - dj.insertions <- NA - total.insertions <- NA - .skip <- 0 - .sep <- "\t" - - parse_repertoire( - .filename = filename, .mode = .mode, .nuc.seq = nuc.seq, .aa.seq = aa.seq, .count = .count, .vgenes = vgenes, .jgenes = jgenes, .dgenes = dgenes, - .vend = vend, .jstart = jstart, .dstart = dstart, .dend = dend, - .vd.insertions = vd.insertions, .dj.insertions = dj.insertions, - .total.insertions = total.insertions, .skip = .skip, .sep = .sep - ) -} - -parse_tcr <- function(.filename, .mode) { - f <- file(.filename, "r") - l <- readLines(f, 2)[2] - close(f) - - nuc.seq <- "CDR3.nucleotide.sequence" - aa.seq <- "CDR3.amino.acid.sequence" - .count <- "Read.count" - vgenes <- "V.gene" - jgenes <- "J.gene" - dgenes <- "D.gene" - vend <- "V.end" - jstart <- "J.start" - dstart <- "D5.end" - dend <- "D3.end" - vd.insertions <- "VD.insertions" - dj.insertions <- "DJ.insertions" - total.insertions <- "Total.insertions" - .skip <- 0 - .sep <- "\t" - - if (substr(l, 1, 2) != "NA") { - .count <- "Umi.count" - } - - parse_repertoire( - .filename = .filename, .mode = .mode, .nuc.seq = nuc.seq, .aa.seq = aa.seq, .count = .count, .vgenes = vgenes, .jgenes = jgenes, .dgenes = dgenes, - .vend = vend, .jstart = jstart, .dstart = dstart, .dend = dend, - .vd.insertions = vd.insertions, .dj.insertions = dj.insertions, - .total.insertions = total.insertions, .skip = .skip, .sep = .sep - ) -} - -parse_vdjtools <- function(.filename, .mode) { - skip <- 0 - - # Check for different VDJtools outputs - f <- file(.filename, "r") - l <- readLines(f, 1) - close(f) - - .skip <- 0 - .count <- "count" - filename <- .filename - nuc.seq <- "cdr3nt" - aa.seq <- "CDR3aa" - vgenes <- "V" - jgenes <- "J" - dgenes <- "D" - vend <- "Vend" - jstart <- "Jstart" - dstart <- "Dstart" - dend <- "Dend" - vd.insertions <- NA - dj.insertions <- NA - total.insertions <- NA - .sep <- "\t" - - if (length(strsplit(l, "-", TRUE)) > 0) { - if (length(strsplit(l, "-", TRUE)[[1]]) == 3) { - if (strsplit(l, "-", TRUE)[[1]][2] == "header") { - .count <- "count" - .skip <- 1 - } - } else if (tolower(substr(l, 1, 2)) == "#s") { - .count <- "#Seq. count" - nuc.seq <- "N Sequence" - aa.seq <- "AA Sequence" - vgenes <- "V segments" - jgenes <- "J segments" - dgenes <- "D segment" - vend <- NA - jstart <- NA - dstart <- NA - dend <- NA - } else if (stringr::str_detect(l, "#")) { - .count <- "X.count" - } else { - .count <- "count" - } - } - - parse_repertoire( - .filename = filename, .mode = .mode, .nuc.seq = nuc.seq, .aa.seq = aa.seq, .count = .count, .vgenes = vgenes, .jgenes = jgenes, .dgenes = dgenes, - .vend = vend, .jstart = jstart, .dstart = dstart, .dend = dend, - .vd.insertions = vd.insertions, .dj.insertions = dj.insertions, - .total.insertions = total.insertions, .skip = .skip, .sep = .sep - ) -} - -parse_imgt <- function(.filename, .mode) { - .fix.imgt.alleles <- function(.col) { - sapply(strsplit(.col, " "), function(x) { - if (length(x) > 1) { - x[[2]] - } else { - NA - } - }) - } - - f <- file(.filename, "r") - l <- readLines(f, 2)[2] - close(f) - - nuc.seq <- "JUNCTION" - aa.seq <- NA - .count <- NA - vgenes <- "V-GENE and allele" - jgenes <- "J-GENE and allele" - dgenes <- "D-GENE and allele" - vend <- "3'V-REGION end" - jstart <- "5'J-REGION start" - dstart <- "D-REGION start" - dend <- "D-REGION end" - vd.insertions <- NA - dj.insertions <- NA - total.insertions <- NA - .skip <- 0 - .sep <- "\t" - junc_start <- .make_names("JUNCTION start") - - df <- parse_repertoire( - .filename = .filename, .mode = .mode, .nuc.seq = nuc.seq, .aa.seq = aa.seq, .count = .count, .vgenes = vgenes, .jgenes = jgenes, .dgenes = dgenes, - .vend = vend, .jstart = jstart, .dstart = dstart, .dend = dend, - .vd.insertions = vd.insertions, .dj.insertions = dj.insertions, - .total.insertions = total.insertions, .skip = .skip, .sep = .sep, .add = junc_start - ) - - df[[IMMCOL$ve]] <- df[[IMMCOL$ve]] - df[[junc_start]] - df[[IMMCOL$ds]] <- df[[IMMCOL$ds]] - df[[junc_start]] - df[[IMMCOL$de]] <- df[[IMMCOL$de]] - df[[junc_start]] - df[[IMMCOL$js]] <- df[[IMMCOL$js]] - df[[junc_start]] - - df[[IMMCOL$ve]][df[[IMMCOL$ve]] < 0] <- NA - df[[IMMCOL$ds]][df[[IMMCOL$ds]] < 0] <- NA - df[[IMMCOL$de]][df[[IMMCOL$de]] < 0] <- NA - df[[IMMCOL$js]][df[[IMMCOL$js]] < 0] <- NA - - df[[IMMCOL$v]] <- .fix.imgt.alleles(df[[IMMCOL$v]]) - df[[IMMCOL$d]] <- .fix.imgt.alleles(df[[IMMCOL$d]]) - df[[IMMCOL$j]] <- .fix.imgt.alleles(df[[IMMCOL$j]]) - - df[[junc_start]] <- NULL - - df -} - -# parse_vidjil <- function (.filename) { -# stop(IMMUNR_ERROR_NOT_IMPL) -# } -# -# parse_rtcr <- function (.filename) { -# stop(IMMUNR_ERROR_NOT_IMPL) -# } -# -# parse_imseq <- function (.filename) { -# stop(IMMUNR_ERROR_NOT_IMPL) -# } - -parse_airr <- function(.filename, .mode) { - df <- airr::read_rearrangement(.filename) - - df <- df %>% - select( - sequence, v_call, d_call, j_call, junction, junction_aa, - contains("v_germline_end"), contains("d_germline_start"), contains("d_germline_end"), - contains("j_germline_start"), contains("np1_length"), contains("np2_length"), - contains("duplicate_count") - ) - - namekey <- c( - duplicate_count = IMMCOL$count, junction = IMMCOL$cdr3nt, junction_aa = IMMCOL$cdr3aa, - v_call = IMMCOL$v, d_call = IMMCOL$d, j_call = IMMCOL$j, v_germline_end = IMMCOL$ve, - d_germline_start = IMMCOL$ds, d_germline_end = IMMCOL$de, j_germline_start = IMMCOL$js, - np1_length = "unidins", np2_length = IMMCOL$dnj, sequence = IMMCOL$seq - ) - - names(df) <- namekey[names(df)] - - if (!("unidins" %in% colnames(df))) { - df["unidins"] <- NA - } - - recomb_type <- .which_recomb_type(df[[IMMCOL$v]]) - - if (!is.na(recomb_type)) { - if (recomb_type == "VJ") { - df[IMMCOL$vnj] <- df["unidins"] - df[IMMCOL$vnd] <- NA - df[IMMCOL$dnj] <- NA - } else if (recomb_type == "VDJ") { - df[IMMCOL$vnj] <- NA - df[IMMCOL$vnd] <- df["unidins"] - } - } - - for (column in IMMCOL$order) { - if (!(column %in% colnames(df))) { - df[column] <- NA - } - } - - df <- df[IMMCOL$order] - total <- sum(df$Clones) - df[IMMCOL$prop] <- df[IMMCOL$count] / total - df[IMMCOL$seq] <- stringr::str_remove_all(df[[IMMCOL$seq]], "N") - df <- .postprocess(df) - df -} - -parse_immunarch <- function(.filename, .mode) { - df <- readr::read_tsv(.filename, col_types = cols(), comment = "#") - if (ncol(df) == 1) { - # "," in the files, parse differently then - df <- readr::read_csv(.filename, col_types = cols(), comment = "#") - } - df <- .postprocess(df) - df -} - -parse_10x_consensus <- function(.filename, .mode) { - df <- parse_repertoire(.filename, - .mode = .mode, - .nuc.seq = "cdr3_nt", .aa.seq = NA, .count = "umis", - .vgenes = "v_gene", .jgenes = "j_gene", .dgenes = "d_gene", - .vend = NA, .jstart = NA, .dstart = NA, .dend = NA, - .vd.insertions = NA, .dj.insertions = NA, .total.insertions = NA, - .skip = 0, .sep = ",", .add = c("chain", "clonotype_id", "consensus_id") - ) - setnames(df, "clonotype_id", "ClonotypeID") - setnames(df, "consensus_id", "ConsensusID") - df -} - -parse_10x_filt_contigs <- function(.filename, .mode) { - df <- parse_repertoire(.filename, - .mode = .mode, - .nuc.seq = "cdr3_nt", .aa.seq = NA, .count = "umis", - .vgenes = "v_gene", .jgenes = "j_gene", .dgenes = "d_gene", - .vend = NA, .jstart = NA, .dstart = NA, .dend = NA, - .vd.insertions = NA, .dj.insertions = NA, .total.insertions = NA, - .skip = 0, .sep = ",", # .add = c("chain", "raw_clonotype_id", "raw_consensus_id", "barcode", "contig_id") - .add = c("chain", "barcode", "raw_clonotype_id", "contig_id") - ) - # setnames(df, "raw_clonotype_id", "RawClonotypeID") - # setnames(df, "raw_consensus_id", "RawConsensusID") - - # Process 10xGenomics filtered contigs files - count barcodes, merge consensues ids, clonotype ids and contig ids - df <- df[order(df$chain),] - setDT(df) - - if (.mode == "paired") { - df <- df %>% - lazy_dt() %>% - group_by(barcode, raw_clonotype_id) %>% - summarise( - CDR3.nt = paste0(CDR3.nt, collapse = IMMCOL_ADD$scsep), - CDR3.aa = paste0(CDR3.aa, collapse = IMMCOL_ADD$scsep), - V.name = paste0(V.name, collapse = IMMCOL_ADD$scsep), - J.name = paste0(J.name, collapse = IMMCOL_ADD$scsep), - D.name = paste0(D.name, collapse = IMMCOL_ADD$scsep), - chain = paste0(chain, collapse = IMMCOL_ADD$scsep), - # raw_clonotype_id = gsub("clonotype", "", paste0(raw_clonotype_id, collapse = IMMCOL_ADD$scsep)), - # raw_consensus_id = gsub("clonotype|consensus", "", paste0(raw_consensus_id, collapse = IMMCOL_ADD$scsep)), - contig_id = gsub("_contig_", "", paste0(contig_id, collapse = IMMCOL_ADD$scsep)) - ) %>% - as.data.table() - } - df <- df %>% - lazy_dt() %>% - group_by(CDR3.nt, V.name, J.name) %>% - summarise( - Clones = length(unique(barcode)), - CDR3.aa = first(CDR3.aa), - D.name = first(D.name), - chain = first(chain), - barcode = paste0(unique(barcode), collapse = IMMCOL_ADD$scsep), - raw_clonotype_id = gsub("clonotype|None", "", paste0(unique(raw_clonotype_id), collapse = IMMCOL_ADD$scsep)), - # raw_clonotype_id = gsub("clonotype", "", paste0(raw_clonotype_id, collapse = IMMCOL_ADD$scsep)), - # raw_consensus_id = gsub("clonotype|consensus", "", paste0(raw_consensus_id, collapse = IMMCOL_ADD$scsep)), - contig_id = paste0(contig_id, collapse = IMMCOL_ADD$scsep) - ) %>% - as.data.table() - - df$V.end <- NA - df$J.start <- NA - df$D.end <- NA - df$D.start <- NA - df$VD.ins <- NA - df$DJ.ins <- NA - df$VJ.ins <- NA - df$Sequence <- df$CDR3.nt - - setnames(df, "contig_id", "ContigID") - setnames(df, "barcode", "Barcode") - - df[[IMMCOL$prop]] <- df[[IMMCOL$count]] / sum(df[[IMMCOL$count]]) - setcolorder(df, IMMCOL$order) - - setDF(df) - - .postprocess(df) -} - -parse_archer <- function(.filename, .mode) { - parse_repertoire(.filename, - .mode = .mode, - .nuc.seq = "Clonotype Sequence", .aa.seq = NA, .count = "Clone Abundance", - .vgenes = "Predicted V Region", .jgenes = "Predicted J Region", .dgenes = "Predicted D Region", - .vend = NA, .jstart = NA, .dstart = NA, .dend = NA, - .vd.insertions = NA, .dj.insertions = NA, .total.insertions = NA, - .skip = 0, .sep = "\t" - ) -} - -##### Savers ##### - -save_immunarch <- function(.data, .path, .compress = TRUE) { - if (.compress) { - filepath <- gzfile(paste0(.path, ".tsv.gz"), compression = 9) - } else { - filepath <- paste0(.path, ".tsv") - } - readr::write_lines(paste0("# Exported from immunarch ", packageVersion("immunarch"), " https://immunarch.com"), - path = filepath - ) - filepath <- gzfile(paste0(.path, ".tsv.gz"), compression = 9) - readr::write_tsv(x = .data, path = filepath, append = TRUE, col_names = TRUE) -} - -save_vdjtools <- function(.data, .path, .compress = TRUE) { - if (.compress) { - filepath <- gzfile(paste0(.path, ".tsv.gz"), compression = 9) - } else { - filepath <- paste0(.path, ".tsv") - } - - old <- c( - IMMCOL$count, - IMMCOL$prop, - IMMCOL$cdr3nt, - IMMCOL$cdr3aa, - IMMCOL$v, - IMMCOL$d, - IMMCOL$j - ) - - new <- c( - "#Seq. Count", - "Percent", - "N Sequence", - "AA Sequence", - "V Segments", - "D Segment", - "J Segments" - ) - - names(.data) <- plyr::mapvalues(names(.data), - from = old, - to = as.character(new) - ) - - readr::write_tsv(x = .data, path = filepath) -} diff --git a/R/kmers.R b/R/kmers.R index 0313fd6e..e796d62a 100644 --- a/R/kmers.R +++ b/R/kmers.R @@ -1,7 +1,7 @@ -#' Calculate the kmer tatistics of immune repertoires +#' Calculate the kmer statistics of immune repertoires #' #' @importFrom data.table data.table -#' @importFrom dplyr tbl_df +#' @importFrom dplyr as_tibble #' #' @concept kmers #' @@ -51,7 +51,7 @@ getKmers <- function(.data, .k, .col = c("aa", "nt"), .coding = TRUE) { res <- split_to_kmers(collect(select(.data, seq_col), n = Inf)[[1]], .k = .k) } - add_class(tbl_df(as.data.frame(res)), "immunr_kmer_table") + add_class(as_tibble(as.data.frame(res)), "immunr_kmer_table") } diff --git a/R/overlap.R b/R/overlap.R index 3d83c273..10334703 100644 --- a/R/overlap.R +++ b/R/overlap.R @@ -47,7 +47,7 @@ #' @param .n.steps Something. Skipped if ".step" is a numeric vector. #' #' @param .downsample If TRUE then perform downsampling to N clonotypes at each step instead of choosing the -#' top N clonotypes. +#' top N clonotypes in incremental overlaps. Change nothing for conventional methods. #' #' @param .bootstrap Pass NA to turn off any bootstrapping, pass a number to perform bootstrapping with this number of tries. #' @@ -76,9 +76,11 @@ #' with their counts are summed up. #' #' @return -#' In most cases the return value is a matrix with overlap values. +#' In most cases the return value is a matrix with overlap values for each pair of repertoires. +#' +#' If only two repertoires were provided, return value is single numeric value. #' -#' If only two repertoires were provided, +#' If one of the incremental method is chosen, return list of overlap matrix. #' #' @seealso \link{inc_overlap}, \link{vis} #' @@ -106,7 +108,7 @@ repOverlap <- function(.data, .bootstrap = NA, .verbose.inc = NA, .force.matrix = FALSE) { - assertthat::assert_that(has_class(.data, "list")) + .validate_repertoires_data(.data) .method <- .method[1] @@ -362,7 +364,6 @@ horn_index <- function(.x, .y) { #' data(immdata) #' ov <- repOverlap(immdata$data, "inc+overlap", .step = 100, .verbose.inc = FALSE, .verbose = FALSE) #' vis(ov) -#' #' @export inc_overlap inc_overlap <- function(.data, .fun, .step = 1000, .n.steps = 10, .downsample = FALSE, .bootstrap = NA, .verbose.inc = TRUE, ...) { .n.steps <- as.integer(.n.steps) diff --git a/R/public.R b/R/public.R index 338e46a5..3124227e 100644 --- a/R/public.R +++ b/R/public.R @@ -46,10 +46,12 @@ #' @examples #' # Subset the data to make the example faster to run #' immdata$data <- lapply(immdata$data, head, 2000) -#' pr <- pubRep(immdata$data, .verbose=FALSE) +#' pr <- pubRep(immdata$data, .verbose = FALSE) #' vis(pr, "clonotypes", 1, 2) #' @export pubRep publicRepertoire pubRep <- function(.data, .col = "aa+v", .quant = c("count", "prop"), .coding = TRUE, .min.samples = 1, .max.samples = NA, .verbose = TRUE) { + .validate_repertoires_data(.data) + .preprocess <- function(dt) { if (has_class(dt, "data.table")) { dt <- dt %>% lazy_dt() @@ -61,8 +63,6 @@ pubRep <- function(.data, .col = "aa+v", .quant = c("count", "prop"), .coding = dt[, sum(get(.quant)), by = .col] } - assertthat::assert_that(has_class(.data, "list")) - .col <- sapply(unlist(strsplit(.col, split = "\\+")), switch_type, USE.NAMES = FALSE) .quant <- .quant_column_choice(.quant[1]) @@ -94,8 +94,6 @@ pubRep <- function(.data, .col = "aa+v", .quant = c("count", "prop"), .coding = add_pb(pb) } - # res = res %>% ungroup() - # Add #samples column after .col res[["Samples"]] <- rowSums(!is.na(as.matrix(res[, (length(.col) + 1):ncol(res), with = FALSE]))) if (is.na(.max.samples)) { @@ -133,7 +131,7 @@ publicRepertoire <- pubRep #' @examples #' data(immdata) #' immdata$data <- lapply(immdata$data, head, 2000) -#' pr <- pubRep(immdata$data, .verbose=FALSE) +#' pr <- pubRep(immdata$data, .verbose = FALSE) #' pr.mat <- public_matrix(pr) #' dim(pr.mat) #' head(pr.mat) @@ -141,7 +139,10 @@ publicRepertoire <- pubRep public_matrix <- function(.data) { sample_i <- match("Samples", colnames(.data)) + 1 max_col <- dim(.data)[2] - .data %>% dplyr::select(sample_i:max_col) %>% collect(n = Inf) %>% as.matrix() + .data %>% + dplyr::select(sample_i:max_col) %>% + collect(n = Inf) %>% + as.matrix() } @@ -180,7 +181,7 @@ get_public_repertoire_names <- function(.pr) { #' @examples #' data(immdata) #' immdata$data <- lapply(immdata$data, head, 2000) -#' pr <- pubRep(immdata$data, .verbose=FALSE) +#' pr <- pubRep(immdata$data, .verbose = FALSE) #' pr1 <- pubRepFilter(pr, immdata$meta, .by = c(Status = "MS")) #' head(pr1) #' @export pubRepFilter publicRepertoireFilter @@ -212,10 +213,16 @@ pubRepFilter <- function(.pr, .meta, .by, .min.samples = 1) { sample_i <- match("Samples", colnames(.pr)) indices <- c(1:(match("Samples", colnames(.pr))), match(samples_of_interest, colnames(.pr))) - new.pr <- .pr %>% lazy_dt() %>% dplyr::select(indices) %>% as.data.table() + new.pr <- .pr %>% + lazy_dt() %>% + dplyr::select(indices) %>% + as.data.table() new.pr[["Samples"]] <- rowSums(!is.na(as.matrix(new.pr[, (sample_i + 1):ncol(new.pr), with = FALSE]))) - new.pr <- new.pr %>% lazy_dt() %>% dplyr::filter(Samples >= .min.samples) %>% as.data.table() + new.pr <- new.pr %>% + lazy_dt() %>% + dplyr::filter(Samples >= .min.samples) %>% + as.data.table() new.pr } @@ -243,7 +250,7 @@ publicRepertoireFilter <- pubRepFilter #' @examples #' data(immdata) #' immdata$data <- lapply(immdata$data, head, 2000) -#' pr <- pubRep(immdata$data, .verbose=FALSE) +#' pr <- pubRep(immdata$data, .verbose = FALSE) #' pr1 <- pubRepFilter(pr, immdata$meta, .by = c(Status = "MS")) #' pr2 <- pubRepFilter(pr, immdata$meta, .by = c(Status = "C")) #' prapp <- pubRepApply(pr1, pr2) @@ -252,12 +259,10 @@ publicRepertoireFilter <- pubRepFilter pubRepApply <- function(.pr1, .pr2, .fun = function(x) log10(x[1]) / log10(x[2])) { col_before_samples <- names(.pr1)[1:(match("Samples", colnames(.pr1)) - 1)] - # tmp = apply(public_matrix(.pr1), 1, .inner.fun) tmp <- rowMeans(public_matrix(.pr1), na.rm = TRUE) .pr1[, (match("Samples", colnames(.pr1)) + 1):ncol(.pr1)] <- NULL .pr1[["Quant"]] <- tmp - # tmp = apply(public_matrix(.pr2), 1, .inner.fun) tmp <- rowMeans(public_matrix(.pr2), na.rm = TRUE) .pr2[, (match("Samples", colnames(.pr2)) + 1):ncol(.pr2)] <- NULL .pr2[["Quant"]] <- tmp @@ -293,7 +298,7 @@ publicRepertoireApply <- pubRepApply #' @examples #' data(immdata) #' immdata$data <- lapply(immdata$data, head, 2000) -#' pr <- pubRep(immdata$data, .verbose=FALSE) +#' pr <- pubRep(immdata$data, .verbose = FALSE) #' pubRepStatistics(pr) %>% vis() #' @export pubRepStatistics pubRepStatistics <- function(.data, .by = NA, .meta = NA) { @@ -310,8 +315,12 @@ pubRepStatistics <- function(.data, .by = NA, .meta = NA) { melted_pr <- melted_pr %>% group_by(CDR3.aa) %>% mutate(Group = paste0(variable, collapse = "&")) %>% - filter(Samples > 1) - melted_pr <- as_tibble(table(melted_pr$Group), .name_repair = function(x) c("Group", "Count")) + filter(Samples > 1) %>% + as_tibble() + + group_tab <- table(melted_pr$Group) + + melted_pr <- as_tibble(group_tab, .name_repair = function(x) c("Group", "Count")) # melted_pr = bind_cols(melted_pr, Samples = sapply(melted_pr$Group, function (s) stringr::str_count(s, ";") + 1)) %>% # arrange(Count) %>% # mutate(Group = factor(Group, levels=rev(Group), ordered=TRUE)) diff --git a/R/shiny.R b/R/shiny.R index 3e6eb626..3f60077a 100644 --- a/R/shiny.R +++ b/R/shiny.R @@ -91,15 +91,12 @@ fixVis <- function(.plot = NA) { ui <- fluidPage( theme = shinytheme("cosmo"), titlePanel("FixVis: make your plots publication-ready already!"), - sidebarLayout( sidebarPanel( downloadButton("save_plot", "Save"), actionButton("console_plot", "Plot to R console"), - br(), br(), - tabsetPanel( tabPanel( "General", @@ -127,7 +124,6 @@ fixVis <- function(.plot = NA) { ) ) ), - tabPanel( "Title & subtitle", br(), @@ -153,7 +149,6 @@ fixVis <- function(.plot = NA) { list(Plain = "plain", Bold = "bold", Italic = "italic", "Bold Italic" = "bold.italic") ) ), - tabPanel( "Subtitle", br(), @@ -179,7 +174,6 @@ fixVis <- function(.plot = NA) { ) ) ), - tabPanel( "Legends", br(), @@ -237,7 +231,6 @@ fixVis <- function(.plot = NA) { ) ) ), - tabPanel( "X axis", br(), @@ -265,7 +258,6 @@ fixVis <- function(.plot = NA) { list(Plain = "plain", Bold = "bold", Italic = "italic", "Bold Italic" = "bold.italic") ) ), - tabPanel( "X ticks", br(), @@ -288,7 +280,6 @@ fixVis <- function(.plot = NA) { ) ) ), - tabPanel( "Y axis", br(), @@ -316,7 +307,6 @@ fixVis <- function(.plot = NA) { list(Plain = "plain", Bold = "bold", Italic = "italic", "Bold Italic" = "bold.italic") ) ), - tabPanel( "Y ticks", br(), @@ -341,7 +331,6 @@ fixVis <- function(.plot = NA) { ) ) ), - mainPanel( uiOutput("main_plot", style = "position:fixed;") ) @@ -403,7 +392,6 @@ fixVis <- function(.plot = NA) { angle = .get("title_angle"), face = .get("title_face") ), - label.theme = element_text( size = .get("text_size"), hjust = .get("text_hjust"), diff --git a/R/singlecell.R b/R/singlecell.R index 3822d77b..e6cf101c 100644 --- a/R/singlecell.R +++ b/R/singlecell.R @@ -53,8 +53,8 @@ if (getRversion() >= "2.15.1") { #' @export select_barcodes <- function(.data, .barcodes, .force.list = FALSE) { if (has_class(.data, "list")) { - stop('Please apply the function to a repertoire instead of a list of repertoires. - In order to update both the data and metadata, consider using the select_clusters() function.') + stop("Please apply the function to a repertoire instead of a list of repertoires. + In order to update both the data and metadata, consider using the select_clusters() function.") } if (!("Barcode" %in% colnames(.data))) { stop('No "Barcode" column in the input data. Did you apply the function to a single-cell repertoire?') @@ -78,7 +78,7 @@ select_barcodes <- function(.data, .barcodes, .force.list = FALSE) { } # Un-nest the data table - df <- df[, c(strsplit(Barcode, IMMCOL_ADD$scsep, useBytes = TRUE, fixed=TRUE), .SD), by = .(Barcode)] + df <- df[, c(strsplit(Barcode, IMMCOL_ADD$scsep, useBytes = TRUE, fixed = TRUE), .SD), by = .(Barcode)] df[[IMMCOL$count]] <- 1 df[, Barcode := NULL] setnames(df, "", "Barcode") @@ -154,7 +154,7 @@ select_clusters <- function(.data, .clusters, .field = "Cluster") { } else if (!("data" %in% names(.data) && "meta" %in% names(.data))) { stop("Your list is missing one of the elements required. Please provide a list with both 'data' and 'meta' elements.") - } else if (!all(sapply(.data$data, function (x) "Barcode" %in% colnames(x)))) { + } else if (!all(sapply(.data$data, function(x) "Barcode" %in% colnames(x)))) { stop('No "Barcode" column in the input data. Did you apply the function to a single-cell repertoire?') } diff --git a/R/spectratyping.R b/R/spectratyping.R index 8da058f4..bc4eec27 100644 --- a/R/spectratyping.R +++ b/R/spectratyping.R @@ -30,7 +30,7 @@ #' @examples #' # Load the data #' data(immdata) -#' sp <- spectratype(immdata$data[[1]], .col="aa+v") +#' sp <- spectratype(immdata$data[[1]], .col = "aa+v") #' vis(sp) #' @export spectratype spectratype <- function(.data, .quant = c("id", "count"), .col = "nt") { diff --git a/R/tools.R b/R/tools.R index 5dcc36f9..c22f28cf 100644 --- a/R/tools.R +++ b/R/tools.R @@ -18,7 +18,7 @@ #' #' @return Numeric vector. #' -#' @examples +#' @section Developer Examples: #' immunarch:::check_distribution(c(1, 2, 3)) #' immunarch:::check_distribution(c(1, 2, 3), TRUE) #' immunarch:::check_distribution(c(1, 2, 3), FALSE) @@ -75,7 +75,7 @@ check_distribution <- function(.data, .do.norm = NA, .laplace = 1, .na.val = 0, #' @return #' Input object with additional class \code{.class}. #' -#' @examples +#' @section Developer Examples: #' tmp <- "abc" #' class(tmp) #' tmp <- immunarch:::add_class(tmp, "new_class") @@ -99,7 +99,7 @@ add_class <- function(.obj, .class) { #' @return #' Logical value. #' -#' @examples +#' @section Developer Examples: #' tmp <- "abc" #' immunarch:::has_class(tmp, "new_class") #' tmp <- immunarch:::add_class(tmp, "new_class") @@ -129,7 +129,7 @@ has_class <- function(.data, .class) { #' @return #' An updated progress bar. #' -#' @examples +#' @section Developer Examples: #' pb <- immunarch:::set_pb(100) #' immunarch:::add_pb(pb, 25) #' immunarch:::add_pb(pb, 25) @@ -155,7 +155,7 @@ add_pb <- function(.pb, .value = 1) { #' @return #' A string with the column name. #' -#' @examples +#' @section Developer Examples: #' immunarch:::.quant_column_choice("count") #' immunarch:::.quant_column_choice("freq") .quant_column_choice <- function(x, .verbose = TRUE) { @@ -185,7 +185,7 @@ add_pb <- function(.pb, .value = 1) { #' @return #' Matrix with its upper tri part copied to the lower tri part. #' -#' @examples +#' @section Developer Examples: #' mat <- matrix(0, 3, 3) #' mat #' mat[1, 3] <- 1 @@ -276,7 +276,7 @@ check_group_names <- function(.meta, .by) { #' @return #' Character vector with group names. #' -#' @examples +#' @section Developer Examples: #' immunarch:::group_from_metadata("Status", data.frame(Status = c("A", "A", "B", "B", "C"))) group_from_metadata <- function(.by, .metadata, .sep = "; ") { if (length(.by) == 1) { @@ -374,8 +374,7 @@ apply_symm <- function(.datalist, .fun, ..., .diag = NA, .verbose = TRUE) { for (j in i:length(.datalist)) { if (i == j && is.na(.diag)) { res[i, j] <- NA - } - else { + } else { res[i, j] <- .fun(.datalist[[i]], .datalist[[j]], ...) } if (.verbose) add_pb(pb) @@ -394,8 +393,7 @@ apply_asymm <- function(.datalist, .fun, ..., .diag = NA, .verbose = TRUE) { for (j in 1:length(.datalist)) { if (i == j && is.na(.diag)) { res[i, j] <- NA - } - else { + } else { res[i, j] <- .fun(.datalist[[i]], .datalist[[j]], ...) } if (.verbose) add_pb(pb) @@ -431,7 +429,7 @@ apply_asymm <- function(.datalist, .fun, ..., .diag = NA, .verbose = TRUE) { #' @return #' A column's name. #' -#' @examples +#' @section Developer Examples: #' immunarch:::switch_type("nuc") #' immunarch:::switch_type("v") switch_type <- function(type) { @@ -476,3 +474,11 @@ load_segments <- function(.path, .alias, .gene_df = GENE_SEGMENTS, .filter = NA) segm <- segm[order(segm$gene, segm$allele_id), ] new_gs <- rbind(segm, .gene_df[.gene_df$alias != .alias, ]) } + +as_numeric_or_fail <- function(.string) { + result <- as.numeric(.string) + if (is.na(result)) { + stop(paste0("\"", .string, "\" is not a valid number.")) + } + return(result) +} diff --git a/R/vis.R b/R/vis.R index 36cb2fd6..55daabc5 100644 --- a/R/vis.R +++ b/R/vis.R @@ -50,16 +50,14 @@ if (getRversion() >= "2.15.1") { palette_name <- "" if (.n == 1) { palette_name <- "Set2" - } - else if (.n == 2) { + } else if (.n == 2) { palette_name <- "Set1" } # else if (.n < 4) { palette_name = "YlGnBu" } # else if (.n < 6) { palette_name = "RdBu" } else if (.n < 12) { palette_name <- "Spectral" - } - else { + } else { return(scale_fill_hue()) } @@ -70,16 +68,14 @@ if (getRversion() >= "2.15.1") { palette_name <- "" if (.n == 1) { palette_name <- "Set2" - } - else if (.n == 2) { + } else if (.n == 2) { palette_name <- "Set1" } # else if (.n < 4) { palette_name = "YlGnBu" } # else if (.n < 6) { palette_name = "RdBu" } else if (.n < 12) { palette_name <- "Spectral" - } - else { + } else { return(scale_colour_hue()) } @@ -95,8 +91,7 @@ theme_cleveland2 <- function(rotate = TRUE) { linetype = "dashed" ) ) - } - else { + } else { theme( panel.grid.major.x = element_line( colour = "grey70", @@ -371,7 +366,8 @@ vis_heatmap <- function(.data, .text = TRUE, .scientific = FALSE, .signif.digits m[, 1] <- factor(m[, 1], levels = rev(rownames(.data))) m[, 2] <- factor(m[, 2], levels = colnames(.data)) - m$label <- format(m$value, scientific = .scientific, digits = .signif.digits) + format <- if (.scientific) "g" else "fg" + m$label <- formatC(m$value, format = format, digits = .signif.digits) p <- ggplot(m, aes(x = variable, y = name, fill = value)) p <- p + geom_tile(aes(fill = value), colour = "white") @@ -413,9 +409,12 @@ vis_heatmap <- function(.data, .text = TRUE, .scientific = FALSE, .signif.digits #' #' @param .data Input matrix. Column names and row names (if presented) will be used as names for labs. #' -#' @param .title The text for the plot's title (same as the "main" argument in \link[pheatmap]{pheatmap}). +#' @param .meta A metadata object. An R dataframe with sample names and their properties, +#' such as age, serostatus or hla. #' -#' @param .labs A character vector of length two with names for x-axis and y-axis, respectively. +#' @param .by Pass NA if you want to plot samples without grouping. +#' +#' @param .title The text for the plot's title (same as the "main" argument in \link[pheatmap]{pheatmap}). #' #' @param .color A vector specifying the colors (same as the "color" argument in \link[pheatmap]{pheatmap}). #' Pass NA to use the default pheatmap colors. @@ -432,14 +431,22 @@ vis_heatmap <- function(.data, .text = TRUE, .scientific = FALSE, .signif.digits #' ov <- repOverlap(immdata$data) #' vis_heatmap2(ov) #' @export -vis_heatmap2 <- function(.data, .title = NA, .labs = NA, .color = colorRampPalette(c("#67001f", "#d6604d", "#f7f7f7", "#4393c3", "#053061"))(1024), ...) { - args <- list() +vis_heatmap2 <- function(.data, .meta = NA, .by = NA, .title = NA, .color = colorRampPalette(c("#67001f", "#d6604d", "#f7f7f7", "#4393c3", "#053061"))(1024), ...) { + args <- list(...) + if (!is.na(.by)[1]) { + if (!is.na(.meta)[1]) { + args[["annotation_col"]] <- .meta %>% + tibble::column_to_rownames(var = "Sample") %>% + dplyr::select(tidyselect::any_of(.by)) + } + } + args[["mat"]] <- .data args[["main"]] <- .title if (!is.na(.color)[1]) { args[["color"]] <- .color } - do.call(pheatmap, c(args, ...)) + do.call(pheatmap, args) } @@ -630,9 +637,7 @@ vis.immunr_inc_overlap <- function(.data, .target = 1, .grid = FALSE, .ncol = 2, Value.min = quantile(Overlap, 0.025, na.rm = TRUE), Value.max = quantile(Overlap, 0.975, na.rm = TRUE) ) - } - - else { + } else { sample_names <- colnames(.data[[1]]) .data <- lapply(.data, function(mat) replace(mat, lower.tri(mat, TRUE), NA)) @@ -792,7 +797,8 @@ vis_public_frequencies <- function(.data, .by = NA, .meta = NA, } else if (.type == "mean") { melted_pr <- melted_pr %>% group_by(Sequence, Samples) %>% - summarise(Value = mean(value, na.rm = TRUE)) + summarise(Value = mean(value, na.rm = TRUE)) %>% + as.data.table() p <- ggplot() + geom_jitter(aes(x = Value, y = Samples), data = melted_pr) + @@ -1232,8 +1238,7 @@ vis_hist <- function(.data, .by = NA, .meta = NA, .title = "Gene usage", .ncol = } return(p + .add.layer) - } - else { + } else { if (.grid) { if (is.na(.legend)) { .legend <- FALSE @@ -1268,7 +1273,6 @@ vis_hist <- function(.data, .by = NA, .meta = NA, .title = "Gene usage", .ncol = p <- do.call(wrap_plots, c(ps, ncol = .ncol)) p <- p + plot_annotation(title = .title) return(p) - } else { res <- split(res, res$Sample) ps <- list() @@ -1861,6 +1865,10 @@ vis_bar_stacked <- function(.data, .by = NA, .meta = NA, #' data(immdata) #' clp <- repClonality(immdata$data, "clonal.prop") #' vis(clp) +#' +#' hom <- repClonality(immdata$data, "homeo") +#' # Remove p values and points from the plot +#' vis(hom, .by = "Status", .meta = immdata$meta, .test = FALSE, .points = FALSE) #' @export vis.immunr_clonal_prop <- function(.data, .by = NA, .meta = NA, .errorbars = c(0.025, 0.975), .errorbars.off = FALSE, .points = TRUE, .test = TRUE, .signif.label.size = 3.5, ...) { # ToDo: this and other repClonality and repDiversity functions doesn't work on a single repertoire. Fix it @@ -2099,7 +2107,7 @@ vis_bar <- function(.data, .by = NA, .meta = NA, .errorbars = c(0.025, 0.975), . if (.stack) { p <- ggplot() + geom_col(aes(x = Group, y = Value.mean, fill = Grouping.var), - position = "stack", data = .data_proc, col = "black" + position = "stack", data = .data_proc, col = "black" ) } else { p <- ggplot(aes(x = Grouping.var, y = Value, color = Group, fill = Group, group = Group), data = .data) + @@ -2121,7 +2129,7 @@ vis_bar <- function(.data, .by = NA, .meta = NA, .errorbars = c(0.025, 0.975), . if (!.errorbars.off) { p <- p + geom_errorbar(aes(x = Grouping.var, y = Value.mean, ymin = Value.min, ymax = Value.max, color = Group), - color = "black", data = .data_proc, width = .errorbar.width, position = position_dodge(.9) + color = "black", data = .data_proc, width = .errorbar.width, position = position_dodge(.9) ) } @@ -2640,7 +2648,11 @@ vis.immunr_exp_clones <- function(.data, .by = NA, .meta = NA, #' p1 + p2 #' @export vis.immunr_kmer_table <- function(.data, .head = 100, .position = c("stack", "dodge", "fill"), .log = FALSE, ...) { - .position <- switch(substr(.position[1], 1, 1), s = "stack", d = "dodge", f = "fill") + .position <- switch(substr(.position[1], 1, 1), + s = "stack", + d = "dodge", + f = "fill" + ) # .data[is.na(.data)] <- 0 max_counts <- apply(.data[, -1], 1, max, na.rm = TRUE) diff --git a/README.md b/README.md index 09d5ee20..2c6c7c41 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ [![Follow](https://img.shields.io/twitter/follow/immunomind.svg?style=social)](https://twitter.com/intent/follow?screen_name=immunomind) [![CRAN](http://www.r-pkg.org/badges/version-ago/immunarch?style=flat-square)](https://cran.r-project.org/package=immunarch) -[![Downloads_all](http://cranlogs.r-pkg.org/badges/grand-total/immunarch)](http://www.r-pkg.org/pkg/immunarch) -[![Downloads_week](http://cranlogs.r-pkg.org/badges/last-week/immunarch)](http://www.r-pkg.org/pkg/immunarch) -[![Issues](https://img.shields.io/github/issues/immunomind/immunarch?style=flat-square)](http://github.com/immunomind/immunarch/issues) +[![Downloads_all](http://cranlogs.r-pkg.org/badges/grand-total/immunarch)](https://www.r-pkg.org/pkg/immunarch) +[![Downloads_week](http://cranlogs.r-pkg.org/badges/last-week/immunarch)](https://www.r-pkg.org/pkg/immunarch) +[![Issues](https://img.shields.io/github/issues/immunomind/immunarch?style=flat-square)](https://github.com/immunomind/immunarch/issues) [![CI](https://gitlab.com/immunomind/immunarch/badges/master/pipeline.svg?style=flat-square)](https://gitlab.com/immunomind/immunarch/-/jobs) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3367200.svg)](https://doi.org/10.5281/zenodo.3367200) ![Visitors](https://visitor-badge.glitch.me/badge?page_id=immunomind.immunarch) -[![Downloads_all](http://cranlogs.r-pkg.org/badges/grand-total/tcR)](http://www.r-pkg.org/pkg/tcR) -[![Downloads_week](http://cranlogs.r-pkg.org/badges/last-week/tcR)](http://www.r-pkg.org/pkg/tcR) +[![Downloads_all](http://cranlogs.r-pkg.org/badges/grand-total/tcR)](https://www.r-pkg.org/pkg/tcR) +[![Downloads_week](http://cranlogs.r-pkg.org/badges/last-week/tcR)](https://www.r-pkg.org/pkg/tcR) # `immunarch` --- Fast and Seamless Exploration of Single-cell and Bulk T-cell/Antibody Immune Repertoires in R @@ -32,7 +32,7 @@ repDiversity(immdata$data) %>% vis(.by = "Status", .meta = immdata$meta) # ### From Berkeley with devotion -`immunarch` is brought to you by [ImmunoMind](https://immunomind.io) --- a [UC Berkeley SkyDeck](https://www.forbes.com/sites/avivalegatt/2019/01/07/launch-your-startup-at-these-five-college-incubators/) startup. ImmunoMind Data Science tools for single-cell and immunomics [exploration](https://immunarch.com) and [biomarker discovery](https://immunomind.io) are trusted by researchers from top pharma companies and universities, including 10X Genomics, Pfizer, Regeneron, UCSF, MIT, Stanford, John Hopkins School of Medicine and Vanderbilt University. +`immunarch` is brought to you by [ImmunoMind](https://immunomind.io) --- a [UC Berkeley SkyDeck](https://www.forbes.com/sites/avivalegatt/2019/01/07/launch-your-startup-at-these-five-college-incubators/) startup. ImmunoMind improves the design of adoptive T-cell therapies such as CAR-T by precisely identifying T-cell subpopulations and their immune profile. ImmunoMind's tools are trusted by researchers from top pharma companies and universities, including 10X Genomics, Pfizer, Regeneron, UCSF, MIT, Stanford, John Hopkins School of Medicine and Vanderbilt University. [![Follow](https://img.shields.io/twitter/follow/immunomind.svg?style=social)](https://twitter.com/intent/follow?screen_name=immunomind) @@ -110,21 +110,21 @@ devtools::install_github("immunomind/immunarch", ref="dev") You can find the list of releases of `immunarch` here: https://github.com/immunomind/immunarch/releases -## Features +## Key Features -1. Fast and easy manipulation of immune repertoire data: +1. Data agnostic. Fast and easy manipulation of immune repertoire data: + The package automatically detects the format of your files---no more guessing what format is *that* file, just pass them to the package; - + Supports all popular TCR and BCR analysis and post-analysis formats, including single-cell data: [ImmunoSEQ](https://www.adaptivebiotech.com/products-services/immunoseq/), [IMGT](http://www.imgt.org/IMGTindex/IMGTHighV-QUEST.php), [MiTCR](https://github.com/milaboratory/mitcr/), [MiXCR](https://milaboratory.com/software/mixcr/), [MiGEC](https://milaboratory.com/software/migec/), [MigMap](https://github.com/mikessh/migmap), [VDJtools](https://milaboratory.com/software/vdjtools/), [tcR](https://github.com/imminfo/tcr), [AIRR](http://docs.airr-community.org/en/latest/), [10XGenomics](https://support.10xgenomics.com/single-cell-vdj/datasets/), [ArcherDX](https://archerdx.com/immunology/). More coming in the future; + + Supports all popular TCR and BCR analysis and post-analysis formats, including single-cell data: [ImmunoSEQ](https://www.immunoseq.com/), [IMGT](http://www.imgt.org/IMGTindex/IMGTHighV-QUEST.php), [MiTCR](https://github.com/milaboratory/mitcr), [MiXCR](https://github.com/milaboratory/mixcr), [MiGEC](https://github.com/mikessh/migec), [MigMap](https://github.com/mikessh/migmap), [VDJtools](https://github.com/mikessh/vdjtools), [tcR](https://github.com/imminfo/tcr), [AIRR](http://docs.airr-community.org/en/latest/), [10XGenomics](https://www.10xgenomics.com/resources/datasets?menu%5Bproducts.name%5D=Single+Cell+Immune+Profiling), ArcherDX. More coming in the future; + Works on any data source you are comfortable with: R data frames, data tables from [data.table](https://rdatatable.gitlab.io/data.table/), databases like [MonetDB](https://github.com/MonetDB), Apache Spark data frames via [sparklyr](https://spark.rstudio.com/); + Tutorial is available [here](https://immunarch.com/articles/v2_data.html). -2. Immune repertoire analysis made simple: +2. Beginner-friendly. Immune repertoire analysis made simple: - + Most methods are incorporated in a couple of main functions with clear naming---no more remembering tens and tens of functions with obscure names. For details see [link](https://immunarch.com/articles/v3_basic_analysis.html); + + Most methods are incorporated in a couple of main functions with clear naming---no more remembering tens and tens of functions with obscure names. For details see [link](https://immunarch.com/articles/web_only/v3_basic_analysis.html); + Repertoire overlap analysis *(common indices including overlap coefficient, Jaccard index and Morisita's overlap index)*. Tutorial is available [here](https://immunarch.com/articles/web_only/v4_overlap.html); @@ -138,7 +138,7 @@ You can find the list of releases of `immunarch` here: https://github.com/immuno + Coming in the next releases: CDR3 amino acid physical and chemical properties assessment, mutation networks. -3. Publication-ready plots with a built-in tool for visualisation manipulation: +3. Seamless publication-ready plots with a built-in tool for visualisation manipulation: + Rich visualisation procedures with [ggplot2](https://ggplot2.tidyverse.org/); @@ -203,9 +203,9 @@ Have an aspiration to help the community build the ecosystem of scRNAseq & AIRR We are always open to contributions. There are two ways to contribute: -1. Create an issue [here](http://github.com/immunomind/immunarch/issues) and describe what would you like to improve or discuss. +1. Create an issue [here](https://github.com/immunomind/immunarch/issues) and describe what would you like to improve or discuss. -2. Create an issue or find one [here](http://github.com/immunomind/immunarch/issues), fork the repository and make a pull request with the bugfix or improvement. +2. Create an issue or find one [here](https://github.com/immunomind/immunarch/issues), fork the repository and make a pull request with the bugfix or improvement. ## Citation diff --git a/_pkgdown.yml b/_pkgdown.yml index 116be35a..8943bf01 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,5 +1,4 @@ url: https://immunarch.com - template: params: bootswatch: cosmo @@ -7,11 +6,13 @@ template: docsearch: api_key: '94207562f34b455c0796790d104e3549' index_name: 'immunarch' - authors: ImmunoMind: href: https://immunomind.io - + Vadim I. Nazarov: + href: https://www.linkedin.com/in/vdnaz + Vasily O. Tsvetkov: + href: https://www.linkedin.com/in/vasily-tsvetkov-227218ab articles: - title: All tutorials desc: ~ @@ -20,6 +21,8 @@ articles: - '`v2_data`' - '`web_only/v21_singlecell`' - '`web_only/v3_basic_analysis`' + - '`web_only/load_mixcr`' + - '`web_only/load_10x`' - '`web_only/v4_overlap`' - '`web_only/v5_gene_usage`' - '`web_only/v6_diversity`' @@ -27,10 +30,9 @@ articles: - '`web_only/v8_tracking`' - '`web_only/v9_kmers`' - '`web_only/v11_db`' - navbar: structure: - left: [covid19, intro, articles, reference] + left: [articles, reference, covid19] right: [im_link, twitter, github] components: home: ~ @@ -38,50 +40,46 @@ navbar: covid19: text: "COVID-19" href: https://github.com/immunomind/covid19 - intro: - text: "Get started" - href: articles/v1_introduction.html reference: text: Reference href: reference/index.html articles: - text: Tutorials + text: Start Here menu: - - text: First steps - - text: Introduction to immunarch + - text: Installation & troubleshooting href: articles/v1_introduction.html - - text: Working with data in immunarch + - text: '---' + - text: First steps + - text: Data loading href: articles/v2_data.html - - text: Basic analysis and clonality - href: articles/web_only/v3_basic_analysis.html - - text: Single-cell and paired chain data + - text: 'How-to: Loading MiXCR Data' + href: articles/web_only/load_mixcr.html + - text: 'How-to: Loading 10x Genomics Data' + href: articles/web_only/load_10x.html + - text: 'How-to: Single-cell and paired chain data' href: articles/web_only/v21_singlecell.html - + - text: Basic statistics and clonality + href: articles/web_only/v3_basic_analysis.html - text: '---' - - - text: Explore and compare repertoires + - text: Repertoire-level exploration and comparison - text: Repertoire overlap and public clonotypes href: articles/web_only/v4_overlap.html - - text: Gene usage analysis + - text: Gene usage href: articles/web_only/v5_gene_usage.html - text: Diversity estimation href: articles/web_only/v6_diversity.html + - text: '---' + - text: Clonotype-level exploration and comparison + - text: Track clonotypes across samples and time + href: articles/web_only/v8_tracking.html - text: Annotate clonotypes using immune receptor databases href: articles/web_only/v11_db.html - + - text: Kmer and sequence motif analysis and visualisation + href: articles/web_only/v9_kmers.html - text: '---' - - text: Preparing to publication - text: Make your plots publication-ready with fixVis href: articles/web_only/v7_fixvis.html - - - text: '---' - - - text: Advanced methods - - text: Tracking clonotypes across time points - href: articles/web_only/v8_tracking.html - - text: Kmer and sequence motif analysis and visualisation - href: articles/web_only/v9_kmers.html twitter: icon: fa-lg fa-twitter href: http://twitter.com/immunomind @@ -91,7 +89,6 @@ navbar: github: icon: fa-github href: https://github.com/immunomind/immunarch - reference: - title: Bulk and single-cell data - subtitle: Loading and saving any data @@ -104,7 +101,6 @@ reference: - subtitle: Preprocessing contents: - has_concept("preprocessing") - - title: Basic immune repertoire statistics - subtitle: Exploratory data analysis contents: @@ -112,8 +108,6 @@ reference: - subtitle: Clonality analysis contents: - has_concept("clonality") - - - title: Clonotype annotation and dynamics - subtitle: Clonotype annotation desc: Annotate clonotypes in immune repertoires using external immune receptor databases (VDJDB, McPAS and PIRD). @@ -123,7 +117,6 @@ reference: desc: Track the differences in clonotype abundances over time. contents: - has_concept("dynamics") - - title: Compare repertoires - subtitle: Repertoire diversity analysis contents: @@ -139,7 +132,6 @@ reference: - subtitle: Public repertoire contents: - has_concept("pubrep") - - title: Advanced immune repertoire analysis - subtitle: Kmers analysis contents: @@ -148,7 +140,6 @@ reference: desc: Advanced methods for post-analysis of gene usage, overlap and other statistics. contents: - has_concept("post_analysis") - - title: Visualisations - subtitle: General visualisation functions desc: Functions for visualisations of different data types. For analysis-specific visualisation see related sections. @@ -158,7 +149,6 @@ reference: - subtitle: Publication-ready plots contents: - has_concept("fixvis") - - title: Utilities - subtitle: Data contents: @@ -169,4 +159,3 @@ reference: - subtitle: Internal utility functions contents: - has_concept("utility_private") - diff --git a/fixVis/app.R b/fixVis/app.R index 56a72861..c085fc1c 100644 --- a/fixVis/app.R +++ b/fixVis/app.R @@ -11,36 +11,48 @@ library(shiny) library(shinythemes) library(ggplot2) -make_legend_tab <- function (.label, .name, .title.text, .is.title) { - - .full <- function (.l) { +make_legend_tab <- function(.label, .name, .title.text, .is.title) { + .full <- function(.l) { stringr::str_c("legend", .label, .l, sep = "_") } - objs = list(title = .name) + objs <- list(title = .name) - objs = c(objs, list(br())) + objs <- c(objs, list(br())) if (.is.title) { - objs = c(objs, list( splitLayout(cellWidths = c("60%", "40%"), - checkboxInput(.full("remove"), "Remove the legend"), - checkboxInput(.full("contin"), "Continuous?")), - sliderInput(.full("ncol"), "Number of columns:", - min = 1, max = 40, value = 2, step=1), - br(), - textInput(.full("text"), "Title text:", .title.text, placeholder = "Samples"))) + objs <- c(objs, list( + splitLayout( + cellWidths = c("60%", "40%"), + checkboxInput(.full("remove"), "Remove the legend"), + checkboxInput(.full("contin"), "Continuous?") + ), + sliderInput(.full("ncol"), "Number of columns:", + min = 1, max = 40, value = 2, step = 1 + ), + br(), + textInput(.full("text"), "Title text:", .title.text, placeholder = "Samples") + )) } - objs = c(objs, list(sliderInput(.full("size"), "Text size:", - min = 1, max = 40, value = ifelse(.is.title, 16, 11), step=.5), - sliderInput(.full("hjust"), "Text horizontal adjustment:", - min = 0, max = 1, value = 0, step=.05), - sliderInput(.full("vjust"), "Text vertical adjustment:", - min = -4, max = 4, value = .5, step=.25), - sliderInput(.full("angle"), "Text angle:", - min = 0, max = 90, value = 0, step=1), - selectInput(.full("face"), "Face:", - list(Plain = "plain", Bold = "bold", Italic = "italic", "Bold Italic" = "bold.italic"))) ) + objs <- c(objs, list( + sliderInput(.full("size"), "Text size:", + min = 1, max = 40, value = ifelse(.is.title, 16, 11), step = .5 + ), + sliderInput(.full("hjust"), "Text horizontal adjustment:", + min = 0, max = 1, value = 0, step = .05 + ), + sliderInput(.full("vjust"), "Text vertical adjustment:", + min = -4, max = 4, value = .5, step = .25 + ), + sliderInput(.full("angle"), "Text angle:", + min = 0, max = 90, value = 0, step = 1 + ), + selectInput( + .full("face"), "Face:", + list(Plain = "plain", Bold = "bold", Italic = "italic", "Bold Italic" = "bold.italic") + ) + )) do.call(tabPanel, objs) } @@ -49,160 +61,244 @@ make_legend_tab <- function (.label, .name, .title.text, .is.title) { ui <- fluidPage( theme = shinytheme("cosmo"), titlePanel("FixVis: make your plots publication-ready already!"), - sidebarLayout( sidebarPanel( downloadButton("save_plot", "Save"), actionButton("console_plot", "Plot to R console"), - br(), br(), - tabsetPanel( - tabPanel("General", - br(), - textOutput("save_text"), - br(), - # textOutput("save_text2"), - # br(), - sliderInput("plot_width", "Plot width (in):", min = 2, max = 24, value = 10), - sliderInput("plot_height", "Plot height (in):", min = 2, max = 20, value = 6), - checkboxInput("coord_flip", "Flip coordinates"), - # checkboxInput("do_interactive", "Interactive plot"), - selectInput("ggplot_theme", "Theme", - list("Linedraw", - "Black-white" , - "Grey / gray", - "Light", - "Dark", - "Minimal", - "Classic"))), - - tabPanel("Title & subtitle", - br(), - tabsetPanel(tabPanel("Title", - br(), - textInput("title_text", "Title text:", "Diamonds dataset visualisation", placeholder = "Gene usage"), - sliderInput("title_text_size", "Title text size:", - min = 1, max = 40, value = 25, step=.5), - sliderInput("title_text_hjust", "Title text horizontal adjustment:", - min = 0, max = 1, value = 0, step=.05), - sliderInput("title_text_vjust", "Title text vertical adjustment:", - min = -4, max = 4, value = .5, step=.25), - sliderInput("title_text_angle", "Title text angle:", - min = 0, max = 90, value = 0, step=1), - selectInput("title_face", "Face:", - list(Plain = "plain", Bold = "bold", Italic = "italic", "Bold Italic" = "bold.italic"))), - - tabPanel("Subtitle", - br(), - textAreaInput("subtitle_text", "Subtitle text:", "Load it via data(dataset)", - placeholder = "Frequency of Variable gene segments presented in the input samples"), - sliderInput("subtitle_text_size", "Subtitle text size:", - min = 1, max = 40, value = 16, step=.5), - sliderInput("subtitle_text_hjust", "Subtitle text horizontal adjustment:", - min = 0, max = 1, value = 0, step=.05), - sliderInput("subtitle_text_vjust", "Subtitle text vertical adjustment:", - min = -4, max = 4, value = .5, step=.25), - sliderInput("subtitle_text_angle", "Subtitle text angle:", - min = 0, max = 90, value = 0, step=1), - selectInput("subtitle_face", "Face:", - list(Plain = "plain", Bold = "bold", Italic = "italic", "Bold Italic" = "bold.italic")))) + tabPanel( + "General", + br(), + textOutput("save_text"), + br(), + # textOutput("save_text2"), + # br(), + sliderInput("plot_width", "Plot width (in):", min = 2, max = 24, value = 10), + sliderInput("plot_height", "Plot height (in):", min = 2, max = 20, value = 6), + checkboxInput("coord_flip", "Flip coordinates"), + # checkboxInput("do_interactive", "Interactive plot"), + selectInput( + "ggplot_theme", "Theme", + list( + "Linedraw", + "Black-white", + "Grey / gray", + "Light", + "Dark", + "Minimal", + "Classic" + ) + ) ), - - tabPanel("Legends", - br(), - selectInput("legend_position", "Legend position", - list("right", - "top" , - "bottom", - "left")), - selectInput("legend_box", "Legend arrangement", - list("vertical", - "horizontal")), - tabsetPanel(tabPanel("Color", - tabsetPanel(make_legend_tab("col_title", "Title (color)", "Colour", T), - make_legend_tab("col_text", "Labels (color)", "", F))), - tabPanel("Fill", - tabsetPanel(make_legend_tab("fill_title", "Title (fill)", "Cut", T), - make_legend_tab("fill_text", "Labels (fill)", "", F))), - tabPanel("Size", - tabsetPanel(make_legend_tab("size_title", "Title (size)", "Clarity", T), - make_legend_tab("size_text", "Labels (size)", "", F))), - tabPanel("Shape", - tabsetPanel(make_legend_tab("shape_title", "Title (shape)", "Cut", T), - make_legend_tab("shape_text", "Labels (shape)", "", F))), - tabPanel("Linetype", - tabsetPanel(make_legend_tab("linetype_title", "Title (linetype)", "Linetype", T), - make_legend_tab("linetype_text", "Labels (linetype)", "", F))) - ) + tabPanel( + "Title & subtitle", + br(), + tabsetPanel( + tabPanel( + "Title", + br(), + textInput("title_text", "Title text:", "Diamonds dataset visualisation", placeholder = "Gene usage"), + sliderInput("title_text_size", "Title text size:", + min = 1, max = 40, value = 25, step = .5 + ), + sliderInput("title_text_hjust", "Title text horizontal adjustment:", + min = 0, max = 1, value = 0, step = .05 + ), + sliderInput("title_text_vjust", "Title text vertical adjustment:", + min = -4, max = 4, value = .5, step = .25 + ), + sliderInput("title_text_angle", "Title text angle:", + min = 0, max = 90, value = 0, step = 1 + ), + selectInput( + "title_face", "Face:", + list(Plain = "plain", Bold = "bold", Italic = "italic", "Bold Italic" = "bold.italic") + ) + ), + tabPanel( + "Subtitle", + br(), + textAreaInput("subtitle_text", "Subtitle text:", "Load it via data(dataset)", + placeholder = "Frequency of Variable gene segments presented in the input samples" + ), + sliderInput("subtitle_text_size", "Subtitle text size:", + min = 1, max = 40, value = 16, step = .5 + ), + sliderInput("subtitle_text_hjust", "Subtitle text horizontal adjustment:", + min = 0, max = 1, value = 0, step = .05 + ), + sliderInput("subtitle_text_vjust", "Subtitle text vertical adjustment:", + min = -4, max = 4, value = .5, step = .25 + ), + sliderInput("subtitle_text_angle", "Subtitle text angle:", + min = 0, max = 90, value = 0, step = 1 + ), + selectInput( + "subtitle_face", "Face:", + list(Plain = "plain", Bold = "bold", Italic = "italic", "Bold Italic" = "bold.italic") + ) + ) + ) ), - - tabPanel("X axis", - br(), - tabsetPanel(tabPanel("X title", - br(), - textInput("x_text", "X axis label:", "Carat", placeholder = "V genes"), - checkboxInput("apply_x2y", "Apply X axis settings to Y axis"), - br(), - sliderInput("x_title_size", "X axis title text size:", - min = 1, max = 40, value = 16, step=.5), - sliderInput("x_title_hjust", "X axis title text horizontal adjustment:", - min = 0, max = 1, value = 0.5, step=.05), - sliderInput("x_title_vjust", "X axis title text vertical adjustment:", - min = -4, max = 4, value = .5, step=.25), - sliderInput("x_title_angle", "X axis title text angle:", - min = 0, max = 90, value = 0, step=1), - selectInput("x_title_face", "Face:", - list(Plain = "plain", Bold = "bold", Italic = "italic", "Bold Italic" = "bold.italic"))), - - tabPanel("X ticks", - br(), - sliderInput("x_text_size", "X axis text size:", - min = 1, max = 40, value = 11, step=.5), - sliderInput("x_text_hjust", "X axis text horizontal adjustment:", - min = -2, max = 2, value = .5, step=.1), - sliderInput("x_text_vjust", "X axis text vertical adjustment:", - min = -4, max = 4, value = .5, step=.25), - sliderInput("x_text_angle", "X axis text angle:", - min = 0, max = 90, value = 0, step=1), - selectInput("x_text_face", "Face:", - list(Plain = "plain", Bold = "bold", Italic = "italic", "Bold Italic" = "bold.italic")))) + tabPanel( + "Legends", + br(), + selectInput( + "legend_position", "Legend position", + list( + "right", + "top", + "bottom", + "left" + ) + ), + selectInput( + "legend_box", "Legend arrangement", + list( + "vertical", + "horizontal" + ) + ), + tabsetPanel( + tabPanel( + "Color", + tabsetPanel( + make_legend_tab("col_title", "Title (color)", "Colour", T), + make_legend_tab("col_text", "Labels (color)", "", F) + ) + ), + tabPanel( + "Fill", + tabsetPanel( + make_legend_tab("fill_title", "Title (fill)", "Cut", T), + make_legend_tab("fill_text", "Labels (fill)", "", F) + ) + ), + tabPanel( + "Size", + tabsetPanel( + make_legend_tab("size_title", "Title (size)", "Clarity", T), + make_legend_tab("size_text", "Labels (size)", "", F) + ) + ), + tabPanel( + "Shape", + tabsetPanel( + make_legend_tab("shape_title", "Title (shape)", "Cut", T), + make_legend_tab("shape_text", "Labels (shape)", "", F) + ) + ), + tabPanel( + "Linetype", + tabsetPanel( + make_legend_tab("linetype_title", "Title (linetype)", "Linetype", T), + make_legend_tab("linetype_text", "Labels (linetype)", "", F) + ) + ) + ) ), - - tabPanel("Y axis", - br(), - tabsetPanel(tabPanel("Y title", - br(), - textInput("y_text", "Y axis label:", "Price", placeholder = "Gene frequency"), - checkboxInput("apply_y2x", "Apply Y axis settings to X axis"), - br(), - sliderInput("y_title_size", "Y axis title text size:", - min = 1, max = 40, value = 16, step=.5), - sliderInput("y_title_hjust", "Y axis title text horizontal adjustment:", - min = 0, max = 1, value = 0.5, step=.05), - sliderInput("y_title_vjust", "Y axis title text vertical adjustment:", - min = -4, max = 4, value = .5, step=.25), - sliderInput("y_title_angle", "Y axis title text angle:", - min = 0, max = 90, value = 90, step=1), - selectInput("y_title_face", "Face:", - list(Plain = "plain", Bold = "bold", Italic = "italic", "Bold Italic" = "bold.italic"))), - - tabPanel("Y ticks", - br(), - sliderInput("y_text_size", "Y axis text size:", - min = 1, max = 40, value = 11, step=.5), - sliderInput("y_text_hjust", "Y axis text horizontal adjustment:", - min = -2, max = 2, value = .5, step=.1), - sliderInput("y_text_vjust", "Y axis text vertical adjustment:", - min = -4, max = 4, value = .5, step=.25), - sliderInput("y_text_angle", "Y axis text angle:", - min = 0, max = 90, value = 0, step=1), - selectInput("y_text_face", "Face:", - list(Plain = "plain", Bold = "bold", Italic = "italic", "Bold Italic" = "bold.italic")))) + tabPanel( + "X axis", + br(), + tabsetPanel( + tabPanel( + "X title", + br(), + textInput("x_text", "X axis label:", "Carat", placeholder = "V genes"), + checkboxInput("apply_x2y", "Apply X axis settings to Y axis"), + br(), + sliderInput("x_title_size", "X axis title text size:", + min = 1, max = 40, value = 16, step = .5 + ), + sliderInput("x_title_hjust", "X axis title text horizontal adjustment:", + min = 0, max = 1, value = 0.5, step = .05 + ), + sliderInput("x_title_vjust", "X axis title text vertical adjustment:", + min = -4, max = 4, value = .5, step = .25 + ), + sliderInput("x_title_angle", "X axis title text angle:", + min = 0, max = 90, value = 0, step = 1 + ), + selectInput( + "x_title_face", "Face:", + list(Plain = "plain", Bold = "bold", Italic = "italic", "Bold Italic" = "bold.italic") + ) + ), + tabPanel( + "X ticks", + br(), + sliderInput("x_text_size", "X axis text size:", + min = 1, max = 40, value = 11, step = .5 + ), + sliderInput("x_text_hjust", "X axis text horizontal adjustment:", + min = -2, max = 2, value = .5, step = .1 + ), + sliderInput("x_text_vjust", "X axis text vertical adjustment:", + min = -4, max = 4, value = .5, step = .25 + ), + sliderInput("x_text_angle", "X axis text angle:", + min = 0, max = 90, value = 0, step = 1 + ), + selectInput( + "x_text_face", "Face:", + list(Plain = "plain", Bold = "bold", Italic = "italic", "Bold Italic" = "bold.italic") + ) + ) + ) + ), + tabPanel( + "Y axis", + br(), + tabsetPanel( + tabPanel( + "Y title", + br(), + textInput("y_text", "Y axis label:", "Price", placeholder = "Gene frequency"), + checkboxInput("apply_y2x", "Apply Y axis settings to X axis"), + br(), + sliderInput("y_title_size", "Y axis title text size:", + min = 1, max = 40, value = 16, step = .5 + ), + sliderInput("y_title_hjust", "Y axis title text horizontal adjustment:", + min = 0, max = 1, value = 0.5, step = .05 + ), + sliderInput("y_title_vjust", "Y axis title text vertical adjustment:", + min = -4, max = 4, value = .5, step = .25 + ), + sliderInput("y_title_angle", "Y axis title text angle:", + min = 0, max = 90, value = 90, step = 1 + ), + selectInput( + "y_title_face", "Face:", + list(Plain = "plain", Bold = "bold", Italic = "italic", "Bold Italic" = "bold.italic") + ) + ), + tabPanel( + "Y ticks", + br(), + sliderInput("y_text_size", "Y axis text size:", + min = 1, max = 40, value = 11, step = .5 + ), + sliderInput("y_text_hjust", "Y axis text horizontal adjustment:", + min = -2, max = 2, value = .5, step = .1 + ), + sliderInput("y_text_vjust", "Y axis text vertical adjustment:", + min = -4, max = 4, value = .5, step = .25 + ), + sliderInput("y_text_angle", "Y axis text angle:", + min = 0, max = 90, value = 0, step = 1 + ), + selectInput( + "y_text_face", "Face:", + list(Plain = "plain", Bold = "bold", Italic = "italic", "Bold Italic" = "bold.italic") + ) + ) + ) ) - )), - + ) + ), mainPanel( uiOutput("main_plot", style = "position:fixed;") ) @@ -211,25 +307,25 @@ ui <- fluidPage( # Define server logic required to draw a histogram server <- function(input, output, session) { - data("diamonds") - .plot <- qplot(x = carat, y = price, fill = cut, shape = cut, color = color, size = clarity, data=diamonds[sample.int(nrow(diamonds), 5000),]) + theme_classic() + .plot <- qplot(x = carat, y = price, fill = cut, shape = cut, color = color, size = clarity, data = diamonds[sample.int(nrow(diamonds), 5000), ]) + theme_classic() - create_plot <- function (input) { + create_plot <- function(input) { # TODO: make automatic detection of available themes from ggplot2 and other packages - choose_theme <- function (theme_label) { - switch (theme_label, - Linedraw = theme_linedraw(), - `Black-white` = theme_bw(), - `Grey / gray` = theme_gray(), - `Light` = theme_light(), - `Dark` = theme_dark(), - `Minimal` = theme_minimal(), - `Classic` = theme_classic()) + choose_theme <- function(theme_label) { + switch(theme_label, + Linedraw = theme_linedraw(), + `Black-white` = theme_bw(), + `Grey / gray` = theme_gray(), + `Light` = theme_light(), + `Dark` = theme_dark(), + `Minimal` = theme_minimal(), + `Classic` = theme_classic() + ) } - check_empty_str <- function (.str) { + check_empty_str <- function(.str) { if (.str == "" || .str == "\n" || .str == "\t") { NULL } else { @@ -237,106 +333,129 @@ server <- function(input, output, session) { } } - get_legend_params <- function (.input, .label) { - - .get <- function (.l) { + get_legend_params <- function(.input, .label) { + .get <- function(.l) { .input[[stringr::str_c("legend", .label, .l, sep = "_")]] } - .remove = .get("title_remove") + .remove <- .get("title_remove") if (.remove) { - F # return F to pay respects (please remove it in the next release) + F # return F to pay respects (please remove it in the next release) } else { - - guide_fun = guide_legend + guide_fun <- guide_legend if (.get("title_contin")) { - guide_fun = guide_colorbar + guide_fun <- guide_colorbar } guide_fun( title = .get("title_text"), ncol = .get("title_ncol"), - title.theme = element_text(size = .get("title_size"), - hjust = .get("title_hjust"), - vjust = .get("title_vjust"), - angle = .get("title_angle"), - face = .get("title_face")), - - label.theme = element_text(size = .get("text_size"), - hjust = .get("text_hjust"), - vjust = .get("text_vjust"), - angle = .get("text_angle"), - face = .get("text_face")) + title.theme = element_text( + size = .get("title_size"), + hjust = .get("title_hjust"), + vjust = .get("title_vjust"), + angle = .get("title_angle"), + face = .get("title_face") + ), + label.theme = element_text( + size = .get("text_size"), + hjust = .get("text_hjust"), + vjust = .get("text_vjust"), + angle = .get("text_angle"), + face = .get("text_face") + ) ) } } - .plot = .plot + + .plot <- .plot + choose_theme(input$ggplot_theme) + - labs(x = check_empty_str(input$x_text), - y = check_empty_str(input$y_text), - title = check_empty_str(input$title_text), - subtitle = check_empty_str(input$subtitle_text), - fill = input$legend_text, - color = input$legend_text) + - guides(col = get_legend_params(input, "col"), - fill = get_legend_params(input, "fill"), - size = get_legend_params(input, "size"), - shape = get_legend_params(input, "shape"), - linetype = get_legend_params(input, "linetype")) + - theme(plot.title = element_text(size=input$title_text_size, - hjust=input$title_text_hjust, - vjust=input$title_text_vjust, - angle=input$title_text_angle, - face = input$title_face), - plot.subtitle = element_text(size=input$subtitle_text_size, - hjust=input$subtitle_text_hjust, - vjust=input$subtitle_text_vjust, - angle=input$subtitle_text_angle, - face = input$subtitle_face), - legend.position = input$legend_position, - legend.box = input$legend_box, - axis.title.x = element_text(size=input$x_title_size, - hjust=input$x_title_hjust, - vjust=input$x_title_vjust, - angle=input$x_title_angle, - face = input$x_title_face), - axis.title.y = element_text(size=input$y_title_size, - hjust=input$y_title_hjust, - vjust=input$y_title_vjust, - angle=input$y_title_angle, - face = input$y_title_face), - axis.text.x = element_text(size=input$x_text_size, - hjust=input$x_text_hjust, - vjust=input$x_text_vjust, - angle=input$x_text_angle, - face = input$x_text_face), - axis.text.y = element_text(size=input$y_text_size, - hjust=input$y_text_hjust, - vjust=input$y_text_vjust, - angle=input$y_text_angle, - face = input$y_text_face)) + labs( + x = check_empty_str(input$x_text), + y = check_empty_str(input$y_text), + title = check_empty_str(input$title_text), + subtitle = check_empty_str(input$subtitle_text), + fill = input$legend_text, + color = input$legend_text + ) + + guides( + col = get_legend_params(input, "col"), + fill = get_legend_params(input, "fill"), + size = get_legend_params(input, "size"), + shape = get_legend_params(input, "shape"), + linetype = get_legend_params(input, "linetype") + ) + + theme( + plot.title = element_text( + size = input$title_text_size, + hjust = input$title_text_hjust, + vjust = input$title_text_vjust, + angle = input$title_text_angle, + face = input$title_face + ), + plot.subtitle = element_text( + size = input$subtitle_text_size, + hjust = input$subtitle_text_hjust, + vjust = input$subtitle_text_vjust, + angle = input$subtitle_text_angle, + face = input$subtitle_face + ), + legend.position = input$legend_position, + legend.box = input$legend_box, + axis.title.x = element_text( + size = input$x_title_size, + hjust = input$x_title_hjust, + vjust = input$x_title_vjust, + angle = input$x_title_angle, + face = input$x_title_face + ), + axis.title.y = element_text( + size = input$y_title_size, + hjust = input$y_title_hjust, + vjust = input$y_title_vjust, + angle = input$y_title_angle, + face = input$y_title_face + ), + axis.text.x = element_text( + size = input$x_text_size, + hjust = input$x_text_hjust, + vjust = input$x_text_vjust, + angle = input$x_text_angle, + face = input$x_text_face + ), + axis.text.y = element_text( + size = input$y_text_size, + hjust = input$y_text_hjust, + vjust = input$y_text_vjust, + angle = input$y_text_angle, + face = input$y_text_face + ) + ) if (input$coord_flip) { - .plot = .plot + coord_flip() + .plot <- .plot + coord_flip() } .plot } - output$save_text = renderText({'To save the plot, press the "Save" button above or drag-n-drop - the plot to your Desktop or into any file manager (Finder, File Explorer, etc.)'}) - output$save_text2 = renderText({'Note: saving via the "Save" button will be different from the drag-n-drop method - due to R\'s peculiar properties.'}) + output$save_text <- renderText({ + 'To save the plot, press the "Save" button above or drag-n-drop + the plot to your Desktop or into any file manager (Finder, File Explorer, etc.)' + }) + output$save_text2 <- renderText({ + 'Note: saving via the "Save" button will be different from the drag-n-drop method + due to R\'s peculiar properties.' + }) - output$main_plot = renderUI({ + output$main_plot <- renderUI({ # if (input$do_interactive) { # output$main_plot_helper = renderPlotly(ggplotly(create_plot(input))) # plotlyOutput("main_plot_helper") # } else { - output$main_plot_helper = renderPlot(create_plot(input)) - plotOutput("main_plot_helper", width = input$plot_width*72, height = input$plot_height*72) + output$main_plot_helper <- renderPlot(create_plot(input)) + plotOutput("main_plot_helper", width = input$plot_width * 72, height = input$plot_height * 72) # } }) @@ -385,13 +504,13 @@ server <- function(input, output, session) { # # Save plots # - output$save_plot = downloadHandler( + output$save_plot <- downloadHandler( filename = paste0("plot shiny ", Sys.time(), ".png"), content = function(file) { ggsave(file, plot = create_plot(input), width = input$plot_width, height = input$plot_height, device = "png") } ) - } +} # Run the application shinyApp(ui = ui, server = server) diff --git a/man/add_class.Rd b/man/add_class.Rd index 97c2f6d6..b2940057 100644 --- a/man/add_class.Rd +++ b/man/add_class.Rd @@ -17,10 +17,12 @@ Input object with additional class \code{.class}. \description{ Add a new class attribute } -\examples{ +\section{Developer Examples}{ + tmp <- "abc" class(tmp) tmp <- immunarch:::add_class(tmp, "new_class") class(tmp) } + \concept{utility_private} diff --git a/man/check_distribution.Rd b/man/check_distribution.Rd index 4ca26e32..9a6d111f 100644 --- a/man/check_distribution.Rd +++ b/man/check_distribution.Rd @@ -36,9 +36,11 @@ Numeric vector. \description{ Check if the given .data is a distribution and normalise it if necessary with an optional laplace correction. } -\examples{ +\section{Developer Examples}{ + immunarch:::check_distribution(c(1, 2, 3)) immunarch:::check_distribution(c(1, 2, 3), TRUE) immunarch:::check_distribution(c(1, 2, 3), FALSE) } + \concept{utility_private} diff --git a/man/dot-quant_column_choice.Rd b/man/dot-quant_column_choice.Rd index 7eddc96d..e7a7972e 100644 --- a/man/dot-quant_column_choice.Rd +++ b/man/dot-quant_column_choice.Rd @@ -17,8 +17,10 @@ A string with the column name. \description{ Get a column's name using the input alias } -\examples{ +\section{Developer Examples}{ + immunarch:::.quant_column_choice("count") immunarch:::.quant_column_choice("freq") } + \concept{utility_private} diff --git a/man/geneUsage.Rd b/man/geneUsage.Rd index 3dbc9b55..04a53027 100644 --- a/man/geneUsage.Rd +++ b/man/geneUsage.Rd @@ -37,7 +37,15 @@ are "hs.trbv", "HomoSapiens.TRBJ" or "macmul.IGHV". For details run \code{gene_s Pass NA if you want to compute gene statistics at the clonotype level without re-weighting. Pass "count" to use the "Clones" column to weight genes by abundance of their corresponding clonotypes.} -\item{.ambig}{An option to handle ambiguous data. We recommend to turn in on by passing "inc" (turned on by default). +\item{.ambig}{An option to handle ambiguous gene assigments, e.g., "TRAV1,TRAV2". + + +- Pass "inc" to include all possible gene segments, so "TRAV1,TRAV2" is counted as a different gene segment. + +- Pass "exc" to exclude all ambiguous gene assignments, so "TRAV1,TRAV2" is excluded from the resultant gene table. + + +We recommend to turn in on by passing "inc" (turned on by default). You can exclude data for the cases where there is no clear match for gene, include it for every supplied gene, or pick only first from the set. Set it to "exc", "inc" or "maj", correspondingly.} diff --git a/man/getKmers.Rd b/man/getKmers.Rd index 5fec11cb..6f39762b 100644 --- a/man/getKmers.Rd +++ b/man/getKmers.Rd @@ -4,7 +4,7 @@ \alias{getKmers} \alias{get.kmers} \alias{makeKmerTable} -\title{Calculate the kmer tatistics of immune repertoires} +\title{Calculate the kmer statistics of immune repertoires} \usage{ getKmers(.data, .k, .col = c("aa", "nt"), .coding = TRUE) } @@ -32,7 +32,7 @@ pass "nt" for CDR3 nucleotide sequences.} Data frame with two columns (kmers and their counts). } \description{ -Calculate the kmer tatistics of immune repertoires +Calculate the kmer statistics of immune repertoires } \examples{ data(immdata) diff --git a/man/group_from_metadata.Rd b/man/group_from_metadata.Rd index 2884dc08..e9bce0a7 100644 --- a/man/group_from_metadata.Rd +++ b/man/group_from_metadata.Rd @@ -19,7 +19,9 @@ Character vector with group names. \description{ Get a character vector of samples' groups from the input metadata file } -\examples{ +\section{Developer Examples}{ + immunarch:::group_from_metadata("Status", data.frame(Status = c("A", "A", "B", "B", "C"))) } + \concept{utility_private} diff --git a/man/has_class.Rd b/man/has_class.Rd index 0e61c865..6d7a39db 100644 --- a/man/has_class.Rd +++ b/man/has_class.Rd @@ -17,10 +17,12 @@ Logical value. \description{ A function to check if an input object has a specific class name. } -\examples{ +\section{Developer Examples}{ + tmp <- "abc" immunarch:::has_class(tmp, "new_class") tmp <- immunarch:::add_class(tmp, "new_class") immunarch:::has_class(tmp, "new_class") } + \concept{utility_private} diff --git a/man/inc_overlap.Rd b/man/inc_overlap.Rd index 42f79d08..dee3f263 100644 --- a/man/inc_overlap.Rd +++ b/man/inc_overlap.Rd @@ -58,6 +58,5 @@ Like in paper https://www.pnas.org/content/111/16/5980 (Fig. 4). data(immdata) ov <- repOverlap(immdata$data, "inc+overlap", .step = 100, .verbose.inc = FALSE, .verbose = FALSE) vis(ov) - } \concept{overlap} diff --git a/man/matrixdiagcopy.Rd b/man/matrixdiagcopy.Rd index a637496f..86568927 100644 --- a/man/matrixdiagcopy.Rd +++ b/man/matrixdiagcopy.Rd @@ -15,11 +15,13 @@ Matrix with its upper tri part copied to the lower tri part. \description{ Copy the upper matrix triangle to the lower one } -\examples{ +\section{Developer Examples}{ + mat <- matrix(0, 3, 3) mat mat[1, 3] <- 1 mat <- immunarch:::matrixdiagcopy(mat) mat } + \concept{utility_private} diff --git a/man/pubRep.Rd b/man/pubRep.Rd index 6cc6fe75..4980c744 100644 --- a/man/pubRep.Rd +++ b/man/pubRep.Rd @@ -62,7 +62,7 @@ Create a repertoire of public clonotypes \examples{ # Subset the data to make the example faster to run immdata$data <- lapply(immdata$data, head, 2000) -pr <- pubRep(immdata$data, .verbose=FALSE) +pr <- pubRep(immdata$data, .verbose = FALSE) vis(pr, "clonotypes", 1, 2) } \concept{pubrep} diff --git a/man/pubRepApply.Rd b/man/pubRepApply.Rd index b9bc2763..5cfe9d5c 100644 --- a/man/pubRepApply.Rd +++ b/man/pubRepApply.Rd @@ -25,7 +25,7 @@ Work In Progress \examples{ data(immdata) immdata$data <- lapply(immdata$data, head, 2000) -pr <- pubRep(immdata$data, .verbose=FALSE) +pr <- pubRep(immdata$data, .verbose = FALSE) pr1 <- pubRepFilter(pr, immdata$meta, .by = c(Status = "MS")) pr2 <- pubRepFilter(pr, immdata$meta, .by = c(Status = "C")) prapp <- pubRepApply(pr1, pr2) diff --git a/man/pubRepFilter.Rd b/man/pubRepFilter.Rd index 8308704d..f35d9fbd 100644 --- a/man/pubRepFilter.Rd +++ b/man/pubRepFilter.Rd @@ -25,7 +25,7 @@ Filter our clonotypes with low incidence in a specific group. \examples{ data(immdata) immdata$data <- lapply(immdata$data, head, 2000) -pr <- pubRep(immdata$data, .verbose=FALSE) +pr <- pubRep(immdata$data, .verbose = FALSE) pr1 <- pubRepFilter(pr, immdata$meta, .by = c(Status = "MS")) head(pr1) } diff --git a/man/pubRepStatistics.Rd b/man/pubRepStatistics.Rd index b9f1da1a..4ce5aeb1 100644 --- a/man/pubRepStatistics.Rd +++ b/man/pubRepStatistics.Rd @@ -22,7 +22,7 @@ Statistics of number of public clonotypes for each possible combinations of repe \examples{ data(immdata) immdata$data <- lapply(immdata$data, head, 2000) -pr <- pubRep(immdata$data, .verbose=FALSE) +pr <- pubRep(immdata$data, .verbose = FALSE) pubRepStatistics(pr) \%>\% vis() } \concept{pubrep} diff --git a/man/public_matrix.Rd b/man/public_matrix.Rd index 01c3d31c..07ea842d 100644 --- a/man/public_matrix.Rd +++ b/man/public_matrix.Rd @@ -18,7 +18,7 @@ Get a matrix with public clonotype frequencies \examples{ data(immdata) immdata$data <- lapply(immdata$data, head, 2000) -pr <- pubRep(immdata$data, .verbose=FALSE) +pr <- pubRep(immdata$data, .verbose = FALSE) pr.mat <- public_matrix(pr) dim(pr.mat) head(pr.mat) diff --git a/man/repFilter.Rd b/man/repFilter.Rd new file mode 100644 index 00000000..6c7ee0fc --- /dev/null +++ b/man/repFilter.Rd @@ -0,0 +1,91 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filters.R +\name{repFilter} +\alias{repFilter} +\alias{filter_by_meta} +\alias{filter_by_repertoire} +\alias{filter_by_clonotype} +\alias{filter_table} +\alias{startswith_rows} +\alias{substring_rows} +\alias{validate_input_data} +\alias{include} +\alias{exclude} +\alias{lessthan} +\alias{morethan} +\alias{interval} +\title{Main function for data filtering} +\usage{ +repFilter( + .data, + .method = "by.clonotype", + .query = list(CDR3.aa = exclude("partial", "out_of_frame")), + .match = "exact" +) +} +\arguments{ +\item{.data}{The data to be processed. Must be the list of 2 elements: +data table and metadata table.} + +\item{.method}{Method of filtering. Implemented methods: +by.meta, by.repertoire (by.rep), by.clonotype (by.cl) +Default value: 'by.clonotype'.} + +\item{.query}{Filtering query. It's a named list of filters that will be applied +to data. +Possible values for names in this list are dependent on filter methods: +- by.meta: filter by metadata. Names in the named list are metadata column headers. +- by.repertoire: filter by number of clonotypes or total number of clones in sample. +Possible names in the named list are "n_clonotypes" and "n_clones". +- by.clonotype: filter by data in all samples. Names in the named list are +data column headers. +Elements of the named list for each of the filters are filtering options. +Possible values for filtering options: +- include("STR1", "STR2", ...): keep only rows with matching values. +Available for methods: "by.meta", "by.clonotype". +- exclude("STR1", "STR2", ...): remove rows with matching values. +Available for methods: "by.meta", "by.clonotype". +- lessthan(value): keep rows/samples with numeric values less than specified. +Available for methods: "by.meta", "by.repertoire", "by.clonotype". +- morethan(value): keep rows/samples with numeric values more than specified. +Available for methods: "by.meta", "by.repertoire", "by.clonotype". +- interval(from, to): keep rows/samples with numeric values that fits in this interval. +from is inclusive, to is exclusive. +Available for methods: "by.meta", "by.repertoire", "by.clonotype". +Default value: 'list(CDR3.aa = exclude("partial", "out_of_frame"))'.} + +\item{.match}{Matching method for "include" and "exclude" options in query. +Possible values: +- exact: match only the exact specified string; +- startswith: match all strings starting with the specified substring; +- substring: match all strings containing the specified substring. +Default value: 'exact'.} +} +\description{ +Main function for data filtering +} +\examples{ +data(immdata) + +# Select samples with status "MS" +repFilter(immdata, "by.meta", list(Status = include("MS"))) + +# Select samples without status "MS" +repFilter(immdata, "by.meta", list(Status = exclude("MS"))) + +# Select samples from lanes "A" and "B" with age > 15 +repFilter(immdata, "by.meta", list(Lane = include("A", "B"), Age = morethan(15))) + +# Select samples that are not from lanes "A" and "B" +repFilter(immdata, "by.meta", list(Lane = exclude("A", "B"))) + +# Select samples with a number of clonotypes from 1000 to 5000 +repFilter(immdata, "by.repertoire", list(n_clonotypes = interval(1000, 5000))) + +# Select clonotypes in all samples with alpha chains +repFilter(immdata, "by.clonotype", + list(V.name = include("AV"), J.name = include("AJ")), + .match = "substring" +) +} +\concept{filters} diff --git a/man/repOverlap.Rd b/man/repOverlap.Rd index 1a36b944..5cb09980 100644 --- a/man/repOverlap.Rd +++ b/man/repOverlap.Rd @@ -58,7 +58,7 @@ In the second case, the vector encodes all repertoire sampling depths.} \item{.n.steps}{Something. Skipped if ".step" is a numeric vector.} \item{.downsample}{If TRUE then perform downsampling to N clonotypes at each step instead of choosing the -top N clonotypes.} +top N clonotypes in incremental overlaps. Change nothing for conventional methods.} \item{.bootstrap}{Pass NA to turn off any bootstrapping, pass a number to perform bootstrapping with this number of tries.} @@ -67,9 +67,11 @@ top N clonotypes.} \item{.force.matrix}{Logical. If TRUE than always force the matrix output even in case of two input repertoires.} } \value{ -In most cases the return value is a matrix with overlap values. +In most cases the return value is a matrix with overlap values for each pair of repertoires. -If only two repertoires were provided, +If only two repertoires were provided, return value is single numeric value. + +If one of the incremental method is chosen, return list of overlap matrix. } \description{ The \code{repOverlap} function is designed to analyse the overlap between diff --git a/man/repSave.Rd b/man/repSave.Rd index d5d9dadc..539fb9c4 100644 --- a/man/repSave.Rd +++ b/man/repSave.Rd @@ -32,6 +32,8 @@ does not exist it will be created automatically. } \examples{ data(immdata) +# Reduce data to save time on examples +immdata$data <- purrr::map(immdata$data, ~ .x \%>\% head(10)) dirpath <- tempdir() # Save the list of repertoires repSave(immdata, dirpath) diff --git a/man/set_pb.Rd b/man/set_pb.Rd index 9d65e679..69682d05 100644 --- a/man/set_pb.Rd +++ b/man/set_pb.Rd @@ -22,7 +22,8 @@ An updated progress bar. \description{ Set and update progress bars } -\examples{ +\section{Developer Examples}{ + pb <- immunarch:::set_pb(100) immunarch:::add_pb(pb, 25) immunarch:::add_pb(pb, 25) @@ -30,4 +31,5 @@ immunarch:::add_pb(pb, 25) immunarch:::add_pb(pb, 25) close(pb) } + \concept{utility_private} diff --git a/man/spectratype.Rd b/man/spectratype.Rd index cc6b071e..bdd6339f 100644 --- a/man/spectratype.Rd +++ b/man/spectratype.Rd @@ -38,7 +38,7 @@ Immune repertoire spectratyping \examples{ # Load the data data(immdata) -sp <- spectratype(immdata$data[[1]], .col="aa+v") +sp <- spectratype(immdata$data[[1]], .col = "aa+v") vis(sp) } \concept{vis} diff --git a/man/switch_type.Rd b/man/switch_type.Rd index 2d6d4f2e..ffb8ed9f 100644 --- a/man/switch_type.Rd +++ b/man/switch_type.Rd @@ -26,8 +26,10 @@ A column's name. \description{ Return a column's name } -\examples{ +\section{Developer Examples}{ + immunarch:::switch_type("nuc") immunarch:::switch_type("v") } + \concept{utility_private} diff --git a/man/vis.immunr_clonal_prop.Rd b/man/vis.immunr_clonal_prop.Rd index e098e896..732350ee 100644 --- a/man/vis.immunr_clonal_prop.Rd +++ b/man/vis.immunr_clonal_prop.Rd @@ -69,6 +69,10 @@ You can execute the command \code{?p.adjust} in the R console to see more. data(immdata) clp <- repClonality(immdata$data, "clonal.prop") vis(clp) + +hom <- repClonality(immdata$data, "homeo") +# Remove p values and points from the plot +vis(hom, .by = "Status", .meta = immdata$meta, .test = FALSE, .points = FALSE) } \seealso{ \link{repClonality} \link{vis} diff --git a/man/vis_heatmap2.Rd b/man/vis_heatmap2.Rd index 595dcbbf..4975ed8f 100644 --- a/man/vis_heatmap2.Rd +++ b/man/vis_heatmap2.Rd @@ -6,8 +6,9 @@ \usage{ vis_heatmap2( .data, + .meta = NA, + .by = NA, .title = NA, - .labs = NA, .color = colorRampPalette(c("#67001f", "#d6604d", "#f7f7f7", "#4393c3", "#053061"))(1024), ... @@ -16,9 +17,12 @@ vis_heatmap2( \arguments{ \item{.data}{Input matrix. Column names and row names (if presented) will be used as names for labs.} -\item{.title}{The text for the plot's title (same as the "main" argument in \link[pheatmap]{pheatmap}).} +\item{.meta}{A metadata object. An R dataframe with sample names and their properties, +such as age, serostatus or hla.} + +\item{.by}{Pass NA if you want to plot samples without grouping.} -\item{.labs}{A character vector of length two with names for x-axis and y-axis, respectively.} +\item{.title}{The text for the plot's title (same as the "main" argument in \link[pheatmap]{pheatmap}).} \item{.color}{A vector specifying the colors (same as the "color" argument in \link[pheatmap]{pheatmap}). Pass NA to use the default pheatmap colors.} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 10f1c204..703bd91b 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -5,6 +5,11 @@ using namespace Rcpp; +#ifdef RCPP_USE_GLOBAL_ROSTREAM +Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); +Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); +#endif + // fill_vec NumericVector fill_vec(NumericVector read_vec, NumericVector read_indices); RcppExport SEXP _immunarch_fill_vec(SEXP read_vecSEXP, SEXP read_indicesSEXP) { diff --git a/tests/testthat/helper-preload.R b/tests/testthat/helper-preload.R index f4636d7d..7430608d 100644 --- a/tests/testthat/helper-preload.R +++ b/tests/testthat/helper-preload.R @@ -19,6 +19,22 @@ ap <- function(.name, .prefix) { paste0(.prefix, .name) } +add_mock_sample <- function(.immdata, .sample_name, .meta = list(), .empty = FALSE) { + if (.empty) { + # copy only column headers + .immdata$data[[.sample_name]] <- .immdata$data[[1]][0, ] + } else { + # copy dataframe of 1st sample to the new sample + .immdata$data[[.sample_name]] <- .immdata$data[[1]] + } + + # .meta must be a named list containing metadata row (full or partial) for the sample + .meta[["Sample"]] <- .sample_name + .immdata$meta %<>% bind_rows(as.data.frame(.meta)) + + return(.immdata) +} + data(immdata) frame_data <- immdata$data table_data <- lapply(frame_data, as.data.table) diff --git a/tests/testthat/test-filter.R b/tests/testthat/test-filter.R new file mode 100644 index 00000000..62e280e5 --- /dev/null +++ b/tests/testthat/test-filter.R @@ -0,0 +1,285 @@ +test_cases <- list() + +data <- immdata$data +meta <- immdata$meta +original_samples_count <- nrow(meta) + +prepare_immdata <- function() { + return(list(data = data, meta = meta)) +} + +test_cases[[length(test_cases) + 1]] <- list( + data_factory = function() { + prepare_immdata() %>% + add_mock_sample("S1", list(Status = "N")) + }, + method = "by.meta", query = list(Status = include("N")), + expected_samples = 1 +) + +test_cases[[length(test_cases) + 1]] <- list( + data_factory = function() { + prepare_immdata() %>% + add_mock_sample("S1", list(Lane = "D")) + }, + method = "by.meta", query = list(Lane = exclude("D")), + expected_samples = original_samples_count +) + +test_cases[[length(test_cases) + 1]] <- list( + data_factory = function() { + prepare_immdata() %>% + add_mock_sample("S1", list(Age = 1)) %>% + add_mock_sample("S2", list(Age = 2)) + }, + method = "by.meta", query = list(Age = lessthan(5)), + expected_samples = 2 +) + +test_cases[[length(test_cases) + 1]] <- list( + data_factory = function() { + prepare_immdata() %>% + add_mock_sample("S1", list(Age = 95)) %>% + add_mock_sample("S2", list(Age = 99)) + }, + method = "by.meta", query = list(Age = interval(95, 100)), + expected_samples = 2 +) + +test_cases[[length(test_cases) + 1]] <- list( + data_factory = function() { + prepare_immdata() %>% + add_mock_sample("S1", list(Age = 100)) + }, + method = "by.meta", query = list(Age = interval(95, 100)), + expected_samples = 0, + expect_warnings = TRUE +) + +test_cases[[length(test_cases) + 1]] <- list( + data_factory = function() { + prepare_immdata() %>% + add_mock_sample("S1", list(Lane = "D")) %>% + add_mock_sample("S2", list(Lane = "E")) + }, + method = "by.meta", query = list(Lane = include("D", "E")), + expected_samples = 2 +) + +test_cases[[length(test_cases) + 1]] <- list( + data_factory = function() { + prepare_immdata() %>% + add_mock_sample("S1", list(Lane = "D")) %>% + add_mock_sample("S2", list(Lane = "E")) + }, + method = "by.meta", query = list(Lane = exclude("D", "E")), + expected_samples = original_samples_count +) + +test_cases[[length(test_cases) + 1]] <- list( + data_factory = function() { + prepare_immdata() %>% + add_mock_sample("S1", list(Lane = "D", Age = 95)) %>% + add_mock_sample("S2", list(Lane = "E", Age = 96)) + }, + method = "by.meta", query = list(Lane = include("D", "E"), Age = morethan(95)), + expected_samples = 1 +) + +test_cases[[length(test_cases) + 1]] <- list( + data_factory = function() { + immdata <- prepare_immdata() %>% + add_mock_sample("S1") + immdata$data[["S1"]] %<>% rbind(immdata$data[["S1"]][rep(1, 10000), ]) + return(immdata) + }, + method = "by.repertoire", query = list(n_clonotypes = morethan(10000)), + expected_samples = 1 +) + +test_cases[[length(test_cases) + 1]] <- list( + data_factory = function() { + immdata <- prepare_immdata() + immdata %<>% add_mock_sample("S1") + immdata$data[["S1"]] <- immdata$data[["S1"]][1, ] + immdata$data[["S1"]][["Clones"]] <- 50 + immdata %<>% add_mock_sample("S2") + immdata$data[["S2"]] <- immdata$data[["S2"]][1:2, ] + immdata$data[["S2"]][1, ][["Clones"]] <- 50 + immdata$data[["S2"]][2, ][["Clones"]] <- 50 + return(immdata) + }, + method = "by.repertoire", query = list(n_clones = lessthan(100)), + expected_samples = 1 +) + +# repeat the last 2 test cases, but abbreviate method as "by.rep" +for (i in 1:2) { + test_cases[[length(test_cases) + 1]] <- test_cases[[length(test_cases) - 1]] + test_cases[[length(test_cases)]][["method"]] <- "by.rep" +} + +test_cases[[length(test_cases) + 1]] <- list( + data_factory = function() { + immdata <- prepare_immdata() %>% + add_mock_sample("S1", .empty = TRUE) + immdata$data[["S1"]] %<>% + bind_rows(as.data.frame(list(CDR3.aa = "partial"))) %>% + bind_rows(as.data.frame(list(CDR3.aa = "out_of_frame"))) %>% + bind_rows(as.data.frame(list(CDR3.aa = "other"))) + return(immdata) + }, + method = "by.clonotype", query = list(CDR3.aa = exclude("partial", "out_of_frame")), + expected_samples = original_samples_count + 1, + expected_sample_rows = list(S1 = 1) +) + +test_cases[[length(test_cases) + 1]] <- list( + data_factory = function() { + immdata <- prepare_immdata() %>% + add_mock_sample("S1", .empty = TRUE) + immdata$data[["S1"]] %<>% + bind_rows(as.data.frame(list(Clones = 1000))) %>% + bind_rows(as.data.frame(list(Clones = 1500))) %>% + bind_rows(as.data.frame(list(Clones = 2000))) + return(immdata) + }, + method = "by.clonotype", query = list(Clones = interval(1000, 2000)), + expected_samples = 1, + expected_sample_rows = list(S1 = 2), + expect_warnings = TRUE +) + +mock_genes <- function() { + immdata <- prepare_immdata() %>% + add_mock_sample("S1", .empty = TRUE) + # delete all other samples + immdata$data <- immdata$data[names(immdata$data) == "S1"] + immdata$meta %<>% filter(Sample == "S1") + + immdata$data[["S1"]] %<>% + bind_rows(as.data.frame(list(V.name = "TRBV1"))) %>% + bind_rows(as.data.frame(list(V.name = "TRAV1"))) %>% + bind_rows(as.data.frame(list(V.name = "TRAV2"))) %>% + bind_rows(as.data.frame(list(V.name = "TRAV11"))) + return(immdata) +} + +test_cases[[length(test_cases) + 1]] <- list( + data_factory = mock_genes, + method = "by.clonotype", query = list(V.name = exclude("TRBV1", "TRAV1")), + expected_samples = 1, + expected_sample_rows = list(S1 = 2) +) + +test_cases[[length(test_cases) + 1]] <- list( + data_factory = mock_genes, + method = "by.clonotype", query = list(V.name = exclude("TRBV1", "TRAV1")), match = "exact", + expected_samples = 1, + expected_sample_rows = list(S1 = 2) +) + +test_cases[[length(test_cases) + 1]] <- list( + data_factory = mock_genes, + method = "by.clonotype", query = list(V.name = exclude("TRBV1", "TRAV1")), match = "startswith", + expected_samples = 1, + expected_sample_rows = list(S1 = 1) +) + +test_cases[[length(test_cases) + 1]] <- list( + data_factory = mock_genes, + method = "by.clonotype", query = list(V.name = include("TRBV1", "TRAV1")), match = "startswith", + expected_samples = 1, + expected_sample_rows = list(S1 = 3) +) + +test_cases[[length(test_cases) + 1]] <- list( + data_factory = mock_genes, + method = "by.clonotype", query = list(V.name = exclude("V1")), match = "substring", + expected_samples = 1, + expected_sample_rows = list(S1 = 1) +) + +test_cases[[length(test_cases) + 1]] <- list( + data_factory = mock_genes, + method = "by.clonotype", query = list(V.name = include("V1")), match = "substring", + expected_samples = 1, + expected_sample_rows = list(S1 = 3) +) + +# repeat the last 8 test cases, but abbreviate method as "by.cl" +for (i in 1:8) { + test_cases[[length(test_cases) + 1]] <- test_cases[[length(test_cases) - 7]] + test_cases[[length(test_cases)]][["method"]] <- "by.cl" +} + +test_cases[[length(test_cases) + 1]] <- list( + data_factory = function() { + immdata <- prepare_immdata() %>% + add_mock_sample("S1", .empty = TRUE) + immdata$data <- immdata$data[names(immdata$data) == "S1"] + immdata$meta %<>% filter(Sample == "S1") + immdata$data[["S1"]] %<>% + bind_rows(as.data.frame(list(V.name = "TRBV1", J.name = "TRAJ1"))) %>% + bind_rows(as.data.frame(list(V.name = "TRAV1", J.name = "TRAJ11"))) %>% + bind_rows(as.data.frame(list(V.name = "TRAV2", J.name = "TRAJ1"))) %>% + bind_rows(as.data.frame(list(V.name = "TRAV11", J.name = "TRBJ2"))) + return(immdata) + }, + method = "by.clonotype", + query = list(V.name = include("AV1"), J.name = include("AJ")), + match = "substring", + expected_samples = 1, + expected_sample_rows = list(S1 = 1) +) + +for (i in seq_along(test_cases)) { + # Arrange + data_factory <- test_cases[[i]][["data_factory"]] + method <- test_cases[[i]][["method"]] + query <- test_cases[[i]][["query"]] + match <- test_cases[[i]][["match"]] + expected_samples <- test_cases[[i]][["expected_samples"]] + expected_sample_rows <- test_cases[[i]][["expected_sample_rows"]] + # expected_sample_rows is named list, contains sample names and expected rows; + # if not specified, don't check sample rows + if (is.null(expected_sample_rows)) { + expected_sample_rows <- list() + } + expect_warnings <- test_cases[[i]][["expect_warnings"]] + if (is.null(expect_warnings)) { + options(warn = 0) + } else { + options(warn = -1) + } + + test_name <- paste0("method:", method, ".case:", i) + immdata <- data_factory() + frame_with_meta <- immdata + table_with_meta <- list(data = lapply(immdata$data, as.data.table), meta = immdata$meta) + + # Act + if (is.null(match)) { + compute_res <- apply_DF_DT(frame_with_meta, table_with_meta, + repFilter, + .method = method, .query = query + ) + } else { + compute_res <- apply_DF_DT(frame_with_meta, table_with_meta, + repFilter, + .method = method, .query = query, .match = match + ) + } + + # Assert + test_that(ap(test_name, "compute_"), { + expect_equal(compute_res[[1]]$data %>% length(), expected_samples) + expect_equal(compute_res[[1]]$meta %>% nrow(), expected_samples) + for (j in seq_along(expected_sample_rows)) { + sample_name <- names(expected_sample_rows)[[j]] + expected_rows <- expected_sample_rows[[j]] + expect_equal(compute_res[[1]]$data[[sample_name]] %>% nrow(), expected_rows) + } + expect_equal(lapply(compute_res[[1]]$data, as.data.table), compute_res[[2]]$data) + }) +} diff --git a/vignettes/v1_introduction.Rmd b/vignettes/v1_introduction.Rmd index 016feff4..8e4508e6 100644 --- a/vignettes/v1_introduction.Rmd +++ b/vignettes/v1_introduction.Rmd @@ -1,6 +1,6 @@ --- title: "Introduction to `immunarch`" -author: 'ImmunoMind' +author: 'ImmunoMind – improving design of T-cell therapies using multi-omics and AI. Research and biopharma partnerships, more details: immunomind.io' date: "support@immunomind.io" output: html_document: @@ -23,7 +23,7 @@ output: # Introduction -`immunarch` is an R package designed to analyse T-cell receptor (TCR) and B-cell receptor (BCR) repertoires, aimed at medical scientists and bioinformaticians. The mission of `immunarch` is to make immune sequencing data analysis as effortless as possible and help you focus on research instead of coding. Follow us on [Twitter](http://twitter.com/immunomind) for news and updates. +`immunarch` is an R package designed to analyse T-cell receptor (TCR) and B-cell receptor (BCR) repertoires, aimed at medical scientists and bioinformaticians. The mission of `immunarch` is to make immune sequencing data analysis as effortless as possible and help you focus on research instead of coding. Follow us on [Twitter](https://twitter.com/immunomind) for news and updates. ## Installation @@ -35,7 +35,7 @@ In order to install `immunarch` execute the following command: install.packages("immunarch") ``` -That's it, you can start using `immunarch` now! See the [Quick Start](#quick-start) section below to dive into immune repertoire data analysis. If you run in any trouble with installation, take a look at the [Installation Troubleshooting](http://immunarch.com/articles/v1_introduction.html#installation-troubleshooting) section. +That's it, you can start using `immunarch` now! See the [Quick Start](#quick-start) section below to dive into immune repertoire data analysis. If you run in any trouble with installation, take a look at the [Installation Troubleshooting](https://immunarch.com/articles/v1_introduction.html#installation-troubleshooting) section. Note: there are quite a lot of dependencies to install with the package because it installs all the widely-used packages for data analysis and visualisation. You got both the AIRR data analysis framework and the full Data Science package ecosystem with only one command, making `immunarch` the entry-point for single-cell & immune repertoire Data Science. @@ -57,7 +57,7 @@ install.packages("devtools") # skip this if you already installed devtools devtools::install_github("immunomind/immunarch", ref="dev") ``` -You can find the list of releases of `immunarch` here: http://github.com/immunomind/immunarch/releases +You can find the list of releases of `immunarch` here: https://github.com/immunomind/immunarch/releases ## Quick start @@ -104,7 +104,7 @@ If you can not install `devtools`, check sections 1 and 2 below. If you run in any other trouble, try the following steps: -1. Check your R version. Run `version` command in the console to get your R versions. If the R version is below 3.5.0 (for example, `R version 3.1.0`), try updating your R version to the latest one. Check this [this link](https://cran.r-project.org/bin/linux/ubuntu/README.html) if you are on Ubuntu. Note: if you try to install a package after the update and it still fails with the following message: +1. Check your R version. Run `version` command in the console to get your R versions. If the R version is below 3.5.0 (for example, `R version 3.1.0`), try updating your R version to the latest one. Note: if you try to install a package after the update and it still fails with the following message: ``` ERROR: dependencies ‘httr’, ‘usethis’ are not available for package ‘devtools’ @@ -127,11 +127,11 @@ Error: package ‘dtplyr’ 0.0.3 was found, but >= 1.0.0 is required by ‘immu Execution halted ERROR: lazy loading failed for package ‘immunarch’ ``` - + ``` byte-compile and prepare package for lazy loading -Error in loadNamespace(j <- i[[1L]], c(lib.loc, .libPaths()), versionCheck = vI[[j]]) : +Error in loadNamespace(j <- i[[1L]], c(lib.loc, .libPaths()), versionCheck = vI[[j]]) : namespace 'ggalluvial' 0.9.1 is being loaded, but >= 0.10.0 is required Calls: ... namespaceImportFrom -> asNamespace -> loadNamespace Execution halted @@ -150,7 +150,7 @@ Execution halted ``` Open Terminal, execute the following command and try again to install `immunarch`: - + ``` sudo installer -pkg /Library/Developer/CommandLineTools/Packages/macOS_SDK_headers_for_macOS_10.14.pkg -target / ``` @@ -163,23 +163,23 @@ Error: package or namespace load failed for 'igraph' in dyn.load(file, DLLpath = unable to load shared object '/usr/local/lib/R/site-library/00LOCK-igraph/00new/igraph/libs/igraph.so': libgfortran.so.4: cannot open shared object file: No such file or directory ``` - - See [this link](http://ashokragavendran.wordpress.com/2017/10/24/error-installing-rigraph-unable-to-load-shared-object-igraph-so-libgfortran-so-4-cannot-open-shared-object-file-no-such-file-or-directory/) for help. + + See [this link](https://ashokragavendran.wordpress.com/2017/10/24/error-installing-rigraph-unable-to-load-shared-object-igraph-so-libgfortran-so-4-cannot-open-shared-object-file-no-such-file-or-directory/) for help. 7. For Linux users. If you have issues with the `rgl` package: - + ``` -configure: error: missing required header GL/gl.h +configure: error: missing required header GL/gl.h ERROR: configuration failed for package ‘rgl’ ``` - + Install "mesa-common-dev" via OS terminal by executing the following command: - + ``` apt-get install mesa-common-dev ``` - - Check [this link](http://stackoverflow.com/questions/15292905/how-to-solve-the-error-missing-required-header-gl-gl-h-while-installing-the-p) for more information and other possible workarounds. + + Check [this link](https://stackoverflow.com/questions/15292905/how-to-solve-the-error-missing-required-header-gl-gl-h-while-installing-the-p) for more information and other possible workarounds. 7. If you have error messages with `rlang` in them such as: @@ -190,7 +190,7 @@ apt-get install mesa-common-dev ``` Remove the `rlang` package and install it again. This error is often happens after updating R to a newer version, while `rlang` not being properly updated. - + 8. If you have error messages like the following (note the `(converted from warning)` part): ``` @@ -201,21 +201,21 @@ ERROR: lazy loading failed for package 'immunarch' ``` Execute the following command in R and try again to install the package: - + ``` Sys.setenv(R_REMOTES_NO_ERRORS_FROM_WARNINGS="true") ``` - -9. For Windows users. If you have issues with the package installation, or if you want to change the folder for R packages, feel free to check [this forum post](http://community.rstudio.com/t/help-regarding-package-installation-renviron-rprofile-r-libs-r-libs-site-and-r-libs-user-oh-my/13888/8). -9. For Windows users. Make sure to install [Rtools](https://cran.r-project.org/bin/windows/Rtools/). Before installation close RStudio, install Rtools and re-open it afterwards. To check if Rtools installed correctly, run the `devtools::find_rtools()` command (after installing the devtools package). If you have an error, check [this link](http://github.com/r-lib/devtools/issues/1941) for help. +9. For Windows users. If you have issues with the package installation, or if you want to change the folder for R packages, feel free to check [this forum post](https://community.rstudio.com/t/help-regarding-package-installation-renviron-rprofile-r-libs-r-libs-site-and-r-libs-user-oh-my/13888/8). + +9. For Windows users. Make sure to install [Rtools](https://cran.r-project.org/bin/windows/Rtools/). Before installation close RStudio, install Rtools and re-open it afterwards. To check if Rtools installed correctly, run the `devtools::find_rtools()` command (after installing the devtools package). If you have an error, check [this link](https://github.com/r-lib/devtools/issues/1941) for help. 9. If you can not install dependencies for `immunarch`, please try manual installation of all dependencies by executing the following command in R console: - + ``` install.packages(c("rematch", "prettyunits", "forcats", "cellranger", "progress", "zip", "backports", "ellipsis", "zeallot", "SparseM", "MatrixModels", "sp", "haven", "curl", "readxl", "openxlsx", "minqa", "nloptr", "RcppEigen", "utf8", "vctrs", "carData", "pbkrtest", "quantreg", "maptools", "rio", "lme4", "labeling", "munsell", "cli", "fansi", "pillar", "viridis", "car", "ellipse", "flashClust", "leaps", "scatterplot3d", "modeltools", "DEoptimR", "digest", "gtable", "lazyeval", "rlang", "scales", "tibble", "viridisLite", "withr", "assertthat", "glue", "magrittr", "pkgconfig", "R6", "tidyselect", "BH", "plogr", "purrr", "ggsci", "cowplot", "ggsignif", "polynom", "fastcluster", "plyr", "abind", "dendextend", "FactoMineR", "mclust", "flexmix", "prabclus", "diptest", "robustbase", "kernlab", "GlobalOptions", "shape", "colorspace", "stringi", "hms", "clipr", "crayon", "httpuv", "mime", "jsonlite", "xtable", "htmltools", "sourcetools", "later", "promises", "gridBase", "RColorBrewer", "yaml", "ggplot2", "dplyr", "dtplyr", "dbplyr", "data.table", "gridExtra", "ggpubr", "pheatma3", "ggrepel", "reshape2", "DBI", "factoextra", "fpc", "circlize", "tidyr", "Rtsne", "readr", "readxl", "shiny", "shinythemes", "treemap", "igraph", "airr", "ggseqlogo", "UpSetR", "stringr", "ggalluvial", "Rcpp")) ``` - + 9. If you encounter the following error while running the `devtools::install_local` function: ``` @@ -229,4 +229,4 @@ install.packages(c("rematch", "prettyunits", "forcats", "cellranger", "progress" Check your path to the downloaded package archive file. It should not be "path/to/your/folder/with/immunarch.tar.gz", but a path on your PC to the downloaded file, e.g., "C:/Users/UserName/Downloads/immunarch.tar.gz" or "/Users/UserName/Downloads/immunarch.tar.gz". -9. If troubles still persist, let us know via [GitHub](http://github.com/immunomind/immunarch/issues) (preferably) or [support@immunomind.io](mailto:support@immunomind.io) (in case of private data). +9. If troubles still persist, let us know via [GitHub](https://github.com/immunomind/immunarch/issues) (preferably) or [support@immunomind.io](mailto:support@immunomind.io) (in case of private data). diff --git a/vignettes/v2_data.Rmd b/vignettes/v2_data.Rmd index a06f41e9..6b3f281c 100644 --- a/vignettes/v2_data.Rmd +++ b/vignettes/v2_data.Rmd @@ -1,6 +1,6 @@ --- -title: 'Working with data in `immunarch`' -author: 'ImmunoMind' +title: "Data loading" +author: 'ImmunoMind – improving design of T-cell therapies using multi-omics and AI. Research and biopharma partnerships, more details: immunomind.io' date: "support@immunomind.io" output: html_document: @@ -38,39 +38,6 @@ library(immunarch) data(immdata) ``` -# Immunarch data format -`immunarch` comes with its own data format, including tab-delimited columns that can be specified as follows: - - - "Clones" - count or number of barcodes (events, UMIs) or reads; - - - "Proportion" - proportion of barcodes (events, UMIs) or reads; - - - "CDR3.nt" - CDR3 nucleotide sequence; - - - "CDR3.aa" - CDR3 amino acid sequence; - - - "V.name" - names of aligned Variable gene segments; - - - "D.name" - names of aligned Diversity gene segments or NA; - - - "J.name" - names of aligned Joining gene segments; - - - "V.end" - last positions of aligned V gene segments (1-based); - - - "D.start" - positions of D'5 end of aligned D gene segments (1-based); - - - "D.end" - positions of D'3 end of aligned D gene segments (1-based); - - - "J.start" - first positions of aligned J gene segments (1-based); - - - "VJ.ins" - number of inserted nucleotides (N-nucleotides) at V-J junction (-1 for receptors with VDJ recombination); - - - "VD.ins" - number of inserted nucleotides (N-nucleotides) at V-D junction (-1 for receptors with VJ recombination); - - - "DJ.ins" - number of inserted nucleotides (N-nucleotides) at D-J junction (-1 for receptors with VJ recombination); - - - "Sequence" - full nucleotide sequence. - # Input / output The package provides several IO functions: @@ -83,7 +50,7 @@ The package provides several IO functions: - `"immunarch"` - current software tool, in case you forgot it :) -- `"immunoseq"` - http://www.adaptivebiotech.com/immunoseq +- `"immunoseq"` - https://www.immunoseq.com - `"mitcr"` - https://github.com/milaboratory/mitcr @@ -103,9 +70,7 @@ The package provides several IO functions: - `"10x"` - https://support.10xgenomics.com/single-cell-vdj/software/pipelines/latest/output/annotation -- `"archer"` - ArcherDX clonotype tables. https://archerdx.com/immunology/ - -These parsers will be available soon. +- `"archer"` - ArcherDX clonotype tables. - `"imseq"` - http://www.imtools.org/ @@ -264,6 +229,40 @@ ds = repSample(immdata$data, "sample", .n = 10) sapply(ds, nrow) ``` + +# Immunarch data format +`immunarch` comes with its own data format, including tab-delimited columns that can be specified as follows: + + - "Clones" - count or number of barcodes (events, UMIs) or reads; + + - "Proportion" - proportion of barcodes (events, UMIs) or reads; + + - "CDR3.nt" - CDR3 nucleotide sequence; + + - "CDR3.aa" - CDR3 amino acid sequence; + + - "V.name" - names of aligned Variable gene segments; + + - "D.name" - names of aligned Diversity gene segments or NA; + + - "J.name" - names of aligned Joining gene segments; + + - "V.end" - last positions of aligned V gene segments (1-based); + + - "D.start" - positions of D'5 end of aligned D gene segments (1-based); + + - "D.end" - positions of D'3 end of aligned D gene segments (1-based); + + - "J.start" - first positions of aligned J gene segments (1-based); + + - "VJ.ins" - number of inserted nucleotides (N-nucleotides) at V-J junction (-1 for receptors with VDJ recombination); + + - "VD.ins" - number of inserted nucleotides (N-nucleotides) at V-D junction (-1 for receptors with VJ recombination); + + - "DJ.ins" - number of inserted nucleotides (N-nucleotides) at D-J junction (-1 for receptors with VJ recombination); + + - "Sequence" - full nucleotide sequence. + diff --git a/vignettes/web_only/community.Rmd b/vignettes/web_only/community.Rmd new file mode 100644 index 00000000..2a5751a0 --- /dev/null +++ b/vignettes/web_only/community.Rmd @@ -0,0 +1,28 @@ +--- +title: "ImmunoMinds Community" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +## R Markdown + +This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see . + +When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this: + +```{r cars} +summary(cars) +``` + +## Including Plots + +You can also embed plots, for example: + +```{r pressure, echo=FALSE} +plot(pressure) +``` + +Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot. diff --git a/vignettes/web_only/load_10x.Rmd b/vignettes/web_only/load_10x.Rmd new file mode 100644 index 00000000..6761833b --- /dev/null +++ b/vignettes/web_only/load_10x.Rmd @@ -0,0 +1,135 @@ +--- +title: "Loading 10x Genomics Data" +author: 'ImmunoMind – improving design of T-cell therapies using multi-omics and AI. Research and biopharma partnerships, more details: immunomind.io' +date: "support@immunomind.io" +output: + html_document: + fig_height: 8 + fig_width: 10 + theme: spacelab + toc: yes + pdf_document: + toc: yes + word_document: + toc: yes +--- + + + +# Intro to 10x Genomics +10x Genomics has various pipelines for single cell and spatial views of biological systems, including single cell immune profiling. +The 10x Genomics Chromium Single Cell Immune Profiling Solution enables simultaneous analysis of the following: + +- V(D)J transcripts and clonotypes for T and B cells. +- 5' gene expression. +- Cell surface proteins/antigen specificity (feature barcodes) at single-cell resolution for the same set of cells. + +Their end-to-end pipeline also includes the Cell Ranger software, which include the following pipelines for Immune profiling analysis: + + `cellranger mkfastq` demultiplexes raw base call (BCL) files generated by Illumina sequencers into FASTQ files. It is a wrapper around Illumina's bcl2fastq, with additional useful features that are specific to 10x libraries and a simplified sample sheet format. + + `cellranger vdj` takes FASTQ files from cellranger mkfastq for V(D)J libraries and performs sequence assembly and paired clonotype calling. It uses the Chromium cellular barcodes and UMIs to assemble V(D)J transcripts per cell. Clonotypes and CDR3 sequences are output as a .vloupe file which can be loaded into Loupe V(D)J Browser. + + `cellranger count` takes FASTQ files for 5' Gene Expression and/or Feature Barcode (cell surface protein or antigen) libraries and performs alignment, filtering, barcode counting, and UMI counting. It uses the Chromium cellular barcodes to generate feature-barcode matrices, determine clusters, and perform gene expression analysis. The cellranger count pipeline outputs a .cloupe file which can be loaded into Loupe Browser for interactive visualization, clustering, and differential expression analysis. + + +# Cell Ranger Data Pipeline +Follow the instructions here to use Cellranger on your data. Note that Cellranger is only supported by Linux operating systems currently. The `cellranger count` and `cellranger vdj` methods will be particularly useful. + +You can find sample data of full runs to download here. In this tutorial, I used this single cell mouse data. If you are only using `immunarch` you can download only the .csv files. + +# Prepare 10x Data +Upon processing the data, you will have a lot of output files. You should use the `filtered contigs` csv files because they contain barcode information. + +```{r, eval=F} +. +├── vdj_v1_mm_c57bl6_pbmc_t_filtered_contig_annotations.csv <-- This contains the count data we want! +├── vdj_v1_mm_c57bl6_pbmc_t_consensus_annotations.csv +├── vdj_v1_mm_c57bl6_pbmc_t_clonotypes.csv +├── vdj_v1_mm_c57bl6_pbmc_t_all_contig_annotations.csv +├── vdj_v1_mm_c57bl6_pbmc_t_matrix.h5 +├── vdj_v1_mm_c57bl6_pbmc_t_bam.bam.bai +├── vdj_v1_mm_c57bl6_pbmc_t_molecule_info.h5 +├── vdj_v1_mm_c57bl6_pbmc_t_raw_feature_bc_matrix.tar.gz +├── vdj_v1_mm_c57bl6_pbmc_t_analysis.tar.gz + +``` + +# Load into Immunarch + +Run the code below in your R environment to load the data into Immunarch's format. You can run it on the entire folder with the Cellranger output files. `repLoad` will ignore the file formats that are unsupported. + +```{r, eval=F} +# 1.1) Load the package into R: +> library(immunarch) + +# 1.2) Replace with the path to your processed 10x data or to the clonotypes file +> file_path = "~/path/to/your/cellranger/data/" + +# 1.3) Load 10x data with repLoad +> immdata_10x <- repLoad(file_path) + +== Step 1/3: loading repertoire files... == + +Processing "/filepath/C57BL_mice_igenrichment" ... + -- Parsing "/filepath/vdj_v1_mm_c57bl6_pbmc_t_all_contig_annotations.csv" -- 10x (filt.contigs) + [!] Removed 2917 clonotypes with no nucleotide and amino acid CDR3 sequence. + -- Parsing "/filepath/vdj_v1_mm_c57bl6_pbmc_t_clonotypes.csv" -- unsupported format, skipping + -- Parsing "/filepath/vdj_v1_mm_c57bl6_pbmc_t_consensus_annotations.csv" -- 10x (consensus) + -- Parsing "/filepath/vdj_v1_mm_c57bl6_pbmc_t_filtered_contig_annotations.csv" -- 10x (filt.contigs) + [!] Removed 1198 clonotypes with no nucleotide and amino acid CDR3 sequence. + +== Step 2/3: checking metadata files and merging... == + +Processing "" ... + -- Metadata file not found; creating a dummy metadata... + +== Step 3/3: splitting data by barcodes and chains... == + +Done! +``` + +Now let's take a look at the data! Your output should look something like below. + +```{r, eval=F} +> immdata_10x +$data$vdj_v1_mm_c57bl6_splenocytes_t_consensus_annotations_TRA +# A tibble: 710 x 17 + Clones Proportion CDR3.nt CDR3.aa V.name D.name J.name V.end D.start D.end J.start VJ.ins VD.ins DJ.ins chain ClonotypeID ConsensusID + + 1 55 0.00414 TGTGCTATGGC… CAMATGG… TRAV13… None TRAJ56 NA NA NA NA NA NA NA TRA clonotype306 clonotype30… + 2 55 0.00414 TGTGCAGCTAG… CAASGNT… TRAV7-4 None TRAJ27 NA NA NA NA NA NA NA TRA clonotype338 clonotype33… + 3 53 0.00399 TGTGCAGCAAG… CAARDSG… TRAV14… None TRAJ11 NA NA NA NA NA NA NA TRA clonotype617 clonotype61… + 4 45 0.00339 TGCGCAGTCAG… CAVSNNT… TRAV3-3 None TRAJ27 NA NA NA NA NA NA NA TRA clonotype435 clonotype43… + 5 43 0.00324 TGTGCAGTCAG… CAVSNMG… TRAV7D… None TRAJ9 NA NA NA NA NA NA NA TRA clonotype401 clonotype40… + 6 42 0.00316 TGTGCAGCAAG… CAASPNY… TRAV14… None TRAJ21 NA NA NA NA NA NA NA TRA clonotype5 clonotype5_… + 7 37 0.00279 TGTGCAGTGAG… CAVSSGG… TRAV7D… None TRAJ6 NA NA NA NA NA NA NA TRA clonotype453 clonotype45… + 8 35 0.00264 TGTGCAGCAAG… CAASATS… TRAV14… None TRAJ22 NA NA NA NA NA NA NA TRA clonotype809 clonotype80… + 9 32 0.00241 TGTGCAGCAAG… CAASPNY… TRAV14… None TRAJ21 NA NA NA NA NA NA NA TRA clonotype150 clonotype15… +10 32 0.00241 TGTGCTCTGGG… CALGDEA… TRAV6-… None TRAJ30 NA NA NA NA NA NA NA TRA clonotype393 clonotype39… +# … with 700 more rows + +$meta + Sample Chain Source +1 vdj_v1_mm_c57bl6_splenocytes_t_all_contig_annotations_Multi Multi vdj_v1_mm_c57bl6_splenocytes_t_all_contig_annotations +2 vdj_v1_mm_c57bl6_splenocytes_t_all_contig_annotations_TRA TRA vdj_v1_mm_c57bl6_splenocytes_t_all_contig_annotations +3 vdj_v1_mm_c57bl6_splenocytes_t_all_contig_annotations_TRB TRB vdj_v1_mm_c57bl6_splenocytes_t_all_contig_annotations +5 vdj_v1_mm_c57bl6_splenocytes_t_consensus_annotations_TRA TRA vdj_v1_mm_c57bl6_splenocytes_t_consensus_annotations +6 vdj_v1_mm_c57bl6_splenocytes_t_consensus_annotations_TRB TRB vdj_v1_mm_c57bl6_splenocytes_t_consensus_annotations +``` + +Congrats! Now your data is ready for exploration. Follow the steps [here](https://immunarch.com/articles/v3_basic_analysis.html) to learn more about how to explore your dataset. + +## Note on barcodes + +Another important note is that some of the contigs files lack a column for barcodes - a unique identified of a cell. + +These files can be useful for analysis of single chain data (only alpha, or beta TCRs), but in order to analyze paired-chain data and fully utilize the full power of single-cell technologies, you should upload the file with barcodes to the Immunarch. + +# Paired-chain data + +Do you have single-cell data? Immunarch can now parse both single-chain and paired-chain single-cell data. Try out these features here. diff --git a/vignettes/web_only/load_mixcr.Rmd b/vignettes/web_only/load_mixcr.Rmd new file mode 100644 index 00000000..44ae28f1 --- /dev/null +++ b/vignettes/web_only/load_mixcr.Rmd @@ -0,0 +1,254 @@ +--- +title: "Loading MiXCR Data" +author: 'ImmunoMind – improving design of T-cell therapies using multi-omics and AI. Research and biopharma partnerships, more details: immunomind.io' +date: "support@immunomind.io" +output: + html_document: + fig_height: 8 + fig_width: 10 + theme: spacelab + toc: yes + pdf_document: + toc: yes + word_document: + toc: yes +--- + + + + +# Intro to MiXCR +MiXCR is a universal software for fast and accurate extraction of T- and B- cell receptor repertoires from any type of sequencing data. It handles paired- and single-end reads, considers sequence quality, corrects PCR errors and identifies germline hypermutations. The software supports both partial- and full-length profiling and employs all available RNA or DNA information, including sequences upstream of V and downstream of J gene segments. + +Some features include: + +- Extracts both T- and B-cell receptor repertoires +- Extracts repertoire data even from regular RNA-Seq +- Successfully analyses full-length antibody data + +Find more info about MiXCR here + +# Prepare MiXCR Data +Follow these instructions here to install MiXCR and get started with processing data. **Important Note:** Currently supports MiXCR version 3 and above. + +MiXCR supports the following formats of sequencing data: fasta, fastq, fastq.gz, paired-end fastq and fastq.gz. In this tutorial I used the real IGH data from here. + +You can choose to use the `analyze amplicon` method to process in one go: + +```{r, eval=F} +> mixcr analyze amplicon --species hs \ + --starting-material dna \ + --5-end v-primers \ + --3-end j-primers \ + --adapters adapters-present \ + --receptor-type IGH \ + input_R1.fastq input_R2.fastq analysis +``` + +or execute each step `align`, `assemble`, and `exportClones` individually. + +```{r, eval=F} +> mixcr align -s hs -OvParameters.geneFeatureToAlign=VTranscript \ + --report analysis.report input_R1.fastq input_R2.fastq analysis.vdjca + +Analysis Date: Mon Aug 25 15:22:39 MSK 2014 +Input file(s): input_r1.fastq,input_r2.fastq +Output file: alignments.vdjca +Command line arguments: align --report alignmentReport.log input_r1.fastq input_r2.fastq alignments.vdjca +Total sequencing reads: 323248 +Successfully aligned reads: 210360 +Successfully aligned, percent: 65.08% +Alignment failed because of absence of V hits: 4.26% +Alignment failed because of absence of J hits: 30.19% +Alignment failed because of low total score: 0.48% + +``` + +# Prepare files to load + +After you run these commands, you will generate these files with detailed information about calculated clonotypes: +```{r, eval=F} +. +├── analysis.clonotypes..txt <-- This contains the count data we want! +├── analysis.clna <- Build clonotypes correct PCR and sequencing errors +├── analysis.vdjca <- Align raw sequences to reference sequences of segments (V, D, J) of IGH gene +├── analysis.report <- Information on the run +``` + +Create a new folder that only contains the specified clonotype text files from your runs, and create a metadata.txt file in the following format. + + +The metadata file "metadata.txt" has to be tab delimited file with first column named "Sample" and any number of additional columns with arbitrary names. The first column should contain base names of files without extensions in your folder. + +| **Sample** |**Sex**|**Age**|**Status**| +|:----------:|:-----:|:-----:|:--------:| +|immunoseq\_1|M |1 |C | +|immunoseq\_2|M |2 |C | +|immunoseq\_3|F |3 |A | + +The next section will explain how to load a single file or multiple samples in a folder. + +# Load into Immunarch + +In your R environment, run the commands below. The output should be similar. You can run it on the [entire folder](#loading-a-folder) or a [single file](#loading-a-single-file). `repLoad` will ignore the file formats that are unsupported. + +## Loading a single file + +```{r, eval=F} +# 1.1) Load the package into R: +> library(immunarch) + +# 1.2) Replace with the path to your clonotypes file +> file_path = "path/to/your/mixcr/data/analysis.clonotypes.IGH.txt" + +# 1.3) Load MiXCR data with repLoad +> immdata_mixcr <- repLoad(file_path) + +== Step 1/3: loading repertoire files... == + +Processing "" ... + -- Parsing "/path/to/your/mixcr/data/analysis.clonotypes.IGH.txt" -- mixcr + +== Step 2/3: checking metadata files and merging... == + +Processing "" ... + -- Metadata file not found; creating a dummy metadata... + +== Step 3/3: splitting data by barcodes and chains... == + +Done! +``` + +Congrats! Now your data is ready for exploration. Follow the steps here to learn more about how to explore your dataset. + +```{r, eval=F} +r$> immdata_mixcr +$data +$data$analysis.clonotypes.IGH +# A tibble: 33,812 x 15 + Clones Proportion CDR3.nt CDR3.aa V.name D.name J.name V.end D.start D.end J.start VJ.ins VD.ins DJ.ins Sequence + + 1 230 0.00284 TGTGTGAGACATAAACC… CVRHKPMVQ… IGHV4-39 IGHD3-10… IGHJ6 12 NA 5 36 9 3 6 TGTGTGAGACATAAACC… + 2 201 0.00248 TGTGCGATTTGGGATGT… CAIWDVGLR… IGHV4-34 IGHD2-21 IGHJ4… 7 NA 5 29 10 7 3 TGTGCGATTTGGGATGT… + 3 179 0.00221 TGTGCGAGAGATCATGC… CARDHAGFG… IGHV1-69… IGHD3-10 IGHJ6 13 NA 4 40 18 5 13 TGTGCGAGAGATCATGC… + 4 99 0.00122 TGTGCGAGATGGGGATA… CARWGYCIN… IGHV4-39 IGHD2-8 IGHJ6 9 NA 6 64 23 2 21 TGTGCGAGATGGGGATA… + 5 97 0.00120 TGTGCGAGAGGCCCCAC… CARGPTSSE… IGHV4-34 IGHD3-22… IGHJ6 13 NA 6 52 26 24 2 TGTGCGAGAGGCCCCAC… + 6 97 0.00120 TGTGCGCACCACTATAC… CAHHYTSDY… IGHV2-5 IGHD1-26 IGHJ5 9 NA 2 39 19 NA 20 TGTGCGCACCACTATAC… + 7 92 0.00114 TGTGCGAGAGGCCCTCC… CARGPPSMG… IGHV4-34 IGHD5-24… IGHJ4 13 NA 3 38 11 6 5 TGTGCGAGAGGCCCTCC… + 8 84 0.00104 TGTGCGAGGTGGCTTGG… CARWLGEDI… IGHV4-39 IGHD3-16… IGHJ4… 8 NA 6 32 13 4 9 TGTGCGAGGTGGCTTGG… + 9 83 0.00103 TGTGCGAGAGGCCGCAG… CARGRSGDP… IGHV4-34 IGHD2-2,… IGHJ5 13 NA 4 50 18 13 5 TGTGCGAGAGGCCGCAG… +10 81 0.00100 TGTGTGAGTCACCTCCT… CVSHLLDTS… IGHV1-2 IGHD2-21… IGHJ4… 8 NA 3 40 20 14 6 TGTGTGAGTCACCTCCT… +# … with 33,802 more rows + + +$meta +# A tibble: 1 x 1 + Sample + +1 analysis.clonotypes.IGH +``` + +```{r, eval=F} + +``` + +## Loading a folder + +In this tutorial I used three identical samples just to show the output, but you should put all your output `.txt` clonotype files in this folder, along with your `metadata.txt` file. + +```{r, eval=F} +# 1.1) Load the package into R: +> library(immunarch) + +# 1.2) Replace with the path to the folder with your processed MiXCR data. +> file_path = "/path/to/your/mixcr/data/" + +# 1.3) Load MiXCR data with repLoad +> immdata_mixcr <- repLoad(file_path) + +== Step 1/3: loading repertoire files... == + +Processing "/path/to/your/mixcr/data/" ... + -- Parsing "/path/to/your/mixcr/data/analysis.clonotypes.IGH_1.txt" -- mixcr + -- Parsing "/path/to/your/mixcr/data/analysis.clonotypes.IGH_2.txt" -- mixcr + -- Parsing "/path/to/your/mixcr/data/analysis.clonotypes.IGH_3.txt" -- mixcr + -- Parsing "/path/to/your/mixcr/data/metadata.txt" -- metadata + +== Step 2/3: checking metadata files and merging files... == + +Processing "/path/to/your/mixcr/data/" ... + -- Everything is OK! + +== Step 3/3: processing paired chain data... == + +Done! +``` + +Now let's take a look at the data! Your output should look something like below. + +```{r, eval=F} +r$> immdata_mixcr +$data +$data$analysis.clonotypes.IGH_1 +# A tibble: 32,744 x 15 + Clones Proportion CDR3.nt CDR3.aa V.name D.name J.name V.end D.start D.end J.start VJ.ins VD.ins DJ.ins Sequence + + 1 230 0.00284 TGTGTGAGACATAAACCTATGG… CVRHKPMVQG… IGHV4-39 IGHD3-10, … IGHJ6 12 NA 5 36 9 3 6 TGTGTGAGACATAAACCTATG… + 2 201 0.00248 TGTGCGATTTGGGATGTGGGAC… CAIWDVGLRH… IGHV4-34 IGHD2-21 IGHJ4,… 7 NA 5 29 10 7 3 TGTGCGATTTGGGATGTGGGA… + 3 179 0.00221 TGTGCGAGAGATCATGCGGGGT… CARDHAGFGK… IGHV1-69… IGHD3-10 IGHJ6 13 NA 4 40 18 5 13 TGTGCGAGAGATCATGCGGGG… + 4 99 0.00122 TGTGCGAGATGGGGATATTGTA… CARWGYCING… IGHV4-39 IGHD2-8 IGHJ6 9 NA 6 64 23 2 21 TGTGCGAGATGGGGATATTGT… + 5 97 0.00120 TGTGCGAGAGGCCCCACGAGCA… CARGPTSSEW… IGHV4-34 IGHD3-22, … IGHJ6 13 NA 6 52 26 24 2 TGTGCGAGAGGCCCCACGAGC… + 6 97 0.00120 TGTGCGCACCACTATACCAGCG… CAHHYTSDYY… IGHV2-5 IGHD1-26 IGHJ5 9 NA 2 39 19 NA 20 TGTGCGCACCACTATACCAGC… + 7 92 0.00114 TGTGCGAGAGGCCCTCCGTCGA… CARGPPSMGT… IGHV4-34 IGHD5-24, … IGHJ4 13 NA 3 38 11 6 5 TGTGCGAGAGGCCCTCCGTCG… + 8 84 0.00104 TGTGCGAGGTGGCTTGGGGAAG… CARWLGEDIR… IGHV4-39 IGHD3-16, … IGHJ4,… 8 NA 6 32 13 4 9 TGTGCGAGGTGGCTTGGGGAA… + 9 83 0.00103 TGTGCGAGAGGCCGCAGCGGCG… CARGRSGDPY… IGHV4-34 IGHD2-2, I… IGHJ5 13 NA 4 50 18 13 5 TGTGCGAGAGGCCGCAGCGGC… +10 81 0.00100 TGTGTGAGTCACCTCCTCGACA… CVSHLLDTSD… IGHV1-2 IGHD2-21, … IGHJ4,… 8 NA 3 40 20 14 6 TGTGTGAGTCACCTCCTCGAC… +# … with 32,734 more rows + +$data$analysis.clonotypes.IGH_2 +# A tibble: 32,744 x 15 + Clones Proportion CDR3.nt CDR3.aa V.name D.name J.name V.end D.start D.end J.start VJ.ins VD.ins DJ.ins Sequence + + 1 230 0.00284 TGTGTGAGACATAAACCTATGG… CVRHKPMVQG… IGHV4-39 IGHD3-10, … IGHJ6 12 NA 5 36 9 3 6 TGTGTGAGACATAAACCTATG… + 2 201 0.00248 TGTGCGATTTGGGATGTGGGAC… CAIWDVGLRH… IGHV4-34 IGHD2-21 IGHJ4,… 7 NA 5 29 10 7 3 TGTGCGATTTGGGATGTGGGA… + 3 179 0.00221 TGTGCGAGAGATCATGCGGGGT… CARDHAGFGK… IGHV1-69… IGHD3-10 IGHJ6 13 NA 4 40 18 5 13 TGTGCGAGAGATCATGCGGGG… + 4 99 0.00122 TGTGCGAGATGGGGATATTGTA… CARWGYCING… IGHV4-39 IGHD2-8 IGHJ6 9 NA 6 64 23 2 21 TGTGCGAGATGGGGATATTGT… + 5 97 0.00120 TGTGCGAGAGGCCCCACGAGCA… CARGPTSSEW… IGHV4-34 IGHD3-22, … IGHJ6 13 NA 6 52 26 24 2 TGTGCGAGAGGCCCCACGAGC… + 6 97 0.00120 TGTGCGCACCACTATACCAGCG… CAHHYTSDYY… IGHV2-5 IGHD1-26 IGHJ5 9 NA 2 39 19 NA 20 TGTGCGCACCACTATACCAGC… + 7 92 0.00114 TGTGCGAGAGGCCCTCCGTCGA… CARGPPSMGT… IGHV4-34 IGHD5-24, … IGHJ4 13 NA 3 38 11 6 5 TGTGCGAGAGGCCCTCCGTCG… + 8 84 0.00104 TGTGCGAGGTGGCTTGGGGAAG… CARWLGEDIR… IGHV4-39 IGHD3-16, … IGHJ4,… 8 NA 6 32 13 4 9 TGTGCGAGGTGGCTTGGGGAA… + 9 83 0.00103 TGTGCGAGAGGCCGCAGCGGCG… CARGRSGDPY… IGHV4-34 IGHD2-2, I… IGHJ5 13 NA 4 50 18 13 5 TGTGCGAGAGGCCGCAGCGGC… +10 81 0.00100 TGTGTGAGTCACCTCCTCGACA… CVSHLLDTSD… IGHV1-2 IGHD2-21, … IGHJ4,… 8 NA 3 40 20 14 6 TGTGTGAGTCACCTCCTCGAC… +# … with 32,734 more rows + +$data$analysis.clonotypes.IGH_3 +# A tibble: 32,744 x 15 + Clones Proportion CDR3.nt CDR3.aa V.name D.name J.name V.end D.start D.end J.start VJ.ins VD.ins DJ.ins Sequence + + 1 230 0.00284 TGTGTGAGACATAAACCTATGG… CVRHKPMVQG… IGHV4-39 IGHD3-10, … IGHJ6 12 NA 5 36 9 3 6 TGTGTGAGACATAAACCTATG… + 2 201 0.00248 TGTGCGATTTGGGATGTGGGAC… CAIWDVGLRH… IGHV4-34 IGHD2-21 IGHJ4,… 7 NA 5 29 10 7 3 TGTGCGATTTGGGATGTGGGA… + 3 179 0.00221 TGTGCGAGAGATCATGCGGGGT… CARDHAGFGK… IGHV1-69… IGHD3-10 IGHJ6 13 NA 4 40 18 5 13 TGTGCGAGAGATCATGCGGGG… + 4 99 0.00122 TGTGCGAGATGGGGATATTGTA… CARWGYCING… IGHV4-39 IGHD2-8 IGHJ6 9 NA 6 64 23 2 21 TGTGCGAGATGGGGATATTGT… + 5 97 0.00120 TGTGCGAGAGGCCCCACGAGCA… CARGPTSSEW… IGHV4-34 IGHD3-22, … IGHJ6 13 NA 6 52 26 24 2 TGTGCGAGAGGCCCCACGAGC… + 6 97 0.00120 TGTGCGCACCACTATACCAGCG… CAHHYTSDYY… IGHV2-5 IGHD1-26 IGHJ5 9 NA 2 39 19 NA 20 TGTGCGCACCACTATACCAGC… + 7 92 0.00114 TGTGCGAGAGGCCCTCCGTCGA… CARGPPSMGT… IGHV4-34 IGHD5-24, … IGHJ4 13 NA 3 38 11 6 5 TGTGCGAGAGGCCCTCCGTCG… + 8 84 0.00104 TGTGCGAGGTGGCTTGGGGAAG… CARWLGEDIR… IGHV4-39 IGHD3-16, … IGHJ4,… 8 NA 6 32 13 4 9 TGTGCGAGGTGGCTTGGGGAA… + 9 83 0.00103 TGTGCGAGAGGCCGCAGCGGCG… CARGRSGDPY… IGHV4-34 IGHD2-2, I… IGHJ5 13 NA 4 50 18 13 5 TGTGCGAGAGGCCGCAGCGGC… +10 81 0.00100 TGTGTGAGTCACCTCCTCGACA… CVSHLLDTSD… IGHV1-2 IGHD2-21, … IGHJ4,… 8 NA 3 40 20 14 6 TGTGTGAGTCACCTCCTCGAC… +# … with 32,734 more rows + + +$meta +# A tibble: 3 x 4 + Sample Sex Age Status + +1 analysis.clonotypes.IGH_1 M 1 C +2 analysis.clonotypes.IGH_2 M 2 C +3 analysis.clonotypes.IGH_3 F 3 A +``` + +Congrats! Now your data is ready for exploration. Follow the steps [here](https://immunarch.com/articles/v3_basic_analysis.html) to learn more about how to explore your dataset. diff --git a/vignettes/web_only/v10_prop.Rmd b/vignettes/web_only/v10_prop.Rmd index 93347128..0391ed5e 100644 --- a/vignettes/web_only/v10_prop.Rmd +++ b/vignettes/web_only/v10_prop.Rmd @@ -1,6 +1,6 @@ --- -title: 'Analysis of biophysicochemical properties of CDR3 sequences' -author: 'ImmunoMind' +title: "Analysis of biophysicochemical properties of CDR3 sequences" +author: 'ImmunoMind – improving design of T-cell therapies using multi-omics and AI. Research and biopharma partnerships, more details: immunomind.io' date: "support@immunomind.io" output: html_document: diff --git a/vignettes/web_only/v11_db.Rmd b/vignettes/web_only/v11_db.Rmd index 03dc81c0..1c27b48a 100644 --- a/vignettes/web_only/v11_db.Rmd +++ b/vignettes/web_only/v11_db.Rmd @@ -1,9 +1,9 @@ --- -title: 'Annotate clonotypes using immune receptor databases' -author: 'ImmunoMind' +title: "Annotate clonotypes using immune receptor databases" +author: 'ImmunoMind – improving design of T-cell therapies using multi-omics and AI. Research and biopharma partnerships, more details: immunomind.io' date: "support@immunomind.io" output: - html_document: + html_document: fig_height: 8 fig_width: 10 theme: spacelab @@ -110,7 +110,7 @@ After downloading the database, we can proceed to the annotation part with R. To ## Preprocessing databases with R -For the start, we need to load databases into R and filter out non-human, non-TRB and non-CMV data from the input database. With databases, we follow the same philosophy as with `repLoad` and `vis` functions: the function `dbLoad` provides a singular interface to loading and basic quering for all supported databases. +For the start, we need to load databases into R and filter out non-human, non-TRB and non-CMV data from the input database. With databases, we follow the same philosophy as with `repLoad` and `vis` functions: the function `dbLoad` provides a singular interface to loading and basic quering for all supported databases. For demonstration purposes, we will process each of the supported databases below. @@ -188,7 +188,7 @@ dbAnnotate(immdata$data, local_db, c("CDR3.aa", "V.name"), c("Seq", "V")) ### Visualisation -Visualisation with the `vis()` function will be supported in the next major release of `immunarch`. You can use `ggplot2` to visualise distributions of found clonotypes. +Visualisation with the `vis()` function will be supported in the next major release of `immunarch`. You can use `ggplot2` to visualise distributions of found clonotypes. ## Advanced filtering diff --git a/vignettes/web_only/v21_singlecell.Rmd b/vignettes/web_only/v21_singlecell.Rmd index 659b68cd..fa27cdaf 100644 --- a/vignettes/web_only/v21_singlecell.Rmd +++ b/vignettes/web_only/v21_singlecell.Rmd @@ -1,6 +1,6 @@ --- title: "Single-cell and paired chain data: scTCRseq and scBCRseq exploration" -author: 'ImmunoMind' +author: 'ImmunoMind – improving design of T-cell therapies using multi-omics and AI. Research and biopharma partnerships, more details: immunomind.io' date: "support@immunomind.io" output: html_document: @@ -41,7 +41,7 @@ data(scdata) > This is a vignette dedicated to provide an overview on how to work with single-cell paired chain data in `immunarch` -> Single-cell support is currently in the development version. In order to access it, you need to install the latest development version of the package by executing the following command: +> Single-cell support is currently in the development version. In order to access it, you need to install the latest development version of the package by executing the following command: ```r install.packages("devtools"); devtools::install_github("immunomind/immunarch", ref="dev") @@ -131,4 +131,3 @@ p3 <- trackClonotypes(scdata_cl$data, target, .col = "aa") %>% vis() ``` Several functions may work incorrectly with paired chain data in this release of `immunarch`. Let us know via [GitHub Issues](http://github.com/immunomind/immunarch/issues)! - diff --git a/vignettes/web_only/v3_basic_analysis.Rmd b/vignettes/web_only/v3_basic_analysis.Rmd index 30de2cfa..d28c1853 100644 --- a/vignettes/web_only/v3_basic_analysis.Rmd +++ b/vignettes/web_only/v3_basic_analysis.Rmd @@ -1,6 +1,6 @@ --- -title: 'Basic analysis and clonality' -author: 'ImmunoMind' +title: "Basic analysis and clonality" +author: 'ImmunoMind – improving design of T-cell therapies using multi-omics and AI. Research and biopharma partnerships, more details: immunomind.io' date: "support@immunomind.io" output: html_document: @@ -60,7 +60,7 @@ Basic analysis functions are: - `repClonality` - to compute the clonality of repertoires. -- `repOverlap` - to compute the repertoire overlap. +- `repOverlap` - to compute the repertoire overlap. - `repOverlapAnalysis` - to analyse the repertoire overlap, including different clustering procedures and PCA. diff --git a/vignettes/web_only/v4_overlap.Rmd b/vignettes/web_only/v4_overlap.Rmd index 28a70a36..d7d2409c 100644 --- a/vignettes/web_only/v4_overlap.Rmd +++ b/vignettes/web_only/v4_overlap.Rmd @@ -1,6 +1,6 @@ --- -title: 'Repertoire overlap and public clonotypes' -author: 'ImmunoMind' +title: "Repertoire overlap and public clonotypes" +author: 'ImmunoMind – improving design of T-cell therapies using multi-omics and AI. Research and biopharma partnerships, more details: immunomind.io' date: "support@immunomind.io" output: html_document: @@ -39,7 +39,7 @@ data(immdata) ``` # Repertoire overlap -Repertoire overlap is the most common approach to measure repertoire similarity. It is achieved by computation of specific statistics on clonotypes shared between given repertoires, also called "public" clonotypes. `immunarch` provides several indices: +Repertoire overlap is the most common approach to measure repertoire similarity. It is achieved by computation of specific statistics on clonotypes shared between given repertoires, also called "public" clonotypes. `immunarch` provides several indices: - number of public clonotypes (`.method = "public"`) - a classic measure of overlap similarity. - overlap coefficient (`.method = "overlap"`) - a normalised measure of overlap similarity. It is defined as the size of the intersection divided by the smaller of the size of the two sets. diff --git a/vignettes/web_only/v5_gene_usage.Rmd b/vignettes/web_only/v5_gene_usage.Rmd index e712d25c..a9afa6c2 100644 --- a/vignettes/web_only/v5_gene_usage.Rmd +++ b/vignettes/web_only/v5_gene_usage.Rmd @@ -1,6 +1,6 @@ --- -title: 'Gene usage analysis' -author: 'ImmunoMind' +title: "Gene usage analysis" +author: 'ImmunoMind – improving design of T-cell therapies using multi-omics and AI. Research and biopharma partnerships, more details: immunomind.io' date: "support@immunomind.io" output: html_document: diff --git a/vignettes/web_only/v6_diversity.Rmd b/vignettes/web_only/v6_diversity.Rmd index 657564b2..4731bb56 100644 --- a/vignettes/web_only/v6_diversity.Rmd +++ b/vignettes/web_only/v6_diversity.Rmd @@ -1,6 +1,6 @@ --- -title: 'Diversity estimation' -author: 'ImmunoMind' +title: "Diversity estimation" +author: 'ImmunoMind – improving design of T-cell therapies using multi-omics and AI. Research and biopharma partnerships, more details: immunomind.io' date: "support@immunomind.io" output: html_document: diff --git a/vignettes/web_only/v7_fixvis.Rmd b/vignettes/web_only/v7_fixvis.Rmd index f0431c38..a98b030a 100644 --- a/vignettes/web_only/v7_fixvis.Rmd +++ b/vignettes/web_only/v7_fixvis.Rmd @@ -1,6 +1,6 @@ --- -title: 'Make your plots publication-ready with fixVis' -author: 'ImmunoMind' +title: "Make your plots publication-ready with fixVis" +author: 'ImmunoMind – improving design of T-cell therapies using multi-omics and AI. Research and biopharma partnerships, more details: immunomind.io' date: "support@immunomind.io" output: html_document: diff --git a/vignettes/web_only/v8_tracking.Rmd b/vignettes/web_only/v8_tracking.Rmd index 927ec4c8..250f4214 100644 --- a/vignettes/web_only/v8_tracking.Rmd +++ b/vignettes/web_only/v8_tracking.Rmd @@ -1,6 +1,6 @@ --- -title: 'Tracking clonotypes across time points in `immunarch`' -author: 'ImmunoMind' +title: "Tracking clonotypes across time points in `immunarch`" +author: 'ImmunoMind – improving design of T-cell therapies using multi-omics and AI. Research and biopharma partnerships, more details: immunomind.io' date: "support@immunomind.io" output: html_document: @@ -36,13 +36,13 @@ library(immunarch) data(immdata) ``` -# Tracking of clonotypes +# Tracking of clonotypes Clonotype tracking is popular approach to monitor changes in frequency of clonotypes of interest in vaccination and cancer immunology. For example, a researcher can track a clonotype of across different time points in pre- and post-vaccination repertoires. Or analyse growth of malignant clonotypes in tumor sample. Various methods of clonotype tracking are integrated into the one `trackClonotypes` function. Currently, there are three methods to choose from. The output of `trackClonotypes` can be immediately visualised with the `vis` function. ## Tracking the most abundant clonotypes -The simplest approach is to choose the most abundant clonotypes from one of the input immune repertoires and track across all immune repertoires in a batch. Arguments `.which` and `.col` are used to choose the immune repertoire, the number of clonotypes to take from it and which columns to use. +The simplest approach is to choose the most abundant clonotypes from one of the input immune repertoires and track across all immune repertoires in a batch. Arguments `.which` and `.col` are used to choose the immune repertoire, the number of clonotypes to take from it and which columns to use. To choose the 10 most abundant clonotypes from the first repertoire and track them using their CDR3 nucleotide sequence: diff --git a/vignettes/web_only/v9_kmers.Rmd b/vignettes/web_only/v9_kmers.Rmd index bd28c2a5..d1861d6a 100644 --- a/vignettes/web_only/v9_kmers.Rmd +++ b/vignettes/web_only/v9_kmers.Rmd @@ -1,6 +1,6 @@ --- -title: 'Kmer and sequence motif analysis and visualisation' -author: 'ImmunoMind' +title: "Kmer and sequence motif analysis and visualisation" +author: 'ImmunoMind – improving design of T-cell therapies using multi-omics and AI. Research and biopharma partnerships, more details: immunomind.io' date: "support@immunomind.io" output: html_document: @@ -122,7 +122,7 @@ Argument `.method` specifies which matrix to compute: - `.method = "prob"` - position probability matrix (PPM); - `.method = "wei"` - position weight matrix (PWM); - `.method = "self"` - self-information matrix. - + ```{r} kmer_profile(kmers, "freq") kmer_profile(kmers, "prob")