From 983a24edac6fcad3269d8a9064e11555e341723d Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Fri, 7 Jun 2024 16:30:12 -0700 Subject: [PATCH] update tutorial (#3) --- .gitignore | 3 +- _quarto.yml | 70 +++++++----- index.qmd | 26 ++--- requirements.txt | 4 +- rpackages.R | 2 +- tutorials/annotate_cell_types.qmd | 170 +++++++++++++++++++++++++----- 6 files changed, 206 insertions(+), 69 deletions(-) diff --git a/.gitignore b/.gitignore index 18d05db..78ec4c4 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,5 @@ _freeze chapters/zilinois_lung_with_celltypist/ *whee.h5 *.tiledb -*_cache \ No newline at end of file +*_cache +*_files \ No newline at end of file diff --git a/_quarto.yml b/_quarto.yml index 28126ea..4d45764 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -1,41 +1,59 @@ project: - type: book + type: website output-dir: docs execute: freeze: auto cache: true -book: +website: title: "Bioc2024: Interoperability between R and Python using BiocPy" - author: "[Jayaram Kancherla](mailto:jayaram.kancherla@gmail.com)" - contributor: "[Aaron Lun](mailto:infinite.monkeys.with.keys@gmail.com)" - favicon: ./assets/short.png - site-url: https://biocpy.github.io/tutorial - date: "5/31/2024" - search: true - repo-url: https://github.com/BiocPy/tutorial - repo-actions: [issue] - # downloads: [pdf, epub] - sharing: [twitter] - twitter-card: true - cover-image: ./assets/full.png - page-footer: - center: - - text: "(c) BiocPy core contributors" - # href: chapters/summary.qmd sidebar: - style: docked - background: light - chapters: - - index.qmd - - tutorials/annotate_cell_types.qmd - - tutorials/sessioninfo.qmd - # - chapters/references.qmd + # search: true + contents: + - index.qmd + - section: "Tutorials" + contents: + - tutorials/annotate_cell_types.qmd + - tutorials/sessioninfo + + # navbar: + # left: + # - href: index.qmd + # text: Home + # - tutorials/annotate_cell_types.qmd + # - tutorials/sessioninfo + +# book: +# title: "Bioc2024: Interoperability between R and Python using BiocPy" +# author: "[Jayaram Kancherla](mailto:jayaram.kancherla@gmail.com)" +# contributor: "[Aaron Lun](mailto:infinite.monkeys.with.keys@gmail.com)" +# favicon: ./assets/short.png +# site-url: https://biocpy.github.io/tutorial +# date: "5/31/2024" +# search: true +# repo-url: https://github.com/BiocPy/tutorial +# repo-actions: [issue] +# # downloads: [pdf, epub] +# sharing: [twitter] +# twitter-card: true +# cover-image: ./assets/full.png +# page-footer: +# center: +# - text: "(c) BiocPy core contributors" +# # href: chapters/summary.qmd +# sidebar: +# style: docked +# background: light +# chapters: +# - index.qmd +# - tutorials/annotate_cell_types.qmd +# - tutorials/sessioninfo.qmd +# # - chapters/references.qmd format: html: - theme: cosmo + theme: sandstone number-sections: false code-link: true toc: true diff --git a/index.qmd b/index.qmd index 2456411..9c2953f 100644 --- a/index.qmd +++ b/index.qmd @@ -1,18 +1,18 @@ # Welcome -The BiocPy project aims to facilitate development of Bioconductor workflows in Python. - -This tutorial for the workshop will provide an overview of the core data structures implemented in BiocPy -(e.g., `GenomicRanges`, `SummarizedExperiment`) that were ported from R. -Participants will be guided through two Bioconductor-derived workflows in Python. -The first will involve reading an RDS file containing genomic ranges and performing -downstream range-based analyses. -The second will use the `scRNAseq` package to access public single-cell RNA-seq datasets, -followed by cell type annotation using the `SinglePy` package. -Attendees will learn how to represent and manipulate their datasets in Python in the -same manner as in R/Bioconductor. - -All packages in BiocPy are published to PyPI, and code is open-source on [GitHub](https://github.com/BiocPy). +Welcome to our workshop on exploring the data structures and packages +available in [BiocPy](https://github.com/biocpy), a project that brings +the power of Bioconductor to Python. + +In this workshop, we will focus on interoperability between R and Python, covering two main topics: + +- Reading an RDS file containing `GenomicRanges` and performing downstream range-based analyses. +- Annotating single-cell RNA-seq data analysis using the [scrnaseq](https://github.com/biocpy/scrnaseq) package. + +Attendees will learn how to represent and manipulate their datasets in Python +in the same manner as in R/Bioconductor. +All packages in BiocPy are published to PyPI, and the code is open-source +on [GitHub](https://github.com/BiocPy). ### Core contributors diff --git a/requirements.txt b/requirements.txt index d344d31..2f00338 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,4 +18,6 @@ seaborn session-info celldex scrnaseq -anndata \ No newline at end of file +anndata +matplotlib +scanpy \ No newline at end of file diff --git a/rpackages.R b/rpackages.R index 8b58492..1c751eb 100644 --- a/rpackages.R +++ b/rpackages.R @@ -1,3 +1,3 @@ install.packages(c("BiocManager"), repos='http://cran.us.r-project.org') library(BiocManager) -BiocManager::install(c("scRNAseq", "celldex", "SingleR", "scuttle", "reticulate", "rmarkdown", "knitr", "downlit", "xml2")) \ No newline at end of file +BiocManager::install(c("scRNAseq", "celldex", "SingleR", "scuttle", "reticulate", "rmarkdown", "knitr", "downlit", "xml2", "ggplot2", "edgeR")) \ No newline at end of file diff --git a/tutorials/annotate_cell_types.qmd b/tutorials/annotate_cell_types.qmd index db566a1..72e1afc 100644 --- a/tutorials/annotate_cell_types.qmd +++ b/tutorials/annotate_cell_types.qmd @@ -1,46 +1,64 @@ # Tutorial 2: Access single-cell datasets from `scRNAseq` collection and annotate cell types -Welcome to this tutorial on accessing public single-cell datasets using `scRNAseq` and language-agnostic representations from `ArtifactDB`. We will showcase how to integrate and process single-cell datasets across languages, such as R and Python, and how to annotate cell types using reference datasets. Let's dive into the process! +Welcome to this tutorial on annotating single-cell datasets with reference collections. The **scRNAseq** ([R/Bioc](https://bioconductor.org/packages/devel/data/experiment/html/scRNAseq.html), [Python](https://github.com/BiocPy/scrnaseq)) package provides access to public single-cell RNA-seq datasets for use by other Bioconductor/BiocPy packages and workflows. These datasets are stored in language agnostic representations described in [ArtifactDB](https://github.com/artifactdb), enabling easy access to datasets and analysis results across multiple programming languages such as R and Python. We will showcase how to integrate and process single-cell datasets across languages, such as R and Python, and how to annotate cell types using reference datasets. +## Outline -## Introduction +In this tutorial, you will learn how to: -The [scRNAseq](https://bioconductor.org/packages/devel/data/experiment/html/scRNAseq.html) package provides access to public single-cell RNA-seq datasets for use by other Bioconductor packages and workflows. This package has been updated to store datasets in language-agnostic representations described in [ArtifactDB](https://github.com/artifactdb), enabling easy access to datasets and analysis results across multiple programming languages such as R and Python. +- Install and set up BiocPy packages in your Python environment. +- Explore the `scrnaseq` package and access public single-cell RNA-seq datasets. +- Perform basic operations on `SingleCellExperiment` objects, the core data structure for single-cell data analysis. +- Annotate cell types using reference datasets from the `celldex` package. +- Understand the design principles behind BiocPy. -The Python equivalent of the package is available [here](https://github.com/BiocPy/scrnaseq). +Let's dive into the process! + +## Prerequisites + +Before we begin, please ensure that you have the following prerequisites installed: + +- Python 3.8 or later with dependencies listed [here]([../requirements.txt](https://github.com/BiocPy/BiocWorkshop2024/blob/master/requirements.txt)). +- R 4.4.0 and Bioconductor packages listed [here]([../rpackages.R](https://github.com/BiocPy/BiocWorkshop2024/blob/master/rpackages.R)). ## Installation -Install the package from Bioconductor for R or from PyPI for Python: +Let's start by installing the required packages. ::: {.panel-tabset} ## R ```r -BiocManager::install("scRNAseq", repos='http://cran.us.r-project.org') +BiocManager::install(c("scRNAseq", "celldex", "SingleR"), + repos='http://cran.us.r-project.org') ``` +This will install the `scRNAseq`, `celldex`, `SingleR`, packages from Bioconductor. + + ## Python (shell) + ```sh -pip install scrnaseq +pip install scrnaseq celldex singler ``` +This will install the `scrnaseq`, `celldex`, `singler` packages from PyPI. + ::: -## Find Datasets +## Accessing and Exploring Single-Cell Datasets -Each dataset is decorated with metadata such as the study title, species, number of cells, etc., to facilitate discovery. Let's see how we can list and search for datasets. +Now that we have the necessary packages installed, let's explore the `scrnaseq` package and learn how to access public single-cell RNA-seq datasets. Dataset published to the `scrnaseq` package is decorated with metadata such as the study title, species, number of cells, etc., to facilitate discovery. Let's see how we can list and search for datasets. - ### List All Datasets -The `list_datasets()` function in Python or `surveyDatasets()` in R will display all available datasets along with their metadata. +The `list_datasets()` function in Python or `surveyDatasets()` in R will display all available datasets published to the `scRNAseq` repository along with their metadata. ::: {.panel-tabset} ## R ```{r} -suppressWarnings(library(scRNAseq)) +suppressMessages(library(scRNAseq)) all_ds <- surveyDatasets() head(all_ds[, c("name", "title", "version")], 3) ``` @@ -54,6 +72,8 @@ datasets[["name", "title", "version"]].head(3) ::: +This R|Python code lists all available datasets in the scrnaseq package and displays their names, titles, and versions. + ### Search for Datasets You can also search for datasets based on metadata using `search_datasets()` in Python or `searchDatasets()` in R. This supports both simple text queries and complex boolean expressions. @@ -76,6 +96,8 @@ pancreas_datasets[["name", "title", "version"]].head(3) ::: +This R|Python code searches for datasets containing the term "pancreas" and displays their names, titles, and versions. + #### Advanced Searches For more complex searches involving boolean operations, use `define_text_query()` in Python or `defineTextQuery()` in R. Here’s an example to find datasets using the mouse reference genome (`GRCm38`) and containing the words `neuro` or `pancrea`. @@ -114,11 +136,13 @@ res[["name", "title", "version"]].head(3) ::: +This R|Python code performs a complex search to find datasets using the mouse reference genome and containing the words "neuro" or "pancrea". + ::: {.callout-important} Once a dataset is identified, always list the name and version of the dataset in your scripts for reproducibility. ::: -## Fetch a Dataset +## Download dataset After identifying a dataset of interest, use `fetch_dataset()` in Python or `fetchDataset()` in R to download the dataset. This will load the dataset as a `SingleCellExperiment` object. @@ -147,12 +171,14 @@ print(sce) ### Side-quest on SingleCellExperiment in Python -The Python version of the `SingleCellExperiment` class adheres to Bioconductor's specification and offers similar methods. Our goal is to make it simple for analysts to switch between R and Python. Key differences include a shift from functional to object-oriented paradigms. +The Python implementation of the `SingleCellExperiment` class adheres to Bioconductor's specification and offers similar interface and methods. Our goal is to make it simple for analysts to switch between R and Python. Key differences include a shift from functional to object-oriented paradigms. ::: {.callout-note} For more details on the design, refer to the [BiocPy developer guide](https://github.com/BiocPy/developer_guide) or the [singlecellexperiment](https://github.com/BiocPy/SingleCellExperiment) documentation. ::: +This Python code demonstrates basic operations on a `SingleCellExperiment` object, including retrieving assay names, column names, column metadata, accessing counts, and coercing to an `AnnData` object. + ```{python} ## repeated because quarto's build does not keep state of python snippets across the notebook. import scrnaseq @@ -173,17 +199,17 @@ print("coerce to AnnData", sce.to_anndata()) We can now annotate cell types by using reference datasets and matching cells based on their expression profiles. In this tutorial, we will use [singleR](https://github.com/SingleR-inc/SingleR) in R or its Python equivalent [singler](https://github.com/BiocPy/singler). -Before running the singler algorithm, we need to fetch reference datasets from the `celldex` package. +Before running the `singler` algorithm, we need to fetch reference datasets from the `celldex` package. -## Access Reference Datasets from `celldex` +### Access Reference Datasets from `celldex` -Similar to `scRNAseq`, the `celldex` references are converted into language-agnostic representations for use in downstream analyses. +Similar to the `scRNAseq` package, the `celldex` package provides access to reference datasets in language-agnostic representations for use in downstream analyses. ::: {.callout-note} The `celldex` package is available on [R/Bioconductor](https://bioconductor.org/packages/devel/data/experiment/html/celldex.html) and [PyPI](https://github.com/BiocPy/celldex). ::: -For this tutorial, let's download the Human Primary Cell Atlas reference from `celldex` using `fetch_reference()` in Python or `fetchReference()` in R. +For this tutorial, let's download the [Immunological Genome Project](https://www.immgen.org/) reference from `celldex` using `fetch_reference()` in Python or `fetchReference()` in R. ::: {.panel-tabset} @@ -211,8 +237,7 @@ Now, let's identify cells from the `zeisel-brain` dataset using the `immgen` ref ## R ```{r} suppressWarnings(library(SingleR)) -# sce <- scuttle::logNormCounts(sce) -cell_labels <- SingleR(test = assay(sce, "counts")[,1:100], ref = immgen_ref, labels = immgen_ref$label.main) +cell_labels <- SingleR(test = assay(sce, "counts"), ref = immgen_ref, labels = immgen_ref$label.main) table(cell_labels$labels) ``` @@ -224,17 +249,11 @@ import singler import scrnaseq sce = scrnaseq.fetch_dataset("zeisel-brain-2015", "2023-12-14", realize_assays=True) -import scranpy -options = scranpy.LogNormCountsOptions(with_size_factors=False) - -# mat = sce.assays["counts"] -# log_normed_matrix = scranpy.log_norm_counts(mat_csc, options=options) - import celldex immgen_ref = celldex.fetch_reference("immgen", "2024-02-26", realize_assays=True) matches = singler.annotate_single( - test_data=sce[:, :100], + test_data=sce, ref_data = immgen_ref, ref_labels = "label.main" ) @@ -244,6 +263,103 @@ import pandas as pd pd.Series(matches["best"]).value_counts() ``` +Note: Since the python snippets use reticulate when built through Quarto, it does keep the history. Hence the code chunk +is longer. ::: +## Visualizing Single-Cell Data + +I can't have a tutorial without a section on visualization or figures. + +TODO: generate embeddings and then visualize clusters + + +::: {.panel-tabset} + +## R + +We will use the ggplot2 package in R to create visualizations. First, let's visualize the cell type annotations. + +```{r} +suppressWarnings(library(SingleR)) +suppressWarnings(library(ggplot2)) +cell_labels <- SingleR(test = assay(sce, "counts"), ref = immgen_ref, labels = immgen_ref$label.main) +sce$labels <- cell_labels$labels + +ggplot(as.data.frame(colData(sce)), aes(x = labels)) + + geom_bar() + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + + labs(title = "Cell Type Annotations", x = "Cell Type", y = "Count") +``` + + +## Python + +We will use the seaborn and matplotlib packages in Python to create visualizations. First, let's visualize the cell type annotations. + +```{python} +import seaborn as sns +import matplotlib.pyplot as plt +import pandas as pd +import singler + +import scrnaseq +sce = scrnaseq.fetch_dataset("zeisel-brain-2015", "2023-12-14", realize_assays=True) + +import celldex +immgen_ref = celldex.fetch_reference("immgen", "2024-02-26", realize_assays=True) + +matches = singler.annotate_single( + test_data=sce, + ref_data=immgen_ref, + ref_labels="label.main" +) + +cell_labels = pd.Series(matches["best"]).value_counts() + +sns.barplot(x=cell_labels.index, y=cell_labels.values) +plt.xticks(rotation=45, ha='right') +plt.title("Cell Type Annotations") +plt.xlabel("Cell Type") +plt.ylabel("Count") +plt.show() +``` + +::: + +## Homework: Performing Differential Expression Analysis + +Differential expression analysis helps identify genes that are differentially expressed between different cell types or conditions. Let's explore how to identify markers for various cell types. + +### Differential Expression Analysis in Python + +We will use the scanpy package in Python to perform differential expression analysis. + +```python +import scanpy as sc + +import scrnaseq +sce = scrnaseq.fetch_dataset("zeisel-brain-2015", "2023-12-14", realize_assays=True) + +import celldex +immgen_ref = celldex.fetch_reference("immgen", "2024-02-26", realize_assays=True) + +import singler +matches = singler.annotate_single( + test_data=sce, + ref_data=immgen_ref, + ref_labels="label.main" +) + +# Prepare the data +adata = sce.to_anndata() +adata.obs['labels'] = matches["best"] + +# Perform differential expression analysis +sc.tl.rank_genes_groups(adata, groupby='labels', method='t-test') +sc.pl.rank_genes_groups(adata, n_genes=20, sharey=False) +``` + Congratulations! You have now completed the tutorial on accessing single-cell datasets using `scRNAseq` and `ArtifactDB`, and annotating cell types using reference datasets from `celldex`. For more detailed usage and advanced analyses, refer to the respective documentation of these packages. + +By integrating R and Python workflows, you can leverage the strengths of both languages and perform comprehensive single-cell analysis. Keep exploring and happy analyzing!