From 983a24edac6fcad3269d8a9064e11555e341723d Mon Sep 17 00:00:00 2001
From: Jayaram Kancherla <jayaram.kancherla@gmail.com>
Date: Fri, 7 Jun 2024 16:30:12 -0700
Subject: [PATCH] update tutorial (#3)

---
 .gitignore                        |   3 +-
 _quarto.yml                       |  70 +++++++-----
 index.qmd                         |  26 ++---
 requirements.txt                  |   4 +-
 rpackages.R                       |   2 +-
 tutorials/annotate_cell_types.qmd | 170 +++++++++++++++++++++++++-----
 6 files changed, 206 insertions(+), 69 deletions(-)

diff --git a/.gitignore b/.gitignore
index 18d05db..78ec4c4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,5 @@ _freeze
 chapters/zilinois_lung_with_celltypist/
 *whee.h5
 *.tiledb
-*_cache
\ No newline at end of file
+*_cache
+*_files
\ No newline at end of file
diff --git a/_quarto.yml b/_quarto.yml
index 28126ea..4d45764 100644
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -1,41 +1,59 @@
 project:
-  type: book
+  type: website
   output-dir: docs
 
 execute:
   freeze: auto
   cache: true
 
-book:
+website:
   title: "Bioc2024: Interoperability between R and Python using BiocPy"
-  author: "[Jayaram Kancherla](mailto:jayaram.kancherla@gmail.com)"
-  contributor: "[Aaron Lun](mailto:infinite.monkeys.with.keys@gmail.com)"
-  favicon: ./assets/short.png
-  site-url: https://biocpy.github.io/tutorial
-  date: "5/31/2024"
-  search: true
-  repo-url: https://github.com/BiocPy/tutorial
-  repo-actions: [issue]
-  # downloads: [pdf, epub]
-  sharing: [twitter]
-  twitter-card: true
-  cover-image: ./assets/full.png
-  page-footer:
-    center: 
-      - text: "(c) BiocPy core contributors"
-        # href: chapters/summary.qmd
   sidebar:
-    style: docked
-    background: light
-  chapters:
-    - index.qmd
-    - tutorials/annotate_cell_types.qmd
-    - tutorials/sessioninfo.qmd
-    # - chapters/references.qmd
+    # search: true
+    contents:
+      - index.qmd
+      - section: "Tutorials"
+        contents:
+          - tutorials/annotate_cell_types.qmd
+          - tutorials/sessioninfo
+
+  # navbar:
+  #   left:
+  #     - href: index.qmd
+  #       text: Home
+  #     - tutorials/annotate_cell_types.qmd
+  #     - tutorials/sessioninfo
+
+# book:
+#   title: "Bioc2024: Interoperability between R and Python using BiocPy"
+#   author: "[Jayaram Kancherla](mailto:jayaram.kancherla@gmail.com)"
+#   contributor: "[Aaron Lun](mailto:infinite.monkeys.with.keys@gmail.com)"
+#   favicon: ./assets/short.png
+#   site-url: https://biocpy.github.io/tutorial
+#   date: "5/31/2024"
+#   search: true
+#   repo-url: https://github.com/BiocPy/tutorial
+#   repo-actions: [issue]
+#   # downloads: [pdf, epub]
+#   sharing: [twitter]
+#   twitter-card: true
+#   cover-image: ./assets/full.png
+#   page-footer:
+#     center: 
+#       - text: "(c) BiocPy core contributors"
+#         # href: chapters/summary.qmd
+#   sidebar:
+#     style: docked
+#     background: light
+#   chapters:
+#     - index.qmd
+#     - tutorials/annotate_cell_types.qmd
+#     - tutorials/sessioninfo.qmd
+#     # - chapters/references.qmd
 
 format:
   html:
-    theme: cosmo
+    theme: sandstone
     number-sections: false
     code-link: true
     toc: true
diff --git a/index.qmd b/index.qmd
index 2456411..9c2953f 100644
--- a/index.qmd
+++ b/index.qmd
@@ -1,18 +1,18 @@
 # Welcome
 
-The BiocPy project aims to facilitate development of Bioconductor workflows in Python. 
-
-This tutorial for the workshop will provide an overview of the core data structures implemented in BiocPy 
-(e.g., `GenomicRanges`, `SummarizedExperiment`) that were ported from R. 
-Participants will be guided through two Bioconductor-derived workflows in Python. 
-The first will involve reading an RDS file containing genomic ranges and performing 
-downstream range-based analyses. 
-The second will use the `scRNAseq` package to access public single-cell RNA-seq datasets, 
-followed by cell type annotation using the `SinglePy` package. 
-Attendees will learn how to represent and manipulate their datasets in Python in the 
-same manner as in R/Bioconductor. 
-
-All packages in BiocPy are published to PyPI, and code is open-source on [GitHub](https://github.com/BiocPy).
+Welcome to our workshop on exploring the data structures and packages 
+available in [BiocPy](https://github.com/biocpy), a project that brings 
+the power of Bioconductor to Python. 
+
+In this workshop, we will focus on interoperability between R and Python, covering two main topics:
+
+- Reading an RDS file containing `GenomicRanges` and performing downstream range-based analyses.
+- Annotating single-cell RNA-seq data analysis using the [scrnaseq](https://github.com/biocpy/scrnaseq) package.
+
+Attendees will learn how to represent and manipulate their datasets in Python 
+in the same manner as in R/Bioconductor. 
+All packages in BiocPy are published to PyPI, and the code is open-source 
+on [GitHub](https://github.com/BiocPy).
 
 
 ### Core contributors
diff --git a/requirements.txt b/requirements.txt
index d344d31..2f00338 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,4 +18,6 @@ seaborn
 session-info
 celldex
 scrnaseq
-anndata
\ No newline at end of file
+anndata
+matplotlib
+scanpy
\ No newline at end of file
diff --git a/rpackages.R b/rpackages.R
index 8b58492..1c751eb 100644
--- a/rpackages.R
+++ b/rpackages.R
@@ -1,3 +1,3 @@
 install.packages(c("BiocManager"), repos='http://cran.us.r-project.org')
 library(BiocManager)
-BiocManager::install(c("scRNAseq", "celldex", "SingleR", "scuttle", "reticulate", "rmarkdown", "knitr", "downlit", "xml2"))
\ No newline at end of file
+BiocManager::install(c("scRNAseq", "celldex", "SingleR", "scuttle", "reticulate", "rmarkdown", "knitr", "downlit", "xml2", "ggplot2", "edgeR"))
\ No newline at end of file
diff --git a/tutorials/annotate_cell_types.qmd b/tutorials/annotate_cell_types.qmd
index db566a1..72e1afc 100644
--- a/tutorials/annotate_cell_types.qmd
+++ b/tutorials/annotate_cell_types.qmd
@@ -1,46 +1,64 @@
 # Tutorial 2: Access single-cell datasets from `scRNAseq` collection and annotate cell types
 
-Welcome to this tutorial on accessing public single-cell datasets using `scRNAseq` and language-agnostic representations from `ArtifactDB`. We will showcase how to integrate and process single-cell datasets across languages, such as R and Python, and how to annotate cell types using reference datasets. Let's dive into the process!
+Welcome to this tutorial on annotating single-cell datasets with reference collections. The **scRNAseq** ([R/Bioc](https://bioconductor.org/packages/devel/data/experiment/html/scRNAseq.html), [Python](https://github.com/BiocPy/scrnaseq)) package provides access to public single-cell RNA-seq datasets for use by other Bioconductor/BiocPy packages and workflows. These datasets are stored in language agnostic representations described in [ArtifactDB](https://github.com/artifactdb), enabling easy access to datasets and analysis results across multiple programming languages such as R and Python. We will showcase how to integrate and process single-cell datasets across languages, such as R and Python, and how to annotate cell types using reference datasets. 
 
+## Outline
 
-## Introduction
+In this tutorial, you will learn how to:
 
-The [scRNAseq](https://bioconductor.org/packages/devel/data/experiment/html/scRNAseq.html) package provides access to public single-cell RNA-seq datasets for use by other Bioconductor packages and workflows. This package has been updated to store datasets in language-agnostic representations described in [ArtifactDB](https://github.com/artifactdb), enabling easy access to datasets and analysis results across multiple programming languages such as R and Python.
+- Install and set up BiocPy packages in your Python environment.
+- Explore the `scrnaseq` package and access public single-cell RNA-seq datasets.
+- Perform basic operations on `SingleCellExperiment` objects, the core data structure for single-cell data analysis.
+- Annotate cell types using reference datasets from the `celldex` package.
+- Understand the design principles behind BiocPy.
 
-The Python equivalent of the package is available [here](https://github.com/BiocPy/scrnaseq).
+Let's dive into the process!
+
+## Prerequisites
+
+Before we begin, please ensure that you have the following prerequisites installed:
+
+- Python 3.8 or later with dependencies listed [here]([../requirements.txt](https://github.com/BiocPy/BiocWorkshop2024/blob/master/requirements.txt)).
+- R 4.4.0 and Bioconductor packages listed [here]([../rpackages.R](https://github.com/BiocPy/BiocWorkshop2024/blob/master/rpackages.R)).
 
 ## Installation
 
-Install the package from Bioconductor for R or from PyPI for Python:
+Let's start by installing the required packages.
 
 ::: {.panel-tabset}
 
 ## R
 ```r
-BiocManager::install("scRNAseq", repos='http://cran.us.r-project.org')
+BiocManager::install(c("scRNAseq", "celldex", "SingleR"), 
+     repos='http://cran.us.r-project.org')
 ```
 
+This will install the `scRNAseq`, `celldex`, `SingleR`, packages from Bioconductor.
+
+
 ## Python (shell)
+
 ```sh
-pip install scrnaseq
+pip install scrnaseq celldex singler
 ```
 
+This will install the `scrnaseq`, `celldex`, `singler` packages from PyPI.
+
 :::
 
-## Find Datasets
+## Accessing and Exploring Single-Cell Datasets
 
-Each dataset is decorated with metadata such as the study title, species, number of cells, etc., to facilitate discovery. Let's see how we can list and search for datasets.
+Now that we have the necessary packages installed, let's explore the `scrnaseq` package and learn how to access public single-cell RNA-seq datasets. Dataset published to the `scrnaseq` package is decorated with metadata such as the study title, species, number of cells, etc., to facilitate discovery. Let's see how we can list and search for datasets.
 
-<a name="list-all-datasets"></a>
 ### List All Datasets
 
-The `list_datasets()` function in Python or `surveyDatasets()` in R will display all available datasets along with their metadata.
+The `list_datasets()` function in Python or `surveyDatasets()` in R will display all available datasets published to the `scRNAseq` repository along with their metadata.
 
 ::: {.panel-tabset}
 
 ## R
 ```{r}
-suppressWarnings(library(scRNAseq))
+suppressMessages(library(scRNAseq))
 all_ds <- surveyDatasets()
 head(all_ds[, c("name", "title", "version")], 3)
 ```
@@ -54,6 +72,8 @@ datasets[["name", "title", "version"]].head(3)
 
 :::
 
+This R|Python code lists all available datasets in the scrnaseq package and displays their names, titles, and versions.
+
 ### Search for Datasets
 
 You can also search for datasets based on metadata using `search_datasets()` in Python or `searchDatasets()` in R. This supports both simple text queries and complex boolean expressions.
@@ -76,6 +96,8 @@ pancreas_datasets[["name", "title", "version"]].head(3)
 
 :::
 
+This R|Python code searches for datasets containing the term "pancreas" and displays their names, titles, and versions.
+
 #### Advanced Searches
 
 For more complex searches involving boolean operations, use `define_text_query()` in Python or `defineTextQuery()` in R. Here’s an example to find datasets using the mouse reference genome (`GRCm38`) and containing the words `neuro` or `pancrea`.
@@ -114,11 +136,13 @@ res[["name", "title", "version"]].head(3)
 
 :::
 
+This R|Python code performs a complex search to find datasets using the mouse reference genome and containing the words "neuro" or "pancrea".
+
 ::: {.callout-important}
 Once a dataset is identified, always list the name and version of the dataset in your scripts for reproducibility.
 :::
 
-## Fetch a Dataset
+## Download dataset
 
 After identifying a dataset of interest, use `fetch_dataset()` in Python or `fetchDataset()` in R to download the dataset. This will load the dataset as a `SingleCellExperiment` object.
 
@@ -147,12 +171,14 @@ print(sce)
 
 ### Side-quest on SingleCellExperiment in Python
 
-The Python version of the `SingleCellExperiment` class adheres to Bioconductor's specification and offers similar methods. Our goal is to make it simple for analysts to switch between R and Python. Key differences include a shift from functional to object-oriented paradigms.
+The Python implementation of the `SingleCellExperiment` class adheres to Bioconductor's specification and offers similar interface and methods. Our goal is to make it simple for analysts to switch between R and Python. Key differences include a shift from functional to object-oriented paradigms.
 
 ::: {.callout-note}
 For more details on the design, refer to the [BiocPy developer guide](https://github.com/BiocPy/developer_guide) or the [singlecellexperiment](https://github.com/BiocPy/SingleCellExperiment) documentation.
 :::
 
+This Python code demonstrates basic operations on a `SingleCellExperiment` object, including retrieving assay names, column names, column metadata, accessing counts, and coercing to an `AnnData` object.
+
 ```{python}
 ## repeated because quarto's build does not keep state of python snippets across the notebook.
 import scrnaseq
@@ -173,17 +199,17 @@ print("coerce to AnnData", sce.to_anndata())
 
 We can now annotate cell types by using reference datasets and matching cells based on their expression profiles. In this tutorial, we will use [singleR](https://github.com/SingleR-inc/SingleR) in R or its Python equivalent [singler](https://github.com/BiocPy/singler).
 
-Before running the singler algorithm, we need to fetch reference datasets from the `celldex` package.
+Before running the `singler` algorithm, we need to fetch reference datasets from the `celldex` package.
 
-## Access Reference Datasets from `celldex`
+### Access Reference Datasets from `celldex`
 
-Similar to `scRNAseq`, the `celldex` references are converted into language-agnostic representations for use in downstream analyses.
+Similar to the `scRNAseq` package, the `celldex` package provides access to reference datasets in language-agnostic representations for use in downstream analyses.
 
 ::: {.callout-note}
 The `celldex` package is available on [R/Bioconductor](https://bioconductor.org/packages/devel/data/experiment/html/celldex.html) and [PyPI](https://github.com/BiocPy/celldex).
 :::
 
-For this tutorial, let's download the Human Primary Cell Atlas reference from `celldex` using `fetch_reference()` in Python or `fetchReference()` in R.
+For this tutorial, let's download the [Immunological Genome Project](https://www.immgen.org/) reference from `celldex` using `fetch_reference()` in Python or `fetchReference()` in R.
 
 ::: {.panel-tabset}
 
@@ -211,8 +237,7 @@ Now, let's identify cells from the `zeisel-brain` dataset using the `immgen` ref
 ## R
 ```{r}
 suppressWarnings(library(SingleR))
-# sce <- scuttle::logNormCounts(sce)
-cell_labels <- SingleR(test = assay(sce, "counts")[,1:100], ref = immgen_ref, labels = immgen_ref$label.main)
+cell_labels <- SingleR(test = assay(sce, "counts"), ref = immgen_ref, labels = immgen_ref$label.main)
 
 table(cell_labels$labels)
 ```
@@ -224,17 +249,11 @@ import singler
 import scrnaseq
 sce = scrnaseq.fetch_dataset("zeisel-brain-2015", "2023-12-14", realize_assays=True)
 
-import scranpy
-options = scranpy.LogNormCountsOptions(with_size_factors=False)
-
-# mat = sce.assays["counts"]
-# log_normed_matrix = scranpy.log_norm_counts(mat_csc, options=options)
-
 import celldex
 immgen_ref = celldex.fetch_reference("immgen", "2024-02-26", realize_assays=True)
 
 matches = singler.annotate_single(
-    test_data=sce[:, :100], 
+    test_data=sce, 
     ref_data = immgen_ref,
     ref_labels = "label.main"
 )
@@ -244,6 +263,103 @@ import pandas as pd
 pd.Series(matches["best"]).value_counts()
 ```
 
+Note: Since the python snippets use reticulate when built through Quarto, it does keep the history. Hence the code chunk
+is longer.
 :::
 
+## Visualizing Single-Cell Data
+
+I can't have a tutorial without a section on visualization or figures.
+
+TODO: generate embeddings and then visualize clusters
+
+
+::: {.panel-tabset}
+
+## R
+
+We will use the ggplot2 package in R to create visualizations. First, let's visualize the cell type annotations.
+
+```{r}
+suppressWarnings(library(SingleR))
+suppressWarnings(library(ggplot2))
+cell_labels <- SingleR(test = assay(sce, "counts"), ref = immgen_ref, labels = immgen_ref$label.main)
+sce$labels <- cell_labels$labels
+
+ggplot(as.data.frame(colData(sce)), aes(x = labels)) +
+    geom_bar() +
+    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
+    labs(title = "Cell Type Annotations", x = "Cell Type", y = "Count")
+```
+
+
+## Python
+
+We will use the seaborn and matplotlib packages in Python to create visualizations. First, let's visualize the cell type annotations.
+
+```{python}
+import seaborn as sns
+import matplotlib.pyplot as plt
+import pandas as pd
+import singler
+
+import scrnaseq
+sce = scrnaseq.fetch_dataset("zeisel-brain-2015", "2023-12-14", realize_assays=True)
+
+import celldex
+immgen_ref = celldex.fetch_reference("immgen", "2024-02-26", realize_assays=True)
+
+matches = singler.annotate_single(
+    test_data=sce, 
+    ref_data=immgen_ref,
+    ref_labels="label.main"
+)
+
+cell_labels = pd.Series(matches["best"]).value_counts()
+
+sns.barplot(x=cell_labels.index, y=cell_labels.values)
+plt.xticks(rotation=45, ha='right')
+plt.title("Cell Type Annotations")
+plt.xlabel("Cell Type")
+plt.ylabel("Count")
+plt.show()
+```
+
+:::
+
+## Homework: Performing Differential Expression Analysis
+
+Differential expression analysis helps identify genes that are differentially expressed between different cell types or conditions. Let's explore how to identify markers for various cell types.
+
+### Differential Expression Analysis in Python
+
+We will use the scanpy package in Python to perform differential expression analysis.
+
+```python
+import scanpy as sc
+
+import scrnaseq
+sce = scrnaseq.fetch_dataset("zeisel-brain-2015", "2023-12-14", realize_assays=True)
+
+import celldex
+immgen_ref = celldex.fetch_reference("immgen", "2024-02-26", realize_assays=True)
+
+import singler
+matches = singler.annotate_single(
+    test_data=sce, 
+    ref_data=immgen_ref,
+    ref_labels="label.main"
+)
+
+# Prepare the data
+adata = sce.to_anndata()
+adata.obs['labels'] = matches["best"]
+
+# Perform differential expression analysis
+sc.tl.rank_genes_groups(adata, groupby='labels', method='t-test')
+sc.pl.rank_genes_groups(adata, n_genes=20, sharey=False)
+```
+
 Congratulations! You have now completed the tutorial on accessing single-cell datasets using `scRNAseq` and `ArtifactDB`, and annotating cell types using reference datasets from `celldex`. For more detailed usage and advanced analyses, refer to the respective documentation of these packages.
+
+By integrating R and Python workflows, you can leverage the strengths of both languages and perform comprehensive single-cell analysis. Keep exploring and happy analyzing!