Demonstration: Cell type annotation and dataset integration

Goals:

Integrate multiple datasets together
Transfer annotations from one dataset to another
Project one dataset onto another dataset’s embedding

1. Integrating two replicates together

We have sequenced 2 replicates of WT cells differentiated into MCCs. Let’s process both datasets independantly.

library(SingleCellExperiment)
library(tidyverse)
Bl6J_WT <- readRDS('data/MCCs/Bl6J_WT.rds')
Bl6N_WT <- readRDS('data/MCCs/Bl6N_WT.rds')
future::plan(strategy = "multicore", workers = 16)
set.seed(1000)

# Bl6J_WT
cnts <- as(assay(Bl6J_WT, 'counts'), 'dgCMatrix')
colnames(cnts) <- Bl6J_WT$Barcode
rownames(cnts) <- rownames(Bl6J_WT)
Bl6J_WT_vst <- sctransform::vst(cnts, return_cell_attr = TRUE)
Bl6J_WT <- Bl6J_WT[rownames(Bl6J_WT_vst$y), ]
assay(Bl6J_WT, 'corrected_counts', withDimnames = FALSE) <- sctransform::correct(Bl6J_WT_vst)
assay(Bl6J_WT, 'logcounts', withDimnames = FALSE) <- log1p(assay(Bl6J_WT, 'corrected_counts'))
Bl6J_WT_variance <- scran::modelGeneVar(Bl6J_WT)
HVGs <- scran::getTopHVGs(Bl6J_WT_variance, prop = 0.1)
Bl6J_WT <- scran::denoisePCA(Bl6J_WT, technical = Bl6J_WT_variance, subset.row = HVGs, min.rank = 15)
Bl6J_WT <- scater::runUMAP(Bl6J_WT)
Bl6J_WT$cluster <- factor(igraph::cluster_louvain(scran::buildSNNGraph(Bl6J_WT, use.dimred = 'PCA'))$membership)

# Bl6N_WT
cnts <- as(assay(Bl6N_WT, 'counts'), 'dgCMatrix')
colnames(cnts) <- Bl6N_WT$Barcode
rownames(cnts) <- rownames(Bl6N_WT)
Bl6N_WT_vst <- sctransform::vst(cnts, return_cell_attr = TRUE)
Bl6N_WT <- Bl6N_WT[rownames(Bl6N_WT_vst$y), ]
assay(Bl6N_WT, 'corrected_counts', withDimnames = FALSE) <- sctransform::correct(Bl6N_WT_vst)
assay(Bl6N_WT, 'logcounts', withDimnames = FALSE) <- log1p(assay(Bl6N_WT, 'corrected_counts'))
Bl6N_WT_variance <- scran::modelGeneVar(Bl6N_WT)
HVGs <- scran::getTopHVGs(Bl6N_WT_variance, prop = 0.1)
Bl6N_WT <- scran::denoisePCA(Bl6N_WT, technical = Bl6N_WT_variance, subset.row = HVGs, min.rank = 15)
Bl6N_WT <- scater::runUMAP(Bl6N_WT)
Bl6N_WT$cluster <- factor(igraph::cluster_louvain(scran::buildSNNGraph(Bl6N_WT, use.dimred = 'PCA'))$membership)

# Compare side-by-side
p <- cowplot::plot_grid(
    scater::plotReducedDim(Bl6N_WT, 'UMAP', colour_by = 'cluster', text_by = 'cluster') + ggtitle('Bl6N_WT'),
    scater::plotReducedDim(Bl6J_WT, 'UMAP', colour_by = 'cluster', text_by = 'cluster') + ggtitle('Bl6J_WT')
)
ggsave('data/MCCs/WT-replicates_UMAP.pdf', w = 10, h = 5)

Process them together without genotype correction

Bl6J_WT <- readRDS('data/MCCs/Bl6J_WT.rds')
Bl6N_WT <- readRDS('data/MCCs/Bl6N_WT.rds')
# Merge two genotypes
MCCs <- cbind(Bl6J_WT, Bl6N_WT)
set.seed(1000)

# Normalize counts with VST
cnts <- as(assay(MCCs, 'counts'), 'dgCMatrix')
colnames(cnts) <- MCCs$Barcode
rownames(cnts) <- rownames(MCCs)
MCCs_vst <- sctransform::vst(cnts, return_cell_attr = TRUE)
MCCs <- MCCs[rownames(MCCs_vst$y), ]
assay(MCCs, 'corrected_counts', withDimnames = FALSE) <- sctransform::correct(MCCs_vst)
assay(MCCs, 'logcounts', withDimnames = FALSE) <- log1p(assay(MCCs, 'corrected_counts'))
# Flag HVGs
MCCs_variance <- scran::modelGeneVar(MCCs)
HVGs <- scran::getTopHVGs(MCCs_variance, prop = 0.1)
rowData(MCCs)$HVG <- rownames(MCCs) %in% HVGs
# Embed in PCA
MCCs <- scran::denoisePCA(MCCs, technical = MCCs_variance, subset.row = HVGs, min.rank = 15)
MCCs <- scater::runUMAP(MCCs)
# Cluster cells
MCCs$cluster <- factor(igraph::cluster_louvain(scran::buildSNNGraph(MCCs, use.dimred = 'PCA'))$membership)
p <- cowplot::plot_grid(
    scater::plotReducedDim(MCCs, 'UMAP', colour_by = 'cluster', text_by = 'cluster') + ggtitle('MCCs'),
    scater::plotReducedDim(MCCs, 'UMAP', colour_by = 'batch', text_by = 'cluster') + ggtitle('Genotype')
)
ggsave('data/MCCs/WT-replicates-merged_UMAP.pdf', w = 10, h = 5)

Now let’s do it again, but correcting for genotype.

set.seed(1000)
mergedBatches <- batchelor::fastMNN(
    MCCs, 
    batch = MCCs$batch, 
    subset.row = HVGs, 
    BPPARAM = BiocParallel::MulticoreParam(workers = 12)
)
mergedBatches
rowData(mergedBatches)
MCCs
reducedDims(mergedBatches)
reducedDim(MCCs, 'corrected_PCA') <- reducedDim(mergedBatches, 'corrected')
MCCs$corrected_cluster <- factor(igraph::cluster_louvain(scran::buildSNNGraph(MCCs, use.dimred = 'corrected_PCA'))$membership)
set.seed(1000)
reducedDim(MCCs, 'corrected_UMAP') <- scater::calculateUMAP(t(reducedDim(MCCs, 'corrected_PCA')))
p <- cowplot::plot_grid(
    scater::plotReducedDim(MCCs, 'UMAP', colour_by = 'cluster', text_by = 'cluster') + ggtitle('MCCs'),
    scater::plotReducedDim(MCCs, 'UMAP', colour_by = 'batch', text_by = 'batch') + ggtitle('Genotype'),
    scater::plotReducedDim(MCCs, 'corrected_UMAP', colour_by = 'batch', text_by = 'batch') + ggtitle('Genotype'),
    scater::plotReducedDim(MCCs, 'corrected_UMAP', colour_by = 'corrected_cluster', text_by = 'corrected_cluster') + ggtitle('MCCs')
)
ggsave('data/MCCs/WT-replicates-corrected_UMAP.pdf', w = 10, h = 10)

2. Reading CcnoKO dataset in R

There is a scRNAseq dataset of Ccno KO cells trying to undergo in vitro differentiation.

CcnoKO <- readRDS('data/MCCs/CcnoKO.rds')
set.seed(1000)

# Normalize counts with VST
cnts <- as(assay(CcnoKO, 'counts'), 'dgCMatrix')
colnames(cnts) <- CcnoKO$Barcode
rownames(cnts) <- rownames(CcnoKO)
CcnoKO_vst <- sctransform::vst(cnts, return_cell_attr = TRUE)
CcnoKO <- CcnoKO[rownames(CcnoKO_vst$y), ]
assay(CcnoKO, 'corrected_counts', withDimnames = FALSE) <- sctransform::correct(CcnoKO_vst)
assay(CcnoKO, 'logcounts', withDimnames = FALSE) <- log1p(assay(CcnoKO, 'corrected_counts'))
# Flag HVGs
CcnoKO_variance <- scran::modelGeneVar(CcnoKO)
HVGs <- scran::getTopHVGs(CcnoKO_variance, prop = 0.1)
rowData(CcnoKO)$HVG <- rownames(CcnoKO) %in% HVGs
# Embed in PCA
CcnoKO <- scran::denoisePCA(CcnoKO, technical = CcnoKO_variance, subset.row = HVGs, min.rank = 50)
CcnoKO <- scater::runUMAP(CcnoKO)
# Cluster cells
CcnoKO$cluster <- factor(igraph::cluster_louvain(scran::buildSNNGraph(CcnoKO, use.dimred = 'PCA'))$membership)
p <- cowplot::plot_grid(
    scater::plotReducedDim(CcnoKO, 'UMAP', colour_by = 'cluster', text_by = 'cluster') + ggtitle('CcnoKO'),
    scater::plotReducedDim(CcnoKO, 'UMAP', colour_by = 'batch', text_by = 'cluster') + ggtitle('Genotype')
)
ggsave('data/MCCs/CcnoKO_UMAP.pdf', w = 10, h = 5)

3. Annotating CcnoKO dataset with WT MCCs dataset using `scmap`

We have high-quality annotations for MCCs dataset, but not for CcnoKO dataset. Can we transfer annotations from MCCs to CcnoKO?

# Prepare feature indices from MCCs
set.seed(1000)
rowData(MCCs)$feature_symbol <- rowData(MCCs)$Symbol
MCCs <- scmap::selectFeatures(MCCs, suppress_plot = TRUE)
MCCs <- scmap::indexCluster(MCCs, cluster_col = 'annotation')
metadata(MCCs)
head(metadata(MCCs)[['scmap_cluster_index']])

# Map clusters from MCCs onto CcnoKO
set.seed(1000)
rowData(CcnoKO)$feature_symbol <- rowData(CcnoKO)$Symbol
CcnoKO_scmap_clus <- scmap::scmapCluster(
    projection = CcnoKO, 
    index_list = list(yan = metadata(MCCs)$scmap_cluster_index)
)

# Get transferred annotations
CcnoKO$annotation_projected <- factor(CcnoKO_scmap_clus$combined_labs, levels = levels(MCCs$annotation))

# Plot reduced dims
p <- cowplot::plot_grid(
    scater::plotReducedDim(CcnoKO, 'UMAP', colour_by = 'batch', text_by = 'batch') + ggtitle('CcnoKO'),
    scater::plotReducedDim(CcnoKO, 'UMAP', colour_by = 'cluster', text_by = 'cluster') + ggtitle('Clusters'),
    scater::plotReducedDim(CcnoKO, 'UMAP', colour_by = 'annotation_projected', text_by = 'annotation_projected') + ggtitle('Transferred annotations'),
    scater::plotReducedDim(MCCs, 'UMAP', colour_by = 'annotation', text_by = 'annotation') + ggtitle('MCCs'),
    ncol = 2
)
ggsave('data/MCCs/CcnoKO-transferred-annotations_UMAP.pdf', w = 10, h = 10)

4. Mapping CcnoKO onto WT MCCs cells

Another way to visualize which WT cell types the CcnoKO cells spatially overlap with is to project CcnoKO cells onto MCC cells in UMAP embedding. This could be done manually, by using the rotation matrix obtained from MCCs embedding in PCA space to “learn” PCA embedding of the CcnoKO data, etc…, however this process is rather hazardous when fastMNN() is first used to correct for batch bias.

Luckily, this process is facilitated in Seurat, with the MapQuery() function. However, we do need to re-process most of the data in order to project CcnoKO cells onto WT MCC UMAP embedding.

# Re-process each dataset separately with Seurat
library(Seurat)
options(future.globals.maxSize= 891289600)
MCCs_seurat <- as.Seurat(MCCs) %>% 
    NormalizeData() %>%
    FindVariableFeatures(selection.method = "vst", nfeatures = 2000) %>% 
    ScaleData() %>% 
    RunPCA() %>% 
    RunUMAP(reduction = "pca", dims = 1:30, return.model = TRUE)
CcnoKO_seurat <- as.Seurat(CcnoKO) %>% 
    NormalizeData() %>%
    FindVariableFeatures(selection.method = "vst", nfeatures = 2000) %>% 
    ScaleData() %>% 
    RunPCA() %>% 
    RunUMAP(reduction = "pca", dims = 1:30, return.model = TRUE)

# Transfer anchors from MCCs to CcnoKO
anchors <- FindTransferAnchors(
    reference = MCCs_seurat,
    query = CcnoKO_seurat,
    reference.reduction = "pca"
)

# Project CcnoKO onto MCCs UMAP embedding
CcnoKO_seurat <- MapQuery(
    anchorset = anchors, 
    reference = MCCs_seurat, 
    query = CcnoKO_seurat, 
    reference.reduction = "pca", 
    reduction.model = "umap"
)

# Exporting back the learnt UMAP to CcnoKO
reducedDim(MCCs, 'learnt_UMAP') <- Embeddings(MCCs_seurat, reduction = "umap")
reducedDim(CcnoKO, 'learnt_UMAP') <- Embeddings(CcnoKO_seurat, reduction = "ref.umap")

# Plot new embeddings
p <- cowplot::plot_grid(
    scater::plotReducedDim(MCCs, 'UMAP', colour_by = 'annotation', text_by = 'annotation') + ggtitle('MCCs, original UMAP'),
    scater::plotReducedDim(MCCs, 'learnt_UMAP', colour_by = 'annotation', text_by = 'annotation') + ggtitle('MCCs, UMAP from Seurat'),
    scater::plotReducedDim(CcnoKO, 'UMAP', colour_by = 'annotation_projected', text_by = 'annotation_projected') + ggtitle('CcnoKO, original UMAP'),
    scater::plotReducedDim(CcnoKO, 'learnt_UMAP', colour_by = 'annotation_projected', text_by = 'annotation_projected') + ggtitle('CcnoKO, learnt UMAP'),
    ncol = 2
)
ggsave('data/MCCs/CcnoKO-projected_UMAP.pdf', w = 10, h = 10)

1. Integrating two replicates together

2. Reading CcnoKO dataset in R

3. Annotating CcnoKO dataset with WT MCCs dataset using scmap

4. Mapping CcnoKO onto WT MCCs cells

3. Annotating CcnoKO dataset with WT MCCs dataset using `scmap`