Day 02

Hands-on session 3: analysing scATACseq with Signac

Here, we will focus on an example dataset generated by 10X Genomics: a set of ~5,000 cells from an mouse adult brain. Processed data (generated by cellranger-atac count) can be downladed from here:

mkdir -p data/MouseBrain
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_analysis.tar.gz -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_analysis.tar.gz
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_filtered_peak_bc_matrix.tar.gz -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_filtered_peak_bc_matrix.tar.gz
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_filtered_peak_bc_matrix.h5 -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_filtered_peak_bc_matrix.h5
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_filtered_tf_bc_matrix.tar.gz -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_filtered_tf_bc_matrix.tar.gz
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_filtered_tf_bc_matrix.h5 -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_filtered_tf_bc_matrix.h5
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_fragments.tsv.gz -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_fragments.tsv.gz
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_fragments.tsv.gz.tbi -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_fragments.tsv.gz.tbi
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_peak_annotation.tsv -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_peak_annotation.tsv
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_peaks.bed -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_peaks.bed
wget https://cg.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_possorted_bam.bam -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_possorted_bam.bam
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_possorted_bam.bam.bai -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_possorted_bam.bam.bai
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_raw_peak_bc_matrix.tar.gz -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_raw_peak_bc_matrix.tar.gz
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_raw_peak_bc_matrix.h5 -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_raw_peak_bc_matrix.h5
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_singlecell.csv -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_singlecell.csv
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_summary.csv -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_summary.csv
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_summary.json -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_summary.json
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_web_summary.html -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_web_summary.html
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_cloupe.cloupe -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_cloupe.cloupe

Importing data

scATACseq raw data processing can be very different from bulk ATAC-seq, and downstream analysis is also quite specific to single-cell assays. As of today, Signac is one of the few solutions developed to investigate scATACseq data in R, and is the most widely used approach. Signac is an “extension” of Seurat, which originally focused on single-cell RNA-seq data analysis. It uses the same type of object structure. A slight difference regarding the features is that they are actually chromatin accessible loci, rather than genes. Because peaks are frequently studied in relation to neighbouring genes, the Seurat object can conventiently store gene annotations (or any type of genomic annotations) in its annotation slot.

library(Signac)
library(Seurat)
library(tidyverse)
library(BiocParallel)
library(plyranges)

## -- Read brain ATAC-seq counts
brain_assay <- Signac::CreateChromatinAssay(
    counts = Seurat::Read10X_h5("data/MouseBrain/atac_v1_adult_brain_fresh_5k_filtered_peak_bc_matrix.h5"),
    sep = c(":", "-"),
    genome = "mm10",
    fragments = 'data/MouseBrain/atac_v1_adult_brain_fresh_5k_fragments.tsv.gz',
    min.cells = 1
)
class(brain_assay)

## -- Create Seurat object
metadata <- vroom::vroom(
    file = "data/MouseBrain/atac_v1_adult_brain_fresh_5k_singlecell.csv",
    col_names = TRUE
) %>% 
    filter(barcode != 'NO_BARCODE') %>% 
    mutate(cell = barcode) %>%
    column_to_rownames('barcode') %>% 
    as.data.frame()
brain <- Seurat::CreateSeuratObject(
    counts = brain_assay,
    assay = 'peaks',
    project = 'ATAC',
    meta.data = metadata
)

## -- Add genome annotations
Annotation(brain) <- AnnotationHub::AnnotationHub()[['AH49547']] %>% 
    filter(gene_type == 'protein_coding', type == 'gene') %>% 
    mutate(gene_biotype = gene_type)

Accessing data

Accessing the data stored in a Seurat object is (generally) relatively easy, using the getAssayData() function.

# Data slots available in the Seurat object
Assays(brain)
slots <- slotNames(brain@assays[[brain@active.assay]])

# Find peaks 
peaks <- GetAssayData(brain, slot = 'ranges')

# Find counts 
counts <- GetAssayData(brain, slot = "counts")
data <- GetAssayData(brain, slot = "data")

# Find annotations
annots <- GetAssayData(brain, slot = "annotation")

# Find seqinfo 
seqinfo <- GetAssayData(brain, slot = "seqinfo")

Annotate peaks

peaks <- granges(brain)
annots <- Annotation(brain) 
annotated_peaks <- join_nearest(peaks, annots) %>% 
    add_nearest_distance(annots) %>% 
    mutate(
        peak = paste(seqnames, start, end, sep = '-'),
        inGene = distance == 0
    )
table(annotated_peaks$inGene)

# - Distance to nearest promoter 
mm_promoters <- Annotation(brain) %>% 
    resize(1, fix = 'start') %>% 
    resize(4000, fix = 'center') %>% 
    mutate(prom_id = paste0('prom_', 1:n()))
add_nearest_distance(peaks, mm_promoters) %>% 
    as_tibble() %>% 
    pull(distance) %>% 
    quantile(probs = seq(0, 1, 0.1))

QCing data

QC of scATACseq data stored in Seurat is simplified, using Signac-provided functions. On top of these utilities, scATACseq fragments can always be imported (remember: they are stored in the fragments.tsv.gz generated by cellranger-atac count, and they also contain information re: the cell-of-origin, for each fragment), if one wants to manually check fragment distribution, size, …

## -- Nucleosome signal 
brain <- NucleosomeSignal(object = brain)
brain$nucleosome_group <- ifelse(brain$nucleosome_signal > 4, 'NS > 4', 'NS < 4')
range(brain$nucleosome_group)
table(brain$nucleosome_group)

## -- Frag. size distribution
frags <- vroom::vroom('/data/20210804_scATACseq-workshop/data/MouseBrain/atac_v1_adult_brain_fresh_5k_fragments.tsv.gz', col_names = FALSE) %>% 
    setNames(c('chr', 'start', 'stop', 'cell', 'cluster'))
frags_chr12 <- filter(frags, chr == 'chr12', start < 50000000, cluster %in% c(1:10))
x <- left_join(frags_chr12, dplyr::select(brain@meta.data, cell, nucleosome_group)) %>% 
    mutate(width = stop - start) 
p <- ggplot(x, aes(x = width)) + 
    geom_bar() + 
    facet_wrap(~nucleosome_group+cluster, scales = 'free', ncol = 10) + 
    xlim(c(0, 800))

## -- TSS enrichment
brain <- TSSEnrichment(brain, fast = TRUE)
brain$high.tss <- ifelse(brain$TSS.enrichment > 2, 'High', 'Low')

df <- frags_chr12 %>% 
    left_join(dplyr::select(brain@meta.data, cell, high.tss)) %>% # For each fragment, recover cell TSS enrich. status
    dplyr::rename(c('seqnames' = 'chr', 'end' = 'stop')) %>% 
    as_granges() %>% 
    join_overlap_left(plyranges::select(mm_promoters, prom_id)) %>% # Link each fragment to overlapping promoter
    as_tibble() %>% 
    drop_na(prom_id) %>% # Remove fragments which are not overlapping any promoter
    left_join(
        as_tibble(mm_promoters), by = c('prom_id')
    ) %>% 
    mutate(
        midprom = start.y + (end.y - start.y)/2, 
        distance = midprom - start.x
    ) %>%
    filter(high.tss == 'High') %>%
    count(distance)
p<- ggplot(df, aes(x = distance, y = n)) + 
    geom_col() + 
    geom_smooth(method = 'loess', span = 0.05) + 
    theme_minimal() + 
    theme(legend.position = 'none') + 
    labs(title = 'Fragment "cut" sites', x = 'Distance from TSS', y = '# of cut sites') + 
    xlim(c(-1000, 1000))

Subsetting data

Cell selection can be done based on some QC metrics (e.g. FRiP, FRiBl). Feature selection can be done based on a count threshold.

brain$FRiP <- brain$peak_region_fragments / brain$passed_filters * 100
brain$FRiBl <- brain$blacklist_region_fragments / brain$peak_region_fragments
# Keep cells with high number of fragments in peaks
brain <- subset(brain, peak_region_fragments > 3000) 
# Remove cells with number of fragments in peaks too high
brain <- subset(brain, peak_region_fragments < 100000) 
# Keep cells with high FRiP 
brain <- subset(brain, FRiP > 50) 
# Keep cells with low FRiBl
brain <- subset(brain, FRiBl < 0.05) 
# Remove cells with high nucleosome signal
brain <- subset(brain, nucleosome_signal < 4) 
# Keep cells with good TSSES
brain <- subset(brain, TSS.enrichment > 2) 
# Remove peaks with less than 10 counts overall
brain <- brain[rowSums(GetAssayData(brain, slot = "counts")) > 10, ]
# Remove peaks detected in les than 10 cells overall
brain <- brain[rowSums(GetAssayData(brain, slot = "counts") > 0) > 10, ]

Normalizing and reduce dimensionality data

Data normalization and dimensionality reduction is done differently in scATACseq, compared to scRNAseq. This is mostly due to the fact that scATACseq data is overly sparse, compared to (already quite sparse) scRNAseq data. For more info, read Signac pre-print: (Multimodal single-cell chromatin analysis with Signac)[https://www.biorxiv.org/content/10.1101/2020.11.09.373613v1.full] To alleviate normalization problems linked to massive sparsity, an alternative normalization approach based on a “term frequency-inverse document frequency” approach. TF-IDF transformation yields metrics reflecting how important a “word” is to a “document” in a “collection” (in this case, a “word” is an accessible chromatin locus, a “document” is a cell, and the “collection” is the entire set of cells).

Dimensions of the normalized data matrix generated by this transformation are then reduced using singular value decomposition (SVD).

# Normalize data
brain <- Signac::RunTFIDF(brain) 
# Find variable features
brain <- Signac::FindTopFeatures(brain, min.cutoff = 'q0') 
# Perform dimensionality reduction
brain <- Signac::RunSVD(object = brain)

Clustering

Now that cells are embedded in a lower dimensional space, they can be graph-clustered just like “regular” cells from scRNAseq.

# Find neighboring cells
brain <- FindNeighbors(object = brain, reduction = 'lsi', dims = 2:30)
# Cluster cells
brain <- FindClusters(object = brain, algorithm = 3, resolution = 1.2, verbose = FALSE)
# Embbed cells in lower dimensional space
brain <- RunUMAP(brain, reduction = 'lsi', dims = 2:30)
saveRDS(brain, 'data/MouseBrain/brain.rds')

DA with Seurat

Differential accessibility analysis of chromatin loci in scATACseq is a difficult exercise, which is still very experimental and rapidly-evolving. One “easy” way to perform DA analysis within a scATACseq workflow is to rely on the already-existing FindMarkers() function from Seurat. This approach has not been very thoroughly used yet, but allows quick-n-dirty exploratory analysis of the chromatin loci differently accessible between pairs of (goups of) clusters.

## - Find markers between 2 groups of clusters 
da_peaks <- Seurat::FindMarkers(
    object = brain,
    ident.1 = c(10, 16),
    ident.2 = c(4, 5),
    min.pct = 0.05
)

Exercise session 3

Inspect genes closest to the most DA peaks (in DA analysis between clusters 10+16 and 4+5). Propose a function for cells in these groups.

da_peaks <- readRDS('data/MouseBrain/da_peaks_10-16_vs_4-5.rds')
peaks <- rownames(da_peaks)[da_peaks$?...? <= 0.01 & da_peaks$?...? >= 1.5]
peaks <- GenomicRanges::GRanges(str_replace(peaks, '-', ':'))
Annotation(brain)[nearest(?...?, Annotation(?...?))]$?...?

Hands-on session 4: Peaks differential accessibility analysis

Processing raw ATAC data

In this section, we will rely on worm bulk ATACseq data obtained from isolated tissues, to study differential accessibility analysis.

## -- Download raw data 
mkdir data/ATAC_worm/fastq/
curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR105/079/SRR10564679/SRR10564679_1.fastq.gz -o data/ATAC_worm/fastq/SRR10564679_GSM4197239_Neurons_YA_ATAC_pe_rep1_Caenorhabditis_elegans_ATAC-seq_1.fastq.gz
curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR105/079/SRR10564679/SRR10564679_2.fastq.gz -o data/ATAC_worm/fastq/SRR10564679_GSM4197239_Neurons_YA_ATAC_pe_rep1_Caenorhabditis_elegans_ATAC-seq_2.fastq.gz
curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR105/086/SRR10564686/SRR10564686_1.fastq.gz -o data/ATAC_worm/fastq/SRR10564686_GSM4197244_Neurons_YA_ATAC_pe_rep2_Caenorhabditis_elegans_ATAC-seq_1.fastq.gz
curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR105/086/SRR10564686/SRR10564686_2.fastq.gz -o data/ATAC_worm/fastq/SRR10564686_GSM4197244_Neurons_YA_ATAC_pe_rep2_Caenorhabditis_elegans_ATAC-seq_2.fastq.gz

## -- Process data 
GENOME=~/genomes/ce11/ce11.fa
CPU=16 
SAMPLE=Neurons_rep1
R1=data/ATAC_worm/fastq/SRR10564679_GSM4197239_Neurons_YA_ATAC_pe_rep1_Caenorhabditis_elegans_ATAC-seq_1.fastq.gz
R2=data/ATAC_worm/fastq/SRR10564679_GSM4197239_Neurons_YA_ATAC_pe_rep1_Caenorhabditis_elegans_ATAC-seq_2.fastq.gz
bowtie2 --threads "${CPU}" -x "${GENOME}" --maxins 1000 -1 "${R1}" -2 "${R2}" | \
samtools fixmate -@ "${CPU}" --output-fmt bam -m - - | \
samtools sort -@ "${CPU}" --output-fmt bam -T data/ATAC_worm/"${SAMPLE}".sam_sorting - | \
samtools markdup -@ "${CPU}" --output-fmt bam -r -T data/ATAC_worm/"${SAMPLE}".sam_markdup - - | \
samtools view -@ "${CPU}" --output-fmt bam -f 2 -q 10 -1 -b - | \
samtools sort -@ "${CPU}" --output-fmt bam -l 9 -T data/ATAC_worm/"${SAMPLE}".sam_sorting2 -o data/ATAC_worm/"${SAMPLE}"_filtered.bam
samtools index -@ "${CPU}" data/ATAC_worm/"${SAMPLE}"_filtered.bam
SAMPLE=Neurons_rep2
R1=data/ATAC_worm/fastq/SRR10564686_GSM4197244_Neurons_YA_ATAC_pe_rep2_Caenorhabditis_elegans_ATAC-seq_1.fastq.gz
R2=data/ATAC_worm/fastq/SRR10564686_GSM4197244_Neurons_YA_ATAC_pe_rep2_Caenorhabditis_elegans_ATAC-seq_2.fastq.gz
bowtie2 --threads "${CPU}" -x "${GENOME}" --maxins 1000 -1 "${R1}" -2 "${R2}" | \
samtools fixmate -@ "${CPU}" --output-fmt bam -m - - | \
samtools sort -@ "${CPU}" --output-fmt bam -T data/ATAC_worm/"${SAMPLE}".sam_sorting - | \
samtools markdup -@ "${CPU}" --output-fmt bam -r -T data/ATAC_worm/"${SAMPLE}".sam_markdup - - | \
samtools view -@ "${CPU}" --output-fmt bam -f 2 -q 10 -1 -b - | \
samtools sort -@ "${CPU}" --output-fmt bam -l 9 -T data/ATAC_worm/"${SAMPLE}".sam_sorting2 -o data/ATAC_worm/"${SAMPLE}"_filtered.bam
samtools index -@ "${CPU}" data/ATAC_worm/"${SAMPLE}"_filtered.bam

# This was repeated for 4 other tissues

Defining a unified set of peaks from a bunch of ATAC-seq experiments

When dealing with bulk ATAC-seq samples (or “pseudo-bulk” samples from scATACseq), another approach is to cluster accessible chromatin loci based on their levels of accessibility in each sample. This can leverage a traditional DESeq2 workflow, but first, one needs to identify a set of chromatin loci overlapping all the accessible sites of each sample.

An approach to do this (if not using cellranger-atac count-identified peaks) can be to use yapc (yet another peak caller) command-line tool. yapc relies on the ATAC-seq coverage “shape” of duplicates to identify “concave” coverage regions (i.e. “peaks”). So we need bigwig tracks of each ATAC-seq experiment to map peaks with yapc.

## -- Downloading tracks 
mkdir data/ATAC_worm
wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE141nnn/GSE141210/suppl//GSE141210_RAW.tar?tool=geoquery -O data/ATAC_worm/ATACseq_tracks.tar.gz
tar -xvf data/ATAC_worm/ATACseq_tracks.tar.gz -C data/ATAC_worm/

## -- Find peaks with yapc
yapc data/ATAC_worm/yapc \
    Germline data/ATAC_worm/GSM4197238_Germline_YA_ATAC_pe_rep1.bw data/ATAC_worm/GSM4197243_Germline_YA_ATAC_pe_rep2.bw \
    Neurons data/ATAC_worm/GSM4197239_Neurons_YA_ATAC_pe_rep1.bw data/ATAC_worm/GSM4197244_Neurons_YA_ATAC_pe_rep2.bw \
    Muscle data/ATAC_worm/GSM4197240_Muscle_YA_ATAC_pe_rep1.bw data/ATAC_worm/GSM4197245_Muscle_YA_ATAC_pe_rep2.bw \
    Hypodermis data/ATAC_worm/GSM4197241_Hypodermis_YA_ATAC_pe_rep1.bw data/ATAC_worm/GSM4197246_Hypodermis_YA_ATAC_pe_rep2.bw \
    Intestine data/ATAC_worm/GSM4197242_Intestine_YA_ATAC_pe_rep1.bw data/ATAC_worm/GSM4197247_Intestine_YA_ATAC_pe_rep2.bw

## -- Further processing depending on what you intend to do 
# .....
# Details of how the identified peaks were further processed are available in Serizay et al., Genome Research 2020. 
# Eventually, ~15,000 tissue-specific promoters were identified in this study, and available from VplotR::ce11_proms

Counting reads over peaks

From the ~ 45,000 identified peaks with yapc, we annotated and filtered down ~ 17,000 promoters. We can compute the coverage of each promoter in each of the 10 ATAC-seq samples in R using featureCounts() function from R wrapper of subread.

files <- c(
    'data/ATAC_worm/Germline_rep1_filtered.bam',
    'data/ATAC_worm/Germline_rep2_filtered.bam',
    'data/ATAC_worm/Neurons_rep1_filtered.bam',
    'data/ATAC_worm/Neurons_rep2_filtered.bam',
    'data/ATAC_worm/Muscle_rep1_filtered.bam',
    'data/ATAC_worm/Muscle_rep2_filtered.bam',
    'data/ATAC_worm/Hypodermis_rep1_filtered.bam',
    'data/ATAC_worm/Hypodermis_rep2_filtered.bam',
    'data/ATAC_worm/Intestine_rep1_filtered.bam',
    'data/ATAC_worm/Intestine_rep2_filtered.bam'
)
gtf <- 'data/ATAC_worm/ce11_proms.gtf'
cnts <- Rsubread::featureCounts(
    files = files, 
    annot.ext = 'data/ATAC_worm/ce11_proms.gtf', 
    isGTFAnnotationFile = TRUE, 
    GTF.featureType = 'sequence_feature', 
    GTF.attrType = 'id', 
    isPairedEnd = TRUE, 
    nthreads = 16
)

Performing DA tests

DESeq2 time-tested workflow can be followed, starting from the counts obtained by featureCounts.

library(tidyverse)
library(SummarizedExperiment)
library(DESeq2)

data(ce11_proms, 'VplotR')
counts <- SummarizedExperiment(
    assays = list('counts' = cnts$counts), 
    rowRanges = ce11_proms,
    colData = S4Vectors::DataFrame(
        sample = cnts$targets, 
        tissue = factor(str_replace_all(cnts$targets, '_.*', ''), c('Germline', 'Neurons', 'Muscle', 'Hypodermis', 'Intestine')), 
        rep = str_replace_all(cnts$targets, '_filtered.bam|.*rep', '')
    )
)

dds <- DESeqDataSet(counts, design = ~ tissue)
dds <- DESeq(dds)

contrasts <- levels(counts$tissue) %>% 
    combn(2, simplify = FALSE) %>% 
    lapply(function(x) {c("tissue", x[2], x[1])})
contrasts_names <- sapply(contrasts, function(x) paste0(x[2], "_vs_", x[3]))
out.l2fc <- data.frame(matrix(nrow=length(ce11_proms), ncol=length(contrasts))) ; colnames(out.l2fc) <- paste0(contrasts_names, '.l2FC')
out.padj <- data.frame(matrix(nrow=length(ce11_proms), ncol=length(contrasts))) ; colnames(out.padj) <- paste0(contrasts_names, '.padj')
for (i in 1:length(contrasts)) {
    res <- results(dds, contrast = contrasts[[i]])
    metadata(res)$alpha <- 0.01
    out <- as.data.frame(res)
    out.l2fc[,i] <- out[,2]
    out.padj[,i] <- out[,6]
}

FPM <- fpm(dds, robust = FALSE)
colnames(FPM) <- paste0('FPM_', counts$sample) %>% str_replace_all('_filtered.bam', '')

results <- cbind(
    as.data.frame(ce11_proms),
    FPM,
    out.l2fc,
    out.padj
)
results$sign.l2FC <- apply(results[,grepl("l2FC", colnames(results))], 1, function(x) any(x < -log2(2) | x > log2(2)))
results$sign.padj <- apply(results[,grepl("padj", colnames(results))], 1, function(x) any(x < 0.01))
write.table(results, 'data/ATAC_worm/quantif.txt', quote = FALSE, col.names = TRUE, row.names = FALSE, sep = '\t')

rowData(dds) <- cbind(
    rowData(dds), 
    FPM,
    out.l2fc,
    out.padj
)
saveRDS(dds, 'data/ATAC_worm/quantif.rds')

Clustering peaks

Regularized log values can be averaged across replicates to quantify chromatin accessibility at each peak in each sample.

rlogs <- DESeq2::rlog(dds, blind = FALSE) %>% 
    assay() %>% 
    as_tibble() %>%
    setNames(paste0('rlog_', dds$sample) %>% str_replace_all('_filtered.bam', '')) %>% 
    mutate(id = paste0('prom_', 1:n())) %>% 
    pivot_longer(-id, names_to = 'class', values_to = 'rlog') %>% 
    separate(class, into = c(NA, 'tissue', 'rep'), sep = '_') %>% 
    group_by(id, tissue) %>% 
    summarize(rlog = mean(rlog)) %>% 
    pivot_wider(names_from = tissue, values_from = rlog) %>%
    ungroup() %>%
    filter(rowSums(select(., -id)) > 0)
saveRDS(rlogs, 'data/ATAC_worm/rlogs.rds')

Peaks can then be clustered based on these rlog values, using unsupervised or semi-supervised approaches, such as k-means or k-medoids (through the pam() function).

clusts <- cluster::pam(select(rlogs, -id), k = 9)
p <- pheatmap::pheatmap(
    select(rlogs, -id)[order(clusts$clustering), ], 
    cluster_rows = FALSE, 
    cluster_cols = FALSE, 
    breaks = seq(0, 12, length.out = 100), 
    colorRampPalette(c("#ffffff", "#ffffff", "#dda937", "#E31A1C", "#a51618"))(100)
)

Exercise session 4

Compare clustering to provided cell type activity for ce11 promoters. Comment.

dds <- readRDS('data/ATAC_worm/quantif.rds')
rlogs <- readRDS('?...?')
clusts <- cluster::pam(select(rlogs, -?...?), k = 9)
table(rowData(dds)$which.tissues[!rowData(dds)$allZero], clusts$?...?)