Here, we will focus on an example dataset generated by 10X Genomics: a set of ~5,000 cells from an mouse adult brain.
Processed data (generated by cellranger-atac count) can be downladed from here:
mkdir -p data/MouseBrain
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_analysis.tar.gz -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_analysis.tar.gz
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_filtered_peak_bc_matrix.tar.gz -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_filtered_peak_bc_matrix.tar.gz
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_filtered_peak_bc_matrix.h5 -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_filtered_peak_bc_matrix.h5
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_filtered_tf_bc_matrix.tar.gz -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_filtered_tf_bc_matrix.tar.gz
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_filtered_tf_bc_matrix.h5 -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_filtered_tf_bc_matrix.h5
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_fragments.tsv.gz -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_fragments.tsv.gz
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_fragments.tsv.gz.tbi -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_fragments.tsv.gz.tbi
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_peak_annotation.tsv -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_peak_annotation.tsv
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_peaks.bed -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_peaks.bed
wget https://cg.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_possorted_bam.bam -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_possorted_bam.bam
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_possorted_bam.bam.bai -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_possorted_bam.bam.bai
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_raw_peak_bc_matrix.tar.gz -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_raw_peak_bc_matrix.tar.gz
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_raw_peak_bc_matrix.h5 -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_raw_peak_bc_matrix.h5
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_singlecell.csv -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_singlecell.csv
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_summary.csv -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_summary.csv
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_summary.json -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_summary.json
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_web_summary.html -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_web_summary.html
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_cloupe.cloupe -O data/MouseBrain/atac_v1_adult_brain_fresh_5k_cloupe.cloupescATACseq raw data processing can be very different from bulk ATAC-seq, and downstream analysis is also quite specific to single-cell assays.
As of today, Signac is one of the few solutions developed to investigate scATACseq data in R, and is the most widely used approach. Signac is
an “extension” of Seurat, which originally focused on single-cell RNA-seq data analysis. It uses the same type of object structure.
A slight difference regarding the features is that they are actually chromatin accessible loci, rather than genes. Because peaks are frequently
studied in relation to neighbouring genes, the Seurat object can conventiently store gene annotations (or any type of genomic annotations)
in its annotation slot.
library(Signac)
library(Seurat)
library(tidyverse)
library(BiocParallel)
library(plyranges)
## -- Read brain ATAC-seq counts
brain_assay <- Signac::CreateChromatinAssay(
counts = Seurat::Read10X_h5("data/MouseBrain/atac_v1_adult_brain_fresh_5k_filtered_peak_bc_matrix.h5"),
sep = c(":", "-"),
genome = "mm10",
fragments = 'data/MouseBrain/atac_v1_adult_brain_fresh_5k_fragments.tsv.gz',
min.cells = 1
)
class(brain_assay)
## -- Create Seurat object
metadata <- vroom::vroom(
file = "data/MouseBrain/atac_v1_adult_brain_fresh_5k_singlecell.csv",
col_names = TRUE
) %>%
filter(barcode != 'NO_BARCODE') %>%
mutate(cell = barcode) %>%
column_to_rownames('barcode') %>%
as.data.frame()
brain <- Seurat::CreateSeuratObject(
counts = brain_assay,
assay = 'peaks',
project = 'ATAC',
meta.data = metadata
)
## -- Add genome annotations
Annotation(brain) <- AnnotationHub::AnnotationHub()[['AH49547']] %>%
filter(gene_type == 'protein_coding', type == 'gene') %>%
mutate(gene_biotype = gene_type)Accessing the data stored in a Seurat object is (generally) relatively easy, using the getAssayData() function.
# Data slots available in the Seurat object
Assays(brain)
slots <- slotNames(brain@assays[[brain@active.assay]])
# Find peaks
peaks <- GetAssayData(brain, slot = 'ranges')
# Find counts
counts <- GetAssayData(brain, slot = "counts")
data <- GetAssayData(brain, slot = "data")
# Find annotations
annots <- GetAssayData(brain, slot = "annotation")
# Find seqinfo
seqinfo <- GetAssayData(brain, slot = "seqinfo")peaks <- granges(brain)
annots <- Annotation(brain)
annotated_peaks <- join_nearest(peaks, annots) %>%
add_nearest_distance(annots) %>%
mutate(
peak = paste(seqnames, start, end, sep = '-'),
inGene = distance == 0
)
table(annotated_peaks$inGene)
# - Distance to nearest promoter
mm_promoters <- Annotation(brain) %>%
resize(1, fix = 'start') %>%
resize(4000, fix = 'center') %>%
mutate(prom_id = paste0('prom_', 1:n()))
add_nearest_distance(peaks, mm_promoters) %>%
as_tibble() %>%
pull(distance) %>%
quantile(probs = seq(0, 1, 0.1))QC of scATACseq data stored in Seurat is simplified, using Signac-provided functions. On top of these utilities,
scATACseq fragments can always be imported (remember: they are stored in the fragments.tsv.gz generated by
cellranger-atac count, and they also contain information re: the cell-of-origin, for each fragment),
if one wants to manually check fragment distribution, size, …
## -- Nucleosome signal
brain <- NucleosomeSignal(object = brain)
brain$nucleosome_group <- ifelse(brain$nucleosome_signal > 4, 'NS > 4', 'NS < 4')
range(brain$nucleosome_group)
table(brain$nucleosome_group)
## -- Frag. size distribution
frags <- vroom::vroom('/data/20210804_scATACseq-workshop/data/MouseBrain/atac_v1_adult_brain_fresh_5k_fragments.tsv.gz', col_names = FALSE) %>%
setNames(c('chr', 'start', 'stop', 'cell', 'cluster'))
frags_chr12 <- filter(frags, chr == 'chr12', start < 50000000, cluster %in% c(1:10))
x <- left_join(frags_chr12, dplyr::select(brain@meta.data, cell, nucleosome_group)) %>%
mutate(width = stop - start)
p <- ggplot(x, aes(x = width)) +
geom_bar() +
facet_wrap(~nucleosome_group+cluster, scales = 'free', ncol = 10) +
xlim(c(0, 800))
## -- TSS enrichment
brain <- TSSEnrichment(brain, fast = TRUE)
brain$high.tss <- ifelse(brain$TSS.enrichment > 2, 'High', 'Low')
df <- frags_chr12 %>%
left_join(dplyr::select(brain@meta.data, cell, high.tss)) %>% # For each fragment, recover cell TSS enrich. status
dplyr::rename(c('seqnames' = 'chr', 'end' = 'stop')) %>%
as_granges() %>%
join_overlap_left(plyranges::select(mm_promoters, prom_id)) %>% # Link each fragment to overlapping promoter
as_tibble() %>%
drop_na(prom_id) %>% # Remove fragments which are not overlapping any promoter
left_join(
as_tibble(mm_promoters), by = c('prom_id')
) %>%
mutate(
midprom = start.y + (end.y - start.y)/2,
distance = midprom - start.x
) %>%
filter(high.tss == 'High') %>%
count(distance)
p<- ggplot(df, aes(x = distance, y = n)) +
geom_col() +
geom_smooth(method = 'loess', span = 0.05) +
theme_minimal() +
theme(legend.position = 'none') +
labs(title = 'Fragment "cut" sites', x = 'Distance from TSS', y = '# of cut sites') +
xlim(c(-1000, 1000))Cell selection can be done based on some QC metrics (e.g. FRiP, FRiBl). Feature selection can be done based on a count threshold.
brain$FRiP <- brain$peak_region_fragments / brain$passed_filters * 100
brain$FRiBl <- brain$blacklist_region_fragments / brain$peak_region_fragments
# Keep cells with high number of fragments in peaks
brain <- subset(brain, peak_region_fragments > 3000)
# Remove cells with number of fragments in peaks too high
brain <- subset(brain, peak_region_fragments < 100000)
# Keep cells with high FRiP
brain <- subset(brain, FRiP > 50)
# Keep cells with low FRiBl
brain <- subset(brain, FRiBl < 0.05)
# Remove cells with high nucleosome signal
brain <- subset(brain, nucleosome_signal < 4)
# Keep cells with good TSSES
brain <- subset(brain, TSS.enrichment > 2)
# Remove peaks with less than 10 counts overall
brain <- brain[rowSums(GetAssayData(brain, slot = "counts")) > 10, ]
# Remove peaks detected in les than 10 cells overall
brain <- brain[rowSums(GetAssayData(brain, slot = "counts") > 0) > 10, ]Data normalization and dimensionality reduction is done differently in scATACseq, compared to scRNAseq. This is
mostly due to the fact that scATACseq data is overly sparse, compared to (already quite sparse) scRNAseq data.
For more info, read Signac pre-print: (Multimodal single-cell chromatin analysis with Signac)[https://www.biorxiv.org/content/10.1101/2020.11.09.373613v1.full]
To alleviate normalization problems linked to massive sparsity, an alternative normalization approach based on a “term frequency-inverse document frequency” approach.
TF-IDF transformation yields metrics reflecting how important a “word” is to a “document” in a “collection”
(in this case, a “word” is an accessible chromatin locus, a “document” is a cell, and the “collection” is the entire set of cells).
Dimensions of the normalized data matrix generated by this transformation are then reduced using singular value decomposition (SVD).
# Normalize data
brain <- Signac::RunTFIDF(brain)
# Find variable features
brain <- Signac::FindTopFeatures(brain, min.cutoff = 'q0')
# Perform dimensionality reduction
brain <- Signac::RunSVD(object = brain) Now that cells are embedded in a lower dimensional space, they can be graph-clustered just like “regular” cells from scRNAseq.
# Find neighboring cells
brain <- FindNeighbors(object = brain, reduction = 'lsi', dims = 2:30)
# Cluster cells
brain <- FindClusters(object = brain, algorithm = 3, resolution = 1.2, verbose = FALSE)
# Embbed cells in lower dimensional space
brain <- RunUMAP(brain, reduction = 'lsi', dims = 2:30)
saveRDS(brain, 'data/MouseBrain/brain.rds')Differential accessibility analysis of chromatin loci in scATACseq is a difficult exercise, which is still very experimental and rapidly-evolving.
One “easy” way to perform DA analysis within a scATACseq workflow is to rely on the already-existing FindMarkers() function from Seurat.
This approach has not been very thoroughly used yet, but allows quick-n-dirty exploratory analysis of the chromatin loci differently accessible
between pairs of (goups of) clusters.
## - Find markers between 2 groups of clusters
da_peaks <- Seurat::FindMarkers(
object = brain,
ident.1 = c(10, 16),
ident.2 = c(4, 5),
min.pct = 0.05
)da_peaks <- readRDS('data/MouseBrain/da_peaks_10-16_vs_4-5.rds')
peaks <- rownames(da_peaks)[da_peaks$?...? <= 0.01 & da_peaks$?...? >= 1.5]
peaks <- GenomicRanges::GRanges(str_replace(peaks, '-', ':'))
Annotation(brain)[nearest(?...?, Annotation(?...?))]$?...?In this section, we will rely on worm bulk ATACseq data obtained from isolated tissues, to study differential accessibility analysis.
## -- Download raw data
mkdir data/ATAC_worm/fastq/
curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR105/079/SRR10564679/SRR10564679_1.fastq.gz -o data/ATAC_worm/fastq/SRR10564679_GSM4197239_Neurons_YA_ATAC_pe_rep1_Caenorhabditis_elegans_ATAC-seq_1.fastq.gz
curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR105/079/SRR10564679/SRR10564679_2.fastq.gz -o data/ATAC_worm/fastq/SRR10564679_GSM4197239_Neurons_YA_ATAC_pe_rep1_Caenorhabditis_elegans_ATAC-seq_2.fastq.gz
curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR105/086/SRR10564686/SRR10564686_1.fastq.gz -o data/ATAC_worm/fastq/SRR10564686_GSM4197244_Neurons_YA_ATAC_pe_rep2_Caenorhabditis_elegans_ATAC-seq_1.fastq.gz
curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR105/086/SRR10564686/SRR10564686_2.fastq.gz -o data/ATAC_worm/fastq/SRR10564686_GSM4197244_Neurons_YA_ATAC_pe_rep2_Caenorhabditis_elegans_ATAC-seq_2.fastq.gz
## -- Process data
GENOME=~/genomes/ce11/ce11.fa
CPU=16
SAMPLE=Neurons_rep1
R1=data/ATAC_worm/fastq/SRR10564679_GSM4197239_Neurons_YA_ATAC_pe_rep1_Caenorhabditis_elegans_ATAC-seq_1.fastq.gz
R2=data/ATAC_worm/fastq/SRR10564679_GSM4197239_Neurons_YA_ATAC_pe_rep1_Caenorhabditis_elegans_ATAC-seq_2.fastq.gz
bowtie2 --threads "${CPU}" -x "${GENOME}" --maxins 1000 -1 "${R1}" -2 "${R2}" | \
samtools fixmate -@ "${CPU}" --output-fmt bam -m - - | \
samtools sort -@ "${CPU}" --output-fmt bam -T data/ATAC_worm/"${SAMPLE}".sam_sorting - | \
samtools markdup -@ "${CPU}" --output-fmt bam -r -T data/ATAC_worm/"${SAMPLE}".sam_markdup - - | \
samtools view -@ "${CPU}" --output-fmt bam -f 2 -q 10 -1 -b - | \
samtools sort -@ "${CPU}" --output-fmt bam -l 9 -T data/ATAC_worm/"${SAMPLE}".sam_sorting2 -o data/ATAC_worm/"${SAMPLE}"_filtered.bam
samtools index -@ "${CPU}" data/ATAC_worm/"${SAMPLE}"_filtered.bam
SAMPLE=Neurons_rep2
R1=data/ATAC_worm/fastq/SRR10564686_GSM4197244_Neurons_YA_ATAC_pe_rep2_Caenorhabditis_elegans_ATAC-seq_1.fastq.gz
R2=data/ATAC_worm/fastq/SRR10564686_GSM4197244_Neurons_YA_ATAC_pe_rep2_Caenorhabditis_elegans_ATAC-seq_2.fastq.gz
bowtie2 --threads "${CPU}" -x "${GENOME}" --maxins 1000 -1 "${R1}" -2 "${R2}" | \
samtools fixmate -@ "${CPU}" --output-fmt bam -m - - | \
samtools sort -@ "${CPU}" --output-fmt bam -T data/ATAC_worm/"${SAMPLE}".sam_sorting - | \
samtools markdup -@ "${CPU}" --output-fmt bam -r -T data/ATAC_worm/"${SAMPLE}".sam_markdup - - | \
samtools view -@ "${CPU}" --output-fmt bam -f 2 -q 10 -1 -b - | \
samtools sort -@ "${CPU}" --output-fmt bam -l 9 -T data/ATAC_worm/"${SAMPLE}".sam_sorting2 -o data/ATAC_worm/"${SAMPLE}"_filtered.bam
samtools index -@ "${CPU}" data/ATAC_worm/"${SAMPLE}"_filtered.bam
# This was repeated for 4 other tissuesWhen dealing with bulk ATAC-seq samples (or “pseudo-bulk” samples from scATACseq), another approach is to cluster accessible chromatin loci
based on their levels of accessibility in each sample. This can leverage a traditional DESeq2 workflow, but first, one needs to identify a
set of chromatin loci overlapping all the accessible sites of each sample.
An approach to do this (if not using cellranger-atac count-identified peaks) can be to use yapc (yet another peak caller) command-line tool.
yapc relies on the ATAC-seq coverage “shape” of duplicates to identify “concave” coverage regions (i.e. “peaks”).
So we need bigwig tracks of each ATAC-seq experiment to map peaks with yapc.
## -- Downloading tracks
mkdir data/ATAC_worm
wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE141nnn/GSE141210/suppl//GSE141210_RAW.tar?tool=geoquery -O data/ATAC_worm/ATACseq_tracks.tar.gz
tar -xvf data/ATAC_worm/ATACseq_tracks.tar.gz -C data/ATAC_worm/
## -- Find peaks with yapc
yapc data/ATAC_worm/yapc \
Germline data/ATAC_worm/GSM4197238_Germline_YA_ATAC_pe_rep1.bw data/ATAC_worm/GSM4197243_Germline_YA_ATAC_pe_rep2.bw \
Neurons data/ATAC_worm/GSM4197239_Neurons_YA_ATAC_pe_rep1.bw data/ATAC_worm/GSM4197244_Neurons_YA_ATAC_pe_rep2.bw \
Muscle data/ATAC_worm/GSM4197240_Muscle_YA_ATAC_pe_rep1.bw data/ATAC_worm/GSM4197245_Muscle_YA_ATAC_pe_rep2.bw \
Hypodermis data/ATAC_worm/GSM4197241_Hypodermis_YA_ATAC_pe_rep1.bw data/ATAC_worm/GSM4197246_Hypodermis_YA_ATAC_pe_rep2.bw \
Intestine data/ATAC_worm/GSM4197242_Intestine_YA_ATAC_pe_rep1.bw data/ATAC_worm/GSM4197247_Intestine_YA_ATAC_pe_rep2.bw
## -- Further processing depending on what you intend to do
# .....
# Details of how the identified peaks were further processed are available in Serizay et al., Genome Research 2020.
# Eventually, ~15,000 tissue-specific promoters were identified in this study, and available from VplotR::ce11_promsFrom the ~ 45,000 identified peaks with yapc, we annotated and filtered down ~ 17,000 promoters.
We can compute the coverage of each promoter in each of the 10 ATAC-seq samples in R using
featureCounts() function from R wrapper of subread.
files <- c(
'data/ATAC_worm/Germline_rep1_filtered.bam',
'data/ATAC_worm/Germline_rep2_filtered.bam',
'data/ATAC_worm/Neurons_rep1_filtered.bam',
'data/ATAC_worm/Neurons_rep2_filtered.bam',
'data/ATAC_worm/Muscle_rep1_filtered.bam',
'data/ATAC_worm/Muscle_rep2_filtered.bam',
'data/ATAC_worm/Hypodermis_rep1_filtered.bam',
'data/ATAC_worm/Hypodermis_rep2_filtered.bam',
'data/ATAC_worm/Intestine_rep1_filtered.bam',
'data/ATAC_worm/Intestine_rep2_filtered.bam'
)
gtf <- 'data/ATAC_worm/ce11_proms.gtf'
cnts <- Rsubread::featureCounts(
files = files,
annot.ext = 'data/ATAC_worm/ce11_proms.gtf',
isGTFAnnotationFile = TRUE,
GTF.featureType = 'sequence_feature',
GTF.attrType = 'id',
isPairedEnd = TRUE,
nthreads = 16
)DESeq2 time-tested workflow can be followed, starting from the counts obtained by featureCounts.
library(tidyverse)
library(SummarizedExperiment)
library(DESeq2)
data(ce11_proms, 'VplotR')
counts <- SummarizedExperiment(
assays = list('counts' = cnts$counts),
rowRanges = ce11_proms,
colData = S4Vectors::DataFrame(
sample = cnts$targets,
tissue = factor(str_replace_all(cnts$targets, '_.*', ''), c('Germline', 'Neurons', 'Muscle', 'Hypodermis', 'Intestine')),
rep = str_replace_all(cnts$targets, '_filtered.bam|.*rep', '')
)
)
dds <- DESeqDataSet(counts, design = ~ tissue)
dds <- DESeq(dds)
contrasts <- levels(counts$tissue) %>%
combn(2, simplify = FALSE) %>%
lapply(function(x) {c("tissue", x[2], x[1])})
contrasts_names <- sapply(contrasts, function(x) paste0(x[2], "_vs_", x[3]))
out.l2fc <- data.frame(matrix(nrow=length(ce11_proms), ncol=length(contrasts))) ; colnames(out.l2fc) <- paste0(contrasts_names, '.l2FC')
out.padj <- data.frame(matrix(nrow=length(ce11_proms), ncol=length(contrasts))) ; colnames(out.padj) <- paste0(contrasts_names, '.padj')
for (i in 1:length(contrasts)) {
res <- results(dds, contrast = contrasts[[i]])
metadata(res)$alpha <- 0.01
out <- as.data.frame(res)
out.l2fc[,i] <- out[,2]
out.padj[,i] <- out[,6]
}
FPM <- fpm(dds, robust = FALSE)
colnames(FPM) <- paste0('FPM_', counts$sample) %>% str_replace_all('_filtered.bam', '')
results <- cbind(
as.data.frame(ce11_proms),
FPM,
out.l2fc,
out.padj
)
results$sign.l2FC <- apply(results[,grepl("l2FC", colnames(results))], 1, function(x) any(x < -log2(2) | x > log2(2)))
results$sign.padj <- apply(results[,grepl("padj", colnames(results))], 1, function(x) any(x < 0.01))
write.table(results, 'data/ATAC_worm/quantif.txt', quote = FALSE, col.names = TRUE, row.names = FALSE, sep = '\t')
rowData(dds) <- cbind(
rowData(dds),
FPM,
out.l2fc,
out.padj
)
saveRDS(dds, 'data/ATAC_worm/quantif.rds')Regularized log values can be averaged across replicates to quantify chromatin accessibility at each peak in each sample.
rlogs <- DESeq2::rlog(dds, blind = FALSE) %>%
assay() %>%
as_tibble() %>%
setNames(paste0('rlog_', dds$sample) %>% str_replace_all('_filtered.bam', '')) %>%
mutate(id = paste0('prom_', 1:n())) %>%
pivot_longer(-id, names_to = 'class', values_to = 'rlog') %>%
separate(class, into = c(NA, 'tissue', 'rep'), sep = '_') %>%
group_by(id, tissue) %>%
summarize(rlog = mean(rlog)) %>%
pivot_wider(names_from = tissue, values_from = rlog) %>%
ungroup() %>%
filter(rowSums(select(., -id)) > 0)
saveRDS(rlogs, 'data/ATAC_worm/rlogs.rds')Peaks can then be clustered based on these rlog values, using unsupervised or semi-supervised approaches,
such as k-means or k-medoids (through the pam() function).
clusts <- cluster::pam(select(rlogs, -id), k = 9)
p <- pheatmap::pheatmap(
select(rlogs, -id)[order(clusts$clustering), ],
cluster_rows = FALSE,
cluster_cols = FALSE,
breaks = seq(0, 12, length.out = 100),
colorRampPalette(c("#ffffff", "#ffffff", "#dda937", "#E31A1C", "#a51618"))(100)
)dds <- readRDS('data/ATAC_worm/quantif.rds')
rlogs <- readRDS('?...?')
clusts <- cluster::pam(select(rlogs, -?...?), k = 9)
table(rowData(dds)$which.tissues[!rowData(dds)$allZero], clusts$?...?)