Here we will process one of the first samples generated by ATAC-seq, from doi: 10.1038/nmeth.2688. This is an ATAC-seq sample from GM12878 cells. Let’s first download the raw fastqs.
mkdir -p data/GM12878/fastq
wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR891/SRR891269/SRR891269_1.fastq.gz -O data/GM12878/fastq/GM12878_R1.fq.gz
wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR891/SRR891269/SRR891269_2.fastq.gz -O data/GM12878/fastq/GM12878_R2.fq.gzThe processing steps for ATAC-seq are:
cd data/GM12878
SAMPLE=GM12878
R1=fastq/GM12878_R1.fq.gz
R2=fastq/GM12878_R2.fq.gz
GENOME=~/genomes/hg38/hg38.fa
SAMFILE=GM12878.sam
FILTEREDBAM=GM12878_filtered.bam
TRACK=GM12878.bw
PEAKFOLDER=peaks/GM12878
GENOMESIZE=2600000000
CPU=2
## -- Trim reads and run QC
trim_galore \
--paired \
--fastqc \
--cores 1 \
"${R1}" "${R2}"
## -- Map reads
bowtie2 \
--threads "${CPU}" \
-x "${GENOME}" \
--maxins 2000 \
-1 `echo "${R1}" | sed 's,.fastq.gz,_val_1.fq.gz,'` \
-2 `echo "${R2}" | sed 's,.fastq.gz,_val_2.fq.gz,'` > "${SAMFILE}"
## -- Filter reads
samtools fixmate -@ "${CPU}" --output-fmt bam -m "${SAMFILE}" - | \
samtools sort -@ "${CPU}" --output-fmt bam -T "${SAMFILE}"_sorting - | \
samtools markdup -@ "${CPU}" --output-fmt bam -r -T "${SAMFILE}"_markdup - - | \
samtools view -@ "${CPU}" --output-fmt bam -f 2 -q 10 -1 -b - | \
samtools sort -@ "${CPU}" --output-fmt bam -l 9 -T "${SAMFILE}"_sorting2 -o "${FILTEREDBAM}"
samtools index -@ "${CPU}" "${FILTEREDBAM}"
## -- Generate track
bamCoverage \
--bam "${FILTEREDBAM}" \
--outFileName "${TRACK}" \
--binSize 1 \
--numberOfProcessors "${CPU}" \
--normalizeUsing CPM \
--skipNonCoveredRegions \
--extendReads \
--ignoreDuplicates
## -- Call peaks
macs2 callpeak \
-t "${FILTEREDBAM}" \
--format BAMPE \
--gsize "${GENOMESIZE}" \
--outdir "${PEAKFOLDER}" \
--name "${SAMPLE}"To check the quality of an ATAC-seq sample, one can check the following metrics:
library(AnnotationDbi)
library(tidyverse)
library(plyranges)
library(annotatr)
library(VplotR)
## -- Import genomic features
hg_genes <- AnnotationHub::AnnotationHub()[['AH92109']] %>%
filter(type == 'gene', gene_biotype == 'protein_coding') %>%
filter(!(grepl('GL|KI', seqnames)))
seqlevelsStyle(hg_genes) <- 'UCSC'
hg_TSSs <- hg_genes %>%
anchor_start() %>%
resize(width = 1) %>%
unanchor()
hg_promoters <- hg_TSSs %>%
flank_upstream(4000) %>%
shift_downstream(2000) %>%
mutate(ID = paste0('prom_', 1:n()))
hg_features <- build_annotations(
genome = 'hg38',
annotations = 'hg38_basicgenes'
)
## -- Import fragments and peaks as GRanges
fragments <- importPEBamFiles(
'data/GM12878/GM12878_filtered.bam',
shift_ATAC_fragments = TRUE
)
peaks <- import('data/GM12878/peaks/GM12878/GM12878_peaks.narrowPeak')
peak_summits <- import('data/GM12878/peaks/GM12878/GM12878_summits.bed')
## -- Check fragment size distribution
df<-...(fragments) %>%
group_by(...) %>%
count() %>%
ungroup()
p1<-ggplot(df, aes(x = ..., y = ...)) +
...() +
labs(title = '...', x = '...', y = '...')
## -- Check distance to TSS
df<-add_nearest_distance(fragments, hg_TSSs) %>%
as_tibble() %>%
mutate(distance = cut(distance, breaks = seq(0, 5e5, by = 10), include.lowest = TRUE) %>% as.numeric() %>% `*`(10)) %>%
group_by(distance) %>%
count() %>%
ungroup() %>%
mutate(cumsum = cumsum(n), pct = cumsum/max(cumsum))
p2<-ggplot(df, aes(x = distance, y = pct)) +
geom_line() +
theme_minimal() +
scale_x_log10() +
theme(legend.position = 'none') +
labs(title = 'Cumulative distribution of ATAC-seq fragments', x = 'Distance from TSS', y = 'Cum. %')
## -- Check Vplot
p3<-plotVmat(
fragments,
hg_TSSs,
normFun = 'zscore',
ylim = c(20, 800),
xlim = c(-500, 500)
)
## -- Compute promoter (TSS) Enrichment Score
df<-resize(fragments, width = 1, fix = 'center') %>%
join_overlap_left(hg_promoters) %>%
plyranges::select(ID) %>%
as_tibble() %>%
filter(!is.na(ID)) %>%
left_join(as_tibble(hg_promoters) %>% dplyr::select(start, end, ID), by = 'ID') %>%
mutate(
prom_mid = start.y + (end.y - start.y) / 2,
distance = start.x - prom_mid,
binned_distance = round(distance/100, 0)*100
) %>%
count(binned_distance) %>%
mutate(
bg = mean(.data$n[abs(.data$binned_distance) == 2000]),
enrich_score = n/bg
)
TSSES <- round(df[df$binned_distance == 0, 'enrich_score'], 2)
p4<-ggplot(df, aes(x = binned_distance, y = enrich_score)) +
geom_line() +
theme_minimal() +
theme(legend.position = 'none') +
labs(title = glue::glue('TSS enrichment score (Signal-to-noise ratio): {TSSES}'), x = 'Distance from TSS', y = 'Enrich. score')
## -- Check location of peaks vs genomic features
peaks_annotated <- annotate_regions(
peak_summits,
annotations = hg_features,
ignore.strand = TRUE
)
p5<-plot_annotation(
annotated_regions = peaks_annotated,
annotation_order = paste0('hg38_genes_', c('promoters', '1to5kb', '5UTRs', 'exons', 'introns', '3UTRs')),
plot_title = 'ATAC peaks',
x_label = 'Genomic features',
y_label = 'Count'
) + theme_minimal()
## -- Compute FRiP
df<-add_nearest_distance(fragments, peaks) %>%
as_tibble() %>%
mutate(isInPeak = distance == 0)
pct <- round(sum(df$isInPeak == 1) / nrow(df) * 100, 2)
p6<-ggplot(df, aes(y = isInPeak)) +
geom_bar() +
theme_minimal() +
theme(legend.position = 'none') +
labs(title = glue::glue('Fraction of reads in peaks (FRiP): {pct}'), x = '# of fragments', y = 'Fragment within peak')
## -- All plots together
p <- cowplot::plot_grid(
p1, p2, p3, p4, p5, p6,
nrow = 2
)
ggsave('ATACseq_QC.pdf', w = 15, h = 10)library(tidyverse)
library(plyranges)
fragments <- VplotR::importPEBamFiles(
'?...?',
shift_ATAC_fragments = TRUE
)
df<-?...?(fragments) %>%
group_by(?...?) %>%
count() %>%
ungroup()
p1<-ggplot(df, aes(x = ?...?, y = ?...?)) +
?...?() +
labs(title = '?...?', x = '?...?', y = '?...?')Single-cell ATAC-seq data processing can be performed just like bulk ATAC-seq, as long as the
sequencing strategy is similar (e.g. in ATAC-seq from individually sorted cells).
However, 10X genomics does not follow the regular sequencing strategy used for standard
Illumina libraries, as it needs to sequenced internal barcodes. 10X genomics has come up
with a toolkit to process 10X scATACseq data into a single bam file, to identify peaks of
chromatin accessibility in the entire set of single cells, and to count fragments overlapping
with each peak in each cell.
mkdir -p data/PBMCs_10X/
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_pbmc_1k_v1/atac_pbmc_1k_v1_fastqs.tar -O data/PBMCs_10X/atac_pbmc_1k_v1_fastqs.tar.gz
tar -xf data/PBMCs_10X/atac_pbmc_1k_v1_fastqs.tar.gz
mv atac_pbmc_1k_v1_fastqs data/PBMCs_10X/fastqsIt is (most likely) necessary to run cellranger-atac count workflow from an HPC node. Requirements are
very high (see here for more details: https://support.10xgenomics.com/single-cell-atac/software/overview/system-requirements)
and processing times can be quite long. It is a completely different workflow than the one used for 10X scRNAseq!!
CPU=48
MEM=256G
salloc \
-J cellranger_atac_count \
--cpus-per-task "${CPU}" \
--mem "${MEM}" Once the allocated node is ready:
CPU=48
MEM=256G
ID=PBMCs_10X
REF=~/genomes/refdata-cellranger-arc-hg19-2020-A-2.0.0/
FASTQS=data/PBMCs_10X/fastqs
module load cellranger-atac
cellranger-atac count \
--id "${ID}" \
--reference "${REF}" \
--fastqs "${FASTQS}" \
--localcores "${CPU}" \
--localmem "${MEM}"Or, to submit the job itself from main server:
CPU=48
MEM=256G
ID=PBMCs_10X
REF=~/genomes/refdata-cellranger-arc-hg19-2020-A-2.0.0/
FASTQS=data/PBMCs_10X/fastqs
sbatch \
-J cellranger_atac_count \
-o cellranger_atac_count.pbmc.out \
-e cellranger_atac_count.pbmc.err \
--cpus-per-task "${CPU}" \
--mem "${MEM}" \
--export=ALL \
bin/submit_cellranger-atac-count.shOnce the analysis is done, you can check the generated files and QC reports. See here for more details: https://support.10xgenomics.com/single-cell-atac/software/pipelines/latest/using/count/. Our QC report file is in:
data/PBMCs_10X/atac_pbmc_1k_v1_web_summary.htmlLoupe at relevant loci to propose an annotation for each cluster.The cloupe file is available from 10X genomics
here: https://cf.10xgenomics.com/samples/cell-atac/1.1.0/atac_pbmc_1k_v1/atac_pbmc_1k_v1_cloupe.cloupe:
Load it in Loupe browser and check coverage for different genes.
# Check e.g. `CD19`, `CD8`, `CD4`, `CD3`