Day 01

Hands-on session 1: Processing bulk ATAC-seq

Download fastq

Here we will process one of the first samples generated by ATAC-seq, from doi: 10.1038/nmeth.2688. This is an ATAC-seq sample from GM12878 cells. Let’s first download the raw fastqs.

mkdir -p data/GM12878/fastq
wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR891/SRR891269/SRR891269_1.fastq.gz -O data/GM12878/fastq/GM12878_R1.fq.gz
wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR891/SRR891269/SRR891269_2.fastq.gz -O data/GM12878/fastq/GM12878_R2.fq.gz

Process GM12878 data

The processing steps for ATAC-seq are:

  1. Adapter trimming
  2. Read pairs mapping
  3. Read pairs filtering
  4. Track generation
  5. Peak calling
cd data/GM12878
SAMPLE=GM12878
R1=fastq/GM12878_R1.fq.gz
R2=fastq/GM12878_R2.fq.gz
GENOME=~/genomes/hg38/hg38.fa
SAMFILE=GM12878.sam
FILTEREDBAM=GM12878_filtered.bam
TRACK=GM12878.bw
PEAKFOLDER=peaks/GM12878
GENOMESIZE=2600000000
CPU=2

## -- Trim reads and run QC
trim_galore \
    --paired \
    --fastqc \
    --cores 1 \
    "${R1}" "${R2}"

## -- Map reads 
bowtie2 \
    --threads "${CPU}" \
    -x "${GENOME}" \
    --maxins 2000 \
    -1 `echo "${R1}" | sed 's,.fastq.gz,_val_1.fq.gz,'` \
    -2 `echo "${R2}" | sed 's,.fastq.gz,_val_2.fq.gz,'` > "${SAMFILE}"

## -- Filter reads 
samtools fixmate -@ "${CPU}" --output-fmt bam -m "${SAMFILE}" - | \
samtools sort -@ "${CPU}" --output-fmt bam -T "${SAMFILE}"_sorting - | \
samtools markdup -@ "${CPU}" --output-fmt bam -r -T "${SAMFILE}"_markdup - - | \
samtools view -@ "${CPU}" --output-fmt bam -f 2 -q 10 -1 -b - | \
samtools sort -@ "${CPU}" --output-fmt bam -l 9 -T "${SAMFILE}"_sorting2 -o "${FILTEREDBAM}"
samtools index -@ "${CPU}" "${FILTEREDBAM}"

## -- Generate track
bamCoverage \
    --bam "${FILTEREDBAM}" \
    --outFileName "${TRACK}" \
    --binSize 1 \
    --numberOfProcessors "${CPU}" \
    --normalizeUsing CPM \
    --skipNonCoveredRegions \
    --extendReads \
    --ignoreDuplicates

## -- Call peaks
macs2 callpeak \
    -t "${FILTEREDBAM}" \
    --format BAMPE \
    --gsize "${GENOMESIZE}" \
    --outdir "${PEAKFOLDER}" \
    --name "${SAMPLE}"

QC of GM12878 data

To check the quality of an ATAC-seq sample, one can check the following metrics:

  1. Fragment size distribution
  2. Distance from fragment to TSS
  3. 1+2 combined: Vplots
  4. TSS enrichment score (coverage enrichment at TSSs)
  5. Location of peaks vs. genomic features
  6. Fraction of fragments within peaks
library(AnnotationDbi)
library(tidyverse)
library(plyranges)
library(annotatr)
library(VplotR)

## -- Import genomic features
hg_genes <- AnnotationHub::AnnotationHub()[['AH92109']] %>% 
    filter(type == 'gene', gene_biotype == 'protein_coding') %>% 
    filter(!(grepl('GL|KI', seqnames)))
seqlevelsStyle(hg_genes) <- 'UCSC'
hg_TSSs <- hg_genes %>% 
    anchor_start() %>% 
    resize(width = 1) %>% 
    unanchor()
hg_promoters <- hg_TSSs %>%
    flank_upstream(4000) %>% 
    shift_downstream(2000) %>% 
    mutate(ID = paste0('prom_', 1:n()))
hg_features <- build_annotations(
    genome = 'hg38',
    annotations = 'hg38_basicgenes'
)

## -- Import fragments and peaks as GRanges
fragments <- importPEBamFiles(
    'data/GM12878/GM12878_filtered.bam', 
    shift_ATAC_fragments = TRUE
)
peaks <- import('data/GM12878/peaks/GM12878/GM12878_peaks.narrowPeak')
peak_summits <- import('data/GM12878/peaks/GM12878/GM12878_summits.bed')

## -- Check fragment size distribution
df<-...(fragments) %>% 
    group_by(...) %>% 
    count() %>% 
    ungroup() 
p1<-ggplot(df, aes(x = ..., y = ...)) + 
    ...() + 
    labs(title = '...', x = '...', y = '...')

## -- Check distance to TSS
df<-add_nearest_distance(fragments, hg_TSSs) %>%
    as_tibble() %>% 
    mutate(distance = cut(distance, breaks = seq(0, 5e5, by = 10), include.lowest = TRUE) %>% as.numeric() %>% `*`(10)) %>%
    group_by(distance) %>% 
    count() %>% 
    ungroup() %>% 
    mutate(cumsum = cumsum(n), pct = cumsum/max(cumsum))
p2<-ggplot(df, aes(x = distance, y = pct)) + 
    geom_line() + 
    theme_minimal() + 
    scale_x_log10() +
    theme(legend.position = 'none') + 
    labs(title = 'Cumulative distribution of ATAC-seq fragments', x = 'Distance from TSS', y = 'Cum. %')

## -- Check Vplot
p3<-plotVmat(
    fragments, 
    hg_TSSs, 
    normFun = 'zscore', 
    ylim = c(20, 800), 
    xlim = c(-500, 500)
)

## -- Compute promoter (TSS) Enrichment Score
df<-resize(fragments, width = 1, fix = 'center') %>% 
    join_overlap_left(hg_promoters) %>% 
    plyranges::select(ID) %>%
    as_tibble() %>% 
    filter(!is.na(ID)) %>%
    left_join(as_tibble(hg_promoters) %>% dplyr::select(start, end, ID), by = 'ID') %>% 
    mutate(
        prom_mid = start.y + (end.y - start.y) / 2, 
        distance = start.x - prom_mid, 
        binned_distance = round(distance/100, 0)*100
    ) %>%
    count(binned_distance) %>%
    mutate(
        bg = mean(.data$n[abs(.data$binned_distance) == 2000]), 
        enrich_score = n/bg
    )
TSSES <- round(df[df$binned_distance == 0, 'enrich_score'], 2)
p4<-ggplot(df, aes(x = binned_distance, y = enrich_score)) + 
    geom_line() + 
    theme_minimal() + 
    theme(legend.position = 'none') + 
    labs(title = glue::glue('TSS enrichment score (Signal-to-noise ratio): {TSSES}'), x = 'Distance from TSS', y = 'Enrich. score')

## -- Check location of peaks vs genomic features
peaks_annotated <- annotate_regions(
    peak_summits, 
    annotations = hg_features,
    ignore.strand = TRUE
)
p5<-plot_annotation(
    annotated_regions = peaks_annotated,
    annotation_order = paste0('hg38_genes_', c('promoters', '1to5kb', '5UTRs', 'exons', 'introns', '3UTRs')),
    plot_title = 'ATAC peaks',
    x_label = 'Genomic features',
    y_label = 'Count'
) + theme_minimal()

## -- Compute FRiP 
df<-add_nearest_distance(fragments, peaks) %>%
    as_tibble() %>% 
    mutate(isInPeak = distance == 0)
pct <- round(sum(df$isInPeak == 1) / nrow(df) * 100, 2)
p6<-ggplot(df, aes(y = isInPeak)) + 
    geom_bar() + 
    theme_minimal() + 
    theme(legend.position = 'none') + 
    labs(title = glue::glue('Fraction of reads in peaks (FRiP): {pct}'), x = '# of fragments', y = 'Fragment within peak')

## -- All plots together
p <- cowplot::plot_grid(
    p1, p2, p3, p4, p5, p6, 
    nrow = 2
)
ggsave('ATACseq_QC.pdf', w = 15, h = 10)

Exercise session 1

  • Check fragment size distribution in bulk ATAC-seq
library(tidyverse)
library(plyranges)
fragments <- VplotR::importPEBamFiles(
    '?...?', 
    shift_ATAC_fragments = TRUE
)
df<-?...?(fragments) %>% 
    group_by(?...?) %>% 
    count() %>% 
    ungroup() 
p1<-ggplot(df, aes(x = ?...?, y = ?...?)) + 
    ?...?() + 
    labs(title = '?...?', x = '?...?', y = '?...?')

Hands-on session 2: Processing single-cell ATAC-seq

Single-cell ATAC-seq data processing can be performed just like bulk ATAC-seq, as long as the sequencing strategy is similar (e.g. in ATAC-seq from individually sorted cells). However, 10X genomics does not follow the regular sequencing strategy used for standard Illumina libraries, as it needs to sequenced internal barcodes. 10X genomics has come up with a toolkit to process 10X scATACseq data into a single bam file, to identify peaks of chromatin accessibility in the entire set of single cells, and to count fragments overlapping with each peak in each cell.

Download fastq of 10X-provided ~ 1,000 human PBMCs

mkdir -p data/PBMCs_10X/
wget https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_pbmc_1k_v1/atac_pbmc_1k_v1_fastqs.tar -O data/PBMCs_10X/atac_pbmc_1k_v1_fastqs.tar.gz
tar -xf data/PBMCs_10X/atac_pbmc_1k_v1_fastqs.tar.gz
mv atac_pbmc_1k_v1_fastqs data/PBMCs_10X/fastqs

Perform cellranger-atac count job on a HPC (Slurm)

It is (most likely) necessary to run cellranger-atac count workflow from an HPC node. Requirements are very high (see here for more details: https://support.10xgenomics.com/single-cell-atac/software/overview/system-requirements) and processing times can be quite long. It is a completely different workflow than the one used for 10X scRNAseq!!

CPU=48
MEM=256G
salloc \
    -J cellranger_atac_count \
    --cpus-per-task "${CPU}" \
    --mem "${MEM}" 

Once the allocated node is ready:

CPU=48
MEM=256G
ID=PBMCs_10X
REF=~/genomes/refdata-cellranger-arc-hg19-2020-A-2.0.0/
FASTQS=data/PBMCs_10X/fastqs
module load cellranger-atac
cellranger-atac count \
    --id "${ID}" \
    --reference "${REF}" \
    --fastqs "${FASTQS}" \
    --localcores "${CPU}" \
    --localmem "${MEM}"

Or, to submit the job itself from main server:

CPU=48
MEM=256G
ID=PBMCs_10X
REF=~/genomes/refdata-cellranger-arc-hg19-2020-A-2.0.0/
FASTQS=data/PBMCs_10X/fastqs
sbatch \
    -J cellranger_atac_count \
    -o cellranger_atac_count.pbmc.out \
    -e cellranger_atac_count.pbmc.err \
    --cpus-per-task "${CPU}" \
    --mem "${MEM}" \
    --export=ALL \
    bin/submit_cellranger-atac-count.sh

Once the analysis is done, you can check the generated files and QC reports. See here for more details: https://support.10xgenomics.com/single-cell-atac/software/pipelines/latest/using/count/. Our QC report file is in:

data/PBMCs_10X/atac_pbmc_1k_v1_web_summary.html

Exercise session 2

  • Check coverage of each cluster in Loupe at relevant loci to propose an annotation for each cluster.

The cloupe file is available from 10X genomics here: https://cf.10xgenomics.com/samples/cell-atac/1.1.0/atac_pbmc_1k_v1/atac_pbmc_1k_v1_cloupe.cloupe:

Load it in Loupe browser and check coverage for different genes.

# Check e.g. `CD19`, `CD8`, `CD4`, `CD3`