21  Lab 5 - Multi-omics data integration

Aims
  • Visually inspect results from MNase-seq, Scc1 ChIP-seq and RNA-seq in yeast
  • Plot aggregated profiles of stranded RNA-seq coverage @ Scc1 ChIP-seq peaks
  • Compare stranded RNA-seq coverages at strong/weak Scc1 peaks
Datasets

21.1 Inspecting data in IGV

  • Open .bw files and the Scc1 ChIP-seq peak file in IGV.
  • Compare the signal of stranded RNA-seq coverage tracks with the location of Scc1 peaks. Comment.
  • Compare the location of transcription start sites (TSSs) with nucleosome profiles. Comment.

21.2 Plotting stranded RNA-seq signal over Scc1 ChIP-seq peaks

  • Load forward and reverse RNA-seq coverage as Rle in R
  • Load Scc1 peaks in R
R
library(tidyverse)
library(GenomicRanges)
library(rtracklayer)
rna_fwd_track <- import('data/day5/RNAseq.fwd.bw', as = 'Rle')
rna_rev_track <- import('data/day5/RNAseq.rev.bw', as = 'Rle')
scc1_peaks <- import('data/day5/Scc1.narrowPeak')
  • Resize Scc1 peaks so that they all are centered over their summit (check peak column from the narrowPeak file), and extend then ± 4000 bp
R
library(plyranges)
scc1_peaks <- scc1_peaks |> 
    resize(fix = 'start', width = 1) |> 
    shift_right(scc1_peaks$peak) |> 
    resize(fix = 'center', width = 8000)
  • Compute a seqinfo from the rna_fwd_track Rle, add seqlengths and convert it into a GRanges object.
  • Remove the extended Scc1 peaks that lie outside of genome boundaries. To do this, check the operators %over% and %within%.
R
genome <- seqinfo(rna_fwd_track)
seqlengths(genome) <- lengths(rna_fwd_track)
genome <- as(genome, "GRanges")
table(scc1_peaks %over% genome)
table(scc1_peaks %within% genome)
scc1_peaks <- scc1_peaks[scc1_peaks %within% genome]
  • Extract the forward RNA-seq coverage signal over the first Scc1 peak
  • Convert it into a tibble and add coordinates (± 4000 bp centered over Scc1 peak)
  • Do the same for reverse RNA-seq coverage
  • Plot the two stranded RNA-seq coverages together
R
rna_fwd_track[scc1_peaks][[1]]
df <- rna_fwd_track[scc1_peaks][[1]] |> 
    as_tibble() |>
    mutate(position = seq(-3999, 4000, 1))
df$forward <- df$value
df$reverse <- rna_rev_track[scc1_peaks][[1]] |> as.vector()
df$value <- NULL
p <- pivot_longer(df, -pos, names_to = 'strandness', values_to = 'coverage') |>
    ggplot(aes(x = pos, y = coverage, col = strandness)) + 
    geom_line()
  • Iterate over all Scc1 peaks to extract stranded RNA-seq coverage around Scc1 peaks
R
df <- rna_fwd_track[scc1_peaks] |>
    as_tibble() |> ## Convert into a data.frame
    mutate(position = rep(seq(-3999, 4000, 1), length(scc1_peaks))) |> ## Repeat the position (± 4000bp) as many times as the number of Scc1 peaks
    mutate(forward = value) |> 
    select(group, position, forward) |> ## Tidy up columns
    mutate(reverse = rna_rev_track[scc1_peaks] |> unlist() |> as.vector()) |> ## Add reverse scores
    pivot_longer(-c(group, position), names_to = 'strandness', values_to = 'coverage') |>
    group_by(position, strandness) 
  • Calculate mean, standard deviation and 95% confidence interval of forward and reverse RNA-seq coverage around Scc1 peaks
  • Plot the average forward and reverse RNA-seq coverages, with a ribbon showing the 95% confidence interval.
R
aggr_df <- summarize(df, 
    mean = mean(coverage), 
    sd = sd(coverage, na.rm = TRUE), 
    n = dplyr::n()
) |>
    mutate(
        se = sd / sqrt(n),
        CI = qt(0.975, n-1) * se,
        topMargin = mean + CI,
        bottomMargin = mean - CI
    )
p <- ggplot(aggr_df, aes(x = position, y = mean, col = strandness, fill = strandness)) + 
    geom_line() + 
    geom_ribbon(aes(ymax = mean + CI, ymin = mean - CI), alpha = 0.2, col = NA)

21.3 Plot a heaetmap of RNA-seq coverage over ordered Scc1 peaks

  • Order peaks according to their Scc1 peak signal
  • Re-compute the stranded RNA-seq coverages around ordered Scc1 peaks
  • Plot a (rasterized) heatmap using geom_tile(), with the distance from Scc1 peak in abscisse and each row representing an individual Scc1 peak locus. You can split forward and reverse coverage with facet_wrap(~ strandness).
R
ordered_scc1_peaks <- scc1_peaks[order(scc1_peaks$signalValue, decreasing = TRUE)]
df <- rna_fwd_track[ordered_scc1_peaks] |>
    as_tibble() |> ## Convert into a data.frame
    mutate(position = rep(seq(-3999, 4000, 1), length(ordered_scc1_peaks))) |> ## Repeat the position (± 4000bp) as many times as the number of Scc1 peaks
    mutate(forward = value) |> 
    dplyr::select(group, position, forward) |> ## Tidy up columns
    mutate(reverse = rna_rev_track[ordered_scc1_peaks] |> unlist() |> as.vector()) |> ## Add reverse scores
    pivot_longer(-c(group, position), names_to = 'strandness', values_to = 'coverage') |>
    group_by(position, strandness) 
p <- ggplot(df, aes(x = position, y = group, fill = log2(coverage))) + 
    geom_tile() |> ggrastr::rasterise() + 
    facet_wrap(~ strandness, scales = 'free') +
    scale_y_reverse() +
    scale_fill_gradientn(
        colours = c("#FFFEF9", "#FFCA67", "#ED3024", "#1A0A10"), 
        na.value = 'white'
    )

21.4 Plot stranded RNA-seq coverages for Scc1 peaks grouped by their strength

  • Group the coverage tibble in 4 groups, containing the 0-25% (weakest) peaks, then 25-50%, 50-75% and 75-100% (strongest) peaks. This can be done with the ntile function.
  • Re-plot the average forward and reverse RNA-seq coverages, with a ribbon showing the 95% confidence interval, splitting the peaks from each group in a different facet (with facet_wrap(~ group)).
R
df$signalValue <- ordered_scc1_peaks$signalValue[df$group]
df$group <- ntile(df$signalValue, 4)
aggr_df <- group_by(df, group, position, strandness) |>
    summarize( 
        mean = mean(coverage), 
        sd = sd(coverage, na.rm = TRUE), 
        n = dplyr::n()
    ) |>
    mutate(
        se = sd / sqrt(n),
        CI = qt(0.975, n-1) * se,
        topMargin = mean + CI,
        bottomMargin = mean - CI
    )
p <- ggplot(aggr_df, aes(x = position, y = mean, col = strandness, fill = strandness)) + 
    geom_line() + 
    geom_ribbon(aes(ymax = mean + CI, ymin = mean - CI), alpha = 0.2, col = NA) +
    facet_wrap(~ group)