20  Demo 5 - Leveraging Bioconductor resources

Aims
  • Recovering chromatin states from the AnnotationHub
  • Intersecting GRanges
  • Recovering genes from genomic loci
  • Performing GO analysis
Datasets

All the data will be recovered from Bioconductor!

20.1 Recover HMM-based chromatin state annotations in two human cell lines

Leveraging AnnotationHub, I can recover chromatin state annotations from the Epigenome Roadmap project (published here) for two different cell lines.

library(tidyverse)
library(AnnotationHub)
ah <- AnnotationHub()
ah_human_epigenome <- query(ah, c("Homo sapiens", "EpigenomeRoadMap"))
epigenome_metadata <- query(ah_human_epigenome, 'data.frame')[[1]]
epigenome_metadata[1:10, 1:10]
query(ah_human_epigenome, c("chromhmmSegmentations", "cell line"))
H1_states <- query(ah_human_epigenome, c("chromhmmSegmentations", "ESC.H1"))[[1]]
NPC_states <- query(ah_human_epigenome, c("chromhmmSegmentations", "ESDR.H1.NEUR.PROG"))[[1]]
H1_states
NPC_states

20.2 Compare state annotations in two human cell lines

I can then compare how these two sets of annotations vary, to infer how chromatin states change througout a differentiation process.

library(plyranges)
states <- c(
    "Active TSS", 
    "Flanking Active TSS", 
    "Bivalent/Poised TSS", 
    "Enhancers",
    "Flanking Bivalent TSS/Enh", 
    "Strong transcription",
    "Transcr. at gene 5' and 3'", 
    "Weak transcription",
    "Bivalent Enhancer", 
    "Genic enhancers",
    "Quiescent/Low", 
    "Heterochromatin", 
    "Weak Repressed PolyComb", 
    "Repressed PolyComb", 
    "ZNF genes & repeats"
)
H1_states$H1_state <- factor(H1_states$name, levels = states)
H1_states$NPC_state <- join_nearest(H1_states, NPC_states)$name.y |> factor(levels = states)
df <- tibble(
    stateInH1 = H1_states$H1_state, 
    stateInNPCs = H1_states$NPC_state  
) |> 
    group_by(stateInH1, stateInNPCs) |> 
    count() |> 
    group_by(stateInH1) |> 
    mutate(
        total = sum(n), 
        pct = n/total
    )
p <- ggplot(df, aes(x = stateInH1, y = stateInNPCs, fill = pct)) + 
    geom_tile(col = "black") + 
    theme_bw() + 
    scale_fill_gradientn(colours = c('white', 'orange', 'darkred')) + 
    labs(
        x = "Chromatin states in H1 cells", 
        y = "Chromatin states in H1-derived\nneural progenitor cells", 
        fill = '# of genomic bins'
    ) + 
    guides(x =  guide_axis(angle = 90)) +
    coord_fixed()
transitioning_loci_1 <- H1_states[H1_states$H1_state == 'Bivalent/Poised TSS' & H1_states$NPC_state == 'Active TSS']
transitioning_loci_2 <- H1_states[H1_states$H1_state == 'Bivalent/Poised TSS' & H1_states$NPC_state == 'Repressed PolyComb']

20.3 Recover genes associated with varying chromatin states

query(ah, c("Homo sapiens", "release-75"))
human_gtf <- ah[['AH10684']]
human_Ensembl_genes <- human_gtf[human_gtf$type == 'gene' & human_gtf$gene_biotype == 'protein_coding']
human_Ensembl_TSSs <- resize(human_Ensembl_genes, 1, 'start')
human_Ensembl_TSSs
seqlevelsStyle(human_Ensembl_TSSs) <- seqlevelsStyle(H1_states)
#
seqlevels(transitioning_loci_1, pruning.mode = 'coarse') <- seqlevelsInUse(transitioning_loci_1)
seqlevels(human_Ensembl_TSSs, pruning.mode = 'coarse') <- seqlevels(transitioning_loci_1)
transitioning_loci_1 <- join_nearest(transitioning_loci_1, human_Ensembl_TSSs, distance = TRUE)
transitioning_loci_2 <- join_nearest(transitioning_loci_2, human_Ensembl_TSSs, distance = TRUE)

20.4 Enriched gene ontology terms for genes associated with varying chromatin states

genes <- unique(transitioning_loci_1$gene_name[transitioning_loci_1$distance <= 500]) 
res <- gprofiler2::gost(genes, organism = 'hsapiens')
as_tibble(res$result) |>   
    select(source, term_name, p_value, precision, recall) |>
    filter(source == "GO:BP",recall >= 0.05) |> 
    arrange(p_value) |> 
    print(n = 50)
genes <- unique(transitioning_loci_2$gene_name[transitioning_loci_2$distance <= 500]) 
res <- gprofiler2::gost(genes, organism = 'hsapiens')
as_tibble(res$result) |>   
    select(source, term_name, p_value, precision, recall) |>
    filter(source == "GO:BP",recall >= 0.05) |> 
    arrange(p_value) |> 
    print(n = 50)