20 Demo 5 - Leveraging Bioconductor resources

Aims

Recovering chromatin states from the AnnotationHub
Intersecting GRanges
Recovering genes from genomic loci
Performing GO analysis

Datasets

All the data will be recovered from Bioconductor!

20.1 Recover HMM-based chromatin state annotations in two human cell lines

Leveraging AnnotationHub, I can recover chromatin state annotations from the Epigenome Roadmap project (published here) for two different cell lines.

library(tidyverse)
library(AnnotationHub)
ah <- AnnotationHub()
ah_human_epigenome <- query(ah, c("Homo sapiens", "EpigenomeRoadMap"))
epigenome_metadata <- query(ah_human_epigenome, 'data.frame')[[1]]
epigenome_metadata[1:10, 1:10]
query(ah_human_epigenome, c("chromhmmSegmentations", "cell line"))
H1_states <- query(ah_human_epigenome, c("chromhmmSegmentations", "ESC.H1"))[[1]]
NPC_states <- query(ah_human_epigenome, c("chromhmmSegmentations", "ESDR.H1.NEUR.PROG"))[[1]]
H1_states
NPC_states

20.2 Compare state annotations in two human cell lines

I can then compare how these two sets of annotations vary, to infer how chromatin states change througout a differentiation process.

library(plyranges)
states <- c(
    "Active TSS", 
    "Flanking Active TSS", 
    "Bivalent/Poised TSS", 
    "Enhancers",
    "Flanking Bivalent TSS/Enh", 
    "Strong transcription",
    "Transcr. at gene 5' and 3'", 
    "Weak transcription",
    "Bivalent Enhancer", 
    "Genic enhancers",
    "Quiescent/Low", 
    "Heterochromatin", 
    "Weak Repressed PolyComb", 
    "Repressed PolyComb", 
    "ZNF genes & repeats"
)
H1_states$H1_state <- factor(H1_states$name, levels = states)
H1_states$NPC_state <- join_nearest(H1_states, NPC_states)$name.y |> factor(levels = states)
df <- tibble(
    stateInH1 = H1_states$H1_state, 
    stateInNPCs = H1_states$NPC_state  
) |> 
    group_by(stateInH1, stateInNPCs) |> 
    count() |> 
    group_by(stateInH1) |> 
    mutate(
        total = sum(n), 
        pct = n/total
    )
p <- ggplot(df, aes(x = stateInH1, y = stateInNPCs, fill = pct)) + 
    geom_tile(col = "black") + 
    theme_bw() + 
    scale_fill_gradientn(colours = c('white', 'orange', 'darkred')) + 
    labs(
        x = "Chromatin states in H1 cells", 
        y = "Chromatin states in H1-derived\nneural progenitor cells", 
        fill = '# of genomic bins'
    ) + 
    guides(x =  guide_axis(angle = 90)) +
    coord_fixed()

transitioning_loci_1 <- H1_states[H1_states$H1_state == 'Bivalent/Poised TSS' & H1_states$NPC_state == 'Active TSS']
transitioning_loci_2 <- H1_states[H1_states$H1_state == 'Bivalent/Poised TSS' & H1_states$NPC_state == 'Repressed PolyComb']

20.3 Recover genes associated with varying chromatin states

query(ah, c("Homo sapiens", "release-75"))
human_gtf <- ah[['AH10684']]
human_Ensembl_genes <- human_gtf[human_gtf$type == 'gene' & human_gtf$gene_biotype == 'protein_coding']
human_Ensembl_TSSs <- resize(human_Ensembl_genes, 1, 'start')
human_Ensembl_TSSs
seqlevelsStyle(human_Ensembl_TSSs) <- seqlevelsStyle(H1_states)
#
seqlevels(transitioning_loci_1, pruning.mode = 'coarse') <- seqlevelsInUse(transitioning_loci_1)
seqlevels(human_Ensembl_TSSs, pruning.mode = 'coarse') <- seqlevels(transitioning_loci_1)
transitioning_loci_1 <- join_nearest(transitioning_loci_1, human_Ensembl_TSSs, distance = TRUE)
transitioning_loci_2 <- join_nearest(transitioning_loci_2, human_Ensembl_TSSs, distance = TRUE)

20.4 Enriched gene ontology terms for genes associated with varying chromatin states

genes <- unique(transitioning_loci_1$gene_name[transitioning_loci_1$distance <= 500]) 
res <- gprofiler2::gost(genes, organism = 'hsapiens')
as_tibble(res$result) |>   
    select(source, term_name, p_value, precision, recall) |>
    filter(source == "GO:BP",recall >= 0.05) |> 
    arrange(p_value) |> 
    print(n = 50)

genes <- unique(transitioning_loci_2$gene_name[transitioning_loci_2$distance <= 500]) 
res <- gprofiler2::gost(genes, organism = 'hsapiens')
as_tibble(res$result) |>   
    select(source, term_name, p_value, precision, recall) |>
    filter(source == "GO:BP",recall >= 0.05) |> 
    arrange(p_value) |> 
    print(n = 50)