Appendix A — Extra material

knitr::opts_chunk$set(eval=FALSE)

A.1 Slides

Outreach material includes slide sets for training events.

A.2 PERMANOVA comparison

Here we present two possible uses of the adonis2 function which performs PERMANOVA. The optional argument by has an effect on the statistical outcome, so its two options are compared here.

# import necessary packages
library(gtools)
library(purrr)
library(vegan)
library(gtools)
library(purrr)

Let us load the enterotype TSE object and run PERMANOVA for different orders of three variables with two different approaches: by = "margin" or by = "terms".

# load and prepare data
library(mia)
data("enterotype", package="mia")
enterotype <- transformAssay(enterotype, method = "relabundance")
# drop samples missing meta data
enterotype <- enterotype[ , !rowSums(is.na(colData(enterotype)[, c("Nationality", "Gender", "ClinicalStatus")]) > 0)]
# define variables and list all possible combinations
vars <- c("Nationality", "Gender", "ClinicalStatus")
var_perm <- permutations(n = 3, r = 3, vars)
formulas <- apply(var_perm, 1, function(row) purrr::reduce(row, function(x, y) paste(x, "+", y)))
# create empty data.frames for further storing p-values
terms_df <- data.frame("Formula" = formulas,
                       "ClinicalStatus" = rep(0, 6),
                       "Gender" = rep(0, 6),
                       "Nationality" = rep(0, 6))
margin_df <- data.frame("Formula" = formulas,
                        "ClinicalStatus" = rep(0, 6),
                        "Gender" = rep(0, 6),
                        "Nationality" = rep(0, 6))

for (row_idx in 1:nrow(var_perm)) {
  
  # generate temporary formula (i.e. "assay ~ ClinicalStatus + Nationality + Gender")
  tmp_formula <- purrr::reduce(var_perm[row_idx, ], function(x, y) paste(x, "+", y))
  tmp_formula <- as.formula(paste0('t(assay(enterotype, "relabundance")) ~ ',
                            tmp_formula))

  # multiple variables, default: by = "terms"
  set.seed(75)
  with_terms <- adonis2(tmp_formula, 
                by = "terms",
                data = colData(enterotype),
                permutations = 99)
  
  # multiple variables, by = "margin"
  set.seed(75)
  with_margin <- adonis2(tmp_formula, 
                 by = "margin",
                 data = colData(enterotype),
                 permutations = 99)

  # extract p-values
  terms_p <- with_terms[["Pr(>F)"]]
  terms_p <- terms_p[!is.na(terms_p)]
  margin_p <- with_margin[["Pr(>F)"]]
  margin_p <- margin_p[!is.na(margin_p)]
  
  # store p-values into data.frames
  for (col_idx in 1:ncol(var_perm)) {
    
    terms_df[var_perm[row_idx, col_idx]][row_idx, ] <- terms_p[col_idx]
    margin_df[var_perm[row_idx, col_idx]][row_idx, ] <- margin_p[col_idx]
    
  }
  
}

The following table displays the p-values for the three variables ClinicalStatus, Gender and Nationality obtained by PERMANOVA with adonis2. Note that the p-values remain identical when by = "margin", but change with the order of the variables in the formula when by = "terms" (default).


df <- terms_df %>%
  dplyr::inner_join(margin_df, by = "Formula", suffix = c(" (terms)", " (margin)"))

knitr::kable(df)

A.3 Bayesian Multinomial Logistic-Normal Models

Analysis using such model could be performed with the function pibble from the fido package, wihch is in form of a Multinomial Logistic-Normal Linear Regression model; see vignette of package.

The following presents such an exemplary analysis based on the data of Sprockett et al. (2020) available through microbiomeDataSets package.

Sprockett, Daniel D., Melanie Martin, Elizabeth K. Costello, Adam R. Burns, Susan P. Holmes, Michael D. Gurven, and David A. Relman. 2020. “Microbiota Assembly, Structure, and Dynamics Among Tsimane Horticulturalists of the Bolivian Amazon.” Nat Commun 11 (1): 3772. https://doi.org/10.1038/s41467-020-17541-6.

library(fido)
library(microbiomeDataSets)
tse <- SprockettTHData()

We pick three covariates (“Sex”,“Age_Years”,“Delivery_Mode”) during this analysis as an example, and beforehand we check for missing data:

library(mia)
cov_names <- c("Sex","Age_Years","Delivery_Mode")
na_counts <- apply(is.na(colData(tse)[,cov_names]), 2, sum)
na_summary<-as.data.frame(na_counts,row.names=cov_names)

We drop missing values of the covariates:

tse <- tse[ , !is.na(colData(tse)$Delivery_Mode) ]
tse <- tse[ , !is.na(colData(tse)$Age_Years) ]

We agglomerate microbiome data to Phylum:

tse_phylum <- mergeFeaturesByRank(tse, "Phylum")

We extract the counts assay and covariate data to build the model matrix:

Y <- assays(tse_phylum)$counts
# design matrix
# taking 3 covariates
sample_data<-as.data.frame(colData(tse_phylum)[,cov_names])
X <- t(model.matrix(~Sex+Age_Years+Delivery_Mode,data=sample_data))

Building the parameters for the pibble call to build the model; see more at vignette:

n_taxa<-nrow(Y)
upsilon <- n_taxa+3
Omega <- diag(n_taxa)
G <- cbind(diag(n_taxa-1), -1)
Xi <- (upsilon-n_taxa)*G%*%Omega%*%t(G)
Theta <- matrix(0, n_taxa-1, nrow(X))
Gamma <- diag(nrow(X))

Automatically initializing the priors and visualizing their distributions:

priors <- pibble(NULL, X, upsilon, Theta, Gamma, Xi)
names_covariates(priors) <- rownames(X)
plot(priors, pars="Lambda") + ggplot2::xlim(c(-5, 5))

Estimating the posterior by including our response data Y. Note: Some computational failures could occur (see discussion) the arguments multDirichletBoot calcGradHess could be passed in such case.

priors$Y <- Y 
posterior <- refit(priors, optim_method="adam", multDirichletBoot=0.5) #calcGradHess=FALSE

Printing a summary about the posterior:

ppc_summary(posterior)

Plotting the summary of the posterior distributions of the regression parameters:

names_categories(posterior) <- rownames(Y)
plot(posterior,par="Lambda",focus.cov=rownames(X)[2:4])

Taking a closer look at “Sex” and “Delivery_Mode”:

plot(posterior, par="Lambda", focus.cov = rownames(X)[c(2,4)])

A.4 Interactive 3D Plots

# Load libraries
library(rgl)
library(plotly)

library(knitr)
knitr::knit_hooks$set(webgl = hook_webgl)

In this section we make a 3D version of the earlier Visualizing the most dominant genus on PCoA (see Chapter 4), with the help of the plotly (Sievert 2020).

Sievert, Carson. 2020. Interactive Web-Based Data Visualization with r, Plotly, and Shiny. Chapman; Hall/CRC. https://plotly-r.com.

# Importing necessary libraries
library(curatedMetagenomicData)
library(dplyr)
library(DT)
library(mia)
library(scater)

# Querying the data
tse <- sampleMetadata %>%
    filter(age >= 18) %>% # taking only data of age 18 or above
    filter(!is.na(alcohol)) %>% # excluding missing values
    returnSamples("relative_abundance")

tse_Genus <- mergeFeaturesByRank(tse, rank="genus")
tse_Genus <- addPerSampleDominantFeatures(tse_Genus,assay.type="relative_abundance", name = "dominant_taxa")

# Performing PCoA with Bray-Curtis dissimilarity.
tse_Genus <- runMDS(tse_Genus, FUN = vegan::vegdist, ncomponents = 3,
              name = "PCoA_BC", assay.type = "relative_abundance")

# Getting the 6 top taxa
top_taxa <- getTopFeatures(tse_Genus,top = 6, assay.type = "relative_abundance")

# Naming all the rest of non top-taxa as "Other"
most_abundant <- lapply(colData(tse_Genus)$dominant_taxa,
                   function(x){if (x %in% top_taxa) {x} else {"Other"}})

# Storing the previous results as a new column within colData
colData(tse_Genus)$most_abundant <- as.character(most_abundant)

# Calculating percentage of the most abundant
most_abundant_freq <- table(as.character(most_abundant))
most_abundant_percent <- round(most_abundant_freq/sum(most_abundant_freq)*100, 1)

# Retrieving the explained variance
e <- attr(reducedDim(tse_Genus, "PCoA_BC"), "eig");
var_explained <- e/sum(e[e>0])*100