Add heatmap and lmm modeling function

wejlab · Jan 30, 2024 · c8adc25 · c8adc25
1 parent 7752e89
commit c8adc25
Show file tree

Hide file tree

Showing 30 changed files with 1,451 additions and 233 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -1,4 +1,4 @@
-^LMTK\.Rproj$
+^LegATo\.Rproj$
 ^\.Rproj\.user$
 ^LICENSE\.md$
 ^vignettes\docs

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -11,31 +11,31 @@ Authors@R: c(
     person("Jordan", "Pincus", , "[email protected]", role = "art",
            comment = "Artist of LegATo icon")
   )
-Description: LegATo is a suite of open-source software tools for streamlining longitudinal microbiome analysis, 
-    integrating visualization, modeling and testing procedures.
+Description: LegATo is a suite of open-source software tools for
+    streamlining longitudinal microbiome analysis, integrating
+    visualization, modeling and testing procedures.
 License: MIT + file LICENSE
 Depends:
     R (>= 4.3.0)
 Imports:
     animalcules,
     data.table,
     dplyr,
-    geepack,
     ggplot2,
     magrittr,
     MultiAssayExperiment,
     rlang,
     S4Vectors,
     stringr,
     SummarizedExperiment,
-    TBSignatureProfiler,
     tibble,
     tidyr,
     tidyselect
 Suggests:
     BiocStyle,
     biomformat,
     broom,
+    ComplexHeatmap,
     emmeans,
     ggalluvial,
     ggeffects,
@@ -50,9 +50,17 @@ Suggests:
     Rsubread,
     spelling,
     sys,
+    TBSignatureProfiler,
     testthat,
     usethis,
-    vegan
+    vegan,
+    RColorBrewer,
+    circlize,
+    geepack,
+    paletteer,
+    broom.mixed,
+    lmerTest,
+    lme4
 VignetteBuilder: 
     knitr
 BiocType: Software

diff --git a/NAMESPACE b/NAMESPACE
@@ -3,17 +3,20 @@
 export(NMIT)
 export(clean_animalcules_MAE)
 export(create_formatted_MAE)
+export(distinctColors)
 export(filter_animalcules_MAE)
 export(get_long_data)
 export(get_stacked_data)
 export(get_summary_table)
 export(get_top_taxa)
 export(parse_MAE_SE)
 export(plot_alluvial)
+export(plot_heatmap)
 export(plot_spaghetti)
 export(plot_stacked_area)
 export(plot_stacked_bar)
 export(run_gee_model)
+export(run_lmm_model)
 export(test_hotelling_t2)
 export(tscor)
 import(MultiAssayExperiment)

diff --git a/R/NMIT.R b/R/NMIT.R
@@ -63,44 +63,58 @@ tscor <- function(dat, unit_var, method = "kendall", fill_na = 0) {
 }
 
 #' Nonparametric Microbial Interdependence Test (NMIT)
-#' 
-#' The package performs a multivariate distance-based test for group comparisons of
-#' microbial temporal interdependence. The NMIT test provides a comprehensive way to evaluate
-#' the association between key phenotypic variables and microbial interdependence.
-#' This function is recommended for use after a filtering step using \code{filter_animalcules_MAE}.
+#'
+#' An R-based implementation of the NMIT, a multivariate distance-based test for
+#' group comparisons of microbial temporal interdependence. The NMIT test
+#' provides a comprehensive way to evaluate the association between key
+#' phenotypic variables and microbial interdependence. This function is
+#' recommended for use after a filtering step using
+#' \code{filter_animalcules_MAE}.
 #'
 #' @author Yilong Zhang, Huilin Li, Aubrey Odom
 #'
 #' @inheritParams tscor
 #' @inheritParams plot_stacked_bar
-#' @param fixed_cov A character vector of covariates of interest found in dat.
-#' @param dist_type A character strin specifying the type of matrix norm to be computed. The default is \code{"F"}. 
+#' @param fixed_cov A character vector of covariates of interest found in
+#'   \code{dat}.
+#' @param dist_type A character string specifying the type of matrix norm to be
+#'   computed. The default is \code{"F"}.
 #' \itemize{
-#'     \item{\code{"M"} or \code{"m"}}{specifies the maximum modulus of all the elements in \code{x}.}
-#'     \item{\code{"O"}, \code{"o"} or \code{"1"}}{specifies the one norm, (maximum absolute column sum);}
-#'     \item{\code{"I"} or \code{"i"}}{specifies the infinity norm (maximum absolute row sum);}
-#'     \item{\code{"F"} or \code{"f"}}{specifies the Frobenius norm (the Euclidean norm of \code{x} treated as if it were a vector); and}
+#'     \item{\code{"M"} or \code{"m"}}{specifies the maximum modulus of all the
+#'     elements in \code{x}.}
+#'     \item{\code{"O"}, \code{"o"} or \code{"1"}}{specifies the one norm,
+#'     (maximum absolute column sum);}
+#'     \item{\code{"I"} or \code{"i"}}{specifies the infinity norm (maximum
+#'     absolute row sum);}
+#'     \item{\code{"F"} or \code{"f"}}{specifies the Frobenius norm (the
+#'     Euclidean norm of \code{x} treated as if it were a vector)}
 #' }  
-#' @param heatmap A logical value indicating whether to draw heatmap. The default is \code{TRUE}.
-#' @param classify A logical value indicating whether to draw a classifier tree. The default is \code{FALSE}.
-#' @param fill_na A number between 0 and 1 to fill \code{NA} values. The default value is 0.
+#' @param heatmap A logical value indicating whether to draw heatmap. The
+#'   default is \code{TRUE}.
+#' @param classify A logical value indicating whether to draw a classifier tree.
+#'   The default is \code{FALSE}.
+#' @param fill_na A number between 0 and 1 to fill \code{NA} values. The default
+#'   value is 0.
+#' @param ... Additional arguments to be passed to \code{ComplexHeatmap::Heatmap()}.
+#'
+#' @return This function returns an analysis of variance (ANOVA) table showing
+#'   sources of variation, degrees of freedom, sequential sums of squares, mean
+#'   squares, F statistics, partial R-squared and P values, based on 999
+#'   permutations.
 #'
-#' @return This function returns an analysis of variance (ANOVA) table showing sources of variation,
-#' degrees of freedom, sequential sums of squares, mean squares, F statistics, partial R-squared and P values,
-#' based on 999 permutations.
-#' 
 #' @export
 #' 
 #' @examples
 #' dat <- system.file("extdata/MAE_small.RDS", package = "LegATo") |> readRDS()
 #' NMIT(dat, unit_var = "Subject", fixed_cov = "Group", covariate_time = "Month")
-#'
+#' 
 
 NMIT <- function(dat, unit_var, fixed_cov,
                  covariate_time,
                  method = "kendall", dist_type = "F",
                  heatmap = TRUE, classify = FALSE,
-                 fill_na = 0) {
+                 fill_na = 0,
+                 ...) {
   dat <- .rearrange_sub_time(dat, unit_var, covariate_time) 
   all_dat <- parse_MAE_SE(dat)
   map <- all_dat$sam
@@ -119,17 +133,37 @@ NMIT <- function(dat, unit_var, fixed_cov,
                 })
   rownames(dist) <- dimnames(otu.cor)[[3]]
   colnames(dist) <- dimnames(otu.cor)[[3]]
-  grp <- map.unique[rownames(dist) , fixed_cov]
+  grp <- map.unique[rownames(dist), fixed_cov]
   test <- vegan::adonis2(dist ~ grp)
   if(heatmap){
-    #pvalue <- paste(round(c(test$F[1], test$`Pr(>F)`[1]), 3),
-    #                collapse = ";")
     pvalue <- round(c(test$`Pr(>F)`[1]), 3)
     n.taxa <- nrow(otu)
+
     diag(dist) <- stats::median(dist)
-    plot_title <- paste0(" #OTU:", n.taxa, " pvalue:", pvalue)
-    stats::heatmap(sqrt(dist), Colv = classify, Rowv = classify, scale = "column", 
-            xlab = unit_var, ylab = unit_var,
-            main = paste("NMIT Correlation Heatmap across", n.taxa, "taxa with p-value", pvalue))
+    mat <- as.matrix(sqrt(dist))
+
+    plot_title <- paste0(" # OTU: ", n.taxa, " pvalue: ", pvalue)
+    cov_mat <- grp %>% as.matrix %>%
+      magrittr::set_rownames(rownames(dist)) %>%
+      magrittr::set_colnames(fixed_cov) %>%
+      as.data.frame
+    plot_heatmap(
+      inputData = mat,
+      annotationData = cov_mat,
+      name = "NMIT Correlation",
+      plot_title = plot_title,
+      signatureColNames = NULL,
+      annotationColNames = NULL,
+      colList = list(),
+      scale = FALSE,
+      showColumnNames = TRUE,
+      showRowNames = TRUE,
+      colorSets = c("Set1", "Set2", "Set3", "Pastel1", "Pastel2", "Accent", "Dark2",
+                    "Paired"),
+      choose_color = c("blue", "gray95", "red"),
+      split_heatmap = "none",
+      column_order = NULL,
+      ...
+    )
   }
 }
diff --git a/R/clean_animalcules_MAE.R b/R/clean_animalcules_MAE.R
@@ -19,43 +19,42 @@
 #' 
 
 clean_animalcules_MAE <- function(dat) {
+  # Extract data
   parsed <- parse_MAE_SE(dat, which_assay = "MicrobeGenetics", type = "MAE")
   tax_table <- parsed$tax
-  sam_table <- parsed$sam
   counts_table <- parsed$counts
-
+  # Preliminary fixing of species names
+  if(!all(c("genus", "species") %in% colnames(tax_table))) {
+    stop("Columns 'genus' and 'species' must be present in taxonomy table.")
+  }
+  if(!all.equal(rownames(tax_table), rownames(counts_table))) {
+    stop("Error in parsing MAE")
+  }
+  # Only need to alter species names in tax_table initially
   se_rowData  <- tax_table %>%
-    dplyr::mutate("species" = stringr::str_remove_all(.data$species, "\\[|\\]"))
-
-  # Replace species that are others with "sp."
+    dplyr::mutate("species" = stringr::str_remove_all(.data$species, "\\[|\\]"),
+                  "species" = stringr::str_replace_all(.data$species, "_", " "))
+  ## Replace species that are others with "sp."
   ind <- se_rowData$species == "others"
   se_rowData$species[ind] <- paste(se_rowData$genus[ind], "sp.", sep = " ")
-  # Replace genus that are others with species information
+  ## Replace genus that are others with species information
   ind <- se_rowData$genus == "others" & !is.na(se_rowData$genus)
   all_split <- plyr::aaply(strsplit(se_rowData$species, " "), 1,
                            function(x) x[[1]])
   se_rowData$genus[ind] <- all_split[ind]
+  # Consolidate tax and counts to species level
+  up_counts <- animalcules::upsample_counts(counts_table, se_rowData,
+                                            higher_level = "species")
+  up_rowData <- se_rowData %>%
+    tibble::remove_rownames() %>%
+    dplyr::distinct(.data$species, .keep_all = TRUE)
+  ind <- match(rownames(up_counts), up_rowData$species)
+  up_rowData <- up_rowData[ind, ]
+  rownames(up_rowData) <- up_rowData$species
 
-  #ISSUE: DUplicates because we need to consolidate down to species level
-
-  # Fix rownames
-  if (all.equal(rownames(tax_table), rownames(counts_table))) {
-    to_rownames <- stringr::str_replace_all(se_rowData$species, "_", " ")
-    rownames(se_rowData) <- rownames(counts_table) <- to_rownames
-    # Alphabetize
-    se_rowData_final <- se_rowData[order(to_rownames),]
-    counts_table_final <- counts_table[order(to_rownames),]
-  } else (stop("Mismatch of rownames in datasets"))
-
-  se_mgx <- counts_table %>% base::data.matrix() %>% S4Vectors::SimpleList() %>%
-    magrittr::set_names("MGX")
-
-  microbe_se <- SummarizedExperiment::SummarizedExperiment(
-    assays = se_mgx, colData = sam_table, rowData = se_rowData)
-  MAE_out <- MultiAssayExperiment::MultiAssayExperiment(
-    experiments = S4Vectors::SimpleList(MicrobeGenetics = microbe_se),
-    colData = sam_table)
-
+  MAE_out <- create_formatted_MAE(counts_dat = up_counts,
+                                  tax_dat = up_rowData,
+                                  metadata_dat = parsed$sam)
   return(MAE_out)
 }
 
diff --git a/R/filter_animalcules_MAE.R b/R/filter_animalcules_MAE.R
@@ -5,12 +5,13 @@ utils::globalVariables(".")
 #'
 #' This function takes an animalcules-formatted \code{MultiAssayExperiment}
 #' (MAE) object and identifies all taxa at the "genus" level that represent
-#' <\code{filter_pct} average relative abundance across all samples in the MAE.
+#' <\code{filter_prop} average relative abundance across all samples in the MAE.
 #' After identification at the genus level, taxa across the genus and species
 #' levels are then consolidated into the category "Other".
 #'
 #' @inheritParams plot_stacked_bar
-#' @param filter_pct A double strictly between 0 and 1.
+#' @param filter_prop A double strictly between 0 and 1, representing
+#' the proportion of relative abundance at which to filter. Default is 0.001.
 #'
 #' @returns An animalcules-formatted \code{MultiAssayExperiment} object with
 #'   appropriate filtration.
@@ -23,8 +24,8 @@ utils::globalVariables(".")
 #' filter_animalcules_MAE(in_dat, 0.01)
 #' 
 
-filter_animalcules_MAE <- function(dat, filter_pct = 0.05) {
-  if(filter_pct <= 0 | filter_pct >= 1) stop("filter_pct must be between 0 and 1.")
+filter_animalcules_MAE <- function(dat, filter_prop = 0.001) {
+  if(filter_prop <= 0 | filter_prop >= 1) stop("filter_prop must be between 0 and 1.")
   # Extract metadata, taxonomic info, and counts
   parsed <- parse_MAE_SE(dat, which_assay = "MicrobeGenetics", type = "MAE")
   tax_table <- parsed$tax
@@ -33,11 +34,11 @@ filter_animalcules_MAE <- function(dat, filter_pct = 0.05) {
 
   all_relabu_genus <- counts_table %>%
     as.matrix() %>%
-    # Get rel abu within samokes
+    # Get rel abu within samples
     prop.table(margin = 2) %>%
     tibble::as_tibble() %>%
     dplyr::bind_cols("genus" = tax_table$genus) %>%
-    dplyr::relocate(.data$genus) %>%
+    dplyr::relocate("genus") %>%
     dplyr::group_by(.data$genus) %>%
     # Sum rel abu within samples/columns for genera
     dplyr::summarise(dplyr::across(.fns = sum, .cols = dplyr::everything())) %>%
@@ -46,11 +47,11 @@ filter_animalcules_MAE <- function(dat, filter_pct = 0.05) {
     dplyr::select("genus", "allmeans") %>%
     dplyr::mutate("genus" = replace(.data$genus, is.na(.data$genus), "Unknown")) %>%
     dplyr::arrange(dplyr::desc(.data$allmeans)) %>%
-    dplyr::mutate("lessthan1pct" = .data$allmeans < filter_pct/10)
+    dplyr::mutate("lessthanXpct" = .data$allmeans < filter_prop)
 
   # Identify species in other
   othergenera <- all_relabu_genus %>%
-    dplyr::filter(.data$lessthan1pct == TRUE) %>%
+    dplyr::filter(.data$lessthanXpct == TRUE) %>%
     dplyr::select("genus") %>% unlist() %>% unname()
 
   ## Use the identified species to update the tax, species tables
@@ -67,10 +68,7 @@ filter_animalcules_MAE <- function(dat, filter_pct = 0.05) {
     dplyr::group_by(.data$species) %>%
     dplyr::summarise(dplyr::across(.cols = dplyr::all_of(colnames(counts_table)),
                             .fns = sum)) %>%
-    tibble::column_to_rownames("species") %>%
-    base::data.matrix() %>%
-    S4Vectors::SimpleList() %>%
-    magrittr::set_names("MGX")
+    tibble::column_to_rownames("species")
 
   # Adjust tax table accordingly
   tax_table_other2 <- tax_table_other %>%
@@ -80,13 +78,7 @@ filter_animalcules_MAE <- function(dat, filter_pct = 0.05) {
   rownames(tax_table_other2) <- tax_table_other2$species
 
   ## Create SE object
-  microbe_other <- SummarizedExperiment::SummarizedExperiment(
-    assays = counts_other,
-    rowData = S4Vectors::DataFrame(tax_table_other2),
-    colData = S4Vectors::DataFrame(sam_table))
-  MAE_out <- MultiAssayExperiment::MultiAssayExperiment(
-    experiments = S4Vectors::SimpleList(MicrobeGenetics = microbe_other),
-    colData = S4Vectors::DataFrame(sam_table))
+  MAE_out <- create_formatted_MAE(counts_other, tax_table_other2, sam_table)
   return(MAE_out)
 }