Update LegATo

wejlab · Jan 23, 2024 · 8f929ff · 8f929ff
1 parent 72a460c
commit 8f929ff
Show file tree

Hide file tree

Showing 40 changed files with 1,714 additions and 0 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -0,0 +1,5 @@
+^LMTK\.Rproj$
+^\.Rproj\.user$
+^LICENSE\.md$
+^vignettes\docs
+^Legato-docs
diff --git a/.gitignore b/.gitignore
@@ -47,3 +47,7 @@ po/*~
 
 # RStudio Connect folder
 rsconnect/
+.Rproj.user
+
+# pkgdown
+/Legato-doc/
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,51 @@
+Package: LegATo
+Title: LegATo: Longitudinal mEtaGenomic Analysis Toolkit
+Version: 0.0.0.9000
+Authors@R: c(
+    person("Aubrey", "Odom", , "[email protected]", role = c("aut", "cre"),
+           comment = c(ORCID = "0000-0001-7113-7598")),
+    person("Jared", "Pincus", , "[email protected]", role = "csl"),
+    person("Jordan", "Pincus", , "[email protected]", role = "art")
+  )
+Description: Streamlining longitudinal microbiome profiling in
+    Bioconductor.
+License: MIT + file LICENSE
+Depends:
+    R (>= 4.3.0)
+Imports:
+    alluvial,
+    animalcules,
+    emmeans,
+    geepack,
+    ggeffects,
+    lme4,
+    magrittr,
+    MultiAssayExperiment,
+    rlang,
+    SummarizedExperiment,
+    S4Vectors,
+    TBSignatureProfiler,
+    dplyr,
+    ggalluvial,
+    ggplot2,
+    stringr,
+    tibble,
+    tidyr,
+    tidyselect
+Suggests:
+    BiocStyle,
+    biomformat,
+    knitr,
+    lintr,
+    plyr,
+    R.utils,
+    rmarkdown,
+    Rsubread,
+    spelling,
+    sys,
+    testthat,
+    usethis
+Encoding: UTF-8
+Language: en-US
+Roxygen: list(markdown = TRUE)
+RoxygenNote: 7.2.3
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,2 @@
+YEAR: 2024
+COPYRIGHT HOLDER: LegATo authors
diff --git a/LICENSE.md b/LICENSE.md
@@ -0,0 +1,21 @@
+# MIT License
+
+Copyright (c) 2024 LegATo authors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/LMTK.Rproj b/LMTK.Rproj
@@ -0,0 +1,17 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+BuildType: Package
+PackageUseDevtools: Yes
+PackageInstallArgs: --no-multiarch --with-keep.source
diff --git a/Legato-docs b/Legato-docs
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,13 @@
+# Generated by roxygen2: do not edit by hand
+
+export(clean_animalcules_MAE)
+export(filter_animalcules_MAE)
+export(get_long_data)
+export(get_summary_table)
+export(parse_MAE_SE)
+export(plot_alluvial)
+export(plot_stacked_area)
+export(plot_stacked_bar)
+import(MultiAssayExperiment)
+importFrom(magrittr,"%>%")
+importFrom(rlang,.data)
diff --git a/R/LegATo-package.R b/R/LegATo-package.R
@@ -0,0 +1,7 @@
+#' @keywords internal
+"_PACKAGE"
+
+## usethis namespace: start
+#' @importFrom magrittr %>%
+## usethis namespace: end
+NULL
diff --git a/R/NMIT.Rt b/R/NMIT.Rt
diff --git a/R/clean_animalcules_MAE.R b/R/clean_animalcules_MAE.R
@@ -0,0 +1,59 @@
+#' Clean up taxon names in a MultiAssayExperiment
+#'
+#' This functional is an optional method for fixing up taxon names in a
+#' \code{MultiAssayExperiment} to be run before \code{filter_animalcules_MAE}.
+#' Specifically it removes brackets from species names, replaces species labeled
+#' as "others" with "sp." and finally replaces underscores with spaces.
+#'
+#' @inheritParams plot_stacked_bar
+#'
+#' @returns An animalcules-formatted \code{MultiAssayExperiment} object with
+#'   cleaned-up taxonomy nomenclature.
+#'
+#' @export
+#' @importFrom rlang .data
+#'
+#' @examples
+#' # example code
+#' 
+
+clean_animalcules_MAE <- function(dat) {
+  parsed <- parse_MAE_SE(dat, which_assay = "MicrobeGenetics", type = "MAE")
+  tax_table <- parsed$tax
+  sam_table <- parsed$sam
+  counts_table <- parsed$counts
+
+  se_rowData  <- tax_table %>%
+    dplyr::mutate("species" = stringr::str_remove_all(species, "\\[|\\]"))
+
+  # Replace species that are others with "sp."
+  ind <- se_rowData$species == "others"
+  se_rowData$species[ind] <- paste(se_rowData$genus[ind], "sp.", sep = " ")
+  # Replace genus that are others with species information
+  ind <- se_rowData$genus == "others" & !is.na(se_rowData$genus)
+  all_split <- sapply(strsplit(se_rowData$species, " "), function(x) x[[1]])
+  se_rowData$genus[ind] <- all_split[ind]
+
+  #ISSUE: DUplicates because we need to consolidate down to species level
+
+  # Fix rownames
+  if (all.equal(rownames(tax_table), rownames(counts_table))) {
+    to_rownames <- stringr::str_replace_all(se_rowData$species, "_", " ")
+    rownames(se_rowData) <- rownames(counts_table) <- to_rownames
+    # Alphabetize
+    se_rowData_final <- se_rowData[order(to_rownames),]
+    counts_table_final <- counts_table[order(to_rownames),]
+  } else (stop("Mismatch of rownames in datasets"))
+
+  se_mgx <- counts_table %>% base::data.matrix() %>% S4Vectors::SimpleList() %>%
+    magrittr::set_names("MGX")
+
+  microbe_se <- SummarizedExperiment::SummarizedExperiment(
+    assays = se_mgx, colData = sam_table, rowData = se_rowData)
+  MAE_out <- MultiAssayExperiment::MultiAssayExperiment(
+    experiments = S4Vectors::SimpleList(MicrobeGenetics = microbe_se),
+    colData = sam_table)
+
+  return(MAE_out)
+}
+
diff --git a/R/filter_animalcules_MAE.R b/R/filter_animalcules_MAE.R
@@ -0,0 +1,91 @@
+utils::globalVariables(".")
+
+#' Filter a MultiAssayExperiment to a top percentage of taxa and label the rest
+#' as "Other"
+#'
+#' This function takes an animalcules-formatted \code{MultiAssayExperiment}
+#' (MAE) object and identifies all taxa at the "genus" level that represent
+#' <\code{filter_pct} average relative abundance across all samples in the MAE.
+#' After identification at the genus level, taxa across the genus and species
+#' levels are then consolidated into the category "Other".
+#'
+#' @inheritParams plot_stacked_bar
+#' @param filter_pct A double strictly between 0 and 1.
+#'
+#' @returns An animalcules-formatted \code{MultiAssayExperiment} object with
+#'   appropriate filtration.
+#'
+#' @export
+#' @importFrom rlang .data
+#'
+#' @examples
+#' # example code
+#' 
+
+filter_animalcules_MAE <- function(dat, filter_pct = 0.05) {
+  if(filter_pct <= 0 | filter_pct >= 1) stop("filter_pct must be between 0 and 1.")
+  # Extract metadata, taxonomic info, and counts
+  parsed <- parse_MAE_SE(dat, which_assay = "MicrobeGenetics", type = "MAE")
+  tax_table <- parsed$tax
+  sam_table <- parsed$sam
+  counts_table <- parsed$counts
+
+  all_relabu_genus <- counts_table %>%
+    as.matrix() %>%
+    # Get rel abu within samokes
+    prop.table(margin = 2) %>%
+    tibble::as_tibble() %>%
+    dplyr::bind_cols("genus" = tax_table$genus) %>%
+    dplyr::relocate(.data$genus) %>%
+    dplyr::group_by(.data$genus) %>%
+    # Sum rel abu within samples/columns for genera
+    dplyr::summarise(dplyr::across(.fns = sum, .cols = dplyr::everything())) %>%
+    # Sum everything but the first columm ("genus")
+    dplyr::mutate("allmeans" = rowMeans(select(., -1))) %>%
+    dplyr::select("genus", "allmeans") %>%
+    dplyr::mutate("genus" = replace(.data$genus, is.na(.data$genus), "Unknown")) %>%
+    dplyr::arrange(dplyr::desc(.data$allmeans)) %>%
+    dplyr::mutate("lessthan1pct" = .data$allmeans < filter_pct/10)
+
+  # Identify species in other
+  othergenera <- all_relabu_genus %>%
+    dplyr::filter(.data$lessthan1pct == TRUE) %>%
+    dplyr::select("genus") %>% unlist() %>% unname()
+
+  ## Use the identified species to update the tax, species tables
+
+  # Replace tax table
+  tax_table_other <- tax_table %>%
+    dplyr::mutate("genus" = replace(.data$genus, is.na(.data$genus), "Unknown")) %>%
+    dplyr::mutate(dplyr::across(c(.data$genus, .data$species),
+                         function(x) replace(x, .data$genus %in% othergenera, "Other")))
+
+  # Resum the counts table
+  counts_other <- counts_table %>%
+    dplyr::bind_cols("species" = tax_table_other$species) %>%
+    dplyr::group_by(.data$species) %>%
+    dplyr::summarise(dplyr::across(.cols = dplyr::all_of(colnames(counts_table)),
+                            .fns = sum)) %>%
+    tibble::column_to_rownames("species") %>%
+    base::data.matrix() %>%
+    S4Vectors::SimpleList() %>%
+    magrittr::set_names("MGX")
+
+  # Adjust tax table accordingly
+  tax_table_other2 <- tax_table_other %>%
+    dplyr::distinct(.data$species, .keep_all = TRUE) %>%
+    dplyr::arrange(.data$species)
+
+  rownames(tax_table_other2) <- tax_table_other2$species
+
+  ## Create SE object
+  microbe_other <- SummarizedExperiment::SummarizedExperiment(
+    assays = counts_other,
+    rowData = S4Vectors::DataFrame(tax_table_other2),
+    colData = S4Vectors::DataFrame(sam_table))
+  MAE_out <- MultiAssayExperiment::MultiAssayExperiment(
+    experiments = S4Vectors::SimpleList(MicrobeGenetics = microbe_other),
+    colData = S4Vectors::DataFrame(sam_table))
+  return(MAE_out)
+}
+
diff --git a/R/gee.Rt b/R/gee.Rt
@@ -0,0 +1,113 @@
+## Generalized Estimating Equations
+#- Run an independent GEE model for each taxa with relative abundance
+#- Works well with small data - multiple subpoints/subjects across clusters
+
+#https://data.library.virginia.edu/getting-started-with-generalized-estimating-equations/
+
+  ### Functions to implement GEEs
+
+mk_gee_plot <- function(this_coef, complex, tn) {
+  p <- plot(ggeffects::ggemmeans(complex,
+                                 terms = c(this_coef, "VisitNumber", "Group"))) + 
+    ggplot2::labs(subtitle = "Estimated marginal means",
+                  title = str_replace(tn, "_", " ")) +
+    ylab("Taxon abundance (log CPM)") +
+    ggplot2::theme_classic()
+  ggplot2::ggsave(filename =
+                    paste("PaperFigs/gee/", tn, "_", this_coef, ".png", sep = ""),
+                  p, width = 6, height = 4, units = "in", scale = 0.7)
+}
+
+test_models_gee <- function(tn, input_df, to_plot = FALSE) {
+  filt_df <- input_df %>% filter(taxon == tn) %>%
+    mutate(PatientID = as.factor(PatientID)) %>%
+    arrange(PatientID)
+  complex <- geeglm(Abundance ~ 1 + Group +
+                      Q23.17.Neutraliz..*Group + LAI.Neutraliz..*Group +
+                      BaL.Neutraliz..*Group + VisitNumber,
+                    id = PatientID, data = filt_df,
+                    na.action = na.omit, family = "gaussian",
+                    corstr = "ar1")
+  sum_comp <- summary(complex)
+  all_ci <- broom:::confint.geeglm(complex)
+  if (to_plot) {
+    sapply(c("Q23.17.Neutraliz..", "LAI.Neutraliz..", "BaL.Neutraliz.."),
+           mk_gee_plot, complex, tn)
+  }
+  res_out <- sum_comp$coefficients %>% select(Estimate, `Pr(>|W|)`) %>%
+    mutate("Taxon" = tn) %>%
+    bind_cols(all_ci) %>% relocate(Taxon, lwr, Estimate, upr) %>%
+    tibble::rownames_to_column(var = "Coefficient")
+  return(res_out)
+}
+
+save_table <- function(input_df = results_species,
+                       fileout = "PaperFigs/species_us_gee.csv") {
+  input_df %>%
+    dplyr::rename("Coefficient Estimate" = Estimate,
+                  "Lower 95% CI" = lwr, "Upper 95% CI" = upr,
+                  "Unadj p-value" = `Pr(>|W|)`,
+                  "Adj p-value" = `adj_p`) %>%
+    write.csv(., fileout)
+}
+
+# logcpm_tab <- long_lcpm_genus
+run_model <- function(logcpm_tab, to_plot = TRUE) {
+  input_df <- logcpm_tab %>% filter(VisitNumber != 2)
+  all_tn <- unique(input_df$taxon)
+  n <- length(all_tn)
+
+  # Plot model coefficients
+  # If emmeans doesn't work, may need to step through function
+  # and make one plot
+  if (to_plot) lapply(all_tn, test_models_gee, input_df = input_df, to_plot = TRUE)
+
+  storage <- lapply(all_tn, test_models_gee, input_df = input_df) %>%
+    data.table::rbindlist() %>%
+    arrange(Coefficient) %>%
+    group_by(Coefficient) %>%
+    mutate(adj_p = p.adjust(`Pr(>|W|)`, method = "bonferroni")) %>%
+    as.data.frame()
+  kable(storage, caption = "P-values from GEEs",
+        row.names = TRUE)
+  return(storage)
+}
+
+
+### Modeling genera -----
+```{R}
+# Input: create long version of CPM (both moms and infants)
+long_lcpm_genus <- create_long(logCPM_genus)
+
+# Get genus-level results
+results_genus <- run_model(long_lcpm_genus, to_plot = TRUE)
+save_table(results_genus, "PaperFigs/genus_ar1_gee.csv")
+
+# Bonferroni cutoff - genus
+0.05/length(unique(results_genus$Taxon))
+```
+
+
+### Modeling species -----
+```{R}
+# Input: create long version of CPM (both moms and infants)
+long_lcpm_species <- SummarizedExperiment::assay(dat, "log_counts_cpm") %>%
+  as.data.frame() %>% create_long()
+
+# Top species (more than 1%)
+filt <- animalcules::counts_to_relabu(counts_table) |>
+  apply(1, function(x) round(mean(x), 3)) %>%
+  .[. > 0.01]
+## Remove any species that are general "sp." or "Other"
+ind <- stringr::str_detect(names(filt), "_sp.|Other")
+species_to_test <- names(filt[!ind])
+
+# Get species-level results
+results_species <- long_lcpm_species %>%
+  filter(taxon %in% species_to_test) %>%
+  run_model(to_plot = TRUE)
+save_table(results_species, "PaperFigs/species_ar1_gee.csv")
+
+# Bonferroni cutoff - genus
+0.05/length(species_to_test)
+```