pachterlab
diff --git a/‎.lintr
+1-1 b/‎.lintr
+1-1
diff --git a/‎CHANGELOG.md
+25 b/‎CHANGELOG.md
+25
diff --git a/‎DESCRIPTION
+2-1 b/‎DESCRIPTION
+2-1
diff --git a/‎NAMESPACE
+12-1 b/‎NAMESPACE
+12-1
diff --git a/‎R/bootstrap.R
+188-5 b/‎R/bootstrap.R
+188-5
diff --git a/‎R/gene_analysis.R
+17-2 b/‎R/gene_analysis.R
+17-2
diff --git a/‎R/likelihood.R
+11-1 b/‎R/likelihood.R
+11-1
@@ -7,4 +7,4 @@ linters: with_defaults(
   single_quotes_linter = NULL,
   trailing_blank_lines_linter = NULL
   )
-exclusions: list("R/hexamers.R", "inst/doc/intro.R")
+exclusions: list("R/hexamers.R", "vignettes/intro.Rmd")
@@ -0,0 +1,25 @@
+# version 0.29.0
+
+This version has numerous bug fixes and several performance upgrades.
+Most notably, memory usage has been decreased greatly by no longer storing the bootstraps in memory.
+Additionally, speed has been improved in numerous areas — particularly `sleuth_prep` — by changing several of the computations as well as changing the order of the parallelization (special thanks to [Warren McGee](https://github.com/warrenmcg) for his contributions to this).
+
+Below is an incomplete list of new features:
+
+- The full model no longer has to be specified in `sleuth_prep`.
+- A new function `extract_model` allows users to extract the effect sizes for a model in a tidy format similar to [broom](https://cran.r-project.org/web/packages/broom/vignettes/broom.html).
+- An arbitrary transformation can be specified/used in `sleuth_prep` (see argument `transformation_function`).
+
+A big thanks to our users for fixing and reporting bugs.
+A special thanks to [Warren McGee](https://github.com/warrenmcg) for making several of the performance improvements as well as fixing several bugs.
+Below is a partial list of many of the upgrades and the pull requests by the community.
+
+- [Memory overhaul to reduce overall usage](https://github.com/pachterlab/sleuth/pull/63) (@psturmfels)
+- [Bugfix to drop unused factors](https://github.com/pachterlab/sleuth/pull/71) (@roryk and @warrenmcg)
+- [Reduce memory footprint and improve parallelization](https://github.com/pachterlab/sleuth/pull/94) (@warrenmcg)
+- [Add gene annotations when using `sleuth_results`](https://github.com/pachterlab/sleuth/pull/95) (@warrenmcg)
+- [Improve sample name handling](https://github.com/pachterlab/sleuth/pull/96) (@warrenmcg)
+- [Reconcile memory overhaul and gene aggregation and allow arbitrary transformations](https://github.com/pachterlab/sleuth/pull/99) (@warrenmcg)
+- [Do not parallelize when in RStudio](https://github.com/pachterlab/sleuth/pull/108) (@warrenmcg)
+- [Remove warning in `sliding_window_grouping`](https://github.com/pachterlab/sleuth/pull/106) (@warrenmcg)
+- [Bug fix in `sleuth_live` in gene mode](https://github.com/pachterlab/sleuth/pull/107) (@warrenmcg)
@@ -1,6 +1,6 @@
 Package: sleuth
 Title: Tools for investigating RNA-Seq
-Version: 0.28.1
+Version: 0.29.0
 Authors@R: c(person("Harold", "Pimentel", , "[email protected]", role = c("aut", "cre")))
 Description: Investigate transcript abundance from "kallisto" and differential
     expression analysis from RNA-Seq data.
@@ -20,6 +20,7 @@ Imports:
     rhdf5,
     parallel,
     lazyeval,
+    matrixStats,
     shiny
 Suggests:
     MASS,
 
@@ -1,5 +1,6 @@
-# Generated by roxygen2 (4.1.1): do not edit by hand
+# Generated by roxygen2: do not edit by hand
 
+S3method("$<-",sleuth)
 S3method(bias_table,kallisto)
 S3method(bias_table,sleuth)
 S3method(get_bootstraps,kallisto)
@@ -14,16 +15,20 @@ S3method(print,sleuth)
 S3method(print,sleuth_model)
 S3method(summary,sleuth)
 S3method(tests,sleuth)
+export("transform_fun<-")
 export(basic_filter)
 export(bias_table)
 export(bs_sigma_summary)
 export(counts_to_fpkm)
 export(counts_to_tpm)
 export(design_matrix)
 export(enclosed_brush)
+export(extract_model)
+export(get_bootstrap_summary)
 export(get_bootstraps)
 export(get_quantile)
 export(kallisto_table)
+export(log_transform)
 export(melt_bootstrap_sleuth)
 export(models)
 export(norm_factors)
@@ -46,19 +51,25 @@ export(read_kallisto)
 export(read_kallisto_h5)
 export(read_kallisto_tsv)
 export(shrink_df)
+export(sleuth_deploy)
 export(sleuth_fit)
 export(sleuth_gene_table)
 export(sleuth_live)
 export(sleuth_live_settings)
+export(sleuth_load)
 export(sleuth_lrt)
 export(sleuth_prep)
 export(sleuth_results)
+export(sleuth_save)
 export(sleuth_to_matrix)
 export(sleuth_wt)
 export(sliding_window_grouping)
 export(tests)
 export(tpm_to_alpha)
 export(transcripts_from_gene)
+export(transform_status)
+export(transform_status.sleuth)
+export(transform_status.sleuth_model)
 import(dplyr)
 importFrom(data.table,fread)
 importFrom(lazyeval,interp)
 
@@ -41,7 +41,11 @@ bootstrap2mat <- function(kal, column = "tpm") {
 
 #' Extract bootstrap for a specific transcript
 #'
-#' Extract bootstrap for a specific transcript
+#' Extract bootstrap for a specific transcript.
+#' CURRENTLY DEPRECATED: Will probably be reimplemented in the next version.
+#' Currently not working because of a complete rewrite of the bootstrap code.
+#' If you are interested in getting the bootstraps, you can manually write some code
+#' using `read_kallisto()`. Please make sure to comment in the user group if you are using this function.
 #'
 #' @param obj an object
 #' @param ... arguments passed to other functions
@@ -145,7 +149,7 @@ aggregate_bootstrap <- function(kal, mapping, split_by = "gene_id",
 
   if ( any(!complete.cases(mapping)) ) {
     warning("Found some NAs in mapping. Removing them.")
-    mapping <- mapping[complete.cases(mapping),]
+    mapping <- mapping[complete.cases(mapping), ]
   }
 
   m_bs <- melt_bootstrap(kal, column)
@@ -236,6 +240,47 @@ normalize_bootstrap <- function(kal, tpm_size_factor, est_counts_size_factor) {
   kal
 }
 
+#' bootstrap summary
+#'
+#' Extract the bootstrap summary from a sleuth object that has been initialized in sleuth_prep.
+#'
+#' @param obj a \code{sleuth} object such that \code{extra_bootstrap_summary = TRUE} inside of \code{\link{sleuth_prep}}.
+#' @param target_id a character vector of length 1 indicating the target_id (transcript or gene name depending on aggregation mode)
+#' @param units a character vector of length 1: one of 'est_counts', 'tpm' or 'scaled_reads_per_base'
+#' (requires \code{extra_bootstrap_summary = TRUE} in \code{\link{sleuth_prep}}; 'tpm' additionally
+#' requires \code{read_bootstrap_tpm = TRUE})
+#' @return a \code{data.frame} with the summary statistics across all samples for that particular target,
+#' with the columns of \code{obj$sample_to_covariates} appended
+#' @export
+get_bootstrap_summary <- function(obj, target_id, units = 'est_counts') {
+  stopifnot( is(obj, 'sleuth') )
+
+  valid_units <- c('est_counts', 'tpm', 'scaled_reads_per_base')
+  if (!(units %in% valid_units)) {
+    stop(paste0("'", units, "' is invalid for 'units'. please see documentation"))
+  }
+
+  if (is.null(obj$bs_quants)) {
+    # only the 'tpm' summary depends on 'read_bootstrap_tpm'; do not send users
+    # of the count-based units after an unrelated argument
+    if (units == 'tpm') {
+      stop("bootstrap summary missing. rerun sleuth_prep() with argument 'extra_bootstrap_summary = TRUE' and 'read_bootstrap_tpm = TRUE'")
+    } else {
+      stop("bootstrap summary missing. rerun sleuth_prep() with argument 'extra_bootstrap_summary = TRUE'")
+    }
+  }
+
+  # the summaries exist, but none was stored for the requested units; report
+  # that directly instead of falling through to "couldn't find target_id"
+  if (is.null(obj$bs_quants[[1]][[units]])) {
+    stop(paste0("no bootstrap summary stored for units '", units, "'. ",
+                "please check the aggregation mode and the arguments given to sleuth_prep()"))
+  }
+
+  # the first sample's summary is used as the reference list of target ids
+  if (!(target_id %in% rownames(obj$bs_quants[[1]][[units]]))) {
+    stop(paste0("couldn't find target_id '", target_id, "'"))
+  }
+
+  # one row per sample: the summary row of 'target_id' in that sample
+  df <- as_df(
+    do.call(rbind,
+      lapply(obj$bs_quants,
+      function(sample_bs) {
+        sample_bs[[units]][target_id, ]
+      })
+      )
+    )
+  df <- dplyr::bind_cols(df, obj$sample_to_covariates)
+
+  df
+}
+
 
 # Sample bootstraps
 #
@@ -275,8 +320,8 @@ sample_bootstrap <- function(obj, n_samples = 100L) {
   # matrix sample
   for (s in 1:n_samples) {
     for (idx in 1:nrow(which_samp)) {
-      b <- which_samp[idx,s]
-      sample_mat[[s]][,idx] <- obj$kal[[idx]]$bootstrap[[b]]$est_counts
+      b <- which_samp[idx, s]
+      sample_mat[[s]][, idx] <- obj$kal[[idx]]$bootstrap[[b]]$est_counts
     }
   }
 
@@ -318,9 +363,147 @@ dcast_bootstrap.kallisto <- function(obj, units, nsamples = NULL) {
   mat <- matrix(NA_real_, nrow = n_features, ncol = length(which_bs))
 
   for (j in seq_along(which_bs)) {
-    mat[ ,j] <- obj[[ "bootstrap" ]][[which_bs[j]]][[ units ]]
+    mat[, j] <- obj[[ "bootstrap" ]][[which_bs[j]]][[ units ]]
   }
   rownames(mat) <- obj[["bootstrap"]][[1]][["target_id"]]
 
   mat
 }
+
+# Function to process bootstraps for parallelization
+#
+# Reads the bootstrap matrix of a single kallisto HDF5 file, optionally
+# computes quantile summaries of the bootstraps (TPM and/or counts),
+# optionally aggregates the counts to the level given by 'aggregation_column',
+# applies 'transform_fun' and returns the variance across bootstraps of the
+# transformed values for the targets selected by 'which_ids'.
+#
+# @param i index of the sample; passed to dot() and echoed back as 'index'
+# @param samp_name sample name (currently unused in this function)
+# @param kal_path list-like object whose 'path' element is the HDF5 file to read
+# @param num_transcripts,est_count_sf forwarded to read_bootstrap_mat()
+# @param read_bootstrap_tpm if TRUE, also summarise the bootstraps in TPM
+# @param gene_mode if TRUE, aggregate targets using 'mappings'
+# @param extra_bootstrap_summary if TRUE, keep quantiles of the counts
+# @param target_id target ids used to label the columns of the matrices
+# @param mappings table merged on 'target_id' that holds 'aggregation_column'
+# @param which_ids columns of the transformed matrix to compute variances for
+# @param aggregation_column name of the column of 'mappings' to aggregate by
+# @param transform_fun function applied to the bootstrap matrix
+# @return a list with 'index' (i), 'bs_quants' (list of quantile matrices,
+#   possibly empty) and 'bootstrap_result' (per-target variances)
+process_bootstrap <- function(i, samp_name, kal_path,
+                              num_transcripts, est_count_sf,
+                              read_bootstrap_tpm, gene_mode,
+                              extra_bootstrap_summary,
+                              target_id, mappings, which_ids,
+                              aggregation_column, transform_fun)
+{
+  dot(i)
+  bs_quants <- list()
+
+  num_bootstrap <- as.integer(rhdf5::h5read(kal_path$path,
+                                            "aux/num_bootstrap"))
+  if (num_bootstrap == 0) {
+    # report the file path itself; 'kal_path' is a list-like object
+    stop(paste0("File ", kal_path$path, " has no bootstraps. ",
+                "Please generate bootstraps using \"kallisto quant -b\"."))
+  }
+
+  # TODO: only perform operations on filtered transcripts
+  eff_len <- rhdf5::h5read(kal_path$path, "aux/eff_lengths")
+  bs_mat <- read_bootstrap_mat(fname = kal_path$path,
+                               num_bootstraps = num_bootstrap,
+                               num_transcripts = num_transcripts,
+                               est_count_sf = est_count_sf)
+
+  if (read_bootstrap_tpm) {
+    # counts_to_tpm is applied to each row of bs_mat; aperm() transposes the
+    # result back so that rows are bootstraps again
+    bs_quant_tpm <- aperm(apply(bs_mat, 1, counts_to_tpm,
+                                eff_len))
+
+    # gene level code is analogous here to below code
+    if (gene_mode) {
+      colnames(bs_quant_tpm) <- target_id
+      # Make bootstrap_num an explicit column; each is treated as a "sample"
+      bs_tpm_df <- data.frame(bootstrap_num = seq_len(num_bootstrap),
+                              bs_quant_tpm, check.names = FALSE)
+      rm(bs_quant_tpm)
+      # Make long tidy table; this step is much faster
+      # using data.table melt rather than tidyr gather
+      tidy_tpm <- data.table::melt(bs_tpm_df, id.vars = "bootstrap_num",
+                                   variable.name = "target_id",
+                                   value.name = "tpm")
+      tidy_tpm <- data.table::as.data.table(tidy_tpm)
+      rm(bs_tpm_df)
+      tidy_tpm$target_id <- as.character(tidy_tpm$target_id)
+      tidy_tpm <- merge(tidy_tpm, mappings, by = "target_id",
+                        all.x = TRUE)
+      # Data.table dcast uses non-standard evaluation
+      # So quote the full casting formula to make sure
+      # "aggregation_column" is interpreted as a variable
+      # see: http://stackoverflow.com/a/31295592
+      quant_tpm_formula <- paste("bootstrap_num ~",
+                                 aggregation_column)
+      bs_quant_tpm <- data.table::dcast(tidy_tpm,
+                                        quant_tpm_formula, value.var = "tpm",
+                                        fun.aggregate = sum)
+      bs_quant_tpm <- as.matrix(bs_quant_tpm[, -1])
+      rm(tidy_tpm) # these tables are very large
+    }
+    # quantile() defaults to the 0/25/50/75/100% points, hence five columns
+    bs_quant_tpm <- aperm(apply(bs_quant_tpm, 2,
+                                quantile))
+    colnames(bs_quant_tpm) <- c("min", "lower", "mid",
+                                "upper", "max")
+    bs_quants$tpm <- bs_quant_tpm
+  }
+
+  if (gene_mode) {
+    # I can combine target_id and eff_len
+    # I assume the order is the same, since it's read from the same kallisto
+    # file and each kallisto file has the same order
+    eff_len_df <- data.frame(target_id, eff_len,
+                             stringsAsFactors = FALSE)
+    # make bootstrap number an explicit column to facilitate melting
+    bs_df <- data.frame(bootstrap_num = seq_len(num_bootstrap),
+                        bs_mat, check.names = FALSE)
+    rm(bs_mat)
+    # data.table melt function is much faster than tidyr's gather function
+    # output is a long table with each bootstrap's value for each target_id
+    tidy_bs <- data.table::melt(bs_df, id.vars = "bootstrap_num",
+                                variable.name = "target_id",
+                                value.name = "est_counts")
+    rm(bs_df)
+    # not sure why, but the melt function always returns a factor,
+    # even when setting variable.factor = F, so I coerce target_id
+    tidy_bs$target_id <- as.character(tidy_bs$target_id)
+    # combine the long tidy table with eff_len and aggregation mappings
+    # note that bootstrap number is treated as "sample" here
+    # for backwards compatibility
+    tidy_bs <- dplyr::select(tidy_bs, target_id,
+                             est_counts, sample = bootstrap_num)
+    tidy_bs <- merge(data.table::as.data.table(tidy_bs),
+                     data.table::as.data.table(eff_len_df), by = "target_id",
+                     all.x = TRUE)
+    tidy_bs <- merge(tidy_bs, mappings, by = "target_id",
+                     all.x = TRUE)
+    # create the median effective length scaling factor for each gene
+    # NOTE(review): eval(parse()) is kept as-is to preserve the grouping
+    # behavior; consider a character 'by' once verified against data.table
+    scale_factor <- tidy_bs[, scale_factor := median(eff_len),
+                            by = eval(parse(text=aggregation_column))]
+    # use the old reads_per_base_transform method to get gene scaled counts
+    scaled_bs <- reads_per_base_transform(tidy_bs,
+                                          scale_factor$scale_factor,
+                                          aggregation_column,
+                                          mappings)
+    # this step undoes the tidying to get back a matrix format
+    # target_ids here are now the aggregation column ids
+    bs_mat <- data.table::dcast(scaled_bs, sample ~ target_id,
+                                value.var = "scaled_reads_per_base")
+    # this now has the same format as the transcript matrix
+    # but it uses gene ids
+    bs_mat <- as.matrix(bs_mat[, -1])
+    rm(tidy_bs, scaled_bs)
+  }
+
+  if (extra_bootstrap_summary) {
+    bs_quant_est_counts <- aperm(apply(bs_mat, 2,
+                                       quantile))
+    colnames(bs_quant_est_counts) <- c("min", "lower",
+                                       "mid", "upper", "max")
+    bs_quants$est_counts <- bs_quant_est_counts
+  }
+
+  bs_mat <- transform_fun(bs_mat)
+  # If bs_mat was made at gene-level, already has column names
+  # If at transcript-level, need to add target_ids
+  if (!gene_mode) {
+    colnames(bs_mat) <- target_id
+  } else {
+    # rename est_counts to scaled_reads_per_base
+    bs_quants$scaled_reads_per_base <- bs_quants$est_counts
+    bs_quants$est_counts <- NULL
+  }
+  # all_sample_bootstrap[, i] bootstrap point estimate of the inferential
+  # variability in sample i
+  # NOTE: we are only keeping the ones that pass the filter
+  # drop = FALSE keeps a matrix even when a single id passes the filter,
+  # which colVars() requires
+  bootstrap_result <- matrixStats::colVars(bs_mat[, which_ids, drop = FALSE])
+
+  list(index = i, bs_quants = bs_quants, bootstrap_result = bootstrap_result)
+}
@@ -1,8 +1,8 @@
 propagate_transcript_filter <- function(filter_df, target_mapping,
   grouping_column) {
 
-  filtered_target_mapping <- dplyr::inner_join(as.data.table(filter_df),
-    as.data.table(target_mapping), by = 'target_id')
+  filtered_target_mapping <- dplyr::inner_join(as.data.table(filter_df), # nolint
+    as.data.table(target_mapping), by = 'target_id') # nolint
 
   filtered_target_mapping <- dplyr::select_(filtered_target_mapping,
     grouping_column)
@@ -12,3 +12,18 @@ propagate_transcript_filter <- function(filter_df, target_mapping,
 
   filtered_target_mapping
 }
+
+# Reconcile the requested units with the aggregation mode of a sleuth object.
+#
+# In gene mode 'est_counts' is replaced by 'scaled_reads_per_base'; outside of
+# gene mode 'scaled_reads_per_base' is replaced by 'est_counts'. A warning is
+# issued whenever the units are changed. Any other value is returned as is.
+#
+# @param obj a sleuth object; its 'gene_mode' element is consulted
+# @param units a character vector of length 1 with the requested units
+# @return the units that match the mode of 'obj'
+check_quant_mode <- function(obj, units) {
+  stopifnot( is(obj, 'sleuth') )
+  if (obj$gene_mode && units == 'est_counts') {
+    warning(paste("your sleuth object is in gene mode,",
+                  "but you selected 'est_counts'. Selecting 'scaled_reads_per_base'..."))
+    units <- 'scaled_reads_per_base'
+  } else if (!obj$gene_mode && units == 'scaled_reads_per_base') {
+    warning(paste("your sleuth object is not in gene mode,",
+                  "but you selected 'scaled_reads_per_base'. Selecting 'est_counts'..."))
+    # fall back to the units that exist at transcript level, as the warning says
+    units <- 'est_counts'
+  }
+
+  units
+}
@@ -66,6 +66,16 @@ sleuth_lrt <- function(obj, null_model, alt_model) {
   model_exists(obj, null_model)
   model_exists(obj, alt_model)
 
+  if(!obj$fits[[alt_model]]$transform_synced) {
+    stop("Model '", alt_model, "' was not computed using the sleuth object's",
+         " current transform function. Please rerun sleuth_fit for this model.")
+  }
+
+  if(!obj$fits[[null_model]]$transform_synced) {
+    stop("Model '", null_model, "' was not computed using the sleuth object's",
+         " current transform function. Please rerun sleuth_fit for this model.")
+  }
+
   if ( !likelihood_exists(obj, null_model) ) {
     obj <- compute_likelihood(obj, null_model)
   }
@@ -90,7 +100,7 @@ sleuth_lrt <- function(obj, null_model, alt_model) {
     test_stat = test_statistic, pval = p_value)
   result <- dplyr::mutate(result, qval = p.adjust(pval, method = "BH"))
   model_info <- data.table::data.table(obj$fits[[null_model]]$summary)
-  model_info <- dplyr::select(model_info, -c(x_group, iqr))
+  model_info <- dplyr::select(model_info, -c(iqr))
   result <- dplyr::left_join(
     data.table::data.table(result),
     model_info,
Original file line number	Diff line number	Diff line change
`@@ -7,4 +7,4 @@ linters: with_defaults(`
`7`	`7`	`single_quotes_linter = NULL,`
`8`	`8`	`trailing_blank_lines_linter = NULL`
`9`	`9`	`)`
`10`		`-exclusions: list("R/hexamers.R", "inst/doc/intro.R")`
	`10`	`+exclusions: list("R/hexamers.R", "vignettes/intro.Rmd")`