Skip to content

Commit 4d16c35

Browse files
authored
Merge pull request #110 from pachterlab/devel
Release v0.29.0
2 parents 048f055 + 2ab00fb commit 4d16c35

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

70 files changed

+14677
-288
lines changed

.lintr

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,4 @@ linters: with_defaults(
77
single_quotes_linter = NULL,
88
trailing_blank_lines_linter = NULL
99
)
10-
exclusions: list("R/hexamers.R", "inst/doc/intro.R")
10+
exclusions: list("R/hexamers.R", "vignettes/intro.Rmd")

CHANGELOG.md

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# version 0.29.0
2+
3+
This version has numerous bug fixes and several performance upgrades.
4+
Most notably, memory usage has been decreased greatly by no longer storing the bootstraps in memory.
5+
Additionally, speed has been improved in numerous areas — particularly `sleuth_prep` — by changing several of the computations as well as changing the order of the parallelization (special thanks to [Warren McGee](https://github.com/warrenmcg) for his contributions to this).
6+
7+
Below is an incomplete list of new features:
8+
9+
- The full model no longer has to be specified in `sleuth_prep`.
10+
- A new function `extract_model` allow users to extract the effect sizes for a model in a tidy format similar to [broom](https://cran.r-project.org/web/packages/broom/vignettes/broom.html).
11+
- An arbitrary transformation can be specified/used in `sleuth_prep` (see argument `transformation_function`).
12+
13+
A big thanks to our users for fixing and reporting bugs.
14+
A special thanks to [Warren McGee](https://github.com/warrenmcg) for making several of the performance improvements as well as fixing several bugs.
15+
Below is a partial list of many of the upgrades and the pull requests by the community.
16+
17+
- [Memory overhaul to reduce overall usage](https://github.com/pachterlab/sleuth/pull/63) (@psturmfels)
18+
- [Bugfix to drop unused factors](https://github.com/pachterlab/sleuth/pull/71) (@roryk and @warrenmcg)
19+
- [Reduce memory footprint and improve parallelization](https://github.com/pachterlab/sleuth/pull/94) (@warrenmcg)
20+
- [Add gene annotations when using `sleuth_results`](https://github.com/pachterlab/sleuth/pull/95) (@warrenmcg)
21+
- [Improve sample name handling](https://github.com/pachterlab/sleuth/pull/96) (@warrenmcg)
22+
- [Reconcile memory overhaul and gene aggregation and allow arbitrary transformations](https://github.com/pachterlab/sleuth/pull/99) (@warrenmcg)
23+
- [Do not parallelize when in RStudio](https://github.com/pachterlab/sleuth/pull/108) (@warrenmcg)
24+
- [Remove warning in `sliding_window_grouping`](https://github.com/pachterlab/sleuth/pull/106) (@warrenmcg)
25+
- [Bug fix in `sleuth_live` in gene mode](https://github.com/pachterlab/sleuth/pull/107) (@warrenmcg)

DESCRIPTION

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: sleuth
22
Title: Tools for investigating RNA-Seq
3-
Version: 0.28.1
3+
Version: 0.29.0
44
Authors@R: c(person("Harold", "Pimentel", , "[email protected]", role = c("aut", "cre")))
55
Description: Investigate transcript abundance from "kallisto" and differential
66
expression analysis from RNA-Seq data.
@@ -20,6 +20,7 @@ Imports:
2020
rhdf5,
2121
parallel,
2222
lazyeval,
23+
matrixStats,
2324
shiny
2425
Suggests:
2526
MASS,

NAMESPACE

+12-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
# Generated by roxygen2 (4.1.1): do not edit by hand
1+
# Generated by roxygen2: do not edit by hand
22

3+
S3method("$<-",sleuth)
34
S3method(bias_table,kallisto)
45
S3method(bias_table,sleuth)
56
S3method(get_bootstraps,kallisto)
@@ -14,16 +15,20 @@ S3method(print,sleuth)
1415
S3method(print,sleuth_model)
1516
S3method(summary,sleuth)
1617
S3method(tests,sleuth)
18+
export("transform_fun<-")
1719
export(basic_filter)
1820
export(bias_table)
1921
export(bs_sigma_summary)
2022
export(counts_to_fpkm)
2123
export(counts_to_tpm)
2224
export(design_matrix)
2325
export(enclosed_brush)
26+
export(extract_model)
27+
export(get_bootstrap_summary)
2428
export(get_bootstraps)
2529
export(get_quantile)
2630
export(kallisto_table)
31+
export(log_transform)
2732
export(melt_bootstrap_sleuth)
2833
export(models)
2934
export(norm_factors)
@@ -46,19 +51,25 @@ export(read_kallisto)
4651
export(read_kallisto_h5)
4752
export(read_kallisto_tsv)
4853
export(shrink_df)
54+
export(sleuth_deploy)
4955
export(sleuth_fit)
5056
export(sleuth_gene_table)
5157
export(sleuth_live)
5258
export(sleuth_live_settings)
59+
export(sleuth_load)
5360
export(sleuth_lrt)
5461
export(sleuth_prep)
5562
export(sleuth_results)
63+
export(sleuth_save)
5664
export(sleuth_to_matrix)
5765
export(sleuth_wt)
5866
export(sliding_window_grouping)
5967
export(tests)
6068
export(tpm_to_alpha)
6169
export(transcripts_from_gene)
70+
export(transform_status)
71+
export(transform_status.sleuth)
72+
export(transform_status.sleuth_model)
6273
import(dplyr)
6374
importFrom(data.table,fread)
6475
importFrom(lazyeval,interp)

R/bootstrap.R

+188-5
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,11 @@ bootstrap2mat <- function(kal, column = "tpm") {
4141

4242
#' Extract bootstrap for a specific transcript
4343
#'
44-
#' Extract bootstrap for a specific transcript
44+
#' Extract bootstrap for a specific transcript.
45+
#' CURRENTLY DEPRECATED: Will probably be re implemented in the next version.
46+
#' Currently not working because of a complete rewrite of the bootstrap code.
47+
#' If you are interested in getting the bootstraps, you can manually write some code
48+
#' using `read_kallisto()`. Please make sure to comment in the user group if you are using this function.
4549
#'
4650
#' @param obj an object
4751
#' @param ... arguments passed to other functions
@@ -145,7 +149,7 @@ aggregate_bootstrap <- function(kal, mapping, split_by = "gene_id",
145149

146150
if ( any(!complete.cases(mapping)) ) {
147151
warning("Found some NAs in mapping. Removing them.")
148-
mapping <- mapping[complete.cases(mapping),]
152+
mapping <- mapping[complete.cases(mapping), ]
149153
}
150154

151155
m_bs <- melt_bootstrap(kal, column)
@@ -236,6 +240,47 @@ normalize_bootstrap <- function(kal, tpm_size_factor, est_counts_size_factor) {
236240
kal
237241
}
238242

243+
#' bootstrap summary
244+
#'
245+
#' Extract the bootstrap summary from a sleuth object that has been initialized in sleuth_prep.
246+
#'
247+
#' @param obj a \code{sleuth} object such that \code{extra_bootstrap_summary = TRUE} inside of \code{\link{sleuth_prep}}.
248+
#' @param target_id a character vector of length 1 indicating the target_id (transcript or gene name depending on aggregation mode)
249+
#' @param units a character vector of either 'est_counts' or 'tpm' (also requires \code{extra_bootstrap_summary = TRUE} in \code{\link{sleuth_prep}})
250+
#' @return a \code{data.frame} with the summary statistics across all samples for that particular target
251+
#' @export
252+
get_bootstrap_summary <- function(obj, target_id, units = 'est_counts') {
253+
stopifnot( is(obj, 'sleuth') )
254+
255+
if (units != 'est_counts' && units != 'tpm' && units != 'scaled_reads_per_base') {
256+
stop(paste0("'", units, "' is invalid for 'units'. please see documentation"))
257+
}
258+
259+
if (is.null(obj$bs_quants)) {
260+
if (units == 'est_counts') {
261+
stop("bootstrap summary missing. rerun sleuth_prep() with argument 'extra_bootstrap_summary = TRUE'")
262+
} else {
263+
stop("bootstrap summary missing. rerun sleuth_prep() with argument 'extra_bootstrap_summary = TRUE' and 'read_bootstrap_tpm = TRUE'")
264+
}
265+
}
266+
267+
if (!(target_id %in% rownames(obj$bs_quants[[1]][[units]]))) {
268+
stop(paste0("couldn't find target_id '", target_id, "'"))
269+
}
270+
271+
df <- as_df(
272+
do.call(rbind,
273+
lapply(obj$bs_quants,
274+
function(sample_bs) {
275+
sample_bs[[units]][target_id, ]
276+
})
277+
)
278+
)
279+
df <- dplyr::bind_cols(df, obj$sample_to_covariates)
280+
281+
df
282+
}
283+
239284

240285
# Sample bootstraps
241286
#
@@ -275,8 +320,8 @@ sample_bootstrap <- function(obj, n_samples = 100L) {
275320
# matrix sample
276321
for (s in 1:n_samples) {
277322
for (idx in 1:nrow(which_samp)) {
278-
b <- which_samp[idx,s]
279-
sample_mat[[s]][,idx] <- obj$kal[[idx]]$bootstrap[[b]]$est_counts
323+
b <- which_samp[idx, s]
324+
sample_mat[[s]][, idx] <- obj$kal[[idx]]$bootstrap[[b]]$est_counts
280325
}
281326
}
282327

@@ -318,9 +363,147 @@ dcast_bootstrap.kallisto <- function(obj, units, nsamples = NULL) {
318363
mat <- matrix(NA_real_, nrow = n_features, ncol = length(which_bs))
319364

320365
for (j in seq_along(which_bs)) {
321-
mat[ ,j] <- obj[[ "bootstrap" ]][[which_bs[j]]][[ units ]]
366+
mat[, j] <- obj[[ "bootstrap" ]][[which_bs[j]]][[ units ]]
322367
}
323368
rownames(mat) <- obj[["bootstrap"]][[1]][["target_id"]]
324369

325370
mat
326371
}
372+
373+
# Function to process bootstraps for parallelization
374+
process_bootstrap <- function(i, samp_name, kal_path,
375+
num_transcripts, est_count_sf,
376+
read_bootstrap_tpm, gene_mode,
377+
extra_bootstrap_summary,
378+
target_id, mappings, which_ids,
379+
aggregation_column, transform_fun)
380+
{
381+
dot(i)
382+
bs_quants <- list()
383+
384+
num_bootstrap <- as.integer(rhdf5::h5read(kal_path$path,
385+
"aux/num_bootstrap"))
386+
if (num_bootstrap == 0) {
387+
stop(paste0("File ", kal_path, " has no bootstraps.",
388+
"Please generate bootstraps using \"kallisto quant -b\"."))
389+
}
390+
391+
# TODO: only perform operations on filtered transcripts
392+
eff_len <- rhdf5::h5read(kal_path$path, "aux/eff_lengths")
393+
bs_mat <- read_bootstrap_mat(fname = kal_path$path,
394+
num_bootstraps = num_bootstrap,
395+
num_transcripts = num_transcripts,
396+
est_count_sf = est_count_sf)
397+
398+
if (read_bootstrap_tpm) {
399+
bs_quant_tpm <- aperm(apply(bs_mat, 1, counts_to_tpm,
400+
eff_len))
401+
402+
# gene level code is analogous here to below code
403+
if (gene_mode) {
404+
colnames(bs_quant_tpm) <- target_id
405+
# Make bootstrap_num an explicit column; each is treated as a "sample"
406+
bs_tpm_df <- data.frame(bootstrap_num = c(1:num_bootstrap),
407+
bs_quant_tpm, check.names = F)
408+
rm(bs_quant_tpm)
409+
# Make long tidy table; this step is much faster
410+
# using data.table melt rather than tidyr gather
411+
tidy_tpm <- data.table::melt(bs_tpm_df, id.vars = "bootstrap_num",
412+
variable.name = "target_id",
413+
value.name = "tpm")
414+
tidy_tpm <- data.table::as.data.table(tidy_tpm)
415+
rm(bs_tpm_df)
416+
tidy_tpm$target_id <- as.character(tidy_tpm$target_id)
417+
tidy_tpm <- merge(tidy_tpm, mappings, by = "target_id",
418+
all.x = T)
419+
# Data.table dcast uses non-standard evaluation
420+
# So quote the full casting formula to make sure
421+
# "aggregation_column" is interpreted as a variable
422+
# see: http://stackoverflow.com/a/31295592
423+
quant_tpm_formula <- paste("bootstrap_num ~",
424+
aggregation_column)
425+
bs_quant_tpm <- data.table::dcast(tidy_tpm,
426+
quant_tpm_formula, value.var = "tpm",
427+
fun.aggregate = sum)
428+
bs_quant_tpm <- as.matrix(bs_quant_tpm[, -1])
429+
rm(tidy_tpm) # these tables are very large
430+
}
431+
bs_quant_tpm <- aperm(apply(bs_quant_tpm, 2,
432+
quantile))
433+
colnames(bs_quant_tpm) <- c("min", "lower", "mid",
434+
"upper", "max")
435+
bs_quants$tpm <- bs_quant_tpm
436+
}
437+
438+
if (gene_mode) {
439+
# I can combine target_id and eff_len
440+
# I assume the order is the same, since it's read from the same kallisto
441+
# file and each kallisto file has the same order
442+
eff_len_df <- data.frame(target_id, eff_len,
443+
stringsAsFactors = F)
444+
# make bootstrap number an explicit column to facilitate melting
445+
bs_df <- data.frame(bootstrap_num = c(1:num_bootstrap),
446+
bs_mat, check.names = F)
447+
rm(bs_mat)
448+
# data.table melt function is much faster than tidyr's gather function
449+
# output is a long table with each bootstrap's value for each target_id
450+
tidy_bs <- data.table::melt(bs_df, id.vars = "bootstrap_num",
451+
variable.name = "target_id",
452+
value.name = "est_counts")
453+
rm(bs_df)
454+
# not sure why, but the melt function always returns a factor,
455+
# even when setting variable.factor = F, so I coerce target_id
456+
tidy_bs$target_id <- as.character(tidy_bs$target_id)
457+
# combine the long tidy table with eff_len and aggregation mappings
458+
# note that bootstrap number is treated as "sample" here
459+
# for backwards compatibility
460+
tidy_bs <- dplyr::select(tidy_bs, target_id,
461+
est_counts, sample = bootstrap_num)
462+
tidy_bs <- merge(data.table::as.data.table(tidy_bs),
463+
data.table::as.data.table(eff_len_df), by = "target_id",
464+
all.x = TRUE)
465+
tidy_bs <- merge(tidy_bs, mappings, by = "target_id",
466+
all.x = TRUE)
467+
# create the median effective length scaling factor for each gene
468+
scale_factor <- tidy_bs[, scale_factor := median(eff_len),
469+
by = eval(parse(text=aggregation_column))]
470+
# use the old reads_per_base_transform method to get gene scaled counts
471+
scaled_bs <- reads_per_base_transform(tidy_bs,
472+
scale_factor$scale_factor,
473+
aggregation_column,
474+
mappings)
475+
# this step undoes the tidying to get back a matrix format
476+
# target_ids here are now the aggregation column ids
477+
bs_mat <- data.table::dcast(scaled_bs, sample ~ target_id,
478+
value.var = "scaled_reads_per_base")
479+
# this now has the same format as the transcript matrix
480+
# but it uses gene ids
481+
bs_mat <- as.matrix(bs_mat[, -1])
482+
rm(tidy_bs, scaled_bs)
483+
}
484+
485+
if (extra_bootstrap_summary) {
486+
bs_quant_est_counts <- aperm(apply(bs_mat, 2,
487+
quantile))
488+
colnames(bs_quant_est_counts) <- c("min", "lower",
489+
"mid", "upper", "max")
490+
bs_quants$est_counts <- bs_quant_est_counts
491+
}
492+
493+
bs_mat <- transform_fun(bs_mat)
494+
# If bs_mat was made at gene-level, already has column names
495+
# If at transcript-level, need to add target_ids
496+
if(!gene_mode) {
497+
colnames(bs_mat) <- target_id
498+
} else {
499+
# rename est_counts to scaled_reads_per_base
500+
bs_quants$scaled_reads_per_base <- bs_quants$est_counts
501+
bs_quants$est_counts <- NULL
502+
}
503+
# all_sample_bootstrap[, i] bootstrap point estimate of the inferential
504+
# variability in sample i
505+
# NOTE: we are only keeping the ones that pass the filter
506+
bootstrap_result <- matrixStats::colVars(bs_mat[, which_ids])
507+
508+
list(index = i, bs_quants = bs_quants, bootstrap_result = bootstrap_result)
509+
}

R/gene_analysis.R

+17-2
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
propagate_transcript_filter <- function(filter_df, target_mapping,
22
grouping_column) {
33

4-
filtered_target_mapping <- dplyr::inner_join(as.data.table(filter_df),
5-
as.data.table(target_mapping), by = 'target_id')
4+
filtered_target_mapping <- dplyr::inner_join(as.data.table(filter_df), # nolint
5+
as.data.table(target_mapping), by = 'target_id') # nolint
66

77
filtered_target_mapping <- dplyr::select_(filtered_target_mapping,
88
grouping_column)
@@ -12,3 +12,18 @@ propagate_transcript_filter <- function(filter_df, target_mapping,
1212

1313
filtered_target_mapping
1414
}
15+
16+
check_quant_mode <- function(obj, units) {
17+
stopifnot( is(obj, 'sleuth') )
18+
if (obj$gene_mode & units == 'est_counts') {
19+
warning(paste("your sleuth object is in gene mode,",
20+
"but you selected 'est_counts'. Selecting 'scaled_reads_per_base'..."))
21+
units <- 'scaled_reads_per_base'
22+
} else if (!obj$gene_mode & units == 'scaled_reads_per_base') {
23+
warning(paste("your sleuth object is not in gene mode,",
24+
"but you selected 'scaled_reads_per_base'. Selecting 'est_counts'..."))
25+
units <- 'scaled_reads_per_base'
26+
}
27+
28+
units
29+
}

R/likelihood.R

+11-1
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,16 @@ sleuth_lrt <- function(obj, null_model, alt_model) {
6666
model_exists(obj, null_model)
6767
model_exists(obj, alt_model)
6868

69+
if(!obj$fits[[alt_model]]$transform_synced) {
70+
stop("Model '", alt_model, "' was not computed using the sleuth object's",
71+
" current transform function. Please rerun sleuth_fit for this model.")
72+
}
73+
74+
if(!obj$fits[[null_model]]$transform_synced) {
75+
stop("Model '", null_model, "' was not computed using the sleuth object's",
76+
" current transform function. Please rerun sleuth_fit for this model.")
77+
}
78+
6979
if ( !likelihood_exists(obj, null_model) ) {
7080
obj <- compute_likelihood(obj, null_model)
7181
}
@@ -90,7 +100,7 @@ sleuth_lrt <- function(obj, null_model, alt_model) {
90100
test_stat = test_statistic, pval = p_value)
91101
result <- dplyr::mutate(result, qval = p.adjust(pval, method = "BH"))
92102
model_info <- data.table::data.table(obj$fits[[null_model]]$summary)
93-
model_info <- dplyr::select(model_info, -c(x_group, iqr))
103+
model_info <- dplyr::select(model_info, -c(iqr))
94104
result <- dplyr::left_join(
95105
data.table::data.table(result),
96106
model_info,

0 commit comments

Comments
 (0)