Commit ed5ad30

Merge pull request #74 from pachterlab/hjp/geneagg
Gene aggregation
2 parents a7e64fc + 2b86978; commit ed5ad30

20 files changed: +862 additions, -392 deletions

.lintr (+10)

@@ -0,0 +1,10 @@
+linters: with_defaults(
+  camel_case_linter = NULL,
+  commented_code_linter = NULL,
+  closed_curly_linter = NULL,
+  line_length_linter = line_length_linter(120),
+  object_usage_linter = NULL,
+  single_quotes_linter = NULL,
+  trailing_blank_lines_linter = NULL
+  )
+exclusions: list("R/hexamers.R", "inst/doc/intro.R")

CONTRIBUTING.md (+27)

@@ -0,0 +1,27 @@
+# contributing to `sleuth`
+
+Firstly -- thank you for being interested in helping us!
+What follows is a short document on how to contribute to `sleuth`.
+
+## development cycle
+
+The basic development cycle looks something like this:
+
+1. Make some modifications
+2. Make sure all of the tests work: `devtools::test()`
+3. Under most circumstances you should write your own tests
+
+## style
+
+Style is checked using [lintr](https://github.com/jimhester/lintr).
+The linters that are set can be found in the configuration file `.lintr`.
+Style can be checked for the entire project using `lintr::lint_package()`.
+
+**All proposed code must pass the lint tests**; otherwise it will not be accepted into the main branch.
+
+## types of pull requests
+
+Please note that this project is still in its alpha stage and the statistics are constantly being developed while the paper is being written.
+Because of this, we are currently not accepting changes to the statistics.
+If you are interested in adding features, please check with us first by opening a GitHub issue.
+Thanks!
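
For reference, the pre-submission checks described above amount to two calls run from the package root. This is a minimal sketch, assuming the devtools and lintr packages are installed locally:

    # run from the root of the sleuth source tree
    library(devtools)
    library(lintr)

    devtools::test()       # run the test suite; all tests must pass
    lintr::lint_package()  # check style against the linters configured in .lintr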

DESCRIPTION (+3, -1)

@@ -1,6 +1,6 @@
 Package: sleuth
 Title: Tools for investigating RNA-Seq
-Version: 0.28.0
+Version: 0.28.1
 Authors@R: c(person("Harold", "Pimentel", , "[email protected]", role = c("aut", "cre")))
 Description: Investigate transcript abundance from "kallisto" and differential
     expression analysis from RNA-Seq data.
@@ -18,10 +18,12 @@ Imports:
     tidyr,
     reshape2,
     rhdf5,
+    parallel,
     lazyeval,
     shiny
 Suggests:
     MASS,
+    lintr,
     testthat,
     knitr
 VignetteBuilder: knitr

R/bootstrap.R (+7, -14)

@@ -23,16 +23,14 @@
 #
 # @param kal a kallisto object with non-null member \code{bootstrap}
 # @return a matrix with rownames equal to target_id
-bootstrap2mat <- function(kal, column = "tpm")
-{
+bootstrap2mat <- function(kal, column = "tpm") {
   stopifnot(is(kal, "kallisto"))
   # TODO: check if "column" is a valid kallisto column

   # assumes that all bootstrap samples are in same order (from read_kallisto)

   all_boot <- kal$bootstrap
-  mat <- matrix(unlist(lapply(all_boot, function(bs)
-    {
+  mat <- matrix(unlist(lapply(all_boot, function(bs) {
     bs[column]
   })), nrow = nrow(all_boot[[1]]))

@@ -100,8 +98,7 @@ get_bootstraps.kallisto <- function(kal, transcript, max_bs = 30) {
 # @param column the column to pull out of the kallisto results (default = "tpm")
 # @return a molten data.frame with columns "target_id", "sample" and the selected variable
 # @export
-melt_bootstrap <- function(kal, column = "tpm", transform = identity)
-{
+melt_bootstrap <- function(kal, column = "tpm", transform = identity) {
   stopifnot(is(kal, "kallisto"))
   stopifnot(length(kal$bootstrap) > 0)

@@ -177,8 +174,7 @@ aggregate_bootstrap <- function(kal, mapping, split_by = "gene_id",
 # @param column the column to select (rho, tpm, est_counts
 # @return a summarized data.frame
 # @export
-summarize_bootstrap <- function(kal, column = "tpm", transform = identity)
-{
+summarize_bootstrap <- function(kal, column = "tpm", transform = identity) {
   stopifnot(is(kal, "kallisto"))
   bs <- melt_bootstrap(kal, column, transform)

@@ -227,8 +223,7 @@ normalize_bootstrap <- function(kal, tpm_size_factor, est_counts_size_factor) {
     stopifnot(length(est_counts_size_factor) == 1)
   }

-  bs <- lapply(kal$bootstrap, function(bs_tbl)
-    {
+  bs <- lapply(kal$bootstrap, function(bs_tbl) {
     if (calc_norm_tpm)
       bs_tbl$tpm <- bs_tbl$tpm / tpm_size_factor
     if (calc_norm_counts)
@@ -259,8 +254,7 @@ sample_bootstrap <- function(obj, n_samples = 100L) {
   }

   which_samp <- lapply(seq_along(n_bs_per_samp),
-    function(i)
-    {
+    function(i) {
       cur_n <- n_bs_per_samp[i]
       sample.int(cur_n, n_samples, replace = TRUE)
     })
@@ -269,8 +263,7 @@ sample_bootstrap <- function(obj, n_samples = 100L) {

   # allocate the matrices
   sample_mat <- lapply(1:n_samples,
-    function(discard)
-    {
+    function(discard) {
       mat <- matrix(NA_real_, nrow = nrow(obj$kal[[1]]$abundance),
         ncol = nrow(which_samp))
       rownames(mat) <- obj$kal[[1]]$abundance$target_id

R/gene_analysis.R (+14)

@@ -0,0 +1,14 @@
+propagate_transcript_filter <- function(filter_df, target_mapping,
+  grouping_column) {
+
+  filtered_target_mapping <- dplyr::inner_join(as.data.table(filter_df),
+    as.data.table(target_mapping), by = 'target_id')
+
+  filtered_target_mapping <- dplyr::select_(filtered_target_mapping,
+    grouping_column)
+
+  data.table::setnames(filtered_target_mapping, grouping_column, 'target_id')
+  filtered_target_mapping <- dplyr::distinct(filtered_target_mapping)
+
+  filtered_target_mapping
+}
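
The new helper is not exported in this commit, so the triple-colon call below is for illustration only, and the toy data frames are assumptions rather than part of the commit. It sketches the intended operation: a transcript-level filter is joined to a transcript-to-gene mapping, reduced to the distinct gene identifiers that survive the filter, and returned under the column name target_id.

    # hypothetical inputs (not from the commit)
    filter_df <- data.frame(target_id = c("tx1", "tx2"))
    target_mapping <- data.frame(
      target_id = c("tx1", "tx2", "tx3"),
      ens_gene  = c("geneA", "geneA", "geneB"))

    # expected result: a single-column table with target_id == "geneA"
    sleuth:::propagate_transcript_filter(filter_df, target_mapping, "ens_gene")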

R/kallisto.R (+2, -1)

@@ -62,7 +62,8 @@ bias_table.sleuth <- function(obj, sample) {
 #' @export
 bias_table.kallisto <- function(obj) {
   if ( length(obj$fld) == 1 && all(is.na(obj$fld)) ) {
-    stop("kallisto object does not contain the fragment length distribution. Please rerun with a new version of kallisto.")
+    stop("kallisto object does not contain the fragment length distribution.",
+      "Please rerun with a new version of kallisto.")
   }

   adf(

R/measurement_error.R (+120, -9)

@@ -78,8 +78,9 @@ sleuth_fit <- function(obj, formula = NULL, fit_name = NULL, ...) {
     X <- formula
   }
   rownames(X) <- obj$sample_to_covariates$sample
-  A <- solve( t(X) %*% X )
+  A <- solve(t(X) %*% X)

+  msg("fitting measurement error models")
   mes <- me_model_by_row(obj, X, obj$bs_summary)
   tid <- names(mes)

@@ -109,13 +110,12 @@ sleuth_fit <- function(obj, formula = NULL, fit_name = NULL, ...) {
   l_smooth <- dplyr::mutate(l_smooth,
     smooth_sigma_sq_pmax = pmax(smooth_sigma_sq, sigma_sq))

-
   msg('computing variance of betas')
   beta_covars <- lapply(1:nrow(l_smooth),
     function(i) {
       row <- l_smooth[i,]
       with(row,
-        covar_beta(smooth_sigma_sq_pmax + sigma_q_sq, X, A)
+        covar_beta(smooth_sigma_sq_pmax + sigma_q_sq, X, A)
         )
     })
   names(beta_covars) <- l_smooth$target_id
@@ -266,8 +266,7 @@ me_model_by_row <- function(obj, design, bs_summary) {
   stopifnot( length(bs_summary$sigma_q_sq) == nrow(bs_summary$obs_counts))

   models <- lapply(1:nrow(bs_summary$obs_counts),
-    function(i)
-    {
+    function(i) {
      me_model(design, bs_summary$obs_counts[i,], bs_summary$sigma_q_sq[i])
     })
   names(models) <- rownames(bs_summary$obs_counts)
@@ -305,7 +304,7 @@ me_heteroscedastic_by_row <- function(obj, design, samp_bs_summary, obs_counts)
   models <- lapply(1:nrow(bs_summary$obs_counts),
     function(i) {
       res <- me_white_model(design, obs_counts[i,], sigma_q_sq[i,], A)
-      res$df$target_id = rownames(obs_counts)[i]
+      res$df$target_id <- rownames(obs_counts)[i]
       res
     })
   names(models) <- rownames(obs_counts)
@@ -354,8 +353,14 @@ me_white_var <- function(df, sigma_col, sigma_q_col, X, tXX_inv) {
   res
 }

+
+
 #' @export
-bs_sigma_summary <- function(obj, transform = identity) {
+bs_sigma_summary <- function(obj, transform = identity, norm_by_length = FALSE) {
+  # if (norm_by_length) {
+  #   scaling_factor <- get_scaling_factors(obj$obs_raw)
+  #   reads_per_base_transform()
+  # }
   obs_counts <- obs_to_matrix(obj, "est_counts")
   obs_counts <- transform( obs_counts )

@@ -373,8 +378,114 @@ bs_sigma_summary <- function(obj, transform = identity) {
   list(obs_counts = obs_counts, sigma_q_sq = bs_sigma)
 }

-me_model <- function(X, y, sigma_q_sq)
-{
+# transform reads into reads per base
+#
+#
+reads_per_base_transform <- function(reads_table, scale_factor_input,
+  collapse_column = NULL,
+  mapping = NULL,
+  norm_by_length = TRUE) {
+
+  if (is(scale_factor_input, 'data.frame')) {
+    # message('USING NORMALIZATION BY EFFECTIVE LENGTH')
+    # browser()
+    reads_table <- dplyr::left_join(
+      data.table::as.data.table(reads_table),
+      data.table::as.data.table(dplyr::select(scale_factor_input, target_id, sample, scale_factor)),
+      by = c('sample', 'target_id'))
+  } else {
+    reads_table <- dplyr::mutate(reads_table, scale_factor = scale_factor_input)
+  }
+  # browser()
+  reads_table <- dplyr::mutate(reads_table,
+    reads_per_base = est_counts / eff_len,
+    scaled_reads_per_base = scale_factor * reads_per_base
+    )
+
+  reads_table <- data.table::as.data.table(reads_table)
+
+  if (!is.null(collapse_column)) {
+    mapping <- data.table::as.data.table(mapping)
+    # old stuff
+    if (!(collapse_column %in% colnames(reads_table))) {
+      reads_table <- dplyr::left_join(reads_table, mapping, by = 'target_id')
+    }
+    # browser()
+    # reads_table <- dplyr::left_join(reads_table, mapping, by = 'target_id')

+    rows_to_remove <- !is.na(reads_table[[collapse_column]])
+    reads_table <- dplyr::filter(reads_table, rows_to_remove)
+    if ('sample' %in% colnames(reads_table)) {
+      reads_table <- dplyr::group_by_(reads_table, 'sample', collapse_column)
+    } else {
+      reads_table <- dplyr::group_by_(reads_table, collapse_column)
+    }
+
+    reads_table <- dplyr::summarize(reads_table,
+      scaled_reads_per_base = sum(scaled_reads_per_base))
+    data.table::setnames(reads_table, collapse_column, 'target_id')
+  }
+
+  as_df(reads_table)
+}
+
+gene_summary <- function(obj, which_column, transform = identity, norm_by_length = TRUE) {
+  # stopifnot(is(obj, 'sleuth'))
+  msg(paste0('aggregating by column: ', which_column))
+  obj_mod <- obj
+  if (norm_by_length) {
+    tmp <- obj$obs_raw
+    # tmp <- as.data.table(tmp)
+    tmp <- dplyr::left_join(
+      data.table::as.data.table(tmp),
+      data.table::as.data.table(obj$target_mapping),
+      by = 'target_id')
+    tmp <- dplyr::group_by_(tmp, 'sample', which_column)
+    scale_factor <- dplyr::mutate(tmp, scale_factor = median(eff_len))
+  } else {
+    scale_factor <- median(obj_mod$obs_norm_filt$eff_len)
+  }
+  # scale_factor <- median(obj_mod$obs_norm_filt$eff_len)
+  obj_mod$obs_norm_filt <- reads_per_base_transform(obj_mod$obs_norm_filt,
+    scale_factor, which_column, obj$target_mapping, norm_by_length)
+  obj_mod$obs_norm <- reads_per_base_transform(obj_mod$obs_norm,
+    scale_factor, which_column, obj$target_mapping, norm_by_length)
+
+  obs_counts <- obs_to_matrix(obj_mod, "scaled_reads_per_base")
+  obs_counts <- transform(obs_counts)
+
+  obj_mod$kal <- parallel::mclapply(seq_along(obj_mod$kal),
+    function(i) {
+      k <- obj_mod$kal[[i]]
+      current_sample <- obj_mod$sample_to_covariates$sample[i]
+      msg(paste('aggregating across sample: ', current_sample))
+      k$bootstrap <- lapply(k$bootstrap, function(b) {
+        b <- dplyr::mutate(b, sample = current_sample)
+        reads_per_base_transform(b, scale_factor, which_column,
+          obj$target_mapping, norm_by_length)
+      })
+
+      k
+    })
+
+  bs_summary <- sleuth_summarize_bootstrap_col(obj_mod, "scaled_reads_per_base",
+    transform)
+
+  bs_summary <- dplyr::group_by(bs_summary, target_id)
+  # FIXME: the column name 'bs_var_est_counts' is incorrect. should actually rename it above
+  bs_summary <- dplyr::summarise(bs_summary,
+    sigma_q_sq = mean(bs_var_scaled_reads_per_base))
+
+  bs_summary <- as_df(bs_summary)
+
+  bs_sigma <- bs_summary$sigma_q_sq
+  names(bs_sigma) <- bs_summary$target_id
+  bs_sigma <- bs_sigma[rownames(obs_counts)]
+
+  list(obs_counts = obs_counts, sigma_q_sq = bs_sigma)
+}
+
+me_model <- function(X, y, sigma_q_sq) {
   n <- nrow(X)
   degrees_free <- n - ncol(X)

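
The heart of the new gene aggregation is the quantity computed in reads_per_base_transform and consumed by gene_summary: each transcript's estimated counts are divided by its effective length, multiplied by a scale factor (with norm_by_length = TRUE, the median effective length of the transcripts in the group), and the scaled values are summed within a gene to give scaled_reads_per_base, which then plays the role est_counts plays at the transcript level. A minimal numeric sketch of that arithmetic follows; the toy numbers are invented for illustration, not taken from the commit:

    # two transcripts belonging to one gene, toy numbers
    est_counts <- c(tx1 = 100, tx2 = 50)
    eff_len    <- c(tx1 = 1000, tx2 = 500)

    scale_factor <- median(eff_len)          # 750: median effective length of the group
    reads_per_base <- est_counts / eff_len   # 0.1 and 0.1
    scaled_reads_per_base <- scale_factor * reads_per_base

    # gene-level value used in place of est_counts downstream
    sum(scaled_reads_per_base)               # 150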

R/misc.R (+1, -1)

@@ -107,7 +107,7 @@ jsd <- function(p, q) {
   p <- p / sum(p)
   q <- q / sum(q)

-  m <- (p + q)/2
+  m <- (p + q) / 2
   (kld(p, m) + kld(q, m)) / 2
 }


R/model.R (+10, -6)

@@ -111,7 +111,9 @@ get_test <- function(obj, label, type, model) {
   }

   if (is.null(res)) {
-    stop("'", label, "' is not a valid label for a test. Please see valid models and tests using the functions 'models' and 'tests'. Remember to also correctly specify the test type.")
+    stop("'", label, "' is not a valid label for a test.",
+      " Please see valid models and tests using the functions 'models' and 'tests'.",
+      " Remember to also correctly specify the test type.")
   }

   res
@@ -125,7 +127,9 @@ test_exists <- function(obj, label, type, model) {
     temp <- get_test(obj, label, type, model)
   }, error = function(e) {
     return(FALSE)
-  }, finally = function(x) {})
+  }, finally = function(x) {
+    # intentionally empty
+  })

   TRUE
 }
@@ -198,7 +202,7 @@ tests <- function(obj) {
 #' @export
 tests.sleuth <- function(obj, lrt = TRUE, wt = TRUE) {
   if ( lrt ) {
-    cat('~likelihood ratio tests:\n')
+    cat('~likelihood ratio tests:\n') # nolint
     cur_tests <- list_tests(obj, 'lrt')
     if (length(cur_tests) > 0) {
       for (test in cur_tests) {
@@ -214,7 +218,7 @@ tests.sleuth <- function(obj, lrt = TRUE, wt = TRUE) {
   }

   if ( wt ) {
-    cat('~wald tests:\n')
+    cat('~wald tests:\n') # nolint
     cur_tests <- list_tests(obj, 'wt')
     if (length(cur_tests) > 0) {
       for (i in 1:length(cur_tests)) {
@@ -311,7 +315,7 @@ sleuth_results <- function(obj, test, test_type = 'wt',
     )
   }

-  if (show_all) {
+  if (show_all && !obj$gene_mode) {
     tids <- adf(target_id = obj$kal[[1]]$abundance$target_id)
     res <- dplyr::left_join(
       data.table::as.data.table(tids),
@@ -320,7 +324,7 @@ sleuth_results <- function(obj, test, test_type = 'wt',
     )
   }

-  if ( !is.null(obj$target_mapping) ) {
+  if ( !is.null(obj$target_mapping) && !obj$gene_mode) {
     res <- dplyr::left_join(
       data.table::as.data.table(res),
       data.table::as.data.table(obj$target_mapping),
