Commit ed5ad30

Merge pull request #74 from pachterlab/hjp/geneagg
Gene aggregation
2 parents a7e64fc + 2b86978; commit ed5ad30

20 files changed: +862 additions, -392 deletions

.lintr (+10)

@@ -0,0 +1,10 @@
+linters: with_defaults(
+  camel_case_linter = NULL,
+  commented_code_linter = NULL,
+  closed_curly_linter = NULL,
+  line_length_linter = line_length_linter(120),
+  object_usage_linter = NULL,
+  single_quotes_linter = NULL,
+  trailing_blank_lines_linter = NULL
+  )
+exclusions: list("R/hexamers.R", "inst/doc/intro.R")

CONTRIBUTING.md (+27)

@@ -0,0 +1,27 @@
+# contributing to `sleuth`
+
+Firstly -- thank you for being interested in helping us!
+What follows is a short document on how to contribute to `sleuth`.
+
+## development cycle
+
+The basic development cycle looks something like this:
+
+1. Make some modifications
+2. Make sure all of the tests work: `devtools::test()`
+3. Under most circumstances you should write your own tests
+
+## style
+
+Style is checked using [lintr](https://github.com/jimhester/lintr).
+The linters that are set can be found in the configuration file `.lintr`.
+Style can be checked for the entire project using `lintr::lint_package()`.
+
+**All proposed code must pass the lint tests**; otherwise it will not be accepted into the main branch.
+
+## types of pull requests
+
+Please note that this project is still in its alpha stage and the statistics are constantly being developed while the paper is being written.
+Because of this, we are currently not accepting changes to the statistics.
+If you are interested in adding features, please check with us first by opening a GitHub issue.
+Thanks!
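
For reference, the pre-submission checks described above amount to two calls run from the package root. This is a minimal sketch, assuming the devtools and lintr packages are installed locally:

    # run from the root of the sleuth source tree
    library(devtools)
    library(lintr)

    devtools::test()       # run the test suite; all tests must pass
    lintr::lint_package()  # check style against the linters configured in .lintr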

DESCRIPTION (+3, -1)

@@ -1,6 +1,6 @@
 Package: sleuth
 Title: Tools for investigating RNA-Seq
-Version: 0.28.0
+Version: 0.28.1
 Authors@R: c(person("Harold", "Pimentel", , "[email protected]", role = c("aut", "cre")))
 Description: Investigate transcript abundance from "kallisto" and differential
     expression analysis from RNA-Seq data.
@@ -18,10 +18,12 @@ Imports:
     tidyr,
     reshape2,
     rhdf5,
+    parallel,
     lazyeval,
     shiny
 Suggests:
     MASS,
+    lintr,
     testthat,
     knitr
 VignetteBuilder: knitr

R/bootstrap.R (+7, -14)

@@ -23,16 +23,14 @@
 #
 # @param kal a kallisto object with non-null member \code{bootstrap}
 # @return a matrix with rownames equal to target_id
-bootstrap2mat <- function(kal, column = "tpm")
-{
+bootstrap2mat <- function(kal, column = "tpm") {
   stopifnot(is(kal, "kallisto"))
   # TODO: check if "column" is a valid kallisto column

   # assumes that all bootstrap samples are in same order (from read_kallisto)

   all_boot <- kal$bootstrap
-  mat <- matrix(unlist(lapply(all_boot, function(bs)
-    {
+  mat <- matrix(unlist(lapply(all_boot, function(bs) {
     bs[column]
   })), nrow = nrow(all_boot[[1]]))

@@ -100,8 +98,7 @@ get_bootstraps.kallisto <- function(kal, transcript, max_bs = 30) {
 # @param column the column to pull out of the kallisto results (default = "tpm")
 # @return a molten data.frame with columns "target_id", "sample" and the selected variable
 # @export
-melt_bootstrap <- function(kal, column = "tpm", transform = identity)
-{
+melt_bootstrap <- function(kal, column = "tpm", transform = identity) {
   stopifnot(is(kal, "kallisto"))
   stopifnot(length(kal$bootstrap) > 0)

@@ -177,8 +174,7 @@ aggregate_bootstrap <- function(kal, mapping, split_by = "gene_id",
 # @param column the column to select (rho, tpm, est_counts
 # @return a summarized data.frame
 # @export
-summarize_bootstrap <- function(kal, column = "tpm", transform = identity)
-{
+summarize_bootstrap <- function(kal, column = "tpm", transform = identity) {
   stopifnot(is(kal, "kallisto"))
   bs <- melt_bootstrap(kal, column, transform)

@@ -227,8 +223,7 @@ normalize_bootstrap <- function(kal, tpm_size_factor, est_counts_size_factor) {
     stopifnot(length(est_counts_size_factor) == 1)
   }

-  bs <- lapply(kal$bootstrap, function(bs_tbl)
-    {
+  bs <- lapply(kal$bootstrap, function(bs_tbl) {
     if (calc_norm_tpm)
       bs_tbl$tpm <- bs_tbl$tpm / tpm_size_factor
     if (calc_norm_counts)
@@ -259,8 +254,7 @@ sample_bootstrap <- function(obj, n_samples = 100L) {
   }

   which_samp <- lapply(seq_along(n_bs_per_samp),
-    function(i)
-    {
+    function(i) {
       cur_n <- n_bs_per_samp[i]
       sample.int(cur_n, n_samples, replace = TRUE)
     })
@@ -269,8 +263,7 @@ sample_bootstrap <- function(obj, n_samples = 100L) {

   # allocate the matrices
   sample_mat <- lapply(1:n_samples,
-    function(discard)
-    {
+    function(discard) {
       mat <- matrix(NA_real_, nrow = nrow(obj$kal[[1]]$abundance),
         ncol = nrow(which_samp))
       rownames(mat) <- obj$kal[[1]]$abundance$target_id

R/gene_analysis.R (+14)

@@ -0,0 +1,14 @@
+propagate_transcript_filter <- function(filter_df, target_mapping,
+  grouping_column) {
+
+  filtered_target_mapping <- dplyr::inner_join(as.data.table(filter_df),
+    as.data.table(target_mapping), by = 'target_id')
+
+  filtered_target_mapping <- dplyr::select_(filtered_target_mapping,
+    grouping_column)
+
+  data.table::setnames(filtered_target_mapping, grouping_column, 'target_id')
+  filtered_target_mapping <- dplyr::distinct(filtered_target_mapping)
+
+  filtered_target_mapping
+}
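
The new helper is not exported in this commit, so the triple-colon call below is for illustration only, and the toy data frames are assumptions rather than part of the commit. It sketches the intended operation: a transcript-level filter is joined to a transcript-to-gene mapping, reduced to the distinct gene identifiers that survive the filter, and returned under the column name target_id.

    # hypothetical inputs (not from the commit)
    filter_df <- data.frame(target_id = c("tx1", "tx2"))
    target_mapping <- data.frame(
      target_id = c("tx1", "tx2", "tx3"),
      ens_gene  = c("geneA", "geneA", "geneB"))

    # expected result: a single-column table with target_id == "geneA"
    sleuth:::propagate_transcript_filter(filter_df, target_mapping, "ens_gene")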

R/kallisto.R (+2, -1)

@@ -62,7 +62,8 @@ bias_table.sleuth <- function(obj, sample) {
 #' @export
 bias_table.kallisto <- function(obj) {
   if ( length(obj$fld) == 1 && all(is.na(obj$fld)) ) {
-    stop("kallisto object does not contain the fragment length distribution. Please rerun with a new version of kallisto.")
+    stop("kallisto object does not contain the fragment length distribution.",
+      "Please rerun with a new version of kallisto.")
   }

   adf(

R/measurement_error.R (+120, -9)

@@ -78,8 +78,9 @@ sleuth_fit <- function(obj, formula = NULL, fit_name = NULL, ...) {
     X <- formula
   }
   rownames(X) <- obj$sample_to_covariates$sample
-  A <- solve( t(X) %*% X )
+  A <- solve(t(X) %*% X)

+  msg("fitting measurement error models")
   mes <- me_model_by_row(obj, X, obj$bs_summary)
   tid <- names(mes)

@@ -109,13 +110,12 @@ sleuth_fit <- function(obj, formula = NULL, fit_name = NULL, ...) {
   l_smooth <- dplyr::mutate(l_smooth,
     smooth_sigma_sq_pmax = pmax(smooth_sigma_sq, sigma_sq))

-
   msg('computing variance of betas')
   beta_covars <- lapply(1:nrow(l_smooth),
     function(i) {
       row <- l_smooth[i,]
       with(row,
-        covar_beta(smooth_sigma_sq_pmax + sigma_q_sq, X, A)
+        covar_beta(smooth_sigma_sq_pmax + sigma_q_sq, X, A)
         )
     })
   names(beta_covars) <- l_smooth$target_id
@@ -266,8 +266,7 @@ me_model_by_row <- function(obj, design, bs_summary) {
   stopifnot( length(bs_summary$sigma_q_sq) == nrow(bs_summary$obs_counts))

   models <- lapply(1:nrow(bs_summary$obs_counts),
-    function(i)
-    {
+    function(i) {
      me_model(design, bs_summary$obs_counts[i,], bs_summary$sigma_q_sq[i])
     })
   names(models) <- rownames(bs_summary$obs_counts)
@@ -305,7 +304,7 @@ me_heteroscedastic_by_row <- function(obj, design, samp_bs_summary, obs_counts)
   models <- lapply(1:nrow(bs_summary$obs_counts),
     function(i) {
       res <- me_white_model(design, obs_counts[i,], sigma_q_sq[i,], A)
-      res$df$target_id = rownames(obs_counts)[i]
+      res$df$target_id <- rownames(obs_counts)[i]
       res
     })
   names(models) <- rownames(obs_counts)
@@ -354,8 +353,14 @@ me_white_var <- function(df, sigma_col, sigma_q_col, X, tXX_inv) {
   res
 }

+
+
 #' @export
-bs_sigma_summary <- function(obj, transform = identity) {
+bs_sigma_summary <- function(obj, transform = identity, norm_by_length = FALSE) {
+  # if (norm_by_length) {
+  #   scaling_factor <- get_scaling_factors(obj$obs_raw)
+  #   reads_per_base_transform()
+  # }
   obs_counts <- obs_to_matrix(obj, "est_counts")
   obs_counts <- transform( obs_counts )

@@ -373,8 +378,114 @@ bs_sigma_summary <- function(obj, transform = identity) {
   list(obs_counts = obs_counts, sigma_q_sq = bs_sigma)
 }

-me_model <- function(X, y, sigma_q_sq)
-{
+# transform reads into reads per base
+#
+#
+reads_per_base_transform <- function(reads_table, scale_factor_input,
+  collapse_column = NULL,
+  mapping = NULL,
+  norm_by_length = TRUE) {
+
+  if (is(scale_factor_input, 'data.frame')) {
+    # message('USING NORMALIZATION BY EFFECTIVE LENGTH')
+    # browser()
+    reads_table <- dplyr::left_join(
+      data.table::as.data.table(reads_table),
+      data.table::as.data.table(dplyr::select(scale_factor_input, target_id, sample, scale_factor)),
+      by = c('sample', 'target_id'))
+  } else {
+    reads_table <- dplyr::mutate(reads_table, scale_factor = scale_factor_input)
+  }
+  # browser()
+  reads_table <- dplyr::mutate(reads_table,
+    reads_per_base = est_counts / eff_len,
+    scaled_reads_per_base = scale_factor * reads_per_base
+    )
+
+  reads_table <- data.table::as.data.table(reads_table)
+
+  if (!is.null(collapse_column)) {
+    mapping <- data.table::as.data.table(mapping)
+    # old stuff
+    if (!(collapse_column %in% colnames(reads_table))) {
+      reads_table <- dplyr::left_join(reads_table, mapping, by = 'target_id')
+    }
+    # browser()
+    # reads_table <- dplyr::left_join(reads_table, mapping, by = 'target_id')

+    rows_to_remove <- !is.na(reads_table[[collapse_column]])
+    reads_table <- dplyr::filter(reads_table, rows_to_remove)
+    if ('sample' %in% colnames(reads_table)) {
+      reads_table <- dplyr::group_by_(reads_table, 'sample', collapse_column)
+    } else {
+      reads_table <- dplyr::group_by_(reads_table, collapse_column)
+    }
+
+    reads_table <- dplyr::summarize(reads_table,
+      scaled_reads_per_base = sum(scaled_reads_per_base))
+    data.table::setnames(reads_table, collapse_column, 'target_id')
+  }
+
+  as_df(reads_table)
+}
+
+gene_summary <- function(obj, which_column, transform = identity, norm_by_length = TRUE) {
+  # stopifnot(is(obj, 'sleuth'))
+  msg(paste0('aggregating by column: ', which_column))
+  obj_mod <- obj
+  if (norm_by_length) {
+    tmp <- obj$obs_raw
+    # tmp <- as.data.table(tmp)
+    tmp <- dplyr::left_join(
+      data.table::as.data.table(tmp),
+      data.table::as.data.table(obj$target_mapping),
+      by = 'target_id')
+    tmp <- dplyr::group_by_(tmp, 'sample', which_column)
+    scale_factor <- dplyr::mutate(tmp, scale_factor = median(eff_len))
+  } else {
+    scale_factor <- median(obj_mod$obs_norm_filt$eff_len)
+  }
+  # scale_factor <- median(obj_mod$obs_norm_filt$eff_len)
+  obj_mod$obs_norm_filt <- reads_per_base_transform(obj_mod$obs_norm_filt,
+    scale_factor, which_column, obj$target_mapping, norm_by_length)
+  obj_mod$obs_norm <- reads_per_base_transform(obj_mod$obs_norm,
+    scale_factor, which_column, obj$target_mapping, norm_by_length)
+
+  obs_counts <- obs_to_matrix(obj_mod, "scaled_reads_per_base")
+  obs_counts <- transform(obs_counts)
+
+  obj_mod$kal <- parallel::mclapply(seq_along(obj_mod$kal),
+    function(i) {
+      k <- obj_mod$kal[[i]]
+      current_sample <- obj_mod$sample_to_covariates$sample[i]
+      msg(paste('aggregating across sample: ', current_sample))
+      k$bootstrap <- lapply(k$bootstrap, function(b) {
+        b <- dplyr::mutate(b, sample = current_sample)
+        reads_per_base_transform(b, scale_factor, which_column,
+          obj$target_mapping, norm_by_length)
+      })
+
+      k
+    })
+
+  bs_summary <- sleuth_summarize_bootstrap_col(obj_mod, "scaled_reads_per_base",
+    transform)
+
+  bs_summary <- dplyr::group_by(bs_summary, target_id)
+  # FIXME: the column name 'bs_var_est_counts' is incorrect. should actually rename it above
+  bs_summary <- dplyr::summarise(bs_summary,
+    sigma_q_sq = mean(bs_var_scaled_reads_per_base))
+
+  bs_summary <- as_df(bs_summary)
+
+  bs_sigma <- bs_summary$sigma_q_sq
+  names(bs_sigma) <- bs_summary$target_id
+  bs_sigma <- bs_sigma[rownames(obs_counts)]
+
+  list(obs_counts = obs_counts, sigma_q_sq = bs_sigma)
+}
+
+me_model <- function(X, y, sigma_q_sq) {
   n <- nrow(X)
   degrees_free <- n - ncol(X)

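
The heart of the new gene aggregation is the quantity computed in reads_per_base_transform and consumed by gene_summary: each transcript's estimated counts are divided by its effective length, multiplied by a scale factor (with norm_by_length = TRUE, the median effective length of the transcripts in the group), and the scaled values are summed within a gene to give scaled_reads_per_base, which then plays the role est_counts plays at the transcript level. A minimal numeric sketch of that arithmetic follows; the toy numbers are invented for illustration, not taken from the commit:

    # two transcripts belonging to one gene, toy numbers
    est_counts <- c(tx1 = 100, tx2 = 50)
    eff_len    <- c(tx1 = 1000, tx2 = 500)

    scale_factor <- median(eff_len)          # 750: median effective length of the group
    reads_per_base <- est_counts / eff_len   # 0.1 and 0.1
    scaled_reads_per_base <- scale_factor * reads_per_base

    # gene-level value used in place of est_counts downstream
    sum(scaled_reads_per_base)               # 150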

R/misc.R (+1, -1)

@@ -107,7 +107,7 @@ jsd <- function(p, q) {
   p <- p / sum(p)
   q <- q / sum(q)

-  m <- (p + q)/2
+  m <- (p + q) / 2
   (kld(p, m) + kld(q, m)) / 2
 }


R/model.R (+10, -6)

@@ -111,7 +111,9 @@ get_test <- function(obj, label, type, model) {
   }

   if (is.null(res)) {
-    stop("'", label, "' is not a valid label for a test. Please see valid models and tests using the functions 'models' and 'tests'. Remember to also correctly specify the test type.")
+    stop("'", label, "' is not a valid label for a test.",
+      " Please see valid models and tests using the functions 'models' and 'tests'.",
+      " Remember to also correctly specify the test type.")
   }

   res
@@ -125,7 +127,9 @@ test_exists <- function(obj, label, type, model) {
     temp <- get_test(obj, label, type, model)
   }, error = function(e) {
     return(FALSE)
-  }, finally = function(x) {})
+  }, finally = function(x) {
+    # intentionally empty
+  })

   TRUE
 }
@@ -198,7 +202,7 @@ tests <- function(obj) {
 #' @export
 tests.sleuth <- function(obj, lrt = TRUE, wt = TRUE) {
   if ( lrt ) {
-    cat('~likelihood ratio tests:\n')
+    cat('~likelihood ratio tests:\n') # nolint
     cur_tests <- list_tests(obj, 'lrt')
     if (length(cur_tests) > 0) {
       for (test in cur_tests) {
@@ -214,7 +218,7 @@ tests.sleuth <- function(obj, lrt = TRUE, wt = TRUE) {
   }

   if ( wt ) {
-    cat('~wald tests:\n')
+    cat('~wald tests:\n') # nolint
     cur_tests <- list_tests(obj, 'wt')
     if (length(cur_tests) > 0) {
       for (i in 1:length(cur_tests)) {
@@ -311,7 +315,7 @@ sleuth_results <- function(obj, test, test_type = 'wt',
     )
   }

-  if (show_all) {
+  if (show_all && !obj$gene_mode) {
     tids <- adf(target_id = obj$kal[[1]]$abundance$target_id)
     res <- dplyr::left_join(
       data.table::as.data.table(tids),
@@ -320,7 +324,7 @@ sleuth_results <- function(obj, test, test_type = 'wt',
     )
   }

-  if ( !is.null(obj$target_mapping) ) {
+  if ( !is.null(obj$target_mapping) && !obj$gene_mode) {
     res <- dplyr::left_join(
       data.table::as.data.table(res),
       data.table::as.data.table(obj$target_mapping),
