From 0c7f847984c81ed34a3fc372e22325f89e44515e Mon Sep 17 00:00:00 2001 From: Caroline Esnault Date: Thu, 24 Sep 2020 08:21:03 -0400 Subject: [PATCH 01/19] update downstream rnaseq for large test data --- .../downstream/functional-enrichment.Rmd | 2 +- workflows/rnaseq/downstream/rnaseq.Rmd | 246 +++++++++++++++--- 2 files changed, 213 insertions(+), 35 deletions(-) diff --git a/workflows/rnaseq/downstream/functional-enrichment.Rmd b/workflows/rnaseq/downstream/functional-enrichment.Rmd index 1b8469767..ab312074b 100644 --- a/workflows/rnaseq/downstream/functional-enrichment.Rmd +++ b/workflows/rnaseq/downstream/functional-enrichment.Rmd @@ -27,7 +27,7 @@ id.convert <- TRUE ```{r bitrgenes, results='hide'} # Docs: https://lcdb.github.io/lcdb-wf/rnaseq-rmd.html#bitrgenes # NOTE: Gene IDs are provided in what format?---------------------------------- -from.type <- "FLYBASE" +from.type <- "ENSEMBL" deg.list <- nested.lapply(sel.list, rownames) genes.df <- nested.lapply(deg.list, bitr, fromType=from.type, toType=types, OrgDb=orgdb) diff --git a/workflows/rnaseq/downstream/rnaseq.Rmd b/workflows/rnaseq/downstream/rnaseq.Rmd index 1d3fae7b9..90a5569d5 100644 --- a/workflows/rnaseq/downstream/rnaseq.Rmd +++ b/workflows/rnaseq/downstream/rnaseq.Rmd @@ -59,7 +59,7 @@ devtools::load_all('../../../lib/lcdbwf') ```{r annotationhub_setup} # Docs: https://lcdb.github.io/lcdb-wf/rnaseq-rmd.html#annotationhub_setup -annotation_genus_species <- 'Drosophila melanogaster' +annotation_genus_species <- 'Homo sapiens' annotation_key_override <- NA hub.cache <- '../../../include/AnnotationHubCache' orgdb <- get.orgdb( @@ -85,7 +85,7 @@ for (col in factor.columns){ colData[[col]] <- as.factor(colData[[col]]) } -colData$group <- relevel(colData$group, ref='control') +colData$group <- relevel(colData$group, ref='DMSO_1h') rownames(colData) <- colData[,1] ``` @@ -126,13 +126,28 @@ dds <- lcdbwf::DESeqDataSetFromCombinedFeatureCounts( # NOTE: collapse technical replicates ---------------------------------------- # If collapsing technical replicates, do so here -# dds <-collapseReplicates(dds, dds$biorep) +dds <-collapseReplicates(dds, dds$biorep) if (strip.dotted.version){ rownames(dds) <- sapply(strsplit(rownames(dds), '.', fixed=TRUE), function (x) x[1]) } vsd <- varianceStabilizingTransformation(dds, blind=TRUE) + +outliers <- c('HAP1_MTX_6h_2', 'outlier_HAP1_dBet6_1h_1') +dds.no.outlier <- lcdbwf::DESeqDataSetFromCombinedFeatureCounts( + '../data/rnaseq_aggregation/featurecounts.txt', + sampletable=colData %>% filter(!samplename %in% outliers), + design=~group, + subset.counts=TRUE) +dds.no.outlier <-collapseReplicates(dds.no.outlier, dds.no.outlier$biorep) +vsd.no.outlier <- varianceStabilizingTransformation(dds.no.outlier, blind=TRUE) + +# remove batch effect of cell line +#vsd.batch <- vsd.no.outlier +#assay(vsd.batch) <- removebatchEffect(assay(vsd.batch), +# batches=list(colData(vsd.batch)$cell_Line), +# design=model.matrix(~group, data=colData(vsd.batch))) ``` ## Sample clustering and QC @@ -146,8 +161,15 @@ we expect to see replicates clustering together and separation of treatments. ```{r sample_heatmap, cache=TRUE, dependson='dds_initial'} # Docs: https://lcdb.github.io/lcdb-wf/rnaseq-rmd.html#sample_heatmap lcdbwf::plot.heatmap(vsd, colData, cols.for.grouping=c('group')) +cat('\n\n### Without outliers\n\n') +lcdbwf::plot.heatmap(vsd.no.outlier, colData, cols.for.grouping=c('group')) +cat('\n\n### After removing cell line batch effect\n\n') +#lcdbwf::plot.heatmap(vsd.batch, colData, cols.for.grouping=c('group')) +cat('\n\nDISABLED\n\n') ``` +**Note: HAP1_MTX_6h_2 and outlier_HAP1_dBet6_1h_1 are outliers and will be removed from subsequent analyses.** + ## PCA {.tabset} Another way of looking at sample clustering is principal components analysis @@ -161,16 +183,39 @@ explained by each principal component is indicated in the axes label. groups <- list( group=c('group'), - layout=c('layout') + cell_line_group=c('cell_line_group'), + cell_Line=c('cell_Line'), + treatment=c('treatment'), + timepoint=c('timepoint') ) p <- list() for(group in groups){ - p[[group]] <- lcdbwf::plotPCA.ly(vsd, intgroup=group) + p[[group]] <- lcdbwf::plotPCA.ly(vsd.no.outlier, intgroup=group) } i <- 1; lcdbwf::mdcat('### ', names(p)[i]); ggplotly(p[[i]]) i <- 2; lcdbwf::mdcat('### ', names(p)[i]); ggplotly(p[[i]]) +i <- 3; lcdbwf::mdcat('### ', names(p)[i]); ggplotly(p[[i]]) +i <- 4; lcdbwf::mdcat('### ', names(p)[i]); ggplotly(p[[i]]) +i <- 5; lcdbwf::mdcat('### ', names(p)[i]); ggplotly(p[[i]]) +``` + +## PCA after batch effect removal {.tabset} + +DISABLED + +```{r pca_batch, results='asis', cache=TRUE, dependson='dds_initial'} +#p <- list() +#for(group in groups){ +# p[[group]] <- lcdbwf::plotPCA.ly(vsd.batch, intgroup=group) +#} +# +#i <- 1; lcdbwf::mdcat('### ', names(p)[i]); ggplotly(p[[i]]) +#i <- 2; lcdbwf::mdcat('### ', names(p)[i]); ggplotly(p[[i]]) +#i <- 3; lcdbwf::mdcat('### ', names(p)[i]); ggplotly(p[[i]]) +#i <- 4; lcdbwf::mdcat('### ', names(p)[i]); ggplotly(p[[i]]) +#i <- 5; lcdbwf::mdcat('### ', names(p)[i]); ggplotly(p[[i]]) ``` @@ -222,8 +267,8 @@ ggplotly(p) ```{r parallel_config} # Docs: https://lcdb.github.io/lcdb-wf/rnaseq-rmd.html#parallel_config -parallel <- FALSE -# register(MulticoreParam(4)) +parallel <- TRUE +register(MulticoreParam(4)) ``` ```{r dds_list, cache=TRUE} @@ -231,16 +276,46 @@ parallel <- FALSE lst <- list( main=list( - sampletable=colData, - design=~group), - no_rep4=list( - sampletable=colData %>% filter(samplename != 'sample4'), - design=~group, + sampletable=colData %>% filter(!samplename %in% outliers), + design=~treatment, + args=list(subset.counts=TRUE) + ), + jq=list( + sampletable=colData %>% filter(treatment %in% c('DMSO', 'JQ1'), !samplename %in% outliers), + design=~group + 0, + args=list(subset.counts=TRUE) + ), + jq_cell_line=list( + sampletable=colData %>% filter(treatment %in% c('DMSO', 'JQ1'), !samplename %in% outliers), + design=~cell_line_group + 0, + args=list(subset.counts=TRUE) + ), + jq_interact_term=list( + sampletable=colData %>% filter(treatment %in% c('DMSO', 'JQ1'), !samplename %in% outliers), + design=~treatment + cell_Line + timepoint + treatment:timepoint, + args=list(subset.counts=TRUE) + ), + dbet=list( + sampletable=colData %>% filter(treatment %in% c('DMSO', 'dBet6'), !samplename %in% outliers), + design=~group + 0, + args=list(subset.counts=TRUE) + ), + dbet_cell_line=list( + sampletable=colData %>% filter(treatment %in% c('DMSO', 'dBet6'), !samplename %in% outliers), + design=~cell_line_group + 0, + args=list(subset.counts=TRUE) + ), + dbet_interact_term=list( + sampletable=colData %>% + filter(treatment %in% c('DMSO', 'dBet6'), !samplename %in% outliers) %>% + mutate(treatment=factor(treatment, levels=c('DMSO', 'dBet6'))), + design=~treatment + cell_Line + timepoint + treatment:timepoint, args=list(subset.counts=TRUE) ) ) + dds.list <- map(lst, lcdbwf::make.dds, salmon.files=NULL, remove.version=TRUE, - combine.by=NULL, parallel=parallel) + combine.by='biorep', parallel=parallel) ``` @@ -249,37 +324,138 @@ dds.list <- map(lst, lcdbwf::make.dds, salmon.files=NULL, remove.version=TRUE, res.list <- list() -# NOTE: Example contrast #1---------------------------------------------------- +# NOTE: Example contrast #1: JQ1 vs. DMSO --------------------------------- # Using the example data, this compares treatment group to control group. # Change to reflect your experiment. # IMPORTANT: Control must always be last -res.list[['main']] <- list( - res=lfcShrink(dds.list[['main']], contrast=c('group', 'treatment', 'control'), type='normal'), + +# including only treatment in the contrast +res.list[['main_jq']] <- list( + res=results(dds.list[['main']], contrast=c('treatment', 'JQ1', 'DMSO')), + dds='main', + label='JQ1 vs. DMSO (LFC > 0); dds main; cts treatment' +) + +# JQ1 vs. DMSO at 1h +res.list[['jq1h']] <- list( + res=results(dds.list[['jq']], contrast=c(-1,0,1,0)), + dds='jq', + label='JQ1 vs. DMSO 1h (LFC>0); dds jq; cts group' +) + +res.list[['jq1h-batch']] <- list( + res=results(dds.list[['jq_cell_line']], contrast=c(-1,0,1,0,-1,0,1,0,-1,0,1,0)), + dds='jq_cell_line', + label='JQ1 vs. DMSO 1h (LFC>0); dds jq_cell_line; cts cell_Line_group; batch effect' +) + +# JQ1 vs. DMSO at 6h +res.list[['jq6h']] <- list( + res=results(dds.list[['jq']], contrast=c(0,-1,0,1)), + dds='jq', + label='JQ1 vs. DMSO 6h (LFC>0); dds jq; cts group' +) + +res.list[['jq6h-batch']] <- list( + res=results(dds.list[['jq_cell_line']], contrast=c(0,-1,0,1,0,-1,0,1,0,-1,0,1)), + dds='jq_cell_line', + label='JQ1 vs. DMSO 6h (LFC>0); dds jq_cell_line; cts cell_Line_group; batch effect' +) + +# JQ1 vs. DMSO 6h vs. 1h +res.list[['jq6h.vs.1h']] <- list( + res=results(dds.list[['jq']], contrast=c(1,-1,-1,1)), + dds='jq', + label='JQ1 vs. DMSO 6h vs. 1h (LFC>0); dds jq; cts group' +) + +res.list[['jq6h.vs.1h-batch']] <- list( + res=results(dds.list[['jq_cell_line']], contrast=c(1,-1,-1,1,1,-1,-1,1,1,-1,-1,1)), + dds='jq_cell_line', + label='JQ1 vs. DMSO 6h vs. 1h (LFC>0); dds jq_cell_line; cts cell_Line_group; batch effect' +) + +res.list[['jq6h.vs.1h-interactdesign']] <- list( + res=results(dds.list[['jq_interact_term']]), + dds='jq_cell_line', + label='JQ1 vs. DMSO 6h vs. 1h (LFC>0); dds jq_interact_term; cts cell_Line_group; batch effect' +) + +res.list[['jq_lfc2']] <- list( + res=lfcShrink( + dds.list[['jq_interact_term']], + coef='treatmentJQ1.timepoint6h', + #lfcThreshold=2, # no padj if lfcThreshold + type='apeglm' + ), + dds='main', + label='JQ1 vs. DMSO 6h vs. 1h (LFC>2); dds jq_interact_term; cts treat+cell_Line+timepoint+treat:timepoint; batch effect' +) + +# dBet6 --------------------------- + +res.list[['main_dbet']] <- list( + res=results(dds.list[['main']], contrast=c('treatment', 'dBet6', 'DMSO')), dds='main', - label='Using a log2FoldChange threshold of 0' + label='dBet6 vs. DMSO (LFC > 0); dds main; cts treatment' ) +# dBET6 vs. DMSO at 1h +res.list[['dbet1h']] <- list( + res=results(dds.list[['dbet']], contrast=c(-1,0,1,0)), + dds='dbet', + label='dBet6 vs. DMSO 1h (LFC>0); dds dbet; cts group' +) + +res.list[['dbet1h-batch']] <- list( + res=results(dds.list[['dbet_cell_line']], contrast=c(-1,0,1,0,-1,0,1,0,-1,0,1,0)), + dds='dbet_cell_line', + label='dBet6 vs. DMSO 1h (LFC>0); dds dbet_cell_line; cts cell_Line_group; batch effect' +) -# NOTE: Example contrast #2---------------------------------------------------- -# Using the example data, this compares treatment group to control group but -# requiring genes to have >4-fold differences (log2(4) = 2). Change to -# reflect your experiment. -# If using an lfcThreshold, must update elsewhere throughout the code. -res.lfcthresh.2 <- results( - dds.list[['main']], - contrast=c('group', 'treatment', 'control'), - lfcThreshold=2) +# dBet6 vs. DMSO at 6h +res.list[['dbet6h']] <- list( + res=results(dds.list[['dbet']], contrast=c(0,-1,0,1)), + dds='dbet', + label='dBet6 vs. DMSO 6h (LFC>0); dds dbet; cts group' +) -res.list[['lfc2']] <- list( +res.list[['dbet6h-batch']] <- list( + res=results(dds.list[['dbet_cell_line']], contrast=c(0,-1,0,1,0,-1,0,1,0,-1,0,1)), + dds='dbet_cell_line', + label='dBet6 vs. DMSO 6h (LFC>0); dds dbet_cell_line; cts cell_Line_group; batch effect' +) + +# dBet6 vs. DMSO 6h vs. 1h +res.list[['dbet6h.vs.1h']] <- list( + res=results(dds.list[['dbet']], contrast=c(1,-1,-1,1)), + dds='dbet', + label='dBet6 vs. DMSO 6h vs. 1h (LFC>0); dds dbet; cts group' +) + +res.list[['dbet6h.vs.1h-batch']] <- list( + res=results(dds.list[['dbet_cell_line']], contrast=c(1,-1,-1,1,1,-1,-1,1,1,-1,-1,1)), + dds='dbet_cell_line', + label='dBet6 vs. DMSO 6h vs. 1h (LFC>0); dds dbet_cell_line; cts cell_Line_group; batch effect' +) + +res.list[['dbet6h.vs.1h-interactdesign']] <- list( + res=results(dds.list[['dbet_interact_term']]), + dds='dbet_cell_line', + label='dBet6 vs. DMSO 6h vs. 1h (LFC>0); dds dbet_interact_term; cts cell_Line_group; batch effect' +) + +res.list[['dbet_lfc2']] <- list( res=lfcShrink( - dds.list[['main']], - contrast=c('group', 'treatment', 'control'), - res=res.lfcthresh.2, - type='normal' + dds.list[['dbet_interact_term']], + coef='treatmentdBet6.timepoint6h', + #lfcThreshold=2, + type='apeglm' ), dds='main', - label='Using a log2FoldChange threshold of >2' + label='dBet6 vs. DMSO 6h vs. 1h (LFC>2); dds dbet_interact_term; cts treat+cell_Line+timepoint+treat:timepoint; batch effect' ) + ``` @@ -339,14 +515,16 @@ for (name in names(res.list)){ mdcat('## ', label, ' {.tabset}') mdcat('### Summary of results') print(knitr::kable(my.summary(res.i, dds.i))) + cat(paste0('resultsNames: ',paste(resultsNames(dds.i), collapse=', '), '\n\n')) + cat(paste(elementMetadata(res.i)$description, collapse='\n\n')) mdcat('### Normalized counts of top 3 upregulated genes') print(counts.plot(counts.df(dds.i, res.i, sel.genes=lfc.filter(res.i), label='symbol', rank.col='padj'), - 3)) + 3, no.aes=TRUE) + aes(y=normalized_counts, x=group, color=cell_Line)) mdcat('### Normalized counts of top 3 downregulated genes') print(counts.plot(counts.df(dds.i, res.i, sel.genes=lfc.filter(res.i, reverse=TRUE), label='symbol', rank.col='padj'), - 3)) + 3, no.aes=TRUE) + aes(y=normalized_counts, x=group, color=cell_Line)) # NOTE: gene labels -------------------------------------------------------- # By default we plot the symbols of the top 10 upregulated or downregulated From e5999d482a9d33a1f4a363d671ce3c1a8edd0556 Mon Sep 17 00:00:00 2001 From: Caroline Esnault Date: Thu, 24 Sep 2020 08:21:55 -0400 Subject: [PATCH 02/19] cleanup --- workflows/rnaseq/downstream/helpers.Rmd | 871 ------------------------ 1 file changed, 871 deletions(-) delete mode 100644 workflows/rnaseq/downstream/helpers.Rmd diff --git a/workflows/rnaseq/downstream/helpers.Rmd b/workflows/rnaseq/downstream/helpers.Rmd deleted file mode 100644 index e811fe971..000000000 --- a/workflows/rnaseq/downstream/helpers.Rmd +++ /dev/null @@ -1,871 +0,0 @@ -```{r} -library(DESeq2) -library(pheatmap) -library(RColorBrewer) -library(DEGreport) -library(ggplot2) -library(ggrepel) -library(heatmaply) -library(readr) -library(stringr) -library(tibble) -library(purrr) -library(tidyr) - -#' Get the OrgDb for the specified organism, using the cached AnnotationHub. -#' -#' @param species Case-sensitive genus and species -#' @param cache Directory in which the AnnotationHub cache is stored -#' @param annotation_key_override If not NA, forces the hub to use this -#' accession. Use this when you know exactly which OrgDb to use. -#' -#' @return OrgDb object -get.orgdb <- function(species, cache, annotation_key_override=NA){ - - # Workaround to allow AnnotationHub to use proxy. See - # https://github.com/Bioconductor/AnnotationHub/issues/4, and thanks - # Wolfgang! - proxy <- Sys.getenv('http_proxy') - if (proxy == ""){ - proxy <- NULL - } - - ah <- AnnotationHub(hub=getAnnotationHubOption('URL'), - cache=cache, - proxy=proxy, - localHub=FALSE) - - find.annotationhub.name <- function(species.name, override.code) { #autodetect ah names based on loaded database - if (is.na(override.code)) { - ah.query <- query(ah, "OrgDb") - ah.query.speciesmatch <- grepl(paste("^", species.name, "$", sep=""), ah.query$species) - ah.query.which <- which(ah.query.speciesmatch) - stopifnot(length(ah.query.which) > 0) #require at least one match - if (length(ah.query.which) > 1) { #warn of duplicate matches - print("WARNING: found multiple candidate species in AnnotationHub: "); - print(ah.query.speciesmatch) - } - names(ah.query)[ah.query.which[1]] - } else { - override.code - } - } - annotation_key <- find.annotationhub.name(annotation_genus_species, annotation_key_override) - orgdb <- ah[[annotation_key]] - return(orgdb) -} - -#' Plot an interactive PCA plot -#' -#' @param rld DESeqTransform object output by varianceStabilizingTransformation() or rlog() -#' @param intgroup character vector of names in colData(x) to use for grouping -#' -#' @return Handle to ggplot with added label field in aes_string() for plotting with ggplotly() -plotPCA.ly <- function(rld, intgroup){ - mat <- plotPCA(rld, intgroup, returnData=TRUE) - pv <- attr(mat, 'percentVar') - p <- ggplot(data=mat, aes_string(x='PC1', y='PC2', color='group', label='name')) + - geom_point(size=3) + xlab(paste0('PC1: ', round(pv[1]*100), '% variance')) + - ylab(paste0('PC2: ', round(pv[2]*100), '% variance')) + coord_fixed() - return(p) -} - -#' Plot a MA plot labeled with selected genes -#' -#' @param res.list data.frame (or list) with log2FoldChange and padj columns (or elements). -#' Row names of the data.frame should be gene IDs/symbols and match the format of lab.genes -#' @param fdr.thres FDR threshold for defining statistical significance -#' @param fc.thres log2FoldChange cutoff for defining statistical significance -#' @param fc.lim User-defined limits for y-axis (log2FC). If NULL, this is defined as -#' the (floor, ceiling) of range(res$log2FoldChange) -#' @param genes.to.label Genes to label on the MA plot. NULL, by default -#' @param col Column of `res` in which to look for `genes.to.label`. If NULL, -#' rownames are used. -#' -#' @return Handle to ggplot -plotMA.label <- function(res, - fdr.thres=0.1, - fc.thres=0, - fc.lim=NULL, - genes.to.label=NULL, - col=NULL - ){ - # TODO: Add ggrastr option for points - genes.to.label <- as.character(genes.to.label) - nna <- sum(is.na(genes.to.label)) - if (nna > 0){ - warning(paste("Removing", nna, "NAs from gene list")) - genes.to.label <- genes.to.label[!is.na(genes.to.label)] - } - # convert res to data frame - res <- data.frame(res) - - # if y limits not specified - if(is.null(fc.lim)){ - fc.lim <- range(res$log2FoldChange, na.rm=TRUE) - fc.lim[1] <- floor(fc.lim[1]) - fc.lim[2] <- ceiling(fc.lim[2]) - } - - # get data frame of genes outside plot limits - up.max <- res[res$log2FoldChange > fc.lim[2],] - up.max$log2FoldChange <- rep(fc.lim[2], dim(up.max)[1]) - up.max <- data.frame(genes=rownames(up.max), up.max) - - down.max <- res[res$log2FoldChange < fc.lim[1],] - down.max$log2FoldChange <- rep(fc.lim[1], dim(down.max)[1]) - down.max <- data.frame(genes=rownames(down.max), down.max) - - # get data frame of DE genes - de.list <- res[res$padj < fdr.thres & - !is.na(res$padj) & - abs(res$log2FoldChange) >= fc.thres,] - de.list <- data.frame(genes=rownames(de.list), de.list) - - # get data frame of DE genes outside plot limits - up.max.de <- up.max[rownames(up.max) %in% rownames(de.list),] - down.max.de <- down.max[rownames(down.max) %in% rownames(de.list),] - - # create ggplot with appropriate layers - p <- ggplot(res, aes(baseMean, log2FoldChange)) + - geom_point(col="gray40") + scale_x_log10() + ylim(fc.lim[1], fc.lim[2]) + - theme_bw() + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank()) - - p <- p + geom_hline(yintercept = 0, col="red", size=2, alpha=0.5) # add horizontal line - p <- p + geom_point(data=up.max, col="gray40", pch=2) # add points above max y - p <- p + geom_point(data=down.max, col="gray40", pch=6) # add points below min y - - p <- p + geom_point(data=de.list, col="red") # add DE points - p <- p + geom_point(data=up.max.de, col="red", pch=2) # add DE points above max y - p <- p + geom_point(data=down.max.de, col="red", pch=6) # add DE points below min y - - - if(!is.null(genes.to.label)){ - # get data frame of genes to be labeled - if (!is.null(col)){ - res$gene.labels <- res[,col] - } else { - res$gene.labels <- rownames(res) - } - - label.list <- res[res$gene.labels %in% genes.to.label,] - #label.list <- data.frame(genes=rownames(label.list), label.list) - - # label genes outside limits - up.max.idx <- rownames(label.list) %in% rownames(up.max) - down.max.idx <- rownames(label.list) %in% rownames(down.max) - - if(sum(up.max.idx) > 0){ - label.list$log2FoldChange[up.max.idx] <- rep(fc.lim[2], sum(up.max.idx)) - } - - if(sum(down.max.idx) > 0){ - label.list$log2FoldChange[down.max.idx] <- rep(fc.lim[1], sum(down.max.idx)) - } - - # add labels - p <- p + geom_point(data=label.list, col="black", pch=1, size=3) - p <- p + geom_label_repel(data=label.list, aes(label=label.list$gene.labels, fontface="italic")) - } - return(p) - -} - -#' Plot a clustered heatmap of samples -#' -#' @param rld DESeqTransform object, typically output from running rlog() -#' @param colData Dataframe of metadata, used for annotating heatmap -#' @param cols.for.grouping Columns in colData to annotate heatmap with -#' -#' @return matrix of sample distances -plot.heatmap <- function(rld, colData, cols.for.grouping){ - sampleDists <- dist(t(assay(rld))) - sampleDistMatrix <- as.matrix(sampleDists) - rownames(sampleDistMatrix) <- colnames(rld) - colors <- colorRampPalette(rev(brewer.pal(9, 'Blues')))(255) - df <- as.data.frame(colData(rld)[, cols.for.grouping]) - colnames(df) <- cols.for.grouping - rownames(df) <- colnames(rld) - heatmaply(sampleDistMatrix, - scale='none', - col=colors, - row_side_colors=df, - showticklabels=c(FALSE,TRUE)) -} - - -#' Plot heatmap of most varying genes -#' -#' @param rld DESeqTransform object, typically output from running rlog() -#' @param colData Dataframe of metadata, used for annotating heatmap -#' @param n Number of genes to include -#' -#' @return Side effect is to plot the heatmap -vargenes.heatmap <- function(rld, cols.for.grouping, n=50){ - # TODO: heatmaply? - # TODO: allow alternative gene IDs - topVarGenes <- head(order(rowVars(assay(rld)), decreasing=TRUE), n) - mat <- assay(rld)[topVarGenes,] - mat <- mat - rowMeans(mat) - df <- as.data.frame(colData(rld)[, cols.for.grouping]) - rownames(df) <- colnames(rld) - colnames(df) <- cols.for.grouping - pheatmap(mat, annotation_col=df, cluster_cols=TRUE) -} - - -#' Simple wrapper of cat() that makes markdown text easier to print -#' -#' @param ... Arguments to print -#' -#' Make sure you're in an R code chunk with `results="asis"` set. -mdcat <- function(...){ - cat('\n\n', ..., ' \n\n', sep='', fill=1500) -} - - -#' Convert a vector of BAM filenames into a vector of samplenames. -#' -#' @param x Character vector of BAM filenames (e.g., from the header of -#' featureCounts) -#' -lcdbwf.samplename <- function(x) { - x <- x %>% - str_remove_all('data/rnaseq_samples/') %>% - str_remove_all('.cutadapt.bam') %>% - str_split(fixed('/'), simplify=TRUE) - x[,1] -} - - -#' Load a single combined featureCounts table into a DESeq object. -#' -#' @param filename Filename containing featureCounts combined output -#' @param sampletable data.frame containing at least sample names as first -#' column -#' @param design Model used for creating DESeq object -#' @param sample.func Function that will be applied to each column name -#' to align it to the input sampletable. Takes a character vector as -#' input and returns a character vector of adjusted strings. -#' @param subset.counts If TRUE, then counts are subsetted by the provided -#' sampletable. That is, only the counts columns that match a rowname of -#' the provided sampletable are included. If FALSE (default), an error -#' is raised reporting the differences in sampletable and counts data. -#' It is always an error if counts data does not have an entry for -#' a sample in the sampletable. -#' -#' @return DESeq object -#' -#' Additional args are passed to DESeq2::DESeqDataSetFromMatrix. -DESeqDataSetFromCombinedFeatureCounts <- function(filename, sampletable, design, sample.func=lcdbwf.samplename, subset.counts=FALSE, ...){ - - # The sampletable may be data.frame or tibble; if it's a tibble then it - # likely doesn't have rownames. So in this function we assume that it's the - # first column that contains the samplenames. - - # Read in the counts TSV, use gene ID as rownames, and get rid of the Chr, - # Start, End, Strand, Length columns - m <- read_tsv(filename, comment="#") %>% - remove_rownames %>% - column_to_rownames('Geneid') %>% - dplyr::select(-(1:5)) %>% - as.data.frame - - # The column names of the imported table are not guaranteed to match the - # samples in the metadata, so we need to make sure things match up - # correctly when renaming columns. - # - # sample.func should convert filenames (which are columns in featurecounts - # table) with samples (as listed in the sampletable) - x <- colnames(m) %>% sample.func - - samplenames <- sampletable[,1] - counts.not.sampletable <- setdiff(x, samplenames) - sampletable.not.counts <- setdiff(samplenames, x) - - - if (!all(x %in% samplenames)){ - # sampletable is missing: - if (!subset.counts){ - stop(paste('The following samples are in the counts data but not the sampletable. If this is intended, consider using `subset.counts=TRUE` to remove them from the counts:', paste(counts.not.sampletable, collapse=', '))) - } - } - if (!all(samplenames %in% x)){ - stop( - paste( - 'The following samples are in the sampletable but not in the counts data. Check sample.func?', - paste(sampletable.not.counts, collapse=', ')) - ) - } - - colnames(m) <- x - - # We should be good -- so reorder columns in `m` to match sampletable - m <- m[, sampletable[,1]] - - # Before creating the dds object, we will drop levels from factors, in case - # that was not done before sending in the (possibly subsetted) sampletable. - - sampletable <- droplevels(as.data.frame(sampletable)) - - object <- DESeqDataSetFromMatrix(countData=m, colData=sampletable, design=design, ...) - return(object) -} - -#' Load featureCounts output into a DESeq object. -#' -#' Revised version of DESeq2::DESeqDataSetFromHTSeqCount to handle the -#' featureCounts default output format, which contains many more columns. -#' -#' @param sampleTable data.frame containing at least "featurecounts.path" column -#' @param directory Paths to featureCounts output are relative to this dir -#' @param design Model used for creating DESeq object -#' -#' @return DESeq object -#' -#' Additional args are passed to DESeq2::DESeqDataSetFromMatrix. -DESeqDataSetFromFeatureCounts <- function (sampleTable, directory='.', design, - ignoreRank=FALSE, ...) -{ - l <- lapply( - as.character(sampleTable[,'featurecounts.path']), - function(fn) read.table(file.path(directory, fn), stringsAsFactors=FALSE, skip=2) - ) - if (!all(sapply(l, function(a) all(a$V1 == l[[1]]$V1)))) - stop("Gene IDs in first column differ between files") - tbl <- sapply(l, function(a) a$V7) - colnames(tbl) <- sampleTable[, 1] - rownames(tbl) <- l[[1]]$V1 - rownames(sampleTable) <- sampleTable[, 1] - object <- DESeqDataSetFromMatrix(countData=tbl, colData=sampleTable[, -grepl('path', colnames(sampleTable)), - drop=FALSE], design=design, ignoreRank, ...) - return(object) -} - -#' Load Salmon quantification data into a DESeq object -#' -#' @param sampleTable data.frame containing at least "salmon.path" column -#' @param design Model used for creating DESeq object -#' -#' @return DESeq object with transcript-level counts -#' -#' Additional args are passed to DESeq2::DESeqDataSetFromMatrix. -DESeqDataSetFromSalmon <- function (sampleTable, design, - ignoreRank=FALSE, ...) -{ - txi <- tximport(sampleTable[, 'salmon.path'], type='salmon', txOut=TRUE) - object <- DESeqDataSetFromTximport(txi, colData=sampleTable[, -grepl('path', colnames(sampleTable)), - drop=FALSE], design=design, ignoreRank, ...) - return(object) -} - -#' Make a list of dds objects -#' -#' This function takes a list of sample, design pairs, plus other info, to output -#' a list of DESeq object. -#' -#' @param deseq_obj_list A named list of lists. Each list should have two to three elements: -#' 1 should be a sampletable, 2 should be a design. The 3rd is an optional list of arguments -#' to pass to the DESeqDataSet constructor. -#' @param salmon.files txi object from tximport if transcript-level counts desired; NULL forces featurecounts; -#' shared by all dds. -#' @param combine.by The column to collapse technical replicates by. -#' @param remove.version Boolean to decide whether to remove version information -#' @return A list of dds objects. -#' -make.dds.list <- function(deseq_obj_list, salmon.files=NULL, - combine.by=FALSE, remove.version=TRUE, ...){ - make.dds <- function(design_data, salmon.files, combine.by, remove.version, ...) - { - - subset.counts <- FALSE - colData <- pluck(design_data, 1) - design <- pluck(design_data, 2) - location <- pluck(design_data, 'file', .default='../data/rnaseq_aggregation/featurecounts.txt') - # arg list passes args to either the CombinedFeatureCounts or FromSalmon constructor - arg_list <- pluck(design_data, 'args') - - if(is.null(salmon.files)){ - # Load gene-level counts - dds<- exec(DESeqDataSetFromCombinedFeatureCounts, - location, - sampletable=colData, - design=design, - !!!arg_list) - if (remove.version){ - rownames(dds) <- sapply(strsplit(rownames(dds), '.', fixed=TRUE), function (x) x[1]) - } - } else { - dds <- exec(DESeqDataSetFromTximport, salmon.files, colData=colData[, -grepl('path', colnames(colData)), - drop=FALSE], design=design, !!!arg_list) - } - - if(combine.by != FALSE){ - dds <-collapseReplicates(dds, dds[[combine.by]]) - } - - # Constructs the dds object and passes any arguments such as parallel - dds <- DESeq(dds, ...) - - return(dds) - } - dds_list <- map(deseq_obj_list, make.dds, salmon.files, combine.by, remove.version, ...) - - return(dds_list) -} - -#' Compute label for one component of an arbitrary design matrix -#' -#' @param pattern.mat single row from unique(model.matrix) call from a dds object -#' -#' @return formatted column name - -format.group.name <- function(pattern.mat) { - my.res <- "counts" - # enforce input format - stopifnot(is.vector(pattern.mat)) - # if there's only one element - if (length(pattern.mat) == 1) { - # it must be the intercept - my.res <- paste(my.res, "groupMean", sep=".") - } else { # more than one element - # for every non-intercept entry - for (i in 2:length(pattern.mat)) { - # if 1, include - if (pattern.mat[i] == 1) my.res <- paste(my.res, names(pattern.mat[i]), sep=".") - # otherwise don't - else my.res <- paste(my.res, ".not", names(pattern.mat[i]), sep="") - } - } - my.res -} - -#' Plot a gene's normalized counts across samples -#' -#' @param gene Gene ID -#' @param dds DESeq object from which to extract counts -#' -#' @return ggplot object -my.counts <- function(gene, dds, label=NULL, intgroup='group'){ - - # Assumption: color genes by group - geneCounts <- plotCounts(dds, gene=gene, intgroup=intgroup, returnData=TRUE) - p <- ggplot(geneCounts, aes_string(x=intgroup, y='count', color=intgroup, group=intgroup)) + - scale_y_log10() + - geom_point(position=position_jitter(width=.1, height=0), size=3) + - geom_line(color='#000000') + - ggtitle(gene) - - if (!is.null(label)){ - p <- p + ggtitle(label) - } - return(p) -} - -#' Re-order results by log2FoldChange -#' -#' @param res DESeq2 results object -#' @param reverse If TRUE then sort high-to-low -#' -#' @return Re-ordered results object -lfc.order <- function(res, reverse=FALSE){ - res.na <- res[!is.na(res$log2FoldChange),] - if (!reverse){ - return(res.na[order(res.na$log2FoldChange),]) - } - if (reverse){ - return(res.na[rev(order(res.na$log2FoldChange)),]) - } -} - -#' Re-order results by adjusted pval -#' -#' @param res DESeq2 results object -#' @param reverse If TRUE then sort high-to-low -#' -#' @return Re-ordered results object -padj.order <- function(res, reverse=FALSE){ - res.na <- res[!is.na(res$log2FoldChange) & !is.na(res$log2FoldChange),] - if (!reverse){ - res.na <- res.na[res.na$log2FoldChange > 0,] - } else { - res.na <- res.na[res.na$log2FoldChange < 0,] - } - return(res.na[order(res.na$padj),]) -} - - -#' Plot normalized gene counts for the top N genes -#' -#' @param sorted DESeq2 results object -#' @param n Number of top genes to plot -#' @param func Plotting function to call on each gene. Signature is func(gene, -#' dds, label=label, ...) -#' @param dds DESeq2 object -#' @param label Vector of column names in `res` from which to add a label to -#' the gene (e.g., c('symbol', 'alias')) -#' @param ... Additional arguments that are passed to `func` -#' @return Side effect is to create plot -top.plots <- function(res, n, func, dds, add_cols=NULL, ...){ - ps <- list() - for (i in seq(n)){ - gene <- rownames(res)[i] - add_label <- as.character(as.data.frame(res)[i, add_cols]) - add_label <- add_label[!is.na(add_label)] - label <- paste(gene, add_label, sep=' | ') - if (length(label) == 0){ - label <- NULL - } - ps[[gene]] <- func(gene, dds, label=label, ...) - } - grid.arrange(grobs=ps) -} - -#' Make dataframe of normalized gene counts -#' -#' @param dds DESeq2 object -#' @param res DESeq2 results object -#' @param sel.genes list of genes to consider -#' @param label column(s) to be included to the plot labels -#' @param pc count number to be added to the normalized counts -#' Typically a pc of 0.5 is added to allow plotting in log scale -#' -#' @return dataframe -counts.df <- function(dds, res, sel.genes=NULL, label=NULL, rank.col='padj', pc=0.5) { - # getting normalized counts copied from plotCounts() - cnts <- counts(dds, normalized = TRUE, replaced = FALSE) - # add 0.5 like plotCounts to plot 0 on log scale - cnts <- cnts + pc - # merge with res.i and colData - df <- as.data.frame(cbind(cnts, res)) - # add label for plotting - df <- df %>% mutate(label = paste(gene, !!!syms(label), sep=' | ')) - # subset to sel.genes if not NULL (then keep all) - if (!is.null(sel.genes)) { - df <- df %>% - filter(gene %in% sel.genes) - } - # add rank - df <- df %>% - mutate(rank = rank(!!sym(rank.col), ties.method='first', na.last='keep')) %>% - pivot_longer(colnames(cnts), names_to='samplename', values_to='normalized_counts') - # add colData - df <- merge(df, colData(dds), by='samplename') %>% - as.data.frame() - return(df) -} - -#' Filter results by log2FoldChange sign -#' -#' @param res DESeq2 results object -#' @param reverse If TRUE then filter negative lfc -#' -#' @return filtered genes -lfc.filter <- function(res, reverse=FALSE){ - res.na <- res[!is.na(res$log2FoldChange),] - if (!reverse){ - res.na <- res.na[res.na$log2FoldChange > 0,] - } else { - res.na <- res.na[res.na$log2FoldChange < 0,] - } - return(res.na[,'gene']) -} - -#' Plot genes' normalized counts across samples -#' -#' @param df dataframe from which to extract genes normalized_counts -#' @param rank.nb rank number less or equal used to filter the genes to be plotted -#' @param no.aes whether to include the default aes or return the ggplot object -#' without aes -#' @param facet name of column to use for faceting -#' -#' @return ggplot object -counts.plot <- function(df, rank.nb=NULL, no.aes=FALSE, facet='label') { - if (!is.null(rank.nb)) { - df <- df %>% - filter(rank <= rank.nb) - } - df <- df %>% - arrange(rank) %>% - mutate(facet = factor(!!!syms(facet), levels = unique(!!!syms(facet)))) - plt <- ggplot(df) + - scale_y_log10() + - geom_point(position=position_jitter(width=.1, height=0), size=3) + - geom_line(color='#000000') + - theme_bw() + - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + - facet_wrap(.~facet, ncol=1, scales='free_y') - if (!no.aes) { - plt <- plt + - aes(y=normalized_counts, x=group, color=group) - } - return(plt) -} - -#' Plot a histogram of raw pvals -#' -#' @param res DESeq2 results object -#' -#' @return Side effect is to create plot -pval.hist <- function(res){ - hist(res$pvalue[res$baseMean>1], breaks=0:20/20, col='grey50', - border='white', xlab='P-value', main='Distribution of p-values') -} - -#' Summarize DESeq2 results into a dataframe -#' -#' summary(res) prints out info; this function captures it into a dataframe -#' -#' @param res DESeq2 results object -#' @param dds DEseq2 object -#' @param alpha Alpha level at which to call significantly changing genes -#' -#' @return Dataframe of summarized results -my.summary <- function(res, dds, alpha, lfc.thresh=0, ...){ - if (missing(alpha)){ - alpha <- if (is.null(metadata(res)$alpha)){ 0.1 } else { metadata(res)$alpha } - } - notallzero <- sum(res$baseMean > 0) - up <- sum(res$padj < alpha & res$log2FoldChange > lfc.thresh, na.rm=TRUE) - down <- sum(res$padj < alpha & res$log2FoldChange < -lfc.thresh, na.rm=TRUE) - filt <- sum(!is.na(res$pvalue) & is.na(res$padj)) - outlier <- sum(res$baseMean > 0 & is.na(res$pvalue)) - ft <- if(is.null(metadata(res)$filterThreshold)){ 0 } else { round(metadata(res)$filterThreshold) } - # adjust width.cutoff as newline insertion causes this to return a df with - # multiple duplicate rows! - df <- data.frame( - total.annotated.genes=nrow(res), - total.nonzero.read.count=notallzero, - alpha=alpha, - lfcThreshold=lfc.thresh, - up=up, - down=down, - outliers=outlier, - low.counts=filt, - design=deparse(design(dds), width.cutoff=500L) - ) - return(df) -} - -#' Convert "1/100" to 0.01. -#' -#' clusterProfiler report columns that are strings of numbers; this converts to -#' a fraction -#' -#' @param x Character vector to convert -get.frac <- function(x){ - y <- as.numeric(strsplit(x, '/')[[1]]) - return(y[[1]] / y[[2]]) -} - -#' Summarize and aggregate multiple GO results -#' -#' Convert a list of GO analysis results (one per ontology) to a single -#' dataframe, with additional label column filled in with `label` and with -#' a fraction column. -#' -#' @param ego List of clusterProfiler results objects -#' @param labels List of labels. For each name, a new column will be added and -#" its contents will be the value -#' -#' @return dataframe -summarize.go <- function(ego, labels){ - lst <- list() - for (name in names(ego)){ - d <- as.data.frame(ego[[name]]) - if (nrow(d) > 0){ - d$ontology <- name - for (label in names(labels)){ - d[label] <- labels[[label]] - } - d$frac <- sapply(d$GeneRatio, get.frac) - } - lst[[name]] <- d - } - df <- do.call(rbind, lst) - return(df) -} - - -#' Summarize KEGG results -#' -#' Summarize KEGG results and add `frac` and `label` columns -#' -#' @param ekg Results from clusterProfiler KEGG enrichment -#' @param label Attach this label to the "label" column -#' -#' @return Dataframe -summarize.kegg <- function(ekg, labels){ - d <- as.data.frame(ekg) - if (nrow(d) > 0){ - d$ontology <- 'kegg' - for (label in names(labels)){ - d[label] <- labels[[label]] - } - d$frac <- sapply(d$GeneRatio, get.frac) - } - return(d) -} - -#' Attach additional information from an OrgDb to a results dataframe -#' -#' @param res DESeq2 results object -#' @param keytype Key type for rownames(res) -#' @param columns Character vector of columns to add from OrgDb to results -attach.info <- function(res, keytype='ENSEMBL', columns=c('SYMBOL', 'UNIPROT', 'ALIAS')){ - keys <- rownames(res) - for (column in columns){ - label <- tolower(column) - res[label] <- mapIds(orgdb, keys=keys, column=column, keytype=keytype, multiVal='first') - res[[label]] <- ifelse(is.na(res[[label]]), keys, res[[label]]) - } - # Put "gene" column as the first - cn <- colnames(res) - res$gene <- rownames(res) - res <- res[, c('gene', cn)] - return(res) -} - - -#' Combine everything in the results list into a single table -#' -#' @param res.list Named list of lists, where each sublist contains the following -#' names: c('res', 'dds', 'label'). "res" is a DESeqResults object, -#' "dds" is either the indexing label for the dds.list object or -#' the DESeq object, and "label" is a nicer-looking -#' label to use. NOTE: backwards compatibility with older versions -#' of lcdb-wf depends on no dds.list object being passed. -#' -#' @return Dataframe -summarize.res.list <- function(res.list, alpha, lfc.thresh, dds.list=NULL){ - slist <- list() - for (name in names(res.list)){ - if(!is.null(dds.list)){ - x <- my.summary(res.list[[name]][['res']], dds.list[[ res.list[[name]][['dds']] ]], alpha, lfc.thresh) - } else { x <- my.summary(res.list[[name]][['res']], res.list[[name]][['dds']], alpha, lfc.thresh) - } - rownames(x) <- res.list[[name]][['label']] - slist[[name]] <- x - } - slist <- do.call(rbind, slist) - return(slist) -} - - -#' Return index of up/down/changed genes -#' -#' @param x DESeq2 results object, or data.frame created from one -#' @param direction Direction in 'up', 'dn', 'down', 'ch', 'changed' -#' @param alpha FDR lower than this will be considered significant -#' @param thresh Log2 fold change threshold. If e.g. 2, will return < -2 and/or > 2, depending on the value of "direction" -#' @param return.names If TRUE, returns the rownames of selected genes; if FALSE return boolean index of length(x) -#' -#' @return Character vector of rownames (if return.names=TRUE) or boolean vector of genes selected. -get.sig <- function(x, direction='up', alpha=0.1, lfc.thresh=0, return.names=TRUE){ - if (direction == 'up'){ - idx <- (x$padj < alpha) & (x$log2FoldChange > lfc.thresh) & (!is.na(x$padj)) - } else if (direction %in% c('down', 'dn')){ - idx <- (x$padj < alpha) & (x$log2FoldChange < -lfc.thresh) & (!is.na(x$padj)) - } else if (direction %in% c('changed', 'ch')){ - idx <- (x$padj < alpha) & (abs(x$log2FoldChange) > lfc.thresh) & (!is.na(x$padj)) - } - if (return.names){ - return(rownames(x)[idx]) - } else { - return(idx) - } -} - -#' Prepare list for UpSet plots, but include rownames -#' -#' @param x List of sets to compare (same input as to UpSetR::fromList) -#' -#' @return data.frame of 1 and 0 showing which genes are in which sets -fromList.with.names <- function(lst){ - elements <- unique(unlist(lst)) - data <- unlist(lapply(lst, function(x) { - x <- as.vector(match(elements, x)) - })) - data[is.na(data)] <- as.integer(0) - data[data != 0] <- as.integer(1) - data <- data.frame(matrix(data, ncol = length(lst), byrow = F)) - - # This is the only way in which this function differs from UpSetR::fromList - rownames(data) <- elements - - data <- data[which(rowSums(data) != 0), ] - names(data) <- names(lst) - return(data) -} - - -#' Move rownames of data.frame to first column -#' -#' @param x data.frame -#' @param name Name to use as the new first column -#' -#' @return data.frame with original rownames moved to new first column with -#' optional name. rownames on the new data.frame are set to NULL. -rownames.first.col <- function(x, name='names'){ - orig.names <- colnames(x) - x <- data.frame(rownames(x), x, row.names=NULL) - colnames(x) <- c(name, orig.names) - return(x) -} - - -#' lapply over a nested list -#' -#' @param x Nested list. Expected 2 levels deep, and each item is the same type -#' @param subfunc Function to apply over nested "leaf" items -#' @param ... Additional arguments to be passed to subfunc -#' -#' @return Nested list with same structure but tranformed values -nested.lapply <- function(x, subfunc, ...){ - lapply(x, lapply, subfunc, ...) -} - -#' Split clusterProfiler output into one line per gene -#' -#' @param x Results from clusterProfiler. It is expected that the -#' clusterProfiler enrichment function was called with "readable=TRUE" -#' -#' @return data.frame with genes one per line instead of "/" separated in one -#' line. The rest of the original line is repeated for each gene. -#' -split.clusterProfiler.results <- function(x){ - df <- x@result - # loop over all rows - df.parse <- NULL - for(k in 1:dim(df)[1]){ - g <- strsplit(as.character(df$geneID[k]), "/")[[1]] - gg <- df[rep(k, length(g)),] - gg$geneParse <- g - if(is.null(df.parse)){ - df.parse <- gg - } else { - df.parse <- rbind(df.parse, gg) - } - } - return(df.parse) -} - - -#' Writes out original and split clusterprofiler results -#' -#' @param res clusterProfiler results -#' @param cprof.folder Directory in which to save files. Will be created if needed. -#' @param Label to use for the results. Will generate a filename -#' cprof.folder/label.txt and cprof.folder/label_split.txt -#' -#' @return List of the two files that are created on disk. -write.clusterprofiler.results <- function(res, cprof.folder, label){ - dir.create(cprof.folder, showWarnings=FALSE, recursive=TRUE) - filename.orig <- file.path(cprof.folder, paste0(label, '.txt')) - write.table(res, file=filename.orig, sep='\t', quote=FALSE, row.names=FALSE) - filename.split <- file.path(cprof.folder, paste0(label, '_split.txt')) - res.split <- split.clusterProfiler.results(res) - write.table(res.split, file=filename.split, sep='\t', quote=FALSE, row.names=FALSE) - return(list(orig=filename.orig, split=filename.split)) -} -``` From 4cec15dd879ed593971ab591fb619721a7251241 Mon Sep 17 00:00:00 2001 From: Caroline Esnault Date: Thu, 24 Sep 2020 08:58:15 -0400 Subject: [PATCH 03/19] configs for large test dataset --- .../reference_configs/Test_Homo_sapiens.yaml | 44 ++++++ workflows/chipseq/config/config.yaml | 137 +++++++++--------- workflows/chipseq/config/sampletable.tsv | 28 ++-- workflows/rnaseq/config/config.yaml | 47 +++--- workflows/rnaseq/config/sampletable.tsv | 42 +++++- 5 files changed, 192 insertions(+), 106 deletions(-) create mode 100644 include/reference_configs/Test_Homo_sapiens.yaml diff --git a/include/reference_configs/Test_Homo_sapiens.yaml b/include/reference_configs/Test_Homo_sapiens.yaml new file mode 100644 index 000000000..f8392fb6b --- /dev/null +++ b/include/reference_configs/Test_Homo_sapiens.yaml @@ -0,0 +1,44 @@ +references: + human: + small-gencode-v28: + metadata: + reference_genome_build: 'hg38.p12' + reference_effective_genome_count: 2.7e9 + reference_effective_genome_proportion: 0.87 + + genome: + url: 'https://github.com/lcdb/lcdb-test-data-human/raw/master/seq/hg38.small.fa' + postprocess: + function: 'lib.postprocess.utils.convert_fasta_chroms' + args: 'https://raw.githubusercontent.com/NICHD-BSPC/chrom-name-mappings/d73fdd4d62ca7e845f9357ea5f08d7a918c17e97/mappings/human/gencode_GRCh38.28_to_ucsc_hg38/mappings_gencode_GRCh38.28_to_ucsc_hg38.tsv' + + indexes: + - 'hisat2' + - 'bowtie2' + + annotation: + url: 'https://github.com/lcdb/lcdb-test-data-human/raw/master/annotation/hg38.small.gtf' + postprocess: + function: 'lib.postprocess.utils.convert_gtf_chroms' + args: 'https://raw.githubusercontent.com/NICHD-BSPC/chrom-name-mappings/d73fdd4d62ca7e845f9357ea5f08d7a918c17e97/mappings/human/gencode_GRCh38.28_to_ucsc_hg38/mappings_gencode_GRCh38.28_to_ucsc_hg38.tsv' + conversions: + - 'refflat' + - 'bed12' + + transcriptome: + indexes: + - 'salmon' + + + rRNA: + genome: + url: + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz' + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz' + postprocess: + function: 'lib.common.filter_fastas' + args: 'Homo sapiens' + indexes: + - 'hisat2' + - 'bowtie2' + diff --git a/workflows/chipseq/config/config.yaml b/workflows/chipseq/config/config.yaml index 287119737..9c159be56 100644 --- a/workflows/chipseq/config/config.yaml +++ b/workflows/chipseq/config/config.yaml @@ -5,7 +5,7 @@ sampletable: 'config/sampletable.tsv' # Which key in the `references` dict below to use -organism: 'dmel' +organism: 'human' # If not specified here, use the environment variable REFERENCES_DIR. references_dir: 'references_data' @@ -37,103 +37,108 @@ chipseq: # at least a BED file of peaks. # peak_calling: - - label: gaf-embryo-sicer - algorithm: sicer + - label: BRD4_dBET6_1 + algorithm: macs2 ip: - - gaf-embryo-1 + - BRD4_dBET6_1 control: - - input-embryo-1 - redundancy_threshold: 1 - window_size: 200 - fragment_size: 150 + - mockIgG_dBET6_1 # optional user-specified override mappable genome proportion if # specified here, SICER will use this value instead of the value specific # to the genome build if NOT specified here, SICER will use the # mappability value for your genome build - effective_genome_fraction: 0.75 - genome_build: dm6 - gap_size: 600 - fdr: 0.01 + #effective_genome_count: 7e7 + extra: '--nomodel -p 0.05 --cutoff-analysis' # --broad for histones, paper says in ‘histone’ mode for MTHFD1, but got very small number of peaks with --broad + - label: BRD4_dBET6_2 + algorithm: macs2 + ip: + - BRD4_dBET6_2 + control: + - mockIgG_dBET6_2 + extra: '--nomodel -p 0.05 --cutoff-analysis' - - label: gaf-embryo-1 + - label: BRD4_DMSO_1 algorithm: macs2 ip: - - gaf-embryo-1 + - BRD4_DMSO_1 control: - - input-embryo-1 - # optional user-specified override mappable genome size if specified - # here, MACS will use this value instead of the value specific to the - # genome build if NOT specified here, MACS will use the mappability value - # for your genome build - effective_genome_count: 7e7 - extra: '--nomodel --extsize 147' - - - label: gaf-embryo-1 - algorithm: spp + - mockIgG_DMSO_1 + extra: '--nomodel -p 0.05 --cutoff-analysis' + + - label: BRD4_DMSO_2 + algorithm: macs2 ip: - - gaf-embryo-1 + - BRD4_DMSO_2 control: - - input-embryo-1 - extra: - fdr: 0.3 - zthr: 4 + - mockIgG_DMSO_2 + extra: '--nomodel -p 0.05 --cutoff-analysis' - - label: gaf-embryo-1-defaults - algorithm: spp + - label: MTHFD1_dBET6_1 + algorithm: macs2 ip: - - gaf-embryo-1 + - MTHFD1_dBET6_1 control: - - input-embryo-1 + - mockIgG_dBET6_1 + extra: '--nomodel -p 0.05 --cutoff-analysis' # --broad for histones, paper says in ‘histone’ mode for MTHFD1 - - label: gaf-wingdisc-pooled + - label: MTHFD1_dBET6_1_inputCTRL algorithm: macs2 ip: - - gaf-wingdisc-1 - - gaf-wingdisc-2 + - MTHFD1_dBET6_1 control: - - input-wingdisc-1 - - input-wingdisc-2 - extra: '--nomodel --extsize 147' + - input_dBET6_1 + extra: '--nomodel -p 0.05 --cutoff-analysis' - - label: gaf-wingdisc-pooled - algorithm: spp + - label: MTHFD1_dBET6_2 + algorithm: macs2 ip: - - gaf-wingdisc-1 - - gaf-wingdisc-2 + - MTHFD1_dBET6_2 control: - - input-wingdisc-1 - # - input-wingdisc-2 - extra: - fdr: 0.5 - zthr: 4 + - mockIgG_dBET6_2 + extra: '--nomodel -p 0.05 --cutoff-analysis' + + - label: MTHFD1_DMSO_1 + algorithm: macs2 + ip: + - MTHFD1_DMSO_1 + control: + - mockIgG_DMSO_1 + extra: '--nomodel -p 0.05 --cutoff-analysis' + + - label: MTHFD1_DMSO_2 + algorithm: macs2 + ip: + - MTHFD1_DMSO_2 + control: + - mockIgG_DMSO_2 + extra: '--nomodel -p 0.05 --cutoff-analysis' fastq_screen: - label: rRNA - organism: dmel - tag: test + organism: human + tag: small-gencode-v28 - label: PhiX organism: phix tag: default - - label: Fly - organism: dmel - tag: test - -merged_bigwigs: - input-wingdisc: - - input-wingdisc-1 - - input-wingdisc-2 - gaf-wingdisc: - - gaf-wingdisc-1 - - gaf-wingdisc-2 - gaf-embryo: - - gaf-embryo-1 + - label: Human + organism: human + tag: small-gencode-v28 + +#merged_bigwigs: +# input-wingdisc: +# - input-wingdisc-1 +# - input-wingdisc-2 +# gaf-wingdisc: +# - gaf-wingdisc-1 +# - gaf-wingdisc-2 +# gaf-embryo: +# - gaf-embryo-1 aligner: index: 'bowtie2' - tag: 'test' + tag: 'small-gencode-v28' include_references: - '../../include/reference_configs/PhiX.yaml' - - '../../include/reference_configs/Drosophila_melanogaster.yaml' - - '../../include/reference_configs/test.yaml' + - '../../include/reference_configs/Test_Homo_sapiens.yaml' diff --git a/workflows/chipseq/config/sampletable.tsv b/workflows/chipseq/config/sampletable.tsv index 05212460d..603f201e5 100644 --- a/workflows/chipseq/config/sampletable.tsv +++ b/workflows/chipseq/config/sampletable.tsv @@ -1,11 +1,17 @@ -# Samplenames with the same "label" will be considered technical replicates -samplename antibody biological_material replicate label orig_filename -input_1 input wingdisc-1 1 input-wingdisc-1 data/example_data/chipseq_input1.fq.gz -input_2 input wingdisc-2 2 input-wingdisc-2 data/example_data/chipseq_input2.fq.gz -ip_1 gaf wingdisc-1 1 gaf-wingdisc-1 data/example_data/chipseq_ip1.fq.gz -ip_2 gaf wingdisc-2 2 gaf-wingdisc-2 data/example_data/chipseq_ip2.fq.gz - -# Note here we are treating ip_3 and ip_4 as technical replicates for the sake of testing -ip_3 gaf embryo-1 1 gaf-embryo-1 data/example_data/chipseq_ip3.fq.gz -ip_4 gaf embryo-1 1 gaf-embryo-1 data/example_data/chipseq_ip4.fq.gz -input_3 input embryo-1 1 input-embryo-1 data/example_data/chipseq_input3.fq.gz +samplename label biological_material group GEO_Run LibraryLayout source_name Treatment antibody replicate orig_filename +BRD4_dBET6_1 BRD4_dBET6_1 HAP1_B1 BRD4_dBET6 SRR6202977 SINGLE ChIP-seq for BRD4 in HAP1 cell line treated with dBET6 dBET6 BRD4 1 data/example_data/BRD4_dBET6_1.small_R1.fastq.gz +BRD4_dBET6_2 BRD4_dBET6_2 HAP1_B2 BRD4_dBET6 SRR6202978 SINGLE ChIP-seq for BRD4 in HAP1 cell line treated with dBET6 dBET6 BRD4 2 data/example_data/BRD4_dBET6_2.small_R1.fastq.gz +BRD4_DMSO_1 BRD4_DMSO_1 HAP1_D1 BRD4_DMSO SRR6202979 SINGLE ChIP-seq for BRD4 in HAP1 cell line treated with DMSO DMSO BRD4 1 data/example_data/BRD4_DMSO_1.small_R1.fastq.gz +BRD4_DMSO_2 BRD4_DMSO_2 HAP1_D2 BRD4_DMSO SRR6202980 SINGLE ChIP-seq for BRD4 in HAP1 cell line treated with DMSO DMSO BRD4 2 data/example_data/BRD4_DMSO_2.small_R1.fastq.gz +mockIgG_dBET6_1 mockIgG_dBET6_1 HAP1_B1 mockIgG_dBET6 SRR6202981 SINGLE ChIP-seq with mock IgG antibody in HAP1 cell line treated with dBET6 dBET6 mockIgG 1 data/example_data/mockIgG_dBET6_1.small_R1.fastq.gz +mockIgG_dBET6_2 mockIgG_dBET6_2 HAP1_B2 mockIgG_dBET6 SRR6202982 SINGLE ChIP-seq with mock IgG antibody in HAP1 cell line treated with dBET6 dBET6 mockIgG 2 data/example_data/mockIgG_dBET6_2.small_R1.fastq.gz +mockIgG_DMSO_1 mockIgG_DMSO_1 HAP1_D1 mockIgG_DMSO SRR6202983 SINGLE ChIP-seq with mock IgG antibody in HAP1 cell line treated with DMSO DMSO mockIgG 1 data/example_data/mockIgG_DMSO_1.small_R1.fastq.gz +mockIgG_DMSO_2 mockIgG_DMSO_2 HAP1_D2 mockIgG_DMSO SRR6202984 SINGLE ChIP-seq with mock IgG antibody in HAP1 cell line treated with DMSO DMSO mockIgG 2 data/example_data/mockIgG_DMSO_2.small_R1.fastq.gz +input_dBET6_1 input_dBET6_1 HAP1_B1 input_dBET6 SRR6202985 SINGLE ChIP-seq for Input in HAP1 cell line treated with dBET6 dBET6 input 1 data/example_data/input_dBET6_1.small_R1.fastq.gz +input_dBET6_2 input_dBET6_2 HAP1_B2 input_dBET6 SRR6202986 SINGLE ChIP-seq for Input in HAP1 cell line treated with dBET6 dBET6 input 2 data/example_data/input_dBET6_2.small_R1.fastq.gz +input_DMSO_1 input_DMSO_1 HAP1_D1 input_DMSO SRR6202987 SINGLE ChIP-seq for Input in HAP1 cell line treated with DMSO DMSO input 1 data/example_data/input_DMSO_1.small_R1.fastq.gz +input_DMSO_2 input_DMSO_2 HAP1_D2 input_DMSO SRR6202988 SINGLE ChIP-seq for Input in HAP1 cell line treated with DMSO DMSO input 2 data/example_data/input_DMSO_2.small_R1.fastq.gz +MTHFD1_dBET6_1 MTHFD1_dBET6_1 HAP1_B1 MTHFD1_dBET6 SRR6202989 SINGLE ChIP-seq for MTHFD1 in HAP1 cell line treated with dBET6 dBET6 MTHFD1 1 data/example_data/MTHFD1_dBET6_1.small_R1.fastq.gz +MTHFD1_dBET6_2 MTHFD1_dBET6_2 HAP1_B2 MTHFD1_dBET6 SRR6202990 SINGLE ChIP-seq for MTHFD1 in HAP1 cell line treated with dBET6 dBET6 MTHFD1 2 data/example_data/MTHFD1_dBET6_2.small_R1.fastq.gz +MTHFD1_DMSO_1 MTHFD1_DMSO_1 HAP1_D1 MTHFD1_DMSO SRR6202991 SINGLE ChIP-seq for MTHFD1 in HAP1 cell line treated with DMSO DMSO MTHFD1 1 data/example_data/MTHFD1_DMSO_1.small_R1.fastq.gz +MTHFD1_DMSO_2 MTHFD1_DMSO_2 HAP1_D2 MTHFD1_DMSO SRR6202992 SINGLE ChIP-seq for MTHFD1 in HAP1 cell line treated with DMSO DMSO MTHFD1 2 data/example_data/MTHFD1_DMSO_2.small_R1.fastq.gz diff --git a/workflows/rnaseq/config/config.yaml b/workflows/rnaseq/config/config.yaml index c0de423e7..490948db4 100644 --- a/workflows/rnaseq/config/config.yaml +++ b/workflows/rnaseq/config/config.yaml @@ -3,53 +3,52 @@ sampletable: 'config/sampletable.tsv' patterns: 'config/rnaseq_patterns.yaml' # Which key in the `references` dict below to use -organism: 'dmel' +organism: 'human' # If not specified here, use the environment variable REFERENCES_DIR. references_dir: 'references_data' aligner: index: 'hisat2' - tag: 'test' + tag: 'small-gencode-v28' rrna: index: 'bowtie2' tag: 'rRNA' gtf: - tag: "test" + tag: "small-gencode-v28" salmon: - tag: "test" + tag: "small-gencode-v28" fastq_screen: - label: rRNA - organism: dmel - tag: test + organism: human + tag: rRNA - label: PhiX organism: phix tag: default - - label: Fly - organism: dmel - tag: test - -merged_bigwigs: - control_pos: - pos: - - sample1 - - sample2 - treatment_all: - pos: - - sample3 - - sample4 - neg: - - sample3 - - sample4 + - label: human + organism: human + tag: small-gencode-v28 + +#merged_bigwigs: +# control_pos: +# pos: +# - sample1 +# - sample2 +# treatment_all: +# pos: +# - sample3 +# - sample4 +# neg: +# - sample3 +# - sample4 # See the reference config files in the top level of the repo, # include/reference_configs, for inspiration for more species. include_references: - - '../../include/reference_configs/test.yaml' - '../../include/reference_configs/PhiX.yaml' - - '../../include/reference_configs/Drosophila_melanogaster.yaml' + - '../../include/reference_configs/Test_Homo_sapiens.yaml' diff --git a/workflows/rnaseq/config/sampletable.tsv b/workflows/rnaseq/config/sampletable.tsv index fad727e78..e1b8ee262 100644 --- a/workflows/rnaseq/config/sampletable.tsv +++ b/workflows/rnaseq/config/sampletable.tsv @@ -1,5 +1,37 @@ -samplename group layout orig_filename -sample1 control SE data/example_data/rnaseq_sample1PE_1.fq.gz -sample2 control SE data/example_data/rnaseq_sample2.fq.gz -sample3 treatment SE data/example_data/rnaseq_sample3.fq.gz -sample4 treatment SE data/example_data/rnaseq_sample4.fq.gz +samplename GEO_Run layout group cell_line_group cell_Line treatment timepoint repl biorep orig_filename +A549_dBet6_1h_1 SRR6354081 SE dBet6_1h A549_dBet6_1h A549 dBet6 1h 1 A549_dBet6_1h_1 data/example_data/A549_dBet6_1h_1.small_R1.fastq.gz +A549_dBet6_1h_2 SRR6354082 SE dBet6_1h A549_dBet6_1h A549 dBet6 1h 2 A549_dBet6_1h_1 data/example_data/A549_dBet6_1h_2.small_R1.fastq.gz +A549_dBet6_6h_1 SRR6354083 SE dBet6_6h A549_dBet6_6h A549 dBet6 6h 1 A549_dBet6_6h_1 data/example_data/A549_dBet6_6h_2.small_R1.fastq.gz +A549_dBet6_6h_2 SRR6354084 SE dBet6_6h A549_dBet6_6h A549 dBet6 6h 2 A549_dBet6_6h_1 data/example_data/A549_dBet6_6h_1.small_R1.fastq.gz +A549_DMSO_1h_1 SRR6354085 SE DMSO_1h A549_DMSO_1h A549 DMSO 1h 1 A549_DMSO_1h_1 data/example_data/A549_DMSO_1h_1.small_R1.fastq.gz +A549_DMSO_1h_2 SRR6354086 SE DMSO_1h A549_DMSO_1h A549 DMSO 1h 2 A549_DMSO_1h_2 data/example_data/A549_DMSO_1h_2.small_R1.fastq.gz +A549_DMSO_6h_1 SRR6354087 SE DMSO_6h A549_DMSO_6h A549 DMSO 6h 1 A549_DMSO_6h_1 data/example_data/A549_DMSO_6h_1.small_R1.fastq.gz +A549_DMSO_6h_2 SRR6354088 SE DMSO_6h A549_DMSO_6h A549 DMSO 6h 2 A549_DMSO_6h_2 data/example_data/A549_DMSO_6h_2.small_R1.fastq.gz +A549_JQ1_1h_1 SRR6354089 SE JQ1_1h A549_JQ1_1h A549 JQ1 1h 1 A549_JQ1_1h_1 data/example_data/A549_JQ1_1h_1.small_R1.fastq.gz +A549_JQ1_1h_2 SRR6354090 SE JQ1_1h A549_JQ1_1h A549 JQ1 1h 2 A549_JQ1_1h_2 data/example_data/A549_JQ1_1h_2.small_R1.fastq.gz +A549_JQ1_6h_1 SRR6354091 SE JQ1_6h A549_JQ1_6h A549 JQ1 6h 1 A549_JQ1_6h_1 data/example_data/A549_JQ1_6h_1.small_R1.fastq.gz +A549_JQ1_6h_2 SRR6354092 SE JQ1_6h A549_JQ1_6h A549 JQ1 6h 2 A549_JQ1_6h_2 data/example_data/A549_JQ1_6h_2.small_R1.fastq.gz +HAP1_dBet6_1h_1 SRR6354109 SE dBet6_1h HAP1_dBet6_1h HAP1 dBet6 1h 1 HAP1_dBet6_1h_1 data/example_data/HAP1_dBet6_1h_1.small_R1.fastq.gz +HAP1_dBet6_1h_2 SRR6354110 SE dBet6_1h HAP1_dBet6_1h HAP1 dBet6 1h 2 HAP1_dBet6_1h_2 data/example_data/HAP1_dBet6_1h_2.small_R1.fastq.gz +HAP1_dBet6_6h_1 SRR6354111 SE dBet6_6h HAP1_dBet6_6h HAP1 dBet6 6h 1 HAP1_dBet6_6h_1 data/example_data/HAP1_dBet6_6h_1.small_R1.fastq.gz +HAP1_dBet6_6h_2 SRR6354112 SE dBet6_6h HAP1_dBet6_6h HAP1 dBet6 6h 2 HAP1_dBet6_6h_2 data/example_data/HAP1_dBet6_6h_2.small_R1.fastq.gz +HAP1_DMSO_1h_1 SRR6354113 SE DMSO_1h HAP1_DMSO_1h HAP1 DMSO 1h 1 HAP1_DMSO_1h_1 data/example_data/HAP1_DMSO_1h_1.small_R1.fastq.gz +HAP1_DMSO_1h_2 SRR6354114 SE DMSO_1h HAP1_DMSO_1h HAP1 DMSO 1h 2 HAP1_DMSO_1h_2 data/example_data/HAP1_DMSO_1h_2.small_R1.fastq.gz +HAP1_DMSO_6h_1 SRR6354115 SE DMSO_6h HAP1_DMSO_6h HAP1 DMSO 6h 1 HAP1_DMSO_6h_1 data/example_data/HAP1_DMSO_6h_1.small_R1.fastq.gz +HAP1_DMSO_6h_2 SRR6354116 SE DMSO_6h HAP1_DMSO_6h HAP1 DMSO 6h 2 HAP1_DMSO_6h_2 data/example_data/HAP1_DMSO_6h_2.small_R1.fastq.gz +HAP1_JQ1_1h_1 SRR6354117 SE JQ1_1h HAP1_JQ1_1h HAP1 JQ1 1h 1 HAP1_JQ1_1h_1 data/example_data/HAP1_JQ1_1h_1.small_R1.fastq.gz +HAP1_JQ1_1h_2 SRR6354118 SE JQ1_1h HAP1_JQ1_1h HAP1 JQ1 1h 2 HAP1_JQ1_1h_2 data/example_data/HAP1_JQ1_1h_2.small_R1.fastq.gz +HAP1_JQ1_6h_1 SRR6354119 SE JQ1_6h HAP1_JQ1_6h HAP1 JQ1 6h 1 HAP1_JQ1_6h_1 data/example_data/HAP1_JQ1_6h_1.small_R1.fastq.gz +HAP1_JQ1_6h_2 SRR6354120 SE JQ1_6h HAP1_JQ1_6h HAP1 JQ1 6h 2 HAP1_JQ1_6h_2 data/example_data/HAP1_JQ1_6h_2.small_R1.fastq.gz +outlier_HAP1_dBet6_1h_1 SRR6354137 SE dBet6_1h HAP1_dBet6_1h HAP1 dBet6 1h 1 HAP1_dBet6_1h_1 data/example_data/outlier_HAP1_dBet6_1h_1.small_R1.fastq.gz +K562_dBet6_1h_2 SRR6354138 SE dBet6_1h K562_dBet6_1h K562 dBet6 1h 2 K562_dBet6_1h_2 data/example_data/K562_dBet6_1h_2.small_R1.fastq.gz +K562_dBet6_6h_1 SRR6354139 SE dBet6_6h K562_dBet6_6h K562 dBet6 6h 1 K562_dBet6_6h_1 data/example_data/K562_dBet6_6h_1.small_R1.fastq.gz +K562_dBet6_6h_2 SRR6354140 SE dBet6_6h K562_dBet6_6h K562 dBet6 6h 2 K562_dBet6_6h_2 data/example_data/K562_dBet6_6h_2.small_R1.fastq.gz +K562_DMSO_1h_1 SRR6354141 SE DMSO_1h K562_DMSO_1h K562 DMSO 1h 1 K562_DMSO_1h_1 data/example_data/K562_DMSO_1h_1.small_R1.fastq.gz +K562_DMSO_1h_2 SRR6354142 SE DMSO_1h K562_DMSO_1h K562 DMSO 1h 2 K562_DMSO_1h_2 data/example_data/K562_DMSO_1h_2.small_R1.fastq.gz +K562_DMSO_6h_1 SRR6354143 SE DMSO_6h K562_DMSO_6h K562 DMSO 6h 1 K562_DMSO_6h_1 data/example_data/K562_DMSO_6h_1.small_R1.fastq.gz +K562_DMSO_6h_2 SRR6354144 SE DMSO_6h K562_DMSO_6h K562 DMSO 6h 2 K562_DMSO_6h_2 data/example_data/K562_DMSO_6h_2.small_R1.fastq.gz +K562_JQ1_1h_1 SRR6354145 SE JQ1_1h K562_JQ1_1h K562 JQ1 1h 1 K562_JQ1_1h_1 data/example_data/K562_JQ1_1h_1.small_R1.fastq.gz +K562_JQ1_1h_2 SRR6354146 SE JQ1_1h K562_JQ1_1h K562 JQ1 1h 2 K562_JQ1_1h_2 data/example_data/K562_JQ1_1h_2.small_R1.fastq.gz +K562_JQ1_6h_1 SRR6354147 SE JQ1_6h K562_JQ1_6h K562 JQ1 6h 1 K562_JQ1_6h_1 data/example_data/K562_JQ1_6h_1.small_R1.fastq.gz +K562_JQ1_6h_2 SRR6354148 SE JQ1_6h K562_JQ1_6h K562 JQ1 6h 2 K562_JQ1_6h_2 data/example_data/K562_JQ1_6h_2.small_R1.fastq.gz From 39960a62fdd3027428d23db479f07d3211cc6bf4 Mon Sep 17 00:00:00 2001 From: Caroline Esnault Date: Thu, 24 Sep 2020 08:58:58 -0400 Subject: [PATCH 04/19] update ci for large test dataset --- ci/get-data.py | 70 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 53 insertions(+), 17 deletions(-) diff --git a/ci/get-data.py b/ci/get-data.py index cd2d356b1..87115e144 100755 --- a/ci/get-data.py +++ b/ci/get-data.py @@ -5,7 +5,7 @@ shell.executable('/bin/bash') BRANCH = 'master' -URL = 'https://github.com/lcdb/lcdb-test-data/blob/{0}/data/{{}}?raw=true'.format(BRANCH) +URL = 'https://github.com/lcdb/lcdb-test-data-human/blob/{0}/{{}}?raw=true'.format(BRANCH) def _download_file(fn, dest=None): @@ -17,21 +17,57 @@ def _download_file(fn, dest=None): shell('wget -q -O- {url} > {dest}') return dest +_download_file('rnaseq_samples/A549_DMSO_1h_1/A549_DMSO_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_DMSO_1h_1.small_R1.fastq.gz') +_download_file('rnaseq_samples/A549_DMSO_1h_2/A549_DMSO_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_DMSO_1h_2.small_R1.fastq.gz') +_download_file('rnaseq_samples/A549_DMSO_6h_1/A549_DMSO_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_DMSO_6h_1.small_R1.fastq.gz') +_download_file('rnaseq_samples/A549_DMSO_6h_2/A549_DMSO_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_DMSO_6h_2.small_R1.fastq.gz') +_download_file('rnaseq_samples/A549_JQ1_1h_1/A549_JQ1_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_JQ1_1h_1.small_R1.fastq.gz') +_download_file('rnaseq_samples/A549_JQ1_1h_2/A549_JQ1_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_JQ1_1h_2.small_R1.fastq.gz') +_download_file('rnaseq_samples/A549_JQ1_6h_1/A549_JQ1_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_JQ1_6h_1.small_R1.fastq.gz') +_download_file('rnaseq_samples/A549_JQ1_6h_2/A549_JQ1_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_JQ1_6h_2.small_R1.fastq.gz') +_download_file('rnaseq_samples/A549_dBet6_1h_1/A549_dBet6_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_dBet6_1h_1.small_R1.fastq.gz') +_download_file('rnaseq_samples/A549_dBet6_1h_2/A549_dBet6_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_dBet6_1h_2.small_R1.fastq.gz') +_download_file('rnaseq_samples/A549_dBet6_6h_1/A549_dBet6_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_dBet6_6h_1.small_R1.fastq.gz') +_download_file('rnaseq_samples/A549_dBet6_6h_2/A549_dBet6_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_dBet6_6h_2.small_R1.fastq.gz') +_download_file('rnaseq_samples/HAP1_DMSO_1h_1/HAP1_DMSO_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_DMSO_1h_1.small_R1.fastq.gz') +_download_file('rnaseq_samples/HAP1_DMSO_1h_2/HAP1_DMSO_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_DMSO_1h_2.small_R1.fastq.gz') +_download_file('rnaseq_samples/HAP1_DMSO_6h_1/HAP1_DMSO_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_DMSO_6h_1.small_R1.fastq.gz') +_download_file('rnaseq_samples/HAP1_DMSO_6h_2/HAP1_DMSO_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_DMSO_6h_2.small_R1.fastq.gz') +_download_file('rnaseq_samples/HAP1_JQ1_1h_1/HAP1_JQ1_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_JQ1_1h_1.small_R1.fastq.gz') +_download_file('rnaseq_samples/HAP1_JQ1_1h_2/HAP1_JQ1_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_JQ1_1h_2.small_R1.fastq.gz') +_download_file('rnaseq_samples/HAP1_JQ1_6h_1/HAP1_JQ1_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_JQ1_6h_1.small_R1.fastq.gz') +_download_file('rnaseq_samples/HAP1_JQ1_6h_2/HAP1_JQ1_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_JQ1_6h_2.small_R1.fastq.gz') +_download_file('rnaseq_samples/HAP1_dBet6_1h_1/HAP1_dBet6_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_dBet6_1h_1.small_R1.fastq.gz') +_download_file('rnaseq_samples/HAP1_dBet6_1h_2/HAP1_dBet6_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_dBet6_1h_2.small_R1.fastq.gz') +_download_file('rnaseq_samples/HAP1_dBet6_6h_1/HAP1_dBet6_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_dBet6_6h_1.small_R1.fastq.gz') +_download_file('rnaseq_samples/HAP1_dBet6_6h_2/HAP1_dBet6_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_dBet6_6h_2.small_R1.fastq.gz') +_download_file('rnaseq_samples/K562_DMSO_1h_1/K562_DMSO_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_DMSO_1h_1.small_R1.fastq.gz') +_download_file('rnaseq_samples/K562_DMSO_1h_2/K562_DMSO_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_DMSO_1h_2.small_R1.fastq.gz') +_download_file('rnaseq_samples/K562_DMSO_6h_1/K562_DMSO_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_DMSO_6h_1.small_R1.fastq.gz') +_download_file('rnaseq_samples/K562_DMSO_6h_2/K562_DMSO_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_DMSO_6h_2.small_R1.fastq.gz') +_download_file('rnaseq_samples/K562_JQ1_1h_1/K562_JQ1_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_JQ1_1h_1.small_R1.fastq.gz') +_download_file('rnaseq_samples/K562_JQ1_1h_2/K562_JQ1_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_JQ1_1h_2.small_R1.fastq.gz') +_download_file('rnaseq_samples/K562_JQ1_6h_1/K562_JQ1_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_JQ1_6h_1.small_R1.fastq.gz') +_download_file('rnaseq_samples/K562_JQ1_6h_2/K562_JQ1_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_JQ1_6h_2.small_R1.fastq.gz') +_download_file('rnaseq_samples/K562_dBet6_1h_2/K562_dBet6_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_dBet6_1h_2.small_R1.fastq.gz') +_download_file('rnaseq_samples/K562_dBet6_6h_1/K562_dBet6_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_dBet6_6h_1.small_R1.fastq.gz') +_download_file('rnaseq_samples/K562_dBet6_6h_2/K562_dBet6_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_dBet6_6h_2.small_R1.fastq.gz') +_download_file('rnaseq_samples/outlier_HAP1_dBet6_1h_1/outlier_HAP1_dBet6_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/outlier_HAP1_dBet6_1h_1.small_R1.fastq.gz') -_download_file('rnaseq_samples/sample1/sample1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample1.fq.gz') -_download_file('rnaseq_samples/sample2/sample2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample2.fq.gz') -_download_file('rnaseq_samples/sample3/sample3.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample3.fq.gz') -_download_file('rnaseq_samples/sample4/sample4.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample4.fq.gz') +_download_file('chipseq_samples/BRD4_DMSO_1/BRD4_DMSO_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/BRD4_DMSO_1.small_R1.fastq.gz') +_download_file('chipseq_samples/BRD4_DMSO_2/BRD4_DMSO_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/BRD4_DMSO_2.small_R1.fastq.gz') +_download_file('chipseq_samples/BRD4_dBET6_1/BRD4_dBET6_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/BRD4_dBET6_1.small_R1.fastq.gz') +_download_file('chipseq_samples/BRD4_dBET6_2/BRD4_dBET6_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/BRD4_dBET6_2.small_R1.fastq.gz') +_download_file('chipseq_samples/MTHFD1_DMSO_1/MTHFD1_DMSO_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/MTHFD1_DMSO_1.small_R1.fastq.gz') +_download_file('chipseq_samples/MTHFD1_DMSO_2/MTHFD1_DMSO_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/MTHFD1_DMSO_2.small_R1.fastq.gz') +_download_file('chipseq_samples/MTHFD1_dBET6_1/MTHFD1_dBET6_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/MTHFD1_dBET6_1.small_R1.fastq.gz') +_download_file('chipseq_samples/MTHFD1_dBET6_2/MTHFD1_dBET6_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/MTHFD1_dBET6_2.small_R1.fastq.gz') +_download_file('chipseq_samples/input_DMSO_1/input_DMSO_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/input_DMSO_1.small_R1.fastq.gz') +_download_file('chipseq_samples/input_DMSO_2/input_DMSO_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/input_DMSO_2.small_R1.fastq.gz') +_download_file('chipseq_samples/input_dBET6_1/input_dBET6_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/input_dBET6_1.small_R1.fastq.gz') +_download_file('chipseq_samples/input_dBET6_2/input_dBET6_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/input_dBET6_2.small_R1.fastq.gz') +_download_file('chipseq_samples/mockIgG_DMSO_1/mockIgG_DMSO_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/mockIgG_DMSO_1.small_R1.fastq.gz') +_download_file('chipseq_samples/mockIgG_DMSO_2/mockIgG_DMSO_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/mockIgG_DMSO_2.small_R1.fastq.gz') +_download_file('chipseq_samples/mockIgG_dBET6_1/mockIgG_dBET6_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/mockIgG_dBET6_1.small_R1.fastq.gz') +_download_file('chipseq_samples/mockIgG_dBET6_2/mockIgG_dBET6_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/mockIgG_dBET6_2.small_R1.fastq.gz') -_download_file('rnaseq_samples/sample1/sample1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample1PE_1.fq.gz') -_download_file('rnaseq_samples/sample1/sample1.small_R2.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample1PE_2.fq.gz') -_download_file('rnaseq_samples/sample2/sample2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample2PE_1.fq.gz') -_download_file('rnaseq_samples/sample2/sample2.small_R2.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample2PE_2.fq.gz') - -_download_file('chipseq_samples/input_1/input_1.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_input1.fq.gz') -_download_file('chipseq_samples/ip_1/ip_1.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_ip1.fq.gz') -_download_file('chipseq_samples/input_2/input_2.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_input2.fq.gz') -_download_file('chipseq_samples/ip_2/ip_2.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_ip2.fq.gz') -_download_file('chipseq_samples/ip_3/ip_3.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_ip3.fq.gz') -_download_file('chipseq_samples/ip_4/ip_4.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_ip4.fq.gz') -_download_file('chipseq_samples/input_3/input_3.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_input3.fq.gz') From 7d09db2d275a0fad65cdd3412e625d6ed2c742ac Mon Sep 17 00:00:00 2001 From: Caroline Esnault Date: Thu, 24 Sep 2020 08:59:45 -0400 Subject: [PATCH 05/19] add chipseq downstream diffbind --- workflows/chipseq/downstream/diffbind.Rmd | 492 ++++++++++++++++++++++ 1 file changed, 492 insertions(+) create mode 100644 workflows/chipseq/downstream/diffbind.Rmd diff --git a/workflows/chipseq/downstream/diffbind.Rmd b/workflows/chipseq/downstream/diffbind.Rmd new file mode 100644 index 000000000..6e6e3db6c --- /dev/null +++ b/workflows/chipseq/downstream/diffbind.Rmd @@ -0,0 +1,492 @@ +--- +title: Differential ChIP-seq peaks +output: + html_document: + code_folding: hide + toc: true + toc_float: true + toc_depth: 3 +--- + +## Changelog + +**Initial results** + +Last run: `r date()` + +```{r, include=FALSE} +knitr::opts_chunk$set(sep=TRUE, warning=FALSE, message=FALSE, + bootstrap.show.code=FALSE, bootstrap.show.output=FALSE, + cache.extra_file_dep_1=file.info('../config/sampletable.tsv')$mtime, + # try disabling this when running locally for nicer figures + #dev='bitmap', + fig.ext='png') +``` + +```{r, results='asis'} +subchunkify <- function(g) { + g_deparsed <- paste0(deparse( + function() {g} + ), collapse = '') + + sub_chunk <- paste0(" + `","``{r sub_chunk_", floor(runif(1) * 10000), ", fig.height=10, echo=FALSE}", + "\n(", + g_deparsed + , ")()", + "\n`","`` + ") + + cat(knitr::knit(text = knitr::knit_expand(text = sub_chunk), quiet = TRUE)) + } +``` + +```{r imports, include=FALSE} +library(DiffBind) +library(ggplot2) +library(BiocParallel) +library(AnnotationHub) +library(ChIPseeker) +library(dplyr) +``` + +```{r limit_cpus} +#register(MulticoreParam(workers=future::availableCores())) +register(MulticoreParam(workers=2)) +``` + +```{r load_helpers} +devtools::document('../../../lib/lcdbwf') +devtools::load_all('../../../lib/lcdbwf') +``` + +```{r annotationhub_setup} +annotation_genus_species <- 'Homo sapiens' +annotation_key_override <- NA +hub.cache <- '../../../include/AnnotationHubCache' +orgdb <- get.orgdb( + annotation_genus_species, + cache=hub.cache, + annotation_key_override=annotation_key_override +) +``` + +```{r txdb} +get.txdb <- function(species, cache, annotation_key_override=NA){ + + # Workaround to allow AnnotationHub to use proxy. See + # https://github.com/Bioconductor/AnnotationHub/issues/4, and thanks + # Wolfgang! + proxy <- Sys.getenv('http_proxy') + if (proxy == ""){ + proxy <- NULL + } + + ah <- AnnotationHub(hub=getAnnotationHubOption('URL'), + cache=cache, + proxy=proxy, + localHub=FALSE) + + find.annotationhub.name <- function(species.name, override.code) { #autodetect ah names based on loaded database + if (is.na(override.code)) { + ah.query <- query(ah, c("TxDb", "UCSC")) + ah.query.speciesmatch <- grepl(paste("^", species.name, "$", sep=""), ah.query$species) + ah.query.which <- which(ah.query.speciesmatch) + stopifnot(length(ah.query.which) > 0) #require at least one match + if (length(ah.query.which) > 1) { #warn of duplicate matches + print("WARNING: found multiple candidate species in AnnotationHub: "); + print(ah.query.speciesmatch) + } + names(ah.query)[ah.query.which[length(ah.query.which)]] + } else { + override.code + } + } + annotation_key <- find.annotationhub.name(annotation_genus_species, annotation_key_override) + txdb <- ah[[annotation_key]] + return(txdb) +} + +txdb <- get.txdb( + annotation_genus_species, + cache=hub.cache, + annotation_key_override=annotation_key_override +) +``` + +```{r coldata_setup} +threshold <- 0.01 +use_pval <- TRUE +sample.table.filename <- '../config/sampletable.tsv' +# PeakCaller is the peak file format for diffBind dba function +PeakCaller <- 'bed' +exclude.for.printing <- c('orig_filename', 'orig_filename_R2', 'bamReads', 'bamControl', 'Peaks', 'PeakCaller') +bam.path.func <- function (x) file.path('..', 'data', 'chipseq_merged', x, paste0(x, '.cutadapt.unique.nodups.merged.bam')) +peak.path.func <- function (x) file.path('..', 'data', 'chipseq_peaks', 'macs2', x, 'peaks.bed') + +colData <- read.table(sample.table.filename, sep='\t', header=TRUE, stringsAsFactors=FALSE) + +colData$bamReads <- sapply(colData$label, bam.path.func) +colData$Peaks <- sapply(colData$label, peak.path.func) +colData$PeakCaller <- PeakCaller + +factor.columns <- c('Treatment') +for (col in factor.columns){ + colData[[col]] <- as.factor(colData[[col]]) +} + +colData$Treatment <- relevel(colData$Treatment, ref='DMSO') +rownames(colData) <- colData[,1] + +# split by input vs. antibody +ab_ref <- 'mockIgG' +colDataInput <- colData %>% + filter(antibody == ab_ref) %>% + mutate(bamControl = bamReads, ControlID = label) %>% + distinct(label, .keep_all=TRUE) +colDataAB <- colData %>% filter(!antibody %in% c(ab_ref, 'input')) %>% +#colDataAB <- colData %>% filter(antibody == 'BRD4') %>% + distinct(label, .keep_all=TRUE) %>% + mutate(SampleID = samplename) + +# make sampletables for diffBind +st <- list() +for (ab in unique(colDataAB[['antibody']])) { + st[[ab]] <- merge(colDataAB %>% filter(antibody == ab), + colDataInput %>% dplyr::select(biological_material, bamControl, ControlID), + by='biological_material') +} + +``` + +## Experiment overview {.tabset} + +The ChIPseq samples analyzed here come from the SRA study [SRP120974](https://trace.ncbi.nlm.nih.gov/Traces/sra/?study=SRP120974), +published in [Sdelci et al., 2019](https://www.nature.com/articles/s41588-019-0413-z). + +HAP1 cells were treated with dBET6 or DMSO and ChIPseq was performed using histone acetyl-reader BRD4 or +the folate pathway enzyme MTHFD1. +ChIP-seq with mock IgG antibody were used as background. + +ChIPseq fastq files were processed through [lcdb-wf](https://github.com/lcdb/lcdb-wf) chipseq pipeline. + +Called peaks were then processed though [DiffBind](https://bioconductor.org/packages/release/bioc/html/DiffBind.html) +for differential binding analysis using the method `DBA_DESEQ2`. + +```{r, results='asis'} +if (use_pval) { + pval_attr <- 'pvalue' +} else { + pval_attr <- 'FDR' +} +cat(paste0('\n\nHere we use a **', pval_attr, '** threshold of ', threshold, + ' to determine peak significance.\n\n')) +``` + +For each antibody, a separate sampletable was used as `diffBind` input to identify differential binding. + +```{r setup, results='asis'} +data <- list() +for (ab in names(st)){ + data[[ab]] <- dba(sampleSheet=st[[ab]], + config=data.frame(th=threshold, bUsePval=use_pval)) + cat(paste0('\n\n### ', ab, '\n\n')) + subchunkify(knitr::kable(st[[ab]][, colnames(st[[ab]])[!colnames(st[[ab]]) %in% exclude.for.printing]])) +} +params <- list('args2'='diffPeaks_deseq2.tsv', + 'args3'='diffPeaks_deseq2.bed') +``` + +```{r deseq2, cache=TRUE, dependson='setup'} +# DiffBind count of reads and analysis of differential number of reads +# using the DESeq2 method +DBs <- list() +beds <- list() +for (ab in names(st)){ + lvls <- levels(st[[ab]]$Treatment) + data[[ab]] <- dba.count(data[[ab]], bParallel=TRUE) + data[[ab]] <- dba.contrast(data[[ab]], + group1=data[[ab]]$mask[[lvls[2]]], + group2=data[[ab]]$mask[[lvls[1]]], + name1=lvls[2], name2=lvls[1], minMembers=2) + data[[ab]] <- dba.analyze(data[[ab]], method=DBA_DESEQ2, bTagwise=FALSE, bSubControl=TRUE) + +# Generate a report +# specifying threshold, th=1, to include all peaks + DBs[[ab]] <- dba.report(data[[ab]], method=DBA_DESEQ2, th=1) + +# Write output table + write.table(DBs[[ab]], + file = paste(ab, '_', params$args2, sep=""), + sep = "\t", quote=FALSE, row.names=FALSE) + +# Write output in bed format + beds[[ab]] <- data.frame(iranges = DBs[[ab]]) + beds[[ab]]$newcol <- paste("Fold", beds[[ab]]$iranges.Fold, "FDR", + beds[[ab]]$iranges.FDR, sep="_") + + write.table(beds[[ab]][,c(1,2,3,12,9,10,11)], + file = paste(ab, '_',params$args3, sep=""), + quote=FALSE, row.names=FALSE, col.names=FALSE, sep = "\t") +} +``` + + +## Correlation between samples {.tabset} + +```{r deseq2plots, results='asis'} +# plot data +for(ab in names(data)){ + cat('\n\n### ', ab, '\n\n') + plot(data[[ab]], contrast=1) +} +``` + + +## Overlap of peaks between replicates {.tabset} + +Please note that the Venn diagram areas are not to scale. + +```{r venn, results='asis'} +for (ab in names(data)) { + cat('\n\n\n### ', ab, '{.tabset}\n\n') + for (lvl in levels(st[[ab]]$Treatment)) { + cat('\n\n#### ', lvl, ' samples\n\n') + dba.plotVenn(data[[ab]], data[[ab]]$masks[[lvl]]) + } +} +``` + + +## Analysis with DBA DESeq2 + +### Summary of results + +```{r} +thres = list() +diff_num = list() +for (ab in names(DBs)){ + thres[[ab]] <- data[[ab]]$config$th + diff_num[[ab]] <- sum(DBs[[ab]]$`p-value` < thres[[ab]]) +} +``` + +```{r} +thres <- list() +up <- list() +down <- list() +changed <- list() +for(ab in names(DBs)){ + thres[[ab]] <- data[[ab]]$config$th + if (use_pval) { + up[[ab]] <- sum(DBs[[ab]]$`p-value` < thres[[ab]] & DBs[[ab]]$Fold > 0) + down[[ab]] <- sum(DBs[[ab]]$`p-value` < thres[[ab]] & DBs[[ab]]$Fold < 0) + } else { + up[[ab]] <- sum(DBs[[ab]]$FDR < thres[ab] & DBs[[ab]]$Fold > 0) + down[[ab]] <- sum(DBs[[ab]]$FDR < thres[ab] & DBs[[ab]]$Fold < 0) + } + changed[[ab]] <- up[[ab]] + down[[ab]] +} +df <- data.frame(antibody = names(DBs), + threshold = unlist(thres), + up=unlist(up), + down=unlist(down), + changed=unlist(changed)) +knitr::kable(df, row.names=FALSE) +cat('\n') + +``` + + +## Overlap of differentially bound peaks + +```{r upset, results='asis'} +sel.list <- list() +sel.list[['up']] <- list() +sel.list[['down']] <- list() + +if (use_pval == TRUE) { + for (ab in names(DBs)){ + sel.list[['up']][[ab]] <- DBs[[ab]][DBs[[ab]]$`p-value` < thres[[ab]] & DBs[[ab]]$Fold > 0] + sel.list[['down']][[ab]] <- DBs[[ab]][DBs[[ab]]$`p-value` < thres[[ab]] & DBs[[ab]]$Fold < 0] + } +} else { + for (ab in names(DBs)){ + sel.list[['up']][[ab]] <- DBs[[ab]][DBs[[ab]]$`p-value` < thres[[ab]] & DBs[[ab]]$Fold > 0] + sel.list[['down']][[ab]] <- DBs[[ab]][DBs[[ab]]$`p-value` < thres[[ab]] & DBs[[ab]]$Fold < 0] + } +} + +# TODO: upset plots would be better here, but need to compute the overlap of peaks between contrasts +# vennplot from CHiPseeker accept a list of object, can be a list of GRanges or a list of vector, +# and returns a Venn plot of the overlap +cat('\n\n### Overlap of peaks with increased affinity\n\n') +if (length(sel.list[['up']]) > 1) { + vennplot(sel.list[['up']]) +} else { + cat('\n\nNot enough groups to plot overlap.\n\n') +} +cat('\n\n### Overlap of peaks with decreased affinity\n\n') +if (length(sel.list[['down']]) > 1) { + vennplot(sel.list[['down']]) +} else { + cat('\n\nNot enough groups to plot overlap.\n\n') +} + +``` + + +### Links to results + +```{r, results='asis'} +for (ab in names(DBs)){ + lvls <- levels(st[[ab]]$Treatment) + cat(paste0('\n\n#### **', lvls[2], ' vs. ', lvls[1], '**\n\n')) + cat(paste0('\n\n- TSV: [', lvls[2], '_diffPeaks_deseq2.tsv](', lvls[2], '_diffPeaks_deseq2.tsv)\n\n')) + cat(paste0('\n\n- BED: [', lvls[2], '_diffPeaks_deseq2.bed](', lvls[2], '_diffPeaks_deseq2.bed)\n\n')) +} +``` + +### Visualization of differential binding {.tabset} + +```{r plotting, results='asis', fig.height=10, fig.width=10} +for (ab in names(data)) { + cat('\n\n#### ', ab, '{.tabset}\n') + # plot PCA + cat('\n\n##### PCA plot\n') + dba.plotPCA(data[[ab]], contrast=1, label=DBA_ID) + # plot MA + cat('\n\n##### MA plot\n') + dba.plotMA(data[[ab]]) + # volcano plot + cat('\n\n##### Volcano plot\n') + dba.plotVolcano(data[[ab]]) + cat('\n\n##### Boxplot\n') + cat('\n\nThis is a boxplot of normalized reads in peaks.\n') + cat("\n\n- `+` indicates peaks with higher", ab, "in", data[[ab]][['contrasts']][[1]][['name2']], ".\n") + cat("\n- `-` indicates peaks with higher ", ab, "in", data[[ab]][['contrasts']][[1]][['name1']], ".\n") + pvals <- dba.plotBox(data[[ab]], bDBIncreased=FALSE,bDBDecreased=FALSE) + cat('\n\n##### Heatmap\n') + cat('\n\n\nThis is a heatmap of the differential peaks detected by our analysis.\n') + cat('\n- The color key shows the color map used to plot the heatmap.\n') + cat('\n- The histogram in the color key shows the distribution of scores shown in the heatmap.\n\n') + corvals <- dba.plotHeatmap(data[[ab]], contrast=1, correlations=FALSE) +} +``` + + +## Annotation to nearest gene {.tabset} + +Annotation was performed using `annotatePeak` from [CHiPseeker](http://bioconductor.org/packages/release/bioc/html/ChIPseeker.html). +Peaks were annotated to the nearest gene in [UCSC hg38 transcript-related features](http://bioconductor.org/packages/release/data/annotation/html/TxDb.Hsapiens.UCSC.hg38.knownGene.html). For promoters, the default TSS (transcription start site) region defined from -3kb to +3kb was used. + +According to CHiPseeker's documentation, the following features are reported: + +> The position and strand information of nearest genes are reported. The distance from peak to the TSS of its nearest gene is also reported. The genomic region of the peak is reported in annotation column. Since some annotation may overlap, ChIPseeker adopted the following priority in genomic annotation. +> +> - Promoter +> - 5’ UTR +> - 3’ UTR +> - Exon +> - Intron +> - Downstream +> - Intergenic +> +> Downstream is defined as the downstream of gene end. + +Only the gene corresponding to the annotation with the highest priority is reported in the output tables, +and the presence of the peak in the other annotation as TRUE/FALSE. +The order of priority can be modified in the code chunk below to obtain the genes from +other annotation categories. + + +```{r} +annot_order <- c("Promoter", "5UTR", "3UTR", "Exon", + "Intron", "Downstream", "Intergenic") +``` + + + +```{r chipseeker, results='asis', eval=TRUE} +annocols <- c("seqnames", "end", "annotation", "geneStart", "geneEnd", + "geneLength", "geneStrand", "geneId", "transcriptId", + "distanceToTSS") #, "ENSEMBL", "SYMBOL", "GENENAME") + +if (use_pval == TRUE) { + thcol <- 'iranges.p.value' +} else { + thcol <- 'iranges.FDR' +} + +for (ab in names(DBs)){ + cat(paste0('\n\n### ', ab, ' {.tabset}\n\n')) + # all peaks + bedfn <- paste0(ab, '_',params$args3) + cat('\n\n#### All\n\n') + peakAnno <- list() + peakAnno['all'] <- annotatePeak(bedfn, tssRegion=c(-3000, 3000), TxDb=txdb, verbose=FALSE, genomicAnnotationPriority = annot_order) # removed annoDb="hs.... + plotAnnoPie(peakAnno[['all']]) + print(upsetplot(peakAnno[['all']])) + # write bed with annotations for all peaks + write.table(as.data.frame(peakAnno['all']), + file = paste0(ab, '_annot_', params$args3), + quote=FALSE, row.names=FALSE, col.names=FALSE, sep = "\t") + # write tsv with annotations + tmpannot <- cbind(as.data.frame(peakAnno[['all']])[annocols], as.data.frame(peakAnno[['all']]@detailGenomicAnnotation)) + tsvannot <- merge(as.data.frame(DBs[[ab]]), tmpannot, + by=c('seqnames', 'end'), all.x=TRUE) + write.table(tsvannot, + file = paste0(ab, '_annot_', params$args2), + quote=FALSE, row.names=FALSE, sep = "\t") + + # peaks up + cat('\n\n#### Up\n\n') + write.table(beds[[ab]][beds[[ab]][[thcol]] < thres[[ab]] & beds[[ab]][['iranges.Fold']] > 0, + c(1,2,3,12,9,10,11)], + file = paste(ab, '_up_',params$args3, sep=""), + quote=FALSE, row.names=FALSE, col.names=FALSE, sep = "\t") + tryCatch( + { + peakAnno['up'] <- annotatePeak(paste(ab, '_up_',params$args3, sep=""), + tssRegion=c(-3000, 3000), TxDb=txdb, + verbose=FALSE, genomicAnnotationPriority = annot_order) + plotAnnoPie(peakAnno[['up']]) + print(upsetplot(peakAnno[['up']])) + }, + error = function(e){ + print('No line available in input') + }) + + # peaks down + cat('\n\n#### Down\n\n') + write.table(beds[[ab]][beds[[ab]][[thcol]] < thres[[ab]] & beds[[ab]][['iranges.Fold']] < 0, + c(1,2,3,12,9,10,11)], + file = paste(ab, '_down_',params$args3, sep=""), + quote=FALSE, row.names=FALSE, col.names=FALSE, sep = "\t") + tryCatch({ + peakAnno['down'] <- annotatePeak(paste(ab, '_down_',params$args3, sep=""), + tssRegion=c(-3000, 3000), TxDb=txdb, + verbose=FALSE, genomicAnnotationPriority = annot_order) + plotAnnoPie(peakAnno[['down']]) + print(upsetplot(peakAnno[['down']])) + }, + error = function(e){ + print('No line available in input') + }) + + + # compare the all/up/down distributions + cat('\n\n#### Pie charts comparison\n\n') + print(plotAnnoBar(peakAnno)) +} +``` + + +## sessionInfo + +For reproducibility purposes, here is the output of `sessionInfo` listing all packages +used in the analysis. + +```{r} +sessionInfo() +``` From 90cf34ab7408af05ee58ab158d389997c89e08d6 Mon Sep 17 00:00:00 2001 From: Caroline Esnault Date: Thu, 24 Sep 2020 09:05:43 -0400 Subject: [PATCH 06/19] update gitignore --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index 710061099..187a83322 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,11 @@ workflows/rnaseq/downstream/upset_plots workflows/rnaseq/downstream/final_clusters workflows/rnaseq/downstream/rnaseq.log workflows/rnaseq/downstream/*tsv +workflows/chipseq/downstream/diffbind.html +workflows/chipseq/downstream/diffbind_cache +workflows/chipseq/downstream/diffbind_files +workflows/chipseq/downstream/*tsv +workflows/chipseq/downstream/*bed workflows/chipseq/data workflows/rnaseq/data workflows/colocalization/results From ec36c5e9a64c9f039691d5fd0c9e2cdec556e175 Mon Sep 17 00:00:00 2001 From: Caroline Esnault Date: Thu, 24 Sep 2020 20:18:43 -0400 Subject: [PATCH 07/19] add back PE test samples --- ci/get-data.py | 113 ++++++++++++++++++++++++++----------------------- 1 file changed, 59 insertions(+), 54 deletions(-) diff --git a/ci/get-data.py b/ci/get-data.py index 87115e144..f3064b3d7 100755 --- a/ci/get-data.py +++ b/ci/get-data.py @@ -5,10 +5,11 @@ shell.executable('/bin/bash') BRANCH = 'master' -URL = 'https://github.com/lcdb/lcdb-test-data-human/blob/{0}/{{}}?raw=true'.format(BRANCH) +URL = 'https://github.com/lcdb/lcdb-test-data/blob/{0}/data/{{}}?raw=true'.format(BRANCH) +URLH = 'https://github.com/lcdb/lcdb-test-data-human/blob/{0}/{{}}?raw=true'.format(BRANCH) -def _download_file(fn, dest=None): +def _download_file(URL, fn, dest=None): url = URL.format(fn) if dest is None: dest = fn @@ -17,57 +18,61 @@ def _download_file(fn, dest=None): shell('wget -q -O- {url} > {dest}') return dest -_download_file('rnaseq_samples/A549_DMSO_1h_1/A549_DMSO_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_DMSO_1h_1.small_R1.fastq.gz') -_download_file('rnaseq_samples/A549_DMSO_1h_2/A549_DMSO_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_DMSO_1h_2.small_R1.fastq.gz') -_download_file('rnaseq_samples/A549_DMSO_6h_1/A549_DMSO_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_DMSO_6h_1.small_R1.fastq.gz') -_download_file('rnaseq_samples/A549_DMSO_6h_2/A549_DMSO_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_DMSO_6h_2.small_R1.fastq.gz') -_download_file('rnaseq_samples/A549_JQ1_1h_1/A549_JQ1_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_JQ1_1h_1.small_R1.fastq.gz') -_download_file('rnaseq_samples/A549_JQ1_1h_2/A549_JQ1_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_JQ1_1h_2.small_R1.fastq.gz') -_download_file('rnaseq_samples/A549_JQ1_6h_1/A549_JQ1_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_JQ1_6h_1.small_R1.fastq.gz') -_download_file('rnaseq_samples/A549_JQ1_6h_2/A549_JQ1_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_JQ1_6h_2.small_R1.fastq.gz') -_download_file('rnaseq_samples/A549_dBet6_1h_1/A549_dBet6_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_dBet6_1h_1.small_R1.fastq.gz') -_download_file('rnaseq_samples/A549_dBet6_1h_2/A549_dBet6_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_dBet6_1h_2.small_R1.fastq.gz') -_download_file('rnaseq_samples/A549_dBet6_6h_1/A549_dBet6_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_dBet6_6h_1.small_R1.fastq.gz') -_download_file('rnaseq_samples/A549_dBet6_6h_2/A549_dBet6_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_dBet6_6h_2.small_R1.fastq.gz') -_download_file('rnaseq_samples/HAP1_DMSO_1h_1/HAP1_DMSO_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_DMSO_1h_1.small_R1.fastq.gz') -_download_file('rnaseq_samples/HAP1_DMSO_1h_2/HAP1_DMSO_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_DMSO_1h_2.small_R1.fastq.gz') -_download_file('rnaseq_samples/HAP1_DMSO_6h_1/HAP1_DMSO_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_DMSO_6h_1.small_R1.fastq.gz') -_download_file('rnaseq_samples/HAP1_DMSO_6h_2/HAP1_DMSO_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_DMSO_6h_2.small_R1.fastq.gz') -_download_file('rnaseq_samples/HAP1_JQ1_1h_1/HAP1_JQ1_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_JQ1_1h_1.small_R1.fastq.gz') -_download_file('rnaseq_samples/HAP1_JQ1_1h_2/HAP1_JQ1_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_JQ1_1h_2.small_R1.fastq.gz') -_download_file('rnaseq_samples/HAP1_JQ1_6h_1/HAP1_JQ1_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_JQ1_6h_1.small_R1.fastq.gz') -_download_file('rnaseq_samples/HAP1_JQ1_6h_2/HAP1_JQ1_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_JQ1_6h_2.small_R1.fastq.gz') -_download_file('rnaseq_samples/HAP1_dBet6_1h_1/HAP1_dBet6_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_dBet6_1h_1.small_R1.fastq.gz') -_download_file('rnaseq_samples/HAP1_dBet6_1h_2/HAP1_dBet6_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_dBet6_1h_2.small_R1.fastq.gz') -_download_file('rnaseq_samples/HAP1_dBet6_6h_1/HAP1_dBet6_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_dBet6_6h_1.small_R1.fastq.gz') -_download_file('rnaseq_samples/HAP1_dBet6_6h_2/HAP1_dBet6_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_dBet6_6h_2.small_R1.fastq.gz') -_download_file('rnaseq_samples/K562_DMSO_1h_1/K562_DMSO_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_DMSO_1h_1.small_R1.fastq.gz') -_download_file('rnaseq_samples/K562_DMSO_1h_2/K562_DMSO_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_DMSO_1h_2.small_R1.fastq.gz') -_download_file('rnaseq_samples/K562_DMSO_6h_1/K562_DMSO_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_DMSO_6h_1.small_R1.fastq.gz') -_download_file('rnaseq_samples/K562_DMSO_6h_2/K562_DMSO_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_DMSO_6h_2.small_R1.fastq.gz') -_download_file('rnaseq_samples/K562_JQ1_1h_1/K562_JQ1_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_JQ1_1h_1.small_R1.fastq.gz') -_download_file('rnaseq_samples/K562_JQ1_1h_2/K562_JQ1_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_JQ1_1h_2.small_R1.fastq.gz') -_download_file('rnaseq_samples/K562_JQ1_6h_1/K562_JQ1_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_JQ1_6h_1.small_R1.fastq.gz') -_download_file('rnaseq_samples/K562_JQ1_6h_2/K562_JQ1_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_JQ1_6h_2.small_R1.fastq.gz') -_download_file('rnaseq_samples/K562_dBet6_1h_2/K562_dBet6_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_dBet6_1h_2.small_R1.fastq.gz') -_download_file('rnaseq_samples/K562_dBet6_6h_1/K562_dBet6_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_dBet6_6h_1.small_R1.fastq.gz') -_download_file('rnaseq_samples/K562_dBet6_6h_2/K562_dBet6_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_dBet6_6h_2.small_R1.fastq.gz') -_download_file('rnaseq_samples/outlier_HAP1_dBet6_1h_1/outlier_HAP1_dBet6_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/outlier_HAP1_dBet6_1h_1.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/A549_DMSO_1h_1/A549_DMSO_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_DMSO_1h_1.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/A549_DMSO_1h_2/A549_DMSO_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_DMSO_1h_2.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/A549_DMSO_6h_1/A549_DMSO_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_DMSO_6h_1.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/A549_DMSO_6h_2/A549_DMSO_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_DMSO_6h_2.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/A549_JQ1_1h_1/A549_JQ1_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_JQ1_1h_1.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/A549_JQ1_1h_2/A549_JQ1_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_JQ1_1h_2.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/A549_JQ1_6h_1/A549_JQ1_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_JQ1_6h_1.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/A549_JQ1_6h_2/A549_JQ1_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_JQ1_6h_2.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/A549_dBet6_1h_1/A549_dBet6_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_dBet6_1h_1.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/A549_dBet6_1h_2/A549_dBet6_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_dBet6_1h_2.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/A549_dBet6_6h_1/A549_dBet6_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_dBet6_6h_1.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/A549_dBet6_6h_2/A549_dBet6_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_dBet6_6h_2.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/HAP1_DMSO_1h_1/HAP1_DMSO_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_DMSO_1h_1.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/HAP1_DMSO_1h_2/HAP1_DMSO_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_DMSO_1h_2.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/HAP1_DMSO_6h_1/HAP1_DMSO_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_DMSO_6h_1.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/HAP1_DMSO_6h_2/HAP1_DMSO_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_DMSO_6h_2.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/HAP1_JQ1_1h_1/HAP1_JQ1_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_JQ1_1h_1.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/HAP1_JQ1_1h_2/HAP1_JQ1_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_JQ1_1h_2.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/HAP1_JQ1_6h_1/HAP1_JQ1_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_JQ1_6h_1.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/HAP1_JQ1_6h_2/HAP1_JQ1_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_JQ1_6h_2.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/HAP1_dBet6_1h_1/HAP1_dBet6_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_dBet6_1h_1.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/HAP1_dBet6_1h_2/HAP1_dBet6_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_dBet6_1h_2.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/HAP1_dBet6_6h_1/HAP1_dBet6_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_dBet6_6h_1.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/HAP1_dBet6_6h_2/HAP1_dBet6_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_dBet6_6h_2.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/K562_DMSO_1h_1/K562_DMSO_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_DMSO_1h_1.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/K562_DMSO_1h_2/K562_DMSO_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_DMSO_1h_2.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/K562_DMSO_6h_1/K562_DMSO_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_DMSO_6h_1.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/K562_DMSO_6h_2/K562_DMSO_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_DMSO_6h_2.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/K562_JQ1_1h_1/K562_JQ1_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_JQ1_1h_1.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/K562_JQ1_1h_2/K562_JQ1_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_JQ1_1h_2.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/K562_JQ1_6h_1/K562_JQ1_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_JQ1_6h_1.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/K562_JQ1_6h_2/K562_JQ1_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_JQ1_6h_2.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/K562_dBet6_1h_2/K562_dBet6_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_dBet6_1h_2.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/K562_dBet6_6h_1/K562_dBet6_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_dBet6_6h_1.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/K562_dBet6_6h_2/K562_dBet6_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_dBet6_6h_2.small_R1.fastq.gz') +_download_file(URLH, 'rnaseq_samples/outlier_HAP1_dBet6_1h_1/outlier_HAP1_dBet6_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/outlier_HAP1_dBet6_1h_1.small_R1.fastq.gz') -_download_file('chipseq_samples/BRD4_DMSO_1/BRD4_DMSO_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/BRD4_DMSO_1.small_R1.fastq.gz') -_download_file('chipseq_samples/BRD4_DMSO_2/BRD4_DMSO_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/BRD4_DMSO_2.small_R1.fastq.gz') -_download_file('chipseq_samples/BRD4_dBET6_1/BRD4_dBET6_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/BRD4_dBET6_1.small_R1.fastq.gz') -_download_file('chipseq_samples/BRD4_dBET6_2/BRD4_dBET6_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/BRD4_dBET6_2.small_R1.fastq.gz') -_download_file('chipseq_samples/MTHFD1_DMSO_1/MTHFD1_DMSO_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/MTHFD1_DMSO_1.small_R1.fastq.gz') -_download_file('chipseq_samples/MTHFD1_DMSO_2/MTHFD1_DMSO_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/MTHFD1_DMSO_2.small_R1.fastq.gz') -_download_file('chipseq_samples/MTHFD1_dBET6_1/MTHFD1_dBET6_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/MTHFD1_dBET6_1.small_R1.fastq.gz') -_download_file('chipseq_samples/MTHFD1_dBET6_2/MTHFD1_dBET6_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/MTHFD1_dBET6_2.small_R1.fastq.gz') -_download_file('chipseq_samples/input_DMSO_1/input_DMSO_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/input_DMSO_1.small_R1.fastq.gz') -_download_file('chipseq_samples/input_DMSO_2/input_DMSO_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/input_DMSO_2.small_R1.fastq.gz') -_download_file('chipseq_samples/input_dBET6_1/input_dBET6_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/input_dBET6_1.small_R1.fastq.gz') -_download_file('chipseq_samples/input_dBET6_2/input_dBET6_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/input_dBET6_2.small_R1.fastq.gz') -_download_file('chipseq_samples/mockIgG_DMSO_1/mockIgG_DMSO_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/mockIgG_DMSO_1.small_R1.fastq.gz') -_download_file('chipseq_samples/mockIgG_DMSO_2/mockIgG_DMSO_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/mockIgG_DMSO_2.small_R1.fastq.gz') -_download_file('chipseq_samples/mockIgG_dBET6_1/mockIgG_dBET6_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/mockIgG_dBET6_1.small_R1.fastq.gz') -_download_file('chipseq_samples/mockIgG_dBET6_2/mockIgG_dBET6_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/mockIgG_dBET6_2.small_R1.fastq.gz') +_download_file(URLH, 'chipseq_samples/BRD4_DMSO_1/BRD4_DMSO_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/BRD4_DMSO_1.small_R1.fastq.gz') +_download_file(URLH, 'chipseq_samples/BRD4_DMSO_2/BRD4_DMSO_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/BRD4_DMSO_2.small_R1.fastq.gz') +_download_file(URLH, 'chipseq_samples/BRD4_dBET6_1/BRD4_dBET6_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/BRD4_dBET6_1.small_R1.fastq.gz') +_download_file(URLH, 'chipseq_samples/BRD4_dBET6_2/BRD4_dBET6_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/BRD4_dBET6_2.small_R1.fastq.gz') +_download_file(URLH, 'chipseq_samples/MTHFD1_DMSO_1/MTHFD1_DMSO_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/MTHFD1_DMSO_1.small_R1.fastq.gz') +_download_file(URLH, 'chipseq_samples/MTHFD1_DMSO_2/MTHFD1_DMSO_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/MTHFD1_DMSO_2.small_R1.fastq.gz') +_download_file(URLH, 'chipseq_samples/MTHFD1_dBET6_1/MTHFD1_dBET6_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/MTHFD1_dBET6_1.small_R1.fastq.gz') +_download_file(URLH, 'chipseq_samples/MTHFD1_dBET6_2/MTHFD1_dBET6_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/MTHFD1_dBET6_2.small_R1.fastq.gz') +_download_file(URLH, 'chipseq_samples/input_DMSO_1/input_DMSO_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/input_DMSO_1.small_R1.fastq.gz') +_download_file(URLH, 'chipseq_samples/input_DMSO_2/input_DMSO_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/input_DMSO_2.small_R1.fastq.gz') +_download_file(URLH, 'chipseq_samples/input_dBET6_1/input_dBET6_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/input_dBET6_1.small_R1.fastq.gz') +_download_file(URLH, 'chipseq_samples/input_dBET6_2/input_dBET6_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/input_dBET6_2.small_R1.fastq.gz') +_download_file(URLH, 'chipseq_samples/mockIgG_DMSO_1/mockIgG_DMSO_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/mockIgG_DMSO_1.small_R1.fastq.gz') +_download_file(URLH, 'chipseq_samples/mockIgG_DMSO_2/mockIgG_DMSO_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/mockIgG_DMSO_2.small_R1.fastq.gz') +_download_file(URLH, 'chipseq_samples/mockIgG_dBET6_1/mockIgG_dBET6_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/mockIgG_dBET6_1.small_R1.fastq.gz') +_download_file(URLH, 'chipseq_samples/mockIgG_dBET6_2/mockIgG_dBET6_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/mockIgG_dBET6_2.small_R1.fastq.gz') + + +_download_file(URL, 'rnaseq_samples/sample1/sample1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample1PE_1.fq.gz') +_download_file(URL, 'rnaseq_samples/sample1/sample1.small_R2.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample1PE_2.fq.gz') From 6d464301bc31705304aa96f1f203b033ed15c1f5 Mon Sep 17 00:00:00 2001 From: Caroline Esnault Date: Thu, 24 Sep 2020 20:19:01 -0400 Subject: [PATCH 08/19] add star --- include/reference_configs/Test_Homo_sapiens.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/include/reference_configs/Test_Homo_sapiens.yaml b/include/reference_configs/Test_Homo_sapiens.yaml index f8392fb6b..e66ecece7 100644 --- a/include/reference_configs/Test_Homo_sapiens.yaml +++ b/include/reference_configs/Test_Homo_sapiens.yaml @@ -15,6 +15,7 @@ references: indexes: - 'hisat2' - 'bowtie2' + - 'star' annotation: url: 'https://github.com/lcdb/lcdb-test-data-human/raw/master/annotation/hg38.small.gtf' From 714b2cd35da29a622983a83d0d94793f8607978e Mon Sep 17 00:00:00 2001 From: Caroline Esnault Date: Thu, 24 Sep 2020 20:19:29 -0400 Subject: [PATCH 09/19] update requirements for diffbind --- requirements-r.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/requirements-r.txt b/requirements-r.txt index 7343578f5..b50579635 100644 --- a/requirements-r.txt +++ b/requirements-r.txt @@ -1,9 +1,11 @@ bioconductor-annotationhub bioconductor-apeglm bioconductor-biocparallel +bioconductor-chipseeker bioconductor-clusterprofiler bioconductor-degreport bioconductor-deseq2 +bioconductor-diffbind bioconductor-dupradar bioconductor-genomeinfodbdata bioconductor-genomicfeatures @@ -24,3 +26,4 @@ r-rmarkdown r-spp r-sparsem r-upsetr +r-ggupset From 03174bbc50d38872dd5e40d29a38ec06519edbb3 Mon Sep 17 00:00:00 2001 From: Caroline Esnault Date: Thu, 24 Sep 2020 20:21:42 -0400 Subject: [PATCH 10/19] update test configs with large datasets --- test/test_configs/chipseq_one_run.tsv | 4 +-- test/test_configs/star_override_1pass.yaml | 2 +- test/test_configs/star_override_2pass.yaml | 2 +- .../test_configs/test_chipseq_regression.yaml | 26 +++++++++---------- test/test_configs/two_samples.tsv | 4 +-- workflows/references/config/config.yaml | 1 + 6 files changed, 19 insertions(+), 20 deletions(-) diff --git a/test/test_configs/chipseq_one_run.tsv b/test/test_configs/chipseq_one_run.tsv index ec235f9f1..8f780bf73 100644 --- a/test/test_configs/chipseq_one_run.tsv +++ b/test/test_configs/chipseq_one_run.tsv @@ -1,4 +1,4 @@ # Samplenames with the same "label" will be considered technical replicates samplename antibody biological_material replicate label orig_filename -input_1 input wingdisc-1 1 input-wingdisc-1 data/example_data/chipseq_input1.fq.gz -ip_1 gaf wingdisc-1 1 gaf-wingdisc-1 data/example_data/chipseq_ip1.fq.gz +BRD4_dBET6_1 dBET6 HAP1_B1 1 BRD4_dBET6_1 data/example_data/BRD4_dBET6_1.small_R1.fastq.gz +mockIgG_dBET6_1 input HAP1_B1 1 mockIgG_dBET6_1 data/example_data/mockIgG_dBET6_1.small_R1.fastq.gz diff --git a/test/test_configs/star_override_1pass.yaml b/test/test_configs/star_override_1pass.yaml index cba6ff764..c8537b568 100644 --- a/test/test_configs/star_override_1pass.yaml +++ b/test/test_configs/star_override_1pass.yaml @@ -1,6 +1,6 @@ aligner: index: star - tag: test + tag: small-gencode-v28 merged_bigwigs: control_pos: diff --git a/test/test_configs/star_override_2pass.yaml b/test/test_configs/star_override_2pass.yaml index b091eba3d..f4e7b2c56 100644 --- a/test/test_configs/star_override_2pass.yaml +++ b/test/test_configs/star_override_2pass.yaml @@ -1,6 +1,6 @@ aligner: index: 'star-twopass' - tag: test + tag: small-gencode-v28 merged_bigwigs: control_pos: diff --git a/test/test_configs/test_chipseq_regression.yaml b/test/test_configs/test_chipseq_regression.yaml index 6b91f5dbd..4e6ec10c7 100644 --- a/test/test_configs/test_chipseq_regression.yaml +++ b/test/test_configs/test_chipseq_regression.yaml @@ -1,40 +1,38 @@ #sampletable: 'config/sampletable.tsv' -organism: 'dmel' +organism: 'human' references_dir: 'references_data' peaks_dir: 'data/chipseq_peaks' chipseq: peak_calling: - - label: gaf-wingdisc-1 + - label: BRD4_dBET6_1 algorithm: macs2 ip: - - gaf-wingdisc-1 + - BRD4_dBET6_1 control: - - input-wingdisc-1 - effective_genome_count: 7e7 - extra: '--nomodel --extsize 147' + - mockIgG_dBET6_1 + extra: '--nomodel -p 0.05 --cutoff-analysis' fastq_screen: - label: rRNA - organism: dmel - tag: test + organism: human + tag: small-gencode-v28 - label: PhiX organism: phix tag: default - - label: Fly - organism: dmel - tag: test + - label: Human + organism: human + tag: small-gencode-v28 aligner: index: 'bowtie2' - tag: 'test' + tag: 'small-gencode-v28' merged_bigwigs: {} include_references: - - '../../include/reference_configs/test.yaml' - '../../include/reference_configs/PhiX.yaml' - - '../../include/reference_configs/Drosophila_melanogaster.yaml' + - '../../include/reference_configs/Test_Homo_sapiens.yaml' diff --git a/test/test_configs/two_samples.tsv b/test/test_configs/two_samples.tsv index 30886104f..76ecc25e1 100644 --- a/test/test_configs/two_samples.tsv +++ b/test/test_configs/two_samples.tsv @@ -1,3 +1,3 @@ samplename group layout orig_filename -sample1 control SE data/example_data/rnaseq_sample1PE_1.fq.gz -sample2 control SE data/example_data/rnaseq_sample2.fq.gz +A549_dBet6_1h_1 dBet6_1h SE data/example_data/A549_dBet6_1h_1.small_R1.fastq.gz +A549_dBet6_1h_2 dBet6_1h SE data/example_data/A549_dBet6_1h_2.small_R1.fastq.gz diff --git a/workflows/references/config/config.yaml b/workflows/references/config/config.yaml index 49618dcd0..eeea065bd 100644 --- a/workflows/references/config/config.yaml +++ b/workflows/references/config/config.yaml @@ -4,3 +4,4 @@ references_dir: 'references_dir' # include/reference_configs, for inspiration for more species. include_references: - '../../include/reference_configs/test.yaml' + - '../../include/reference_configs/Test_Homo_sapiens.yaml' From 25e382f774920a03fbbda058b605e17e5c2cf597 Mon Sep 17 00:00:00 2001 From: Caroline Esnault Date: Thu, 24 Sep 2020 20:23:35 -0400 Subject: [PATCH 11/19] add merged bigwigs for testing --- workflows/chipseq/config/config.yaml | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/workflows/chipseq/config/config.yaml b/workflows/chipseq/config/config.yaml index 9c159be56..e28237624 100644 --- a/workflows/chipseq/config/config.yaml +++ b/workflows/chipseq/config/config.yaml @@ -125,15 +125,10 @@ fastq_screen: organism: human tag: small-gencode-v28 -#merged_bigwigs: -# input-wingdisc: -# - input-wingdisc-1 -# - input-wingdisc-2 -# gaf-wingdisc: -# - gaf-wingdisc-1 -# - gaf-wingdisc-2 -# gaf-embryo: -# - gaf-embryo-1 +merged_bigwigs: + BRD4_dBET6_1: + - BRD4_dBET6_1 + - BRD4_dBET6_2 aligner: index: 'bowtie2' From 0073954286d09e757b115702a5622de6df473058 Mon Sep 17 00:00:00 2001 From: Caroline Esnault Date: Thu, 24 Sep 2020 20:41:13 -0400 Subject: [PATCH 12/19] add diffbind in tests --- .circleci/config.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index e13699c50..777713c99 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -134,6 +134,8 @@ variables: source activate lcdb-wf-test ./run_test.sh --use-conda -j2 -k -p -r python chipseq_trackhub.py config/config.yaml config/hub_config.yaml + source activate lcdb-wf-test-r + Rscript -e "rmarkdown::render('downstream/diffbind.Rmd')" # -------------------------------------------------------------------------- # Previous versions had an error where chipseq peaks needed to be defined for # every caller. This does a (relatively) quick test to only run a single From b434ee556767c148a7d4c6afbfd7a0729fbdeec0 Mon Sep 17 00:00:00 2001 From: Caroline Esnault Date: Thu, 24 Sep 2020 20:56:50 -0400 Subject: [PATCH 13/19] simplify contrasts --- workflows/rnaseq/downstream/rnaseq.Rmd | 92 -------------------------- 1 file changed, 92 deletions(-) diff --git a/workflows/rnaseq/downstream/rnaseq.Rmd b/workflows/rnaseq/downstream/rnaseq.Rmd index 90a5569d5..3dcb8eb0c 100644 --- a/workflows/rnaseq/downstream/rnaseq.Rmd +++ b/workflows/rnaseq/downstream/rnaseq.Rmd @@ -280,37 +280,15 @@ lst <- list( design=~treatment, args=list(subset.counts=TRUE) ), - jq=list( - sampletable=colData %>% filter(treatment %in% c('DMSO', 'JQ1'), !samplename %in% outliers), - design=~group + 0, - args=list(subset.counts=TRUE) - ), jq_cell_line=list( sampletable=colData %>% filter(treatment %in% c('DMSO', 'JQ1'), !samplename %in% outliers), design=~cell_line_group + 0, args=list(subset.counts=TRUE) ), - jq_interact_term=list( - sampletable=colData %>% filter(treatment %in% c('DMSO', 'JQ1'), !samplename %in% outliers), - design=~treatment + cell_Line + timepoint + treatment:timepoint, - args=list(subset.counts=TRUE) - ), - dbet=list( - sampletable=colData %>% filter(treatment %in% c('DMSO', 'dBet6'), !samplename %in% outliers), - design=~group + 0, - args=list(subset.counts=TRUE) - ), dbet_cell_line=list( sampletable=colData %>% filter(treatment %in% c('DMSO', 'dBet6'), !samplename %in% outliers), design=~cell_line_group + 0, args=list(subset.counts=TRUE) - ), - dbet_interact_term=list( - sampletable=colData %>% - filter(treatment %in% c('DMSO', 'dBet6'), !samplename %in% outliers) %>% - mutate(treatment=factor(treatment, levels=c('DMSO', 'dBet6'))), - design=~treatment + cell_Line + timepoint + treatment:timepoint, - args=list(subset.counts=TRUE) ) ) @@ -337,12 +315,6 @@ res.list[['main_jq']] <- list( ) # JQ1 vs. DMSO at 1h -res.list[['jq1h']] <- list( - res=results(dds.list[['jq']], contrast=c(-1,0,1,0)), - dds='jq', - label='JQ1 vs. DMSO 1h (LFC>0); dds jq; cts group' -) - res.list[['jq1h-batch']] <- list( res=results(dds.list[['jq_cell_line']], contrast=c(-1,0,1,0,-1,0,1,0,-1,0,1,0)), dds='jq_cell_line', @@ -350,12 +322,6 @@ res.list[['jq1h-batch']] <- list( ) # JQ1 vs. DMSO at 6h -res.list[['jq6h']] <- list( - res=results(dds.list[['jq']], contrast=c(0,-1,0,1)), - dds='jq', - label='JQ1 vs. DMSO 6h (LFC>0); dds jq; cts group' -) - res.list[['jq6h-batch']] <- list( res=results(dds.list[['jq_cell_line']], contrast=c(0,-1,0,1,0,-1,0,1,0,-1,0,1)), dds='jq_cell_line', @@ -363,35 +329,12 @@ res.list[['jq6h-batch']] <- list( ) # JQ1 vs. DMSO 6h vs. 1h -res.list[['jq6h.vs.1h']] <- list( - res=results(dds.list[['jq']], contrast=c(1,-1,-1,1)), - dds='jq', - label='JQ1 vs. DMSO 6h vs. 1h (LFC>0); dds jq; cts group' -) - res.list[['jq6h.vs.1h-batch']] <- list( res=results(dds.list[['jq_cell_line']], contrast=c(1,-1,-1,1,1,-1,-1,1,1,-1,-1,1)), dds='jq_cell_line', label='JQ1 vs. DMSO 6h vs. 1h (LFC>0); dds jq_cell_line; cts cell_Line_group; batch effect' ) -res.list[['jq6h.vs.1h-interactdesign']] <- list( - res=results(dds.list[['jq_interact_term']]), - dds='jq_cell_line', - label='JQ1 vs. DMSO 6h vs. 1h (LFC>0); dds jq_interact_term; cts cell_Line_group; batch effect' -) - -res.list[['jq_lfc2']] <- list( - res=lfcShrink( - dds.list[['jq_interact_term']], - coef='treatmentJQ1.timepoint6h', - #lfcThreshold=2, # no padj if lfcThreshold - type='apeglm' - ), - dds='main', - label='JQ1 vs. DMSO 6h vs. 1h (LFC>2); dds jq_interact_term; cts treat+cell_Line+timepoint+treat:timepoint; batch effect' -) - # dBet6 --------------------------- res.list[['main_dbet']] <- list( @@ -401,12 +344,6 @@ res.list[['main_dbet']] <- list( ) # dBET6 vs. DMSO at 1h -res.list[['dbet1h']] <- list( - res=results(dds.list[['dbet']], contrast=c(-1,0,1,0)), - dds='dbet', - label='dBet6 vs. DMSO 1h (LFC>0); dds dbet; cts group' -) - res.list[['dbet1h-batch']] <- list( res=results(dds.list[['dbet_cell_line']], contrast=c(-1,0,1,0,-1,0,1,0,-1,0,1,0)), dds='dbet_cell_line', @@ -414,12 +351,6 @@ res.list[['dbet1h-batch']] <- list( ) # dBet6 vs. DMSO at 6h -res.list[['dbet6h']] <- list( - res=results(dds.list[['dbet']], contrast=c(0,-1,0,1)), - dds='dbet', - label='dBet6 vs. DMSO 6h (LFC>0); dds dbet; cts group' -) - res.list[['dbet6h-batch']] <- list( res=results(dds.list[['dbet_cell_line']], contrast=c(0,-1,0,1,0,-1,0,1,0,-1,0,1)), dds='dbet_cell_line', @@ -427,35 +358,12 @@ res.list[['dbet6h-batch']] <- list( ) # dBet6 vs. DMSO 6h vs. 1h -res.list[['dbet6h.vs.1h']] <- list( - res=results(dds.list[['dbet']], contrast=c(1,-1,-1,1)), - dds='dbet', - label='dBet6 vs. DMSO 6h vs. 1h (LFC>0); dds dbet; cts group' -) - res.list[['dbet6h.vs.1h-batch']] <- list( res=results(dds.list[['dbet_cell_line']], contrast=c(1,-1,-1,1,1,-1,-1,1,1,-1,-1,1)), dds='dbet_cell_line', label='dBet6 vs. DMSO 6h vs. 1h (LFC>0); dds dbet_cell_line; cts cell_Line_group; batch effect' ) -res.list[['dbet6h.vs.1h-interactdesign']] <- list( - res=results(dds.list[['dbet_interact_term']]), - dds='dbet_cell_line', - label='dBet6 vs. DMSO 6h vs. 1h (LFC>0); dds dbet_interact_term; cts cell_Line_group; batch effect' -) - -res.list[['dbet_lfc2']] <- list( - res=lfcShrink( - dds.list[['dbet_interact_term']], - coef='treatmentdBet6.timepoint6h', - #lfcThreshold=2, - type='apeglm' - ), - dds='main', - label='dBet6 vs. DMSO 6h vs. 1h (LFC>2); dds dbet_interact_term; cts treat+cell_Line+timepoint+treat:timepoint; batch effect' -) - ``` From 622d98d3e613890956056ecb6bd19b6d39074268 Mon Sep 17 00:00:00 2001 From: Caroline Esnault Date: Thu, 24 Sep 2020 21:37:11 -0400 Subject: [PATCH 14/19] use test reference for rnaseq_pe --- test/test_configs/override_PE.yaml | 58 ++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 test/test_configs/override_PE.yaml diff --git a/test/test_configs/override_PE.yaml b/test/test_configs/override_PE.yaml new file mode 100644 index 000000000..44f7bb548 --- /dev/null +++ b/test/test_configs/override_PE.yaml @@ -0,0 +1,58 @@ +# Due to the way Snakemake recursively merges config items, we need to +# recursively reset this dictonary to override the default one in order to +# allow arbitrary other sample names. +# +# Use it like this +# +# snakemake --configfile ../../test/override.yaml --config sampletable=/path/to/tsv + + +merged_bigwigs: + control_pos: + pos: [] + treatment_all: + pos: [] + neg: [] + + +patterns: 'config/rnaseq_patterns.yaml' + +# Which key in the `references` dict below to use +organism: 'dmel' + +# If not specified here, use the environment variable REFERENCES_DIR. +references_dir: 'references_data' + +aligner: + index: 'hisat2' + tag: 'test' + +rrna: + index: 'bowtie2' + tag: 'rRNA' + +gtf: + tag: "test" + +salmon: + tag: "test" + +fastq_screen: + - label: rRNA + organism: dmel + tag: test + - label: PhiX + organism: phix + tag: default + - label: Fly + organism: dmel + tag: test + + +# See the reference config files in the top level of the repo, +# include/reference_configs, for inspiration for more species. + +include_references: + - '../../include/reference_configs/test.yaml' + - '../../include/reference_configs/PhiX.yaml' + - '../../include/reference_configs/Drosophila_melanogaster.yaml' From 73a98da0583c695e0950727cc4d918c72273b39a Mon Sep 17 00:00:00 2001 From: Caroline Esnault Date: Thu, 24 Sep 2020 21:37:44 -0400 Subject: [PATCH 15/19] update for human large dataset --- .circleci/config.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 777713c99..51b70c9af 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -257,7 +257,7 @@ variables: # test PE reads ./run_test.sh -j 2 --use-conda -k -p -r --until multiqc \ - --configfile $ORIG/test/test_configs/override.yaml \ + --configfile $ORIG/test/test_configs/override_PE.yaml \ --config sampletable=$ORIG/test/test_configs/test_pe_sampletable.tsv @@ -354,6 +354,7 @@ jobs: rnaseq: <<: *defaults + resource_class: large steps: - checkout - *restore_cache From cd2bb06a4058303a2fc21a91bd96b7fca6506f0f Mon Sep 17 00:00:00 2001 From: Caroline Esnault Date: Fri, 25 Sep 2020 05:54:36 -0400 Subject: [PATCH 16/19] adjust ressources --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 51b70c9af..5a9db7d20 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -354,7 +354,7 @@ jobs: rnaseq: <<: *defaults - resource_class: large + resource_class: medium+ steps: - checkout - *restore_cache From 71a1e407af4b521ec45914f772dc25f96e80b299 Mon Sep 17 00:00:00 2001 From: Caroline Esnault Date: Fri, 25 Sep 2020 06:51:04 -0400 Subject: [PATCH 17/19] scale back ressources --- .circleci/config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 5a9db7d20..a2f3cc5e8 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -354,7 +354,6 @@ jobs: rnaseq: <<: *defaults - resource_class: medium+ steps: - checkout - *restore_cache From 1106a8224c19c66e685f7cea835dfd8c87981a52 Mon Sep 17 00:00:00 2001 From: Caroline Esnault Date: Fri, 25 Sep 2020 06:51:35 -0400 Subject: [PATCH 18/19] remove JQ1 samples to preserve RAM --- workflows/rnaseq/downstream/rnaseq.Rmd | 57 ++++++++++++++------------ 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/workflows/rnaseq/downstream/rnaseq.Rmd b/workflows/rnaseq/downstream/rnaseq.Rmd index 3dcb8eb0c..16a32fd0a 100644 --- a/workflows/rnaseq/downstream/rnaseq.Rmd +++ b/workflows/rnaseq/downstream/rnaseq.Rmd @@ -87,6 +87,10 @@ for (col in factor.columns){ colData$group <- relevel(colData$group, ref='DMSO_1h') rownames(colData) <- colData[,1] + +# remove JQ1 samples +colData <- colData %>% + filter(treatment != 'JQ1') ``` ## Experiment overview @@ -122,7 +126,8 @@ vsd.txi <- varianceStabilizingTransformation(dds.txi, blind=TRUE) dds <- lcdbwf::DESeqDataSetFromCombinedFeatureCounts( '../data/rnaseq_aggregation/featurecounts.txt', sampletable=colData, - design=~group) + design=~group, + subset.counts=TRUE) # NOTE: collapse technical replicates ---------------------------------------- # If collapsing technical replicates, do so here @@ -280,11 +285,11 @@ lst <- list( design=~treatment, args=list(subset.counts=TRUE) ), - jq_cell_line=list( - sampletable=colData %>% filter(treatment %in% c('DMSO', 'JQ1'), !samplename %in% outliers), - design=~cell_line_group + 0, - args=list(subset.counts=TRUE) - ), + #jq_cell_line=list( + # sampletable=colData %>% filter(treatment %in% c('DMSO', 'JQ1'), !samplename %in% outliers), + # design=~cell_line_group + 0, + # args=list(subset.counts=TRUE) + #), dbet_cell_line=list( sampletable=colData %>% filter(treatment %in% c('DMSO', 'dBet6'), !samplename %in% outliers), design=~cell_line_group + 0, @@ -308,32 +313,32 @@ res.list <- list() # IMPORTANT: Control must always be last # including only treatment in the contrast -res.list[['main_jq']] <- list( - res=results(dds.list[['main']], contrast=c('treatment', 'JQ1', 'DMSO')), - dds='main', - label='JQ1 vs. DMSO (LFC > 0); dds main; cts treatment' -) +#res.list[['main_jq']] <- list( +# res=results(dds.list[['main']], contrast=c('treatment', 'JQ1', 'DMSO')), +# dds='main', +# label='JQ1 vs. DMSO (LFC > 0); dds main; cts treatment' +#) # JQ1 vs. DMSO at 1h -res.list[['jq1h-batch']] <- list( - res=results(dds.list[['jq_cell_line']], contrast=c(-1,0,1,0,-1,0,1,0,-1,0,1,0)), - dds='jq_cell_line', - label='JQ1 vs. DMSO 1h (LFC>0); dds jq_cell_line; cts cell_Line_group; batch effect' -) +#res.list[['jq1h-batch']] <- list( +# res=results(dds.list[['jq_cell_line']], contrast=c(-1,0,1,0,-1,0,1,0,-1,0,1,0)), +# dds='jq_cell_line', +# label='JQ1 vs. DMSO 1h (LFC>0); dds jq_cell_line; cts cell_Line_group; batch effect' +#) # JQ1 vs. DMSO at 6h -res.list[['jq6h-batch']] <- list( - res=results(dds.list[['jq_cell_line']], contrast=c(0,-1,0,1,0,-1,0,1,0,-1,0,1)), - dds='jq_cell_line', - label='JQ1 vs. DMSO 6h (LFC>0); dds jq_cell_line; cts cell_Line_group; batch effect' -) +#res.list[['jq6h-batch']] <- list( +# res=results(dds.list[['jq_cell_line']], contrast=c(0,-1,0,1,0,-1,0,1,0,-1,0,1)), +# dds='jq_cell_line', +# label='JQ1 vs. DMSO 6h (LFC>0); dds jq_cell_line; cts cell_Line_group; batch effect' +#) # JQ1 vs. DMSO 6h vs. 1h -res.list[['jq6h.vs.1h-batch']] <- list( - res=results(dds.list[['jq_cell_line']], contrast=c(1,-1,-1,1,1,-1,-1,1,1,-1,-1,1)), - dds='jq_cell_line', - label='JQ1 vs. DMSO 6h vs. 1h (LFC>0); dds jq_cell_line; cts cell_Line_group; batch effect' -) +#res.list[['jq6h.vs.1h-batch']] <- list( +# res=results(dds.list[['jq_cell_line']], contrast=c(1,-1,-1,1,1,-1,-1,1,1,-1,-1,1)), +# dds='jq_cell_line', +# label='JQ1 vs. DMSO 6h vs. 1h (LFC>0); dds jq_cell_line; cts cell_Line_group; batch effect' +#) # dBet6 --------------------------- From 660e963a53ab096bf8d8f48de2d86b322d77a6a0 Mon Sep 17 00:00:00 2001 From: daler Date: Mon, 7 Dec 2020 09:12:02 -0500 Subject: [PATCH 19/19] fix config syntax --- workflows/chipseq/config/config.yaml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/workflows/chipseq/config/config.yaml b/workflows/chipseq/config/config.yaml index d0a61c847..c625ecb7f 100644 --- a/workflows/chipseq/config/config.yaml +++ b/workflows/chipseq/config/config.yaml @@ -124,8 +124,6 @@ fastq_screen: - label: Human organism: human tag: small-gencode-v28 - organism: dmel - tag: test - label: Fly organism: dmel tag: test @@ -140,14 +138,7 @@ aligner: tag: 'small-gencode-v28' include_references: -<<<<<<< HEAD - '../../include/reference_configs/PhiX.yaml' - '../../include/reference_configs/Test_Homo_sapiens.yaml' -||||||| 39fd02a - - '../../include/reference_configs/PhiX.yaml' - - '../../include/reference_configs/Drosophila_melanogaster.yaml' - - '../../include/reference_configs/test.yaml' -======= - '../../include/reference_configs/Drosophila_melanogaster.yaml' - '../../include/reference_configs/test.yaml' ->>>>>>> master