From 0c7f847984c81ed34a3fc372e22325f89e44515e Mon Sep 17 00:00:00 2001
From: Caroline Esnault <caroline.esnault@nih.gov>
Date: Thu, 24 Sep 2020 08:21:03 -0400
Subject: [PATCH 01/19] update downstream rnaseq for large test data

---
 .../downstream/functional-enrichment.Rmd      |   2 +-
 workflows/rnaseq/downstream/rnaseq.Rmd        | 246 +++++++++++++++---
 2 files changed, 213 insertions(+), 35 deletions(-)

diff --git a/workflows/rnaseq/downstream/functional-enrichment.Rmd b/workflows/rnaseq/downstream/functional-enrichment.Rmd
index 1b8469767..ab312074b 100644
--- a/workflows/rnaseq/downstream/functional-enrichment.Rmd
+++ b/workflows/rnaseq/downstream/functional-enrichment.Rmd
@@ -27,7 +27,7 @@ id.convert <- TRUE
 ```{r bitrgenes, results='hide'}
 # Docs: https://lcdb.github.io/lcdb-wf/rnaseq-rmd.html#bitrgenes
 # NOTE: Gene IDs are provided in what format?----------------------------------
-from.type <- "FLYBASE"
+from.type <- "ENSEMBL"
 
 deg.list <- nested.lapply(sel.list, rownames)
 genes.df <- nested.lapply(deg.list, bitr, fromType=from.type, toType=types, OrgDb=orgdb)
diff --git a/workflows/rnaseq/downstream/rnaseq.Rmd b/workflows/rnaseq/downstream/rnaseq.Rmd
index 1d3fae7b9..90a5569d5 100644
--- a/workflows/rnaseq/downstream/rnaseq.Rmd
+++ b/workflows/rnaseq/downstream/rnaseq.Rmd
@@ -59,7 +59,7 @@ devtools::load_all('../../../lib/lcdbwf')
 
 ```{r annotationhub_setup}
 # Docs: https://lcdb.github.io/lcdb-wf/rnaseq-rmd.html#annotationhub_setup
-annotation_genus_species <- 'Drosophila melanogaster'
+annotation_genus_species <- 'Homo sapiens'
 annotation_key_override <- NA
 hub.cache <- '../../../include/AnnotationHubCache'
 orgdb <- get.orgdb(
@@ -85,7 +85,7 @@ for (col in factor.columns){
     colData[[col]] <- as.factor(colData[[col]])
 }
 
-colData$group <- relevel(colData$group, ref='control')
+colData$group <- relevel(colData$group, ref='DMSO_1h')
 rownames(colData) <- colData[,1]
 ```
 
@@ -126,13 +126,28 @@ dds <- lcdbwf::DESeqDataSetFromCombinedFeatureCounts(
 
 # NOTE: collapse technical replicates ----------------------------------------
 #   If collapsing technical replicates, do so here
-# dds <-collapseReplicates(dds, dds$biorep)
+dds <-collapseReplicates(dds, dds$biorep)
 
 if (strip.dotted.version){
     rownames(dds) <- sapply(strsplit(rownames(dds), '.', fixed=TRUE), function (x) x[1])
 }
 
 vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
+
+outliers <- c('HAP1_MTX_6h_2', 'outlier_HAP1_dBet6_1h_1')
+dds.no.outlier <- lcdbwf::DESeqDataSetFromCombinedFeatureCounts(
+    '../data/rnaseq_aggregation/featurecounts.txt',
+    sampletable=colData %>% filter(!samplename %in% outliers),
+    design=~group,
+    subset.counts=TRUE)
+dds.no.outlier <-collapseReplicates(dds.no.outlier, dds.no.outlier$biorep)
+vsd.no.outlier <- varianceStabilizingTransformation(dds.no.outlier, blind=TRUE)
+
+# remove batch effect of cell line
+#vsd.batch <- vsd.no.outlier
+#assay(vsd.batch) <- removebatchEffect(assay(vsd.batch),
+#                                      batches=list(colData(vsd.batch)$cell_Line),
+#                                      design=model.matrix(~group, data=colData(vsd.batch)))
 ```
 
 ## Sample clustering and QC
@@ -146,8 +161,15 @@ we expect to see replicates clustering together and separation of treatments.
 ```{r sample_heatmap, cache=TRUE, dependson='dds_initial'}
 # Docs: https://lcdb.github.io/lcdb-wf/rnaseq-rmd.html#sample_heatmap
 lcdbwf::plot.heatmap(vsd, colData, cols.for.grouping=c('group'))
+cat('\n\n### Without outliers\n\n')
+lcdbwf::plot.heatmap(vsd.no.outlier, colData, cols.for.grouping=c('group'))
+cat('\n\n### After removing cell line batch effect\n\n')
+#lcdbwf::plot.heatmap(vsd.batch, colData, cols.for.grouping=c('group'))
+cat('\n\nDISABLED\n\n')
 ```
 
+**Note: HAP1_MTX_6h_2 and outlier_HAP1_dBet6_1h_1 are outliers and will be removed from subsequent analyses.**
+
 ## PCA {.tabset}
 
 Another way of looking at sample clustering is principal components analysis
@@ -161,16 +183,39 @@ explained by each principal component is indicated in the axes label.
 
 groups <- list(
                group=c('group'),
-               layout=c('layout')
+               cell_line_group=c('cell_line_group'),
+               cell_Line=c('cell_Line'),
+               treatment=c('treatment'),
+               timepoint=c('timepoint')
                )
 
 p <- list()
 for(group in groups){
-    p[[group]] <- lcdbwf::plotPCA.ly(vsd, intgroup=group)
+    p[[group]] <- lcdbwf::plotPCA.ly(vsd.no.outlier, intgroup=group)
 }
 
 i <- 1; lcdbwf::mdcat('### ', names(p)[i]); ggplotly(p[[i]])
 i <- 2; lcdbwf::mdcat('### ', names(p)[i]); ggplotly(p[[i]])
+i <- 3; lcdbwf::mdcat('### ', names(p)[i]); ggplotly(p[[i]])
+i <- 4; lcdbwf::mdcat('### ', names(p)[i]); ggplotly(p[[i]])
+i <- 5; lcdbwf::mdcat('### ', names(p)[i]); ggplotly(p[[i]])
+```
+
+## PCA after batch effect removal {.tabset}
+
+DISABLED
+
+```{r pca_batch, results='asis', cache=TRUE, dependson='dds_initial'}
+#p <- list()
+#for(group in groups){
+#    p[[group]] <- lcdbwf::plotPCA.ly(vsd.batch, intgroup=group)
+#}
+#
+#i <- 1; lcdbwf::mdcat('### ', names(p)[i]); ggplotly(p[[i]])
+#i <- 2; lcdbwf::mdcat('### ', names(p)[i]); ggplotly(p[[i]])
+#i <- 3; lcdbwf::mdcat('### ', names(p)[i]); ggplotly(p[[i]])
+#i <- 4; lcdbwf::mdcat('### ', names(p)[i]); ggplotly(p[[i]])
+#i <- 5; lcdbwf::mdcat('### ', names(p)[i]); ggplotly(p[[i]])
 ```
 
 
@@ -222,8 +267,8 @@ ggplotly(p)
 ```{r parallel_config}
 # Docs: https://lcdb.github.io/lcdb-wf/rnaseq-rmd.html#parallel_config
 
-parallel <- FALSE
-# register(MulticoreParam(4))
+parallel <- TRUE
+register(MulticoreParam(4))
 ```
 
 ```{r dds_list, cache=TRUE}
@@ -231,16 +276,46 @@ parallel <- FALSE
 
 lst <- list(
             main=list(
-                sampletable=colData,
-                design=~group),
-            no_rep4=list(
-                sampletable=colData %>% filter(samplename != 'sample4'),
-                design=~group,
+                sampletable=colData %>% filter(!samplename %in% outliers),
+                design=~treatment,
+                args=list(subset.counts=TRUE)
+            ),
+            jq=list(
+                sampletable=colData %>% filter(treatment %in% c('DMSO', 'JQ1'), !samplename %in% outliers),
+                design=~group + 0,
+                args=list(subset.counts=TRUE)
+            ),
+            jq_cell_line=list(
+                sampletable=colData %>% filter(treatment %in% c('DMSO', 'JQ1'), !samplename %in% outliers),
+                design=~cell_line_group + 0,
+                args=list(subset.counts=TRUE)
+            ),
+            jq_interact_term=list(
+                sampletable=colData %>% filter(treatment %in% c('DMSO', 'JQ1'), !samplename %in% outliers),
+                design=~treatment + cell_Line + timepoint + treatment:timepoint,
+                args=list(subset.counts=TRUE)
+            ),
+            dbet=list(
+                sampletable=colData %>% filter(treatment %in% c('DMSO', 'dBet6'), !samplename %in% outliers),
+                design=~group + 0,
+                args=list(subset.counts=TRUE)
+            ),
+            dbet_cell_line=list(
+                sampletable=colData %>% filter(treatment %in% c('DMSO', 'dBet6'), !samplename %in% outliers),
+                design=~cell_line_group + 0,
+                args=list(subset.counts=TRUE)
+            ),
+            dbet_interact_term=list(
+                sampletable=colData %>%
+                    filter(treatment %in% c('DMSO', 'dBet6'), !samplename %in% outliers) %>%
+                    mutate(treatment=factor(treatment, levels=c('DMSO', 'dBet6'))),
+                design=~treatment + cell_Line + timepoint + treatment:timepoint,
                 args=list(subset.counts=TRUE)
             )
 )
+
 dds.list <- map(lst, lcdbwf::make.dds, salmon.files=NULL, remove.version=TRUE,
-                combine.by=NULL, parallel=parallel)
+                combine.by='biorep', parallel=parallel)
 ```
 
 
@@ -249,37 +324,138 @@ dds.list <- map(lst, lcdbwf::make.dds, salmon.files=NULL, remove.version=TRUE,
 
 res.list <- list()
 
-# NOTE: Example contrast #1----------------------------------------------------
+# NOTE: Example contrast #1: JQ1 vs. DMSO ---------------------------------
 #   Using the example data, this compares treatment group to control group.
 #   Change to reflect your experiment.
 # IMPORTANT: Control must always be last
-res.list[['main']] <- list(
-    res=lfcShrink(dds.list[['main']], contrast=c('group', 'treatment', 'control'), type='normal'),
+
+# including only treatment in the contrast
+res.list[['main_jq']] <- list(
+    res=results(dds.list[['main']], contrast=c('treatment', 'JQ1', 'DMSO')),
+    dds='main',
+    label='JQ1 vs. DMSO (LFC > 0); dds main; cts treatment'
+)
+
+# JQ1 vs. DMSO at 1h
+res.list[['jq1h']] <- list(
+    res=results(dds.list[['jq']], contrast=c(-1,0,1,0)),
+    dds='jq',
+    label='JQ1 vs. DMSO 1h (LFC>0); dds jq; cts group'
+)
+
+res.list[['jq1h-batch']] <- list(
+    res=results(dds.list[['jq_cell_line']], contrast=c(-1,0,1,0,-1,0,1,0,-1,0,1,0)),
+    dds='jq_cell_line',
+    label='JQ1 vs. DMSO 1h (LFC>0); dds jq_cell_line; cts cell_Line_group; batch effect'
+)
+
+# JQ1 vs. DMSO at 6h
+res.list[['jq6h']] <- list(
+    res=results(dds.list[['jq']], contrast=c(0,-1,0,1)),
+    dds='jq',
+    label='JQ1 vs. DMSO 6h (LFC>0); dds jq; cts group'
+)
+
+res.list[['jq6h-batch']] <- list(
+    res=results(dds.list[['jq_cell_line']], contrast=c(0,-1,0,1,0,-1,0,1,0,-1,0,1)),
+    dds='jq_cell_line',
+    label='JQ1 vs. DMSO 6h (LFC>0); dds jq_cell_line; cts cell_Line_group; batch effect'
+)
+
+# JQ1 vs. DMSO 6h vs. 1h
+res.list[['jq6h.vs.1h']] <- list(
+    res=results(dds.list[['jq']], contrast=c(1,-1,-1,1)),
+    dds='jq',
+    label='JQ1 vs. DMSO 6h vs. 1h (LFC>0); dds jq; cts group'
+)
+
+res.list[['jq6h.vs.1h-batch']] <- list(
+    res=results(dds.list[['jq_cell_line']], contrast=c(1,-1,-1,1,1,-1,-1,1,1,-1,-1,1)),
+    dds='jq_cell_line',
+    label='JQ1 vs. DMSO 6h vs. 1h (LFC>0); dds jq_cell_line; cts cell_Line_group; batch effect'
+)
+
+res.list[['jq6h.vs.1h-interactdesign']] <- list(
+    res=results(dds.list[['jq_interact_term']]),
+    dds='jq_cell_line',
+    label='JQ1 vs. DMSO 6h vs. 1h (LFC>0); dds jq_interact_term; cts cell_Line_group; batch effect'
+)
+
+res.list[['jq_lfc2']] <- list(
+    res=lfcShrink(
+        dds.list[['jq_interact_term']],
+        coef='treatmentJQ1.timepoint6h',
+        #lfcThreshold=2,    # no padj if lfcThreshold
+        type='apeglm'
+    ),
+    dds='main',
+    label='JQ1 vs. DMSO 6h vs. 1h (LFC>2); dds jq_interact_term; cts treat+cell_Line+timepoint+treat:timepoint; batch effect'
+)
+
+# dBet6 ---------------------------
+
+res.list[['main_dbet']] <- list(
+    res=results(dds.list[['main']], contrast=c('treatment', 'dBet6', 'DMSO')),
     dds='main',
-    label='Using a log2FoldChange threshold of 0'
+    label='dBet6 vs. DMSO (LFC > 0); dds main; cts treatment'
 )
 
+# dBET6 vs. DMSO at 1h
+res.list[['dbet1h']] <- list(
+    res=results(dds.list[['dbet']], contrast=c(-1,0,1,0)),
+    dds='dbet',
+    label='dBet6 vs. DMSO 1h (LFC>0); dds dbet; cts group'
+)
+
+res.list[['dbet1h-batch']] <- list(
+    res=results(dds.list[['dbet_cell_line']], contrast=c(-1,0,1,0,-1,0,1,0,-1,0,1,0)),
+    dds='dbet_cell_line',
+    label='dBet6 vs. DMSO 1h (LFC>0); dds dbet_cell_line; cts cell_Line_group; batch effect'
+)
 
-# NOTE: Example contrast #2----------------------------------------------------
-#    Using the example data, this compares treatment group to control group but
-#    requiring genes to have >4-fold differences (log2(4) = 2). Change to
-#    reflect your experiment.
-#    If using an lfcThreshold, must update elsewhere throughout the code.
-res.lfcthresh.2 <- results(
-    dds.list[['main']],
-    contrast=c('group', 'treatment', 'control'),
-    lfcThreshold=2)
+# dBet6 vs. DMSO at 6h
+res.list[['dbet6h']] <- list(
+    res=results(dds.list[['dbet']], contrast=c(0,-1,0,1)),
+    dds='dbet',
+    label='dBet6 vs. DMSO 6h (LFC>0); dds dbet; cts group'
+)
 
-res.list[['lfc2']] <- list(
+res.list[['dbet6h-batch']] <- list(
+    res=results(dds.list[['dbet_cell_line']], contrast=c(0,-1,0,1,0,-1,0,1,0,-1,0,1)),
+    dds='dbet_cell_line',
+    label='dBet6 vs. DMSO 6h (LFC>0); dds dbet_cell_line; cts cell_Line_group; batch effect'
+)
+
+# dBet6 vs. DMSO 6h vs. 1h
+res.list[['dbet6h.vs.1h']] <- list(
+    res=results(dds.list[['dbet']], contrast=c(1,-1,-1,1)),
+    dds='dbet',
+    label='dBet6 vs. DMSO 6h vs. 1h (LFC>0); dds dbet; cts group'
+)
+
+res.list[['dbet6h.vs.1h-batch']] <- list(
+    res=results(dds.list[['dbet_cell_line']], contrast=c(1,-1,-1,1,1,-1,-1,1,1,-1,-1,1)),
+    dds='dbet_cell_line',
+    label='dBet6 vs. DMSO 6h vs. 1h (LFC>0); dds dbet_cell_line; cts cell_Line_group; batch effect'
+)
+
+res.list[['dbet6h.vs.1h-interactdesign']] <- list(
+    res=results(dds.list[['dbet_interact_term']]),
+    dds='dbet_cell_line',
+    label='dBet6 vs. DMSO 6h vs. 1h (LFC>0); dds dbet_interact_term; cts cell_Line_group; batch effect'
+)
+
+res.list[['dbet_lfc2']] <- list(
     res=lfcShrink(
-        dds.list[['main']],
-        contrast=c('group', 'treatment', 'control'),
-        res=res.lfcthresh.2,
-        type='normal'
+        dds.list[['dbet_interact_term']],
+        coef='treatmentdBet6.timepoint6h',
+        #lfcThreshold=2,
+        type='apeglm'
     ),
     dds='main',
-    label='Using a log2FoldChange threshold of >2'
+    label='dBet6 vs. DMSO 6h vs. 1h (LFC>2); dds dbet_interact_term; cts treat+cell_Line+timepoint+treat:timepoint; batch effect'
 )
+
 ```
 
 
@@ -339,14 +515,16 @@ for (name in names(res.list)){
   mdcat('## ', label, ' {.tabset}')
   mdcat('### Summary of results')
   print(knitr::kable(my.summary(res.i, dds.i)))
+  cat(paste0('resultsNames: ',paste(resultsNames(dds.i), collapse=', '), '\n\n'))
+  cat(paste(elementMetadata(res.i)$description, collapse='\n\n'))
   mdcat('### Normalized counts of top 3 upregulated genes')
   print(counts.plot(counts.df(dds.i, res.i, sel.genes=lfc.filter(res.i),
                               label='symbol', rank.col='padj'),
-                    3))
+                    3, no.aes=TRUE) + aes(y=normalized_counts, x=group, color=cell_Line))
   mdcat('### Normalized counts of top 3 downregulated genes')
   print(counts.plot(counts.df(dds.i, res.i, sel.genes=lfc.filter(res.i, reverse=TRUE),
                               label='symbol', rank.col='padj'),
-                    3))
+                    3, no.aes=TRUE) + aes(y=normalized_counts, x=group, color=cell_Line))
 
   # NOTE: gene labels --------------------------------------------------------
   #   By default we plot the symbols of the top 10 upregulated or downregulated

From e5999d482a9d33a1f4a363d671ce3c1a8edd0556 Mon Sep 17 00:00:00 2001
From: Caroline Esnault <caroline.esnault@nih.gov>
Date: Thu, 24 Sep 2020 08:21:55 -0400
Subject: [PATCH 02/19] cleanup

---
 workflows/rnaseq/downstream/helpers.Rmd | 871 ------------------------
 1 file changed, 871 deletions(-)
 delete mode 100644 workflows/rnaseq/downstream/helpers.Rmd

diff --git a/workflows/rnaseq/downstream/helpers.Rmd b/workflows/rnaseq/downstream/helpers.Rmd
deleted file mode 100644
index e811fe971..000000000
--- a/workflows/rnaseq/downstream/helpers.Rmd
+++ /dev/null
@@ -1,871 +0,0 @@
-```{r}
-library(DESeq2)
-library(pheatmap)
-library(RColorBrewer)
-library(DEGreport)
-library(ggplot2)
-library(ggrepel)
-library(heatmaply)
-library(readr)
-library(stringr)
-library(tibble)
-library(purrr)
-library(tidyr)
-
-#' Get the OrgDb for the specified organism, using the cached AnnotationHub.
-#'
-#' @param species Case-sensitive genus and species
-#' @param cache Directory in which the AnnotationHub cache is stored
-#' @param annotation_key_override If not NA, forces the hub to use this
-#'        accession. Use this when you know exactly which OrgDb to use.
-#'
-#' @return OrgDb object
-get.orgdb <- function(species, cache, annotation_key_override=NA){
-
-    # Workaround to allow AnnotationHub to use proxy. See
-    # https://github.com/Bioconductor/AnnotationHub/issues/4, and thanks
-    # Wolfgang!
-    proxy <- Sys.getenv('http_proxy')
-    if (proxy == ""){
-        proxy <- NULL
-    }
-
-    ah <- AnnotationHub(hub=getAnnotationHubOption('URL'),
-             cache=cache,
-             proxy=proxy,
-             localHub=FALSE)
-
-    find.annotationhub.name <- function(species.name, override.code) { #autodetect ah names based on loaded database
-        if (is.na(override.code)) {
-        ah.query <- query(ah, "OrgDb")
-        ah.query.speciesmatch <- grepl(paste("^", species.name, "$", sep=""), ah.query$species)
-        ah.query.which <- which(ah.query.speciesmatch)
-        stopifnot(length(ah.query.which) > 0) #require at least one match
-        if (length(ah.query.which) > 1) { #warn of duplicate matches
-           print("WARNING: found multiple candidate species in AnnotationHub: ");
-           print(ah.query.speciesmatch)
-        }
-        names(ah.query)[ah.query.which[1]]
-        } else {
-        override.code
-        }
-    }
-    annotation_key <- find.annotationhub.name(annotation_genus_species, annotation_key_override)
-    orgdb <- ah[[annotation_key]]
-    return(orgdb)
-}
-
-#' Plot an interactive PCA plot
-#'
-#' @param rld DESeqTransform object output by varianceStabilizingTransformation() or rlog()
-#' @param intgroup character vector of names in colData(x) to use for grouping
-#'
-#' @return Handle to ggplot with added label field in aes_string() for plotting with ggplotly()
-plotPCA.ly <- function(rld, intgroup){
-  mat <- plotPCA(rld, intgroup, returnData=TRUE)
-  pv <- attr(mat, 'percentVar')
-  p <- ggplot(data=mat, aes_string(x='PC1', y='PC2', color='group', label='name')) +
-          geom_point(size=3) + xlab(paste0('PC1: ', round(pv[1]*100), '% variance')) +
-          ylab(paste0('PC2: ', round(pv[2]*100), '% variance')) + coord_fixed()
-  return(p)
-}
-
-#' Plot a MA plot labeled with selected genes
-#'
-#' @param res.list data.frame (or list) with log2FoldChange and padj columns (or elements).
-#'        Row names of the data.frame should be gene IDs/symbols and match the format of lab.genes
-#' @param fdr.thres FDR threshold for defining statistical significance
-#' @param fc.thres log2FoldChange cutoff for defining statistical significance
-#' @param fc.lim User-defined limits for y-axis (log2FC). If NULL, this is defined as
-#'        the (floor, ceiling) of range(res$log2FoldChange)
-#' @param genes.to.label Genes to label on the MA plot. NULL, by default
-#' @param col Column of `res` in which to look for `genes.to.label`. If NULL,
-#'        rownames are used.
-#'
-#' @return Handle to ggplot
-plotMA.label <- function(res,
-                         fdr.thres=0.1,
-                         fc.thres=0,
-                         fc.lim=NULL,
-                         genes.to.label=NULL,
-                         col=NULL
-                         ){
-  # TODO: Add ggrastr option for points
-  genes.to.label <- as.character(genes.to.label)
-  nna <- sum(is.na(genes.to.label))
-  if (nna > 0){
-      warning(paste("Removing", nna, "NAs from gene list"))
-      genes.to.label <- genes.to.label[!is.na(genes.to.label)]
-  }
-  # convert res to data frame
-  res <- data.frame(res)
-
-  # if y limits not specified
-  if(is.null(fc.lim)){
-    fc.lim <- range(res$log2FoldChange, na.rm=TRUE)
-    fc.lim[1] <- floor(fc.lim[1])
-    fc.lim[2] <- ceiling(fc.lim[2])
-  }
-
-  # get data frame of genes outside plot limits
-  up.max <- res[res$log2FoldChange > fc.lim[2],]
-  up.max$log2FoldChange <- rep(fc.lim[2], dim(up.max)[1])
-  up.max <- data.frame(genes=rownames(up.max), up.max)
-
-  down.max <- res[res$log2FoldChange < fc.lim[1],]
-  down.max$log2FoldChange <- rep(fc.lim[1], dim(down.max)[1])
-  down.max <- data.frame(genes=rownames(down.max), down.max)
-
-  # get data frame of DE genes
-  de.list <- res[res$padj < fdr.thres &
-                 !is.na(res$padj) &
-                 abs(res$log2FoldChange) >= fc.thres,]
-  de.list <- data.frame(genes=rownames(de.list), de.list)
-
-  # get data frame of DE genes outside plot limits
-  up.max.de <- up.max[rownames(up.max) %in% rownames(de.list),]
-  down.max.de <- down.max[rownames(down.max) %in% rownames(de.list),]
-
-  # create ggplot with appropriate layers
-  p <- ggplot(res, aes(baseMean, log2FoldChange)) +
-    geom_point(col="gray40") + scale_x_log10() + ylim(fc.lim[1], fc.lim[2]) +
-    theme_bw() + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank())
-
-  p <- p + geom_hline(yintercept = 0, col="red", size=2, alpha=0.5)     # add horizontal line
-  p <- p + geom_point(data=up.max, col="gray40", pch=2)                 # add points above max y
-  p <- p + geom_point(data=down.max, col="gray40", pch=6)               # add points below min y
-
-  p <- p + geom_point(data=de.list, col="red")                          # add DE points
-  p <- p + geom_point(data=up.max.de, col="red", pch=2)                 # add DE points above max y
-  p <- p + geom_point(data=down.max.de, col="red", pch=6)               # add DE points below min y
-
-
-  if(!is.null(genes.to.label)){
-    # get data frame of genes to be labeled
-    if (!is.null(col)){
-        res$gene.labels <- res[,col]
-    } else {
-        res$gene.labels <- rownames(res)
-    }
-
-    label.list <- res[res$gene.labels %in% genes.to.label,]
-    #label.list <- data.frame(genes=rownames(label.list), label.list)
-
-    # label genes outside limits
-    up.max.idx <- rownames(label.list) %in% rownames(up.max)
-    down.max.idx <- rownames(label.list) %in% rownames(down.max)
-
-    if(sum(up.max.idx) > 0){
-      label.list$log2FoldChange[up.max.idx] <- rep(fc.lim[2], sum(up.max.idx))
-    }
-
-    if(sum(down.max.idx) > 0){
-      label.list$log2FoldChange[down.max.idx] <- rep(fc.lim[1], sum(down.max.idx))
-    }
-
-    # add labels
-    p <- p + geom_point(data=label.list, col="black", pch=1, size=3)
-    p <- p + geom_label_repel(data=label.list, aes(label=label.list$gene.labels, fontface="italic"))
-  }
-  return(p)
-
-}
-
-#' Plot a clustered heatmap of samples
-#'
-#' @param rld DESeqTransform object, typically output from running rlog()
-#' @param colData Dataframe of metadata, used for annotating heatmap
-#' @param cols.for.grouping Columns in colData to annotate heatmap with
-#'
-#' @return matrix of sample distances
-plot.heatmap <- function(rld, colData, cols.for.grouping){
-    sampleDists <- dist(t(assay(rld)))
-    sampleDistMatrix <- as.matrix(sampleDists)
-    rownames(sampleDistMatrix) <- colnames(rld)
-    colors <- colorRampPalette(rev(brewer.pal(9, 'Blues')))(255)
-    df <- as.data.frame(colData(rld)[, cols.for.grouping])
-    colnames(df) <- cols.for.grouping
-    rownames(df) <- colnames(rld)
-    heatmaply(sampleDistMatrix,
-              scale='none',
-              col=colors,
-              row_side_colors=df,
-              showticklabels=c(FALSE,TRUE))
-}
-
-
-#' Plot heatmap of most varying genes
-#'
-#' @param rld DESeqTransform object, typically output from running rlog()
-#' @param colData Dataframe of metadata, used for annotating heatmap
-#' @param n Number of genes to include
-#'
-#' @return Side effect is to plot the heatmap
-vargenes.heatmap <- function(rld, cols.for.grouping, n=50){
-  # TODO: heatmaply?
-  # TODO: allow alternative gene IDs
-  topVarGenes <- head(order(rowVars(assay(rld)), decreasing=TRUE), n)
-  mat <- assay(rld)[topVarGenes,]
-  mat <- mat - rowMeans(mat)
-  df <- as.data.frame(colData(rld)[, cols.for.grouping])
-  rownames(df) <- colnames(rld)
-  colnames(df) <- cols.for.grouping
-  pheatmap(mat, annotation_col=df, cluster_cols=TRUE)
-}
-
-
-#' Simple wrapper of cat() that makes markdown text easier to print
-#'
-#' @param ... Arguments to print
-#'
-#' Make sure you're in an R code chunk with `results="asis"` set.
-mdcat <- function(...){
-  cat('\n\n', ..., ' \n\n', sep='', fill=1500)
-}
-
-
-#' Convert a vector of BAM filenames into a vector of samplenames.
-#'
-#' @param x Character vector of BAM filenames (e.g., from the header of
-#'        featureCounts)
-#'
-lcdbwf.samplename <- function(x) {
-    x <- x %>%
-        str_remove_all('data/rnaseq_samples/') %>%
-        str_remove_all('.cutadapt.bam') %>%
-        str_split(fixed('/'), simplify=TRUE)
-    x[,1]
-}
-
-
-#' Load a single combined featureCounts table into a DESeq object.
-#'
-#' @param filename Filename containing featureCounts combined output
-#' @param sampletable data.frame containing at least sample names as first
-#'        column
-#' @param design Model used for creating DESeq object
-#' @param sample.func Function that will be applied to each column name
-#'        to align it to the input sampletable. Takes a character vector as
-#'        input and returns a character vector of adjusted strings.
-#' @param subset.counts If TRUE, then counts are subsetted by the provided
-#'        sampletable. That is, only the counts columns that match a rowname of
-#'        the provided sampletable are included. If FALSE (default), an error
-#'        is raised reporting the differences in sampletable and counts data.
-#'        It is always an error if counts data does not have an entry for
-#'        a sample in the sampletable.
-#'
-#' @return DESeq object
-#'
-#' Additional args are passed to DESeq2::DESeqDataSetFromMatrix.
-DESeqDataSetFromCombinedFeatureCounts <- function(filename, sampletable, design, sample.func=lcdbwf.samplename, subset.counts=FALSE, ...){
-
-    # The sampletable may be data.frame or tibble; if it's a tibble then it
-    # likely doesn't have rownames. So in this function we assume that it's the
-    # first column that contains the samplenames.
-
-    # Read in the counts TSV, use gene ID as rownames, and get rid of the Chr,
-    # Start, End, Strand, Length columns
-    m <- read_tsv(filename, comment="#") %>%
-        remove_rownames %>%
-        column_to_rownames('Geneid') %>%
-        dplyr::select(-(1:5)) %>%
-        as.data.frame
-
-    # The column names of the imported table are not guaranteed to match the
-    # samples in the metadata, so we need to make sure things match up
-    # correctly when renaming columns.
-    #
-    # sample.func should convert filenames (which are columns in featurecounts
-    # table) with samples (as listed in the sampletable)
-    x <- colnames(m) %>% sample.func
-
-    samplenames <- sampletable[,1]
-    counts.not.sampletable <- setdiff(x, samplenames)
-    sampletable.not.counts <- setdiff(samplenames, x)
-
-
-    if (!all(x %in% samplenames)){
-        # sampletable is missing:
-        if (!subset.counts){
-            stop(paste('The following samples are in the counts data but not the sampletable. If this is intended, consider using `subset.counts=TRUE` to remove them from the counts:', paste(counts.not.sampletable, collapse=', ')))
-        }
-    }
-    if (!all(samplenames %in% x)){
-        stop(
-           paste(
-             'The following samples are in the sampletable but not in the counts data. Check sample.func?',
-             paste(sampletable.not.counts, collapse=', '))
-        )
-    }
-
-    colnames(m) <- x
-
-    # We should be good -- so reorder columns in `m` to match sampletable
-    m <- m[, sampletable[,1]]
-
-    # Before creating the dds object, we will drop levels from factors, in case
-    # that was not done before sending in the (possibly subsetted) sampletable.
-
-    sampletable <- droplevels(as.data.frame(sampletable))
-
-    object <- DESeqDataSetFromMatrix(countData=m, colData=sampletable, design=design, ...)
-    return(object)
-}
-
-#' Load featureCounts output into a DESeq object.
-#'
-#' Revised version of DESeq2::DESeqDataSetFromHTSeqCount to handle the
-#' featureCounts default output format, which contains many more columns.
-#'
-#' @param sampleTable data.frame containing at least "featurecounts.path" column
-#' @param directory Paths to featureCounts output are relative to this dir
-#' @param design Model used for creating DESeq object
-#'
-#' @return DESeq object
-#'
-#' Additional args are passed to DESeq2::DESeqDataSetFromMatrix.
-DESeqDataSetFromFeatureCounts <- function (sampleTable, directory='.', design,
-                                           ignoreRank=FALSE,  ...)
-{
-  l <- lapply(
-    as.character(sampleTable[,'featurecounts.path']),
-    function(fn) read.table(file.path(directory, fn), stringsAsFactors=FALSE, skip=2)
-  )
-  if (!all(sapply(l, function(a) all(a$V1 == l[[1]]$V1))))
-    stop("Gene IDs in first column differ between files")
-  tbl <- sapply(l, function(a) a$V7)
-  colnames(tbl) <- sampleTable[, 1]
-  rownames(tbl) <- l[[1]]$V1
-  rownames(sampleTable) <- sampleTable[, 1]
-  object <- DESeqDataSetFromMatrix(countData=tbl, colData=sampleTable[, -grepl('path', colnames(sampleTable)),
-                                   drop=FALSE], design=design, ignoreRank, ...)
-  return(object)
-}
-
-#' Load Salmon quantification data into a DESeq object
-#'
-#' @param sampleTable data.frame containing at least "salmon.path" column
-#' @param design Model used for creating DESeq object
-#'
-#' @return DESeq object with transcript-level counts
-#'
-#' Additional args are passed to DESeq2::DESeqDataSetFromMatrix.
-DESeqDataSetFromSalmon <- function (sampleTable, design,
-                                           ignoreRank=FALSE,  ...)
-{
-    txi <- tximport(sampleTable[, 'salmon.path'], type='salmon', txOut=TRUE)
-    object <- DESeqDataSetFromTximport(txi, colData=sampleTable[, -grepl('path', colnames(sampleTable)),
-                                       drop=FALSE], design=design, ignoreRank, ...)
-    return(object)
-}
-
-#' Make a list of dds objects
-#'
-#' This function takes a list of sample, design pairs, plus other info, to output
-#' a list of DESeq object.
-#'
-#' @param deseq_obj_list A named list of lists. Each list should have two to three elements:
-#'       1 should be a sampletable, 2 should be a design. The 3rd is an optional list of arguments
-#'       to pass to the DESeqDataSet constructor.
-#' @param salmon.files txi object from tximport if transcript-level counts desired; NULL forces featurecounts;
-#'        shared by all dds.
-#' @param combine.by The column to collapse technical replicates by.
-#' @param remove.version Boolean to decide whether to remove version information
-#' @return A list of dds objects.
-#'
-make.dds.list <- function(deseq_obj_list, salmon.files=NULL,
-                          combine.by=FALSE, remove.version=TRUE, ...){
-    make.dds <- function(design_data, salmon.files, combine.by, remove.version, ...)
-    {
-
-        subset.counts <- FALSE
-        colData <- pluck(design_data, 1)
-        design <- pluck(design_data, 2)
-        location <- pluck(design_data, 'file', .default='../data/rnaseq_aggregation/featurecounts.txt')
-        # arg list passes args to either the CombinedFeatureCounts or FromSalmon constructor
-        arg_list <- pluck(design_data, 'args')
-
-        if(is.null(salmon.files)){
-            # Load gene-level counts
-           dds<- exec(DESeqDataSetFromCombinedFeatureCounts,
-                location,
-                sampletable=colData,
-                design=design,
-                !!!arg_list)
-                if (remove.version){
-                    rownames(dds) <- sapply(strsplit(rownames(dds), '.', fixed=TRUE), function (x) x[1])
-                }
-        } else {
-            dds <- exec(DESeqDataSetFromTximport, salmon.files, colData=colData[, -grepl('path', colnames(colData)),
-                                            drop=FALSE], design=design, !!!arg_list)
-        }
-
-        if(combine.by != FALSE){
-            dds <-collapseReplicates(dds, dds[[combine.by]])
-        }
-
-        # Constructs the dds object and passes any arguments such as parallel
-        dds <- DESeq(dds, ...)
-
-        return(dds)
-    }
-    dds_list <- map(deseq_obj_list, make.dds, salmon.files, combine.by, remove.version, ...)
-
-    return(dds_list)
-}
-
-#' Compute label for one component of an arbitrary design matrix
-#'
-#' @param pattern.mat single row from unique(model.matrix) call from a dds object
-#'
-#' @return formatted column name
-
-format.group.name <- function(pattern.mat) {
-    my.res <- "counts"
-    # enforce input format
-    stopifnot(is.vector(pattern.mat))
-    # if there's only one element
-    if (length(pattern.mat) == 1) {
-        # it must be the intercept
-    my.res <- paste(my.res, "groupMean", sep=".")
-    } else { # more than one element
-        # for every non-intercept entry
-        for (i in 2:length(pattern.mat)) {
-        # if 1, include
-        if (pattern.mat[i] == 1) my.res <- paste(my.res, names(pattern.mat[i]), sep=".")
-        # otherwise don't
-        else my.res <- paste(my.res, ".not", names(pattern.mat[i]), sep="")
-    }
-    }
-    my.res
-}
-
-#' Plot a gene's normalized counts across samples
-#'
-#' @param gene Gene ID
-#' @param dds DESeq object from which to extract counts
-#'
-#' @return ggplot object
-my.counts <- function(gene, dds, label=NULL, intgroup='group'){
-
-  # Assumption: color genes by group
-  geneCounts <- plotCounts(dds, gene=gene, intgroup=intgroup, returnData=TRUE)
-  p <- ggplot(geneCounts, aes_string(x=intgroup, y='count', color=intgroup, group=intgroup)) +
-    scale_y_log10() +
-    geom_point(position=position_jitter(width=.1, height=0),  size=3) +
-    geom_line(color='#000000') +
-    ggtitle(gene)
-
-  if (!is.null(label)){
-    p <- p + ggtitle(label)
-  }
-  return(p)
-}
-
-#' Re-order results by log2FoldChange
-#'
-#' @param res DESeq2 results object
-#' @param reverse If TRUE then sort high-to-low
-#'
-#' @return Re-ordered results object
-lfc.order <- function(res, reverse=FALSE){
-    res.na <- res[!is.na(res$log2FoldChange),]
-    if (!reverse){
-        return(res.na[order(res.na$log2FoldChange),])
-    }
-    if (reverse){
-        return(res.na[rev(order(res.na$log2FoldChange)),])
-    }
-}
-
-#' Re-order results by adjusted pval
-#'
-#' @param res DESeq2 results object
-#' @param reverse If TRUE then sort high-to-low
-#'
-#' @return Re-ordered results object
-padj.order <- function(res, reverse=FALSE){
-    res.na <- res[!is.na(res$log2FoldChange) & !is.na(res$log2FoldChange),]
-    if (!reverse){
-        res.na <- res.na[res.na$log2FoldChange > 0,]
-    } else {
-        res.na <- res.na[res.na$log2FoldChange < 0,]
-    }
-    return(res.na[order(res.na$padj),])
-}
-
-
-#' Plot normalized gene counts for the top N genes
-#'
-#' @param sorted DESeq2 results object
-#' @param n Number of top genes to plot
-#' @param func Plotting function to call on each gene. Signature is func(gene,
-#'             dds, label=label, ...)
-#' @param dds DESeq2 object
-#' @param label Vector of column names in `res` from which to add a label to
-#'        the gene (e.g., c('symbol', 'alias'))
-#' @param ... Additional arguments that are passed to `func`
-#' @return Side effect is to create plot
-top.plots <- function(res, n, func, dds, add_cols=NULL, ...){
-    ps <- list()
-    for (i in seq(n)){
-        gene <- rownames(res)[i]
-        add_label <- as.character(as.data.frame(res)[i, add_cols])
-        add_label <- add_label[!is.na(add_label)]
-        label <- paste(gene, add_label, sep=' | ')
-        if (length(label) == 0){
-          label <- NULL
-        }
-        ps[[gene]] <- func(gene, dds, label=label, ...)
-    }
-    grid.arrange(grobs=ps)
-}
-
-#' Make dataframe of normalized gene counts
-#'
-#' @param dds DESeq2 object
-#' @param res DESeq2 results object
-#' @param sel.genes list of genes to consider
-#' @param label column(s) to be included to the plot labels
-#' @param pc count number to be added to the normalized counts
-#'        Typically a pc of 0.5 is added to allow plotting in log scale
-#'
-#' @return dataframe
-counts.df <- function(dds, res, sel.genes=NULL, label=NULL, rank.col='padj', pc=0.5) {
-    # getting normalized counts copied from plotCounts()
-    cnts <- counts(dds, normalized = TRUE, replaced = FALSE)
-    # add 0.5 like plotCounts to plot 0 on log scale
-    cnts <- cnts + pc
-    # merge with res.i and colData
-    df <- as.data.frame(cbind(cnts, res))
-    # add label for plotting
-    df <- df %>% mutate(label = paste(gene, !!!syms(label), sep=' | '))
-    # subset to sel.genes if not NULL (then keep all)
-    if (!is.null(sel.genes)) {
-        df <- df %>%
-            filter(gene %in% sel.genes)
-    }
-    # add rank
-    df <- df %>%
-        mutate(rank = rank(!!sym(rank.col), ties.method='first', na.last='keep')) %>%
-        pivot_longer(colnames(cnts), names_to='samplename', values_to='normalized_counts')
-    # add colData
-    df <- merge(df, colData(dds), by='samplename') %>%
-        as.data.frame()
-    return(df)
-}
-
-#' Filter results by log2FoldChange sign
-#'
-#' @param res DESeq2 results object
-#' @param reverse If TRUE then filter negative lfc
-#'
-#' @return filtered genes
-lfc.filter <- function(res, reverse=FALSE){
-    res.na <- res[!is.na(res$log2FoldChange),]
-    if (!reverse){
-        res.na <- res.na[res.na$log2FoldChange > 0,]
-    } else {
-        res.na <- res.na[res.na$log2FoldChange < 0,]
-    }
-    return(res.na[,'gene'])
-}
-
-#' Plot genes' normalized counts across samples
-#'
-#' @param df dataframe from which to extract genes normalized_counts
-#' @param rank.nb rank number less or equal used to filter the genes to be plotted
-#' @param no.aes whether to include the default aes or return the ggplot object
-#' without aes
-#' @param facet name of column to use for faceting
-#'
-#' @return ggplot object
-counts.plot <- function(df, rank.nb=NULL, no.aes=FALSE, facet='label') {
-    if (!is.null(rank.nb)) {
-        df <- df %>%
-            filter(rank <= rank.nb)
-    }
-    df <- df %>%
-        arrange(rank) %>%
-        mutate(facet = factor(!!!syms(facet), levels = unique(!!!syms(facet))))
-    plt <- ggplot(df) +
-        scale_y_log10() +
-        geom_point(position=position_jitter(width=.1, height=0),  size=3) +
-        geom_line(color='#000000') +
-        theme_bw() +
-        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
-        facet_wrap(.~facet, ncol=1, scales='free_y')
-    if (!no.aes) {
-        plt <- plt +
-            aes(y=normalized_counts, x=group, color=group)
-    }
-    return(plt)
-}
-
-#' Plot a histogram of raw pvals
-#'
-#' @param res DESeq2 results object
-#'
-#' @return Side effect is to create plot
-pval.hist <- function(res){
-    hist(res$pvalue[res$baseMean>1], breaks=0:20/20, col='grey50',
-         border='white', xlab='P-value', main='Distribution of p-values')
-}
-
-#' Summarize DESeq2 results into a dataframe
-#'
-#' summary(res) prints out info; this function captures it into a dataframe
-#'
-#' @param res DESeq2 results object
-#' @param dds DEseq2 object
-#' @param alpha Alpha level at which to call significantly changing genes
-#'
-#' @return Dataframe of summarized results
-my.summary <- function(res, dds, alpha, lfc.thresh=0, ...){
-   if (missing(alpha)){
-       alpha <- if (is.null(metadata(res)$alpha)){ 0.1 } else { metadata(res)$alpha }
-   }
-   notallzero <- sum(res$baseMean > 0)
-   up <- sum(res$padj < alpha & res$log2FoldChange > lfc.thresh, na.rm=TRUE)
-   down <- sum(res$padj < alpha & res$log2FoldChange < -lfc.thresh, na.rm=TRUE)
-   filt <- sum(!is.na(res$pvalue) & is.na(res$padj))
-   outlier <- sum(res$baseMean > 0 & is.na(res$pvalue))
-   ft <- if(is.null(metadata(res)$filterThreshold)){ 0 } else { round(metadata(res)$filterThreshold) }
-   # adjust width.cutoff as newline insertion causes this to return a df with
-   # multiple duplicate rows!
-   df <- data.frame(
-                    total.annotated.genes=nrow(res),
-                    total.nonzero.read.count=notallzero,
-                    alpha=alpha,
-                    lfcThreshold=lfc.thresh,
-                    up=up,
-                    down=down,
-                    outliers=outlier,
-                    low.counts=filt,
-                    design=deparse(design(dds), width.cutoff=500L)
-                    )
-   return(df)
-}
-
-#' Convert "1/100" to 0.01.
-#'
-#' clusterProfiler report columns that are strings of numbers; this converts to
-#' a fraction
-#'
-#' @param x Character vector to convert
-get.frac <- function(x){
-    y <- as.numeric(strsplit(x, '/')[[1]])
-    return(y[[1]] / y[[2]])
-}
-
-#' Summarize and aggregate multiple GO results
-#'
-#' Convert a list of GO analysis results (one per ontology) to a single
-#' dataframe, with additional label column filled in with `label` and with
-#' a fraction column.
-#'
-#' @param ego List of clusterProfiler results objects
-#' @param labels List of labels. For each name, a new column will be added and
-#"        its contents will be the value
-#'
-#' @return dataframe
-summarize.go <- function(ego, labels){
-  lst <- list()
-  for (name in names(ego)){
-    d <- as.data.frame(ego[[name]])
-    if (nrow(d) > 0){
-      d$ontology <- name
-      for (label in names(labels)){
-        d[label] <- labels[[label]]
-      }
-      d$frac <- sapply(d$GeneRatio, get.frac)
-    }
-    lst[[name]] <- d
-  }
-  df <- do.call(rbind, lst)
-  return(df)
-}
-
-
-#' Summarize KEGG results
-#'
-#' Summarize KEGG results and add `frac` and `label` columns
-#'
-#' @param ekg Results from clusterProfiler KEGG enrichment
-#' @param label Attach this label to the "label" column
-#'
-#' @return Dataframe
-summarize.kegg <- function(ekg, labels){
-  d <- as.data.frame(ekg)
-  if (nrow(d) > 0){
-    d$ontology <- 'kegg'
-    for (label in names(labels)){
-      d[label] <- labels[[label]]
-    }
-    d$frac <- sapply(d$GeneRatio, get.frac)
-  }
-  return(d)
-}
-
-#' Attach additional information from an OrgDb to a results dataframe
-#'
-#' @param res DESeq2 results object
-#' @param keytype Key type for rownames(res)
-#' @param columns Character vector of columns to add from OrgDb to results
-attach.info <- function(res, keytype='ENSEMBL', columns=c('SYMBOL', 'UNIPROT', 'ALIAS')){
-    keys <- rownames(res)
-    for (column in columns){
-      label <- tolower(column)
-      res[label] <- mapIds(orgdb, keys=keys, column=column, keytype=keytype, multiVal='first')
-      res[[label]] <- ifelse(is.na(res[[label]]), keys, res[[label]])
-    }
-    # Put "gene" column as the first
-    cn <- colnames(res)
-    res$gene <- rownames(res)
-    res <- res[, c('gene', cn)]
-    return(res)
-}
-
-
-#' Combine everything in the results list into a single table
-#'
-#' @param res.list Named list of lists, where each sublist contains the following
-#'                 names: c('res', 'dds', 'label'). "res" is a DESeqResults object,
-#'                 "dds" is either the indexing label for the dds.list object or
-#'                  the DESeq object, and "label" is a nicer-looking
-#'                 label to use. NOTE: backwards compatibility with older versions
-#'                  of lcdb-wf depends on no dds.list object being passed.
-#'
-#' @return Dataframe
-summarize.res.list <- function(res.list, alpha, lfc.thresh, dds.list=NULL){
-    slist <- list()
-    for (name in names(res.list)){
-        if(!is.null(dds.list)){
-            x <- my.summary(res.list[[name]][['res']], dds.list[[ res.list[[name]][['dds']] ]], alpha, lfc.thresh)
-        } else { x <- my.summary(res.list[[name]][['res']], res.list[[name]][['dds']], alpha, lfc.thresh)
-        }
-        rownames(x) <- res.list[[name]][['label']]
-        slist[[name]] <- x
-    }
-    slist <- do.call(rbind, slist)
-    return(slist)
-}
-
-
-#' Return index of up/down/changed genes
-#'
-#' @param x DESeq2 results object, or data.frame created from one
-#' @param direction Direction in 'up', 'dn', 'down', 'ch', 'changed'
-#' @param alpha FDR lower than this will be considered significant
-#' @param thresh Log2 fold change threshold. If e.g. 2, will return < -2 and/or > 2, depending on the value of "direction"
-#' @param return.names If TRUE, returns the rownames of selected genes; if FALSE return boolean index of length(x)
-#'
-#' @return Character vector of rownames (if return.names=TRUE) or boolean vector of genes selected.
-get.sig <- function(x, direction='up', alpha=0.1, lfc.thresh=0, return.names=TRUE){
-    if (direction == 'up'){
-        idx <- (x$padj < alpha) & (x$log2FoldChange > lfc.thresh) & (!is.na(x$padj))
-    } else if (direction %in% c('down', 'dn')){
-        idx <- (x$padj < alpha) & (x$log2FoldChange < -lfc.thresh) & (!is.na(x$padj))
-    } else if (direction %in% c('changed', 'ch')){
-        idx <- (x$padj < alpha) & (abs(x$log2FoldChange) > lfc.thresh) & (!is.na(x$padj))
-    }
-    if (return.names){
-        return(rownames(x)[idx])
-    } else {
-        return(idx)
-    }
-}
-
-#' Prepare list for UpSet plots, but include rownames
-#'
-#' @param x List of sets to compare (same input as to UpSetR::fromList)
-#'
-#' @return data.frame of 1 and 0 showing which genes are in which sets
-fromList.with.names <- function(lst){
-    elements <- unique(unlist(lst))
-    data <- unlist(lapply(lst, function(x) {
-        x <- as.vector(match(elements, x))
-    }))
-    data[is.na(data)] <- as.integer(0)
-    data[data != 0] <- as.integer(1)
-    data <- data.frame(matrix(data, ncol = length(lst), byrow = F))
-
-    # This is the only way in which this function differs from UpSetR::fromList
-    rownames(data) <- elements
-
-    data <- data[which(rowSums(data) != 0), ]
-    names(data) <- names(lst)
-    return(data)
-}
-
-
-#' Move rownames of data.frame to first column
-#'
-#' @param x data.frame
-#' @param name Name to use as the new first column
-#'
-#' @return data.frame with original rownames moved to new first column with
-#' optional name. rownames on the new data.frame are set to NULL.
-rownames.first.col <- function(x, name='names'){
-    orig.names <- colnames(x)
-    x <- data.frame(rownames(x), x, row.names=NULL)
-    colnames(x) <- c(name, orig.names)
-    return(x)
-}
-
-
-#' lapply over a nested list
-#'
-#' @param x Nested list. Expected 2 levels deep, and each item is the same type
-#' @param subfunc Function to apply over nested "leaf" items
-#' @param ... Additional arguments to be passed to subfunc
-#'
-#' @return Nested list with same structure but tranformed values
-nested.lapply <- function(x, subfunc, ...){
-    lapply(x, lapply, subfunc, ...)
-}
-
-#' Split clusterProfiler output into one line per gene
-#'
-#' @param x Results from clusterProfiler. It is expected that the
-#' clusterProfiler enrichment function was called with "readable=TRUE"
-#'
-#' @return data.frame with genes one per line instead of "/" separated in one
-#' line. The rest of the original line is repeated for each gene.
-#'
-split.clusterProfiler.results <- function(x){
-    df <- x@result
-    # loop over all rows
-    df.parse <- NULL
-    for(k in 1:dim(df)[1]){
-        g <- strsplit(as.character(df$geneID[k]), "/")[[1]]
-        gg <- df[rep(k, length(g)),]
-        gg$geneParse <- g
-        if(is.null(df.parse)){
-            df.parse <- gg
-        } else {
-            df.parse <- rbind(df.parse, gg)
-        }
-    }
-    return(df.parse)
-}
-
-
-#' Writes out original and split clusterprofiler results
-#'
-#' @param res clusterProfiler results
-#' @param cprof.folder Directory in which to save files. Will be created if needed.
-#' @param Label to use for the results. Will generate a filename
-#' cprof.folder/label.txt and cprof.folder/label_split.txt
-#'
-#' @return List of the two files that are created on disk.
-write.clusterprofiler.results <- function(res, cprof.folder, label){
-    dir.create(cprof.folder, showWarnings=FALSE, recursive=TRUE)
-    filename.orig <- file.path(cprof.folder, paste0(label, '.txt'))
-    write.table(res, file=filename.orig, sep='\t', quote=FALSE, row.names=FALSE)
-    filename.split <- file.path(cprof.folder, paste0(label, '_split.txt'))
-    res.split <- split.clusterProfiler.results(res)
-    write.table(res.split, file=filename.split, sep='\t', quote=FALSE, row.names=FALSE)
-    return(list(orig=filename.orig, split=filename.split))
-}
-```

From 4cec15dd879ed593971ab591fb619721a7251241 Mon Sep 17 00:00:00 2001
From: Caroline Esnault <caroline.esnault@nih.gov>
Date: Thu, 24 Sep 2020 08:58:15 -0400
Subject: [PATCH 03/19] configs for large test dataset

---
 .../reference_configs/Test_Homo_sapiens.yaml  |  44 ++++++
 workflows/chipseq/config/config.yaml          | 137 +++++++++---------
 workflows/chipseq/config/sampletable.tsv      |  28 ++--
 workflows/rnaseq/config/config.yaml           |  47 +++---
 workflows/rnaseq/config/sampletable.tsv       |  42 +++++-
 5 files changed, 192 insertions(+), 106 deletions(-)
 create mode 100644 include/reference_configs/Test_Homo_sapiens.yaml

diff --git a/include/reference_configs/Test_Homo_sapiens.yaml b/include/reference_configs/Test_Homo_sapiens.yaml
new file mode 100644
index 000000000..f8392fb6b
--- /dev/null
+++ b/include/reference_configs/Test_Homo_sapiens.yaml
@@ -0,0 +1,44 @@
+references:
+  human:
+    small-gencode-v28:
+      metadata:
+        reference_genome_build: 'hg38.p12'
+        reference_effective_genome_count: 2.7e9
+        reference_effective_genome_proportion: 0.87
+
+      genome:
+        url: 'https://github.com/lcdb/lcdb-test-data-human/raw/master/seq/hg38.small.fa'
+        postprocess:
+          function: 'lib.postprocess.utils.convert_fasta_chroms'
+          args: 'https://raw.githubusercontent.com/NICHD-BSPC/chrom-name-mappings/d73fdd4d62ca7e845f9357ea5f08d7a918c17e97/mappings/human/gencode_GRCh38.28_to_ucsc_hg38/mappings_gencode_GRCh38.28_to_ucsc_hg38.tsv'
+
+        indexes:
+          - 'hisat2'
+          - 'bowtie2'
+
+      annotation:
+        url: 'https://github.com/lcdb/lcdb-test-data-human/raw/master/annotation/hg38.small.gtf'
+        postprocess:
+          function: 'lib.postprocess.utils.convert_gtf_chroms'
+          args: 'https://raw.githubusercontent.com/NICHD-BSPC/chrom-name-mappings/d73fdd4d62ca7e845f9357ea5f08d7a918c17e97/mappings/human/gencode_GRCh38.28_to_ucsc_hg38/mappings_gencode_GRCh38.28_to_ucsc_hg38.tsv'
+        conversions:
+          - 'refflat'
+          - 'bed12'
+
+      transcriptome:
+        indexes:
+          - 'salmon'
+
+
+    rRNA:
+      genome:
+        url:
+          - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz'
+          - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz'
+        postprocess:
+          function: 'lib.common.filter_fastas'
+          args: 'Homo sapiens'
+        indexes:
+          - 'hisat2'
+          - 'bowtie2'
+
diff --git a/workflows/chipseq/config/config.yaml b/workflows/chipseq/config/config.yaml
index 287119737..9c159be56 100644
--- a/workflows/chipseq/config/config.yaml
+++ b/workflows/chipseq/config/config.yaml
@@ -5,7 +5,7 @@
 sampletable: 'config/sampletable.tsv'
 
 # Which key in the `references` dict below to use
-organism: 'dmel'
+organism: 'human'
 
 # If not specified here, use the environment variable REFERENCES_DIR.
 references_dir: 'references_data'
@@ -37,103 +37,108 @@ chipseq:
   # at least a BED file of peaks.
   #
   peak_calling:
-    - label: gaf-embryo-sicer
-      algorithm: sicer
+    - label: BRD4_dBET6_1
+      algorithm: macs2
       ip:
-        - gaf-embryo-1
+        - BRD4_dBET6_1
       control:
-        - input-embryo-1
-      redundancy_threshold: 1
-      window_size: 200
-      fragment_size: 150
+        - mockIgG_dBET6_1
       # optional user-specified override mappable genome proportion if
       # specified here, SICER will use this value instead of the value specific
       # to the genome build if NOT specified here, SICER will use the
       # mappability value for your genome build
-      effective_genome_fraction: 0.75
-      genome_build: dm6
-      gap_size: 600
-      fdr: 0.01
+      #effective_genome_count: 7e7
+      extra: '--nomodel -p 0.05 --cutoff-analysis'    # --broad for histones, paper says  in ‘histone’ mode for MTHFD1, but got very small number of peaks with --broad
 
+    - label: BRD4_dBET6_2
+      algorithm: macs2
+      ip:
+        - BRD4_dBET6_2
+      control:
+        - mockIgG_dBET6_2
+      extra: '--nomodel -p 0.05 --cutoff-analysis'
 
-    - label: gaf-embryo-1
+    - label: BRD4_DMSO_1
       algorithm: macs2
       ip:
-        - gaf-embryo-1
+        - BRD4_DMSO_1
       control:
-        - input-embryo-1
-      # optional user-specified override mappable genome size if specified
-      # here, MACS will use this value instead of the value specific to the
-      # genome build if NOT specified here, MACS will use the mappability value
-      # for your genome build
-      effective_genome_count: 7e7
-      extra: '--nomodel --extsize 147'
-
-    - label: gaf-embryo-1
-      algorithm: spp
+        - mockIgG_DMSO_1
+      extra: '--nomodel -p 0.05 --cutoff-analysis'
+
+    - label: BRD4_DMSO_2
+      algorithm: macs2
       ip:
-        - gaf-embryo-1
+        - BRD4_DMSO_2
       control:
-        - input-embryo-1
-      extra:
-        fdr: 0.3
-        zthr: 4
+        - mockIgG_DMSO_2
+      extra: '--nomodel -p 0.05 --cutoff-analysis'
 
-    - label: gaf-embryo-1-defaults
-      algorithm: spp
+    - label: MTHFD1_dBET6_1
+      algorithm: macs2
       ip:
-        - gaf-embryo-1
+        - MTHFD1_dBET6_1
       control:
-        - input-embryo-1
+        - mockIgG_dBET6_1
+      extra: '--nomodel -p 0.05 --cutoff-analysis'    # --broad for histones, paper says  in ‘histone’ mode for MTHFD1
 
-    - label: gaf-wingdisc-pooled
+    - label: MTHFD1_dBET6_1_inputCTRL
       algorithm: macs2
       ip:
-        - gaf-wingdisc-1
-        - gaf-wingdisc-2
+        - MTHFD1_dBET6_1
       control:
-        - input-wingdisc-1
-        - input-wingdisc-2
-      extra: '--nomodel --extsize 147'
+        - input_dBET6_1
+      extra: '--nomodel -p 0.05 --cutoff-analysis'
 
-    - label: gaf-wingdisc-pooled
-      algorithm: spp
+    - label: MTHFD1_dBET6_2
+      algorithm: macs2
       ip:
-        - gaf-wingdisc-1
-        - gaf-wingdisc-2
+        - MTHFD1_dBET6_2
       control:
-        - input-wingdisc-1
-        # - input-wingdisc-2
-      extra:
-        fdr: 0.5
-        zthr: 4
+        - mockIgG_dBET6_2
+      extra: '--nomodel -p 0.05 --cutoff-analysis'
+
+    - label: MTHFD1_DMSO_1
+      algorithm: macs2
+      ip:
+        - MTHFD1_DMSO_1
+      control:
+        - mockIgG_DMSO_1
+      extra: '--nomodel -p 0.05 --cutoff-analysis'
+
+    - label: MTHFD1_DMSO_2
+      algorithm: macs2
+      ip:
+        - MTHFD1_DMSO_2
+      control:
+        - mockIgG_DMSO_2
+      extra: '--nomodel -p 0.05 --cutoff-analysis'
 
 fastq_screen:
   - label: rRNA
-    organism: dmel
-    tag: test
+    organism: human
+    tag: small-gencode-v28
   - label: PhiX
     organism: phix
     tag: default
-  - label: Fly
-    organism: dmel
-    tag: test
-
-merged_bigwigs:
-  input-wingdisc:
-    - input-wingdisc-1
-    - input-wingdisc-2
-  gaf-wingdisc:
-    - gaf-wingdisc-1
-    - gaf-wingdisc-2
-  gaf-embryo:
-    - gaf-embryo-1
+  - label: Human
+    organism: human
+    tag: small-gencode-v28
+
+#merged_bigwigs:
+#  input-wingdisc:
+#    - input-wingdisc-1
+#    - input-wingdisc-2
+#  gaf-wingdisc:
+#    - gaf-wingdisc-1
+#    - gaf-wingdisc-2
+#  gaf-embryo:
+#    - gaf-embryo-1
 
 aligner:
   index: 'bowtie2'
-  tag: 'test'
+  tag: 'small-gencode-v28'
 
 include_references:
   - '../../include/reference_configs/PhiX.yaml'
-  - '../../include/reference_configs/Drosophila_melanogaster.yaml'
-  - '../../include/reference_configs/test.yaml'
+  - '../../include/reference_configs/Test_Homo_sapiens.yaml'
diff --git a/workflows/chipseq/config/sampletable.tsv b/workflows/chipseq/config/sampletable.tsv
index 05212460d..603f201e5 100644
--- a/workflows/chipseq/config/sampletable.tsv
+++ b/workflows/chipseq/config/sampletable.tsv
@@ -1,11 +1,17 @@
-# Samplenames with the same "label" will be considered technical replicates
-samplename	antibody	biological_material	replicate	label	orig_filename
-input_1	input	wingdisc-1	1	input-wingdisc-1	data/example_data/chipseq_input1.fq.gz
-input_2	input	wingdisc-2	2	input-wingdisc-2	data/example_data/chipseq_input2.fq.gz
-ip_1	gaf	wingdisc-1	1	gaf-wingdisc-1	data/example_data/chipseq_ip1.fq.gz
-ip_2	gaf	wingdisc-2	2	gaf-wingdisc-2	data/example_data/chipseq_ip2.fq.gz
-
-# Note here we are treating ip_3 and ip_4 as technical replicates for the sake of testing
-ip_3	gaf	embryo-1	1	gaf-embryo-1	data/example_data/chipseq_ip3.fq.gz
-ip_4	gaf	embryo-1	1	gaf-embryo-1	data/example_data/chipseq_ip4.fq.gz
-input_3	input	embryo-1	1	input-embryo-1	data/example_data/chipseq_input3.fq.gz
+samplename	label	biological_material	group	GEO_Run	LibraryLayout	source_name	Treatment	antibody	replicate	orig_filename
+BRD4_dBET6_1	BRD4_dBET6_1	HAP1_B1	BRD4_dBET6	SRR6202977	SINGLE	ChIP-seq for BRD4 in HAP1 cell line treated with dBET6	dBET6	BRD4	1	data/example_data/BRD4_dBET6_1.small_R1.fastq.gz
+BRD4_dBET6_2	BRD4_dBET6_2	HAP1_B2	BRD4_dBET6	SRR6202978	SINGLE	ChIP-seq for BRD4 in HAP1 cell line treated with dBET6	dBET6	BRD4	2	data/example_data/BRD4_dBET6_2.small_R1.fastq.gz
+BRD4_DMSO_1	BRD4_DMSO_1	HAP1_D1	BRD4_DMSO	SRR6202979	SINGLE	ChIP-seq for BRD4 in HAP1 cell line treated with DMSO	DMSO	BRD4	1	data/example_data/BRD4_DMSO_1.small_R1.fastq.gz
+BRD4_DMSO_2	BRD4_DMSO_2	HAP1_D2	BRD4_DMSO	SRR6202980	SINGLE	ChIP-seq for BRD4 in HAP1 cell line treated with DMSO	DMSO	BRD4	2	data/example_data/BRD4_DMSO_2.small_R1.fastq.gz
+mockIgG_dBET6_1	mockIgG_dBET6_1	HAP1_B1	mockIgG_dBET6	SRR6202981	SINGLE	ChIP-seq with mock IgG antibody in HAP1 cell line treated with dBET6	dBET6	mockIgG	1	data/example_data/mockIgG_dBET6_1.small_R1.fastq.gz
+mockIgG_dBET6_2	mockIgG_dBET6_2	HAP1_B2	mockIgG_dBET6	SRR6202982	SINGLE	ChIP-seq with mock IgG antibody in HAP1 cell line treated with dBET6	dBET6	mockIgG	2	data/example_data/mockIgG_dBET6_2.small_R1.fastq.gz
+mockIgG_DMSO_1	mockIgG_DMSO_1	HAP1_D1	mockIgG_DMSO	SRR6202983	SINGLE	ChIP-seq with mock IgG antibody in HAP1 cell line treated with DMSO	DMSO	mockIgG	1	data/example_data/mockIgG_DMSO_1.small_R1.fastq.gz
+mockIgG_DMSO_2	mockIgG_DMSO_2	HAP1_D2	mockIgG_DMSO	SRR6202984	SINGLE	ChIP-seq with mock IgG antibody in HAP1 cell line treated with DMSO	DMSO	mockIgG	2	data/example_data/mockIgG_DMSO_2.small_R1.fastq.gz
+input_dBET6_1	input_dBET6_1	HAP1_B1	input_dBET6	SRR6202985	SINGLE	ChIP-seq for Input in HAP1 cell line treated with dBET6	dBET6	input	1	data/example_data/input_dBET6_1.small_R1.fastq.gz
+input_dBET6_2	input_dBET6_2	HAP1_B2	input_dBET6	SRR6202986	SINGLE	ChIP-seq for Input in HAP1 cell line treated with dBET6	dBET6	input	2	data/example_data/input_dBET6_2.small_R1.fastq.gz
+input_DMSO_1	input_DMSO_1	HAP1_D1	input_DMSO	SRR6202987	SINGLE	ChIP-seq for Input in HAP1 cell line treated with DMSO	DMSO	input	1	data/example_data/input_DMSO_1.small_R1.fastq.gz
+input_DMSO_2	input_DMSO_2	HAP1_D2	input_DMSO	SRR6202988	SINGLE	ChIP-seq for Input in HAP1 cell line treated with DMSO	DMSO	input	2	data/example_data/input_DMSO_2.small_R1.fastq.gz
+MTHFD1_dBET6_1	MTHFD1_dBET6_1	HAP1_B1	MTHFD1_dBET6	SRR6202989	SINGLE	ChIP-seq for MTHFD1 in HAP1 cell line treated with dBET6	dBET6	MTHFD1	1	data/example_data/MTHFD1_dBET6_1.small_R1.fastq.gz
+MTHFD1_dBET6_2	MTHFD1_dBET6_2	HAP1_B2	MTHFD1_dBET6	SRR6202990	SINGLE	ChIP-seq for MTHFD1 in HAP1 cell line treated with dBET6	dBET6	MTHFD1	2	data/example_data/MTHFD1_dBET6_2.small_R1.fastq.gz
+MTHFD1_DMSO_1	MTHFD1_DMSO_1	HAP1_D1	MTHFD1_DMSO	SRR6202991	SINGLE	ChIP-seq for MTHFD1 in HAP1 cell line treated with DMSO	DMSO	MTHFD1	1	data/example_data/MTHFD1_DMSO_1.small_R1.fastq.gz
+MTHFD1_DMSO_2	MTHFD1_DMSO_2	HAP1_D2	MTHFD1_DMSO	SRR6202992	SINGLE	ChIP-seq for MTHFD1 in HAP1 cell line treated with DMSO	DMSO	MTHFD1	2	data/example_data/MTHFD1_DMSO_2.small_R1.fastq.gz
diff --git a/workflows/rnaseq/config/config.yaml b/workflows/rnaseq/config/config.yaml
index c0de423e7..490948db4 100644
--- a/workflows/rnaseq/config/config.yaml
+++ b/workflows/rnaseq/config/config.yaml
@@ -3,53 +3,52 @@ sampletable: 'config/sampletable.tsv'
 patterns: 'config/rnaseq_patterns.yaml'
 
 # Which key in the `references` dict below to use
-organism: 'dmel'
+organism: 'human'
 
 # If not specified here, use the environment variable REFERENCES_DIR.
 references_dir: 'references_data'
 
 aligner:
   index: 'hisat2'
-  tag: 'test'
+  tag: 'small-gencode-v28'
 
 rrna:
   index: 'bowtie2'
   tag: 'rRNA'
 
 gtf:
-  tag: "test"
+  tag: "small-gencode-v28"
 
 salmon:
-  tag: "test"
+  tag: "small-gencode-v28"
 
 fastq_screen:
   - label: rRNA
-    organism: dmel
-    tag: test
+    organism: human
+    tag: rRNA
   - label: PhiX
     organism: phix
     tag: default
-  - label: Fly
-    organism: dmel
-    tag: test
-
-merged_bigwigs:
-  control_pos:
-    pos:
-      - sample1
-      - sample2
-  treatment_all:
-    pos:
-      - sample3
-      - sample4
-    neg:
-      - sample3
-      - sample4
+  - label: human
+    organism: human
+    tag: small-gencode-v28
+
+#merged_bigwigs:
+#  control_pos:
+#    pos:
+#      - sample1
+#      - sample2
+#  treatment_all:
+#    pos:
+#      - sample3
+#      - sample4
+#    neg:
+#      - sample3
+#      - sample4
 
 # See the reference config files in the top level of the repo,
 # include/reference_configs, for inspiration for more species.
 
 include_references:
-  - '../../include/reference_configs/test.yaml'
   - '../../include/reference_configs/PhiX.yaml'
-  - '../../include/reference_configs/Drosophila_melanogaster.yaml'
+  - '../../include/reference_configs/Test_Homo_sapiens.yaml'
diff --git a/workflows/rnaseq/config/sampletable.tsv b/workflows/rnaseq/config/sampletable.tsv
index fad727e78..e1b8ee262 100644
--- a/workflows/rnaseq/config/sampletable.tsv
+++ b/workflows/rnaseq/config/sampletable.tsv
@@ -1,5 +1,37 @@
-samplename	group	layout	orig_filename
-sample1	control	SE	data/example_data/rnaseq_sample1PE_1.fq.gz
-sample2	control	SE	data/example_data/rnaseq_sample2.fq.gz
-sample3	treatment	SE	data/example_data/rnaseq_sample3.fq.gz
-sample4	treatment	SE	data/example_data/rnaseq_sample4.fq.gz
+samplename	GEO_Run	layout	group	cell_line_group	cell_Line	treatment	timepoint	repl	biorep	orig_filename
+A549_dBet6_1h_1	SRR6354081	SE	dBet6_1h	A549_dBet6_1h	A549	dBet6	1h	1	A549_dBet6_1h_1	data/example_data/A549_dBet6_1h_1.small_R1.fastq.gz
+A549_dBet6_1h_2	SRR6354082	SE	dBet6_1h	A549_dBet6_1h	A549	dBet6	1h	2	A549_dBet6_1h_1	data/example_data/A549_dBet6_1h_2.small_R1.fastq.gz
+A549_dBet6_6h_1	SRR6354083	SE	dBet6_6h	A549_dBet6_6h	A549	dBet6	6h	1	A549_dBet6_6h_1	data/example_data/A549_dBet6_6h_2.small_R1.fastq.gz
+A549_dBet6_6h_2	SRR6354084	SE	dBet6_6h	A549_dBet6_6h	A549	dBet6	6h	2	A549_dBet6_6h_1	data/example_data/A549_dBet6_6h_1.small_R1.fastq.gz
+A549_DMSO_1h_1	SRR6354085	SE	DMSO_1h	A549_DMSO_1h	A549	DMSO	1h	1	A549_DMSO_1h_1	data/example_data/A549_DMSO_1h_1.small_R1.fastq.gz
+A549_DMSO_1h_2	SRR6354086	SE	DMSO_1h	A549_DMSO_1h	A549	DMSO	1h	2	A549_DMSO_1h_2	data/example_data/A549_DMSO_1h_2.small_R1.fastq.gz
+A549_DMSO_6h_1	SRR6354087	SE	DMSO_6h	A549_DMSO_6h	A549	DMSO	6h	1	A549_DMSO_6h_1	data/example_data/A549_DMSO_6h_1.small_R1.fastq.gz
+A549_DMSO_6h_2	SRR6354088	SE	DMSO_6h	A549_DMSO_6h	A549	DMSO	6h	2	A549_DMSO_6h_2	data/example_data/A549_DMSO_6h_2.small_R1.fastq.gz
+A549_JQ1_1h_1	SRR6354089	SE	JQ1_1h	A549_JQ1_1h	A549	JQ1	1h	1	A549_JQ1_1h_1	data/example_data/A549_JQ1_1h_1.small_R1.fastq.gz
+A549_JQ1_1h_2	SRR6354090	SE	JQ1_1h	A549_JQ1_1h	A549	JQ1	1h	2	A549_JQ1_1h_2	data/example_data/A549_JQ1_1h_2.small_R1.fastq.gz
+A549_JQ1_6h_1	SRR6354091	SE	JQ1_6h	A549_JQ1_6h	A549	JQ1	6h	1	A549_JQ1_6h_1	data/example_data/A549_JQ1_6h_1.small_R1.fastq.gz
+A549_JQ1_6h_2	SRR6354092	SE	JQ1_6h	A549_JQ1_6h	A549	JQ1	6h	2	A549_JQ1_6h_2	data/example_data/A549_JQ1_6h_2.small_R1.fastq.gz
+HAP1_dBet6_1h_1	SRR6354109	SE	dBet6_1h	HAP1_dBet6_1h	HAP1	dBet6	1h	1	HAP1_dBet6_1h_1	data/example_data/HAP1_dBet6_1h_1.small_R1.fastq.gz
+HAP1_dBet6_1h_2	SRR6354110	SE	dBet6_1h	HAP1_dBet6_1h	HAP1	dBet6	1h	2	HAP1_dBet6_1h_2	data/example_data/HAP1_dBet6_1h_2.small_R1.fastq.gz
+HAP1_dBet6_6h_1	SRR6354111	SE	dBet6_6h	HAP1_dBet6_6h	HAP1	dBet6	6h	1	HAP1_dBet6_6h_1	data/example_data/HAP1_dBet6_6h_1.small_R1.fastq.gz
+HAP1_dBet6_6h_2	SRR6354112	SE	dBet6_6h	HAP1_dBet6_6h	HAP1	dBet6	6h	2	HAP1_dBet6_6h_2	data/example_data/HAP1_dBet6_6h_2.small_R1.fastq.gz
+HAP1_DMSO_1h_1	SRR6354113	SE	DMSO_1h	HAP1_DMSO_1h	HAP1	DMSO	1h	1	HAP1_DMSO_1h_1	data/example_data/HAP1_DMSO_1h_1.small_R1.fastq.gz
+HAP1_DMSO_1h_2	SRR6354114	SE	DMSO_1h	HAP1_DMSO_1h	HAP1	DMSO	1h	2	HAP1_DMSO_1h_2	data/example_data/HAP1_DMSO_1h_2.small_R1.fastq.gz
+HAP1_DMSO_6h_1	SRR6354115	SE	DMSO_6h	HAP1_DMSO_6h	HAP1	DMSO	6h	1	HAP1_DMSO_6h_1	data/example_data/HAP1_DMSO_6h_1.small_R1.fastq.gz
+HAP1_DMSO_6h_2	SRR6354116	SE	DMSO_6h	HAP1_DMSO_6h	HAP1	DMSO	6h	2	HAP1_DMSO_6h_2	data/example_data/HAP1_DMSO_6h_2.small_R1.fastq.gz
+HAP1_JQ1_1h_1	SRR6354117	SE	JQ1_1h	HAP1_JQ1_1h	HAP1	JQ1	1h	1	HAP1_JQ1_1h_1	data/example_data/HAP1_JQ1_1h_1.small_R1.fastq.gz
+HAP1_JQ1_1h_2	SRR6354118	SE	JQ1_1h	HAP1_JQ1_1h	HAP1	JQ1	1h	2	HAP1_JQ1_1h_2	data/example_data/HAP1_JQ1_1h_2.small_R1.fastq.gz
+HAP1_JQ1_6h_1	SRR6354119	SE	JQ1_6h	HAP1_JQ1_6h	HAP1	JQ1	6h	1	HAP1_JQ1_6h_1	data/example_data/HAP1_JQ1_6h_1.small_R1.fastq.gz
+HAP1_JQ1_6h_2	SRR6354120	SE	JQ1_6h	HAP1_JQ1_6h	HAP1	JQ1	6h	2	HAP1_JQ1_6h_2	data/example_data/HAP1_JQ1_6h_2.small_R1.fastq.gz
+outlier_HAP1_dBet6_1h_1	SRR6354137	SE	dBet6_1h	HAP1_dBet6_1h	HAP1	dBet6	1h	1	HAP1_dBet6_1h_1	data/example_data/outlier_HAP1_dBet6_1h_1.small_R1.fastq.gz
+K562_dBet6_1h_2	SRR6354138	SE	dBet6_1h	K562_dBet6_1h	K562	dBet6	1h	2	K562_dBet6_1h_2	data/example_data/K562_dBet6_1h_2.small_R1.fastq.gz
+K562_dBet6_6h_1	SRR6354139	SE	dBet6_6h	K562_dBet6_6h	K562	dBet6	6h	1	K562_dBet6_6h_1	data/example_data/K562_dBet6_6h_1.small_R1.fastq.gz
+K562_dBet6_6h_2	SRR6354140	SE	dBet6_6h	K562_dBet6_6h	K562	dBet6	6h	2	K562_dBet6_6h_2	data/example_data/K562_dBet6_6h_2.small_R1.fastq.gz
+K562_DMSO_1h_1	SRR6354141	SE	DMSO_1h	K562_DMSO_1h	K562	DMSO	1h	1	K562_DMSO_1h_1	data/example_data/K562_DMSO_1h_1.small_R1.fastq.gz
+K562_DMSO_1h_2	SRR6354142	SE	DMSO_1h	K562_DMSO_1h	K562	DMSO	1h	2	K562_DMSO_1h_2	data/example_data/K562_DMSO_1h_2.small_R1.fastq.gz
+K562_DMSO_6h_1	SRR6354143	SE	DMSO_6h	K562_DMSO_6h	K562	DMSO	6h	1	K562_DMSO_6h_1	data/example_data/K562_DMSO_6h_1.small_R1.fastq.gz
+K562_DMSO_6h_2	SRR6354144	SE	DMSO_6h	K562_DMSO_6h	K562	DMSO	6h	2	K562_DMSO_6h_2	data/example_data/K562_DMSO_6h_2.small_R1.fastq.gz
+K562_JQ1_1h_1	SRR6354145	SE	JQ1_1h	K562_JQ1_1h	K562	JQ1	1h	1	K562_JQ1_1h_1	data/example_data/K562_JQ1_1h_1.small_R1.fastq.gz
+K562_JQ1_1h_2	SRR6354146	SE	JQ1_1h	K562_JQ1_1h	K562	JQ1	1h	2	K562_JQ1_1h_2	data/example_data/K562_JQ1_1h_2.small_R1.fastq.gz
+K562_JQ1_6h_1	SRR6354147	SE	JQ1_6h	K562_JQ1_6h	K562	JQ1	6h	1	K562_JQ1_6h_1	data/example_data/K562_JQ1_6h_1.small_R1.fastq.gz
+K562_JQ1_6h_2	SRR6354148	SE	JQ1_6h	K562_JQ1_6h	K562	JQ1	6h	2	K562_JQ1_6h_2	data/example_data/K562_JQ1_6h_2.small_R1.fastq.gz

From 39960a62fdd3027428d23db479f07d3211cc6bf4 Mon Sep 17 00:00:00 2001
From: Caroline Esnault <caroline.esnault@nih.gov>
Date: Thu, 24 Sep 2020 08:58:58 -0400
Subject: [PATCH 04/19] update ci for large test dataset

---
 ci/get-data.py | 70 ++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 53 insertions(+), 17 deletions(-)

diff --git a/ci/get-data.py b/ci/get-data.py
index cd2d356b1..87115e144 100755
--- a/ci/get-data.py
+++ b/ci/get-data.py
@@ -5,7 +5,7 @@
 
 shell.executable('/bin/bash')
 BRANCH = 'master'
-URL = 'https://github.com/lcdb/lcdb-test-data/blob/{0}/data/{{}}?raw=true'.format(BRANCH)
+URL = 'https://github.com/lcdb/lcdb-test-data-human/blob/{0}/{{}}?raw=true'.format(BRANCH)
 
 
 def _download_file(fn, dest=None):
@@ -17,21 +17,57 @@ def _download_file(fn, dest=None):
     shell('wget -q -O- {url} > {dest}')
     return dest
 
+_download_file('rnaseq_samples/A549_DMSO_1h_1/A549_DMSO_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_DMSO_1h_1.small_R1.fastq.gz')
+_download_file('rnaseq_samples/A549_DMSO_1h_2/A549_DMSO_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_DMSO_1h_2.small_R1.fastq.gz')
+_download_file('rnaseq_samples/A549_DMSO_6h_1/A549_DMSO_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_DMSO_6h_1.small_R1.fastq.gz')
+_download_file('rnaseq_samples/A549_DMSO_6h_2/A549_DMSO_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_DMSO_6h_2.small_R1.fastq.gz')
+_download_file('rnaseq_samples/A549_JQ1_1h_1/A549_JQ1_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_JQ1_1h_1.small_R1.fastq.gz')
+_download_file('rnaseq_samples/A549_JQ1_1h_2/A549_JQ1_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_JQ1_1h_2.small_R1.fastq.gz')
+_download_file('rnaseq_samples/A549_JQ1_6h_1/A549_JQ1_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_JQ1_6h_1.small_R1.fastq.gz')
+_download_file('rnaseq_samples/A549_JQ1_6h_2/A549_JQ1_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_JQ1_6h_2.small_R1.fastq.gz')
+_download_file('rnaseq_samples/A549_dBet6_1h_1/A549_dBet6_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_dBet6_1h_1.small_R1.fastq.gz')
+_download_file('rnaseq_samples/A549_dBet6_1h_2/A549_dBet6_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_dBet6_1h_2.small_R1.fastq.gz')
+_download_file('rnaseq_samples/A549_dBet6_6h_1/A549_dBet6_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_dBet6_6h_1.small_R1.fastq.gz')
+_download_file('rnaseq_samples/A549_dBet6_6h_2/A549_dBet6_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_dBet6_6h_2.small_R1.fastq.gz')
+_download_file('rnaseq_samples/HAP1_DMSO_1h_1/HAP1_DMSO_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_DMSO_1h_1.small_R1.fastq.gz')
+_download_file('rnaseq_samples/HAP1_DMSO_1h_2/HAP1_DMSO_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_DMSO_1h_2.small_R1.fastq.gz')
+_download_file('rnaseq_samples/HAP1_DMSO_6h_1/HAP1_DMSO_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_DMSO_6h_1.small_R1.fastq.gz')
+_download_file('rnaseq_samples/HAP1_DMSO_6h_2/HAP1_DMSO_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_DMSO_6h_2.small_R1.fastq.gz')
+_download_file('rnaseq_samples/HAP1_JQ1_1h_1/HAP1_JQ1_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_JQ1_1h_1.small_R1.fastq.gz')
+_download_file('rnaseq_samples/HAP1_JQ1_1h_2/HAP1_JQ1_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_JQ1_1h_2.small_R1.fastq.gz')
+_download_file('rnaseq_samples/HAP1_JQ1_6h_1/HAP1_JQ1_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_JQ1_6h_1.small_R1.fastq.gz')
+_download_file('rnaseq_samples/HAP1_JQ1_6h_2/HAP1_JQ1_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_JQ1_6h_2.small_R1.fastq.gz')
+_download_file('rnaseq_samples/HAP1_dBet6_1h_1/HAP1_dBet6_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_dBet6_1h_1.small_R1.fastq.gz')
+_download_file('rnaseq_samples/HAP1_dBet6_1h_2/HAP1_dBet6_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_dBet6_1h_2.small_R1.fastq.gz')
+_download_file('rnaseq_samples/HAP1_dBet6_6h_1/HAP1_dBet6_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_dBet6_6h_1.small_R1.fastq.gz')
+_download_file('rnaseq_samples/HAP1_dBet6_6h_2/HAP1_dBet6_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_dBet6_6h_2.small_R1.fastq.gz')
+_download_file('rnaseq_samples/K562_DMSO_1h_1/K562_DMSO_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_DMSO_1h_1.small_R1.fastq.gz')
+_download_file('rnaseq_samples/K562_DMSO_1h_2/K562_DMSO_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_DMSO_1h_2.small_R1.fastq.gz')
+_download_file('rnaseq_samples/K562_DMSO_6h_1/K562_DMSO_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_DMSO_6h_1.small_R1.fastq.gz')
+_download_file('rnaseq_samples/K562_DMSO_6h_2/K562_DMSO_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_DMSO_6h_2.small_R1.fastq.gz')
+_download_file('rnaseq_samples/K562_JQ1_1h_1/K562_JQ1_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_JQ1_1h_1.small_R1.fastq.gz')
+_download_file('rnaseq_samples/K562_JQ1_1h_2/K562_JQ1_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_JQ1_1h_2.small_R1.fastq.gz')
+_download_file('rnaseq_samples/K562_JQ1_6h_1/K562_JQ1_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_JQ1_6h_1.small_R1.fastq.gz')
+_download_file('rnaseq_samples/K562_JQ1_6h_2/K562_JQ1_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_JQ1_6h_2.small_R1.fastq.gz')
+_download_file('rnaseq_samples/K562_dBet6_1h_2/K562_dBet6_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_dBet6_1h_2.small_R1.fastq.gz')
+_download_file('rnaseq_samples/K562_dBet6_6h_1/K562_dBet6_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_dBet6_6h_1.small_R1.fastq.gz')
+_download_file('rnaseq_samples/K562_dBet6_6h_2/K562_dBet6_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_dBet6_6h_2.small_R1.fastq.gz')
+_download_file('rnaseq_samples/outlier_HAP1_dBet6_1h_1/outlier_HAP1_dBet6_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/outlier_HAP1_dBet6_1h_1.small_R1.fastq.gz')
 
-_download_file('rnaseq_samples/sample1/sample1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample1.fq.gz')
-_download_file('rnaseq_samples/sample2/sample2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample2.fq.gz')
-_download_file('rnaseq_samples/sample3/sample3.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample3.fq.gz')
-_download_file('rnaseq_samples/sample4/sample4.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample4.fq.gz')
+_download_file('chipseq_samples/BRD4_DMSO_1/BRD4_DMSO_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/BRD4_DMSO_1.small_R1.fastq.gz')
+_download_file('chipseq_samples/BRD4_DMSO_2/BRD4_DMSO_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/BRD4_DMSO_2.small_R1.fastq.gz')
+_download_file('chipseq_samples/BRD4_dBET6_1/BRD4_dBET6_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/BRD4_dBET6_1.small_R1.fastq.gz')
+_download_file('chipseq_samples/BRD4_dBET6_2/BRD4_dBET6_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/BRD4_dBET6_2.small_R1.fastq.gz')
+_download_file('chipseq_samples/MTHFD1_DMSO_1/MTHFD1_DMSO_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/MTHFD1_DMSO_1.small_R1.fastq.gz')
+_download_file('chipseq_samples/MTHFD1_DMSO_2/MTHFD1_DMSO_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/MTHFD1_DMSO_2.small_R1.fastq.gz')
+_download_file('chipseq_samples/MTHFD1_dBET6_1/MTHFD1_dBET6_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/MTHFD1_dBET6_1.small_R1.fastq.gz')
+_download_file('chipseq_samples/MTHFD1_dBET6_2/MTHFD1_dBET6_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/MTHFD1_dBET6_2.small_R1.fastq.gz')
+_download_file('chipseq_samples/input_DMSO_1/input_DMSO_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/input_DMSO_1.small_R1.fastq.gz')
+_download_file('chipseq_samples/input_DMSO_2/input_DMSO_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/input_DMSO_2.small_R1.fastq.gz')
+_download_file('chipseq_samples/input_dBET6_1/input_dBET6_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/input_dBET6_1.small_R1.fastq.gz')
+_download_file('chipseq_samples/input_dBET6_2/input_dBET6_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/input_dBET6_2.small_R1.fastq.gz')
+_download_file('chipseq_samples/mockIgG_DMSO_1/mockIgG_DMSO_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/mockIgG_DMSO_1.small_R1.fastq.gz')
+_download_file('chipseq_samples/mockIgG_DMSO_2/mockIgG_DMSO_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/mockIgG_DMSO_2.small_R1.fastq.gz')
+_download_file('chipseq_samples/mockIgG_dBET6_1/mockIgG_dBET6_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/mockIgG_dBET6_1.small_R1.fastq.gz')
+_download_file('chipseq_samples/mockIgG_dBET6_2/mockIgG_dBET6_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/mockIgG_dBET6_2.small_R1.fastq.gz')
 
-_download_file('rnaseq_samples/sample1/sample1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample1PE_1.fq.gz')
-_download_file('rnaseq_samples/sample1/sample1.small_R2.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample1PE_2.fq.gz')
-_download_file('rnaseq_samples/sample2/sample2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample2PE_1.fq.gz')
-_download_file('rnaseq_samples/sample2/sample2.small_R2.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample2PE_2.fq.gz')
-
-_download_file('chipseq_samples/input_1/input_1.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_input1.fq.gz')
-_download_file('chipseq_samples/ip_1/ip_1.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_ip1.fq.gz')
-_download_file('chipseq_samples/input_2/input_2.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_input2.fq.gz')
-_download_file('chipseq_samples/ip_2/ip_2.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_ip2.fq.gz')
-_download_file('chipseq_samples/ip_3/ip_3.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_ip3.fq.gz')
-_download_file('chipseq_samples/ip_4/ip_4.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_ip4.fq.gz')
-_download_file('chipseq_samples/input_3/input_3.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_input3.fq.gz')

From 7d09db2d275a0fad65cdd3412e625d6ed2c742ac Mon Sep 17 00:00:00 2001
From: Caroline Esnault <caroline.esnault@nih.gov>
Date: Thu, 24 Sep 2020 08:59:45 -0400
Subject: [PATCH 05/19] add chipseq downstream diffbind

---
 workflows/chipseq/downstream/diffbind.Rmd | 492 ++++++++++++++++++++++
 1 file changed, 492 insertions(+)
 create mode 100644 workflows/chipseq/downstream/diffbind.Rmd

diff --git a/workflows/chipseq/downstream/diffbind.Rmd b/workflows/chipseq/downstream/diffbind.Rmd
new file mode 100644
index 000000000..6e6e3db6c
--- /dev/null
+++ b/workflows/chipseq/downstream/diffbind.Rmd
@@ -0,0 +1,492 @@
+---
+title: Differential ChIP-seq peaks
+output:
+  html_document:
+    code_folding: hide
+    toc: true
+    toc_float: true
+    toc_depth: 3
+---
+
+## Changelog
+
+**Initial results**
+
+Last run: `r date()`
+
+```{r, include=FALSE}
+knitr::opts_chunk$set(sep=TRUE, warning=FALSE, message=FALSE,
+                      bootstrap.show.code=FALSE, bootstrap.show.output=FALSE,
+                      cache.extra_file_dep_1=file.info('../config/sampletable.tsv')$mtime,
+                      # try disabling this when running locally for nicer figures
+                      #dev='bitmap',
+                      fig.ext='png')
+```
+
+```{r, results='asis'}
+subchunkify <- function(g) {
+  g_deparsed <- paste0(deparse(
+    function() {g}
+  ), collapse = '')
+
+  sub_chunk <- paste0("
+  `","``{r sub_chunk_", floor(runif(1) * 10000), ", fig.height=10, echo=FALSE}",
+  "\n(", 
+    g_deparsed
+    , ")()",
+  "\n`","``
+  ")
+
+  cat(knitr::knit(text = knitr::knit_expand(text = sub_chunk), quiet = TRUE))
+  }
+```
+
+```{r imports, include=FALSE}
+library(DiffBind)
+library(ggplot2)
+library(BiocParallel)
+library(AnnotationHub)
+library(ChIPseeker)
+library(dplyr)
+```
+
+```{r limit_cpus}
+#register(MulticoreParam(workers=future::availableCores()))
+register(MulticoreParam(workers=2))
+```
+
+```{r load_helpers}
+devtools::document('../../../lib/lcdbwf')
+devtools::load_all('../../../lib/lcdbwf')
+```
+
+```{r annotationhub_setup}
+annotation_genus_species <- 'Homo sapiens'
+annotation_key_override <- NA
+hub.cache <- '../../../include/AnnotationHubCache'
+orgdb <- get.orgdb(
+    annotation_genus_species,
+    cache=hub.cache,
+    annotation_key_override=annotation_key_override
+)
+```
+
+```{r txdb}
+get.txdb <- function(species, cache, annotation_key_override=NA){
+
+    # Workaround to allow AnnotationHub to use proxy. See
+    # https://github.com/Bioconductor/AnnotationHub/issues/4, and thanks
+    # Wolfgang!
+    proxy <- Sys.getenv('http_proxy')
+    if (proxy == ""){
+        proxy <- NULL
+    }
+
+    ah <- AnnotationHub(hub=getAnnotationHubOption('URL'),
+             cache=cache,
+             proxy=proxy,
+             localHub=FALSE)
+
+    find.annotationhub.name <- function(species.name, override.code) { #autodetect ah names based on loaded database
+        if (is.na(override.code)) {
+            ah.query <- query(ah, c("TxDb", "UCSC"))
+            ah.query.speciesmatch <- grepl(paste("^", species.name, "$", sep=""), ah.query$species)
+        ah.query.which <- which(ah.query.speciesmatch)
+        stopifnot(length(ah.query.which) > 0) #require at least one match
+        if (length(ah.query.which) > 1) { #warn of duplicate matches
+            print("WARNING: found multiple candidate species in AnnotationHub: ");
+            print(ah.query.speciesmatch)
+        }
+        names(ah.query)[ah.query.which[length(ah.query.which)]]
+        } else {
+        override.code
+        }
+    }
+    annotation_key <- find.annotationhub.name(annotation_genus_species, annotation_key_override)
+    txdb <- ah[[annotation_key]]
+    return(txdb)
+}
+
+txdb <- get.txdb(
+    annotation_genus_species,
+    cache=hub.cache,
+    annotation_key_override=annotation_key_override
+)
+```
+
+```{r coldata_setup}
+threshold <- 0.01
+use_pval <- TRUE
+sample.table.filename <- '../config/sampletable.tsv'
+# PeakCaller is the peak file format for diffBind dba function
+PeakCaller <- 'bed'
+exclude.for.printing <- c('orig_filename', 'orig_filename_R2', 'bamReads', 'bamControl', 'Peaks', 'PeakCaller')
+bam.path.func <- function (x) file.path('..', 'data', 'chipseq_merged', x, paste0(x, '.cutadapt.unique.nodups.merged.bam'))
+peak.path.func <- function (x) file.path('..', 'data', 'chipseq_peaks', 'macs2', x, 'peaks.bed')
+
+colData <- read.table(sample.table.filename, sep='\t', header=TRUE, stringsAsFactors=FALSE)
+
+colData$bamReads <- sapply(colData$label, bam.path.func)
+colData$Peaks <- sapply(colData$label, peak.path.func)
+colData$PeakCaller <- PeakCaller
+
+factor.columns <- c('Treatment')
+for (col in factor.columns){
+    colData[[col]] <- as.factor(colData[[col]])
+}
+
+colData$Treatment <- relevel(colData$Treatment, ref='DMSO')
+rownames(colData) <- colData[,1]
+
+# split by input vs. antibody
+ab_ref <- 'mockIgG'
+colDataInput <- colData %>%
+    filter(antibody == ab_ref) %>%
+    mutate(bamControl = bamReads, ControlID = label) %>%
+    distinct(label, .keep_all=TRUE)
+colDataAB <- colData %>% filter(!antibody %in% c(ab_ref, 'input')) %>%
+#colDataAB <- colData %>% filter(antibody == 'BRD4') %>%
+    distinct(label, .keep_all=TRUE) %>%
+    mutate(SampleID = samplename)
+
+# make sampletables for diffBind
+st <- list()
+for (ab in unique(colDataAB[['antibody']])) {
+    st[[ab]] <- merge(colDataAB %>% filter(antibody == ab),
+                    colDataInput %>% dplyr::select(biological_material, bamControl, ControlID),
+                    by='biological_material')
+}
+
+```
+
+## Experiment overview {.tabset}
+
+The ChIPseq samples analyzed here come from the SRA study [SRP120974](https://trace.ncbi.nlm.nih.gov/Traces/sra/?study=SRP120974),
+published in [Sdelci et al., 2019](https://www.nature.com/articles/s41588-019-0413-z).
+
+HAP1 cells were treated with dBET6 or DMSO and ChIPseq was performed using histone acetyl-reader BRD4 or
+the folate pathway enzyme MTHFD1.
+ChIP-seq with mock IgG antibody were used as background.
+
+ChIPseq fastq files were processed through [lcdb-wf](https://github.com/lcdb/lcdb-wf) chipseq pipeline.
+
+Called peaks were then processed though [DiffBind](https://bioconductor.org/packages/release/bioc/html/DiffBind.html)
+for differential binding analysis using the method `DBA_DESEQ2`.
+
+```{r, results='asis'}
+if (use_pval) {
+    pval_attr <- 'pvalue'
+} else {
+    pval_attr <- 'FDR'
+}
+cat(paste0('\n\nHere we use a **', pval_attr, '** threshold of ', threshold,
+           ' to determine peak significance.\n\n'))
+```
+
+For each antibody, a separate sampletable was used as `diffBind` input to identify differential binding.
+
+```{r setup, results='asis'}
+data <- list()
+for (ab in names(st)){
+    data[[ab]] <- dba(sampleSheet=st[[ab]], 
+                     config=data.frame(th=threshold, bUsePval=use_pval))
+    cat(paste0('\n\n### ', ab, '\n\n'))
+    subchunkify(knitr::kable(st[[ab]][, colnames(st[[ab]])[!colnames(st[[ab]]) %in% exclude.for.printing]]))
+}
+params <- list('args2'='diffPeaks_deseq2.tsv',
+               'args3'='diffPeaks_deseq2.bed')
+```
+
+```{r deseq2, cache=TRUE, dependson='setup'}
+# DiffBind count of reads and analysis of differential number of reads 
+# using the DESeq2 method
+DBs <- list()
+beds <- list()
+for (ab in names(st)){
+    lvls <- levels(st[[ab]]$Treatment)
+    data[[ab]] <- dba.count(data[[ab]], bParallel=TRUE)
+    data[[ab]] <- dba.contrast(data[[ab]],
+                               group1=data[[ab]]$mask[[lvls[2]]], 
+                               group2=data[[ab]]$mask[[lvls[1]]],
+                               name1=lvls[2], name2=lvls[1], minMembers=2)
+    data[[ab]] <- dba.analyze(data[[ab]], method=DBA_DESEQ2, bTagwise=FALSE, bSubControl=TRUE)
+
+# Generate a report
+# specifying threshold, th=1, to include all peaks
+    DBs[[ab]] <- dba.report(data[[ab]], method=DBA_DESEQ2, th=1)
+
+# Write output table
+    write.table(DBs[[ab]], 
+                file = paste(ab, '_', params$args2, sep=""), 
+                sep = "\t", quote=FALSE, row.names=FALSE)
+
+# Write output in bed format
+    beds[[ab]] <- data.frame(iranges = DBs[[ab]])
+    beds[[ab]]$newcol <- paste("Fold", beds[[ab]]$iranges.Fold, "FDR",
+                            beds[[ab]]$iranges.FDR, sep="_")
+
+    write.table(beds[[ab]][,c(1,2,3,12,9,10,11)], 
+                file = paste(ab, '_',params$args3, sep=""), 
+                quote=FALSE, row.names=FALSE, col.names=FALSE, sep = "\t")
+}
+```
+
+
+## Correlation between samples {.tabset}
+
+```{r deseq2plots, results='asis'}
+# plot data
+for(ab in names(data)){
+    cat('\n\n### ', ab, '\n\n')
+    plot(data[[ab]], contrast=1)
+}
+```
+
+
+## Overlap of peaks between replicates {.tabset}
+
+Please note that the Venn diagram areas are not to scale.
+
+```{r venn, results='asis'}
+for (ab in names(data)) {
+    cat('\n\n\n### ', ab, '{.tabset}\n\n')
+    for (lvl in levels(st[[ab]]$Treatment)) {
+        cat('\n\n#### ', lvl, ' samples\n\n')
+        dba.plotVenn(data[[ab]], data[[ab]]$masks[[lvl]])
+    }
+}
+```
+
+
+## Analysis with DBA DESeq2
+
+### Summary of results
+
+```{r}
+thres = list()
+diff_num = list()
+for (ab in names(DBs)){
+    thres[[ab]] <- data[[ab]]$config$th
+    diff_num[[ab]] <- sum(DBs[[ab]]$`p-value` < thres[[ab]])
+}
+```
+
+```{r}
+thres <- list()
+up <- list()
+down <- list()
+changed <- list()
+for(ab in names(DBs)){
+        thres[[ab]] <- data[[ab]]$config$th
+        if (use_pval) {
+            up[[ab]] <- sum(DBs[[ab]]$`p-value` < thres[[ab]] & DBs[[ab]]$Fold > 0)
+            down[[ab]] <- sum(DBs[[ab]]$`p-value` < thres[[ab]] & DBs[[ab]]$Fold < 0)
+        } else {
+            up[[ab]] <- sum(DBs[[ab]]$FDR < thres[ab] & DBs[[ab]]$Fold > 0)
+            down[[ab]] <- sum(DBs[[ab]]$FDR < thres[ab] & DBs[[ab]]$Fold < 0)
+        } 
+        changed[[ab]] <- up[[ab]] + down[[ab]]
+}
+df <- data.frame(antibody = names(DBs),
+                 threshold = unlist(thres),
+                 up=unlist(up),
+                 down=unlist(down),
+                 changed=unlist(changed))
+knitr::kable(df, row.names=FALSE)
+cat('\n')
+
+```
+
+
+## Overlap of differentially bound peaks
+
+```{r upset, results='asis'}
+sel.list <- list()
+sel.list[['up']] <- list()
+sel.list[['down']] <- list()
+
+if (use_pval == TRUE) {
+    for (ab in names(DBs)){
+        sel.list[['up']][[ab]] <- DBs[[ab]][DBs[[ab]]$`p-value` < thres[[ab]] & DBs[[ab]]$Fold > 0]
+        sel.list[['down']][[ab]] <- DBs[[ab]][DBs[[ab]]$`p-value` < thres[[ab]] & DBs[[ab]]$Fold < 0]
+    }
+} else {
+    for (ab in names(DBs)){
+        sel.list[['up']][[ab]] <- DBs[[ab]][DBs[[ab]]$`p-value` < thres[[ab]] & DBs[[ab]]$Fold > 0]
+        sel.list[['down']][[ab]] <- DBs[[ab]][DBs[[ab]]$`p-value` < thres[[ab]] & DBs[[ab]]$Fold < 0]
+    }
+}
+
+# TODO: upset plots would be better here, but need to compute the overlap of peaks between contrasts
+# vennplot from CHiPseeker accept a list of object, can be a list of GRanges or a list of vector,
+# and returns a Venn plot of the overlap
+cat('\n\n### Overlap of peaks with increased affinity\n\n')
+if (length(sel.list[['up']]) > 1) {
+    vennplot(sel.list[['up']])
+} else {
+    cat('\n\nNot enough groups to plot overlap.\n\n')
+}
+cat('\n\n### Overlap of peaks with decreased affinity\n\n')
+if (length(sel.list[['down']]) > 1) {
+    vennplot(sel.list[['down']])
+} else {
+    cat('\n\nNot enough groups to plot overlap.\n\n')
+}
+
+```
+
+
+### Links to results
+
+```{r, results='asis'}
+for (ab in names(DBs)){
+    lvls <- levels(st[[ab]]$Treatment)
+    cat(paste0('\n\n#### **', lvls[2], ' vs. ', lvls[1], '**\n\n'))
+    cat(paste0('\n\n- TSV: [', lvls[2], '_diffPeaks_deseq2.tsv](', lvls[2], '_diffPeaks_deseq2.tsv)\n\n'))
+    cat(paste0('\n\n- BED: [', lvls[2], '_diffPeaks_deseq2.bed](', lvls[2], '_diffPeaks_deseq2.bed)\n\n'))
+}
+```
+
+### Visualization of differential binding {.tabset}
+
+```{r plotting, results='asis', fig.height=10, fig.width=10}
+for (ab in names(data)) {
+    cat('\n\n#### ', ab, '{.tabset}\n')
+    # plot PCA
+    cat('\n\n##### PCA plot\n')
+    dba.plotPCA(data[[ab]], contrast=1, label=DBA_ID)
+    # plot MA
+    cat('\n\n##### MA plot\n')
+    dba.plotMA(data[[ab]])
+    # volcano plot
+    cat('\n\n##### Volcano plot\n')
+    dba.plotVolcano(data[[ab]])
+    cat('\n\n##### Boxplot\n')
+    cat('\n\nThis is a boxplot of normalized reads in peaks.\n')
+    cat("\n\n- `+` indicates peaks with higher", ab, "in", data[[ab]][['contrasts']][[1]][['name2']], ".\n")
+    cat("\n- `-` indicates peaks with higher ", ab, "in", data[[ab]][['contrasts']][[1]][['name1']], ".\n")
+    pvals <- dba.plotBox(data[[ab]], bDBIncreased=FALSE,bDBDecreased=FALSE)
+    cat('\n\n##### Heatmap\n')
+    cat('\n\n\nThis is a heatmap of the differential peaks detected by our analysis.\n')
+    cat('\n- The color key shows the color map used to plot the heatmap.\n')
+    cat('\n- The histogram in the color key shows the distribution of scores shown in the heatmap.\n\n')
+    corvals <- dba.plotHeatmap(data[[ab]], contrast=1, correlations=FALSE)
+}
+```
+
+
+## Annotation to nearest gene {.tabset}
+
+Annotation was performed using `annotatePeak` from [CHiPseeker](http://bioconductor.org/packages/release/bioc/html/ChIPseeker.html). 
+Peaks were annotated to the nearest gene in [UCSC hg38 transcript-related features](http://bioconductor.org/packages/release/data/annotation/html/TxDb.Hsapiens.UCSC.hg38.knownGene.html). For promoters, the default TSS (transcription start site) region defined from -3kb to +3kb was used.
+
+According to CHiPseeker's documentation, the following features are reported:
+
+> The position and strand information of nearest genes are reported. The distance from peak to the TSS of its nearest gene is also reported. The genomic region of the peak is reported in annotation column. Since some annotation may overlap, ChIPseeker adopted the following priority in genomic annotation.
+> 
+>  - Promoter
+>  - 5’ UTR
+>  - 3’ UTR
+>  - Exon
+>  - Intron
+>  - Downstream
+>  - Intergenic
+>
+> Downstream is defined as the downstream of gene end.
+
+Only the gene corresponding to the annotation with the highest priority is reported in the output tables,
+and the presence of the peak in the other annotation as TRUE/FALSE.
+The order of priority can be modified in the code chunk below to obtain the genes from 
+other annotation categories.
+
+
+```{r}
+annot_order <- c("Promoter", "5UTR", "3UTR", "Exon",
+       "Intron", "Downstream", "Intergenic")
+```
+
+
+
+```{r chipseeker, results='asis', eval=TRUE}
+annocols <- c("seqnames", "end", "annotation", "geneStart", "geneEnd",
+              "geneLength", "geneStrand", "geneId", "transcriptId",
+              "distanceToTSS") #, "ENSEMBL", "SYMBOL", "GENENAME")
+
+if (use_pval == TRUE) {
+    thcol <- 'iranges.p.value'
+} else {
+    thcol <- 'iranges.FDR'
+}
+
+for (ab in names(DBs)){
+    cat(paste0('\n\n### ', ab, ' {.tabset}\n\n'))
+    # all peaks
+    bedfn <- paste0(ab, '_',params$args3)
+    cat('\n\n#### All\n\n')
+    peakAnno <- list()
+    peakAnno['all'] <- annotatePeak(bedfn, tssRegion=c(-3000, 3000), TxDb=txdb, verbose=FALSE, genomicAnnotationPriority = annot_order) # removed annoDb="hs....
+    plotAnnoPie(peakAnno[['all']])
+    print(upsetplot(peakAnno[['all']]))
+    # write bed with annotations for all peaks
+    write.table(as.data.frame(peakAnno['all']), 
+                file = paste0(ab, '_annot_', params$args3), 
+                quote=FALSE, row.names=FALSE, col.names=FALSE, sep = "\t")
+    # write tsv with annotations
+    tmpannot <- cbind(as.data.frame(peakAnno[['all']])[annocols], as.data.frame(peakAnno[['all']]@detailGenomicAnnotation))
+    tsvannot <- merge(as.data.frame(DBs[[ab]]), tmpannot,
+                      by=c('seqnames', 'end'), all.x=TRUE)
+    write.table(tsvannot, 
+                file = paste0(ab, '_annot_', params$args2), 
+                quote=FALSE, row.names=FALSE, sep = "\t")
+
+    # peaks up
+    cat('\n\n#### Up\n\n')
+    write.table(beds[[ab]][beds[[ab]][[thcol]] < thres[[ab]] & beds[[ab]][['iranges.Fold']] > 0,
+                          c(1,2,3,12,9,10,11)], 
+                file = paste(ab, '_up_',params$args3, sep=""), 
+                quote=FALSE, row.names=FALSE, col.names=FALSE, sep = "\t")
+    tryCatch(
+        {
+           peakAnno['up'] <- annotatePeak(paste(ab, '_up_',params$args3, sep=""),
+                               tssRegion=c(-3000, 3000), TxDb=txdb,
+                               verbose=FALSE, genomicAnnotationPriority = annot_order)
+            plotAnnoPie(peakAnno[['up']])
+            print(upsetplot(peakAnno[['up']]))
+        }, 
+        error = function(e){
+            print('No line available in input')
+        })
+
+    # peaks down
+    cat('\n\n#### Down\n\n')
+    write.table(beds[[ab]][beds[[ab]][[thcol]] < thres[[ab]] & beds[[ab]][['iranges.Fold']] < 0,
+                          c(1,2,3,12,9,10,11)], 
+                file = paste(ab, '_down_',params$args3, sep=""), 
+                quote=FALSE, row.names=FALSE, col.names=FALSE, sep = "\t")
+    tryCatch({
+            peakAnno['down'] <- annotatePeak(paste(ab, '_down_',params$args3, sep=""),
+                               tssRegion=c(-3000, 3000), TxDb=txdb,
+                               verbose=FALSE, genomicAnnotationPriority = annot_order)
+            plotAnnoPie(peakAnno[['down']])
+            print(upsetplot(peakAnno[['down']]))
+    },
+    error = function(e){
+            print('No line available in input')
+        })
+
+
+    # compare the all/up/down distributions
+    cat('\n\n#### Pie charts comparison\n\n')
+    print(plotAnnoBar(peakAnno))
+}
+```
+
+
+## sessionInfo
+
+For reproducibility purposes, here is the output of `sessionInfo` listing all packages
+used in the analysis.
+
+```{r}
+sessionInfo()
+```

From 90cf34ab7408af05ee58ab158d389997c89e08d6 Mon Sep 17 00:00:00 2001
From: Caroline Esnault <caroline.esnault@nih.gov>
Date: Thu, 24 Sep 2020 09:05:43 -0400
Subject: [PATCH 06/19] update gitignore

---
 .gitignore | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.gitignore b/.gitignore
index 710061099..187a83322 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,11 @@ workflows/rnaseq/downstream/upset_plots
 workflows/rnaseq/downstream/final_clusters
 workflows/rnaseq/downstream/rnaseq.log
 workflows/rnaseq/downstream/*tsv
+workflows/chipseq/downstream/diffbind.html
+workflows/chipseq/downstream/diffbind_cache
+workflows/chipseq/downstream/diffbind_files
+workflows/chipseq/downstream/*tsv
+workflows/chipseq/downstream/*bed
 workflows/chipseq/data
 workflows/rnaseq/data
 workflows/colocalization/results

From ec36c5e9a64c9f039691d5fd0c9e2cdec556e175 Mon Sep 17 00:00:00 2001
From: Caroline Esnault <caroline.esnault@nih.gov>
Date: Thu, 24 Sep 2020 20:18:43 -0400
Subject: [PATCH 07/19] add back PE test samples

---
 ci/get-data.py | 113 ++++++++++++++++++++++++++-----------------------
 1 file changed, 59 insertions(+), 54 deletions(-)

diff --git a/ci/get-data.py b/ci/get-data.py
index 87115e144..f3064b3d7 100755
--- a/ci/get-data.py
+++ b/ci/get-data.py
@@ -5,10 +5,11 @@
 
 shell.executable('/bin/bash')
 BRANCH = 'master'
-URL = 'https://github.com/lcdb/lcdb-test-data-human/blob/{0}/{{}}?raw=true'.format(BRANCH)
+URL = 'https://github.com/lcdb/lcdb-test-data/blob/{0}/data/{{}}?raw=true'.format(BRANCH)
+URLH = 'https://github.com/lcdb/lcdb-test-data-human/blob/{0}/{{}}?raw=true'.format(BRANCH)
 
 
-def _download_file(fn, dest=None):
+def _download_file(URL, fn, dest=None):
     url = URL.format(fn)
     if dest is None:
         dest = fn
@@ -17,57 +18,61 @@ def _download_file(fn, dest=None):
     shell('wget -q -O- {url} > {dest}')
     return dest
 
-_download_file('rnaseq_samples/A549_DMSO_1h_1/A549_DMSO_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_DMSO_1h_1.small_R1.fastq.gz')
-_download_file('rnaseq_samples/A549_DMSO_1h_2/A549_DMSO_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_DMSO_1h_2.small_R1.fastq.gz')
-_download_file('rnaseq_samples/A549_DMSO_6h_1/A549_DMSO_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_DMSO_6h_1.small_R1.fastq.gz')
-_download_file('rnaseq_samples/A549_DMSO_6h_2/A549_DMSO_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_DMSO_6h_2.small_R1.fastq.gz')
-_download_file('rnaseq_samples/A549_JQ1_1h_1/A549_JQ1_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_JQ1_1h_1.small_R1.fastq.gz')
-_download_file('rnaseq_samples/A549_JQ1_1h_2/A549_JQ1_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_JQ1_1h_2.small_R1.fastq.gz')
-_download_file('rnaseq_samples/A549_JQ1_6h_1/A549_JQ1_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_JQ1_6h_1.small_R1.fastq.gz')
-_download_file('rnaseq_samples/A549_JQ1_6h_2/A549_JQ1_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_JQ1_6h_2.small_R1.fastq.gz')
-_download_file('rnaseq_samples/A549_dBet6_1h_1/A549_dBet6_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_dBet6_1h_1.small_R1.fastq.gz')
-_download_file('rnaseq_samples/A549_dBet6_1h_2/A549_dBet6_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_dBet6_1h_2.small_R1.fastq.gz')
-_download_file('rnaseq_samples/A549_dBet6_6h_1/A549_dBet6_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_dBet6_6h_1.small_R1.fastq.gz')
-_download_file('rnaseq_samples/A549_dBet6_6h_2/A549_dBet6_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_dBet6_6h_2.small_R1.fastq.gz')
-_download_file('rnaseq_samples/HAP1_DMSO_1h_1/HAP1_DMSO_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_DMSO_1h_1.small_R1.fastq.gz')
-_download_file('rnaseq_samples/HAP1_DMSO_1h_2/HAP1_DMSO_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_DMSO_1h_2.small_R1.fastq.gz')
-_download_file('rnaseq_samples/HAP1_DMSO_6h_1/HAP1_DMSO_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_DMSO_6h_1.small_R1.fastq.gz')
-_download_file('rnaseq_samples/HAP1_DMSO_6h_2/HAP1_DMSO_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_DMSO_6h_2.small_R1.fastq.gz')
-_download_file('rnaseq_samples/HAP1_JQ1_1h_1/HAP1_JQ1_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_JQ1_1h_1.small_R1.fastq.gz')
-_download_file('rnaseq_samples/HAP1_JQ1_1h_2/HAP1_JQ1_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_JQ1_1h_2.small_R1.fastq.gz')
-_download_file('rnaseq_samples/HAP1_JQ1_6h_1/HAP1_JQ1_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_JQ1_6h_1.small_R1.fastq.gz')
-_download_file('rnaseq_samples/HAP1_JQ1_6h_2/HAP1_JQ1_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_JQ1_6h_2.small_R1.fastq.gz')
-_download_file('rnaseq_samples/HAP1_dBet6_1h_1/HAP1_dBet6_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_dBet6_1h_1.small_R1.fastq.gz')
-_download_file('rnaseq_samples/HAP1_dBet6_1h_2/HAP1_dBet6_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_dBet6_1h_2.small_R1.fastq.gz')
-_download_file('rnaseq_samples/HAP1_dBet6_6h_1/HAP1_dBet6_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_dBet6_6h_1.small_R1.fastq.gz')
-_download_file('rnaseq_samples/HAP1_dBet6_6h_2/HAP1_dBet6_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_dBet6_6h_2.small_R1.fastq.gz')
-_download_file('rnaseq_samples/K562_DMSO_1h_1/K562_DMSO_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_DMSO_1h_1.small_R1.fastq.gz')
-_download_file('rnaseq_samples/K562_DMSO_1h_2/K562_DMSO_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_DMSO_1h_2.small_R1.fastq.gz')
-_download_file('rnaseq_samples/K562_DMSO_6h_1/K562_DMSO_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_DMSO_6h_1.small_R1.fastq.gz')
-_download_file('rnaseq_samples/K562_DMSO_6h_2/K562_DMSO_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_DMSO_6h_2.small_R1.fastq.gz')
-_download_file('rnaseq_samples/K562_JQ1_1h_1/K562_JQ1_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_JQ1_1h_1.small_R1.fastq.gz')
-_download_file('rnaseq_samples/K562_JQ1_1h_2/K562_JQ1_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_JQ1_1h_2.small_R1.fastq.gz')
-_download_file('rnaseq_samples/K562_JQ1_6h_1/K562_JQ1_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_JQ1_6h_1.small_R1.fastq.gz')
-_download_file('rnaseq_samples/K562_JQ1_6h_2/K562_JQ1_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_JQ1_6h_2.small_R1.fastq.gz')
-_download_file('rnaseq_samples/K562_dBet6_1h_2/K562_dBet6_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_dBet6_1h_2.small_R1.fastq.gz')
-_download_file('rnaseq_samples/K562_dBet6_6h_1/K562_dBet6_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_dBet6_6h_1.small_R1.fastq.gz')
-_download_file('rnaseq_samples/K562_dBet6_6h_2/K562_dBet6_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_dBet6_6h_2.small_R1.fastq.gz')
-_download_file('rnaseq_samples/outlier_HAP1_dBet6_1h_1/outlier_HAP1_dBet6_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/outlier_HAP1_dBet6_1h_1.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/A549_DMSO_1h_1/A549_DMSO_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_DMSO_1h_1.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/A549_DMSO_1h_2/A549_DMSO_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_DMSO_1h_2.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/A549_DMSO_6h_1/A549_DMSO_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_DMSO_6h_1.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/A549_DMSO_6h_2/A549_DMSO_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_DMSO_6h_2.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/A549_JQ1_1h_1/A549_JQ1_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_JQ1_1h_1.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/A549_JQ1_1h_2/A549_JQ1_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_JQ1_1h_2.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/A549_JQ1_6h_1/A549_JQ1_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_JQ1_6h_1.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/A549_JQ1_6h_2/A549_JQ1_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_JQ1_6h_2.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/A549_dBet6_1h_1/A549_dBet6_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_dBet6_1h_1.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/A549_dBet6_1h_2/A549_dBet6_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_dBet6_1h_2.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/A549_dBet6_6h_1/A549_dBet6_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_dBet6_6h_1.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/A549_dBet6_6h_2/A549_dBet6_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/A549_dBet6_6h_2.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/HAP1_DMSO_1h_1/HAP1_DMSO_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_DMSO_1h_1.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/HAP1_DMSO_1h_2/HAP1_DMSO_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_DMSO_1h_2.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/HAP1_DMSO_6h_1/HAP1_DMSO_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_DMSO_6h_1.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/HAP1_DMSO_6h_2/HAP1_DMSO_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_DMSO_6h_2.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/HAP1_JQ1_1h_1/HAP1_JQ1_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_JQ1_1h_1.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/HAP1_JQ1_1h_2/HAP1_JQ1_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_JQ1_1h_2.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/HAP1_JQ1_6h_1/HAP1_JQ1_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_JQ1_6h_1.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/HAP1_JQ1_6h_2/HAP1_JQ1_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_JQ1_6h_2.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/HAP1_dBet6_1h_1/HAP1_dBet6_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_dBet6_1h_1.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/HAP1_dBet6_1h_2/HAP1_dBet6_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_dBet6_1h_2.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/HAP1_dBet6_6h_1/HAP1_dBet6_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_dBet6_6h_1.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/HAP1_dBet6_6h_2/HAP1_dBet6_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/HAP1_dBet6_6h_2.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/K562_DMSO_1h_1/K562_DMSO_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_DMSO_1h_1.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/K562_DMSO_1h_2/K562_DMSO_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_DMSO_1h_2.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/K562_DMSO_6h_1/K562_DMSO_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_DMSO_6h_1.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/K562_DMSO_6h_2/K562_DMSO_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_DMSO_6h_2.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/K562_JQ1_1h_1/K562_JQ1_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_JQ1_1h_1.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/K562_JQ1_1h_2/K562_JQ1_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_JQ1_1h_2.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/K562_JQ1_6h_1/K562_JQ1_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_JQ1_6h_1.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/K562_JQ1_6h_2/K562_JQ1_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_JQ1_6h_2.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/K562_dBet6_1h_2/K562_dBet6_1h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_dBet6_1h_2.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/K562_dBet6_6h_1/K562_dBet6_6h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_dBet6_6h_1.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/K562_dBet6_6h_2/K562_dBet6_6h_2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/K562_dBet6_6h_2.small_R1.fastq.gz')
+_download_file(URLH, 'rnaseq_samples/outlier_HAP1_dBet6_1h_1/outlier_HAP1_dBet6_1h_1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/outlier_HAP1_dBet6_1h_1.small_R1.fastq.gz')
 
-_download_file('chipseq_samples/BRD4_DMSO_1/BRD4_DMSO_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/BRD4_DMSO_1.small_R1.fastq.gz')
-_download_file('chipseq_samples/BRD4_DMSO_2/BRD4_DMSO_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/BRD4_DMSO_2.small_R1.fastq.gz')
-_download_file('chipseq_samples/BRD4_dBET6_1/BRD4_dBET6_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/BRD4_dBET6_1.small_R1.fastq.gz')
-_download_file('chipseq_samples/BRD4_dBET6_2/BRD4_dBET6_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/BRD4_dBET6_2.small_R1.fastq.gz')
-_download_file('chipseq_samples/MTHFD1_DMSO_1/MTHFD1_DMSO_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/MTHFD1_DMSO_1.small_R1.fastq.gz')
-_download_file('chipseq_samples/MTHFD1_DMSO_2/MTHFD1_DMSO_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/MTHFD1_DMSO_2.small_R1.fastq.gz')
-_download_file('chipseq_samples/MTHFD1_dBET6_1/MTHFD1_dBET6_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/MTHFD1_dBET6_1.small_R1.fastq.gz')
-_download_file('chipseq_samples/MTHFD1_dBET6_2/MTHFD1_dBET6_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/MTHFD1_dBET6_2.small_R1.fastq.gz')
-_download_file('chipseq_samples/input_DMSO_1/input_DMSO_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/input_DMSO_1.small_R1.fastq.gz')
-_download_file('chipseq_samples/input_DMSO_2/input_DMSO_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/input_DMSO_2.small_R1.fastq.gz')
-_download_file('chipseq_samples/input_dBET6_1/input_dBET6_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/input_dBET6_1.small_R1.fastq.gz')
-_download_file('chipseq_samples/input_dBET6_2/input_dBET6_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/input_dBET6_2.small_R1.fastq.gz')
-_download_file('chipseq_samples/mockIgG_DMSO_1/mockIgG_DMSO_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/mockIgG_DMSO_1.small_R1.fastq.gz')
-_download_file('chipseq_samples/mockIgG_DMSO_2/mockIgG_DMSO_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/mockIgG_DMSO_2.small_R1.fastq.gz')
-_download_file('chipseq_samples/mockIgG_dBET6_1/mockIgG_dBET6_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/mockIgG_dBET6_1.small_R1.fastq.gz')
-_download_file('chipseq_samples/mockIgG_dBET6_2/mockIgG_dBET6_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/mockIgG_dBET6_2.small_R1.fastq.gz')
+_download_file(URLH, 'chipseq_samples/BRD4_DMSO_1/BRD4_DMSO_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/BRD4_DMSO_1.small_R1.fastq.gz')
+_download_file(URLH, 'chipseq_samples/BRD4_DMSO_2/BRD4_DMSO_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/BRD4_DMSO_2.small_R1.fastq.gz')
+_download_file(URLH, 'chipseq_samples/BRD4_dBET6_1/BRD4_dBET6_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/BRD4_dBET6_1.small_R1.fastq.gz')
+_download_file(URLH, 'chipseq_samples/BRD4_dBET6_2/BRD4_dBET6_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/BRD4_dBET6_2.small_R1.fastq.gz')
+_download_file(URLH, 'chipseq_samples/MTHFD1_DMSO_1/MTHFD1_DMSO_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/MTHFD1_DMSO_1.small_R1.fastq.gz')
+_download_file(URLH, 'chipseq_samples/MTHFD1_DMSO_2/MTHFD1_DMSO_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/MTHFD1_DMSO_2.small_R1.fastq.gz')
+_download_file(URLH, 'chipseq_samples/MTHFD1_dBET6_1/MTHFD1_dBET6_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/MTHFD1_dBET6_1.small_R1.fastq.gz')
+_download_file(URLH, 'chipseq_samples/MTHFD1_dBET6_2/MTHFD1_dBET6_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/MTHFD1_dBET6_2.small_R1.fastq.gz')
+_download_file(URLH, 'chipseq_samples/input_DMSO_1/input_DMSO_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/input_DMSO_1.small_R1.fastq.gz')
+_download_file(URLH, 'chipseq_samples/input_DMSO_2/input_DMSO_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/input_DMSO_2.small_R1.fastq.gz')
+_download_file(URLH, 'chipseq_samples/input_dBET6_1/input_dBET6_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/input_dBET6_1.small_R1.fastq.gz')
+_download_file(URLH, 'chipseq_samples/input_dBET6_2/input_dBET6_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/input_dBET6_2.small_R1.fastq.gz')
+_download_file(URLH, 'chipseq_samples/mockIgG_DMSO_1/mockIgG_DMSO_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/mockIgG_DMSO_1.small_R1.fastq.gz')
+_download_file(URLH, 'chipseq_samples/mockIgG_DMSO_2/mockIgG_DMSO_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/mockIgG_DMSO_2.small_R1.fastq.gz')
+_download_file(URLH, 'chipseq_samples/mockIgG_dBET6_1/mockIgG_dBET6_1.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/mockIgG_dBET6_1.small_R1.fastq.gz')
+_download_file(URLH, 'chipseq_samples/mockIgG_dBET6_2/mockIgG_dBET6_2.small_R1.fastq.gz', 'workflows/chipseq/data/example_data/mockIgG_dBET6_2.small_R1.fastq.gz')
 
+
+
+_download_file(URL, 'rnaseq_samples/sample1/sample1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample1PE_1.fq.gz')
+_download_file(URL, 'rnaseq_samples/sample1/sample1.small_R2.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample1PE_2.fq.gz')

From 6d464301bc31705304aa96f1f203b033ed15c1f5 Mon Sep 17 00:00:00 2001
From: Caroline Esnault <caroline.esnault@nih.gov>
Date: Thu, 24 Sep 2020 20:19:01 -0400
Subject: [PATCH 08/19] add star

---
 include/reference_configs/Test_Homo_sapiens.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/reference_configs/Test_Homo_sapiens.yaml b/include/reference_configs/Test_Homo_sapiens.yaml
index f8392fb6b..e66ecece7 100644
--- a/include/reference_configs/Test_Homo_sapiens.yaml
+++ b/include/reference_configs/Test_Homo_sapiens.yaml
@@ -15,6 +15,7 @@ references:
         indexes:
           - 'hisat2'
           - 'bowtie2'
+          - 'star'
 
       annotation:
         url: 'https://github.com/lcdb/lcdb-test-data-human/raw/master/annotation/hg38.small.gtf'

From 714b2cd35da29a622983a83d0d94793f8607978e Mon Sep 17 00:00:00 2001
From: Caroline Esnault <caroline.esnault@nih.gov>
Date: Thu, 24 Sep 2020 20:19:29 -0400
Subject: [PATCH 09/19] update requirements for diffbind

---
 requirements-r.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/requirements-r.txt b/requirements-r.txt
index 7343578f5..b50579635 100644
--- a/requirements-r.txt
+++ b/requirements-r.txt
@@ -1,9 +1,11 @@
 bioconductor-annotationhub
 bioconductor-apeglm
 bioconductor-biocparallel
+bioconductor-chipseeker
 bioconductor-clusterprofiler
 bioconductor-degreport
 bioconductor-deseq2
+bioconductor-diffbind
 bioconductor-dupradar
 bioconductor-genomeinfodbdata
 bioconductor-genomicfeatures
@@ -24,3 +26,4 @@ r-rmarkdown
 r-spp
 r-sparsem
 r-upsetr
+r-ggupset

From 03174bbc50d38872dd5e40d29a38ec06519edbb3 Mon Sep 17 00:00:00 2001
From: Caroline Esnault <caroline.esnault@nih.gov>
Date: Thu, 24 Sep 2020 20:21:42 -0400
Subject: [PATCH 10/19] update test configs with large datasets

---
 test/test_configs/chipseq_one_run.tsv         |  4 +--
 test/test_configs/star_override_1pass.yaml    |  2 +-
 test/test_configs/star_override_2pass.yaml    |  2 +-
 .../test_configs/test_chipseq_regression.yaml | 26 +++++++++----------
 test/test_configs/two_samples.tsv             |  4 +--
 workflows/references/config/config.yaml       |  1 +
 6 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/test/test_configs/chipseq_one_run.tsv b/test/test_configs/chipseq_one_run.tsv
index ec235f9f1..8f780bf73 100644
--- a/test/test_configs/chipseq_one_run.tsv
+++ b/test/test_configs/chipseq_one_run.tsv
@@ -1,4 +1,4 @@
 # Samplenames with the same "label" will be considered technical replicates
 samplename	antibody	biological_material	replicate	label	orig_filename
-input_1	input	wingdisc-1	1	input-wingdisc-1	data/example_data/chipseq_input1.fq.gz
-ip_1	gaf	wingdisc-1	1	gaf-wingdisc-1	data/example_data/chipseq_ip1.fq.gz
+BRD4_dBET6_1	dBET6	HAP1_B1	1	BRD4_dBET6_1	data/example_data/BRD4_dBET6_1.small_R1.fastq.gz
+mockIgG_dBET6_1	input	HAP1_B1	1	mockIgG_dBET6_1	data/example_data/mockIgG_dBET6_1.small_R1.fastq.gz
diff --git a/test/test_configs/star_override_1pass.yaml b/test/test_configs/star_override_1pass.yaml
index cba6ff764..c8537b568 100644
--- a/test/test_configs/star_override_1pass.yaml
+++ b/test/test_configs/star_override_1pass.yaml
@@ -1,6 +1,6 @@
 aligner:
   index: star
-  tag: test
+  tag: small-gencode-v28
 
 merged_bigwigs:
   control_pos:
diff --git a/test/test_configs/star_override_2pass.yaml b/test/test_configs/star_override_2pass.yaml
index b091eba3d..f4e7b2c56 100644
--- a/test/test_configs/star_override_2pass.yaml
+++ b/test/test_configs/star_override_2pass.yaml
@@ -1,6 +1,6 @@
 aligner:
   index: 'star-twopass'
-  tag: test
+  tag: small-gencode-v28
 
 merged_bigwigs:
   control_pos:
diff --git a/test/test_configs/test_chipseq_regression.yaml b/test/test_configs/test_chipseq_regression.yaml
index 6b91f5dbd..4e6ec10c7 100644
--- a/test/test_configs/test_chipseq_regression.yaml
+++ b/test/test_configs/test_chipseq_regression.yaml
@@ -1,40 +1,38 @@
 #sampletable: 'config/sampletable.tsv'
-organism: 'dmel'
+organism: 'human'
 references_dir: 'references_data'
 peaks_dir: 'data/chipseq_peaks'
 
 chipseq:
   peak_calling:
 
-    - label: gaf-wingdisc-1
+    - label: BRD4_dBET6_1
       algorithm: macs2
       ip:
-        - gaf-wingdisc-1
+        - BRD4_dBET6_1
       control:
-        - input-wingdisc-1
-      effective_genome_count: 7e7
-      extra: '--nomodel --extsize 147'
+        - mockIgG_dBET6_1
+      extra: '--nomodel -p 0.05 --cutoff-analysis'
 
 
 fastq_screen:
   - label: rRNA
-    organism: dmel
-    tag: test
+    organism: human
+    tag: small-gencode-v28
   - label: PhiX
     organism: phix
     tag: default
-  - label: Fly
-    organism: dmel
-    tag: test
+  - label: Human
+    organism: human
+    tag: small-gencode-v28
 
 aligner:
   index: 'bowtie2'
-  tag: 'test'
+  tag: 'small-gencode-v28'
 
 
 merged_bigwigs: {}
 
 include_references:
-  - '../../include/reference_configs/test.yaml'
   - '../../include/reference_configs/PhiX.yaml'
-  - '../../include/reference_configs/Drosophila_melanogaster.yaml'
+  - '../../include/reference_configs/Test_Homo_sapiens.yaml'
diff --git a/test/test_configs/two_samples.tsv b/test/test_configs/two_samples.tsv
index 30886104f..76ecc25e1 100644
--- a/test/test_configs/two_samples.tsv
+++ b/test/test_configs/two_samples.tsv
@@ -1,3 +1,3 @@
 samplename	group	layout	orig_filename
-sample1	control	SE	data/example_data/rnaseq_sample1PE_1.fq.gz
-sample2	control	SE	data/example_data/rnaseq_sample2.fq.gz
+A549_dBet6_1h_1	dBet6_1h	SE	data/example_data/A549_dBet6_1h_1.small_R1.fastq.gz
+A549_dBet6_1h_2	dBet6_1h	SE	data/example_data/A549_dBet6_1h_2.small_R1.fastq.gz
diff --git a/workflows/references/config/config.yaml b/workflows/references/config/config.yaml
index 49618dcd0..eeea065bd 100644
--- a/workflows/references/config/config.yaml
+++ b/workflows/references/config/config.yaml
@@ -4,3 +4,4 @@ references_dir: 'references_dir'
 # include/reference_configs, for inspiration for more species.
 include_references:
   - '../../include/reference_configs/test.yaml'
+  - '../../include/reference_configs/Test_Homo_sapiens.yaml'

From 25e382f774920a03fbbda058b605e17e5c2cf597 Mon Sep 17 00:00:00 2001
From: Caroline Esnault <caroline.esnault@nih.gov>
Date: Thu, 24 Sep 2020 20:23:35 -0400
Subject: [PATCH 11/19] add merged bigwigs for testing

---
 workflows/chipseq/config/config.yaml | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/workflows/chipseq/config/config.yaml b/workflows/chipseq/config/config.yaml
index 9c159be56..e28237624 100644
--- a/workflows/chipseq/config/config.yaml
+++ b/workflows/chipseq/config/config.yaml
@@ -125,15 +125,10 @@ fastq_screen:
     organism: human
     tag: small-gencode-v28
 
-#merged_bigwigs:
-#  input-wingdisc:
-#    - input-wingdisc-1
-#    - input-wingdisc-2
-#  gaf-wingdisc:
-#    - gaf-wingdisc-1
-#    - gaf-wingdisc-2
-#  gaf-embryo:
-#    - gaf-embryo-1
+merged_bigwigs:
+  BRD4_dBET6_1:
+    - BRD4_dBET6_1
+    - BRD4_dBET6_2
 
 aligner:
   index: 'bowtie2'

From 0073954286d09e757b115702a5622de6df473058 Mon Sep 17 00:00:00 2001
From: Caroline Esnault <caroline.esnault@nih.gov>
Date: Thu, 24 Sep 2020 20:41:13 -0400
Subject: [PATCH 12/19] add diffbind in tests

---
 .circleci/config.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index e13699c50..777713c99 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -134,6 +134,8 @@ variables:
           source activate lcdb-wf-test
           ./run_test.sh --use-conda -j2 -k -p -r
           python chipseq_trackhub.py config/config.yaml config/hub_config.yaml
+          source activate lcdb-wf-test-r
+          Rscript -e "rmarkdown::render('downstream/diffbind.Rmd')"
   # --------------------------------------------------------------------------
   # Previous versions had an error where chipseq peaks needed to be defined for
   # every caller. This does a (relatively) quick test to only run a single

From b434ee556767c148a7d4c6afbfd7a0729fbdeec0 Mon Sep 17 00:00:00 2001
From: Caroline Esnault <caroline.esnault@nih.gov>
Date: Thu, 24 Sep 2020 20:56:50 -0400
Subject: [PATCH 13/19] simplify contrasts

---
 workflows/rnaseq/downstream/rnaseq.Rmd | 92 --------------------------
 1 file changed, 92 deletions(-)

diff --git a/workflows/rnaseq/downstream/rnaseq.Rmd b/workflows/rnaseq/downstream/rnaseq.Rmd
index 90a5569d5..3dcb8eb0c 100644
--- a/workflows/rnaseq/downstream/rnaseq.Rmd
+++ b/workflows/rnaseq/downstream/rnaseq.Rmd
@@ -280,37 +280,15 @@ lst <- list(
                 design=~treatment,
                 args=list(subset.counts=TRUE)
             ),
-            jq=list(
-                sampletable=colData %>% filter(treatment %in% c('DMSO', 'JQ1'), !samplename %in% outliers),
-                design=~group + 0,
-                args=list(subset.counts=TRUE)
-            ),
             jq_cell_line=list(
                 sampletable=colData %>% filter(treatment %in% c('DMSO', 'JQ1'), !samplename %in% outliers),
                 design=~cell_line_group + 0,
                 args=list(subset.counts=TRUE)
             ),
-            jq_interact_term=list(
-                sampletable=colData %>% filter(treatment %in% c('DMSO', 'JQ1'), !samplename %in% outliers),
-                design=~treatment + cell_Line + timepoint + treatment:timepoint,
-                args=list(subset.counts=TRUE)
-            ),
-            dbet=list(
-                sampletable=colData %>% filter(treatment %in% c('DMSO', 'dBet6'), !samplename %in% outliers),
-                design=~group + 0,
-                args=list(subset.counts=TRUE)
-            ),
             dbet_cell_line=list(
                 sampletable=colData %>% filter(treatment %in% c('DMSO', 'dBet6'), !samplename %in% outliers),
                 design=~cell_line_group + 0,
                 args=list(subset.counts=TRUE)
-            ),
-            dbet_interact_term=list(
-                sampletable=colData %>%
-                    filter(treatment %in% c('DMSO', 'dBet6'), !samplename %in% outliers) %>%
-                    mutate(treatment=factor(treatment, levels=c('DMSO', 'dBet6'))),
-                design=~treatment + cell_Line + timepoint + treatment:timepoint,
-                args=list(subset.counts=TRUE)
             )
 )
 
@@ -337,12 +315,6 @@ res.list[['main_jq']] <- list(
 )
 
 # JQ1 vs. DMSO at 1h
-res.list[['jq1h']] <- list(
-    res=results(dds.list[['jq']], contrast=c(-1,0,1,0)),
-    dds='jq',
-    label='JQ1 vs. DMSO 1h (LFC>0); dds jq; cts group'
-)
-
 res.list[['jq1h-batch']] <- list(
     res=results(dds.list[['jq_cell_line']], contrast=c(-1,0,1,0,-1,0,1,0,-1,0,1,0)),
     dds='jq_cell_line',
@@ -350,12 +322,6 @@ res.list[['jq1h-batch']] <- list(
 )
 
 # JQ1 vs. DMSO at 6h
-res.list[['jq6h']] <- list(
-    res=results(dds.list[['jq']], contrast=c(0,-1,0,1)),
-    dds='jq',
-    label='JQ1 vs. DMSO 6h (LFC>0); dds jq; cts group'
-)
-
 res.list[['jq6h-batch']] <- list(
     res=results(dds.list[['jq_cell_line']], contrast=c(0,-1,0,1,0,-1,0,1,0,-1,0,1)),
     dds='jq_cell_line',
@@ -363,35 +329,12 @@ res.list[['jq6h-batch']] <- list(
 )
 
 # JQ1 vs. DMSO 6h vs. 1h
-res.list[['jq6h.vs.1h']] <- list(
-    res=results(dds.list[['jq']], contrast=c(1,-1,-1,1)),
-    dds='jq',
-    label='JQ1 vs. DMSO 6h vs. 1h (LFC>0); dds jq; cts group'
-)
-
 res.list[['jq6h.vs.1h-batch']] <- list(
     res=results(dds.list[['jq_cell_line']], contrast=c(1,-1,-1,1,1,-1,-1,1,1,-1,-1,1)),
     dds='jq_cell_line',
     label='JQ1 vs. DMSO 6h vs. 1h (LFC>0); dds jq_cell_line; cts cell_Line_group; batch effect'
 )
 
-res.list[['jq6h.vs.1h-interactdesign']] <- list(
-    res=results(dds.list[['jq_interact_term']]),
-    dds='jq_cell_line',
-    label='JQ1 vs. DMSO 6h vs. 1h (LFC>0); dds jq_interact_term; cts cell_Line_group; batch effect'
-)
-
-res.list[['jq_lfc2']] <- list(
-    res=lfcShrink(
-        dds.list[['jq_interact_term']],
-        coef='treatmentJQ1.timepoint6h',
-        #lfcThreshold=2,    # no padj if lfcThreshold
-        type='apeglm'
-    ),
-    dds='main',
-    label='JQ1 vs. DMSO 6h vs. 1h (LFC>2); dds jq_interact_term; cts treat+cell_Line+timepoint+treat:timepoint; batch effect'
-)
-
 # dBet6 ---------------------------
 
 res.list[['main_dbet']] <- list(
@@ -401,12 +344,6 @@ res.list[['main_dbet']] <- list(
 )
 
 # dBET6 vs. DMSO at 1h
-res.list[['dbet1h']] <- list(
-    res=results(dds.list[['dbet']], contrast=c(-1,0,1,0)),
-    dds='dbet',
-    label='dBet6 vs. DMSO 1h (LFC>0); dds dbet; cts group'
-)
-
 res.list[['dbet1h-batch']] <- list(
     res=results(dds.list[['dbet_cell_line']], contrast=c(-1,0,1,0,-1,0,1,0,-1,0,1,0)),
     dds='dbet_cell_line',
@@ -414,12 +351,6 @@ res.list[['dbet1h-batch']] <- list(
 )
 
 # dBet6 vs. DMSO at 6h
-res.list[['dbet6h']] <- list(
-    res=results(dds.list[['dbet']], contrast=c(0,-1,0,1)),
-    dds='dbet',
-    label='dBet6 vs. DMSO 6h (LFC>0); dds dbet; cts group'
-)
-
 res.list[['dbet6h-batch']] <- list(
     res=results(dds.list[['dbet_cell_line']], contrast=c(0,-1,0,1,0,-1,0,1,0,-1,0,1)),
     dds='dbet_cell_line',
@@ -427,35 +358,12 @@ res.list[['dbet6h-batch']] <- list(
 )
 
 # dBet6 vs. DMSO 6h vs. 1h
-res.list[['dbet6h.vs.1h']] <- list(
-    res=results(dds.list[['dbet']], contrast=c(1,-1,-1,1)),
-    dds='dbet',
-    label='dBet6 vs. DMSO 6h vs. 1h (LFC>0); dds dbet; cts group'
-)
-
 res.list[['dbet6h.vs.1h-batch']] <- list(
     res=results(dds.list[['dbet_cell_line']], contrast=c(1,-1,-1,1,1,-1,-1,1,1,-1,-1,1)),
     dds='dbet_cell_line',
     label='dBet6 vs. DMSO 6h vs. 1h (LFC>0); dds dbet_cell_line; cts cell_Line_group; batch effect'
 )
 
-res.list[['dbet6h.vs.1h-interactdesign']] <- list(
-    res=results(dds.list[['dbet_interact_term']]),
-    dds='dbet_cell_line',
-    label='dBet6 vs. DMSO 6h vs. 1h (LFC>0); dds dbet_interact_term; cts cell_Line_group; batch effect'
-)
-
-res.list[['dbet_lfc2']] <- list(
-    res=lfcShrink(
-        dds.list[['dbet_interact_term']],
-        coef='treatmentdBet6.timepoint6h',
-        #lfcThreshold=2,
-        type='apeglm'
-    ),
-    dds='main',
-    label='dBet6 vs. DMSO 6h vs. 1h (LFC>2); dds dbet_interact_term; cts treat+cell_Line+timepoint+treat:timepoint; batch effect'
-)
-
 ```
 
 

From 622d98d3e613890956056ecb6bd19b6d39074268 Mon Sep 17 00:00:00 2001
From: Caroline Esnault <caroline.esnault@nih.gov>
Date: Thu, 24 Sep 2020 21:37:11 -0400
Subject: [PATCH 14/19] use test reference for rnaseq_pe

---
 test/test_configs/override_PE.yaml | 58 ++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 test/test_configs/override_PE.yaml

diff --git a/test/test_configs/override_PE.yaml b/test/test_configs/override_PE.yaml
new file mode 100644
index 000000000..44f7bb548
--- /dev/null
+++ b/test/test_configs/override_PE.yaml
@@ -0,0 +1,58 @@
+# Due to the way Snakemake recursively merges config items, we need to
+# recursively reset this dictonary to override the default one in order to
+# allow arbitrary other sample names.
+#
+# Use it like this
+#
+#   snakemake --configfile ../../test/override.yaml --config sampletable=/path/to/tsv
+
+
+merged_bigwigs:
+  control_pos:
+    pos: []
+  treatment_all:
+    pos: []
+    neg: []
+
+
+patterns: 'config/rnaseq_patterns.yaml'
+
+# Which key in the `references` dict below to use
+organism: 'dmel'
+
+# If not specified here, use the environment variable REFERENCES_DIR.
+references_dir: 'references_data'
+
+aligner:
+  index: 'hisat2'
+  tag: 'test'
+
+rrna:
+  index: 'bowtie2'
+  tag: 'rRNA'
+
+gtf:
+  tag: "test"
+
+salmon:
+  tag: "test"
+
+fastq_screen:
+  - label: rRNA
+    organism: dmel
+    tag: test
+  - label: PhiX
+    organism: phix
+    tag: default
+  - label: Fly
+    organism: dmel
+    tag: test
+
+
+# See the reference config files in the top level of the repo,
+# include/reference_configs, for inspiration for more species.
+
+include_references:
+  - '../../include/reference_configs/test.yaml'
+  - '../../include/reference_configs/PhiX.yaml'
+  - '../../include/reference_configs/Drosophila_melanogaster.yaml'

From 73a98da0583c695e0950727cc4d918c72273b39a Mon Sep 17 00:00:00 2001
From: Caroline Esnault <caroline.esnault@nih.gov>
Date: Thu, 24 Sep 2020 21:37:44 -0400
Subject: [PATCH 15/19] update for human large dataset

---
 .circleci/config.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 777713c99..51b70c9af 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -257,7 +257,7 @@ variables:
 
           # test PE reads
           ./run_test.sh -j 2 --use-conda -k -p -r --until multiqc \
-            --configfile $ORIG/test/test_configs/override.yaml \
+            --configfile $ORIG/test/test_configs/override_PE.yaml \
             --config sampletable=$ORIG/test/test_configs/test_pe_sampletable.tsv
 
 
@@ -354,6 +354,7 @@ jobs:
 
   rnaseq:
     <<: *defaults
+    resource_class: large
     steps:
       - checkout
       - *restore_cache

From cd2bb06a4058303a2fc21a91bd96b7fca6506f0f Mon Sep 17 00:00:00 2001
From: Caroline Esnault <caroline.esnault@nih.gov>
Date: Fri, 25 Sep 2020 05:54:36 -0400
Subject: [PATCH 16/19] adjust ressources

---
 .circleci/config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 51b70c9af..5a9db7d20 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -354,7 +354,7 @@ jobs:
 
   rnaseq:
     <<: *defaults
-    resource_class: large
+    resource_class: medium+
     steps:
       - checkout
       - *restore_cache

From 71a1e407af4b521ec45914f772dc25f96e80b299 Mon Sep 17 00:00:00 2001
From: Caroline Esnault <caroline.esnault@nih.gov>
Date: Fri, 25 Sep 2020 06:51:04 -0400
Subject: [PATCH 17/19] scale back ressources

---
 .circleci/config.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 5a9db7d20..a2f3cc5e8 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -354,7 +354,6 @@ jobs:
 
   rnaseq:
     <<: *defaults
-    resource_class: medium+
     steps:
       - checkout
       - *restore_cache

From 1106a8224c19c66e685f7cea835dfd8c87981a52 Mon Sep 17 00:00:00 2001
From: Caroline Esnault <caroline.esnault@nih.gov>
Date: Fri, 25 Sep 2020 06:51:35 -0400
Subject: [PATCH 18/19] remove JQ1 samples to preserve RAM

---
 workflows/rnaseq/downstream/rnaseq.Rmd | 57 ++++++++++++++------------
 1 file changed, 31 insertions(+), 26 deletions(-)

diff --git a/workflows/rnaseq/downstream/rnaseq.Rmd b/workflows/rnaseq/downstream/rnaseq.Rmd
index 3dcb8eb0c..16a32fd0a 100644
--- a/workflows/rnaseq/downstream/rnaseq.Rmd
+++ b/workflows/rnaseq/downstream/rnaseq.Rmd
@@ -87,6 +87,10 @@ for (col in factor.columns){
 
 colData$group <- relevel(colData$group, ref='DMSO_1h')
 rownames(colData) <- colData[,1]
+
+# remove JQ1 samples
+colData <- colData %>%
+    filter(treatment != 'JQ1')
 ```
 
 ## Experiment overview
@@ -122,7 +126,8 @@ vsd.txi <- varianceStabilizingTransformation(dds.txi, blind=TRUE)
 dds <- lcdbwf::DESeqDataSetFromCombinedFeatureCounts(
     '../data/rnaseq_aggregation/featurecounts.txt',
     sampletable=colData,
-    design=~group)
+    design=~group,
+    subset.counts=TRUE)
 
 # NOTE: collapse technical replicates ----------------------------------------
 #   If collapsing technical replicates, do so here
@@ -280,11 +285,11 @@ lst <- list(
                 design=~treatment,
                 args=list(subset.counts=TRUE)
             ),
-            jq_cell_line=list(
-                sampletable=colData %>% filter(treatment %in% c('DMSO', 'JQ1'), !samplename %in% outliers),
-                design=~cell_line_group + 0,
-                args=list(subset.counts=TRUE)
-            ),
+            #jq_cell_line=list(
+            #    sampletable=colData %>% filter(treatment %in% c('DMSO', 'JQ1'), !samplename %in% outliers),
+            #    design=~cell_line_group + 0,
+            #    args=list(subset.counts=TRUE)
+            #),
             dbet_cell_line=list(
                 sampletable=colData %>% filter(treatment %in% c('DMSO', 'dBet6'), !samplename %in% outliers),
                 design=~cell_line_group + 0,
@@ -308,32 +313,32 @@ res.list <- list()
 # IMPORTANT: Control must always be last
 
 # including only treatment in the contrast
-res.list[['main_jq']] <- list(
-    res=results(dds.list[['main']], contrast=c('treatment', 'JQ1', 'DMSO')),
-    dds='main',
-    label='JQ1 vs. DMSO (LFC > 0); dds main; cts treatment'
-)
+#res.list[['main_jq']] <- list(
+#    res=results(dds.list[['main']], contrast=c('treatment', 'JQ1', 'DMSO')),
+#    dds='main',
+#    label='JQ1 vs. DMSO (LFC > 0); dds main; cts treatment'
+#)
 
 # JQ1 vs. DMSO at 1h
-res.list[['jq1h-batch']] <- list(
-    res=results(dds.list[['jq_cell_line']], contrast=c(-1,0,1,0,-1,0,1,0,-1,0,1,0)),
-    dds='jq_cell_line',
-    label='JQ1 vs. DMSO 1h (LFC>0); dds jq_cell_line; cts cell_Line_group; batch effect'
-)
+#res.list[['jq1h-batch']] <- list(
+#    res=results(dds.list[['jq_cell_line']], contrast=c(-1,0,1,0,-1,0,1,0,-1,0,1,0)),
+#    dds='jq_cell_line',
+#    label='JQ1 vs. DMSO 1h (LFC>0); dds jq_cell_line; cts cell_Line_group; batch effect'
+#)
 
 # JQ1 vs. DMSO at 6h
-res.list[['jq6h-batch']] <- list(
-    res=results(dds.list[['jq_cell_line']], contrast=c(0,-1,0,1,0,-1,0,1,0,-1,0,1)),
-    dds='jq_cell_line',
-    label='JQ1 vs. DMSO 6h (LFC>0); dds jq_cell_line; cts cell_Line_group; batch effect'
-)
+#res.list[['jq6h-batch']] <- list(
+#    res=results(dds.list[['jq_cell_line']], contrast=c(0,-1,0,1,0,-1,0,1,0,-1,0,1)),
+#    dds='jq_cell_line',
+#    label='JQ1 vs. DMSO 6h (LFC>0); dds jq_cell_line; cts cell_Line_group; batch effect'
+#)
 
 # JQ1 vs. DMSO 6h vs. 1h
-res.list[['jq6h.vs.1h-batch']] <- list(
-    res=results(dds.list[['jq_cell_line']], contrast=c(1,-1,-1,1,1,-1,-1,1,1,-1,-1,1)),
-    dds='jq_cell_line',
-    label='JQ1 vs. DMSO 6h vs. 1h (LFC>0); dds jq_cell_line; cts cell_Line_group; batch effect'
-)
+#res.list[['jq6h.vs.1h-batch']] <- list(
+#    res=results(dds.list[['jq_cell_line']], contrast=c(1,-1,-1,1,1,-1,-1,1,1,-1,-1,1)),
+#    dds='jq_cell_line',
+#    label='JQ1 vs. DMSO 6h vs. 1h (LFC>0); dds jq_cell_line; cts cell_Line_group; batch effect'
+#)
 
 # dBet6 ---------------------------
 

From 660e963a53ab096bf8d8f48de2d86b322d77a6a0 Mon Sep 17 00:00:00 2001
From: daler <dalerr@niddk.nih.gov>
Date: Mon, 7 Dec 2020 09:12:02 -0500
Subject: [PATCH 19/19] fix config syntax

---
 workflows/chipseq/config/config.yaml | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/workflows/chipseq/config/config.yaml b/workflows/chipseq/config/config.yaml
index d0a61c847..c625ecb7f 100644
--- a/workflows/chipseq/config/config.yaml
+++ b/workflows/chipseq/config/config.yaml
@@ -124,8 +124,6 @@ fastq_screen:
   - label: Human
     organism: human
     tag: small-gencode-v28
-    organism: dmel
-    tag: test
   - label: Fly
     organism: dmel
     tag: test
@@ -140,14 +138,7 @@ aligner:
   tag: 'small-gencode-v28'
 
 include_references:
-<<<<<<< HEAD
   - '../../include/reference_configs/PhiX.yaml'
   - '../../include/reference_configs/Test_Homo_sapiens.yaml'
-||||||| 39fd02a
-  - '../../include/reference_configs/PhiX.yaml'
-  - '../../include/reference_configs/Drosophila_melanogaster.yaml'
-  - '../../include/reference_configs/test.yaml'
-=======
   - '../../include/reference_configs/Drosophila_melanogaster.yaml'
   - '../../include/reference_configs/test.yaml'
->>>>>>> master