Merge pull request #4 from cmap/dev

AnupJonchhe · web-flow · commit 1f8701ed2bd4 · 2021-10-12T14:01:30.000-04:00
Merge dev into main
diff --git a/CBnormalize.R b/CBnormalize.R
@@ -18,7 +18,7 @@ suppressPackageStartupMessages(library(reshape2))
 ##      barcodes - a vector of control barcode Name identifiers
 normalize <- function(X, barcodes) {
   normalized <- X %>%
-    dplyr::group_by(sample_ID) %>%
+    dplyr::group_by(profile_id) %>%
     dplyr::mutate(log_normalized_n = glm(y ~ x,
                                          data = tibble(
                                            y = log_dose[Name %in% barcodes],
@@ -37,15 +37,12 @@ parser$add_argument("-v", "--verbose", action="store_true", default=TRUE,
 parser$add_argument("-q", "--quietly", action="store_false", 
                     dest="verbose", help="Print little output")
 
-
 parser$add_argument("--wkdir", default=getwd(), help="Working directory")
 parser$add_argument("-c", "--filtered_counts", default="filtered_counts.csv",
                     help="path to file containing filtered counts")
 parser$add_argument("--CB_meta", default="../metadata/CB_meta.csv", help = "Control Barcode metadata")
 parser$add_argument("-o", "--out", default="", help = "Output path. Default is working directory")
 
-
-
 # get command line options, if help option encountered print help and exit
 args <- parser$parse_args()
 
diff --git a/bcl2fastq.sh b/bcl2fastq.sh
@@ -0,0 +1,106 @@
+#!/bin/bash
+#
+# bcl2fastq.sh - run bcl2fastq on a sequencer run folder and copy the
+# resulting *.fastq.gz files into <build_dir>/fastq/.
+#
+# The run folder is resolved as "/xchip/prism/MiSeq Outputs/*-<seq_code>".
+# Run with -h/--help for the list of options.
+
+source /broad/software/scripts/useuse
+reuse .bcl2fastq2-v2.20.0 > /dev/null
+reuse .bcl2fastq2-2.20.0.422 > /dev/null
+
+OUT_DIR=/xchip/prism/bcl2fastq/
+PSEQ_obelix=/cmap/obelix/pod/prismSeq/
+
+# Print the usage text; shared by the no-argument case and -h/--help.
+print_usage() {
+  printf "Usage ./bcl2fastq.sh [options]\nOptions include:\n"
+  printf -- "-s, --seq_code \t\t Sequencer run code E.g. JNV9V \n"
+  printf -- "-p, --proj_code \t Project code to prep project directory in /cmap/obelix/pod/prismSeq/ \n"
+  printf -- "-b, --build_dir \t Build directory, usually on /cmap/obelix/. Overrides PROJ_CODE\n"
+  printf -- "-o, --out_dir \t\t Path to temp storage of fastq files on /xchip/prism/ \n"
+  printf -- "-h, --help \t\t Print this help text\n"
+}
+
+# Calling the script without arguments prints the usage and fails.
+if test $# -lt 1; then
+  print_usage
+  exit 1
+fi
+
+# Parse options. A value-taking option shifts once to reach its value; the
+# shift after esac then consumes the current word.
+while test $# -gt 0; do
+  case "$1" in
+    -h|--help)
+      print_usage
+      exit 0
+      ;;
+    -s|--seq_code)
+      shift
+      SEQ_CODE=$1
+      ;;
+    -b|--build_dir)
+      shift
+      BUILD_DIR=$1
+      ;;
+    -p|--proj_code)
+      shift
+      PROJ_CODE=$1
+      ;;
+    -o|--out_dir)
+      shift
+      OUT_DIR=$1
+      ;;
+    *)
+      # No shift here: the one after esac already consumes this word. An extra
+      # shift would silently swallow the option that follows the unknown one.
+      printf "Unknown parameter: %s \n" "$1"
+      ;;
+  esac
+  shift
+done
+
+# Fail before the long-running conversion if the run code is missing.
+if [ -z "$SEQ_CODE" ]; then
+  printf "Missing required option: -s, --seq_code\n" >&2
+  exit 1
+fi
+
+# An explicit build directory overrides the one derived from the project code.
+if [ -z "$BUILD_DIR" ]; then
+  if [ -z "$PROJ_CODE" ]; then
+    printf "Provide either -b, --build_dir or -p, --proj_code\n" >&2
+    exit 1
+  fi
+  BUILD_DIR=$PSEQ_obelix/$PROJ_CODE
+fi
+
+# Expand the glob into an array so the space in "MiSeq Outputs" is preserved
+# and zero or multiple matches can be detected instead of passed along.
+run_folders=( "/xchip/prism/MiSeq Outputs/"*-"$SEQ_CODE" )
+if [ "${#run_folders[@]}" -ne 1 ] || [ ! -d "${run_folders[0]}" ]; then
+  printf "Could not resolve a single run folder for %s; candidates: %s\n" "$SEQ_CODE" "${run_folders[*]}" >&2
+  exit 1
+fi
+RUNFOLDER_DIR=${run_folders[0]}
+
+echo "$RUNFOLDER_DIR"
+
+mkdir -p "$OUT_DIR" || exit 1
+
+# Stop if the conversion fails so stale or partial output is never copied.
+if ! bcl2fastq --runfolder-dir "$RUNFOLDER_DIR" \
+    --output-dir "$OUT_DIR/$SEQ_CODE" \
+    --minimum-trimmed-read-length 35 \
+    --mask-short-adapter-reads 22 \
+    --create-fastq-for-index-reads; then
+  printf "bcl2fastq failed for %s\n" "$RUNFOLDER_DIR" >&2
+  exit 1
+fi
+
+mkdir -p "$BUILD_DIR/fastq/" || exit 1
+
+echo "Copying fastq files from $OUT_DIR/$SEQ_CODE/ to $BUILD_DIR/fastq/"
+cp "$OUT_DIR/$SEQ_CODE/"*.fastq.gz "$BUILD_DIR/fastq/"
diff --git a/compute_l2fc.R b/compute_l2fc.R
@@ -12,34 +12,40 @@ suppressPackageStartupMessages(library(dplyr)) #n()
 ## takes:
 ##      normalized_counts - table with normalized_n column and control_sample column that designates the 
 ##          name of the control sample for each treatment sample
-compute_l2fc = function(normalized_counts, control_types) {
+##      control_type - the single trt_type value whose samples are used as controls
+compute_l2fc = function(normalized_counts, control_type) {
   treatments = normalized_counts %>% 
-    filter(!(trt_type %in% control_types),
+    filter(trt_type!=control_type, trt_type!="day_0",
            is.na(Name)) %>% 
     dplyr::select(-Name, -log_dose, -n, -log_n, -log_normalized_n) %>% 
-    group_by_at(setdiff(names(.), c("normalized_n", "tech_rep", "profile_id"))) %>% 
+    group_by_at(setdiff(names(.), c("normalized_n", "tech_rep"))) %>% 
     dplyr::summarise(sum_normalized_n = sum(normalized_n)) %>% 
     ungroup()
+  
   controls = normalized_counts %>% 
-    filter(trt_type %in% control_types,
+    filter(trt_type==control_type,
            is.na(Name)) %>% 
-    mutate(control_sample=sample_ID) %>% 
     dplyr::select(-Name, -log_dose, -n, -log_n, -log_normalized_n) %>% 
-    group_by_at(setdiff(names(.), c("normalized_n", "tech_rep", "profile_id"))) %>% 
+    group_by_at(setdiff(names(.), c("normalized_n", "tech_rep"))) %>% 
     dplyr::summarise(sum_normalized_n = sum(normalized_n)) %>% 
     ungroup() %>% 
-    #group_by_at(setdiff(names(.), c("sum_normalized_n", "bio_rep"))) %>% 
-    group_by(CCLE_name, DepMap_ID, prism_cell_set, control_sample) %>% 
+    group_by(CCLE_name, DepMap_ID, prism_cell_set) %>% 
     dplyr::summarise(control_median_normalized_n = median(sum_normalized_n),
                      control_mad_sqrtN = mad(log10(sum_normalized_n))/sqrt(n())) %>% 
     ungroup() %>% 
     mutate(control_pass_QC = ifelse(control_mad_sqrtN > 0.5, F, T)) %>% 
-    dplyr::select(CCLE_name, DepMap_ID, prism_cell_set, control_sample, control_median_normalized_n, control_mad_sqrtN, control_pass_QC)
+    dplyr::select(CCLE_name, DepMap_ID, prism_cell_set, control_median_normalized_n, control_mad_sqrtN, control_pass_QC)
+  
+  if(nrow(controls)==0) {
+    # carry the reason in the error itself so it reaches stderr together with the failure
+    stop("No samples found for indicated control type.")
+  }
+  
   l2fc = treatments %>% 
-    merge(controls, by=c("CCLE_name", "DepMap_ID", "prism_cell_set", "control_sample"), all.x=T, all.y=T) %>% 
+    merge(controls, by=c("CCLE_name", "DepMap_ID", "prism_cell_set"), all.x=TRUE, all.y=TRUE) %>% 
     mutate(l2fc=log2(sum_normalized_n/control_median_normalized_n)) %>% 
-    dplyr::relocate(project_code, CCLE_name, DepMap_ID, prism_cell_set, sample_ID, trt_type, control_sample, control_barcodes,
-                    bio_rep)
+    dplyr::relocate(project_code, CCLE_name, DepMap_ID, prism_cell_set, profile_id, trt_type, control_barcodes,
+                    bio_rep) 
   
   return(l2fc)
 }
@@ -54,7 +59,7 @@ parser$add_argument("-q", "--quietly", action="store_false",
 parser$add_argument("--wkdir", default=getwd(), help="Working directory")
 parser$add_argument("-c", "--normalized_counts", default="normalized_counts.csv",
                     help="path to file containing normalized counts")
-parser$add_argument("-ct", "--control_types", default="trt_ctrl,negcon", help="trt_types to use as control")
+parser$add_argument("-ct", "--control_type", default="negcon", help="trt_type to use as control")
 parser$add_argument("-o","--out", default="", help = "Output path. Default is working directory")
 
 # get command line options, if help option encountered print help and exit
@@ -64,12 +69,12 @@ if (args$out == ""){
   args$out = args$wkdir
 }
 
-control_types = unlist(strsplit(args$control_types, ","))
+control_type = args$control_type
 
 normalized_counts = read.csv(args$normalized_counts)
 
 print("computing log-fold change")
-l2fc = compute_l2fc(normalized_counts, control_types)
+l2fc = compute_l2fc(normalized_counts, control_type)
 
 l2fc_out = paste(args$out, "l2fc.csv", sep="/")
 write.csv(l2fc, l2fc_out, row.names=F, quote=F)
diff --git a/filter_counts.R b/filter_counts.R
@@ -46,7 +46,7 @@ filter_raw_reads = function(raw_counts, sample_meta, cell_line_meta, cell_set_me
     dplyr::select(-any_of(c("flowcell_name", "flowcell_lane", "index_1", "index_2", "members", 
                           "lysate_well", "lysate_plate", "pcr_well", "pcr_plate",
                           "forward_read_cl_barcode", "LUA"))) %>% 
-    dplyr::relocate(project_code, CCLE_name, DepMap_ID, prism_cell_set, Name, log_dose, profile_id, sample_ID, trt_type, control_sample, control_barcodes,
+    dplyr::relocate(project_code, CCLE_name, DepMap_ID, prism_cell_set, Name, log_dose, profile_id, trt_type, control_barcodes,
                     bio_rep, tech_rep) %>% 
     dplyr::relocate(n, .after=last_col())
   
@@ -86,7 +86,7 @@ parser$add_argument("-s", "--sample_meta", default="", help = "Sample metadata")
 parser$add_argument("--cell_line_meta", default="../metadata/cell_line_meta.csv", help = "Cell Line metadata")
 parser$add_argument("--cell_set_meta", default="../metadata/cell_set_meta.csv", help = "Cell set metadata")
 parser$add_argument("--CB_meta", default="../metadata/CB_meta.csv", help = "Control Barcode metadata")
-parser$add_argument("--id_cols", default="sample_ID,pcr_well,tech_rep", help = "Columns used to generate profile ids, comma-separated colnames from --sample_meta")
+parser$add_argument("--id_cols", default="treatment,dose,dose_unit,day", help = "Columns used to generate profile ids, comma-separated colnames from --sample_meta")
 
 # get command line options, if help option encountered print help and exit
 args <- parser$parse_args()
diff --git a/generate_biomarkers.R b/generate_biomarkers.R
@@ -19,7 +19,7 @@ suppressPackageStartupMessages(library(cdsrbiomarker))
 generate_biomarkers = function(collapsed_values) {
   bio_in = collapsed_values %>% 
     filter(trt_pass_QC) %>% 
-    dcast(DepMap_ID~sample_ID+control_sample, value.var="median_l2fc") %>% 
+    dcast(DepMap_ID~profile_id, value.var="median_l2fc") %>% 
     column_to_rownames("DepMap_ID")
   
   bio_out = cdsrbiomarker::get_biomarkers(bio_in)
diff --git a/replicate_QC.R b/replicate_QC.R
@@ -7,6 +7,7 @@ suppressPackageStartupMessages(library(magrittr))
 suppressPackageStartupMessages(library(tidyr))
 suppressPackageStartupMessages(library(reshape2))
 suppressPackageStartupMessages(library(tibble))
+suppressPackageStartupMessages(library(stringr))
 
 ## check_replicate_cor
 ## checks that technical and biological replicates are all well correlated with each other
@@ -18,7 +19,7 @@ suppressPackageStartupMessages(library(tibble))
 check_replicate_cor = function(normalized_counts, out) {
   tech_rep_cor = normalized_counts %>% 
     filter(is.na(Name)) %>% 
-    dcast(CCLE_name~sample_ID+bio_rep+tech_rep, value.var="log_normalized_n") %>% 
+    dcast(CCLE_name~profile_id+bio_rep+tech_rep, value.var="log_normalized_n") %>% 
     dplyr::select(-CCLE_name) %>% 
     cor(use="complete.obs") %>% as.data.frame() 
   
@@ -27,31 +28,25 @@ check_replicate_cor = function(normalized_counts, out) {
   
   tech_rep_cor_long = tech_rep_cor %>% 
     rownames_to_column("sample_1") %>% 
-    melt(id.vars="sample_1", variable.name="sample_2", value.name="cor") %>% 
-    mutate(sample_ID_1 = as.character(sample_1) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 1) %>% unlist(),
-           sample_ID_2 = as.character(sample_2) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 1) %>% unlist()) %>% 
-    filter(sample_ID_1 == sample_ID_2) %>% 
-    mutate(bio_rep_1 = as.character(sample_1) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 2) %>% unlist(),
-           bio_rep_2 = as.character(sample_2) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 2) %>% unlist()) %>% 
-    filter(bio_rep_1 == bio_rep_2) %>%
-    mutate(tech_rep_1 = as.character(sample_1) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 3) %>% unlist(),
-           tech_rep_2 = as.character(sample_2) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 3) %>% unlist()) %>%
-    filter(tech_rep_2>tech_rep_1) %>% 
-    dplyr::rename(sample_ID = sample_ID_1, bio_rep = bio_rep_1) %>% 
-    dcast(sample_ID+bio_rep~tech_rep_1+tech_rep_2, value.var="cor")
+    melt(id.vars="sample_1", variable.name="sample_2", value.name="tech_rep_cor") %>% 
+    mutate(sample_1 = gsub('.{2}$', '', sample_1),
+           sample_2 = gsub('.{2}$', '', sample_2)) %>% 
+    filter(sample_1 == sample_2) %>% 
+    dplyr::rename(profile_id = sample_1) %>% 
+    dplyr::select(profile_id, tech_rep_cor)
   
   trep_long_out = paste(args$out, "tech_rep_cor_long.csv", sep='/')
-  write.csv(tech_rep_cor_long, trep_long_out, row.names=T, quote=F)
+  write.csv(tech_rep_cor_long, trep_long_out, row.names=F, quote=F)
   
   tech_collapsed_counts = normalized_counts %>% 
     filter(is.na(Name)) %>%  
-    dplyr::select(-Name, -log_dose, -n, -log_n, -log_normalized_n, -profile_id) %>% 
+    dplyr::select(-Name, -log_dose, -n, -log_n, -log_normalized_n) %>% 
     group_by_at(setdiff(names(.), c("normalized_n", "tech_rep"))) %>% 
     dplyr::summarise(sum_normalized_n = sum(normalized_n)) %>% 
     ungroup()
   
   bio_rep_cor = tech_collapsed_counts %>% 
-    dcast(CCLE_name~sample_ID+bio_rep, value.var="sum_normalized_n") %>% 
+    dcast(CCLE_name~profile_id+bio_rep, value.var="sum_normalized_n") %>% 
     dplyr::select(-CCLE_name) %>% 
     cor(use="complete.obs") %>% 
     as.data.frame()
@@ -62,17 +57,17 @@ check_replicate_cor = function(normalized_counts, out) {
   bio_rep_cor_long = bio_rep_cor %>% 
     rownames_to_column("sample_1") %>% 
     melt(id.vars="sample_1", variable.name="sample_2", value.name="cor") %>% 
-    mutate(sample_ID_1 = as.character(sample_1) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 1) %>% unlist(),
-           sample_ID_2 = as.character(sample_2) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 1) %>% unlist()) %>% 
+    mutate(sample_ID_1 = gsub('.{2}$', '', sample_1),
+           sample_ID_2 = gsub('.{2}$', '', sample_2)) %>% 
     filter(sample_ID_1 == sample_ID_2) %>% 
-    mutate(bio_rep_1 = as.character(sample_1) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 2) %>% unlist(),
-           bio_rep_2 = as.character(sample_2) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 2) %>% unlist()) %>% 
+    mutate(bio_rep_1 = as.character(sample_1) %>% purrr::map(str_sub, -1, -1) %>% unlist(),
+           bio_rep_2 = as.character(sample_2) %>% purrr::map(str_sub, -1, -1) %>% unlist()) %>% 
     filter(bio_rep_2>bio_rep_1) %>% 
-    dplyr::rename(sample_ID = sample_ID_1) %>% 
-    dcast(sample_ID~bio_rep_1+bio_rep_2, value.var="cor")
+    dplyr::rename(profile_id = sample_ID_1) %>% 
+    dcast(profile_id~bio_rep_1+bio_rep_2, value.var="cor")
   
   brep_long_out = paste(args$out, "bio_rep_cor_long.csv", sep='/')
-  write.csv(bio_rep_cor_long, brep_long_out, row.names=T, quote=F)
+  write.csv(bio_rep_cor_long, brep_long_out, row.names=F, quote=F)
 }