Skip to content

Commit 1f8701e

Browse files
authored
Merge pull request #4 from cmap/dev
Merge dev into main
2 parents 07228da + e5bcb04 commit 1f8701e

6 files changed

+127
-45
lines changed

CBnormalize.R

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ suppressPackageStartupMessages(library(reshape2))
1818
## barcodes - a vector of control barcode Name identifiers
1919
normalize <- function(X, barcodes) {
2020
normalized <- X %>%
21-
dplyr::group_by(sample_ID) %>%
21+
dplyr::group_by(profile_id) %>%
2222
dplyr::mutate(log_normalized_n = glm(y ~ x,
2323
data = tibble(
2424
y = log_dose[Name %in% barcodes],
@@ -37,15 +37,12 @@ parser$add_argument("-v", "--verbose", action="store_true", default=TRUE,
3737
parser$add_argument("-q", "--quietly", action="store_false",
3838
dest="verbose", help="Print little output")
3939

40-
4140
parser$add_argument("--wkdir", default=getwd(), help="Working directory")
4241
parser$add_argument("-c", "--filtered_counts", default="filtered_counts.csv",
4342
help="path to file containing filtered counts")
4443
parser$add_argument("--CB_meta", default="../metadata/CB_meta.csv", help = "Control Barcode metadata")
4544
parser$add_argument("-o", "--out", default="", help = "Output path. Default is working directory")
4645

47-
48-
4946
# get command line options, if help option encountered print help and exit
5047
args <- parser$parse_args()
5148

bcl2fastq.sh

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
#!/bin/bash
2+
source /broad/software/scripts/useuse
3+
reuse .bcl2fastq2-v2.20.0 > /dev/null
4+
reuse .bcl2fastq2-2.20.0.422 > /dev/null
5+
6+
OUT_DIR=/xchip/prism/bcl2fastq/
7+
PSEQ_obelix=/cmap/obelix/pod/prismSeq/
8+
9+
#optional
10+
if test $# -lt 1; then
11+
printf "Usage ./bcl2fastq.sh [options]\nOptions include:\n"
12+
printf -- "-s, --seq_code \t\t Sequencer run code E.g. JNV9V \n"
13+
printf -- "-p, --proj_code \t Project code to prep project directory in /cmap/obelix/pod/prismSeq/ \n"
14+
printf -- "-b, --build_dir \t Build directory, usually on /cmap/obelix/. Overrides PROJ_CODE\n"
15+
printf -- "-o, --out_dir \t\t Path to temp storage of fastq files on /xchip/prism/ \n"
16+
printf -- "-h, --help \t\t Print this help text\n"
17+
exit 1
18+
fi
19+
20+
while test $# -gt 0; do
21+
case "$1" in
22+
-h|--help)
23+
printf "Usage ./bcl2fastq.sh [options]\nOptions include:\n"
24+
printf -- "-s, --seq_code \t\t Sequencer run code E.g. JNV9V \n"
25+
printf -- "-p, --proj_code \t Project code to prep project directory in /cmap/obelix/pod/prismSeq/ \n"
26+
printf -- "-b, --build_dir \t Build directory, usually on /cmap/obelix/. Overrides PROJ_CODE\n"
27+
printf -- "-o, --out_dir \t\t Path to temp storage of fastq files on /xchip/prism/ \n"
28+
printf -- "-h, --help \t\t Print this help text\n"
29+
exit 0
30+
;;
31+
-s|--seq_code)
32+
shift
33+
SEQ_CODE=$1
34+
;;
35+
-b|--build_dir)
36+
shift
37+
#echo $1
38+
BUILD_DIR=$1
39+
;;
40+
-p|--proj_code)
41+
shift
42+
#echo $1
43+
PROJ_CODE=$1
44+
;;
45+
-o|--out_dir)
46+
shift
47+
#echo $1
48+
OUT_DIR=$1
49+
;;
50+
*)
51+
printf "Unknown parameter: %s \n" "$1"
52+
shift
53+
;;
54+
esac
55+
shift
56+
done
57+
58+
if [ ! -d $OUT_DIR ]
59+
then
60+
mkdir $OUT_DIR
61+
fi
62+
63+
RUNFOLDER_DIR=$(echo /xchip/prism/MiSeq\ Outputs/*-$SEQ_CODE)
64+
65+
echo $RUNFOLDER_DIR
66+
67+
bcl2fastq --runfolder-dir "$RUNFOLDER_DIR" --output-dir $OUT_DIR/$SEQ_CODE --minimum-trimmed-read-length 35 --mask-short-adapter-reads 22 --create-fastq-for-index-reads
68+
69+
if [ -z $BUILD_DIR ]
70+
then
71+
BUILD_DIR=$PSEQ_obelix/$PROJ_CODE
72+
fi
73+
74+
if [ ! -d $BUILD_DIR ]
75+
then
76+
mkdir $BUILD_DIR
77+
fi
78+
79+
if [ ! -d $BUILD_DIR/fastq/ ]
80+
then
81+
mkdir $BUILD_DIR/fastq/
82+
fi
83+
84+
echo Copying fastq files from $OUT_DIR/$SEQ_CODE/ to $BUILD_DIR/fastq/
85+
cp $OUT_DIR/$SEQ_CODE/*.fastq.gz $BUILD_DIR/fastq/

compute_l2fc.R

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,34 +12,39 @@ suppressPackageStartupMessages(library(dplyr)) #n()
1212
## takes:
1313
## normalized_counts - table with normalized_n column and control_sample column that designates the
1414
## name of the control sample for each treatment sample
15-
compute_l2fc = function(normalized_counts, control_types) {
15+
compute_l2fc = function(normalized_counts, control_type) {
1616
treatments = normalized_counts %>%
17-
filter(!(trt_type %in% control_types),
17+
filter(trt_type!=control_type, trt_type!="day_0",
1818
is.na(Name)) %>%
1919
dplyr::select(-Name, -log_dose, -n, -log_n, -log_normalized_n) %>%
20-
group_by_at(setdiff(names(.), c("normalized_n", "tech_rep", "profile_id"))) %>%
20+
group_by_at(setdiff(names(.), c("normalized_n", "tech_rep"))) %>%
2121
dplyr::summarise(sum_normalized_n = sum(normalized_n)) %>%
2222
ungroup()
23+
2324
controls = normalized_counts %>%
24-
filter(trt_type %in% control_types,
25+
filter(trt_type==control_type,
2526
is.na(Name)) %>%
26-
mutate(control_sample=sample_ID) %>%
2727
dplyr::select(-Name, -log_dose, -n, -log_n, -log_normalized_n) %>%
28-
group_by_at(setdiff(names(.), c("normalized_n", "tech_rep", "profile_id"))) %>%
28+
group_by_at(setdiff(names(.), c("normalized_n", "tech_rep"))) %>%
2929
dplyr::summarise(sum_normalized_n = sum(normalized_n)) %>%
3030
ungroup() %>%
31-
#group_by_at(setdiff(names(.), c("sum_normalized_n", "bio_rep"))) %>%
32-
group_by(CCLE_name, DepMap_ID, prism_cell_set, control_sample) %>%
31+
group_by(CCLE_name, DepMap_ID, prism_cell_set) %>%
3332
dplyr::summarise(control_median_normalized_n = median(sum_normalized_n),
3433
control_mad_sqrtN = mad(log10(sum_normalized_n))/sqrt(n())) %>%
3534
ungroup() %>%
3635
mutate(control_pass_QC = ifelse(control_mad_sqrtN > 0.5, F, T)) %>%
37-
dplyr::select(CCLE_name, DepMap_ID, prism_cell_set, control_sample, control_median_normalized_n, control_mad_sqrtN, control_pass_QC)
36+
dplyr::select(CCLE_name, DepMap_ID, prism_cell_set, control_median_normalized_n, control_mad_sqrtN, control_pass_QC)
37+
38+
if(nrow(controls)==0) {
39+
print("No samples found for indicated control type.")
40+
stop()
41+
}
42+
3843
l2fc = treatments %>%
39-
merge(controls, by=c("CCLE_name", "DepMap_ID", "prism_cell_set", "control_sample"), all.x=T, all.y=T) %>%
44+
merge(controls, by=c("CCLE_name", "DepMap_ID", "prism_cell_set"), all.x=T, all.y=T) %>%
4045
mutate(l2fc=log2(sum_normalized_n/control_median_normalized_n)) %>%
41-
dplyr::relocate(project_code, CCLE_name, DepMap_ID, prism_cell_set, sample_ID, trt_type, control_sample, control_barcodes,
42-
bio_rep)
46+
dplyr::relocate(project_code, CCLE_name, DepMap_ID, prism_cell_set, profile_id, trt_type, control_barcodes,
47+
bio_rep)
4348

4449
return(l2fc)
4550
}
@@ -54,7 +59,7 @@ parser$add_argument("-q", "--quietly", action="store_false",
5459
parser$add_argument("--wkdir", default=getwd(), help="Working directory")
5560
parser$add_argument("-c", "--normalized_counts", default="normalized_counts.csv",
5661
help="path to file containing normalized counts")
57-
parser$add_argument("-ct", "--control_types", default="trt_ctrl,negcon", help="trt_types to use as control")
62+
parser$add_argument("-ct", "--control_type", default="negcon", help="trt_type to use as control")
5863
parser$add_argument("-o","--out", default="", help = "Output path. Default is working directory")
5964

6065
# get command line options, if help option encountered print help and exit
@@ -64,12 +69,12 @@ if (args$out == ""){
6469
args$out = args$wkdir
6570
}
6671

67-
control_types = unlist(strsplit(args$control_types, ","))
72+
control_type = args$control_type
6873

6974
normalized_counts = read.csv(args$normalized_counts)
7075

7176
print("computing log-fold change")
72-
l2fc = compute_l2fc(normalized_counts, control_types)
77+
l2fc = compute_l2fc(normalized_counts, control_type)
7378

7479
l2fc_out = paste(args$out, "l2fc.csv", sep="/")
7580
write.csv(l2fc, l2fc_out, row.names=F, quote=F)

filter_counts.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ filter_raw_reads = function(raw_counts, sample_meta, cell_line_meta, cell_set_me
4646
dplyr::select(-any_of(c("flowcell_name", "flowcell_lane", "index_1", "index_2", "members",
4747
"lysate_well", "lysate_plate", "pcr_well", "pcr_plate",
4848
"forward_read_cl_barcode", "LUA"))) %>%
49-
dplyr::relocate(project_code, CCLE_name, DepMap_ID, prism_cell_set, Name, log_dose, profile_id, sample_ID, trt_type, control_sample, control_barcodes,
49+
dplyr::relocate(project_code, CCLE_name, DepMap_ID, prism_cell_set, Name, log_dose, profile_id, trt_type, control_barcodes,
5050
bio_rep, tech_rep) %>%
5151
dplyr::relocate(n, .after=last_col())
5252

@@ -86,7 +86,7 @@ parser$add_argument("-s", "--sample_meta", default="", help = "Sample metadata")
8686
parser$add_argument("--cell_line_meta", default="../metadata/cell_line_meta.csv", help = "Cell Line metadata")
8787
parser$add_argument("--cell_set_meta", default="../metadata/cell_set_meta.csv", help = "Cell set metadata")
8888
parser$add_argument("--CB_meta", default="../metadata/CB_meta.csv", help = "Control Barcode metadata")
89-
parser$add_argument("--id_cols", default="sample_ID,pcr_well,tech_rep", help = "Columns used to generate profile ids, comma-separated colnames from --sample_meta")
89+
parser$add_argument("--id_cols", default="treatment,dose,dose_unit,day", help = "Columns used to generate profile ids, comma-separated colnames from --sample_meta")
9090

9191
# get command line options, if help option encountered print help and exit
9292
args <- parser$parse_args()

generate_biomarkers.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ suppressPackageStartupMessages(library(cdsrbiomarker))
1919
generate_biomarkers = function(collapsed_values) {
2020
bio_in = collapsed_values %>%
2121
filter(trt_pass_QC) %>%
22-
dcast(DepMap_ID~sample_ID+control_sample, value.var="median_l2fc") %>%
22+
dcast(DepMap_ID~profile_id, value.var="median_l2fc") %>%
2323
column_to_rownames("DepMap_ID")
2424

2525
bio_out = cdsrbiomarker::get_biomarkers(bio_in)

replicate_QC.R

Lines changed: 18 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ suppressPackageStartupMessages(library(magrittr))
77
suppressPackageStartupMessages(library(tidyr))
88
suppressPackageStartupMessages(library(reshape2))
99
suppressPackageStartupMessages(library(tibble))
10+
suppressPackageStartupMessages(library(stringr))
1011

1112
## check_replicate_cor
1213
## checks that technical and biological replicates are all well correlated with each other
@@ -18,7 +19,7 @@ suppressPackageStartupMessages(library(tibble))
1819
check_replicate_cor = function(normalized_counts, out) {
1920
tech_rep_cor = normalized_counts %>%
2021
filter(is.na(Name)) %>%
21-
dcast(CCLE_name~sample_ID+bio_rep+tech_rep, value.var="log_normalized_n") %>%
22+
dcast(CCLE_name~profile_id+bio_rep+tech_rep, value.var="log_normalized_n") %>%
2223
dplyr::select(-CCLE_name) %>%
2324
cor(use="complete.obs") %>% as.data.frame()
2425

@@ -27,31 +28,25 @@ check_replicate_cor = function(normalized_counts, out) {
2728

2829
tech_rep_cor_long = tech_rep_cor %>%
2930
rownames_to_column("sample_1") %>%
30-
melt(id.vars="sample_1", variable.name="sample_2", value.name="cor") %>%
31-
mutate(sample_ID_1 = as.character(sample_1) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 1) %>% unlist(),
32-
sample_ID_2 = as.character(sample_2) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 1) %>% unlist()) %>%
33-
filter(sample_ID_1 == sample_ID_2) %>%
34-
mutate(bio_rep_1 = as.character(sample_1) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 2) %>% unlist(),
35-
bio_rep_2 = as.character(sample_2) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 2) %>% unlist()) %>%
36-
filter(bio_rep_1 == bio_rep_2) %>%
37-
mutate(tech_rep_1 = as.character(sample_1) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 3) %>% unlist(),
38-
tech_rep_2 = as.character(sample_2) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 3) %>% unlist()) %>%
39-
filter(tech_rep_2>tech_rep_1) %>%
40-
dplyr::rename(sample_ID = sample_ID_1, bio_rep = bio_rep_1) %>%
41-
dcast(sample_ID+bio_rep~tech_rep_1+tech_rep_2, value.var="cor")
31+
melt(id.vars="sample_1", variable.name="sample_2", value.name="tech_rep_cor") %>%
32+
mutate(sample_1 = gsub('.{2}$', '', sample_1),
33+
sample_2 = gsub('.{2}$', '', sample_2)) %>%
34+
filter(sample_1 == sample_2) %>%
35+
dplyr::rename(profile_id = sample_1) %>%
36+
dplyr::select(profile_id, tech_rep_cor)
4237

4338
trep_long_out = paste(args$out, "tech_rep_cor_long.csv", sep='/')
44-
write.csv(tech_rep_cor_long, trep_long_out, row.names=T, quote=F)
39+
write.csv(tech_rep_cor_long, trep_long_out, row.names=F, quote=F)
4540

4641
tech_collapsed_counts = normalized_counts %>%
4742
filter(is.na(Name)) %>%
48-
dplyr::select(-Name, -log_dose, -n, -log_n, -log_normalized_n, -profile_id) %>%
43+
dplyr::select(-Name, -log_dose, -n, -log_n, -log_normalized_n) %>%
4944
group_by_at(setdiff(names(.), c("normalized_n", "tech_rep"))) %>%
5045
dplyr::summarise(sum_normalized_n = sum(normalized_n)) %>%
5146
ungroup()
5247

5348
bio_rep_cor = tech_collapsed_counts %>%
54-
dcast(CCLE_name~sample_ID+bio_rep, value.var="sum_normalized_n") %>%
49+
dcast(CCLE_name~profile_id+bio_rep, value.var="sum_normalized_n") %>%
5550
dplyr::select(-CCLE_name) %>%
5651
cor(use="complete.obs") %>%
5752
as.data.frame()
@@ -62,17 +57,17 @@ check_replicate_cor = function(normalized_counts, out) {
6257
bio_rep_cor_long = bio_rep_cor %>%
6358
rownames_to_column("sample_1") %>%
6459
melt(id.vars="sample_1", variable.name="sample_2", value.name="cor") %>%
65-
mutate(sample_ID_1 = as.character(sample_1) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 1) %>% unlist(),
66-
sample_ID_2 = as.character(sample_2) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 1) %>% unlist()) %>%
60+
mutate(sample_ID_1 = gsub('.{2}$', '', sample_1),
61+
sample_ID_2 = gsub('.{2}$', '', sample_2)) %>%
6762
filter(sample_ID_1 == sample_ID_2) %>%
68-
mutate(bio_rep_1 = as.character(sample_1) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 2) %>% unlist(),
69-
bio_rep_2 = as.character(sample_2) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 2) %>% unlist()) %>%
63+
mutate(bio_rep_1 = as.character(sample_1) %>% purrr::map(str_sub, -1, -1) %>% unlist(),
64+
bio_rep_2 = as.character(sample_2) %>% purrr::map(str_sub, -1, -1) %>% unlist()) %>%
7065
filter(bio_rep_2>bio_rep_1) %>%
71-
dplyr::rename(sample_ID = sample_ID_1) %>%
72-
dcast(sample_ID~bio_rep_1+bio_rep_2, value.var="cor")
66+
dplyr::rename(profile_id = sample_ID_1) %>%
67+
dcast(profile_id~bio_rep_1+bio_rep_2, value.var="cor")
7368

7469
brep_long_out = paste(args$out, "bio_rep_cor_long.csv", sep='/')
75-
write.csv(bio_rep_cor_long, brep_long_out, row.names=T, quote=F)
70+
write.csv(bio_rep_cor_long, brep_long_out, row.names=F, quote=F)
7671
}
7772

7873

0 commit comments

Comments
 (0)