From a02c35046802557322a71b1a9077b9c9ffc91a46 Mon Sep 17 00:00:00 2001 From: Elena Buscaroli Date: Wed, 29 Jan 2025 16:15:35 +0100 Subject: [PATCH 1/8] removing obsolete modules.config --- .nf-core.yml | 4 ++++ conf/modules.config | 34 ---------------------------------- 2 files changed, 4 insertions(+), 34 deletions(-) delete mode 100644 conf/modules.config diff --git a/.nf-core.yml b/.nf-core.yml index d3220f5..f21e986 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -13,3 +13,7 @@ template: org: nf-core outdir: . version: 1.0dev +lint: + modules_config: False + files_exist: + - conf/modules.config diff --git a/conf/modules.config b/conf/modules.config deleted file mode 100644 index d203d2b..0000000 --- a/conf/modules.config +++ /dev/null @@ -1,34 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Config file for defining DSL2 per module options and publishing paths -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Available keys to override module options: - ext.args = Additional arguments appended to command in module. - ext.args2 = Second set of arguments appended to command in module (multi-tool modules). - ext.args3 = Third set of arguments appended to command in module (multi-tool modules). - ext.prefix = File name prefix for output files. ----------------------------------------------------------------------------------------- -*/ - -process { - - publishDir = [ - path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - - withName: FASTQC { - ext.args = '--quiet' - } - - withName: 'MULTIQC' { - ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } - publishDir = [ - path: { "${params.outdir}/multiqc" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - -} From c76acb9b4cb8338bc7f93528bae4d0e163df8655 Mon Sep 17 00:00:00 2001 From: Elena Buscaroli Date: Wed, 29 Jan 2025 16:15:49 +0100 Subject: [PATCH 2/8] updating test.config params --- conf/test.config | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/test.config b/conf/test.config index bb94a8e..4b2e5b1 100644 --- a/conf/test.config +++ b/conf/test.config @@ -43,10 +43,10 @@ params { sparsesignatures_K = "2:3" sparsesignatures_nmf_runs = "3" - sparsesignatures_iterations = "10" + sparsesignatures_iterations = "5" sparsesignatures_max_iterations_lasso = "80" - sparsesignatures_cross_validation_repetitions = "8" - sparsesignatures_cross_validation_iterations = "5" + sparsesignatures_cross_validation_repetitions = "5" + sparsesignatures_cross_validation_iterations = "3" sparsesignatures_lambda_values_alpha = "c(0.00, 0.01)" sparsesignatures_lambda_values_beta = "c(0.01, 0.05)" From 0a42b17710a696a373ed3fec1dda1a807cd00d1e Mon Sep 17 00:00:00 2001 From: Elena Buscaroli Date: Wed, 29 Jan 2025 16:29:43 +0100 Subject: [PATCH 3/8] update modules.json removing old nf-core modules --- modules.json | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/modules.json b/modules.json index 05615ab..011365e 100644 --- a/modules.json +++ b/modules.json @@ -23,16 +23,6 @@ "installed_by": ["modules", "vcf_annotate_ensemblvep"], "patch": "modules/nf-core/ensemblvep/vep/ensemblvep-vep.diff" }, - "fastqc": { - "branch": "master", - "git_sha": "08108058ea36a63f141c25c4e75f9f872a5b2296", - "installed_by": ["modules"] - }, - "multiqc": { - "branch": "master", - "git_sha": "f0719ae309075ae4a291533883847c3f7c441dad", - "installed_by": ["modules"] - }, "tabix/tabix": { "branch": "master", "git_sha": "9502adb23c0b97ed8e616bbbdfa73b4585aec9a1", From eb273f6fd6dbb275db9780a76d7ea01809b9423c Mon Sep 17 00:00:00 2001 From: Elena Buscaroli Date: Wed, 29 Jan 2025 16:30:09 +0100 Subject: [PATCH 4/8] adding templates --- .../local/CNAqc2tsv/templates/main_script.R | 94 ++++ .../SparseSignatures/templates/main_script.R | 248 ++++++++++ .../local/cna2CNAqc/templates/main_script.R | 63 +++ .../local/mobsterh/templates/main_script.R | 199 ++++++++ .../local/pyclonevi/templates/main_script.py | 132 +++++ .../local/vcf2CNAqc/templates/main_script.R | 454 ++++++++++++++++++ modules/local/viber/templates/main_script.R | 236 +++++++++ 7 files changed, 1426 insertions(+) create mode 100644 modules/local/CNAqc2tsv/templates/main_script.R create mode 100644 modules/local/SparseSignatures/templates/main_script.R create mode 100644 modules/local/cna2CNAqc/templates/main_script.R create mode 100644 modules/local/mobsterh/templates/main_script.R create mode 100644 modules/local/pyclonevi/templates/main_script.py create mode 100644 modules/local/vcf2CNAqc/templates/main_script.R create mode 100644 modules/local/viber/templates/main_script.R diff --git a/modules/local/CNAqc2tsv/templates/main_script.R b/modules/local/CNAqc2tsv/templates/main_script.R new file mode 100644 index 0000000..75ff610 --- /dev/null +++ b/modules/local/CNAqc2tsv/templates/main_script.R @@ -0,0 +1,94 @@ +#!/usr/bin/env Rscript + +opt = list( + prefix = ifelse('$task.ext.prefix' == 'null', '$meta.id', '$task.ext.prefix') +) + +# Auxiliary functions ##### + +get_sample = function(m_cnaqc_obj, sample, which_obj) { + if (class(m_cnaqc_obj) != "m_cnaqc") { + wrong_class_all = class(m_cnaqc_obj) + + cli::cli_abort( + c("cnaqc_objs must be a {.field m_cnaqc} object", + "x" = "{.var m_cnaqc_obj} is a {.cls {class(m_cnaqc_obj)}}") + ) + } + + consented_obj = c("shared", "original") + if ((which_obj %in% consented_obj) == FALSE) { + cli::cli_abort("{.var which_obj} must be one of {.val shared} or {.val original}") + } + + # define the element names + if (which_obj == "original") { + type = "original_cnaqc_objc" + # check if the original cnaqc obj exist + check_or = any(names(m_cnaqc_obj) == type) + if (check_or == FALSE) { + cli::cli_abort(c("mCNAqc object was build without keeping original CNAqc objects"), + "x" = "It is not possible to retrieve the required samples") + } else { + cli::cli_h1("Retrieving original {.cls CNAqc} objects") + cnaqc_samples = m_cnaqc_obj[[type]][sample] + } + + } else { + type = "cnaqc_obj_new_segmentation" + cli::cli_h1("Retrieving {.cls CNAqc} objects with the new segmentation") + cnaqc_samples = m_cnaqc_obj[[type]][sample] + } + return(cnaqc_samples) +} + +get_sample_name = function(x) { + if (class(x) == "m_cnaqc") { + lapply(x[["cnaqc_obj_new_segmentation"]], function(y) { + y[["sample"]] + }) %>% unlist() %>% unname() + } else if (class(x) == "cnaqc") { + x[["sample"]] + } else { + wrong_class_all = class(x) + cli::cli_abort( + c("must provide a {.field m_cnaqc} object", + "x" = "{.var x} is a {.cls {class(x)}}") + ) + } +} + +get_mCNAqc_stats = function(m_cnaqc_obj){ + stats = m_cnaqc_obj[["m_cnaqc_stats"]] + return(stats) +} + +# Script #### + +library(dplyr) +library(CNAqc) + +multi_cnaqc = readRDS(file = "$rds_join") +mutations_multisample = get_sample(m_cnaqc_obj = multi_cnaqc,sample = get_sample_name(multi_cnaqc), + which_obj = "original") +multisample_jointTable = list() + +for (s in get_sample_name(multi_cnaqc)){ + purity = mutations_multisample[[s]][["purity"]] + multisample_jointTable[[s]] = mutations_multisample[[s]][["mutations"]] %>% + dplyr::mutate(purity=purity) +} + +joint_table = bind_rows(multisample_jointTable) + +write.table(joint_table, file = paste0(opt[["prefix"]], "_joint_table.tsv"), append = F, quote = F, sep = "\t", row.names = FALSE) + +# version export +f = file("versions.yml","w") +dplyr_version = sessionInfo()\$otherPkgs\$dplyr\$Version +cnaqc_version = sessionInfo()\$otherPkgs\$CNAqc\$Version +writeLines(paste0('"', "$task.process", '"', ":"), f) +writeLines(paste(" dplyr:", dplyr_version), f) +writeLines(paste(" CNAqc:", cnaqc_version), f) +close(f) + diff --git a/modules/local/SparseSignatures/templates/main_script.R b/modules/local/SparseSignatures/templates/main_script.R new file mode 100644 index 0000000..a0f1c90 --- /dev/null +++ b/modules/local/SparseSignatures/templates/main_script.R @@ -0,0 +1,248 @@ +#!/usr/bin/env Rscript + +parse_args = function(x) { + x = gsub("\\\\[","",x) + x = gsub("\\\\]","",x) + # giving errors when we have lists like c(xxx, xxx) since it will separate it + # args_list = unlist(strsplit(x, ', ')[[1]]) + args_list = unlist(strsplit(x, ", (?=[^)]*(?:\\\\(|\$))", perl=TRUE)) + # args_vals = lapply(args_list, function(x) strsplit(x, split=":")[[1]]) + args_vals = lapply(args_list, function(x) { + x_splt = strsplit(x, split=":")[[1]] + c(x_splt[1], paste(x_splt[2:length(x_splt)], collapse=":")) + }) + + # Ensure the option vectors are length 2 (key/ value) to catch empty ones + args_vals = lapply(args_vals, function(z){ length(z) = 2; z}) + + parsed_args = structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) x[1])) + parsed_args[! is.na(parsed_args)] +} + +opt = list( + prefix = ifelse('$task.ext.prefix' == 'null', '$meta.id', '$task.ext.prefix') +) +args_opt = parse_args('$task.ext.args') +for ( ao in names(args_opt)) opt[[ao]] = args_opt[[ao]] + +# Auxiliary functions ##### +get_sample = function(m_cnaqc_obj, sample, which_obj) { + if (class(m_cnaqc_obj) != "m_cnaqc") { + wrong_class_all = class(m_cnaqc_obj) + cli::cli_abort( + c("cnaqc_objs must be a {.field m_cnaqc} object", + "x" = "{.var m_cnaqc_obj} is a {.cls {class(m_cnaqc_obj)}}") + ) + } + + consented_obj = c("shared", "original") + if ((which_obj %in% consented_obj) == FALSE) { + cli::cli_abort("{.var which_obj} must be one of {.val shared} or {.val original}") + } + + # define the element names + if (which_obj == "original") { + type = "original_cnaqc_objc" + # check if the original cnaqc obj exist + check_or = any(names(m_cnaqc_obj) == type) + if (check_or == FALSE) { + cli::cli_abort(c("mCNAqc object was build without keeping original CNAqc objects"), + "x" = "It is not possible to retrieve the required samples") + } else { + cli::cli_h1("Retrieving original {.cls CNAqc} objects") + cnaqc_samples = m_cnaqc_obj[[type]][sample] + } + } else { + type = "cnaqc_obj_new_segmentation" + cli::cli_h1("Retrieving {.cls CNAqc} objects with the new segmentation") + cnaqc_samples = m_cnaqc_obj[[type]][sample] + } + return(cnaqc_samples) +} + +get_sample_name =function(x) { + if (class(x) == "m_cnaqc") { + lapply(x[["cnaqc_obj_new_segmentation"]], function(y) { + y[["sample"]] + }) %>% unlist() %>% unname() + } else if (class(x) == "cnaqc") { + x[["sample"]] + } else { + wrong_class_all = class(x) + cli::cli_abort( + c("must provide a {.field m_cnaqc} object", + "x" = "{.var x} is a {.cls {class(x)}}") + ) + } +} + +get_mCNAqc_stats = function(m_cnaqc_obj){ + stats = m_cnaqc_obj[["m_cnaqc_stats"]] + return(stats) +} + +# Script ##### + +library(SparseSignatures) +library(ggplot2) +library(stringr) +library(patchwork) +library(dplyr) + +n_procs = parse(text=opt[["num_processes"]]) +if (n_procs == "all"){ + n_procs = as.double("Inf") +} else { + n_procs = eval(n_procs) +} + + +patients_tsv = strsplit("$tsv_join", " ")[[1]] +tables = lapply(patients_tsv, FUN = function(p_table){ + read.delim(p_table, sep = "\\t", header=T) %>% + mutate(across(everything(), as.character)) +} +) +multisample_table = dplyr::bind_rows(tables) + + +#Extract input data information +input_data = multisample_table[,c("Indiv","chr","from","to","ref","alt")] +input_data = setNames(input_data, c("sample","chrom","start","end","ref","alt")) +input_data[["end"]] = input_data[["start"]] +input_data = input_data %>% mutate(start = as.integer(start), end = as.integer(end)) + +#Generate the patient vs mutation count matrix from mutation data +#Load reference human-genome specification. +#The user must select, among the available choices, the reference genome consistent with the mutation dataset. + +load_genome = function(genome, input_data) { + if (genome == "GRCh37") { + library(BSgenome.Hsapiens.1000genomes.hs37d5) + bsg = BSgenome.Hsapiens.1000genomes.hs37d5::hs37d5 + input_data[["chrom"]] = substring(input_data[["chrom"]], 4, 5) + + } else if (genome == "GRCh38") { + library(BSgenome.Hsapiens.UCSC.hg38) + bsg = BSgenome.Hsapiens.UCSC.hg38 + + # Leave 'chrom' unchanged for GRCh38 + } + return(list(bsg = bsg, input_data = input_data)) +} + +data_list = load_genome("$params.genome", input_data) +bsg = data_list[["bsg"]] +input_data = data_list[["input_data"]] + +mut_counts = SparseSignatures::import.trinucleotides.counts(data=input_data, reference=bsg) + +# Load a reference SBS5 background signature from COSMIC +data(background) + +# Estimate the initial values of beta +starting_betas = SparseSignatures::startingBetaEstimation(x = mut_counts, + K = eval(parse(text=opt[["K"]])), + background_signature = background, + nmf_runs = as.integer(opt[["nmf_runs"]])) + +# Find the optimal number of signatures and sparsity level: rely on cross-validation +# higher number of CV repetitions corresponds to more accurate parameter estimates + +cv_out = SparseSignatures::nmfLassoCV( + x = mut_counts, + K = eval(parse(text=opt[["K"]])), + starting_beta = starting_betas, + background_signature = background, + normalize_counts = as.logical(opt[["normalize_counts"]]), + nmf_runs = as.integer(opt[["nmf_runs"]]), + lambda_values_alpha = eval(parse(text=opt[["lambda_values_alpha"]])), + lambda_values_beta = eval(parse(text=opt[["lambda_values_beta"]])), + cross_validation_entries = as.numeric(opt[["cross_validation_entries"]]), + cross_validation_iterations = as.integer(opt[["cross_validation_iterations"]]), + cross_validation_repetitions = as.integer(opt[["cross_validation_repetitions"]]), + iterations = as.integer(opt[["iterations"]]), + max_iterations_lasso = as.integer(opt[["max_iterations_lasso"]]), + num_processes = n_procs, + verbose = as.logical(opt[["verbose"]]), + seed = as.integer(opt[["seed"]]) +) + +# Analyze the mean squared error results averaging over cross-validation repetitions +cv_mses = cv_out[["grid_search_mse"]][1,,] +cv_means_mse = matrix(sapply(cv_mses, FUN = mean), + nrow = dim(cv_mses)[1] +) + +dimnames(cv_means_mse) = dimnames(cv_mses) + +# Find the combination of parameters that yields the lowest MSE + +print("CV MEAN MSE") +print(cv_means_mse) + +print("MIN MEANS MSE") +print(min(cv_means_mse, na.rm = TRUE)) + +min_ii = which(cv_means_mse == min(cv_means_mse, na.rm = TRUE), arr.ind = TRUE) +min_Lambda_beta = rownames(cv_means_mse)[min_ii[1]] +min_Lambda_beta = as.numeric(gsub("_Lambda_Beta", "", min_Lambda_beta)) +min_K = colnames(cv_means_mse)[min_ii[2]] +min_K = as.numeric(gsub("_Signatures", "", min_K)) +best_params_config = data.frame(min_K, min_Lambda_beta) + +saveRDS(object = cv_means_mse, file = paste0(opt[["prefix"]], "_cv_means_mse.rds")) +saveRDS(object = best_params_config, file = paste0(opt[["prefix"]], "_best_params_config.rds")) + +# Discovering the signatures within the dataset: NMF Lasso +# Compute the signatures for the best configuration. + +print(paste("MIN K =", min_K)) + +nmf_Lasso_out = SparseSignatures::nmfLasso( + x = mut_counts, + K = min_K, + beta = eval(parse(text=opt[["beta"]])), + background_signature = background, + normalize_counts = as.logical(opt[["normalize_counts"]]), + lambda_rate_alpha = eval(parse(text=opt[["lambda_rate_alpha"]])), + lambda_rate_beta = min_Lambda_beta, + iterations = as.integer(opt[["iterations"]]), + max_iterations_lasso = as.integer(opt[["max_iterations_lasso"]]), + verbose = as.logical(opt[["verbose"]]) +) + +saveRDS(object = nmf_Lasso_out, file = paste0(opt[["prefix"]], "_nmf_Lasso_out.rds")) + +# Signature visualization +signatures = nmf_Lasso_out[["beta"]] +plot_signatures = SparseSignatures::signatures.plot(beta=signatures, xlabels=FALSE) + +plot_exposure = nmf_Lasso_out[["alpha"]] %>% + as.data.frame() %>% + tibble::rownames_to_column(var="PatientID") %>% + tidyr::pivot_longer(cols=!"PatientID", names_to="Signatures", values_to="Exposures") %>% + + ggplot() + + geom_bar(aes(x=PatientID, y=Exposures, fill=Signatures), + position="stack", stat="identity") + + theme(axis.text.x=element_text(angle=90,hjust=1), + panel.background=element_blank(), + axis.line=element_line(colour="black")) + +plt_all = patchwork::wrap_plots(plot_exposure, plot_signatures, ncol=2) + patchwork::plot_annotation(title = "$meta.id") +ggplot2::ggsave(plot = plt_all, filename = paste0(opt[["prefix"]], "_plot_all.pdf"), width = 210, height = 297, units="mm", dpi = 200) +saveRDS(object = plt_all, file = paste0(opt[["prefix"]], "_plot_all.rds")) + +# version export +f = file("versions.yml","w") +SparseSignatures_version = sessionInfo()\$otherPkgs\$SparseSignatures\$Version +dplyr_version = sessionInfo()\$otherPkgs\$dplyr\$Version +ggplot2_version = sessionInfo()\$otherPkgs\$ggplot2\$Version +patchwork_version = sessionInfo()\$otherPkgs\$patchwork\$Version +writeLines(paste0('"', "$task.process", '"', ":"), f) +writeLines(paste(" SparseSignatures:", SparseSignatures_version), f) +writeLines(paste(" dplyr:", dplyr_version), f) +writeLines(paste(" ggplot2:", ggplot2_version), f) +writeLines(paste(" patchwork:", patchwork_version), f) +close(f) diff --git a/modules/local/cna2CNAqc/templates/main_script.R b/modules/local/cna2CNAqc/templates/main_script.R new file mode 100644 index 0000000..d9468e6 --- /dev/null +++ b/modules/local/cna2CNAqc/templates/main_script.R @@ -0,0 +1,63 @@ +#!/usr/bin/env Rscript + +opt = list( + prefix = ifelse('$task.ext.prefix' == 'null', '$meta.id', '$task.ext.prefix') +) + +library(dplyr) +library(readr) + +parse_Sequenza = function(segments_file, extra_file){ + # Extract the segments information + segments = readr::read_tsv(segments_file, col_types = readr::cols()) %>% + dplyr::rename( + chr = chromosome, + from = start.pos, + to = end.pos, + Major = A, + minor = B) %>% + dplyr::select(chr, from, to, Major, minor, dplyr::everything()) + + solutions = readr::read_tsv(extra_file, col_types = readr::cols()) + purity = solutions[["cellularity"]][2] + ploidy = solutions[["ploidy.estimate"]][2] + return(list(segments = segments, purity = purity, ploidy = ploidy)) +} + +parse_ASCAT = function(segments_file, extra_file){ + # Extract the segments information + segments = readr::read_tsv(segments_file, col_types = readr::cols()) %>% + dplyr::mutate(chr = paste0("chr",chr)) %>% + dplyr::rename( + from = startpos, + to = endpos, + Major = nMajor, + minor = nMinor) %>% + dplyr::select(chr, from, to, Major, minor) + + solutions = readr::read_tsv(extra_file, col_types = readr::cols()) + purity = solutions[["AberrantCellFraction"]] + ploidy = solutions[["Ploidy"]] + return(list(segments = segments, purity = purity, ploidy = ploidy)) +} + +if ("$meta.cna_caller" == 'sequenza'){ +CNA = parse_Sequenza(segments = "$cna_segs", extra = "$cna_extra") + +} else if ("$meta.cna_caller" == 'ASCAT'){ +CNA = parse_ASCAT(segments = "$cna_segs", extra = "$cna_extra") + +} else { +stop('Copy Number Caller not supported.') +} + +saveRDS(object = CNA, file = paste0(opt[["prefix"]], "_cna.rds")) + +# version export +f = file("versions.yml","w") +readr_version = sessionInfo()\$otherPkgs\$readr\$Version +dplyr_version = sessionInfo()\$otherPkgs\$dplyr\$Version +writeLines(paste0('"', "$task.process", '"', ":"), f) +writeLines(paste(" readr:", readr_version), f) +writeLines(paste(" dplyr:", dplyr_version), f) +close(f) diff --git a/modules/local/mobsterh/templates/main_script.R b/modules/local/mobsterh/templates/main_script.R new file mode 100644 index 0000000..83de713 --- /dev/null +++ b/modules/local/mobsterh/templates/main_script.R @@ -0,0 +1,199 @@ +#!/usr/bin/env Rscript + +parse_args = function(x) { + x = gsub("\\\\[","",x) + x = gsub("\\\\]","",x) + # giving errors when we have lists like c(xxx, xxx) since it will separate it + # args_list = unlist(strsplit(x, ', ')[[1]]) + args_list = unlist(strsplit(x, ", (?=[^)]*(?:\\\\(|\$))", perl=TRUE)) + # args_vals = lapply(args_list, function(x) strsplit(x, split=":")[[1]]) + args_vals = lapply(args_list, function(x) { + x_splt = strsplit(x, split=":")[[1]] + c(x_splt[1], paste(x_splt[2:length(x_splt)], collapse=":")) + }) + + # Ensure the option vectors are length 2 (key/ value) to catch empty ones + args_vals = lapply(args_vals, function(z){ length(z) = 2; z}) + + parsed_args = structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) x[1])) + parsed_args[! is.na(parsed_args)] +} + +opt = list( + prefix = ifelse('$task.ext.prefix' == 'null', '$meta.id', '$task.ext.prefix') +) +args_opt = parse_args('$task.ext.args') +for ( ao in names(args_opt)) opt[[ao]] = args_opt[[ao]] + + +# Auxiliary functions ##### + +get_sample = function(m_cnaqc_obj, + sample, + which_obj) { + if (class(m_cnaqc_obj) != "m_cnaqc") { + cli::cli_abort(c( + "mCNAqc object was build without keeping original CNAqc objects", + "x" = "It is not possible to retrieve the required samples" + )) + } + + consented_obj = c("shared", "original") + + if ((which_obj %in% consented_obj) == FALSE) { + cli::cli_abort("{.var which_obj} must be one of 'shared' or 'original'") + } + # define the element names + if (which_obj == "original") { + type = "original_cnaqc_objc" + # check if the original cnaqc obj exist + check_or = any(names(m_cnaqc_obj) == type) + if (check_or == FALSE) { + cli::cli_abort(c( + "mCNAqc object was build without keeping original CNAqc objects", + "x" = "It is not possible to retrieve the required samples" + )) + } else { + cli::cli_h1("Retrieving original {.cls CNAqc} objects") + cnaqc_samples = m_cnaqc_obj[[type]][sample] + } + } else { + type = "cnaqc_obj_new_segmentation" + cli::cli_h1("Retrieving {.cls CNAqc} objects with the new segmentation") + cnaqc_samples = m_cnaqc_obj[[type]][sample] + } + return(cnaqc_samples) +} + +get_sample_name = function(x) { + if (class(x) == "m_cnaqc") { + lapply(x[["cnaqc_obj_new_segmentation"]], function(y) { + y[["sample"]] + }) %>% unlist() %>% unname() + + } else if (class(x) == "cnaqc") { + x[["sample"]] + } else { + cli::cli_abort(c( + "Must provide a {.field m_cnaqc} object", + "x" = "{.var x} is a {.cls {class(x)}}" + )) + } +} + +get_mCNAqc_stats = function(m_cnaqc_obj) { + stats = m_cnaqc_obj[["m_cnaqc_stats"]] + return(stats) +} + + +# Script ##### + +library(CNAqc) +library(mobster) +library(dplyr) +library(ggplot2) + +patientID = description = "$meta.patient" +samples = strsplit(x = "$meta.tumour_sample", ",") %>% unlist() # list of samples + +## read mCNAqc object +if ( grepl(".rds\$", tolower("$rds_join")) ) { + obj = readRDS("$rds_join") + if (class(obj) == "m_cnaqc") { + original = obj %>% get_sample(sample=samples, which_obj="original") + input_table = lapply(names(original), + function(sample_name) + original[[sample_name]] %>% + # keep only mutations on the diploid karyotype + CNAqc::subset_by_segment_karyotype("1:1") %>% + CNAqc::Mutations() %>% + dplyr::mutate(sample_id=sample_name) + ) %>% dplyr::bind_rows() + } else { + cli::cli_abort("Object of class {class($rds_join)} not supported.") + } +} else { + input_table = read.csv("$rds_join") +} + +## Function to run a single mobster fit +run_mobster_fit = function(joint_table, descr) { + # get input table for the single patient + inp_tb = joint_table %>% + dplyr::filter(VAF < 1) %>% + # dplyr::mutate(VAF=replace(VAF, VAF==0, 1e-7)) %>% + dplyr::filter(VAF!=0) %>% + dplyr::filter(karyotype=="1:1") + # dplyr::rename(variantID=gene) %>% + # dplyr::rename(is.driver=is_driver) %>% + # dplyr::rename(tumour_content=purity) %>% + # dplyr::rename(is_driver=is.driver) + # dplyr::rename(is_driver=is.driver, driver_label=variantID) + +# mobster_fit(x = inp_tb, +# K = eval(parse(text="$K")), +# samples = as.integer("$samples"), +# init = "$init", +# tail = eval(parse(text="$tail")), +# epsilon = as.numeric("$epsilon"), +# maxIter = as.integer("$maxIter"), +# fit.type = "$fit_type", +# seed = as.integer("$seed"), +# model.selection = "$model_selection", +# trace = as.logical("$trace"), +# parallel = as.logical("$parallel"), +# pi_cutoff = as.numeric("$pi_cutoff"), +# N_cutoff = as.integer("$n_cutoff"), +# auto_setup = eval(parse(text="$auto_setup")), +# silent = as.logical("$silent"), +# description = descr) +# } + +mobster_fit(x = inp_tb, + K = eval(parse(text=opt[["K"]])), + samples = as.integer(opt[["samples"]]), + init = opt[["init"]], + tail = eval(parse(text=opt[["tail"]])), + epsilon = as.numeric(opt[["epsilon"]]), + maxIter = as.integer(opt[["maxIter"]]), + fit.type = opt[["fit_type"]], + seed = as.integer(opt[["seed"]]), + model.selection = opt[["model_selection"]], + trace = as.logical(opt[["trace"]]), + parallel = as.logical(opt[["parallel"]]), + pi_cutoff = as.numeric(opt[["pi_cutoff"]]), + N_cutoff = as.integer(opt[["n_cutoff"]]), + auto_setup = eval(parse(text=opt[["auto_setup"]])), + silent = as.logical(opt[["silent"]]), + description = descr) +} + +lapply(samples, function(sample_name) { + + fit = run_mobster_fit(joint_table=input_table %>% dplyr::filter(sample_id == !!sample_name), + descr=description) + + best_fit = fit[["best"]] + plot_fit = plot(best_fit) + + saveRDS(object=fit, file=paste0(opt[["prefix"]], "_mobsterh_st_fit.rds")) + saveRDS(object=best_fit, file=paste0(opt[["prefix"]], "_mobsterh_st_best_fit.rds")) + saveRDS(object=plot_fit, file=paste0(opt[["prefix"]], "_mobsterh_st_best_fit_plots.rds")) + + # save report plots + report_fig = mobster::plot_model_selection(fit) + saveRDS(report_fig, file=paste0(opt[["prefix"]], "_REPORT_plots_mobster.rds")) + ggplot2::ggsave(plot=report_fig, filename=paste0(opt[["prefix"]], "_REPORT_plots_mobster.pdf"), height=210, width=210, units="mm", dpi = 200) + ggplot2::ggsave(plot=report_fig, filename=paste0(opt[["prefix"]], "_REPORT_plots_mobster.png"), height=210, width=210, units="mm", dpi = 200) +}) + + +# version export +f = file("versions.yml","w") +cnaqc_version = sessionInfo()\$otherPkgs\$CNAqc\$Version +mobster_version = sessionInfo()\$otherPkgs\$mobster\$Version +writeLines(paste0('"', "$task.process", '"', ":"), f) +writeLines(paste(" CNAqc:", cnaqc_version), f) +writeLines(paste(" mobster:", mobster_version), f) +close(f) diff --git a/modules/local/pyclonevi/templates/main_script.py b/modules/local/pyclonevi/templates/main_script.py new file mode 100644 index 0000000..612a79b --- /dev/null +++ b/modules/local/pyclonevi/templates/main_script.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python + +def parse_args(x): + x = x.replace("[","") + x = x.replace("]","") + + args_list = x.split(", ") + parsed_args = {i.split(":")[0]:i.split(":")[1] for i in args_list if i.split(":")[1] is not None} + return parsed_args + +opt = dict() +opt["prefix"] = "$task.ext.prefix" if "$task.ext.prefix" == None else "$meta.id" +args_opt = parse_args("$task.ext.args") +for ao_k, ao_v in args_opt.items(): + opt[ao_k] = ao_v + +print("$tumour_samples") + +# Script ##### + +import sys +import pandas as pd +import os +import subprocess + +#input data preprocessing +def create_pyclone_input(input_data, patient_id, output_data): + df = pd.read_csv(input_data, sep = '\t',header=0) + # df = df.query('karyotype=="1:1"') + df['normal_cn'] = 2 + df['patient_id'] = patient_id + df['mutation_id'] = df.apply(lambda row: f"{patient_id}:{row['chr'][3:]}:{row['from']}:{row['alt']}", axis=1) + df[['major_cn', 'minor_cn']] = df['karyotype'].str.split(':', expand=True) + df = df.drop(columns=['karyotype']) + df = df.rename(columns={'Indiv': 'sample_id', 'NR': 'ref_counts', 'NV': 'alt_counts', 'purity': 'tumour_content'}) + + column_names = ['mutation_id', 'patient_id','sample_id', 'ref_counts', 'alt_counts', 'normal_cn', 'major_cn','minor_cn','tumour_content','driver_label','is_driver'] + column_names_red = list(set(column_names) & set(df.columns)) + + df = df.loc[:, column_names_red] + df.to_csv(output_data, sep="\t",index=False, header=df.columns) + + +def pyclone_ctree(joint, best_fit, ctree_input): + # parser = argparse.ArgumentParser(description='Process input and output files.') + # parser.add_argument('--joint', help='Joint table path') + # parser.add_argument('--best_fit', help='Pyclone-vi best fit path') + # parser.add_argument('--ctree_input', help='Table for ctree path') + # args = parser.parse_args() + + ## Read pyclone best fit table + best_fit_file = best_fit + df_output = pd.read_csv(best_fit_file, sep='\t') + + ## Read pyclone input table + joint_table = joint + #df_input = pd.read_csv(sys.argv[1], sep = '\t') + df_input = pd.read_csv(joint_table, sep = '\t') + ## Caluclate number of mutations per cluster and add to the subset orginal df_output dataframe + + df_output_small = df_output[['mutation_id','sample_id','cluster_id','cellular_prevalence']] + df_output_small['nMuts'] = df_output_small.groupby('cluster_id')['mutation_id'].transform('nunique') + + ## Find the clonal cluster and add the colum 'is.clonal' to identify it + + samples = df_output_small['sample_id'].unique() + top_clusters = 0 + for s in samples: + ind = df_output_small[df_output_small['sample_id']==s]['cellular_prevalence'].idxmax() + top_cluster = df_output_small.loc[ind,'cluster_id'] + if (top_clusters != top_cluster): + top_clusters = top_cluster + + i = df_output_small[df_output_small['cluster_id']==top_clusters].index + + df_output_small['is.clonal'] = "F" + df_output_small.loc[i,'is.clonal'] = "T" + + ## Merge the input information about driver genes to the clusters + if 'driver_label' not in df_input.columns or 'is_driver' not in df_input.columns: + df_input['driver_label'] = "NA" + df_input['is_driver'] = False + df_input.loc[1,'driver_label'] = "" + df_input.loc[1,'is_driver'] = True + + df_merged = pd.merge(df_output_small, df_input[['mutation_id','sample_id','patient_id','driver_label','is_driver']], on = ["mutation_id","sample_id"], how="inner") + df_final = df_merged.drop_duplicates(subset=['sample_id', 'driver_label','cluster_id','is.clonal','is_driver']) + df_final = df_final.rename(columns={'cellular_prevalence':'CCF','cluster_id': 'cluster','driver_label':'variantID','patient_id':'patientID','is_driver':'is.driver'}) + df_final['is.driver'] = df_final['is.driver'].replace({False: 'F', True: 'T'}) + + ## Final ctree input dataframe + final_jt_file = ctree_input + df_final.to_csv(final_jt_file, sep="\t",index=False, header=True) + + +if __name__ == "__main__": + create_pyclone_input(input_data="$rds_join", patient_id="$meta.patient", + output_data=opt["prefix"]+"_pyclone_input_all_samples.tsv") + + column_name = "sample_id" + input_all_samples = pd.read_csv(opt["prefix"] + "_pyclone_input_all_samples.tsv", sep="\t") + + samples_list = "$tumour_samples".replace("[","") + samples_list = samples_list.replace("]","") + samples_list = samples_list.split(sep=", ") + input_filtered = input_all_samples[input_all_samples[column_name].isin(samples_list)] + input_filtered.to_csv(opt["prefix"] + "_pyclone_input.tsv", sep="\t") + + # Run pyclone + pyclonevi_run = "pyclone-vi fit -i " + opt["prefix"] + "_pyclone_input.tsv -o " + \ + opt["prefix"] + "_all_fits.h5 -c " + opt["n_cluster"] + " -d "+ \ + opt["density"] + " --num-grid-points " + opt["n_grid_point"] + \ + " --num-restarts " + opt["n_restarts"] + subprocess.run(pyclonevi_run, shell=True) + + pyclonevi_write = "pyclone-vi write-results-file -i " + opt["prefix"] + "_all_fits.h5 -o " + \ + opt["prefix"] + "_best_fit.txt" + subprocess.run(pyclonevi_write, shell=True) + + # # Pyclone ctree + pyclone_ctree(joint=opt["prefix"] + "_pyclone_input.tsv", + best_fit=opt["prefix"] + "_best_fit.txt", + ctree_input=opt["prefix"] + "_cluster_table.csv") + + # Version + version = subprocess.check_output("pip show pyclone-vi | grep Version | awk '{print \$NF}'", shell=True).decode().split("\\n")[0] + print(version) + f = open("versions.yml", "a") + f.write("$task.process:") + f.write(" pyclone-vi: {}\\n".format(version)) + f.close() + diff --git a/modules/local/vcf2CNAqc/templates/main_script.R b/modules/local/vcf2CNAqc/templates/main_script.R new file mode 100644 index 0000000..d2fd031 --- /dev/null +++ b/modules/local/vcf2CNAqc/templates/main_script.R @@ -0,0 +1,454 @@ +#!/usr/bin/env Rscript + +parse_args = function(x) { + x = gsub("\\\\[","",x) + x = gsub("\\\\]","",x) + # giving errors when we have lists like c(xxx, xxx) since it will separate it + # args_list = unlist(strsplit(x, ', ')[[1]]) + args_list = unlist(strsplit(x, ", (?=[^)]*(?:\\\\(|\$))", perl=TRUE)) + # args_vals = lapply(args_list, function(x) strsplit(x, split=":")[[1]]) + args_vals = lapply(args_list, function(x) { + x_splt = strsplit(x, split=":")[[1]] + c(x_splt[1], paste(x_splt[2:length(x_splt)], collapse=":")) + }) + + # Ensure the option vectors are length 2 (key/ value) to catch empty ones + args_vals = lapply(args_vals, function(z){ length(z) = 2; z}) + + parsed_args = structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) x[1])) + parsed_args[! is.na(parsed_args)] +} + +opt = list( + prefix = ifelse('$task.ext.prefix' == 'null', '$meta.id', '$task.ext.prefix') +) +args_opt = parse_args('$task.ext.args') +for ( ao in names(args_opt)) opt[[ao]] = args_opt[[ao]] + +print("\n\n\n") +print(opt) +print("\n\n\n") +print("$task.ext.args") +print("\n\n\n") + +# Auxiliriaty functions #### + +parse_FreeBayes = function(vcf, tumour_id, normal_id, filter_mutations = FALSE) { + tb = vcfR::vcfR2tidy(vcf) + + gt_field = tb[["gt"]] %>% + tidyr::separate(gt_AD, sep = ",", into = c("NR", "NV")) %>% + dplyr::mutate( + NR = as.numeric(NR), + NV = as.numeric(NV), + DP = NV + NR, + VAF = NV/DP) %>% + dplyr::rename(sample = Indiv) + + samples_list = gt_field[["sample"]] %>% unique + + if ("CSQ" %in% tb[["meta"]][["ID"]]){ + # VEP specific field extraction + # Take CSQ field names and split by | + + vep_field = tb[["meta"]] %>% + dplyr::filter(ID == "CSQ") %>% + dplyr::select(Description) %>% + dplyr::pull() + + vep_field = strsplit(vep_field, split = "|", fixed = TRUE)[[1]] + print(vep_field) + vep_field = vep_field[2:length(vep_field)-1] + + # Tranform the fix field by splittig the CSQ and select the columns needed + fix_field = tb[["fix"]] %>% + dplyr::rename( + chr = CHROM, + from = POS, + ref = REF, + alt = ALT) %>% + dplyr::rowwise() %>% + dplyr::mutate( + from = as.numeric(from), + to = from + nchar(alt)) %>% + dplyr::ungroup() %>% + dplyr::select(chr, from, to, ref, alt, CSQ, dplyr::everything(), -ChromKey, -DP) %>% + tidyr::separate(CSQ, vep_field, sep = "|") %>% + dplyr::select(chr, from, to, ref, alt, IMPACT, SYMBOL, Gene, dplyr::everything()) + print(fix_field) + + } else { + fix_field = tb[["fix"]] %>% + dplyr::rename( + chr = CHROM, + from = POS, + ref = REF, + alt = ALT + ) %>% + dplyr::rowwise() %>% + dplyr::mutate( + from = as.numeric(from), + to = from + nchar(alt)) %>% + dplyr::ungroup() %>% + dplyr::select(chr, from, to, ref, alt, dplyr::everything(), -ChromKey, -DP) + } + + # if have to filter mutations + if (filter_mutations){ + filter = c('PASS') + } else { + filter = fix_field[["FILTER"]] %>% unique() + } + + calls = lapply( + samples_list, + function(s){ + gt_field_s = gt_field %>% dplyr::filter(sample == s) + + if(nrow(fix_field) != nrow(gt_field_s)) + stop("Mismatch between the VCF fixed fields and the genotypes, will not process this file.") + + fits = list() + fits[["sample"]] = s + fits[["mutations"]] = dplyr::bind_cols(fix_field, gt_field_s) %>% + dplyr::select(chr, from, to, ref, alt, NV, DP, VAF, dplyr::everything()) %>% + dplyr::filter(FILTER %in% filter) + fits + }) + + names(calls) = samples_list + samples = c(tumour_id, normal_id) + calls = calls[samples] + return(calls) +} + + +parse_Mutect = function(vcf, tumour_id, normal_id, filter_mutations = FALSE){ + # Transform vcf to tidy + tb = vcfR::vcfR2tidy(vcf) + + # Extract gt field and obtain coverage (DP) and variant allele frequency (VAF) fields + gt_field = tb[["gt"]] %>% + tidyr::separate(gt_AD, sep = ",", into = c("NR", "NV")) %>% + dplyr::mutate( + NR = as.numeric(NR), + NV = as.numeric(NV), + DP = NV + NR, + VAF = NV/DP) %>% + dplyr::rename(sample = Indiv) + + # Extract sample names + samples_list = gt_field[["sample"]] %>% unique + + # check if VCF is annotated with VEP + if ("CSQ" %in% tb[["meta"]][["ID"]]){ + # VEP specific field extraction + # Take CSQ field names and split by | + + vep_field = tb[["meta"]] %>% + dplyr::filter(ID == "CSQ") %>% + dplyr::select(Description) %>% + dplyr::pull() + + vep_field = strsplit(vep_field, split = "|", fixed = TRUE)[[1]] + print(vep_field) + vep_field = vep_field[2:length(vep_field)-1] + + # Tranform the fix field by splittig the CSQ and select the columns needed + fix_field = tb[["fix"]] %>% + dplyr::rename( + chr = CHROM, + from = POS, + ref = REF, + alt = ALT) %>% + dplyr::rowwise() %>% + dplyr::mutate( + from = as.numeric(from), + to = from + nchar(alt)) %>% + dplyr::ungroup() %>% + dplyr::select(chr, from, to, ref, alt, CSQ, dplyr::everything()) %>% + tidyr::separate(CSQ, vep_field, sep = "|") %>% + dplyr::select(chr, from, to, ref, alt, IMPACT, SYMBOL, Gene, dplyr::everything(), -DP) #can add other thing, CSQ, HGSP + print(fix_field) + + } else { + # Take from fix field some columns + fix_field = tb[["fix"]] %>% + dplyr::rename( + chr = CHROM, + from = POS, + ref = REF, + alt = ALT) %>% + dplyr::rowwise() %>% + dplyr::mutate( + from = as.numeric(from), + to = from + nchar(alt)) %>% + dplyr::ungroup() %>% + dplyr::select(chr, from, to, ref, alt, dplyr::everything(), -ChromKey, -DP) #-DP + } + + if (filter_mutations){ + filter = c('PASS') + } else { + filter = fix_field[["FILTER"]] %>% unique() + } + + # For each sample create the table of mutations + calls = lapply( + samples_list, + function(s){ + gt_field_s = gt_field %>% dplyr::filter(sample == s) + + if(nrow(fix_field) != nrow(gt_field_s)) + stop("Mismatch between the VCF fixed fields and the genotypes, will not process this file.") + + fits = list() + fits[["sample"]] = s + fits[["mutations"]] = dplyr::bind_cols(fix_field, gt_field_s) %>% + dplyr::select(chr, from, to, ref, alt, NV, DP, VAF, dplyr::everything()) %>% + dplyr::filter(FILTER %in% filter) + fits + }) + + names(calls) = samples_list + samples = c(tumour_id, normal_id) + calls = calls[samples] + return(calls) +} + +retrieve_ref_alt = function(row){ + ref = row[["ref"]] + alt = row[["alt"]] + if (ref == 'A'){NR=as.integer(strsplit(row[["gt_AU"]], split=',')[[1]][1])} + if (ref == 'T'){NR=as.integer(strsplit(row[["gt_TU"]], split=',')[[1]][1])} + if (ref == 'G'){NR=as.integer(strsplit(row[["gt_GU"]], split=',')[[1]][1])} + if (ref == 'C'){NR=as.integer(strsplit(row[["gt_CU"]], split=',')[[1]][1])} + + if (alt == 'A'){NV=as.integer(strsplit(row[["gt_AU"]], split=',')[[1]][1])} + if (alt == 'T'){NV=as.integer(strsplit(row[["gt_TU"]], split=',')[[1]][1])} + if (alt == 'G'){NV=as.integer(strsplit(row[["gt_GU"]], split=',')[[1]][1])} + if (alt == 'C'){NV=as.integer(strsplit(row[["gt_GU"]], split=',')[[1]][1])} + + ref_alt = paste0(NR, ',', NV) + ref_alt +} + +parse_Strelka = function(vcf, tumour_id, normal_id, filter_mutations = FALSE){ + tb = vcfR::vcfR2tidy(vcf) + gt_field = tb[["gt"]] %>% rename(sample = Indiv) + samples_list = gt_field[["sample"]] %>% unique + + if ("CSQ" %in% tb[["meta"]][["ID"]]){ + # VEP specific field extraction + # Take CSQ field names and split by | + + vep_field = tb[["meta"]] %>% + dplyr::filter(ID == "CSQ") %>% + dplyr::select(Description) %>% + dplyr::pull() + + vep_field = strsplit(vep_field, split = "|", fixed = TRUE)[[1]] + print(vep_field) + vep_field = vep_field[2:length(vep_field)-1] + + # Tranform the fix field by splittig the CSQ and select the columns needed + fix_field = tb[["fix"]] %>% + dplyr::rename( + chr = CHROM, + from = POS, + ref = REF, + alt = ALT) %>% + dplyr::rowwise() %>% + dplyr::mutate( + from = as.numeric(from), + to = from + nchar(alt)) %>% + dplyr::ungroup() %>% + dplyr::select(chr, from, to, ref, alt, CSQ, dplyr::everything(), -ChromKey) %>% + tidyr::separate(CSQ, vep_field, sep = "|") %>% + dplyr::select(chr, from, to, ref, alt, IMPACT, SYMBOL, Gene, dplyr::everything()) #can add other thing, CSQ, HGSP + print(fix_field) + + } else { + # Take from fix field some columns + fix_field = tb[["fix"]] %>% + dplyr::rename( + chr = CHROM, + from = POS, + ref = REF, + alt = ALT) %>% + dplyr::rowwise() %>% + dplyr::mutate( + from = as.numeric(from), + to = from + nchar(alt)) %>% + dplyr::ungroup() %>% + dplyr::select(chr, from, to, ref, alt, dplyr::everything(), -ChromKey) #-DP + } + + if (filter_mutations){ + filter = c('PASS') + } else { + filter = fix_field[["FILTER"]] %>% unique() + } + + calls = lapply( + samples_list, + function(s){ + gt_field_s = gt_field %>% dplyr::filter(sample == s) + + fits = list() + fits[["sample"]] = s + mutations = dplyr::bind_cols(fix_field, gt_field_s) + ref_alt = lapply(1:nrow(mutations), function(r){ + retrieve_ref_alt(mutations[r,]) + }) + + ref_alt = ref_alt %>% unlist() + mutations[["ref_alt"]] = ref_alt + mutations = mutations %>% + tidyr::separate(ref_alt, into = c('NR', 'NV')) %>% + dplyr::mutate(NR = as.integer(NR), + NV = as.integer(NV)) %>% + dplyr::mutate(DP = NR+NV) %>% + dplyr::mutate(VAF = NV/DP) %>% + dplyr::select(chr, from, to, ref, alt, NV, DP, VAF, everything()) %>% + dplyr::filter(FILTER %in% filter) + + fits[["mutations"]] = mutations + fits + }) + + names(calls) = samples_list + samples = c(tumour_id, normal_id) + calls = calls[samples] + return(calls) +} + + +parse_Platypus = function(vcf, tumour_id, normal_id, filter_mutations = FALSE){ + tb = vcfR::vcfR2tidy(vcf) + + gt_field = tb[["gt"]] %>% + dplyr::mutate( + DP = as.numeric(gt_NR), + NV = as.numeric(gt_NV), + VAF = NV/DP) %>% + dplyr::rename(sample = Indiv) + + # Extract each sample + samples_list = gt_field[["sample"]] %>% unique + + # check if VCF is annotated with VEP + if ("CSQ" %in% tb[["meta"]][["ID"]]){ + # VEP specific field extraction + # Take CSQ field names and split by | + + vep_field = tb[["meta"]] %>% + dplyr::filter(ID == "CSQ") %>% + dplyr::select(Description) %>% + dplyr::pull() + + vep_field = strsplit(vep_field, split = "|", fixed = TRUE)[[1]] + print(vep_field) + vep_field = vep_field[2:length(vep_field)-1] + + # Tranform the fix field by splittig the CSQ and select the columns needed + fix_field = tb[["fix"]] %>% + dplyr::rename( + chr = CHROM, + from = POS, + ref = REF, + alt = ALT) %>% + dplyr::rowwise() %>% + dplyr::mutate( + from = as.numeric(from), + to = from + nchar(alt)) %>% + dplyr::ungroup() %>% + dplyr::select(chr, from, to, ref, alt, CSQ, dplyr::everything(), -ChromKey) %>% + tidyr::separate(CSQ, vep_field, sep = "|") %>% + dplyr::select(chr, from, to, ref, alt, IMPACT, SYMBOL, Gene, dplyr::everything()) + print(fix_field) + + } else { + fix_field = tb[["fix"]] %>% + dplyr::rename( + chr = CHROM, + from = POS, + ref = REF, + alt = ALT) %>% + dplyr::rowwise() %>% + dplyr::mutate( + from = as.numeric(from), + to = from + nchar(alt)) %>% + dplyr::ungroup() %>% + dplyr::select(chr, from, to, ref, alt, dplyr::everything(), -ChromKey) + } + + if (filter_mutations){ + filter = c('PASS') + } else { + filter = fix_field[["FILTER"]] %>% unique() + } + + calls = lapply( + samples_list, + function(s){ + gt_field_s = gt_field %>% + dplyr::filter(sample == s) + + if(nrow(fix_field) != nrow(gt_field_s)) + stop("Mismatch between the VCF fixed fields and the genotypes, will not process this file.") + + fits = list() + fits[["sample"]] = s + fits[["mutations"]] = dplyr::bind_cols(fix_field, gt_field_s) %>% + dplyr::select(chr, from, to, ref, alt, NV, DP, VAF, dplyr::everything()) %>% + dplyr::filter(FILTER %in% filter) + fits + }) + + names(calls) = samples_list + samples = c(tumour_id, normal_id) + calls = calls[samples] + return(calls) +} + + +library(dplyr) +library(tidyr) +library(vcfR) + + +# Read vcf file +vcf = vcfR::read.vcfR("$vcf") + +# Check from which caller the .vcf has been produced +source = vcfR::queryMETA(vcf, element = 'source')[[1]] + +if (TRUE %in% grepl(pattern = 'Mutect', x = source)){ + calls = parse_Mutect(vcf, tumour_id = "$meta.tumour_sample", normal_id = "$meta.normal_sample", filter_mutations = as.logical(opt[["filter_mutations"]])) + +} else if (TRUE %in% grepl(pattern = 'strelka', x = source)){ + calls = parse_Strelka(vcf, tumour_id = "$meta.tumour_sample", normal_id = "$meta.normal_sample", filter_mutations = as.logical(opt[["filter_mutations"]])) + +} else if (TRUE %in% grepl(pattern = 'Platypus', x = source)){ + calls = parse_Platypus(vcf, tumour_id = "$meta.tumour_sample", normal_id = "$meta.normal_sample", filter_mutations = as.logical(opt[["filter_mutations"]])) + +} else if (TRUE %in% grepl(pattern = 'freeBayes', x = source)){ + calls = parse_FreeBayes(vcf, tumour_id = "$meta.tumour_sample", normal_id = "$meta.normal_sample", filter_mutations = as.logical(opt[["filter_mutations"]])) + +} else { + stop('Variant Caller not supported.') +} + +saveRDS(object = calls, file = paste0(opt[["prefix"]], "_snv.rds")) + +# version export +f <- file("versions.yml","w") +dplyr_version <- sessionInfo()\$otherPkgs\$dplyr\$Version +tidyr_version <- sessionInfo()\$otherPkgs\$tidyr\$Version +vcfR_version <- sessionInfo()\$otherPkgs\$vcfR\$Version +writeLines(paste0('"', "$task.process", '"', ":"), f) +writeLines(paste(" dplyr:", dplyr_version), f) +writeLines(paste(" tidyr:", tidyr_version), f) +writeLines(paste(" vcfR:", vcfR_version), f) +close(f) diff --git a/modules/local/viber/templates/main_script.R b/modules/local/viber/templates/main_script.R new file mode 100644 index 0000000..d0a43ea --- /dev/null +++ b/modules/local/viber/templates/main_script.R @@ -0,0 +1,236 @@ +#!/usr/bin/env Rscript + +parse_args = function(x) { + x = gsub("\\\\[","",x) + x = gsub("\\\\]","",x) + # giving errors when we have lists like c(xxx, xxx) since it will separate it + # args_list = unlist(strsplit(x, ', ')[[1]]) + args_list = unlist(strsplit(x, ", (?=[^)]*(?:\\\\(|\$))", perl=TRUE)) + # args_vals = lapply(args_list, function(x) strsplit(x, split=":")[[1]]) + args_vals = lapply(args_list, function(x) { + x_splt = strsplit(x, split=":")[[1]] + c(x_splt[1], paste(x_splt[2:length(x_splt)], collapse=":")) + }) + + # Ensure the option vectors are length 2 (key/ value) to catch empty ones + args_vals = lapply(args_vals, function(z){ length(z) = 2; z}) + + parsed_args = structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) x[1])) + parsed_args[! is.na(parsed_args)] +} + +opt = list( + prefix = ifelse('$task.ext.prefix' == 'null', '$meta.id', '$task.ext.prefix') +) +args_opt = parse_args('$task.ext.args') +for ( ao in names(args_opt)) opt[[ao]] = args_opt[[ao]] + + +# Auxiliary functions ##### + +get_sample = function(m_cnaqc_obj, sample, which_obj) { + if (class(m_cnaqc_obj) != "m_cnaqc") { + wrong_class_all = class(m_cnaqc_obj) + cli::cli_abort( + c("cnaqc_objs must be a {.field m_cnaqc} object", + "x" = "{.var m_cnaqc_obj} is a {.cls {class(m_cnaqc_obj)}}") + ) + } + + consented_obj = c("shared", "original") + if ((which_obj %in% consented_obj) == FALSE) { + cli::cli_abort("{.var which_obj} must be one of {.val shared} or {.val original}") + } + + # define the element names + if (which_obj == "original") { + type = "original_cnaqc_objc" + # check if the original cnaqc obj exist + check_or = any(names(m_cnaqc_obj) == type) + if (check_or == FALSE) { + cli::cli_abort(c("mCNAqc object was build without keeping original CNAqc objects"), + "x" = "It is not possible to retrieve the required samples") + } else { + cli::cli_h1("Retrieving original {.cls CNAqc} objects") + cnaqc_samples = m_cnaqc_obj[[type]][sample] + } + + } else { + type = "cnaqc_obj_new_segmentation" + cli::cli_h1("Retrieving {.cls CNAqc} objects with the new segmentation") + cnaqc_samples = m_cnaqc_obj[[type]][sample] + } + return(cnaqc_samples) +} + +get_sample_name = function(x) { + if (class(x) == "m_cnaqc") { + lapply(x[["cnaqc_obj_new_segmentation"]], function(y) { + y[["sample"]] + }) %>% unlist() %>% unname() + } else if (class(x) == "cnaqc") { + x[["sample"]] + } else { + wrong_class_all = class(x) + cli::cli_abort( + c("must provide a {.field m_cnaqc} object", + "x" = "{.var x} is a {.cls {class(x)}}") + ) + } +} + +get_mCNAqc_stats = function(m_cnaqc_obj){ + stats = m_cnaqc_obj[["m_cnaqc_stats"]] + return(stats) +} + +# Script ##### + +library(VIBER) +library(dplyr) +library(tidyverse) +library(ggplot2) +library(CNAqc) + +patientID = "$meta.patient" +samples = substr("$tumour_samples", 2, nchar("$tumour_samples")-1) +samples = strsplit(samples, ", ")[[1]] +print("$tumour_samples") +print("$rds_join") +print(samples) + +if ( grepl(".rds\$", tolower("$rds_join")) ) { + input_obj = readRDS("$rds_join") + if (class(input_obj) == "m_cnaqc") { + shared = input_obj %>% get_sample(sample=samples, which_obj="shared") + joint_table = lapply(names(shared), + function(sample_name) + shared[[sample_name]] %>% + # keep only mutations on the diploid karyotype + CNAqc::subset_by_segment_karyotype("1:1") %>% + CNAqc::Mutations() %>% + dplyr::mutate(sample_id=sample_name) + ) %>% dplyr::bind_rows() +} else { + cli::cli_alert_warning("Object of class {class(input_obj)} not supported.") + return() + } +} + +print("Subset joint done") + +## TODO : add drivers to `input_tab` + +## Read input joint table +input_tab = joint_table %>% +dplyr::mutate(VAF=replace(VAF, VAF==0, 1e-7)) + +## Convert the input table into longer format +reads_data = input_tab %>% +dplyr::select(chr, from, ref, alt, NV, DP, VAF, sample_id) %>% +tidyr::pivot_wider(names_from="sample_id", + values_from=c("NV","DP","VAF"), names_sep=".") + +## Extract DP (depth) +dp = reads_data %>% +# dplyr::filter(mutation_id %in% non_tail) %>% ## this step should be managed before by other module +dplyr::select(dplyr::starts_with("DP")) %>% +dplyr::mutate(dplyr::across(.cols=dplyr::everything(), + .fns=function(x) replace(x, is.na(x), 0))) %>% +dplyr::rename_all(function(x) stringr::str_remove_all(x,"DP.")) + +## Extract NV (alt_counts) +nv = reads_data %>% +# dplyr::filter(mutation_id %in% non_tail) %>% ## this step should be managed before by other module +dplyr::select(dplyr::starts_with("NV")) %>% +dplyr::mutate(dplyr::across(.cols=dplyr::everything(), + .fns=function(x) replace(x, is.na(x), 0))) %>% +dplyr::rename_all(function(x) stringr::str_remove_all(x,"NV.")) + +# Standard fit +viber_K = eval(parse(text=opt[["K"]])) +viber_K[which.min(viber_K)] = 2 +st_fit = VIBER::variational_fit(nv, dp, + K=viber_K, + data=reads_data, + # %>% dplyr::filter(mutation_id %in% non_tail) + alpha_0=as.numeric(opt[["alpha_0"]]), + a_0=as.integer(opt[["a_0"]]), + b_0=as.integer(opt[["b_0"]]), + max_iter=as.integer(opt[["maxIter"]]), + epsilon_conv=as.numeric(opt[["epsilon_conv"]]), + samples=as.integer(opt[["samples"]]), + q_init=opt[["q_init"]], + trace=as.logical(opt[["trace"]]), + description="" + ) + +best_fit = best_fit_heuristic = st_fit + +# If all clusters are removed -> keep the origianl best fit +tryCatch(expr = { + # Apply the heuristic + best_fit_heuristic = VIBER::choose_clusters(st_fit, + binomial_cutoff=as.numeric(opt[["binomial_cutoff"]]), + dimensions_cutoff=as.integer(opt[["dimensions_cutoff"]]), + pi_cutoff=as.numeric(opt[["pi_cutoff"]]), + re_assign=as.logical(opt[["re_assign"]]) + ) + }, error = function(e) { + print(e) + best_fit_heuristic <<- st_fit + } ) + +# Save fits +saveRDS(best_fit, file=paste0(opt[["prefix"]], "_viber_best_st_fit.rds")) +saveRDS(best_fit_heuristic, file = paste0(opt[["prefix"]], "_viber_best_st_heuristic_fit.rds")) + +# Save plots +if ("$n_samples" >1) { #mutlisample mode on +print("multisample mode on") +plot_fit = plot(best_fit) +plot_fit_heuristic = plot(best_fit_heuristic) + +saveRDS(plot_fit, file=paste0(opt[["prefix"]], "_viber_best_st_fit_plots.rds")) +saveRDS(plot_fit_heuristic, file=paste0(opt[["prefix"]], "_viber_best_st_heuristic_fit_plots.rds")) +} else { +plot_fit_mixing = plot_mixing_proportions(best_fit) +plot_fit_mixing_heuristic = plot_mixing_proportions(best_fit_heuristic) + +saveRDS(plot_fit_mixing, file=paste0(opt[["prefix"]], "_viber_best_st_mixing_plots.rds")) +saveRDS(plot_fit_mixing_heuristic, file=paste0(opt[["prefix"]], "_viber_best_st_heuristic_mixing_plots.rds")) +} + +# Save report plot +n_samples = ncol(best_fit[["x"]]) - 1 +marginals = multivariate = ggplot() + +try(expr = {marginals <<- VIBER::plot_1D(best_fit)} ) +#try(expr = {multivariate = plot(best_fit) %>% patchwork::wrap_plots()} ) +#top_p = patchwork::wrap_plots(marginals, multivariate, design=ifelse(n_samples>2, "ABB", "AAB")) + +try(expr = {multivariate = plot(best_fit)}) +try(expr = {multivariate = ggpubr::ggarrange(plotlist = multivariate)}) +top_p = ggpubr::ggarrange(plotlist = list(marginals, multivariate), widths=ifelse(n_samples>2, c(1,2), c(2,1))) + +mix_p = VIBER::plot_mixing_proportions(best_fit) +binom = VIBER::plot_peaks(best_fit) +elbo = VIBER::plot_ELBO(best_fit) +#bottom_p = patchwork::wrap_plots(mix_p, binom, elbo, design="ABBBC") +bottom_p = ggpubr::ggarrange(plotlist = list(mix_p, binom, elbo), widths = c(1,2,1), nrow = 1) + +#report_fig = patchwork::wrap_plots(top_p, bottom_p, design=ifelse(n_samples>2, "AAAB", "AAB")) +report_fig = ggpubr::ggarrange(top_p, bottom_p, nrow = 2, heights = ifelse(n_samples>2, c(3,1), c(2,1))) +saveRDS(report_fig, file=paste0(opt[["prefix"]], "_REPORT_plots_viber.rds")) +ggplot2::ggsave(plot=report_fig, filename=paste0(opt[["prefix"]], "_REPORT_plots_viber.pdf"), height=210, width=210, units="mm", dpi = 200) +ggplot2::ggsave(plot=report_fig, filename=paste0(opt[["prefix"]], "_REPORT_plots_viber.png"), height=210, width=210, units="mm", dpi = 200) + + +# version export +f = file("versions.yml","w") +cnaqc_version = sessionInfo()\$otherPkgs\$CNAqc\$Version +viber_version = sessionInfo()\$otherPkgs\$VIBER\$Version +writeLines(paste0('"', "$task.process", '"', ":"), f) +writeLines(paste(" CNAqc:", cnaqc_version), f) +writeLines(paste(" VIBER:", viber_version), f) +close(f) From c22238470c8a2f832b67c6500c03a486e94c6b7e Mon Sep 17 00:00:00 2001 From: Elena Buscaroli Date: Wed, 29 Jan 2025 16:30:42 +0100 Subject: [PATCH 5/8] deleting auxiliary scripts included in templates --- modules/local/CNAqc2tsv/utils.R | 84 ----- modules/local/cna2CNAqc/parser_CNA.R | 37 --- modules/local/mobsterh/getters.R | 57 ---- modules/local/pyclonevi/pyclone_ctree.py | 93 ------ modules/local/pyclonevi/pyclone_utils.py | 72 ----- modules/local/vcf2CNAqc/parser_vcf.R | 374 ----------------------- modules/local/viber/getters.R | 82 ----- 7 files changed, 799 deletions(-) delete mode 100644 modules/local/CNAqc2tsv/utils.R delete mode 100644 modules/local/cna2CNAqc/parser_CNA.R delete mode 100644 modules/local/mobsterh/getters.R delete mode 100755 modules/local/pyclonevi/pyclone_ctree.py delete mode 100644 modules/local/pyclonevi/pyclone_utils.py delete mode 100644 modules/local/vcf2CNAqc/parser_vcf.R delete mode 100644 modules/local/viber/getters.R diff --git a/modules/local/CNAqc2tsv/utils.R b/modules/local/CNAqc2tsv/utils.R deleted file mode 100644 index 647a201..0000000 --- a/modules/local/CNAqc2tsv/utils.R +++ /dev/null @@ -1,84 +0,0 @@ -## This source file is temporary, only needed in -## absence of the updated CNAqc package - -get_sample <- function(m_cnaqc_obj, - sample, - which_obj) { - if (class(m_cnaqc_obj) != "m_cnaqc") { - wrong_class_all = class(m_cnaqc_obj) - - cli::cli_abort( - c("cnaqc_objs must be a {.field m_cnaqc} object", - "x" = "{.var m_cnaqc_obj} is a {.cls {class(m_cnaqc_obj)}}") - ) - } - - consented_obj <- c("shared", "original") - - if ((which_obj %in% consented_obj) == FALSE) { - - cli::cli_abort("{.var which_obj} must be one of {.val shared} or {.val original}") - - } - - # define the element names - - if (which_obj == "original") { - - type = "original_cnaqc_objc" - - # check if the original cnaqc obj exist - - check_or = any(names(m_cnaqc_obj) == type) - - if (check_or == FALSE) { - cli::cli_abort(c("mCNAqc object was build without keeping original CNAqc objects"), - "x" = "It is not possible to retrieve the required samples") - } else { - - cli::cli_h1("Retrieving original {.cls CNAqc} objects") - cnaqc_samples = m_cnaqc_obj[[type]][sample] - - } - - } else { - - type = "cnaqc_obj_new_segmentation" - - cli::cli_h1("Retrieving {.cls CNAqc} objects with the new segmentation") - - cnaqc_samples = m_cnaqc_obj[[type]][sample] - } - - return(cnaqc_samples) -} - -get_sample_name <- function(x) { - - if (class(x) == "m_cnaqc") { - - lapply(x$cnaqc_obj_new_segmentation, function(y) { - - y$sample - - }) %>% unlist() %>% unname() - - } else if (class(x) == "cnaqc") { - - x$sample - - } else { - - wrong_class_all = class(x) - - cli::cli_abort( - c("must provide a {.field m_cnaqc} object", - "x" = "{.var x} is a {.cls {class(x)}}") - ) - } -} - -get_mCNAqc_stats <- function(m_cnaqc_obj){ - stats = m_cnaqc_obj[["m_cnaqc_stats"]] - return(stats) -} diff --git a/modules/local/cna2CNAqc/parser_CNA.R b/modules/local/cna2CNAqc/parser_CNA.R deleted file mode 100644 index ee46373..0000000 --- a/modules/local/cna2CNAqc/parser_CNA.R +++ /dev/null @@ -1,37 +0,0 @@ -library(dplyr) -library(readr) - -parse_Sequenza = function(segments_file, extra_file){ - # Extract the segments information - segments = readr::read_tsv(segments_file, col_types = readr::cols()) %>% - dplyr::rename( - chr = chromosome, - from = start.pos, - to = end.pos, - Major = A, - minor = B) %>% - dplyr::select(chr, from, to, Major, minor, dplyr::everything()) - - solutions = readr::read_tsv(extra_file, col_types = readr::cols()) - purity = solutions$cellularity[2] - ploidy = solutions$ploidy.estimate[2] - return(list(segments = segments, purity = purity, ploidy = ploidy)) -} - - -parse_ASCAT = function(segments_file, extra_file){ - # Extract the segments information - segments = readr::read_tsv(segments_file, col_types = readr::cols()) %>% - dplyr::mutate(chr = paste0("chr",chr)) %>% - dplyr::rename( - from = startpos, - to = endpos, - Major = nMajor, - minor = nMinor) %>% - dplyr::select(chr, from, to, Major, minor) - - solutions = readr::read_tsv(extra_file, col_types = readr::cols()) - purity = solutions$AberrantCellFraction - ploidy = solutions$Ploidy - return(list(segments = segments, purity = purity, ploidy = ploidy)) -} diff --git a/modules/local/mobsterh/getters.R b/modules/local/mobsterh/getters.R deleted file mode 100644 index 673fbfe..0000000 --- a/modules/local/mobsterh/getters.R +++ /dev/null @@ -1,57 +0,0 @@ -get_sample = function(m_cnaqc_obj, - sample, - which_obj) { - if (class(m_cnaqc_obj) != "m_cnaqc") { - cli::cli_abort(c( - "mCNAqc object was build without keeping original CNAqc objects", - "x" = "It is not possible to retrieve the required samples" - )) - } - - consented_obj = c("shared", "original") - - if ((which_obj %in% consented_obj) == FALSE) { - cli::cli_abort("{.var which_obj} must be one of 'shared' or 'original'") - } - # define the element names - if (which_obj == "original") { - type = "original_cnaqc_objc" - # check if the original cnaqc obj exist - check_or = any(names(m_cnaqc_obj) == type) - if (check_or == FALSE) { - cli::cli_abort(c( - "mCNAqc object was build without keeping original CNAqc objects", - "x" = "It is not possible to retrieve the required samples" - )) - } else { - cli::cli_h1("Retrieving original {.cls CNAqc} objects") - cnaqc_samples = m_cnaqc_obj[[type]][sample] - } - } else { - type = "cnaqc_obj_new_segmentation" - cli::cli_h1("Retrieving {.cls CNAqc} objects with the new segmentation") - cnaqc_samples = m_cnaqc_obj[[type]][sample] - } - return(cnaqc_samples) -} - -get_sample_name = function(x) { - if (class(x) == "m_cnaqc") { - lapply(x$cnaqc_obj_new_segmentation, function(y) { - y$sample - }) %>% unlist() %>% unname() - - } else if (class(x) == "cnaqc") { - x$sample - } else { - cli::cli_abort(c( - "Must provide a {.field m_cnaqc} object", - "x" = "{.var x} is a {.cls {class(x)}}" - )) - } -} - -get_mCNAqc_stats = function(m_cnaqc_obj) { - stats = m_cnaqc_obj[["m_cnaqc_stats"]] - return(stats) -} diff --git a/modules/local/pyclonevi/pyclone_ctree.py b/modules/local/pyclonevi/pyclone_ctree.py deleted file mode 100755 index 077790e..0000000 --- a/modules/local/pyclonevi/pyclone_ctree.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import pandas as pd -import argparse -## Read command line args - -parser = argparse.ArgumentParser(description='Process input and output files.') -parser.add_argument('--joint', help='Joint table path') -parser.add_argument('--best_fit', help='Pyclone-vi best fit path') -parser.add_argument('--ctree_input', help='Table for ctree path') -#parser.add_argument('--pyclone_joint', help='Cluster-annotated joint table') -args = parser.parse_args() - - -## Read pyclone best fit table -#best_fit_file=sys.argv[2] -best_fit_file = args.best_fit -df_output = pd.read_csv(best_fit_file, sep='\t') - -## Read pyclone input table -joint_table = args.joint -#df_input = pd.read_csv(sys.argv[1], sep = '\t') -df_input = pd.read_csv(joint_table, sep = '\t') -## Caluclate number of mutations per cluster and add to the subset orginal df_output dataframe - -df_output_small = df_output[['mutation_id','sample_id','cluster_id','cellular_prevalence']] -df_output_small['nMuts'] = df_output_small.groupby('cluster_id')['mutation_id'].transform('nunique') - -## Find the clonal cluster and add the colum 'is.clonal' to identify it - -samples = df_output_small['sample_id'].unique() -top_clusters = 0 -for s in samples: - ind = df_output_small[df_output_small['sample_id']==s]['cellular_prevalence'].idxmax() - top_cluster = df_output_small.loc[ind,'cluster_id'] - if (top_clusters != top_cluster): - top_clusters = top_cluster - -i = df_output_small[df_output_small['cluster_id']==top_clusters].index - -df_output_small['is.clonal'] = "F" -df_output_small.loc[i,'is.clonal'] = "T" - -## Merge the input information about driver genes to the clusters -if 'driver_label' not in df_input.columns or 'is_driver' not in df_input.columns: - df_input['driver_label'] = "NA" - df_input['is_driver'] = False - df_input.loc[1,'driver_label'] = "" - df_input.loc[1,'is_driver'] = True - -df_merged = pd.merge(df_output_small, df_input[['mutation_id','sample_id','patient_id','driver_label','is_driver']], on = ["mutation_id","sample_id"], how="inner") -df_final = df_merged.drop_duplicates(subset=['sample_id', 'driver_label','cluster_id','is.clonal','is_driver']) -df_final = df_final.rename(columns={'cellular_prevalence':'CCF','cluster_id': 'cluster','driver_label':'variantID','patient_id':'patientID','is_driver':'is.driver'}) -df_final['is.driver'] = df_final['is.driver'].replace({False: 'F', True: 'T'}) - -######### OLD SCRIPT ######### -## Find clusters with driver genes and find cluster wihtout driver genes -## For the ones without drivers add the 'NA' to variantID and False to is.driver -#driver_genes_df = df_final[df_final['is_driver'] == True] - -#non_driver_genes_df = df_final[df_final['is_driver'] == False].drop_duplicates(subset=['sample_id','cluster_id']).copy() -#non_driver_genes_df['gene'] = 'NA' -#non_driver_genes_df['is_driver'] = False -#result_df = pd.concat([driver_genes_df, non_driver_genes_df], ignore_index=True) - -## Identify clusters for which is.driver is both True and False -#conflicting_clusters = result_df.groupby(['sample_id', 'cluster_id'])['is_driver'].nunique() == 2 -#conflicting_clusters = conflicting_clusters[conflicting_clusters].index - -## Create a new DataFrame with only True values for conflicting clusters -#true_values_df = result_df[(result_df['is_driver'] == True) & result_df.set_index(['sample_id', 'cluster_id']).index.isin(conflicting_clusters)] - -## Create a new DataFrame with all values for non-conflicting clusters -#non_conflicting_df = result_df[~result_df.set_index(['sample_id', 'cluster_id']).index.isin(conflicting_clusters)] - -## Concatenate the DataFrames -#result_df = pd.concat([true_values_df, non_conflicting_df], ignore_index=True) -#result_df['is_driver'] = result_df['is_driver'].replace({False: 'F', True: 'T'}) - -#result_df = result_df.rename(columns={'cellular_prevalence':'CCF','cluster_id': 'cluster'}) -#result_df['tool'] = pd.Series(["pyclonevi" for x in range(len(result_df.index))]) -#ctree_input = result_df[['patient_id','gene','is_driver','is.clonal','cluster','nMuts','sample_id','CCF','tool']] - -## Final ctree input dataframe -final_jt_file = args.ctree_input -df_final.to_csv(final_jt_file, sep="\t",index=False, header=True) - -## Annotate input joint table with pyclone-vi assignments -#df2 = pd.merge(df_input, df_output[['mutation_id','sample_id','cluster_id','cellular_prevalence','cluster_assignment_prob']], on=['mutation_id','sample_id']) -#df_joint_new=df2.rename(columns={"cluster_id": "pyclonevi_cluster_id", "cellular_prevalence": "pyclonevi_cellular_prevalence", "cluster_assignment_prob": "pyclone_vi_cluster_assignment_prob"}) -#annotated_joint=args.pyclone_joint -#df_joint_new.to_csv(annotated_joint, sep="\t",index=False, header=True) diff --git a/modules/local/pyclonevi/pyclone_utils.py b/modules/local/pyclonevi/pyclone_utils.py deleted file mode 100644 index cfaf3a3..0000000 --- a/modules/local/pyclonevi/pyclone_utils.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import pandas as pd -import argparse - -# Define your functions here -# def add(a, b): -# """Add two numbers.""" -# return a + b - -#input data preprocessing -def create_pyclone_input(input_data, patient_id,output_data): - df = pd.read_csv(input_data, sep = '\t',header=0) - # df = df.query('karyotype=="1:1"') - df['normal_cn'] = 2 - df['patient_id'] = patient_id - df['mutation_id'] = df.apply(lambda row: f"{patient_id}:{row['chr'][3:]}:{row['from']}:{row['alt']}", axis=1) - df[['major_cn', 'minor_cn']] = df['karyotype'].str.split(':', expand=True) - df = df.drop(columns=['karyotype']) - df = df.rename(columns={'Indiv': 'sample_id', 'NR': 'ref_counts', 'NV': 'alt_counts', 'purity': 'tumour_content'}) - - column_names = ['mutation_id', 'patient_id','sample_id', 'ref_counts', 'alt_counts', 'normal_cn', 'major_cn','minor_cn','tumour_content','driver_label','is_driver'] - column_names_red = list(set(column_names) & set(df.columns)) - - df = df.loc[:, column_names_red] - df.to_csv(output_data, sep="\t",index=False, header=df.columns) - print(df) - -# def divide(a, b): -# """Divide two numbers.""" -# if b == 0: -# raise ValueError("Cannot divide by zero!") -# return a / b - -def main(): - # Create the top-level parser - parser = argparse.ArgumentParser(description='Calculator script') - subparsers = parser.add_subparsers(dest='command', help='Available commands') - - # Create sub-parser for the "create_pyclone_input" command - parser_create_pyclone_input = subparsers.add_parser('create_pyclone_input', help='Add two numbers') - parser_create_pyclone_input.add_argument('input_data', help='Joint table path') - parser_create_pyclone_input.add_argument('patient_id', help='Patient identifier') - parser_create_pyclone_input.add_argument('output_data', help='Output path for new table') - - # # Create sub-parser for the "divide" command - # parser_divide = subparsers.add_parser('divide', help='Divide two numbers') - # parser_divide.add_argument('a', type=float, help='First number') - # parser_divide.add_argument('b', type=float, help='Second number') - - # Parse the arguments - args = parser.parse_args() - - # Call the appropriate function based on the command - if args.command == 'create_pyclone_input': - result = create_pyclone_input(args.input_data, args.patient_id, args.output_data) - # elif args.command == 'subtract': - # result = subtract(args.a, args.b) - # elif args.command == 'multiply': - # result = multiply(args.a, args.b) - # elif args.command == 'divide': - # result = divide(args.a, args.b) - else: - parser.print_help() - return - - # Print the result - print(f"The result is: {result}") - -if __name__ == '__main__': - main() diff --git a/modules/local/vcf2CNAqc/parser_vcf.R b/modules/local/vcf2CNAqc/parser_vcf.R deleted file mode 100644 index 1f93248..0000000 --- a/modules/local/vcf2CNAqc/parser_vcf.R +++ /dev/null @@ -1,374 +0,0 @@ -library(tidyr) -library(dplyr) -library(vcfR) - -parse_FreeBayes = function(vcf, tumour_id, normal_id, filter_mutations = FALSE){ - tb = vcfR::vcfR2tidy(vcf) - - gt_field = tb$gt %>% - tidyr::separate(gt_AD, sep = ",", into = c("NR", "NV")) %>% - dplyr::mutate( - NR = as.numeric(NR), - NV = as.numeric(NV), - DP = NV + NR, - VAF = NV/DP) %>% - dplyr::rename(sample = Indiv) - - samples_list = gt_field$sample %>% unique - - if ("CSQ" %in% tb$meta$ID){ - # VEP specific field extraction - # Take CSQ field names and split by | - - vep_field = tb$meta %>% - dplyr::filter(ID == "CSQ") %>% - dplyr::select(Description) %>% - dplyr::pull() - - vep_field = strsplit(vep_field, split = "|", fixed = TRUE)[[1]] - vep_field = vep_field[2:length(vep_field)-1] - - # Tranform the fix field by splittig the CSQ and select the columns needed - fix_field = tb$fix %>% - dplyr::rename( - chr = CHROM, - from = POS, - ref = REF, - alt = ALT) %>% - dplyr::rowwise() %>% - dplyr::mutate( - from = as.numeric(from), - to = from + nchar(alt)) %>% - dplyr::ungroup() %>% - dplyr::select(chr, from, to, ref, alt, CSQ, dplyr::everything(), -ChromKey, -DP) %>% - tidyr::separate(CSQ, vep_field, sep = "\\|") %>% - dplyr::select(chr, from, to, ref, alt, IMPACT, SYMBOL, Gene, dplyr::everything()) - - } else { - fix_field = tb$fix %>% - dplyr::rename( - chr = CHROM, - from = POS, - ref = REF, - alt = ALT - ) %>% - dplyr::rowwise() %>% - dplyr::mutate( - from = as.numeric(from), - to = from + nchar(alt)) %>% - dplyr::ungroup() %>% - dplyr::select(chr, from, to, ref, alt, dplyr::everything(), -ChromKey, -DP) - } - - # if have to filter mutations - if (filter_mutations){ - filter = c('PASS') - } else { - filter = fix_field$FILTER %>% unique() - } - - calls = lapply( - samples_list, - function(s){ - gt_field_s = gt_field %>% dplyr::filter(sample == s) - - if(nrow(fix_field) != nrow(gt_field_s)) - stop("Mismatch between the VCF fixed fields and the genotypes, will not process this file.") - - fits = list() - fits$sample = s - fits$mutations = dplyr::bind_cols(fix_field, gt_field_s) %>% - dplyr::select(chr, from, to, ref, alt, NV, DP, VAF, dplyr::everything()) %>% - dplyr::filter(FILTER %in% filter) - fits - }) - - names(calls) = samples_list - samples = c(tumour_id, normal_id) - calls = calls[samples] - return(calls) -} - - -parse_Mutect = function(vcf, tumour_id, normal_id, filter_mutations = FALSE){ - # Transform vcf to tidy - tb = vcfR::vcfR2tidy(vcf) - - # Extract gt field and obtain coverage (DP) and variant allele frequency (VAF) fields - gt_field = tb$gt %>% - tidyr::separate(gt_AD, sep = ",", into = c("NR", "NV")) %>% - dplyr::mutate( - NR = as.numeric(NR), - NV = as.numeric(NV), - DP = NV + NR, - VAF = NV/DP) %>% - dplyr::rename(sample = Indiv) - - # Extract sample names - samples_list = gt_field$sample %>% unique - - # check if VCF is annotated with VEP - if ("CSQ" %in% tb$meta$ID){ - # VEP specific field extraction - # Take CSQ field names and split by | - - vep_field = tb$meta %>% - dplyr::filter(ID == "CSQ") %>% - dplyr::select(Description) %>% - dplyr::pull() - - vep_field = strsplit(vep_field, split = "|", fixed = TRUE)[[1]] - vep_field = vep_field[2:length(vep_field)-1] - - # Tranform the fix field by splittig the CSQ and select the columns needed - fix_field = tb$fix %>% - dplyr::rename( - chr = CHROM, - from = POS, - ref = REF, - alt = ALT) %>% - dplyr::rowwise() %>% - dplyr::mutate( - from = as.numeric(from), - to = from + nchar(alt)) %>% - dplyr::ungroup() %>% - dplyr::select(chr, from, to, ref, alt, CSQ, dplyr::everything()) %>% - tidyr::separate(CSQ, vep_field, sep = "\\|") %>% - dplyr::select(chr, from, to, ref, alt, IMPACT, SYMBOL, Gene, dplyr::everything(), -DP) #can add other thing, CSQ, HGSP - - } else { - # Take from fix field some columns - fix_field = tb$fix %>% - dplyr::rename( - chr = CHROM, - from = POS, - ref = REF, - alt = ALT) %>% - dplyr::rowwise() %>% - dplyr::mutate( - from = as.numeric(from), - to = from + nchar(alt)) %>% - dplyr::ungroup() %>% - dplyr::select(chr, from, to, ref, alt, dplyr::everything(), -ChromKey, -DP) #-DP - } - - if (filter_mutations){ - filter = c('PASS') - } else { - filter = fix_field$FILTER %>% unique() - } - - # For each sample create the table of mutations - calls = lapply( - samples_list, - function(s){ - gt_field_s = gt_field %>% dplyr::filter(sample == s) - - if(nrow(fix_field) != nrow(gt_field_s)) - stop("Mismatch between the VCF fixed fields and the genotypes, will not process this file.") - - fits = list() - fits$sample = s - fits$mutations = dplyr::bind_cols(fix_field, gt_field_s) %>% - dplyr::select(chr, from, to, ref, alt, NV, DP, VAF, dplyr::everything()) %>% - dplyr::filter(FILTER %in% filter) - fits - }) - - names(calls) = samples_list - samples = c(tumour_id, normal_id) - calls = calls[samples] - return(calls) -} - -retrieve_ref_alt = function(row){ - ref = row$ref - alt = row$alt - if (ref == 'A'){NR=as.integer(strsplit(row$gt_AU, split=',')[[1]][1])} - if (ref == 'T'){NR=as.integer(strsplit(row$gt_TU, split=',')[[1]][1])} - if (ref == 'G'){NR=as.integer(strsplit(row$gt_GU, split=',')[[1]][1])} - if (ref == 'C'){NR=as.integer(strsplit(row$gt_CU, split=',')[[1]][1])} - - if (alt == 'A'){NV=as.integer(strsplit(row$gt_AU, split=',')[[1]][1])} - if (alt == 'T'){NV=as.integer(strsplit(row$gt_TU, split=',')[[1]][1])} - if (alt == 'G'){NV=as.integer(strsplit(row$gt_GU, split=',')[[1]][1])} - if (alt == 'C'){NV=as.integer(strsplit(row$gt_GU, split=',')[[1]][1])} - - ref_alt = paste0(NR, ',', NV) - ref_alt -} - -parse_Strelka = function(vcf, tumour_id, normal_id, filter_mutations = FALSE){ - tb = vcfR::vcfR2tidy(vcf) - gt_field = tb$gt %>% rename(sample = Indiv) - samples_list = gt_field$sample %>% unique - - if ("CSQ" %in% tb$meta$ID){ - # VEP specific field extraction - # Take CSQ field names and split by | - - vep_field = tb$meta %>% - dplyr::filter(ID == "CSQ") %>% - dplyr::select(Description) %>% - dplyr::pull() - - vep_field = strsplit(vep_field, split = "|", fixed = TRUE)[[1]] - vep_field = vep_field[2:length(vep_field)-1] - - # Tranform the fix field by splittig the CSQ and select the columns needed - fix_field = tb$fix %>% - dplyr::rename( - chr = CHROM, - from = POS, - ref = REF, - alt = ALT) %>% - dplyr::rowwise() %>% - dplyr::mutate( - from = as.numeric(from), - to = from + nchar(alt)) %>% - dplyr::ungroup() %>% - dplyr::select(chr, from, to, ref, alt, CSQ, dplyr::everything(), -ChromKey) %>% - tidyr::separate(CSQ, vep_field, sep = "\\|") %>% - dplyr::select(chr, from, to, ref, alt, IMPACT, SYMBOL, Gene, dplyr::everything()) #can add other thing, CSQ, HGSP - - } else { - # Take from fix field some columns - fix_field = tb$fix %>% - dplyr::rename( - chr = CHROM, - from = POS, - ref = REF, - alt = ALT) %>% - dplyr::rowwise() %>% - dplyr::mutate( - from = as.numeric(from), - to = from + nchar(alt)) %>% - dplyr::ungroup() %>% - dplyr::select(chr, from, to, ref, alt, dplyr::everything(), -ChromKey) #-DP - } - - if (filter_mutations){ - filter = c('PASS') - } else { - filter = fix_field$FILTER %>% unique() - } - - calls = lapply( - samples_list, - function(s){ - gt_field_s = gt_field %>% dplyr::filter(sample == s) - - fits = list() - fits$sample = s - mutations = dplyr::bind_cols(fix_field, gt_field_s) - ref_alt = lapply(1:nrow(mutations), function(r){ - retrieve_ref_alt(mutations[r,]) - }) - - ref_alt = ref_alt %>% unlist() - mutations$ref_alt = ref_alt - mutations = mutations %>% - tidyr::separate(ref_alt, into = c('NR', 'NV')) %>% - dplyr::mutate(NR = as.integer(NR), - NV = as.integer(NV)) %>% - dplyr::mutate(DP = NR+NV) %>% - dplyr::mutate(VAF = NV/DP) %>% - dplyr::select(chr, from, to, ref, alt, NV, DP, VAF, everything()) %>% - dplyr::filter(FILTER %in% filter) - - fits$mutations = mutations - fits - }) - - names(calls) = samples_list - samples = c(tumour_id, normal_id) - calls = calls[samples] - return(calls) -} - - -parse_Platypus = function(vcf, tumour_id, normal_id, filter_mutations = FALSE){ - tb = vcfR::vcfR2tidy(vcf) - - gt_field = tb$gt %>% - dplyr::mutate( - DP = as.numeric(gt_NR), - NV = as.numeric(gt_NV), - VAF = NV/DP) %>% - dplyr::rename(sample = Indiv) - - # Extract each sample - samples_list = gt_field$sample %>% unique - - # check if VCF is annotated with VEP - if ("CSQ" %in% tb$meta$ID){ - # VEP specific field extraction - # Take CSQ field names and split by | - - vep_field = tb$meta %>% - dplyr::filter(ID == "CSQ") %>% - dplyr::select(Description) %>% - dplyr::pull() - - vep_field = strsplit(vep_field, split = "|", fixed = TRUE)[[1]] - vep_field = vep_field[2:length(vep_field)-1] - - # Tranform the fix field by splittig the CSQ and select the columns needed - fix_field = tb$fix %>% - dplyr::rename( - chr = CHROM, - from = POS, - ref = REF, - alt = ALT) %>% - dplyr::rowwise() %>% - dplyr::mutate( - from = as.numeric(from), - to = from + nchar(alt)) %>% - dplyr::ungroup() %>% - dplyr::select(chr, from, to, ref, alt, CSQ, dplyr::everything(), -ChromKey) %>% - tidyr::separate(CSQ, vep_field, sep = "\\|") %>% - dplyr::select(chr, from, to, ref, alt, IMPACT, SYMBOL, Gene, dplyr::everything()) - - } else { - fix_field = tb$fix %>% - dplyr::rename( - chr = CHROM, - from = POS, - ref = REF, - alt = ALT) %>% - dplyr::rowwise() %>% - dplyr::mutate( - from = as.numeric(from), - to = from + nchar(alt)) %>% - dplyr::ungroup() %>% - dplyr::select(chr, from, to, ref, alt, dplyr::everything(), -ChromKey) - } - - if (filter_mutations){ - filter = c('PASS') - } else { - filter = fix_field$FILTER %>% unique() - } - - calls = lapply( - samples_list, - function(s){ - gt_field_s = gt_field %>% - dplyr::filter(sample == s) - - if(nrow(fix_field) != nrow(gt_field_s)) - stop("Mismatch between the VCF fixed fields and the genotypes, will not process this file.") - - fits = list() - fits$sample = s - fits$mutations = dplyr::bind_cols(fix_field, gt_field_s) %>% - dplyr::select(chr, from, to, ref, alt, NV, DP, VAF, dplyr::everything()) %>% - dplyr::filter(FILTER %in% filter) - fits - }) - - names(calls) = samples_list - samples = c(tumour_id, normal_id) - calls = calls[samples] - return(calls) -} diff --git a/modules/local/viber/getters.R b/modules/local/viber/getters.R deleted file mode 100644 index 30a926b..0000000 --- a/modules/local/viber/getters.R +++ /dev/null @@ -1,82 +0,0 @@ -get_sample <- function(m_cnaqc_obj, - sample, - which_obj) { - if (class(m_cnaqc_obj) != "m_cnaqc") { - wrong_class_all = class(m_cnaqc_obj) - - cli::cli_abort( - c("cnaqc_objs must be a {.field m_cnaqc} object", - "x" = "{.var m_cnaqc_obj} is a {.cls {class(m_cnaqc_obj)}}") - ) - } - - consented_obj <- c("shared", "original") - - if ((which_obj %in% consented_obj) == FALSE) { - - cli::cli_abort("{.var which_obj} must be one of {.val shared} or {.val original}") - - } - - # define the element names - - if (which_obj == "original") { - - type = "original_cnaqc_objc" - - # check if the original cnaqc obj exist - - check_or = any(names(m_cnaqc_obj) == type) - - if (check_or == FALSE) { - cli::cli_abort(c("mCNAqc object was build without keeping original CNAqc objects"), - "x" = "It is not possible to retrieve the required samples") - } else { - - cli::cli_h1("Retrieving original {.cls CNAqc} objects") - cnaqc_samples = m_cnaqc_obj[[type]][sample] - - } - - } else { - - type = "cnaqc_obj_new_segmentation" - - cli::cli_h1("Retrieving {.cls CNAqc} objects with the new segmentation") - - cnaqc_samples = m_cnaqc_obj[[type]][sample] - } - - return(cnaqc_samples) -} - -get_sample_name <-function(x) { - - if (class(x) == "m_cnaqc") { - - lapply(x$cnaqc_obj_new_segmentation, function(y) { - - y$sample - - }) %>% unlist() %>% unname() - - } else if (class(x) == "cnaqc") { - - x$sample - - } else { - - wrong_class_all = class(x) - - cli::cli_abort( - c("must provide a {.field m_cnaqc} object", - "x" = "{.var x} is a {.cls {class(x)}}") - ) - } -} - -get_mCNAqc_stats <- function(m_cnaqc_obj){ - stats = m_cnaqc_obj[["m_cnaqc_stats"]] - return(stats) -} - From dd8583ccb6d74091e631731c8b292969ffa67ac6 Mon Sep 17 00:00:00 2001 From: Elena Buscaroli Date: Wed, 29 Jan 2025 16:31:58 +0100 Subject: [PATCH 6/8] deleting auxiliary scripts included in templates --- modules/local/SparseSignatures/getters.R | 81 ------------------------ 1 file changed, 81 deletions(-) delete mode 100644 modules/local/SparseSignatures/getters.R diff --git a/modules/local/SparseSignatures/getters.R b/modules/local/SparseSignatures/getters.R deleted file mode 100644 index 18a4313..0000000 --- a/modules/local/SparseSignatures/getters.R +++ /dev/null @@ -1,81 +0,0 @@ -get_sample <- function(m_cnaqc_obj, - sample, - which_obj) { - if (class(m_cnaqc_obj) != "m_cnaqc") { - wrong_class_all = class(m_cnaqc_obj) - - cli::cli_abort( - c("cnaqc_objs must be a {.field m_cnaqc} object", - "x" = "{.var m_cnaqc_obj} is a {.cls {class(m_cnaqc_obj)}}") - ) - } - - consented_obj <- c("shared", "original") - - if ((which_obj %in% consented_obj) == FALSE) { - - cli::cli_abort("{.var which_obj} must be one of {.val shared} or {.val original}") - - } - - # define the element names - - if (which_obj == "original") { - - type = "original_cnaqc_objc" - - # check if the original cnaqc obj exist - - check_or = any(names(m_cnaqc_obj) == type) - - if (check_or == FALSE) { - cli::cli_abort(c("mCNAqc object was build without keeping original CNAqc objects"), - "x" = "It is not possible to retrieve the required samples") - } else { - - cli::cli_h1("Retrieving original {.cls CNAqc} objects") - cnaqc_samples = m_cnaqc_obj[[type]][sample] - - } - - } else { - - type = "cnaqc_obj_new_segmentation" - - cli::cli_h1("Retrieving {.cls CNAqc} objects with the new segmentation") - - cnaqc_samples = m_cnaqc_obj[[type]][sample] - } - - return(cnaqc_samples) -} - -get_sample_name <-function(x) { - - if (class(x) == "m_cnaqc") { - - lapply(x$cnaqc_obj_new_segmentation, function(y) { - - y$sample - - }) %>% unlist() %>% unname() - - } else if (class(x) == "cnaqc") { - - x$sample - - } else { - - wrong_class_all = class(x) - - cli::cli_abort( - c("must provide a {.field m_cnaqc} object", - "x" = "{.var x} is a {.cls {class(x)}}") - ) - } -} - -get_mCNAqc_stats <- function(m_cnaqc_obj){ - stats = m_cnaqc_obj[["m_cnaqc_stats"]] - return(stats) -} From 7a1f850c01fe3875bedae527ce24ad22abf679fb Mon Sep 17 00:00:00 2001 From: Elena Buscaroli Date: Wed, 29 Jan 2025 16:32:44 +0100 Subject: [PATCH 7/8] updating docker containers and setting templates --- modules/local/CNAqc/main.nf | 5 +- modules/local/CNAqc2tsv/main.nf | 45 +---- modules/local/SigProfiler/SigProfiler/main.nf | 7 +- modules/local/SigProfiler/download/main.nf | 5 +- modules/local/SparseSignatures/main.nf | 188 +----------------- modules/local/annotate_driver/main.nf | 6 +- modules/local/cna2CNAqc/main.nf | 34 +--- modules/local/ctree/main.nf | 4 +- modules/local/get_positions/main.nf | 5 +- modules/local/get_positions/main_rel.nf | 5 +- modules/local/join_CNAqc/main.nf | 4 +- modules/local/join_positions/main.nf | 4 +- modules/local/mobsterh/main.nf | 119 +---------- modules/local/pyclonevi/main.nf | 48 +---- modules/local/tinc/main.nf | 4 +- modules/local/vcf2CNAqc/main.nf | 54 +---- modules/local/viber/main.nf | 181 +---------------- 17 files changed, 58 insertions(+), 660 deletions(-) diff --git a/modules/local/CNAqc/main.nf b/modules/local/CNAqc/main.nf index 845f401..f624b33 100755 --- a/modules/local/CNAqc/main.nf +++ b/modules/local/CNAqc/main.nf @@ -1,7 +1,9 @@ process CNAQC { tag "$meta.id" label "process_low" - container = 'docker://lvaleriani/cnaqc:version1.0' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://lvaleriani/cnaqc:version1.0' : + 'docker.io/lvaleriani/cnaqc:version1.0' }" input: tuple val(meta), path(cna_rds), path(snv_rds) @@ -13,7 +15,6 @@ process CNAQC { tuple val(meta), path("*_qc.pdf"), emit: plot_pdf_qc path "versions.yml", emit: versions - script: def args = task.ext.args def prefix = task.ext.prefix ?: "${meta.id}" diff --git a/modules/local/CNAqc2tsv/main.nf b/modules/local/CNAqc2tsv/main.nf index bb1b2cb..6b72a85 100644 --- a/modules/local/CNAqc2tsv/main.nf +++ b/modules/local/CNAqc2tsv/main.nf @@ -1,11 +1,9 @@ -// -// Mutations extraction from mCNAqc -// - process RDS_PROCESSING { tag "$meta.id" label "process_single" - container = 'docker://lvaleriani/cnaqc:version1.0' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://lvaleriani/cnaqc:version1.0' : + 'docker.io/lvaleriani/cnaqc:version1.0' }" input: tuple val(meta), path(rds_join), val(tumour_samples) @@ -14,41 +12,6 @@ process RDS_PROCESSING { tuple val(meta), path("*_joint_table.tsv"), val(tumour_samples), emit: tsv path "versions.yml", emit: versions - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - - """ - #!/usr/bin/env Rscript - library(dplyr) - library(CNAqc) - - source("$moduleDir/utils.R") - - multi_cnaqc = readRDS(file = "$rds_join") - mutations_multisample <- get_sample(m_cnaqc_obj = multi_cnaqc,sample = get_sample_name(multi_cnaqc), - which_obj = "original") - multisample_jointTable <- list() - - for (s in get_sample_name(multi_cnaqc)){ - purity <- mutations_multisample[[s]][["purity"]] - multisample_jointTable[[s]] <- mutations_multisample[[s]][["mutations"]] %>% - dplyr::mutate(purity=purity) - } - - joint_table <- bind_rows(multisample_jointTable) - - write.table(joint_table, file = paste0("$prefix","_joint_table.tsv"), append = F, quote = F, sep = "\t", row.names = FALSE) - - # version export - f <- file("versions.yml","w") - dplyr_version <- sessionInfo()\$otherPkgs\$dplyr\$Version - cnaqc_version <- sessionInfo()\$otherPkgs\$CNAqc\$Version - writeLines(paste0('"', "$task.process", '"', ":"), f) - writeLines(paste(" dplyr:", dplyr_version), f) - writeLines(paste(" CNAqc:", cnaqc_version), f) - close(f) - - """ + template "main_script.R" } diff --git a/modules/local/SigProfiler/SigProfiler/main.nf b/modules/local/SigProfiler/SigProfiler/main.nf index 16a949e..96fef8a 100644 --- a/modules/local/SigProfiler/SigProfiler/main.nf +++ b/modules/local/SigProfiler/SigProfiler/main.nf @@ -1,7 +1,9 @@ process SIGPROFILER { tag "$meta.id" label "process_high" - container = 'docker://katiad/sigprofiler:version1.0' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://katiad/sigprofiler:version1.0' : + 'docker.io/katiad/sigprofiler:version1.0' }" input: tuple val(meta), path(tsv_list, stageAs: '*.tsv') @@ -61,7 +63,6 @@ process SIGPROFILER { from SigProfilerExtractor import sigpro as sig from SigProfilerMatrixGenerator.scripts import SigProfilerMatrixGeneratorFunc as matGen - if __name__ == '__main__': dataset_id = "$meta.dataset" @@ -72,9 +73,7 @@ process SIGPROFILER { output_path = os.path.join("output", "SBS", f"{dataset_id}.SBS96.all") - # input data preprocessing - def process_tsv_join(tsv_list): patients_tsv = tsv_list.split() # Read each file into a pandas DataFrame and ensure all columns are of type 'string' diff --git a/modules/local/SigProfiler/download/main.nf b/modules/local/SigProfiler/download/main.nf index 7129917..42a7dd4 100644 --- a/modules/local/SigProfiler/download/main.nf +++ b/modules/local/SigProfiler/download/main.nf @@ -1,6 +1,8 @@ process DOWNLOAD_GENOME_SIGPROFILER { label "process_single" - container = 'docker://katiad/sigprofiler:version1.0' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://katiad/sigprofiler:version1.0' : + 'docker.io/katiad/sigprofiler:version1.0' }" input: val(reference_genome) // reference_genome : genome -> for example: GRCh37 @@ -9,7 +11,6 @@ process DOWNLOAD_GENOME_SIGPROFILER { path("*"), emit: genome_sigprofiler path "versions.yml", emit: versions - script: """ SigProfilerMatrixGenerator install $reference_genome -v . diff --git a/modules/local/SparseSignatures/main.nf b/modules/local/SparseSignatures/main.nf index 755948c..21a33a2 100644 --- a/modules/local/SparseSignatures/main.nf +++ b/modules/local/SparseSignatures/main.nf @@ -1,10 +1,11 @@ process SPARSE_SIGNATURES { tag "$meta.id" label "process_low_long" - container = 'docker://lvaleriani/sparsesignature:version1.0' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://lvaleriani/sparsesignature:version1.0' : + 'docker.io/lvaleriani/sparsesignature:version1.0' }" input: - tuple val(meta), path(tsv_join, stageAs: '*.tsv') output: @@ -16,186 +17,5 @@ process SPARSE_SIGNATURES { path "versions.yml", emit: versions script: - def args = task.ext.args - def prefix = task.ext.prefix ?: "${meta.id}" - def K = args!='' && args.K ? "$args.K" : "" - def background_signature = args!='' && args.background_signature ? "$args.background_signature" : "" - def beta = args!='' && args.beta ? "$args.beta" : "" - def normalize_counts = args!='' && args.normalize_counts ? "$args.normalize_counts" : "" - def nmf_runs = args!='' && args.nmf_runs ? "$args.nmf_runs" : "" - def iterations = args!='' && args.iterations ? "$args.iterations" : "" - def max_iterations_lasso = args!='' && args.max_iterations_lasso ? "$args.max_iterations_lasso" : "" - def num_processes = args!='' && args.num_processes ? "$args.num_processes" : "" - def cross_validation_entries = args!='' && args.cross_validation_entries ? "$args.cross_validation_entries" : "" - def cross_validation_repetitions = args!='' && args.cross_validation_repetitions ? "$args.cross_validation_repetitions" : "" - def cross_validation_iterations = args!='' && args.cross_validation_iterations ? "$args.cross_validation_iterations" : "" - def lambda_values_alpha = args!='' && args.lambda_values_alpha ? "$args.lambda_values_alpha" : "" - def lambda_values_beta = args!='' && args.lambda_values_beta ? "$args.lambda_values_beta" : "" - def lambda_rate_alpha = args!='' && args.lambda_rate_alpha ? "$args.lambda_rate_alpha" : "" - def verbose = args!='' && args.verbose ? "$args.verbose" : "" - def seed = args!='' && args.seed ? "$args.seed" : "" - - """ - #!/usr/bin/env Rscript - - library(SparseSignatures) - library(ggplot2) - library(stringr) - library(patchwork) - library(dplyr) - - source("$moduleDir/getters.R") - - n_procs = parse(text="$num_processes") - if (n_procs == "all"){ - n_procs = as.double("Inf") - } else { - n_procs = eval(n_procs) - } - - - patients_tsv = strsplit("$tsv_join", " ")[[1]] - tables = lapply(patients_tsv, FUN = function(p_table){ - read.delim(p_table, sep = "\\t", header=T) %>% - mutate(across(everything(), as.character)) - } - ) - multisample_table = dplyr::bind_rows(tables) - - - #Extract input data information - input_data <- multisample_table[,c("Indiv","chr","from","to","ref","alt")] - input_data <- setNames(input_data, c("sample","chrom","start","end","ref","alt")) - input_data[["end"]] <- input_data[["start"]] - input_data <- input_data %>% mutate(start = as.integer(start), end = as.integer(end)) - - #Generate the patient vs mutation count matrix from mutation data - #Load reference human-genome specification. - #The user must select, among the available choices, the reference genome consistent with the mutation dataset. - - load_genome <- function(genome, input_data) { - if (genome == "GRCh37") { - library(BSgenome.Hsapiens.1000genomes.hs37d5) - bsg <- BSgenome.Hsapiens.1000genomes.hs37d5::hs37d5 - input_data[["chrom"]] <- substring(input_data[["chrom"]], 4, 5) - - } else if (genome == "GRCh38") { - library(BSgenome.Hsapiens.UCSC.hg38) - bsg <- BSgenome.Hsapiens.UCSC.hg38 - - # Leave 'chrom' unchanged for GRCh38 - } - return(list(bsg = bsg, input_data = input_data)) - } - - data_list = load_genome("$params.genome", input_data) - bsg <- data_list[["bsg"]] - input_data <- data_list[["input_data"]] - - - mut_counts = SparseSignatures::import.trinucleotides.counts(data=input_data, reference=bsg) - - #Load a reference SBS5 background signature from COSMIC - data(background) - - #Estimate the initial values of beta - starting_betas = SparseSignatures::startingBetaEstimation(x = mut_counts, - K = eval(parse(text="$K")), - background_signature = background) - - - #Find the optimal number of signatures and sparsity level: rely on cross-validation - # higher number of CV repetitions corresponds to more accurate parameter estimates - - cv_out = SparseSignatures::nmfLassoCV( - x = mut_counts, - K = eval(parse(text="$K")), - starting_beta = starting_betas, - background_signature = background, - normalize_counts = as.logical("$normalize_counts"), - nmf_runs = as.integer("$nmf_runs"), - lambda_values_alpha = eval(parse(text="$lambda_values_alpha")), - lambda_values_beta = eval(parse(text="$lambda_values_beta")), - cross_validation_entries = as.numeric("$cross_validation_entries"), - cross_validation_iterations = as.integer("$cross_validation_iterations"), - cross_validation_repetitions = as.integer("$cross_validation_repetitions"), - iterations = as.integer("$iterations"), - max_iterations_lasso = as.integer("$max_iterations_lasso"), - num_processes = n_procs, - verbose = as.logical("$verbose"), - seed = as.integer("$seed") - ) - - - #Analyze the mean squared error results averaging over cross-validation repetitions - cv_mses <- cv_out[["grid_search_mse"]][1,,] - cv_means_mse <- matrix(sapply(cv_mses, FUN = mean), - nrow = dim(cv_mses)[1] - ) - - dimnames(cv_means_mse) <- dimnames(cv_mses) - - #Find the combination of parameters that yields the lowest MSE - min_ii <- which(cv_means_mse == min(cv_means_mse, na.rm = TRUE), arr.ind = TRUE) - min_Lambda_beta <- rownames(cv_means_mse)[min_ii[1]] - min_Lambda_beta <- as.numeric(gsub("_Lambda_Beta", "", min_Lambda_beta)) - min_K <- colnames(cv_means_mse)[min_ii[2]] - min_K <- as.numeric(gsub("_Signatures", "", min_K)) - best_params_config <- data.frame(min_K, min_Lambda_beta) - - saveRDS(object = cv_means_mse, file = paste0("$prefix", "_cv_means_mse.rds")) - saveRDS(object = best_params_config, file = paste0("$prefix", "_best_params_config.rds")) - - #Discovering the signatures within the dataset: NMF Lasso - #Compute the signatures for the best configuration. - - nmf_Lasso_out = SparseSignatures::nmfLasso( - x = mut_counts, - K = min_K, - beta = eval(parse(text="$beta")), - background_signature = background, - normalize_counts = as.logical("$normalize_counts"), - lambda_rate_alpha = eval(parse(text="$lambda_rate_alpha")), - lambda_rate_beta = min_Lambda_beta, - iterations = as.integer("$iterations"), - max_iterations_lasso = as.integer("$max_iterations_lasso"), - verbose = as.logical("$verbose") - ) - - saveRDS(object = nmf_Lasso_out, file = paste0("$prefix", "_nmf_Lasso_out.rds")) - - #Signature visualization - signatures = nmf_Lasso_out\$beta - plot_signatures <- SparseSignatures::signatures.plot(beta=signatures, xlabels=FALSE) - - plot_exposure = nmf_Lasso_out\$alpha %>% - as.data.frame() %>% - tibble::rownames_to_column(var="PatientID") %>% - tidyr::pivot_longer(cols=!"PatientID", names_to="Signatures", values_to="Exposures") %>% - - ggplot() + - geom_bar(aes(x=PatientID, y=Exposures, fill=Signatures), - position="stack", stat="identity") + - theme(axis.text.x=element_text(angle=90,hjust=1), - panel.background=element_blank(), - axis.line=element_line(colour="black")) - - plt_all = patchwork::wrap_plots(plot_exposure, plot_signatures, ncol=2) + patchwork::plot_annotation(title = "$meta.id") - ggplot2::ggsave(plot = plt_all, filename = paste0("$prefix", "_plot_all.pdf"), width = 210, height = 297, units="mm", dpi = 200) - saveRDS(object = plt_all, file = paste0("$prefix", "_plot_all.rds")) - - # version export - f <- file("versions.yml","w") - SparseSignatures_version <- sessionInfo()\$otherPkgs\$SparseSignatures\$Version - dplyr_version <- sessionInfo()\$otherPkgs\$dplyr\$Version - ggplot2_version <- sessionInfo()\$otherPkgs\$ggplot2\$Version - patchwork_version <- sessionInfo()\$otherPkgs\$patchwork\$Version - writeLines(paste0('"', "$task.process", '"', ":"), f) - writeLines(paste(" SparseSignatures:", SparseSignatures_version), f) - writeLines(paste(" dplyr:", dplyr_version), f) - writeLines(paste(" ggplot2:", ggplot2_version), f) - writeLines(paste(" patchwork:", patchwork_version), f) - close(f) - - """ + template "main_script.R" } diff --git a/modules/local/annotate_driver/main.nf b/modules/local/annotate_driver/main.nf index e58f58f..9a3b804 100644 --- a/modules/local/annotate_driver/main.nf +++ b/modules/local/annotate_driver/main.nf @@ -1,8 +1,9 @@ - process ANNOTATE_DRIVER { tag "$meta.id" label "process_single" - container = 'docker://lvaleriani/cnaqc:version1.0' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://lvaleriani/cnaqc:version1.0' : + 'docker.io/lvaleriani/cnaqc:version1.0' }" input: tuple val(meta), path(rds), path(driver_list) @@ -11,7 +12,6 @@ process ANNOTATE_DRIVER { tuple val(meta), path("*.rds"), emit: rds path "versions.yml", emit: versions - script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" diff --git a/modules/local/cna2CNAqc/main.nf b/modules/local/cna2CNAqc/main.nf index e6f009f..2101249 100755 --- a/modules/local/cna2CNAqc/main.nf +++ b/modules/local/cna2CNAqc/main.nf @@ -1,7 +1,9 @@ process CNA_PROCESSING { tag "$meta.id" label "process_single" - container "docker://lvaleriani/cnaqc:version1.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://lvaleriani/cnaqc:version1.0' : + 'docker.io/lvaleriani/cnaqc:version1.0' }" input: tuple val(meta), path(cna_segs), path(cna_extra) @@ -14,33 +16,5 @@ process CNA_PROCESSING { task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - - """ - #!/usr/bin/env Rscript - source(paste0("$moduleDir", '/parser_CNA.R')) - - if ("$meta.cna_caller" == 'sequenza'){ - CNA = parse_Sequenza(segments = "$cna_segs", extra = "$cna_extra") - - } else if ("$meta.cna_caller" == 'ASCAT'){ - CNA = parse_ASCAT(segments = "$cna_segs", extra = "$cna_extra") - - } else { - stop('Copy Number Caller not supported.') - } - - saveRDS(object = CNA, file = paste0("$prefix", "_cna.rds")) - - # version export - f <- file("versions.yml","w") - readr_version <- sessionInfo()\$otherPkgs\$readr\$Version - dplyr_version <- sessionInfo()\$otherPkgs\$dplyr\$Version - writeLines(paste0('"', "$task.process", '"', ":"), f) - writeLines(paste(" readr:", readr_version), f) - writeLines(paste(" dplyr:", dplyr_version), f) - close(f) - - """ + template "main_script.R" } diff --git a/modules/local/ctree/main.nf b/modules/local/ctree/main.nf index d9cfba5..080d8f1 100644 --- a/modules/local/ctree/main.nf +++ b/modules/local/ctree/main.nf @@ -1,7 +1,9 @@ process CTREE { tag "$meta.id" label "process_low" - container = 'docker://elenabuscaroli/ctree:version1.1' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://elenabuscaroli/ctree:version1.1' : + 'docker.io/elenabuscaroli/ctree:version1.1' }" input: tuple val(meta), path(ctree_input) diff --git a/modules/local/get_positions/main.nf b/modules/local/get_positions/main.nf index 097fe6e..d40e3ef 100644 --- a/modules/local/get_positions/main.nf +++ b/modules/local/get_positions/main.nf @@ -1,7 +1,9 @@ process GET_POSITIONS_ALL { tag "$meta.id" label "process_single" - container = 'docker://lvaleriani/cnaqc:version1.0' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://lvaleriani/cnaqc:version1.0' : + 'docker.io/lvaleriani/cnaqc:version1.0' }" input: tuple val(meta), path(rds_list, stageAs: '*.rds') @@ -10,7 +12,6 @@ process GET_POSITIONS_ALL { tuple val(meta), path("*_all_positions.rds"), emit: all_pos path "versions.yml", emit: versions - script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" diff --git a/modules/local/get_positions/main_rel.nf b/modules/local/get_positions/main_rel.nf index 833c9df..6445e94 100644 --- a/modules/local/get_positions/main_rel.nf +++ b/modules/local/get_positions/main_rel.nf @@ -1,7 +1,9 @@ process GET_POSITIONS_REL { tag "$meta.id" label "process_single" - container = 'docker://lvaleriani/cnaqc:dev1' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://lvaleriani/cnaqc:version1.0' : + 'docker.io/lvaleriani/cnaqc:version1.0' }" input: tuple val(meta), path(rds), path(all_pos) @@ -10,7 +12,6 @@ process GET_POSITIONS_REL { tuple val(meta), path("*_positions_missing"), emit: bed path "versions.yml", emit: versions - script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" diff --git a/modules/local/join_CNAqc/main.nf b/modules/local/join_CNAqc/main.nf index 3b80482..4223f54 100755 --- a/modules/local/join_CNAqc/main.nf +++ b/modules/local/join_CNAqc/main.nf @@ -1,7 +1,9 @@ process JOIN_CNAQC { tag "$meta.id" label "process_low" - container = 'docker://lvaleriani/cnaqc:version1.0' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://lvaleriani/cnaqc:version1.0' : + 'docker.io/lvaleriani/cnaqc:version1.0' }" input: tuple val(meta), path(rds_list, stageAs: '*.rds'), val(tumour_samples) diff --git a/modules/local/join_positions/main.nf b/modules/local/join_positions/main.nf index 60d23fd..5306e55 100644 --- a/modules/local/join_positions/main.nf +++ b/modules/local/join_positions/main.nf @@ -1,7 +1,9 @@ process JOIN_POSITIONS { tag "$meta.id" label "process_single" - container = 'docker://lvaleriani/cnaqc:version1.0' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://lvaleriani/cnaqc:version1.0' : + 'docker.io/lvaleriani/cnaqc:version1.0' }" input: tuple val(meta), path(rds), path(vcf_pileup), path(positions) diff --git a/modules/local/mobsterh/main.nf b/modules/local/mobsterh/main.nf index 8352053..aca453a 100644 --- a/modules/local/mobsterh/main.nf +++ b/modules/local/mobsterh/main.nf @@ -1,7 +1,9 @@ process MOBSTERh { tag "$meta.id" label "process_single" - container = 'docker://elenabuscaroli/mobster:version1.0' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://elenabuscaroli/mobster:version1.0' : + 'docker.io/elenabuscaroli/mobster:version1.0' }" input: tuple val(meta), path(rds_join) // rds from JOIN_CNAQC @@ -16,118 +18,5 @@ process MOBSTERh { path "versions.yml", emit: versions script: - def args = task.ext.args ?: "" - def prefix = task.ext.prefix ?: "${meta.id}" - def K = args!="" && args.K ? "$args.K" : "" - def samples = args!="" && args.samples ? "$args.samples" : "" - def init = args!="" && args.init ? "$args.init" : "" - def tail = args!="" && args.tail ? "$args.tail" : "" - def epsilon = args!="" && args.epsilon ? "$args.epsilon" : "" - def maxIter = args!="" && args.maxIter ? "$args.maxIter" : "" - def fit_type = args!="" && args.fit_type ? "$args.fit_type" : "" - def seed = args!="" && args.seed ? "$args.seed" : "" - def model_selection = args!="" && args.model_selection ? "$args.model_selection" : "" - def trace = args!="" && args.trace ? "$args.trace" : "" - def parallel = args!="" && args.parallel ? "$args.parallel" : "" - def pi_cutoff = args!="" && args.pi_cutoff ? "$args.pi_cutoff" : "" - def n_cutoff = args!="" && args.n_cutoff ? "$args.n_cutoff" : "" - def auto_setup = args!="" && args.auto_setup ? "$args.auto_setup" : "" - def silent = args!="" && args.silent ? "$args.silent" : "" - - """ - #!/usr/bin/env Rscript - - # Sys.setenv("VROOM_CONNECTION_SIZE"=99999999) - library(CNAqc) - library(mobster) - library(dplyr) - library(ggplot2) - source("$moduleDir/getters.R") - patientID = description = "$meta.patient" - samples = strsplit(x = "$meta.tumour_sample", ",") %>% unlist() # list of samples - - ## read mCNAqc object - if ( grepl(".rds\$", tolower("$rds_join")) ) { - obj = readRDS("$rds_join") - if (class(obj) == "m_cnaqc") { - original = obj %>% get_sample(sample=samples, which_obj="original") - input_table = lapply(names(original), - function(sample_name) - original[[sample_name]] %>% - # keep only mutations on the diploid karyotype - CNAqc::subset_by_segment_karyotype("1:1") %>% - CNAqc::Mutations() %>% - dplyr::mutate(sample_id=sample_name) - ) %>% dplyr::bind_rows() - } else { - cli::cli_abort("Object of class {class($rds_join)} not supported.") - } - } else { - input_table = read.csv("$rds_join") - } - - ## Function to run a single mobster fit - run_mobster_fit = function(joint_table, descr) { - # get input table for the single patient - inp_tb = joint_table %>% - dplyr::filter(VAF < 1) %>% - # dplyr::mutate(VAF=replace(VAF, VAF==0, 1e-7)) %>% - dplyr::filter(VAF!=0) %>% - dplyr::filter(karyotype=="1:1") - # dplyr::rename(variantID=gene) %>% - # dplyr::rename(is.driver=is_driver) %>% - # dplyr::rename(tumour_content=purity) %>% - # dplyr::rename(is_driver=is.driver) - # dplyr::rename(is_driver=is.driver, driver_label=variantID) - - mobster_fit(x = inp_tb, - K = eval(parse(text="$K")), - samples = as.integer("$samples"), - init = "$init", - tail = eval(parse(text="$tail")), - epsilon = as.numeric("$epsilon"), - maxIter = as.integer("$maxIter"), - fit.type = "$fit_type", - seed = as.integer("$seed"), - model.selection = "$model_selection", - trace = as.logical("$trace"), - parallel = as.logical("$parallel"), - pi_cutoff = as.numeric("$pi_cutoff"), - N_cutoff = as.integer("$n_cutoff"), - auto_setup = eval(parse(text="$auto_setup")), - silent = as.logical("$silent"), - description = descr) - } - - lapply(samples, function(sample_name) { - # dir.create(outDir_sample, recursive = TRUE) - - fit = run_mobster_fit(joint_table=input_table %>% dplyr::filter(sample_id == !!sample_name), - descr=description) - - best_fit = fit[["best"]] - plot_fit = plot(best_fit) - - saveRDS(object=fit, file=paste0("$prefix", "_mobsterh_st_fit.rds")) - saveRDS(object=best_fit, file=paste0("$prefix", "_mobsterh_st_best_fit.rds")) - saveRDS(object=plot_fit, file=paste0("$prefix", "_mobsterh_st_best_fit_plots.rds")) - - # save report plots - report_fig = mobster::plot_model_selection(fit) - saveRDS(report_fig, file=paste0("$prefix", "_REPORT_plots_mobster.rds")) - ggplot2::ggsave(plot=report_fig, filename=paste0("$prefix", "_REPORT_plots_mobster.pdf"), height=210, width=210, units="mm", dpi = 200) - ggplot2::ggsave(plot=report_fig, filename=paste0("$prefix", "_REPORT_plots_mobster.png"), height=210, width=210, units="mm", dpi = 200) - }) - - - # version export - f = file("versions.yml","w") - cnaqc_version = sessionInfo()\$otherPkgs\$CNAqc\$Version - mobster_version = sessionInfo()\$otherPkgs\$mobster\$Version - writeLines(paste0('"', "$task.process", '"', ":"), f) - writeLines(paste(" CNAqc:", cnaqc_version), f) - writeLines(paste(" mobster:", mobster_version), f) - close(f) - """ - + template "main_script.R" } diff --git a/modules/local/pyclonevi/main.nf b/modules/local/pyclonevi/main.nf index 1e77925..de95471 100755 --- a/modules/local/pyclonevi/main.nf +++ b/modules/local/pyclonevi/main.nf @@ -1,7 +1,9 @@ process PYCLONEVI { tag "$meta.id" label "process_low" - container = 'https://depot.galaxyproject.org/singularity/pyclone-vi%3A0.1.3--pyhca03a8a_0' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/pyclone-vi%3A0.1.3--pyhca03a8a_0' : + 'docker.io/blcdsdockerregistry/pyclone-vi:0.1.2' }" input: tuple val(meta), path(rds_join), val(tumour_samples) @@ -14,47 +16,5 @@ process PYCLONEVI { path "versions.yml", emit: versions script: - def args = task.ext.args ?: '' - // def prefix = task.ext.prefix ?:"${meta.id}_remove_tail_$args.remove_tail" - def prefix = task.ext.prefix ?:"${meta.id}" - def n_cluster_arg = args.n_cluster ? "$args.n_cluster" : "" - def density_arg = args.density ? "$args.density" : "" - def n_grid_point_arg = args.n_grid_point ? "$args.n_grid_point" : "" - def n_restarts_arg = args.n_restarts ? "$args.n_restarts" : "" - sampleID_string = tumour_samples.join(" ") - - """ - # format the input table in order to be pyclone compliant - python3 $moduleDir/pyclone_utils.py create_pyclone_input $rds_join $meta.patient ${prefix}_pyclone_input_all_samples.tsv - - colnames=\$(head -n 1 ${prefix}_pyclone_input_all_samples.tsv) - - column_number=\$(echo -e "\$colnames" | awk -v col_name="sample_id" 'BEGIN { FS = "\t" } { - for (i = 1; i <= NF; i++) { - if (\$i == col_name) { - print i - exit - } - } - exit 1 # Exit with error if column not found - }') - - echo -e "\$colnames" > ${prefix}_pyclone_input.tsv - for i in $sampleID_string; - do awk -F "\t" '\$'\$column_number' == "'"\$i"'"' ${prefix}_pyclone_input_all_samples.tsv >> ${prefix}_pyclone_input.tsv; - done - - pyclone-vi fit -i ${prefix}_pyclone_input.tsv -o ${prefix}_all_fits.h5 -c $n_cluster_arg -d $density_arg --num-grid-points $n_grid_point_arg --num-restarts $n_restarts_arg - pyclone-vi write-results-file -i ${prefix}_all_fits.h5 -o ${prefix}_best_fit.txt - - python3 $moduleDir/pyclone_ctree.py --joint ${prefix}_pyclone_input.tsv --best_fit ${prefix}_best_fit.txt --ctree_input ${prefix}_cluster_table.csv - - - VERSION=\$(pip show pyclone-vi | grep Version | awk '{print \$NF}') - cat <<-END_VERSIONS > versions.yml - "${task.process}": - pyclone-vi: \$VERSION - END_VERSIONS - """ - + template "main_script.py" } diff --git a/modules/local/tinc/main.nf b/modules/local/tinc/main.nf index cf63ad5..bbbc6ba 100755 --- a/modules/local/tinc/main.nf +++ b/modules/local/tinc/main.nf @@ -1,7 +1,9 @@ process TINC { tag "$meta.id" label "process_single" - container 'docker://vvvirgy/tinc:v2' // define the running container + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://vvvirgy/tinc:v2' : + 'docker.io/vvvirgy/tinc:v2' }" // conda "${moduleDir}/environment.yml" input: diff --git a/modules/local/vcf2CNAqc/main.nf b/modules/local/vcf2CNAqc/main.nf index f9bec2f..8f533e8 100755 --- a/modules/local/vcf2CNAqc/main.nf +++ b/modules/local/vcf2CNAqc/main.nf @@ -1,7 +1,9 @@ process VCF_PROCESSING { tag "$meta.id" label "process_single" - container "docker://lvaleriani/cnaqc:version1.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://lvaleriani/cnaqc:version1.0' : + 'docker.io/lvaleriani/cnaqc:version1.0' }" input: tuple val(meta), path(vcf), path(tbi) @@ -14,54 +16,6 @@ process VCF_PROCESSING { task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def filter_mutations = args.filter_mutations ? "$args.filter_mutations": "NULL" - - """ - #!/usr/bin/env Rscript - - library(dplyr) - library(tidyr) - library(vcfR) - - source(paste0("$moduleDir", '/parser_vcf.R')) - - # Read vcf file - vcf = vcfR::read.vcfR("$vcf") - - # Check from which caller the .vcf has been produced - source = vcfR::queryMETA(vcf, element = 'source')[[1]] - - if (TRUE %in% grepl(pattern = 'Mutect', x = source)){ - calls = parse_Mutect(vcf, tumour_id = "$meta.tumour_sample", normal_id = "$meta.normal_sample", filter_mutations = as.logical("$filter_mutations")) - - } else if (TRUE %in% grepl(pattern = 'strelka', x = source)){ - calls = parse_Strelka(vcf, tumour_id = "$meta.tumour_sample", normal_id = "$meta.normal_sample", filter_mutations = as.logical("$filter_mutations")) - - } else if (TRUE %in% grepl(pattern = 'Platypus', x = source)){ - calls = parse_Platypus(vcf, tumour_id = "$meta.tumour_sample", normal_id = "$meta.normal_sample", filter_mutations = as.logical("$filter_mutations")) - - } else if (TRUE %in% grepl(pattern = 'freeBayes', x = source)){ - calls = parse_FreeBayes(vcf, tumour_id = "$meta.tumour_sample", normal_id = "$meta.normal_sample", filter_mutations = as.logical("$filter_mutations")) - - } else { - stop('Variant Caller not supported.') - } - - saveRDS(object = calls, file = paste0("$prefix", "_snv.rds")) - - # version export - f <- file("versions.yml","w") - dplyr_version <- sessionInfo()\$otherPkgs\$dplyr\$Version - tidyr_version <- sessionInfo()\$otherPkgs\$tidyr\$Version - vcfR_version <- sessionInfo()\$otherPkgs\$vcfR\$Version - writeLines(paste0('"', "$task.process", '"', ":"), f) - writeLines(paste(" dplyr:", dplyr_version), f) - writeLines(paste(" tidyr:", tidyr_version), f) - writeLines(paste(" vcfR:", vcfR_version), f) - close(f) - - """ + template "main_script.R" } diff --git a/modules/local/viber/main.nf b/modules/local/viber/main.nf index 66a07e5..6d1153f 100644 --- a/modules/local/viber/main.nf +++ b/modules/local/viber/main.nf @@ -1,7 +1,9 @@ process VIBER { tag "$meta.id" label "process_single" - container = 'docker://elenabuscaroli/viber:version0.1' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://elenabuscaroli/viber:version0.1' : + 'docker.io/elenabuscaroli/viber:version0.1' }" input: tuple val(meta), path(rds_join), val(tumour_samples) //rds from either JOIN_CNAQC or JOIN_FIT, should be always grouped @@ -17,26 +19,7 @@ process VIBER { path "versions.yml", emit: versions script: - // viber fit params - def args = task.ext.args ?: "" - def prefix = task.ext.prefix ?:"${meta.id}" - def K = args!="" && args.K ? "$args.K" : "" - def alpha_0 = args!="" && args.alpha_0 ? "$args.alpha_0" : "" - def a_0 = args!="" && args.a_0 ? "$args.a_0" : "" - def b_0 = args!="" && args.b_0 ? "$args.b_0" : "" - def maxIter = args!="" && args.maxIter ? "$args.maxIter" : "" - def epsilon_conv = args!="" && args.epsilon_conv ? "$args.epsilon_conv" : "" - def samples = args!="" && args.samples ? "$args.samples" : "" - def q_init = args!="" && args.q_init ? "$args.q_init" : "" - def trace = args!="" && args.trace ? "$args.trace" : "" - // viber filter params - def binomial_cutoff = args!="" && args.binomial_cutoff ? "$args.binomial_cutoff" : "" - def dimensions_cutoff = args!="" && args.dimensions_cutoff ? "$args.dimensions_cutoff" : "" - def pi_cutoff = args!="" && args.pi_cutoff ? "$args.pi_cutoff" : "" - def re_assign = args!="" && args.re_assign ? "$args.re_assign" : "" - // def mode = args.mode ? "$args.mode" : "" def n_samples = tumour_samples.size() - if (n_samples==1) { plot1 = "viber_best_st_mixing_plots.rds" plot2 = "viber_best_st_heuristic_mixing_plots.rds" @@ -45,161 +28,5 @@ process VIBER { plot2 = "viber_best_st_heuristic_fit_plots.rds" } - """ - #!/usr/bin/env Rscript - - # Sys.setenv("VROOM_CONNECTION_SIZE"=99999999) - - library(VIBER) - library(dplyr) - library(tidyverse) - library(ggplot2) - library(CNAqc) - source("$moduleDir/getters.R") - - patientID = "$meta.patient" - samples = substr("$tumour_samples", 2, nchar("$tumour_samples")-1) - samples = strsplit(samples, ", ")[[1]] - print("$tumour_samples") - print("$rds_join") - print(samples) - - if ( grepl(".rds\$", tolower("$rds_join")) ) { - input_obj = readRDS("$rds_join") - if (class(input_obj) == "m_cnaqc") { - shared = input_obj %>% get_sample(sample=samples, which_obj="shared") - joint_table = lapply(names(shared), - function(sample_name) - shared[[sample_name]] %>% - # keep only mutations on the diploid karyotype - CNAqc::subset_by_segment_karyotype("1:1") %>% - CNAqc::Mutations() %>% - dplyr::mutate(sample_id=sample_name) - ) %>% dplyr::bind_rows() - } else { - cli::cli_alert_warning("Object of class {class(input_obj)} not supported.") - return() - } - } - - print("Subset joint done") - - ## TODO : add drivers to `input_tab` - - ## Read input joint table - input_tab = joint_table %>% - dplyr::mutate(VAF=replace(VAF, VAF==0, 1e-7)) - - ## Convert the input table into longer format - reads_data = input_tab %>% - dplyr::select(chr, from, ref, alt, NV, DP, VAF, sample_id) %>% - tidyr::pivot_wider(names_from="sample_id", - values_from=c("NV","DP","VAF"), names_sep=".") - - ## Extract DP (depth) - dp = reads_data %>% - # dplyr::filter(mutation_id %in% non_tail) %>% ## this step should be managed before by other module - dplyr::select(dplyr::starts_with("DP")) %>% - dplyr::mutate(dplyr::across(.cols=dplyr::everything(), - .fns=function(x) replace(x, is.na(x), 0))) %>% - dplyr::rename_all(function(x) stringr::str_remove_all(x,"DP.")) - - ## Extract NV (alt_counts) - nv = reads_data %>% - # dplyr::filter(mutation_id %in% non_tail) %>% ## this step should be managed before by other module - dplyr::select(dplyr::starts_with("NV")) %>% - dplyr::mutate(dplyr::across(.cols=dplyr::everything(), - .fns=function(x) replace(x, is.na(x), 0))) %>% - dplyr::rename_all(function(x) stringr::str_remove_all(x,"NV.")) - - # Standard fit - viber_K = eval(parse(text="$K")) - viber_K[which.min(viber_K)] = 2 - st_fit = VIBER::variational_fit(nv, dp, - K=viber_K, - data=reads_data, - # %>% dplyr::filter(mutation_id %in% non_tail) - alpha_0=as.numeric("$alpha_0"), - a_0=as.integer("$a_0"), - b_0=as.integer("$b_0"), - max_iter=as.integer("$maxIter"), - epsilon_conv=as.numeric("$epsilon_conv"), - samples=as.integer("$samples"), - q_init="$q_init", - trace=as.logical("$trace"), - description="" - ) - - best_fit = best_fit_heuristic = st_fit - - # If all clusters are removed -> keep the origianl best fit - tryCatch(expr = { - # Apply the heuristic - best_fit_heuristic = VIBER::choose_clusters(st_fit, - binomial_cutoff=as.numeric("$binomial_cutoff"), - dimensions_cutoff=as.integer("$dimensions_cutoff"), - pi_cutoff=as.numeric("$pi_cutoff"), - re_assign=as.logical("$re_assign") - ) - }, error = function(e) { - print(e) - best_fit_heuristic <<- st_fit - } ) - - # Save fits - saveRDS(best_fit, file=paste0("$prefix", "_viber_best_st_fit.rds")) - saveRDS(best_fit_heuristic, file = paste0("$prefix", "_viber_best_st_heuristic_fit.rds")) - - # Save plots - if ("$n_samples" >1) { #mutlisample mode on - print("multisample mode on") - plot_fit = plot(best_fit) - plot_fit_heuristic = plot(best_fit_heuristic) - - saveRDS(plot_fit, file=paste0("$prefix", "_viber_best_st_fit_plots.rds")) - saveRDS(plot_fit_heuristic, file=paste0("$prefix", "_viber_best_st_heuristic_fit_plots.rds")) - } else { - plot_fit_mixing = plot_mixing_proportions(best_fit) - plot_fit_mixing_heuristic = plot_mixing_proportions(best_fit_heuristic) - - saveRDS(plot_fit_mixing, file=paste0("$prefix", "_viber_best_st_mixing_plots.rds")) - saveRDS(plot_fit_mixing_heuristic, file=paste0("$prefix", "_viber_best_st_heuristic_mixing_plots.rds")) - } - - # Save report plot - n_samples = ncol(best_fit[["x"]]) - 1 - marginals = multivariate = ggplot() - - try(expr = {marginals <<- VIBER::plot_1D(best_fit)} ) - #try(expr = {multivariate = plot(best_fit) %>% patchwork::wrap_plots()} ) - #top_p = patchwork::wrap_plots(marginals, multivariate, design=ifelse(n_samples>2, "ABB", "AAB")) - - try(expr = {multivariate = plot(best_fit)}) - try(expr = {multivariate = ggpubr::ggarrange(plotlist = multivariate)}) - top_p = ggpubr::ggarrange(plotlist = list(marginals, multivariate), widths=ifelse(n_samples>2, c(1,2), c(2,1))) - - mix_p = VIBER::plot_mixing_proportions(best_fit) - binom = VIBER::plot_peaks(best_fit) - elbo = VIBER::plot_ELBO(best_fit) - #bottom_p = patchwork::wrap_plots(mix_p, binom, elbo, design="ABBBC") - bottom_p = ggpubr::ggarrange(plotlist = list(mix_p, binom, elbo), widths = c(1,2,1), nrow = 1) - - #report_fig = patchwork::wrap_plots(top_p, bottom_p, design=ifelse(n_samples>2, "AAAB", "AAB")) - report_fig = ggpubr::ggarrange(top_p, bottom_p, nrow = 2, heights = ifelse(n_samples>2, c(3,1), c(2,1))) - saveRDS(report_fig, file=paste0("$prefix", "_REPORT_plots_viber.rds")) - ggplot2::ggsave(plot=report_fig, filename=paste0("$prefix", "_REPORT_plots_viber.pdf"), height=210, width=210, units="mm", dpi = 200) - ggplot2::ggsave(plot=report_fig, filename=paste0("$prefix", "_REPORT_plots_viber.png"), height=210, width=210, units="mm", dpi = 200) - - - # version export - f = file("versions.yml","w") - cnaqc_version = sessionInfo()\$otherPkgs\$CNAqc\$Version - viber_version = sessionInfo()\$otherPkgs\$VIBER\$Version - writeLines(paste0('"', "$task.process", '"', ":"), f) - writeLines(paste(" CNAqc:", cnaqc_version), f) - writeLines(paste(" VIBER:", viber_version), f) - close(f) - - """ - + template "main_script.R" } From 21c673228fd9fe5f7481c34de49b1aab9c46b0d5 Mon Sep 17 00:00:00 2001 From: Elena Buscaroli Date: Wed, 29 Jan 2025 16:32:57 +0100 Subject: [PATCH 8/8] update nextflow.config --- nextflow.config | 3 --- 1 file changed, 3 deletions(-) diff --git a/nextflow.config b/nextflow.config index 4725494..9c71447 100644 --- a/nextflow.config +++ b/nextflow.config @@ -527,6 +527,3 @@ validation { afterText = validation.help.afterText } } - -// Load modules.config for DSL2 module specific options -includeConfig 'conf/modules.config'