Skip to content

Commit

Permalink
update oncogenicity classification
Browse files Browse the repository at this point in the history
  • Loading branch information
sigven committed Feb 21, 2025
1 parent 5c81c03 commit ff35ce3
Show file tree
Hide file tree
Showing 23 changed files with 216 additions and 85 deletions.
8 changes: 2 additions & 6 deletions pcgr/annoutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,14 +303,10 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
if '-' in cds_pos_full and not '?' in cds_pos_full:
cds_pos = cds_pos_full.split('-')[0]
if cds_pos.isdigit():
cds_pos = int(cds_pos)
#else:
# logger.warning(f'Could not determine variant CDS position from VEP annotation - ({csq_record["CDS_position"]})')
cds_pos = int(cds_pos)
else:
if cds_pos_full.isdigit():
cds_pos = int(cds_pos_full)
#else:
# logger.warning(f'Could not determine variant CDS position from VEP annotation - ({csq_record["CDS_position"]})')
cds_pos = int(cds_pos_full)

if int(cds_pos) > -1 and int(cds_pos) <= int(cds_length):
csq_record['CDS_RELATIVE_POSITION'] = float(cds_pos/cds_length)
Expand Down
16 changes: 8 additions & 8 deletions pcgr/oncogenicity.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,21 +318,19 @@ def assign_oncogenicity_evidence(rec = None, oncogenicity_criteria = None, tumor
## check if variant has MAF > 0.01 (SBVS1) or > 0.05 in any of five major gnomAD subpopulations (exome set)
for pop in pcgr_vars.GNOMAD_MAIN_EXON_AF_TAGS:
if not variant_data[pop] is None:
if float(variant_data[pop]) >= pcgr_vars.ONCOGENICITY['gnomAD_very_common_AF']:
if float(variant_data[pop]) >= float(pcgr_vars.ONCOGENICITY['gnomAD_very_common_AF']):
variant_data["ONCG_SBVS1"] = True
for pop in pcgr_vars.GNOMAD_MAIN_EXON_AF_TAGS:
if not variant_data[pop] is None:
if float(variant_data[pop]) >= pcgr_vars.ONCOGENICITY['gnomAD_common_AF'] and variant_data["ONCG_SBVS1"] is False:
if float(variant_data[pop]) >= float(pcgr_vars.ONCOGENICITY['gnomAD_common_AF']) and variant_data["ONCG_SBVS1"] is False:
variant_data["ONCG_SBS1"] = True

approx_zero_pop_freq = 0
for pop in pcgr_vars.GNOMAD_MAIN_EXON_AF_TAGS:
## no MAF recorded in gnomAD for this population
if variant_data[pop] is None:
approx_zero_pop_freq = approx_zero_pop_freq + 1
else:
## Extremely low MAF for this population
if float(variant_data[pop]) < pcgr_vars.ONCOGENICITY['gnomAD_extremely_rare_AF']:
if float(variant_data[pop]) < float(pcgr_vars.ONCOGENICITY['gnomAD_extremely_rare_AF']):
approx_zero_pop_freq = approx_zero_pop_freq + 1

## check if variant is missing or with MAF approximately zero in all five major gnomAD subpopulations (exome set)
Expand Down Expand Up @@ -384,7 +382,7 @@ def assign_oncogenicity_evidence(rec = None, oncogenicity_criteria = None, tumor
variant_data['ONCOGENICITY'] = "VUS"
variant_data["ONCOGENICITY_DOC"] = "."
variant_data["ONCOGENICITY_CODE"] = "."
variant_data["ONCOGENICITY_SCORE"] = "."
variant_data["ONCOGENICITY_SCORE"] = 0
onc_score_pathogenic = 0
onc_score_benign = 0

Expand Down Expand Up @@ -460,7 +458,9 @@ def load_oncogenic_variants(oncogenic_variants_fname: str, logger: Logger):
with gzip.open(oncogenic_variants_fname, mode='rt') as f:
reader = csv.DictReader(f, delimiter='\t')
for row in reader:
gene = str(row['entrezgene'])
gene = str(row['entrezgene'])
if not 'oncogenic' in str(row['oncogenicity']).lower():
continue
oncogenic_variants[str(gene) + '-' + str(row['var_id'])] = row
if not len(row['hgvsp']) == 0:
oncogenic_variants[str(gene) + '-' + str(row['hgvsp'])] = row
Expand Down
4 changes: 2 additions & 2 deletions pcgr/pcgr_vars.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

## Version - software and bundle
PCGR_VERSION = __version__
DB_VERSION = '20250217'
DB_VERSION = '20250221'

## Miscellaneous settings
NCBI_BUILD_MAF = 'GRCh38'
Expand Down Expand Up @@ -222,7 +222,7 @@
r"(stop_(lost|gained)|start_lost|frameshift_|missense_|splice_(donor|acceptor)|protein_altering|inframe_)"
CSQ_CODING_SILENT_PATTERN2 = \
r"(stop_(lost|gained)|start_lost|frameshift_|missense_|splice_(donor|acceptor)|protein_altering|inframe_|synonymous|(start|stop)_retained)"
CSQ_NULL_PATTERN = r"^(stop_gained|frameshift_)"
CSQ_NULL_PATTERN = r"^(stop_gained|frameshift_)|&stop_gained"
CSQ_SPLICE_REGION_PATTERN = r"(splice_|intron_variant)"
CSQ_SPLICE_DONOR_PATTERN = \
r"(splice_region_variant|splice_donor_variant|splice_donor_region_variant|splice_donor_5th_base_variant)"
Expand Down
1 change: 1 addition & 0 deletions pcgrr/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ export(append_dbmts_var_link)
export(append_dbnsfp_var_link)
export(append_drug_var_link)
export(append_gwas_citation_phenotype)
export(append_oncogenicity_docs)
export(append_targeted_drug_annotations)
export(append_tcga_var_link)
export(append_tfbs_annotation)
Expand Down
4 changes: 4 additions & 0 deletions pcgrr/R/data.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@
"variant_db_url"


#' Oncogenicity criteria (ClinGen/CGC/VICC)
#'
"oncogenicity_criteria"

#' Fixed data types/categories used for biomarker evidence, e.g. 'types','levels' etc.
#'
"biomarker_evidence"
Expand Down
44 changes: 23 additions & 21 deletions pcgrr/R/input_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,8 @@ load_somatic_snv_indel <- function(
callset[['variant_display']] <- callset[['variant']] |>
pcgrr::append_cancer_gene_evidence(
ref_data = ref_data) |>
pcgrr::append_oncogenicity_docs(
ref_data = ref_data) |>
pcgrr::append_dbmts_var_link() |>
pcgrr::append_tcga_var_link() |>
pcgrr::append_annotation_links() |>
Expand Down Expand Up @@ -833,7 +835,7 @@ load_dna_variants <- function(
c("VARIANT_ID", "ENTREZGENE","BIOMARKER_SOURCE")),
by = c("VARIANT_ID","BIOMARKER_SOURCE"),
relationship = "many-to-many") |>
dplyr::rename(BIOMARKER_MATCH = BIOMARKER_MATCHTYPE) |>
dplyr::rename("BIOMARKER_MATCH" = "BIOMARKER_MATCHTYPE") |>
dplyr::mutate(BIOMARKER_RESOLUTION = dplyr::case_when(
stringr::str_detect(.data$BIOMARKER_MATCH,"by_cna_segment") ~ "gene",
stringr::str_detect(.data$BIOMARKER_MATCH,"by_genomic_coord") ~ "genomic",
Expand Down Expand Up @@ -918,26 +920,26 @@ load_dna_variants <- function(
relationship = "many-to-many"
) |>
dplyr::rename(
BM_VARIANT_ID = VARIANT_ID,
BM_EVIDENCE_ID = EVIDENCE_ID,
BM_SOURCE_DB = BIOMARKER_SOURCE,
BM_RESOLUTION = BIOMARKER_RESOLUTION,
BM_MATCH = BIOMARKER_MATCH,
BM_PRIMARY_SITE = PRIMARY_SITE,
BM_EVIDENCE_TYPE = EVIDENCE_TYPE,
BM_CANCER_TYPE = CANCER_TYPE,
BM_DISEASE_ONTOLOGY_ID = DISEASE_ONTOLOGY_ID,
BM_VARIANT_ORIGIN = VARIANT_ORIGIN,
BM_EVIDENCE_LEVEL = EVIDENCE_LEVEL,
BM_EVIDENCE_DESCRIPTION = EVIDENCE_DESCRIPTION,
BM_THERAPEUTIC_CONTEXT = THERAPEUTIC_CONTEXT,
BM_CLINICAL_SIGNIFICANCE = CLINICAL_SIGNIFICANCE,
BM_CITATION = CITATION,
BM_REFERENCE = CITATION_HTML,
BM_RATING = RATING,
BM_EVIDENCE_DIRECTION = EVIDENCE_DIRECTION,
BM_MOLECULAR_PROFILE = MOLECULAR_PROFILE_NAME,
BM_MOLECULAR_PROFILE_TYPE = MOLECULAR_PROFILE_TYPE
"BM_VARIANT_ID" = "VARIANT_ID",
"BM_EVIDENCE_ID" = "EVIDENCE_ID",
"BM_SOURCE_DB" = "BIOMARKER_SOURCE",
"BM_RESOLUTION" = "BIOMARKER_RESOLUTION",
"BM_MATCH" = "BIOMARKER_MATCH",
"BM_PRIMARY_SITE" = "PRIMARY_SITE",
"BM_EVIDENCE_TYPE" = "EVIDENCE_TYPE",
"BM_CANCER_TYPE" = "CANCER_TYPE",
"BM_DISEASE_ONTOLOGY_ID" = "DISEASE_ONTOLOGY_ID",
"BM_VARIANT_ORIGIN" = "VARIANT_ORIGIN",
"BM_EVIDENCE_LEVEL" = "EVIDENCE_LEVEL",
"BM_EVIDENCE_DESCRIPTION" = "EVIDENCE_DESCRIPTION",
"BM_THERAPEUTIC_CONTEXT" = "THERAPEUTIC_CONTEXT",
"BM_CLINICAL_SIGNIFICANCE" = "CLINICAL_SIGNIFICANCE",
"BM_CITATION" = "CITATION",
"BM_REFERENCE" = "CITATION_HTML",
"BM_RATING" = "RATING",
"BM_EVIDENCE_DIRECTION" = "EVIDENCE_DIRECTION",
"BM_MOLECULAR_PROFILE" = "MOLECULAR_PROFILE_NAME",
"BM_MOLECULAR_PROFILE_TYPE" = "MOLECULAR_PROFILE_TYPE"
) |>
dplyr::mutate(
BM_RATING = dplyr::if_else(
Expand Down
2 changes: 1 addition & 1 deletion pcgrr/R/mutational_signatures.R
Original file line number Diff line number Diff line change
Expand Up @@ -926,7 +926,7 @@ plot_signature_contributions <- function(
ymax = prop_signature_ci_upper),
width = .3)+
ggplot2::scale_fill_manual(
values = head(
values = utils::head(
pcgrr::color_palette$tier$values,
NROW(plot_data_per_signature))) +
ggplot2::theme_classic() +
Expand Down
7 changes: 7 additions & 0 deletions pcgrr/R/reference_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -426,13 +426,20 @@ load_reference_data <- function(
for(elem in c('tmb',
'mutational_signature',
'pathway',
'oncogenicity',
'hotspot',
'protein_domain')) {

fname_misc <- file.path(
pcgr_db_assembly_dir, "misc", "tsv", elem,
paste0(elem,".tsv.gz")
)
if(elem == "oncogenicity"){
fname_misc <- file.path(
pcgr_db_assembly_dir, "misc", "tsv", elem,
paste0(elem,".tsv")
)
}
check_file_exists(fname_misc)
pcgr_ref_data[['misc']][[elem]] <- as.data.frame(
readr::read_tsv(
Expand Down
3 changes: 2 additions & 1 deletion pcgrr/R/report.R
Original file line number Diff line number Diff line change
Expand Up @@ -726,7 +726,8 @@ load_yaml <- function(yaml_fname, report_mode = "CPSR") {
TRUE ~ PANEL_NAME
)) |>
dplyr::arrange(
dplyr::desc(PRIMARY_TARGET), SYMBOL)
dplyr::desc(.data$PRIMARY_TARGET),
.data$SYMBOL)


}
Expand Down
54 changes: 54 additions & 0 deletions pcgrr/R/variant_annotation.R
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,60 @@ append_gwas_citation_phenotype <-
}
}

#' Append human-readable oncogenicity documentation to variants
#'
#' Splits the pipe-separated criteria codes in `ONCOGENICITY_CODE`, looks up
#' each code's description and score in the oncogenicity reference table, and
#' collapses the result into one `ONCOGENICITY_DOC` string per `VAR_ID`
#' (entries formatted as `CODE: DESCRIPTION (<b>SCORE</b>)`, ordered by
#' descending score and separated by `", "`).
#'
#' @param var_df Data frame of sample variants from VCF. It is only
#'   processed when it contains both `VAR_ID` and `ONCOGENICITY_CODE`;
#'   otherwise it is returned unchanged.
#' @param ref_data PCGR/CPSR reference data object. The element
#'   `ref_data[["misc"]][["oncogenicity"]]` must provide the columns `CODE`,
#'   `SCORE` and `DESCRIPTION`; it is only consulted when at least one
#'   variant carries a non-missing code.
#'
#' @return `var_df` with a character column `ONCOGENICITY_DOC` added (or
#'   replaced). Variants without any code get `NA`.
#'
#' @export
append_oncogenicity_docs <- function(
    var_df,
    ref_data = NULL) {

  # Scalar condition: use %in% / all() rather than elementwise `&` on greps.
  if (!all(c("VAR_ID", "ONCOGENICITY_CODE") %in% names(var_df))) {
    return(var_df)
  }

  # Nothing to document (empty call set, or every code is missing).
  # rep() keeps this valid for zero-row data frames, where assigning a
  # length-one NA fails, and keeps the column type character in all cases.
  if (!any(!is.na(var_df[["ONCOGENICITY_CODE"]]))) {
    var_df[["ONCOGENICITY_DOC"]] <- rep(NA_character_, nrow(var_df))
    return(var_df)
  }

  criteria <- ref_data[["misc"]][["oncogenicity"]]
  if (is.null(criteria)) {
    stop(
      "Oncogenicity criteria not found in 'ref_data' ",
      "(expected ref_data[['misc']][['oncogenicity']])",
      call. = FALSE
    )
  }

  # One row per (variant, criterion code)
  code_per_variant <- dplyr::select(
    var_df, c("VAR_ID", "ONCOGENICITY_CODE")) |>
    dplyr::filter(!is.na(.data$ONCOGENICITY_CODE)) |>
    tidyr::separate_rows(
      "ONCOGENICITY_CODE", sep = "\\|") |>
    dplyr::distinct()

  # One documentation string per variant, highest-scoring criteria first
  doc_per_variant <- code_per_variant |>
    dplyr::left_join(
      dplyr::select(
        criteria,
        "CODE",
        "SCORE",
        "DESCRIPTION"),
      by = c("ONCOGENICITY_CODE" = "CODE")) |>
    dplyr::mutate(
      ONCOGENICITY_DOC = paste0(
        .data$ONCOGENICITY_CODE, ": ",
        .data$DESCRIPTION, " (<b>",
        .data$SCORE, "</b>)")) |>
    dplyr::arrange(
      .data$VAR_ID, dplyr::desc(.data$SCORE)) |>
    dplyr::group_by(.data$VAR_ID) |>
    dplyr::summarise(
      ONCOGENICITY_DOC = paste(
        unique(.data$ONCOGENICITY_DOC), collapse = ", "),
      .groups = "drop")

  # Drop a pre-existing column first so the join replaces it instead of
  # producing ONCOGENICITY_DOC.x / ONCOGENICITY_DOC.y duplicates.
  if ("ONCOGENICITY_DOC" %in% names(var_df)) {
    var_df[["ONCOGENICITY_DOC"]] <- NULL
  }

  dplyr::left_join(
    var_df, doc_per_variant,
    by = c("VAR_ID" = "VAR_ID"))
}

#' Function that adds TCGA annotations (cohort, frequency etc.) to variant identifiers
#'
Expand Down
15 changes: 14 additions & 1 deletion pcgrr/data-raw/data-raw.R
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,7 @@ data_coltype_defs[['snv_indel_somatic_raw']] <- readr::cols_only(
EXONIC_STATUS = readr::col_character(),
ALTERATION = readr::col_character(),
PROTEIN_CHANGE = readr::col_character(),
GRANTHAM_DISTANCE = readr::col_integer(),
HGVSp_short = readr::col_character(),
HGVSc = readr::col_character(),
HGVSc_RefSeq = readr::col_character(),
Expand All @@ -227,6 +228,8 @@ data_coltype_defs[['snv_indel_somatic_raw']] <- readr::cols_only(
ONCOGENICITY = readr::col_character(),
ONCOGENICITY_CODE = readr::col_character(),
ONCOGENICITY_SCORE = readr::col_integer(),
KNOWN_ONCOGENIC = readr::col_character(),
KNOWN_ONCOGENIC_SITE = readr::col_character(),
PFAM_DOMAIN = readr::col_character(),
PFAM_DOMAIN_NAME = readr::col_character(),
SYMBOL = readr::col_character(),
Expand Down Expand Up @@ -314,6 +317,7 @@ data_coltype_defs[['snv_indel_germline_raw']] <- readr::cols_only(
CODING_STATUS = readr::col_character(),
EXONIC_STATUS = readr::col_character(),
PROTEIN_CHANGE = readr::col_character(),
GRANTHAM_DISTANCE = readr::col_character(),
ALTERATION = readr::col_character(),
HGVSp_short = readr::col_character(),
HGVSc = readr::col_character(),
Expand Down Expand Up @@ -558,6 +562,7 @@ tsv_cols[['snv_indel']] <-
'ONCOGENICITY',
'ONCOGENICITY_CODE',
'ONCOGENICITY_SCORE',
#'ONCOGENICITY_DOC',
'HGVSc',
'HGVSc_RefSeq',
'HGVSp',
Expand Down Expand Up @@ -782,6 +787,7 @@ dt_display[['snv_indel_gene_actionable']] <-
'LOF_FILTER',
'ONCOGENICITY',
'ONCOGENICITY_CODE',
'ONCOGENICITY_DOC',
'ONCOGENICITY_SCORE',
'VEP_ALL_CSQ',
'DBSNP_RSID',
Expand Down Expand Up @@ -842,7 +848,7 @@ dt_display[['snv_indel_tier3']] <-
'PREDICTED_EFFECT',
'ONCOGENICITY_CODE',
'ONCOGENICITY_SCORE',
'VEP_ALL_CSQ',
'ONCOGENICITY_DOC',
'DBSNP_RSID',
'CLINVAR',
'CLINVAR_CLASSIFICATION',
Expand Down Expand Up @@ -884,6 +890,7 @@ dt_display[['tier4']] <-
'REGULATORY_ANNOTATION',
'ONCOGENICITY_CODE',
'ONCOGENICITY_SCORE',
'ONCOGENICITY_DOC',
'VEP_ALL_CSQ',
'DBSNP_RSID',
'ENSEMBL_GENE_ID',
Expand Down Expand Up @@ -1151,3 +1158,9 @@ rm(cancer_phenotypes_regex,
dt_display,
tsv_cols,
c)

# Store the oncogenicity criteria table (data-raw/oncogenicity.tsv) as the
# package dataset `oncogenicity_criteria`.
# NOTE(review): the relative path presumably assumes the script is run from
# the package root - confirm.
oncogenicity_criteria <- readr::read_tsv(
  "data-raw/oncogenicity.tsv", show_col_types = FALSE
)
usethis::use_data(oncogenicity_criteria, overwrite = TRUE)

15 changes: 15 additions & 0 deletions pcgrr/data-raw/oncogenicity.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
code score category pole description resource
ONCG_OVS1 8 funcvar P Null variant - predicted as LoF - in bona fide tumor suppressor gene VEP;CGC;CancerMine
ONCG_OS1 4 funcvar P Same amino acid change as previously established oncogenic variant - regardless of nucleotide change ClinVar
ONCG_OS3 4 funcvar P Located in a mutation hotspot with >= 50 samples with variant at AA position, >= 10 samples with same AA change cancerhotspots.org
ONCG_OM1 2 funcvar P Presumably critical site of functional domain CIViC
ONCG_OM2 2 funcvar P Protein length changes from in-frame dels/ins in known oncogene/tumor suppressor genes or stop-loss variants in a tumor suppressor gene VEP;CGC;CancerMine
ONCG_OM3 2 funcvar P Missense variant at an amino acid residue where a different missense variant determined to be oncogenic (using this standard) has been documented ClinVar
ONCG_OM4 2 funcvar P Located in a mutation hotspot with < 50 samples with variant at AA position, >= 10 samples with same AA change cancerhotspots.org
ONCG_OP1 1 funcvar P Multiple lines of computational evidence support a damaging variant effect on the gene or gene product dbNSFP
ONCG_OP3 1 funccomp P Located in a mutation hotspot with < 10 samples with the same amino acid change cancerhotspots.org
ONCG_OP4 1 clinpop P Absent from controls (gnomAD) / very low MAF (any five major gnomAD subpopulations) gnomAD
ONCG_SBVS1 -8 clinpop B Very high MAF (any five major gnomAD subpopulations) gnomAD
ONCG_SBS1 -4 clinpop B High MAF (any five major gnomAD subpopulations) gnomAD
ONCG_SBP1 -1 funccomp B Multiple lines of computational evidence support a benign variant effect on the gene or gene product dbNSFP
ONCG_SBP2 -1 funcvar B Silent and intronic changes outside of the consensus splice site VEP
Binary file modified pcgrr/data/data_coltype_defs.rda
Binary file not shown.
Binary file modified pcgrr/data/dt_display.rda
Binary file not shown.
Binary file added pcgrr/data/oncogenicity_criteria.rda
Binary file not shown.
2 changes: 1 addition & 1 deletion pcgrr/inst/templates/pcgr_quarto_report/documentation.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ for(i in 1:NROW(ref_datasets)){
if(!is.na(license_url)){
if(source == "cgc" | source == "gepa" | source == "dbnsfp"){
s <- paste0(" * [", source_full, "](", url, ") - ", description, " (<b>", version, "</b>)",
" - <b>[", license,"](", license_url, ")</b>")
" - [<b>", license,"</b>](", license_url, ")")
}else{
s <- paste0(" * [", source_full, "](", url, ") - ", description, " (<b>", version, "</b>)",
" - [", license,"](", license_url, ")")
Expand Down
Loading

0 comments on commit ff35ce3

Please sign in to comment.