Skip to content

Commit

Permalink
clean oncogenicity code
Browse files Browse the repository at this point in the history
  • Loading branch information
sigven committed Feb 17, 2025
1 parent 49ddb15 commit 5c81c03
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 123 deletions.
8 changes: 8 additions & 0 deletions pcgr/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,14 @@ def populate_config_data(conf_options: dict, refdata_assembly_dir: str, workflow
if not ',' in conf_data['conf']['gene_panel']['panel_id']:
conf_data['conf']['gene_panel']['url'] = str(conf_data['conf']['gene_panel']['panel_genes'][0]['panel_url'])
conf_data['conf']['gene_panel']['description_trait'] = str(conf_data['conf']['gene_panel']['panel_genes'][0]['panel_name'])
else:
names = set([str(x['panel_name']) for x in conf_data['conf']['gene_panel']['panel_genes']])
names2 = []
for n in names:
if not 'ACMG' in n and not 'CPIC' in n:
names2.append(n)
conf_data['conf']['gene_panel']['description'] = "Genomics England PanelApp - multiple panels (" + ', '.join(names2) + ")"




Expand Down
175 changes: 54 additions & 121 deletions pcgr/oncogenicity.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@
import gzip

from pcgr.annoutils import threeToOneAA
from pcgr import pcgr_vars

def assign_oncogenicity_evidence(rec = None, oncogenic_variants = None, tumortype = "Any"):
def assign_oncogenicity_evidence(rec = None, oncogenicity_criteria = None, tumortype = "Any"):

clingen_vicc_ev_codes = [
"ONCG_SBVS1",
Expand Down Expand Up @@ -177,15 +178,12 @@ def assign_oncogenicity_evidence(rec = None, oncogenic_variants = None, tumortyp
if not variant_data['KNOWN_ONCOGENIC'] is None:
variant_data['ONCG_OS1'] = True

dbnsfp_minimum_majority = 8
dbnsfp_maximum_minority = 2
dbnsfp_minimum_algos_called = dbnsfp_minimum_majority

variant_data['N_INSILICO_CALLED'] = 0
variant_data['N_INSILICO_DAMAGING'] = 0
variant_data['N_INSILICO_TOLERATED'] = 0
variant_data['N_INSILICO_SPLICING_NEUTRAL'] = 0
variant_data['N_INSILICO_SPLICING_AFFECTED'] = 0
for elem in ['N_INSILICO_CALLED',
'N_INSILICO_DAMAGING',
'N_INSILICO_TOLERATED',
'N_INSILICO_SPLICING_NEUTRAL',
'N_INSILICO_SPLICING_AFFECTED']:
variant_data[elem] = 0

for algo in ['SIFT',
'PROVEAN',
Expand Down Expand Up @@ -221,16 +219,18 @@ def assign_oncogenicity_evidence(rec = None, oncogenic_variants = None, tumortyp
variant_data[col] == 'AS':
variant_data['N_INSILICO_SPLICING_AFFECTED'] += 1

if (variant_data['N_INSILICO_CALLED'] > dbnsfp_minimum_algos_called and \
variant_data['N_INSILICO_DAMAGING'] >= dbnsfp_minimum_majority and \
variant_data['N_INSILICO_TOLERATED'] <= dbnsfp_maximum_minority and \
## majority of insilico predictors predict damaging effect
if (variant_data['N_INSILICO_CALLED'] > pcgr_vars.ONCOGENICITY['insilico_pred_min_majority'] and \
variant_data['N_INSILICO_DAMAGING'] >= pcgr_vars.ONCOGENICITY['insilico_pred_min_majority'] and \
variant_data['N_INSILICO_TOLERATED'] <= pcgr_vars.ONCOGENICITY['insilico_pred_max_minority'] and \
variant_data['N_INSILICO_SPLICING_NEUTRAL'] <= 1) or \
variant_data['N_INSILICO_SPLICING_AFFECTED'] == 2:
variant_data['ONCG_OP1'] = True

if variant_data['N_INSILICO_CALLED'] > dbnsfp_minimum_algos_called and \
variant_data['N_INSILICO_TOLERATED'] >= dbnsfp_minimum_majority and \
variant_data['N_INSILICO_DAMAGING'] <= dbnsfp_maximum_minority and \
## majority of insilico predictors predict tolerated effect
if variant_data['N_INSILICO_CALLED'] > pcgr_vars.ONCOGENICITY['insilico_pred_min_majority'] and \
variant_data['N_INSILICO_TOLERATED'] >= pcgr_vars.ONCOGENICITY['insilico_pred_min_majority'] and \
variant_data['N_INSILICO_DAMAGING'] <= pcgr_vars.ONCOGENICITY['insilico_pred_max_minority'] and \
variant_data['N_INSILICO_SPLICING_AFFECTED'] == 0:
variant_data['ONCG_SBP1'] = True

Expand Down Expand Up @@ -298,7 +298,7 @@ def assign_oncogenicity_evidence(rec = None, oncogenic_variants = None, tumortyp
'by_hgvsp_principal' in eitem or \
'by_codon_principal' in eitem or \
'by_aa_region_principal' in eitem):
## only applicable if OS3 is not set
## only applicable if OS3 and OS1 is not set
if variant_data['ONCG_OS3'] is False and variant_data['ONCG_OS1'] is False:
variant_data['ONCG_OM1'] = True

Expand All @@ -309,42 +309,38 @@ def assign_oncogenicity_evidence(rec = None, oncogenic_variants = None, tumortyp
'Somatic' in eitem and \
('by_genomic_coord' in eitem or \
'by_hgvsc_principal' in eitem):
## only applicable if OS3 is not set
## only applicable if OS3 and OS1 is not set
if variant_data['ONCG_OS3'] is False and variant_data['ONCG_OS1'] is False:
variant_data['ONCG_OM1'] = True

if "gnomADe_EAS_AF" in variant_data.keys() and \
"gnomADe_SAS_AF" in variant_data.keys() and \
"gnomADe_AMR_AF" in variant_data.keys() and \
"gnomADe_AFR_AF" in variant_data.keys() and \
"gnomADe_NFE_AF" in variant_data.keys():
if all(pop in variant_data.keys() for pop in pcgr_vars.GNOMAD_MAIN_EXON_AF_TAGS):

## check if variant has MAF > 0.01 (SBVS1) or > 0.05 in any of five major gnomAD subpopulations (exome set)
for pop in ['gnomADe_SAS_AF','gnomADe_EAS_AF','gnomADe_AMR_AF','gnomADe_AFR_AF','gnomADe_NFE_AF']:
for pop in pcgr_vars.GNOMAD_MAIN_EXON_AF_TAGS:
if not variant_data[pop] is None:
## MAF for this population >= 0.05
if float(variant_data[pop]) >= 0.05:
if float(variant_data[pop]) >= pcgr_vars.ONCOGENICITY['gnomAD_very_common_AF']:
variant_data["ONCG_SBVS1"] = True
for pop in ['gnomADe_SAS_AF','gnomADe_EAS_AF','gnomADe_AMR_AF','gnomADe_AFR_AF','gnomADe_NFE_AF']:
for pop in pcgr_vars.GNOMAD_MAIN_EXON_AF_TAGS:
if not variant_data[pop] is None:
## MAF for this population >= 0.01 (< 0.05)
if float(variant_data[pop]) >= 0.01 and variant_data["ONCG_SBVS1"] is False:
if float(variant_data[pop]) >= pcgr_vars.ONCOGENICITY['gnomAD_common_AF'] and variant_data["ONCG_SBVS1"] is False:
variant_data["ONCG_SBS1"] = True

#missing_pop_freq = 0
approx_zero_pop_freq = 0
for pop in ['gnomADe_SAS_AF','gnomADe_EAS_AF','gnomADe_AMR_AF','gnomADe_AFR_AF','gnomADe_NFE_AF']:
for pop in pcgr_vars.GNOMAD_MAIN_EXON_AF_TAGS:
## no MAF recorded in gnomAD for this population
if variant_data[pop] is None:
approx_zero_pop_freq = approx_zero_pop_freq + 1
else:
## Very low MAF for this population
if float(variant_data[pop]) < 0.0001:
## Extremely low MAF for this population
if float(variant_data[pop]) < pcgr_vars.ONCOGENICITY['gnomAD_extremely_rare_AF']:
approx_zero_pop_freq = approx_zero_pop_freq + 1

## check if variant is missing or with MAF approximately zero in all five major gnomAD subpopulations (exome set)
if approx_zero_pop_freq == 5:
variant_data["ONCG_OP4"] = True

else:
print("ERROR: Missing gnomAD AF tags in VCF INFO field")


## check if variant is a loss-of-function variant in a tumor suppressor gene (Cancer Gene Census/CancerMine)
Expand Down Expand Up @@ -385,77 +381,6 @@ def assign_oncogenicity_evidence(rec = None, oncogenic_variants = None, tumortyp
variant_data['ONCG_OM1'] is False:
variant_data['ONCG_OM3'] = True

og_score_data = {}
og_score_data['code'] = \
['ONCG_SBVS1',
'ONCG_SBS1',
'ONCG_OP4',
'ONCG_SBP1',
'ONCG_SBP2',
'ONCG_OVS1',
'ONCG_OS1',
'ONCG_OS3',
'ONCG_OM1',
'ONCG_OM2',
'ONCG_OM3',
'ONCG_OM4',
'ONCG_OP1',
'ONCG_OP3'
]
og_score_data['category'] = \
['clinpop',
'clinpop',
'clinpop',
'funccomp',
'funcvar',
'funcvar',
'funcvar',
'funcvar',
'funcvar',
'funcvar',
'funcvar',
'funcvar',
'funcvar',
'funccomp']
og_score_data['pole'] = \
['B','B',
'P','B',
'B','P',
'P','P',
'P','P',
'P','P',
'P','P']

og_score_data['description'] = \
['Very high MAF (> 0.05 in gnomAD - any five major continental pops)',
'High MAF (> 0.01 in gnomAD - any five major continental pops)',
'Absent from controls (gnomAD) / very low MAF ( < 0.0001 in all five major subpopulations)',
'Multiple lines (>=8) of computational evidence support a benign variant effect on the gene or gene product - from dbNSFP',
'Silent and intronic changes outside of the consensus splice site',
'Null variant - predicted as LoF - in bona fide tumor suppressor gene',
'Same amino acid change as previously established oncogenic variant - regardless of nucleotide change (ClinVar oncogenicity records)',
'Located in a mutation hotspot (cancerhotspots.org). >= 50 samples with a variant at AA position, >= 10 samples with same AA change',
'Presumably critical site of functional domain - based on indirect evidence from overlap with predictive biomarkers',
'Protein length changes from in-frame dels/ins in known oncogene/tumor suppressor genes or stop-loss variants in a tumor suppressor gene',
'Missense variant at an amino acid residue where a different missense variant determined to be oncogenic (using this standard) has been documented. ',
'Located in a mutation hotspot (cancerhotspots.org). < 50 samples with a variant at AA position, >= 10 samples with same AA change.',
'Multiple lines (>=8) of computational evidence support of a damaging variant effect on the gene or gene product - from dbNSFP',
'Located in a mutation hotspot (cancerhotspots.org). < 10 samples with the same amino acid change.'
]

og_score_data['score'] = \
[-8, -4, 1, -1, -1, 8, 4, 4, 2, 2, 2, 2, 1, 1]

i = 0
oncogenicity_scores = {}
for code in og_score_data['code']:
oncogenicity_scores[code] = {}
for e in ['category','pole','description','score']:
oncogenicity_scores[code][e] = og_score_data[e][i]
if e == 'score':
oncogenicity_scores[code][e] = float(og_score_data[e][i])
i = i + 1

variant_data['ONCOGENICITY'] = "VUS"
variant_data["ONCOGENICITY_DOC"] = "."
variant_data["ONCOGENICITY_CODE"] = "."
Expand All @@ -466,27 +391,18 @@ def assign_oncogenicity_evidence(rec = None, oncogenic_variants = None, tumortyp
for code in clingen_vicc_ev_codes:

if code in variant_data.keys():
if variant_data[code] is True:
score = oncogenicity_scores[code]['score']
pole = oncogenicity_scores[code]['pole']
if variant_data[code] is True and code in oncogenicity_criteria.keys():
score = float(oncogenicity_criteria[code]['score'])
pole = oncogenicity_criteria[code]['pole']
if pole == "P":
onc_score_pathogenic = onc_score_pathogenic + score
if pole == "B":
else:
onc_score_benign = onc_score_benign + score

for e in ['ONCOGENICITY_DOC',
'ONCOGENICITY_CODE']:

if variant_data[e] == ".":
if e == 'ONCOGENICITY_DOC':
variant_data[e] = oncogenicity_scores[code]['description']
else:
variant_data[e] = code
else:
if e == 'ONCOGENICITY_DOC':
variant_data[e] = f'{variant_data[e]}|{oncogenicity_scores[code]["description"]}'
else:
variant_data[e] = f'{variant_data[e]}|{code}'
if variant_data['ONCOGENICITY_CODE'] == ".":
variant_data['ONCOGENICITY_CODE'] = code
else:
variant_data['ONCOGENICITY_CODE'] = f'{variant_data["ONCOGENICITY_CODE"]}|{code}'

likely_benign_upper_limit = -1
likely_benign_lower_limit = -6
Expand Down Expand Up @@ -514,6 +430,23 @@ def assign_oncogenicity_evidence(rec = None, oncogenic_variants = None, tumortyp

return(rec)

def load_oncogenicity_criteria(oncogenic_criteria_fname: str, logger: Logger):
"""
Load oncogenic criteria from a file and create a dictionary of criteria.
"""

oncogenic_criteria = {}
if not os.path.exists(oncogenic_criteria_fname):
logger.info(f"ERROR: File '{oncogenic_criteria_fname}' does not exist - exiting")
exit(1)

with open(oncogenic_criteria_fname, mode='rt') as f:
reader = csv.DictReader(f, delimiter='\t')
for row in reader:
oncogenic_criteria[row['code']] = row

return oncogenic_criteria

def load_oncogenic_variants(oncogenic_variants_fname: str, logger: Logger):
"""
Load oncogenic variants from a file and create a dictionary of variants.
Expand Down
10 changes: 10 additions & 0 deletions pcgr/pcgr_vars.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
SAMPLE_ID_MAX_LENGTH = 40
SAMPLE_ID_MIN_LENGTH = 3

GNOMAD_MAIN_EXON_AF_TAGS = ['gnomADe_SAS_AF','gnomADe_NFE_AF','gnomADe_AFR_AF','gnomADe_AMR_AF','gnomADe_EAS_AF']

## Classified germline variant input (from CPSR) - required columns
germline_input_required_cols = [
'SAMPLE_ID',
Expand Down Expand Up @@ -291,4 +293,12 @@
'alphamissense': 'DBNSFP_ALPHA_MISSENSE',
'mutformer': 'DBNSFP_MUTFORMER',
'phactboost': 'DBNSFP_PHACTBOOST'
}

ONCOGENICITY = {
'gnomAD_extremely_rare_AF': 0.00001,
'gnomAD_common_AF': 0.01,
'gnomAD_very_common_AF': 0.05,
'insilico_pred_min_majority': 8,
'insilico_pred_max_minority': 2,
}
6 changes: 4 additions & 2 deletions scripts/pcgr_summarise.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from pcgr.annoutils import read_infotag_file, make_transcript_xref_map, read_genexref_namemap, map_regulatory_variant_annotations, write_pass_vcf, load_grantham
from pcgr.vep import parse_vep_csq
from pcgr.dbnsfp import vep_dbnsfp_meta_vcf, map_variant_effect_predictors
from pcgr.oncogenicity import assign_oncogenicity_evidence, load_oncogenic_variants, match_oncogenic_variants
from pcgr.oncogenicity import load_oncogenicity_criteria, assign_oncogenicity_evidence, load_oncogenic_variants, match_oncogenic_variants
from pcgr.mutation_hotspot import load_mutation_hotspots, match_csq_mutation_hotspot
from pcgr.biomarker import load_biomarkers, match_csq_biomarker
from pcgr.utils import error_message, check_subprocess, getlogger
Expand Down Expand Up @@ -98,6 +98,8 @@ def extend_vcf_annotations(arg_dict, logger):
os.path.join(arg_dict['refdata_assembly_dir'], 'gene','tsv','gene_transcript_xref', 'gene_transcript_xref_bedmap.tsv.gz'), logger)
cancer_hotspots = load_mutation_hotspots(
os.path.join(arg_dict['refdata_assembly_dir'], 'misc','tsv','hotspot', 'hotspot.tsv.gz'), logger)
oncogenicity_criteria = load_oncogenicity_criteria(
os.path.join(arg_dict['refdata_assembly_dir'], 'misc','tsv', 'oncogenicity', 'oncogenicity.tsv'), logger)
oncogenic_variants = load_oncogenic_variants(os.path.join(arg_dict['refdata_assembly_dir'], 'variant','tsv', 'clinvar','clinvar_oncogenic.tsv.gz'), logger)
grantham_scores = load_grantham(os.path.join(arg_dict['refdata_assembly_dir'], 'misc','tsv', 'grantham', 'grantham.tsv'), logger)

Expand Down Expand Up @@ -252,7 +254,7 @@ def extend_vcf_annotations(arg_dict, logger):

if arg_dict['oncogenicity_annotation'] == 1:
match_oncogenic_variants(vep_csq_record_results['all_csq'], oncogenic_variants, rec, principal_csq_properties)
assign_oncogenicity_evidence(rec, oncogenic_variants, tumortype = arg_dict['tumortype'])
assign_oncogenicity_evidence(rec, oncogenicity_criteria,tumortype = arg_dict['tumortype'])

if "GENE_TRANSCRIPT_XREF" in vcf_info_element_types:
gene_xref_tag = rec.INFO.get('GENE_TRANSCRIPT_XREF')
Expand Down

0 comments on commit 5c81c03

Please sign in to comment.