@@ -1,35 +1,5 @@
#!/usr/bin/python3

import argparse
import sys
import json
import os.path
import re
import subprocess
import tempfile

S3DIR = 's3://dig-analysis-data/out/varianteffect'


def colocated_variant(row, ref, alt):
    """
    Find the first colocated variant with a matching allele string or None.
    """
    co = row.get('colocated_variants', [])

    # if there is only a single co-located variant, return it
    if len(co) == 1:
        return co[0]

    # only keep colocated variants where the minor allele is ref or alt
    variants = [v for v in co if v.get('minor_allele') in [ref, alt]]

    # fail if no colocated variants
    if not variants:
        return None

    # find the first with an rsID or default to the first variant
    return next((v for v in variants if v.get('id', '').startswith('rs')), variants[0])
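
# --- Example (hypothetical row, not from the source data): both colocated
# --- variants match ref/alt, but the one carrying an rsID is preferred.
example_row = {
    'colocated_variants': [
        {'id': 'TMP_1', 'minor_allele': 'G'},
        {'id': 'rs699', 'minor_allele': 'A'},
    ],
}

assert colocated_variant(example_row, ref='A', alt='G')['id'] == 'rs699'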


def dbSNP(v):
    """
@@ -40,46 +10,6 @@ def dbSNP(v):
    # only return if an actual rsID
    return rsid if rsid and rsid.startswith('rs') else None


def allele_frequencies(v, ref, alt):
    """
    Extract allele frequencies for a colocated variant.
    """
    af = v.get('frequencies', {})

    # find the matching allele in the frequency map (prefer alt)
    allele = next((a for a in [alt, ref] if a in af), None)

    # no allele frequency data?
    if not allele:
        return {
            'EU': None,
            'HS': None,
            'AA': None,
            'EA': None,
            'SA': None,
        }

    # look up the frequency data
    def get_freq(*cols):
        freqs = [af[allele].get(c) for c in cols]

        # find the first non-null/zero frequency from the columns
        freq = next((f for f in freqs if f), None)

        # flip the frequency if this is the reference allele (missing data stays None)
        return 1.0 - freq if allele == ref and freq is not None else freq

    # try gnomAD first, fall back to 1000 Genomes (1kg)
    return {
        'EU': get_freq('gnomad_nfe', 'eur'),
        'HS': get_freq('gnomad_amr', 'amr'),
        'AA': get_freq('gnomad_afr', 'afr'),
        'EA': get_freq('gnomad_eas', 'eas'),
        'SA': get_freq('gnomad_sas', 'sas'),
    }
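
# --- Example (made-up frequencies, not real gnomAD values): only the ref
# --- allele 'A' is in the map, so EU is flipped to 1.0 - 0.25, and
# --- populations without any data stay None.
example_v = {'frequencies': {'A': {'gnomad_nfe': 0.25, 'eur': 0.24}}}

example_af = allele_frequencies(example_v, ref='A', alt='G')
assert example_af['EU'] == 0.75
assert example_af['HS'] is None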


def common_fields(row):
    """
    Extracts data from the colocated variant fields.
@@ -117,64 +47,73 @@ def common_fields(row):
        'af': allele_frequencies(variant, ref, alt),
    }


def process_part(srcdir, outdir, part):
def allele_frequencies(v, ref, alt):
    """
    Download and process a part file.
    Extract allele frequencies for a colocated variant.
    """
    _, tmp = tempfile.mkstemp()

    # copy the file into a temp file
    subprocess.check_call(['aws', 's3', 'cp', f'{srcdir}/{part}', tmp])

    # loop over every line, parse, and create common row
    with open(tmp) as fp:
        with open(part, mode='w', encoding='utf-8') as out:
            for line in fp:
                row = json.loads(line)
                common = common_fields(row)
    af = v.get('frequencies', {})

                # write the common record to the output
                if common:
                    json.dump(
                        common,
                        out,
                        separators=(',', ':'),
                        check_circular=False,
                    )
    # find the matching allele in the frequency map (prefer alt)
    allele = next((a for a in [alt, ref] if a in af), None)

                    # output a newline after the json
                    print(file=out)
    # no allele frequency data?
    if not allele:
        return {
            'EU': None,
            'HS': None,
            'AA': None,
            'EA': None,
            'SA': None,
        }

    # copy the output file to S3
    subprocess.check_call(['aws', 's3', 'cp', part, f'{outdir}/{part}'])
    # look up the frequency data
    def get_freq(*cols):
        freqs = [af[allele].get(c) for c in cols]

    # cleanup
    os.remove(tmp)
    os.remove(part)
        # find the first non-null/zero frequency from the columns
        freq = next((f for f in freqs if f), None)

    # debug output
    print(f'Processed {part} successfully')
        # flip the frequency if this is the reference allele (missing data stays None)
        return 1.0 - freq if allele == ref and freq is not None else freq

    # try gnomAD first, fall back to 1000 Genomes (1kg)
    return {
        'EU': get_freq('gnomad_nfe', 'eur'),
        'HS': get_freq('gnomad_amr', 'amr'),
        'AA': get_freq('gnomad_afr', 'afr'),
        'EA': get_freq('gnomad_eas', 'eas'),
        'SA': get_freq('gnomad_sas', 'sas'),
    }

def main():
def colocated_variant(row, ref, alt):
    """
    Arguments: part-file
    Find the first colocated variant with a matching allele string or None.
    """
    opts = argparse.ArgumentParser()
    opts.add_argument('part')
    co = row.get('colocated_variants', [])

    # parse cli
    args = opts.parse_args()
    # if there is only a single co-located variant, return it
    if len(co) == 1:
        return co[0]

    # only keep colocated variants where the minor allele is ref or alt
    variants = [v for v in co if v.get('minor_allele') in [ref, alt]]

    # s3 locations
    srcdir = f'{S3DIR}/effects'
    outdir = f'{S3DIR}/common'
    # fail if no colocated variants
    if not variants:
        return None

    # find the first with an rsID or default to the first variant
    return next((v for v in variants if v.get('id', '').startswith('rs')), variants[0])

    # run
    process_part(srcdir, outdir, args.part)

for line in sys.stdin:
    row = json.loads(line)
    common = common_fields(row)
    json.dump(
        common,
        sys.stdout,
        separators=(',', ':'),
        check_circular=False,
    )
    sys.stdout.write('\n')

# entry point
if __name__ == '__main__':
    main()
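Because the refactored script now reads JSON lines from stdin and writes to stdout, it can be exercised locally without S3. A minimal sketch of driving it (the script path and row shape here are assumptions, not taken from this PR):

import json
import subprocess

# one hypothetical VEP row piped through the new stdin/stdout filter
row = {'id': '1:12345:A:G', 'colocated_variants': [{'id': 'rs699', 'minor_allele': 'A'}]}

proc = subprocess.run(
    ['python3', 'common.py'],
    input=json.dumps(row) + '\n',
    capture_output=True,
    text=True,
    check=True,
)
print(proc.stdout)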
vep/src/main/resources/cqs.py (102 changes: 0 additions & 102 deletions)

This file was deleted.

vep/src/main/resources/cqsStdin.py (58 changes: 58 additions & 0 deletions)
@@ -0,0 +1,58 @@
#!/usr/bin/python3

import json
import re
import sys

S3DIR = 's3://dig-analysis-data/out/varianteffect'


def rename_cqs_field(s):
    """
    Lifted from BioIndex code to camelCase the names of consequences.
    """
    s = re.sub(r'(?:[^a-z0-9]+)(.)', lambda m: m.group(1).upper(), s, flags=re.IGNORECASE)
    s = re.sub(r'^[A-Z][a-z]+', lambda m: m.group(0).lower(), s)

    return s
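
# --- Example: VEP's snake_case keys come out camelCased.
assert rename_cqs_field('amino_acids') == 'aminoAcids'
assert rename_cqs_field('transcript_id') == 'transcriptId'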


def exploded_consequences(row):
    """
    Extracts transcript consequences from the row.
    """
    for cqs in row.get('transcript_consequences', []):
        record = dict()

        # include the variant ID and locus with each consequence
        record['varId'] = row['id']
        record['chromosome'] = row['seq_region_name']
        record['position'] = int(row['start'])

        # append all the consequence fields; ignore some
        for k, v in cqs.items():
            if k not in ['domains']:
                record[rename_cqs_field(k)] = v

        # add the record to the returned values
        yield record
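
# --- Example (hypothetical row, field values invented): each consequence is
# --- flattened into its own record, keys are camelCased, 'domains' is dropped.
example_row = {
    'id': '1:12345:A:G',
    'seq_region_name': '1',
    'start': '12345',
    'transcript_consequences': [
        {'gene_id': 'ENSG00000123456', 'impact': 'MODIFIER', 'domains': ['PF00001']},
    ],
}

example_records = list(exploded_consequences(example_row))
assert example_records[0]['geneId'] == 'ENSG00000123456'
assert 'domains' not in example_records[0]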


for line in sys.stdin:
    row = json.loads(line)

    # write the consequence records to the output
    for cqs in exploded_consequences(row):
        json.dump(
            cqs,
            sys.stdout,
            separators=(',', ':'),
            check_circular=False,
        )

        # output a newline after the json
        sys.stdout.write('\n')



