@@ -1,35 +1,5 @@
#!/usr/bin/python3

import argparse
import sys
import json
import os.path
import re
import subprocess
import tempfile

S3DIR = 's3://dig-analysis-data/out/varianteffect'


def colocated_variant(row, ref, alt):
    """
    Find the first colocated variant with a matching allele string or None.
    """
    co = row.get('colocated_variants', [])

    # if there is only a single co-located variant, return it
    if len(co) == 1:
        return co[0]

    # only keep colocated variants where the minor allele is ref or alt
    variants = [v for v in co if v.get('minor_allele') in [ref, alt]]

    # fail if no colocated variants
    if not variants:
        return None

    # find the first with an rsID or default to the first variant
    return next((v for v in variants if v.get('id', '').startswith('rs')), variants[0])
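
# --- Example (hypothetical row, not from the source data): both colocated
# --- variants match ref/alt, but the one carrying an rsID is preferred.
example_row = {
    'colocated_variants': [
        {'id': 'TMP_1', 'minor_allele': 'G'},
        {'id': 'rs699', 'minor_allele': 'A'},
    ],
}

assert colocated_variant(example_row, ref='A', alt='G')['id'] == 'rs699'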


def dbSNP(v):
    """
@@ -40,46 +10,6 @@ def dbSNP(v):
    # only return if an actual rsID
    return rsid if rsid and rsid.startswith('rs') else None


def allele_frequencies(v, ref, alt):
    """
    Extract allele frequencies for a colocated variant.
    """
    af = v.get('frequencies', {})

    # find the matching allele in the frequency map (prefer alt)
    allele = next((a for a in [alt, ref] if a in af), None)

    # no allele frequency data?
    if not allele:
        return {
            'EU': None,
            'HS': None,
            'AA': None,
            'EA': None,
            'SA': None,
        }

    # look up the frequency data
    def get_freq(*cols):
        freqs = [af[allele].get(c) for c in cols]

        # find the first non-null/zero frequency from the columns
        freq = next((f for f in freqs if f), None)

        # flip the frequency if this is the reference allele (missing data stays None)
        return 1.0 - freq if allele == ref and freq is not None else freq

    # try gnomAD first, fall back to 1000 Genomes (1kg)
    return {
        'EU': get_freq('gnomad_nfe', 'eur'),
        'HS': get_freq('gnomad_amr', 'amr'),
        'AA': get_freq('gnomad_afr', 'afr'),
        'EA': get_freq('gnomad_eas', 'eas'),
        'SA': get_freq('gnomad_sas', 'sas'),
    }
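
# --- Example (made-up frequencies, not real gnomAD values): only the ref
# --- allele 'A' is in the map, so EU is flipped to 1.0 - 0.25, and
# --- populations without any data stay None.
example_v = {'frequencies': {'A': {'gnomad_nfe': 0.25, 'eur': 0.24}}}

example_af = allele_frequencies(example_v, ref='A', alt='G')
assert example_af['EU'] == 0.75
assert example_af['HS'] is None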


def common_fields(row):
    """
    Extracts data from the colocated variant fields.
@@ -117,64 +47,73 @@ def common_fields(row):
        'af': allele_frequencies(variant, ref, alt),
    }


def process_part(srcdir, outdir, part):
def allele_frequencies(v, ref, alt):
    """
    Download and process a part file.
    Extract allele frequencies for a colocated variant.
    """
    _, tmp = tempfile.mkstemp()

    # copy the file into a temp file
    subprocess.check_call(['aws', 's3', 'cp', f'{srcdir}/{part}', tmp])

    # loop over every line, parse, and create common row
    with open(tmp) as fp:
        with open(part, mode='w', encoding='utf-8') as out:
            for line in fp:
                row = json.loads(line)
                common = common_fields(row)
    af = v.get('frequencies', {})

                # write the common record to the output
                if common:
                    json.dump(
                        common,
                        out,
                        separators=(',', ':'),
                        check_circular=False,
                    )
    # find the matching allele in the frequency map (prefer alt)
    allele = next((a for a in [alt, ref] if a in af), None)

                    # output a newline after the json
                    print(file=out)
    # no allele frequency data?
    if not allele:
        return {
            'EU': None,
            'HS': None,
            'AA': None,
            'EA': None,
            'SA': None,
        }

    # copy the output file to S3
    subprocess.check_call(['aws', 's3', 'cp', part, f'{outdir}/{part}'])
    # look up the frequency data
    def get_freq(*cols):
        freqs = [af[allele].get(c) for c in cols]

    # cleanup
    os.remove(tmp)
    os.remove(part)
        # find the first non-null/zero frequency from the columns
        freq = next((f for f in freqs if f), None)

    # debug output
    print(f'Processed {part} successfully')
        # flip the frequency if this is the reference allele (missing data stays None)
        return 1.0 - freq if allele == ref and freq is not None else freq

    # try gnomAD first, fall back to 1000 Genomes (1kg)
    return {
        'EU': get_freq('gnomad_nfe', 'eur'),
        'HS': get_freq('gnomad_amr', 'amr'),
        'AA': get_freq('gnomad_afr', 'afr'),
        'EA': get_freq('gnomad_eas', 'eas'),
        'SA': get_freq('gnomad_sas', 'sas'),
    }

def main():
def colocated_variant(row, ref, alt):
    """
    Arguments: part-file
    Find the first colocated variant with a matching allele string or None.
    """
    opts = argparse.ArgumentParser()
    opts.add_argument('part')
    co = row.get('colocated_variants', [])

    # parse cli
    args = opts.parse_args()
    # if there is only a single co-located variant, return it
    if len(co) == 1:
        return co[0]

    # only keep colocated variants where the minor allele is ref or alt
    variants = [v for v in co if v.get('minor_allele') in [ref, alt]]

    # s3 locations
    srcdir = f'{S3DIR}/effects'
    outdir = f'{S3DIR}/common'
    # fail if no colocated variants
    if not variants:
        return None

    # find the first with an rsID or default to the first variant
    return next((v for v in variants if v.get('id', '').startswith('rs')), variants[0])

    # run
    process_part(srcdir, outdir, args.part)

for line in sys.stdin:
    row = json.loads(line)
    common = common_fields(row)
    json.dump(
        common,
        sys.stdout,
        separators=(',', ':'),
        check_circular=False,
    )
    sys.stdout.write('\n')

# entry point
if __name__ == '__main__':
    main()
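Because the refactored script now reads JSON lines from stdin and writes to stdout, it can be exercised locally without S3. A minimal sketch of driving it (the script path and row shape here are assumptions, not taken from this PR):

import json
import subprocess

# one hypothetical VEP row piped through the new stdin/stdout filter
row = {'id': '1:12345:A:G', 'colocated_variants': [{'id': 'rs699', 'minor_allele': 'A'}]}

proc = subprocess.run(
    ['python3', 'common.py'],
    input=json.dumps(row) + '\n',
    capture_output=True,
    text=True,
    check=True,
)
print(proc.stdout)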
vep/src/main/resources/cqs.py (102 changes: 0 additions & 102 deletions)

This file was deleted.

vep/src/main/resources/cqsStdin.py (58 changes: 58 additions & 0 deletions)
@@ -0,0 +1,58 @@
#!/usr/bin/python3

import json
import re
import sys

S3DIR = 's3://dig-analysis-data/out/varianteffect'


def rename_cqs_field(s):
    """
    Lifted from BioIndex code to camelCase the names of consequences.
    """
    s = re.sub(r'(?:[^a-z0-9]+)(.)', lambda m: m.group(1).upper(), s, flags=re.IGNORECASE)
    s = re.sub(r'^[A-Z][a-z]+', lambda m: m.group(0).lower(), s)

    return s
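
# --- Example: VEP's snake_case keys come out camelCased.
assert rename_cqs_field('amino_acids') == 'aminoAcids'
assert rename_cqs_field('transcript_id') == 'transcriptId'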


def exploded_consequences(row):
    """
    Extracts transcript consequences from the row.
    """
    for cqs in row.get('transcript_consequences', []):
        record = dict()

        # include the variant ID and locus with each consequence
        record['varId'] = row['id']
        record['chromosome'] = row['seq_region_name']
        record['position'] = int(row['start'])

        # append all the consequence fields; ignore some
        for k, v in cqs.items():
            if k not in ['domains']:
                record[rename_cqs_field(k)] = v

        # add the record to the returned values
        yield record
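
# --- Example (hypothetical row, field values invented): each consequence is
# --- flattened into its own record, keys are camelCased, 'domains' is dropped.
example_row = {
    'id': '1:12345:A:G',
    'seq_region_name': '1',
    'start': '12345',
    'transcript_consequences': [
        {'gene_id': 'ENSG00000123456', 'impact': 'MODIFIER', 'domains': ['PF00001']},
    ],
}

example_records = list(exploded_consequences(example_row))
assert example_records[0]['geneId'] == 'ENSG00000123456'
assert 'domains' not in example_records[0]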


for line in sys.stdin:
    row = json.loads(line)

    # write the consequence records to the output
    for cqs in exploded_consequences(row):
        json.dump(
            cqs,
            sys.stdout,
            separators=(',', ':'),
            check_circular=False,
        )

        # output a newline after the json
        sys.stdout.write('\n')



