|
| 1 | +#!/usr/bin/env python |
| 2 | + |
| 3 | +import os |
| 4 | +import sys |
| 5 | +import pickle |
| 6 | + |
| 7 | +import numpy as np |
| 8 | + |
| 9 | +import logging |
| 10 | +import optparse |
| 11 | + |
| 12 | +from hagfish_file_util import * |
| 13 | + |
| 14 | +## Arguments: General options |
| 15 | +parser = optparse.OptionParser() |
| 16 | +parser.set_defaults(binSize=500) |
| 17 | +parser.add_option('-v', dest='verbose', action="count", |
| 18 | + help='Show debug information') |
| 19 | +parser.add_option('-b', dest='binSize', type='int', |
| 20 | + help='binSize') |
| 21 | +parser.add_option('-s', dest='seqId', |
| 22 | + help='seqId to use in the output..') |
| 23 | + |
| 24 | +options, args = parser.parse_args() |
| 25 | + |
| 26 | +l = logging.getLogger('hagfish') |
| 27 | +handler = logging.StreamHandler() |
| 28 | +logmark = chr(27) + '[0;37;44mHAGFISH' + \ |
| 29 | + chr(27) + '[0m ' |
| 30 | + |
| 31 | +formatter = logging.Formatter( |
| 32 | + logmark + '%(levelname)-6s %(message)s') |
| 33 | + |
| 34 | +handler.setFormatter(formatter) |
| 35 | +l.addHandler(handler) |
| 36 | + |
| 37 | +if options.verbose >= 2: |
| 38 | + l.setLevel(logging.DEBUG) |
| 39 | +elif options.verbose == 1: |
| 40 | + l.setLevel(logging.INFO) |
| 41 | +else: |
| 42 | + l.setLevel(logging.WARNING) |
| 43 | + |
| 44 | + |
| 45 | +def generate_histogram(F, |
| 46 | + cptype, |
| 47 | + category, |
| 48 | + seqId, |
| 49 | + group, |
| 50 | + data, |
| 51 | + bin): |
| 52 | + |
| 53 | + if category == 'low': |
| 54 | + color = '001,086,102' |
| 55 | + elif category == 'ok': |
| 56 | + color = '000,125,028' |
| 57 | + else: |
| 58 | + color = '166,16,0' |
| 59 | + |
| 60 | + F.write('track name="Hf_%s_%s_%s" description="Hagfish wiggle plot %s %s for %s" color=%s\n' % |
| 61 | + (cptype, category, seqId, cptype, category, seqId, color)) |
| 62 | + F.write("fixedStep chrom=%s start=1 step=%d\n"% ( |
| 63 | + seqId, bin)) |
| 64 | + |
| 65 | + |
| 66 | + for r in range(0, len(data)+bin-1,bin): |
| 67 | + score = np.average(data[r:r+bin-1]) |
| 68 | + if not np.isnan(score): |
| 69 | + F.write("%.2f\n" % score) |
| 70 | + |
| 71 | +if __name__ == '__main__': |
| 72 | + |
| 73 | + #read an arbitrary seqId file |
| 74 | + for f in os.listdir('seqInfo'): |
| 75 | + if '.seqinfo' in f: |
| 76 | + seqInfoFile = os.path.join('seqInfo', f) |
| 77 | + break |
| 78 | + else: |
| 79 | + l.critical("cannot find a seqInfo file") |
| 80 | + sys.exit(-1) |
| 81 | + |
| 82 | + l.info("reading %s to get seqinfo" % seqInfoFile) |
| 83 | + with open(seqInfoFile) as F: |
| 84 | + seqInfo = pickle.load(F) |
| 85 | + |
| 86 | + l.info("discovered %d sequences" % len(seqInfo)) |
| 87 | + |
| 88 | + binSize = options.binSize |
| 89 | + if len(args) > 0: |
| 90 | + seqIds = args |
| 91 | + else: |
| 92 | + seqIds = seqInfo.keys() |
| 93 | + |
| 94 | + if not os.path.exists('wiggle'): |
| 95 | + os.mkdir('wiggle') |
| 96 | + |
| 97 | + if options.seqId: assert(len(seqIds) == 1) |
| 98 | + |
| 99 | + for seqId in seqIds: |
| 100 | + l.info("Processing %s" % seqId) |
| 101 | + |
| 102 | + if options.seqId: |
| 103 | + outSeqId = options.seqId |
| 104 | + else: |
| 105 | + outSeqId = seqId |
| 106 | + |
| 107 | + l.info("Outputting %s" % outSeqId) |
| 108 | + |
| 109 | + FG = open(os.path.join('wiggle', '%s.icp.ok.WIG' % (outSeqId)), 'w') |
| 110 | + FS = open(os.path.join('wiggle', '%s.icp.low.WIG' % (outSeqId)), 'w') |
| 111 | + FL = open(os.path.join('wiggle', '%s.icp.high.WIG' % (outSeqId)), 'w') |
| 112 | + FC = open(os.path.join('wiggle', '%s.icp.score.WIG' % (outSeqId)), 'w') |
| 113 | + |
| 114 | + FGE = open(os.path.join('wiggle', '%s.ecp.ok.WIG' % (outSeqId)), 'w') |
| 115 | + FSE = open(os.path.join('wiggle', '%s.ecp.low.WIG' % (outSeqId)), 'w') |
| 116 | + FLE = open(os.path.join('wiggle', '%s.ecp.high.WIG' % (outSeqId)), 'w') |
| 117 | + |
| 118 | + file_base = os.path.join('combined', seqId) |
| 119 | + r_ok = np_load(file_base, 'r_ok') |
| 120 | + r_low = np_load(file_base, 'r_low') |
| 121 | + r_high = np_load(file_base, 'r_high') |
| 122 | + r_ok_ends = np_load(file_base, 'r_ok_ends') |
| 123 | + r_low_ends = np_load(file_base, 'r_low_ends') |
| 124 | + r_high_ends = np_load(file_base, 'r_high_ends') |
| 125 | + |
| 126 | + l.info("read %d datapoints" % len(r_ok)) |
| 127 | + median = np.median(r_ok) |
| 128 | + score = 1 - 2 * np.exp(-1 * (r_ok / median)) \ |
| 129 | + + np.exp(-1 * ((r_ok + r_low + r_high)/median)) |
| 130 | + |
| 131 | + seqLen = len(r_ok) |
| 132 | + l.info("sequence length of %s is %d" % (seqId, seqLen)) |
| 133 | + |
| 134 | + |
| 135 | + generate_histogram(FG,'ICP', 'ok', outSeqId, 'ok', r_ok, binSize) |
| 136 | + generate_histogram(FS,'ICP', 'low', outSeqId, 'short', r_low, binSize) |
| 137 | + generate_histogram(FL,'ICP', 'high', outSeqId, 'long', r_high, binSize) |
| 138 | + |
| 139 | + generate_histogram(FGE,'ECP', 'ok', outSeqId, 'ok_ends', r_ok_ends, binSize) |
| 140 | + generate_histogram(FSE,'ECP', 'low', outSeqId, 'short_ends', r_low_ends, binSize) |
| 141 | + generate_histogram(FLE,'ECP', 'high', outSeqId, 'long_ends', r_high_ends, binSize) |
| 142 | + |
| 143 | + generate_histogram(FC,'ICP', 'Score', outSeqId, 'score', score, binSize) |
| 144 | + |
| 145 | + FG.close() |
| 146 | + FS.close() |
| 147 | + FL.close() |
| 148 | + FC.close() |
| 149 | + |
| 150 | + FGE.close() |
| 151 | + FSE.close() |
| 152 | + FLE.close() |
| 153 | + |
0 commit comments