|
| 1 | +#!/usr/bin/env python |
| 2 | +import os |
| 3 | +import sys |
| 4 | +import math |
| 5 | +import pickle |
| 6 | +import jinja2 |
| 7 | + |
| 8 | +import numpy as np |
| 9 | + |
| 10 | +import logging |
| 11 | +import optparse |
| 12 | + |
| 13 | +from hagfish_file_util import * |
| 14 | + |
| 15 | +## Arguments: General options |
| 16 | +parser = optparse.OptionParser() |
| 17 | + |
| 18 | +parser.add_option('-f', dest='fasta', |
| 19 | + help='reference fasta file') |
| 20 | +parser.add_option('-v', dest='verbose', action="count", |
| 21 | + help='Show debug information') |
| 22 | + |
| 23 | +options, args = parser.parse_args() |
| 24 | + |
| 25 | +l = logging.getLogger('hagfish') |
| 26 | + |
| 27 | +handler = logging.StreamHandler() |
| 28 | +logmark = chr(27) + '[0;37;44mHAGFISH' + \ |
| 29 | + chr(27) + '[0m ' |
| 30 | + |
| 31 | +formatter = logging.Formatter( |
| 32 | + logmark + '%(levelname)-6s %(message)s') |
| 33 | + |
| 34 | +handler.setFormatter(formatter) |
| 35 | +l.addHandler(handler) |
| 36 | + |
| 37 | +if options.verbose >= 2: |
| 38 | + l.setLevel(logging.DEBUG) |
| 39 | +elif options.verbose == 1: |
| 40 | + l.setLevel(logging.INFO) |
| 41 | +else: |
| 42 | + l.setLevel(logging.WARNING) |
| 43 | + |
| 44 | +if not options.fasta: |
| 45 | + l.critical("Must specify a fasta input file") |
| 46 | + sys.exit(-1) |
| 47 | + |
| 48 | +if __name__ == '__main__': |
| 49 | + |
| 50 | + if not os.path.exists('gaps'): |
| 51 | + os.makedirs('gaps') |
| 52 | + |
| 53 | + #read an arbitrary seqId file |
| 54 | + for f in os.listdir('seqInfo'): |
| 55 | + if '.seqinfo' in f: |
| 56 | + seqInfoFile = os.path.join('seqInfo', f) |
| 57 | + break |
| 58 | + else: |
| 59 | + l.critical("cannot find a seqInfo file") |
| 60 | + sys.exit(-1) |
| 61 | + |
| 62 | + l.info("reading %s for seqinfo" % seqInfoFile) |
| 63 | + with open(seqInfoFile) as F: |
| 64 | + seqInfo = pickle.load(F) |
| 65 | + |
| 66 | + l.info("discovered %d sequences" % len(seqInfo)) |
| 67 | + |
| 68 | + seqslist = set(seqInfo.keys()) |
| 69 | + |
| 70 | + for seqId, seq in fastareader(options.fasta): |
| 71 | + l.info('Processing %s (%d nt)' % (seqId, len(seq))) |
| 72 | + |
| 73 | + nns = np.array(np.array(list(seq)) == 'n', 'b') |
| 74 | + np_savez(os.path.join('gaps', seqId), nns = nns ) |
| 75 | + |
0 commit comments