hagfish_report

#!/usr/bin/env python
import os
import sys
import math
import pickle
import jinja2

import numpy as np
import matplotlib as mpl
mpl.use('agg')
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

try:
    import scipy.stats
    SCIPY = True
except ImportError:
    SCIPY = False

import logging
import optparse

import hagfishUtils as hu
from hagfish_file_util import *

## Command line interface: general options
parser = optparse.OptionParser()

# -v can be repeated; optparse counts how often it was given
parser.add_option(
    '-v',
    action="count",
    dest='verbose',
    help='Show debug information',
)

# sequences shorter than this do not contribute to the cumulative statistics
parser.add_option(
    '--csc',
    type='int',
    dest='cum_seqlen_cutoff',
    default=10000,
    help=('cutoff for seq lenght in calculating '
          'cumulative stats - ignore seqs shorter than this'),
)

# defaults for options that do not declare their own (--dpi among them)
parser.set_defaults(format=['png'], dpi=100)

parser.add_option(
    '--dpi',
    type='int',
    dest='dpi',
    help=('dpi of the image, pixel calculations are based on '
          'dpi 100, setting dpi to 200 will double the x/y pixel '
          'size of your image)'),
)

parser.add_option(
    '--only_ok',
    action='store_true',
    dest='onlyok',
    default=False,
    help='only print the "ok" plot',
)

# remaining positional arguments are the ids of the sequences to report on
options, args = parser.parse_args()

# Shared 'hagfish' logger writing through a stream handler.  Every record is
# prefixed with a HAGFISH tag wrapped in ANSI escape codes (chr(27) is ESC).
l = logging.getLogger('hagfish')
handler = logging.StreamHandler()
logmark = chr(27) + '[0;37;44mHAGFISH' + \
          chr(27) + '[0m '

formatter = logging.Formatter(
    logmark + '%(levelname)-6s %(message)s')

handler.setFormatter(formatter)
l.addHandler(handler)

# optparse stores None for a "count" option without a default when the flag
# never appears on the command line; treat that as verbosity 0 rather than
# comparing None with an integer.
verbosity = options.verbose or 0

# -v -> INFO, -vv (or more) -> DEBUG, otherwise only warnings and worse
if verbosity >= 2:
    l.setLevel(logging.DEBUG)
elif verbosity == 1:
    l.setLevel(logging.INFO)
else:
    l.setLevel(logging.WARNING)

if not SCIPY:
    l.warning("Cannot load scipy - no density plots, no smooth lines")

if __name__ == '__main__':

    # all plots and the text report are written to ./report
    if not os.path.exists('report'):
        os.makedirs('report')

    # read an arbitrary seqInfo file: the first directory entry that looks
    # like one is used.  The else clause belongs to the for loop, so it only
    # runs when no entry matched at all (including an empty directory).
    for f in os.listdir('seqInfo'):
        if '.seqinfo' in f:
            seqInfoFile = os.path.join('seqInfo', f)
            break
    else:
        l.critical("cannot find a seqInfo file")
        sys.exit(-1)

    l.info("reading %s for seqinfo" % seqInfoFile)
    # NOTE: pickle executes arbitrary code on load - only use this on
    # seqInfo files from a trusted source.  Pickles are binary data, hence
    # the 'rb' mode.
    with open(seqInfoFile, 'rb') as F:
        seqInfo = pickle.load(F)

    l.info("discovered %d sequences" % len(seqInfo))

    # sequence ids given on the command line restrict the report; without
    # any, every sequence in the seqInfo file is used
    if len(args) > 0:
        seqs_to_parse = args
    else:
        seqs_to_parse = seqInfo.keys()

    if len(seqs_to_parse) > 18:
        # report the number of sequences that will actually be parsed
        l.info("quite a few sequences (%d) :( might take a while!"
               % len(seqs_to_parse))

    # Cumulative coverage tracks, grown by concatenating per-sequence data.
    # All six are always created: the loading loop and the histogram code
    # use the high/low tracks regardless of --only_ok, which only decides
    # what gets drawn.
    cum_r_ok = np.array([])
    cum_r_ok_ends = np.array([])
    cum_r_high = np.array([])
    cum_r_low = np.array([])
    cum_r_high_ends = np.array([])
    cum_r_low_ends = np.array([])

    l.info("Start parsing %d sequences" % len(seqs_to_parse))

    seqCount = 0
    totalSeqLen = 0
    foundBins = False

    # see if there is gapdata to load; nons accumulates the number of
    # non-zero entries found in the per-sequence 'nns' gap tracks
    if os.path.exists('gaps'):
        GAP = True
        nons = 0
    else:
        GAP = False

    # Load the coverage tracks of every requested sequence and append those
    # of sufficiently long sequences to the cumulative tracks.
    for seqId in seqs_to_parse:

        seqCount = seqCount + 1
        seqLen = seqInfo[seqId]['length']
        totalSeqLen = totalSeqLen + seqLen
        l.info("processing %s (%d nt)" % (seqId, seqLen))

        # per-sequence coverage tracks live under combined/, gap data
        # under gaps/
        file_base = os.path.join('combined', seqId)
        gap_base = os.path.join('gaps', seqId)

        try:
            r_ok = np_load(file_base, 'r_ok')
            r_ok_ends = np_load(file_base, 'r_ok_ends')
            # the high/low tracks are loaded even with --only_ok
            r_high = np_load(file_base, 'r_high')
            r_low = np_load(file_base, 'r_low')
            r_high_ends = np_load(file_base, 'r_high_ends')
            r_low_ends = np_load(file_base, 'r_low_ends')

            if GAP:
                gap_positions = np.flatnonzero(np_load(gap_base, 'nns'))
                nons += len(gap_positions)
        except IOError:
            # a missing track means the sequence is left out entirely
            l.critical("skipping %s - no data" % seqId)
            continue

        if GAP:
            # running total over all sequences seen so far
            l.info('found %d NNNs' % nons)

        # short sequences are not added to the cumulative plot
        if seqLen < options.cum_seqlen_cutoff:
            continue

        cum_r_ok = np.concatenate([cum_r_ok, r_ok])
        cum_r_ok_ends = np.concatenate([cum_r_ok_ends, r_ok_ends])
        cum_r_high = np.concatenate([cum_r_high, r_high])
        cum_r_low = np.concatenate([cum_r_low, r_low])
        cum_r_high_ends = np.concatenate([cum_r_high_ends, r_high_ends])
        cum_r_low_ends = np.concatenate([cum_r_low_ends, r_low_ends])

    # from here on seqLen is the summed length of all requested sequences
    seqLen = totalSeqLen

    l.info("done loading")

    # Nothing ends up in the cumulative tracks when every sequence was
    # skipped or is shorter than the --csc cutoff; the statistics below are
    # undefined in that case, so stop with a clear message.
    if len(cum_r_ok) == 0:
        l.critical("no coverage data loaded - nothing to report")
        sys.exit(-1)

    l.info("start calculating & plotting")

    median_ok = np.median(cum_r_ok)
    median_avg = np.mean(cum_r_ok)

    if median_ok == 0:
        l.error("median is zero - using average :(")
        median_ok = median_avg

    # NOTE(review): seqId is merely the last id visited by the loading loop;
    # the values themselves are computed over the cumulative track.
    l.info("median ok for %s is %s" % (seqId, median_ok))
    l.info("average ok for %s is %s" % (seqId, median_avg))

    score = 0.5 * median_ok * (
        1 - 2 * np.exp(-1 * (cum_r_ok / median_ok))
        + np.exp(-1 * ((cum_r_ok + cum_r_low + cum_r_high) / median_ok)))

    l.debug("Calculated score: min %s, max %s" % (np.min(score), np.max(score)))

    # largest coverage over the three tracks; ndarray.max() avoids walking
    # the arrays element by element in Python
    maxx = max(cum_r_ok.max(), cum_r_high.max(), cum_r_low.max())
    maxx = 1500 * ((maxx / 1000) + 1)

    # unit-width bin edges 0..999 for the plots, coarse edges for the report
    bins = np.arange(1000)
    rBins = np.array([0, 1, 10, 20, 30, 40, 50, 100, int(10e9)])

    # no_score_bins + 1 evenly spaced edges from -1 to 1
    no_score_bins = 100
    score_bins = (np.arange(no_score_bins + 1) / (no_score_bins / 2.)) - 1

    l.debug("Bins %s" % bins)
    ok_hist, oe = np.histogram(cum_r_ok, bins=bins)
    ok_hist_ends, _oee = np.histogram(cum_r_ok_ends, bins=bins)
    rep_ok_hist_ends, roee = np.histogram(cum_r_ok_ends, bins=rBins)
    rep_ok_hist, roe = np.histogram(cum_r_ok, bins=rBins)

    # the high/low histograms are always computed; --only_ok merely decides
    # which of them are drawn
    high_hist, he = np.histogram(cum_r_high, bins=bins)
    low_hist, le = np.histogram(cum_r_low, bins=bins)
    high_hist_ends, _hee = np.histogram(cum_r_high_ends, bins=bins)
    low_hist_ends, _lee = np.histogram(cum_r_low_ends, bins=bins)
    rep_high_hist_ends, rhee = np.histogram(cum_r_high_ends, bins=rBins)
    rep_low_hist_ends, rlee = np.histogram(cum_r_low_ends, bins=rBins)
    rep_high_hist, rhe = np.histogram(cum_r_high, bins=rBins)
    rep_low_hist, rle = np.histogram(cum_r_low, bins=rBins)

    hist_edges = oe
    rep_hist_edges = roee

    #print coverage distribution plot: one dot per bin and insert class
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.title('Coverage distribution for sequence',
              fontdict={'size' : 10})
    ax.set_xlabel('coverage')
    ax.set_ylabel('no nucleotides with coverage')

    # x is the left edge of each bin, y the number of positions in the bin
    if not options.onlyok:
        ax.plot((he[:-1]), (high_hist), '.', mfc='#B73147', mew=0,
                alpha=0.3, label="long insert")
        ax.plot((le[:-1]), (low_hist), '.', mfc='#538ABF', mew=0,
                label="short insert", alpha=0.3)
    ax.plot((hist_edges[:-1]), (ok_hist), '.', mfc='#3BB058', mew=0,
            label="correct insert", alpha=0.3)

    # Query the limits on the Axes itself; the deprecated get_axes()
    # indirection no longer exists in newer matplotlib releases.
    minX, maxX = ax.get_xlim()
    minY, maxY = ax.get_ylim()
    # raise the lower y limit to 10, keep the automatically chosen upper one
    ax.set_ylim([10, maxY])

    # horizontal marker at the total sequence length
    ax.hlines(seqLen, minX, maxX, linestyles='solid',
              linewidth=1, colors="black",
              label='Sequence length (%d)' % seqLen)
    ax.legend(prop={'size' :'x-small'})

    def smoother(a, steps):
        """Return the moving average of `a` over windows of `steps` items.

        The result has len(a) - steps + 1 entries; entry i is the mean of
        a[i:i + steps].
        """
        window_count = len(a) - steps + 1
        total = np.zeros(window_count)
        # add up the `steps` shifted views of a, one offset at a time
        for offset in range(steps):
            total += a[offset:offset + window_count]
        return total / float(steps)

    # overlay a moving average (window of smooth_step bins) of each
    # histogram as a line on the coverage plot
    smooth_step = 5
    smooth_x = hist_edges[:-smooth_step]

    smthok = smoother(ok_hist, smooth_step)
    ax.plot(smooth_x, smthok, '#3BB058')

    if not options.onlyok:
        smthlow = smoother(low_hist, smooth_step)
        smthhigh = smoother(high_hist, smooth_step)
        ax.plot(smooth_x, smthlow, '#538ABF')
        ax.plot(smooth_x, smthhigh, '#B73147')

    ax.set_yscale('log')
    # NOTE(review): repeats the sequence-length marker with the limits
    # captured earlier (minX, maxX)
    ax.hlines(
        seqLen, minX, maxX,
        linestyles='solid', linewidth=1, colors="black",
        label='Sequence length (%d)' % seqLen)

    coverage_png = os.path.join('report', 'coverage.png')
    plt.savefig(coverage_png, dpi=options.dpi)


    #print cumulative coverage distribution plot
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.title('Inverse cumulative coverage',
              fontdict={'size' : 10})
    ax.set_xlabel('coverage')
    ax.set_ylabel('genome fraction')

    def _inverse_cumulative(hist):
        """Return sum(hist[i:]) / seqLen for every bin i >= 1.

        The first bin is left out; the result has len(hist) - 1 entries.
        """
        return np.cumsum(hist[1:][::-1])[::-1] / float(seqLen)

    cp_ok_hist = _inverse_cumulative(ok_hist)
    cp_ok_hist_ends = _inverse_cumulative(ok_hist_ends)

    if not options.onlyok:
        cp_high_hist = _inverse_cumulative(high_hist)
        cp_high_hist_ends = _inverse_cumulative(high_hist_ends)

        cp_low_hist = _inverse_cumulative(low_hist)
        cp_low_hist_ends = _inverse_cumulative(low_hist_ends)

    if GAP:
        # shade the top nons / seqLen part of the y axis as gaps
        gapfrac = 1 - (nons / float(seqLen))
        # call form of print: same output, valid on Python 2 and 3
        print(gapfrac)
        ax.axhspan(gapfrac, 1, facecolor=hu.COLLIGHTYELLOW,
                   lw=0, label="gaps")

    # all histograms share the same bin edges; skip the first bin and the
    # closing edge so x lines up with the inverse cumulative arrays
    icp_x = he[1:-1]

    # shaded band between the "ends" curve and the full curve per class
    ax.fill_between(icp_x,
                    cp_ok_hist_ends, cp_ok_hist,
                    color='#3BB058', alpha=0.3,
                    label='Ok ICP')
    if not options.onlyok:
        ax.fill_between(icp_x,
                        cp_high_hist_ends, cp_high_hist,
                        color='#B73147', alpha=0.3,
                        label="High ICP")
        # this band shows the low (short insert) class, label it as such
        ax.fill_between(icp_x,
                        cp_low_hist_ends, cp_low_hist,
                        color='#538ABF', alpha=0.3,
                        label="Low ICP")

    ax.plot(icp_x, cp_ok_hist_ends, '-', c='#3BB058',
            label="Ok ECP")
    if not options.onlyok:
        ax.plot(icp_x, cp_high_hist_ends, '-', c='#B73147',
                label="High ECP")
        ax.plot(icp_x, cp_low_hist_ends, '-', c='#538ABF',
                label="Low ECP")

    ax.legend(prop={'size' :'x-small'})
    # ask the Axes directly; get_axes() is gone from newer matplotlib
    minX, maxX = ax.get_xlim()

    ax.set_xscale('log')
    ax.set_xlim(1, 1000)
    plt.savefig(os.path.join('report', 'cumul.coverage.png'),
                dpi=options.dpi)

    #generate text reports
    report_file = os.path.join('report', 'report')
    # the template is expected next to this script
    template_file = os.path.join(os.path.dirname(__file__),
                                 'report_template.jinja2')

    # read through the handle that was just opened, so it is closed again
    # when the with block ends
    with open(template_file) as F:
        template = jinja2.Template(F.read())

    # histogram counts over the coarse report bins (rBins) as a percentage
    # of the total sequence length
    rep_ok_proc = rep_ok_hist / float(seqLen) * 100
    rep_ok_proc_ends = rep_ok_hist_ends / float(seqLen) * 100

    rep_high_proc = rep_high_hist / float(seqLen) * 100
    rep_high_proc_ends = rep_high_hist_ends / float(seqLen) * 100

    rep_low_proc = rep_low_hist / float(seqLen) * 100
    rep_low_proc_ends = rep_low_hist_ends / float(seqLen) * 100

    # every name defined so far is handed to the template as context
    with open(report_file, 'w') as F:
        F.write(template.render(locals()))