diff --git a/.bumpversion.cfg b/.bumpversion.cfg deleted file mode 100644 index ea736da..0000000 --- a/.bumpversion.cfg +++ /dev/null @@ -1,13 +0,0 @@ -[bumpversion] -current_version = 1.0.3 -commit = True -tag = False - -[bumpversion:file:setup.py] - -[bumpversion:file:contiguity/Contiguity.py] - -[bumpversion:file:do_release.sh] - -[bumpversion:file:docs/conf.py] - diff --git a/.idea/.name b/.idea/.name new file mode 100644 index 0000000..5d17c81 --- /dev/null +++ b/.idea/.name @@ -0,0 +1 @@ +Contiguity \ No newline at end of file diff --git a/.idea/Contiguity.iml b/.idea/Contiguity.iml new file mode 100644 index 0000000..a34a857 --- /dev/null +++ b/.idea/Contiguity.iml @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/.idea/encodings.xml b/.idea/encodings.xml new file mode 100644 index 0000000..e206d70 --- /dev/null +++ b/.idea/encodings.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..1d5c03f --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,15 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..3b31283 --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,7 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..2773d17 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..0569752 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/.idea/scopes/scope_settings.xml b/.idea/scopes/scope_settings.xml new file mode 100644 index 0000000..922003b --- /dev/null +++ b/.idea/scopes/scope_settings.xml @@ -0,0 +1,5 @@ + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..c80f219 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/.idea/workspace.xml b/.idea/workspace.xml new file mode 100644 index 0000000..915c6be --- /dev/null +++ b/.idea/workspace.xml @@ -0,0 +1,225 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1437446196822 + + + 1437449775461 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index ce220d3..0000000 --- a/.travis.yml +++ /dev/null @@ -1,14 +0,0 @@ -language: python -python: - - 2.7 -before_install: - # We do this to make sure we get dependencies - - sudo apt-get update -qq -install: - - pip install -r requirements.txt --use-mirrors - - pip install . -notifications: - email: true -script: - # cd tests && sh TEST.sh - - echo 'Not testing ATM' diff --git a/contiguity/Contiguity.py b/Contiguity.py similarity index 98% rename from contiguity/Contiguity.py rename to Contiguity.py index e6586d2..ace3496 100755 --- a/contiguity/Contiguity.py +++ b/Contiguity.py @@ -24,7 +24,7 @@ __title__ = 'Contiguity' -__version__ = '1.0.3' +__version__ = '1.0.4' __description__ = "Tool for visualising assemblies" __author__ = 'Mitchell Sullivan' __license__ = 'GPLv3' @@ -2141,7 +2141,7 @@ def clear_all(self): # load an assembly from a variety of files def load_assembly(self): - filename = tkFileDialog.askopenfilename() + filename = tkFileDialog.askopenfilename(title='select assembly graph file') if filename == '' or filename == (): return self.clear_all() @@ -2176,10 +2176,76 @@ def load_assembly(self): elif what.startswith('#FASTG'): self.load_fastg() self.update_console('FASTG loaded.') + elif what.startswith('H\tVN:Z:'): + self.load_gfa() + self.update_console('GFA loaded.') else: tkMessageBox.showerror('Invalid format', 'Contiguity cannot recognise file type.') self.writeWorkCont() + def load_gfa(self): + seqdict = {} + with open(self.csagfile.get()) as gfa: + for line in gfa: + if line.startswith('S\t'): + sname = line.split()[1] + seq = line.split()[2] + if seq == '*' and seqdict == {}: + fasta = tkFileDialog.askopenfilename(title='select FASTA file.') + with open(fasta) as f: + first = True + for faline in f: + if faline.startswith('>'): + if first: + first = False + else: + seqdict[name] = seq + name = faline[1:].split()[0] + seq = '' + else: + seq += faline.rstrip() + seqdict[name] = seq + if seq == '*': + seq = seqdict[sname] + aninstance = contig(sname, sname, seq) + self.contigDict[sname] = aninstance + if line.startswith('L\t') or line.startswith('C\t'): + splitline = line.split() + if splitline[2] == '+': + dira = True + else: + dira = False + if splitline[4] == '+': + dirb = True + else: + dirb = False + cigar = splitline[5] + intstring = '' + overlap = 0 + for i in cigar: + if i.isdigit(): + intstring += i + else: + if i in ['M', 'D', 'N', 'H', 'P']: + overlap += int(intstring) + intstring = '' + self.edgelist.append((splitline[1], dira, splitline[3], dirb, overlap)) + for i in self.edgelist: + contiga, dira, contigb, dirb, overlap = i + if dira and dirb: + self.contigDict[contiga].to.append((contigb, True, overlap)) + self.contigDict[contigb].fr.append((contiga, False, overlap)) + elif dira and not dirb: + self.contigDict[contiga].to.append((contigb, False, overlap)) + self.contigDict[contigb].to.append((contiga, False, overlap)) + elif not dira and dirb: + self.contigDict[contiga].fr.append((contigb, True, overlap)) + self.contigDict[contigb].fr.append((contiga, True, overlap)) + else: + self.contigDict[contiga].fr.append((contigb, False, overlap)) + self.contigDict[contigb].to.append((contiga, True, overlap)) + + # load ace file def load_ace(self): ace = open(self.csagfile.get()) @@ -2760,7 +2826,7 @@ def ok_edges(self): global khmer import khmer except ImportError: - proceed_no_khmer = tkMessageBox.askyesno('Khmer not found', 'Proceed without installing Khmer (Not recommended)?') + proceed_no_khmer = tkMessageBox.askyesno('Khmer not found.', 'Proceed without installing Khmer? (not recommended - see manual)?') if proceed_no_khmer: args.khmer = False else: @@ -3437,26 +3503,15 @@ def get_long_edge(self): self.edgelist.append((i[1:], True, j[1:], False, 'nnnnnnnnn')) def get_nmer_freq_khmer(self): - nmersize, reads, ht_size, ht_n, n_threads = self.nmersize.get(), self.readfile.get(), self.ht_size.get(), self.ht_number.get(), self.num_threads.get() - n_threads = 1 + nmersize, reads, ht_size, ht_n = self.nmersize.get(), self.readfile.get(), self.ht_size.get(), self.ht_number.get() ht_size = float('2e9') ht_n = 4 bigcount = True - self.ht = khmer.new_counting_hash(nmersize, ht_size, ht_n, n_threads) # HT_size, number ht, threads + self.ht = khmer.new_counting_hash(nmersize, ht_size, ht_n) # HT_size, number ht, threads self.ht.set_use_bigcount(bigcount) - rparser = khmer.ReadParser(reads, n_threads) - threads = [] + rparser = khmer.ReadParser(reads) self.queue.put('consuming input ' + reads) - for tnum in xrange(n_threads): - t = \ - threading.Thread( - target=self.ht.consume_fasta_with_reads_parser, - args=(rparser, ) - ) - threads.append(t) - t.start() - for t in threads: - t.join() + self.ht.consume_fasta_with_reads_parser(rparser) fp_rate = khmer.calc_expected_collisions(self.ht) self.queue.put('fp rate estimated to be %1.3f' % fp_rate) if fp_rate > 0.20: @@ -4875,22 +4930,32 @@ def writeMultiFasta(self): parser = argparse.ArgumentParser(prog='Contiguity', formatter_class=argparse.RawDescriptionHelpFormatter, description=''' Contiguity.py: A pairwise comparison and contig adjacency graph exploration tool. -USAGE: Contiguity.py -cl -c -fq -o +Version: 1.0.4 +License: GPLv3 -REQUIREMENTS: With default settings Contigutiy requires at least 8gb of free memory (RAM) +USAGE: Contiguity.py -cl -c -fq -o contig file: FASTA file of contigs or scaffolds read file: Interleaved fastq file - read1_left, read1_right, read2_left etc... orientated as such --> <-- output folder: folder to put output files in, can and will overwrite files in this folder, will create folder if folder doesn't exist -Only other option to keep in mind is -rl if the read length is not 101bp +REQUIREMENTS: With default settings Contigutiy requires at least (Potentially more) 6gb of free memory (RAM). This is +because Contiguity uses Khmer to create a De Bruijn graph. + +If you are running into memory issues (e.g. the process is being killed by the Operating system (killed 9)) +you may want to reduce the hash table number or hash table size (using the flags -ht_n and -ht_s), or free up more memory. +For large datasets more memory may need to be used. +please read http://khmer.readthedocs.org/en/v1.1/choosing-table-sizes.html for more information about hash tables + +The only other option to keep in mind is -rl if the read length is not 101bp, you may want to increase this value +for longer read lengths, or decrease for shorter read lengths (75% of maximum read length seems to work well). ''', epilog="Thanks for using Contiguity") parser.add_argument('-co', '--contig_file', action='store', help='fasta file of assembled contigs or scaffolds') parser.add_argument('-rf', '--read_file', action='store', help='read file') parser.add_argument('-o', '--output_folder', action='store', help='output folder') parser.add_argument('-k', '--kmer_size', action='store', type=int, default=31, help='k-mer size for finding adjacent contigs [31]') -parser.add_argument('-max_d', '--max_distance', action='store', type=int, default=300, help='maximum distance apart in the de bruijn graph for contigs to count as adjacent [300]') +parser.add_argument('-max_d', '--max_distance', action='store', type=int, default=200, help='maximum distance apart in the de bruijn graph for contigs to count as adjacent [300]') parser.add_argument('-kmer_a', '--kmer_average', action='store', type=int, default=-1, help='All k-mers above half this value will be traversed [auto]') parser.add_argument('-kmer_c', '--kmer_cutoff', action='store', type=int, default=-1, help='cutoff for k-mer values [auto]') parser.add_argument('-ov', '--overlap', action='store', type=int, default=None, help='minimum overlap to create edge [kmer_size-1]') @@ -4904,15 +4969,17 @@ def writeMultiFasta(self): parser.add_argument('-nd', '--no_db_edges', action='store_true', default=False, help='Don\'t get De Bruijn edges') parser.add_argument('-np', '--no_paired_edges', action='store_true', default=False, help='Don\'t get paired-end edges') parser.add_argument('-km', '--khmer', action='store_false', default=True, help='Don\'t use khmer for De Bruijn graph contruction (not recommended)') -parser.add_argument('-nt', '--num_threads', action='store', type=int, default=1, help='Number of threads to use for hash table building with khmer and for mapping reads with bowtie') -parser.add_argument('-ht_s', '--ht_size', action='store', default='2e9', help='Hash table size, for more information check http://khmer.readthedocs.org/en/v1.1/choosing-table-sizes.html') -parser.add_argument('-ht_n', '--ht_number', action='store', type=int, default=4, help='Hash table number, for more information check http://khmer.readthedocs.org/en/v1.1/choosing-table-sizes.html') +parser.add_argument('-nt', '--num_threads', action='store', type=int, default=1, help='Number of threads to use for mapping reads with bowtie [1]') +parser.add_argument('-ht_s', '--ht_size', action='store', default='1e9', help='Hash table size.') +parser.add_argument('-ht_n', '--ht_number', action='store', type=int, default=4, help='Hash table number.') args = parser.parse_args() if args.command_line: + if platform.system() == 'Windows': + args.khmer = False if args.khmer: import khmer if args.contig_file is None or args.read_file is None or args.output_folder is None: @@ -4937,4 +5004,4 @@ def writeMultiFasta(self): root.option_add("*Scrollbar.Background", "#C0C0FF") root.option_add("*Entry.Background", "#FFFFFF") app = App(root) - root.mainloop() + root.mainloop() \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index a594fd6..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,4 +0,0 @@ -include LICENSE -include requirements.txt -include README.rst -recursive-include docs/_build/html * diff --git a/README.rst b/README.rst index 20af3aa..1136ffe 100644 --- a/README.rst +++ b/README.rst @@ -5,121 +5,13 @@ Contiguity is a tool for constructing and visualising assembly graphs. It uses a linear layout so that the assembly graph can be directly compared to a reference. +The main website for contiguity can be found at http://mjsull.github.io/Contiguity -.. image:: https://pypip.in/version/Contiguity/badge.svg - :target: https://pypi.python.org/pypi/Contiguity/ - :alt: Latest Version - -.. image:: https://pypip.in/download/Contiguity/badge.svg - :target: https://pypi.python.org/pypi/Contiguity/ - :alt: Downloads - -.. image:: https://travis-ci.org/BeatsonLab-MicrobialGenomics/Contiguity.svg?branch=master - :target: https://travis-ci.org/BeatsonLab-MicrobialGenomics/Contiguity - :alt: Build status - - -.. image:: https://github.com/BeatsonLab-MicrobialGenomics/Contiguity/blob/master/docs/manual/Contiguity_SS.png - :alt: Contiguity Screen shot - :align: center - - -Requirements: - * Python 2.7+ - * NCBI-BLAST+ (needed for overlap edge creation and automatic comparison - generation) - * Bowtie 2 (needed for paired end edge creation) - - -Installation ------------- - -If you're not familiar with the command-line we recommend you ask local IT -support to help you install Contiguity. - - -Checking requirements are installed -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -You will need to install/have installed: - * ncbiblast+ >= 2.2.28 - * python >= 2.7 (**Python 3 is not supported**) - * bowtie2 >= 2.1.0 - -You can check these are installed by:: - - $ python --version - $ blastn -version - $ bowtie2 --version - -Installation of python, blastn or bowtie2 (without a package manager) is -beyond the scope of this document. - -If you have both python, blastn and bowtie2 you need to (if not already -present) install pip_. - -You can check if pip_ exists with:: - - $ which pip - -If you get a "not found", please read the `pip installation instructions`_. - -**If you already have pip we do suggest you upgrade it.** We are using version -1.5.6 at the time of writing this document. - -You can upgrade pip_ like this:: - - $ pip install --upgrade pip - - -pip based installation of Contiguity -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If you have root/admin something like:: - - $ pip install Contiguity - -Otherwise (not root/admin or permission denied errors running above):: - - $ pip install --user Contiguity - -If you installed using the --user option of pip_, Contiguity will typically -end up in: /home/$USER/.local/bin/ -You need to add this location to you ~/.bash_profile. - -Add Contiguity to your path:: - - $ echo 'export PATH=$PATH:/home/$USER/.local/bin/' >> ~/.bash_profile - $ source !$ - - -Testing the installation of Contiguity -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Run (in the Terminal):: - - $ Contiguity - - -Upgrading Contiguity -~~~~~~~~~~~~~~~~~~~~ - -You can upgrade like this:: - - pip install --upgrade Contiguity - - -**Please regularly check back to make sure you're running the most recent -Contiguity version.** - - -Usage/Docs ----------- +Contiguity can be downloaded from http://mjsull.github.io/Contiguity/files.html For detailed information on how to use Contiguity please see the manual_ otherwise see Quick Start below. - Quick Start ----------- @@ -127,7 +19,7 @@ Supported formats & the CAG ~~~~~~~~~~~~~~~~~~~~~~~~~~~ Contiguity works with ABySS_ (**.dot**), Velvet_ (**LastGraph**), Newbler_ -(**.ace**) and SPAdes_ (**FASTG**) formats. +(**.ace**), GFA_ and SPAdes_ (**FASTG**) formats. For all other assemblies, an assembly graph (.cag) can be created from the Contiguity GUI (file->create cag file) or using the command line. @@ -145,6 +37,8 @@ You can generate a CAG from the command line like:: $ Contiguity -cl -c -fq -o +Or by selecting File > Create CAG in the GUI and providing a read file and a contig file + This assumes: * (~8GB of free memory) * contig_file.fa: is in FASTA file of contigs or scaffolds @@ -188,16 +82,16 @@ Citation If you use Contiguity in your work, please cite it using:: - Mitchell J Sullivan, Nouri Ben Zakour, Brian Forde, Mitchell Stanton-Cook & Scott A Beatson* - Contiguity: Contig adjacency graph construction and visualisation - https://github.com/BeatsonLab-MicrobialGenomics/Contiguity + Sullivan MJ, Ben Zakour NL, Forde BM, Stanton-Cook M, Beatson SA. (2015) + Contiguity: Contig adjacency graph construction and visualisation. + PeerJ PrePrints 3:e1273 https://dx.doi.org/10.7287/peerj.preprints.1037v1 -.. _manual: https://github.com/BeatsonLab-MicrobialGenomics/Contiguity/raw/master/docs/manual/Contiguity_manual.pdf -.. _pip: http://www.pip-installer.org/en/latest/ +.. _manual: https://github.com/mjsull/Contiguity/wiki .. _pip installation instructions: http://pip.readthedocs.org/en/latest/installing.html .. _ABySS: http://www.bcgsc.ca/platform/bioinfo/software/abyss .. _Velvet: https://www.ebi.ac.uk/~zerbino/velvet/ .. _Newbler: http://www.454.com/products/analysis-software/ .. _SPAdes: http://bioinf.spbau.ru/spades +.. _GFA: https://github.com/pmelsted/GFA-spec diff --git a/contiguity/Contiguity b/contiguity/Contiguity deleted file mode 120000 index 040774c..0000000 --- a/contiguity/Contiguity +++ /dev/null @@ -1 +0,0 @@ -Contiguity.py \ No newline at end of file diff --git a/contiguity/__init__.py b/contiguity/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/contiguity/util/__init__.py b/contiguity/util/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/contiguity/util/checkCSAG.py b/contiguity/util/checkCSAG.py deleted file mode 100644 index 5c06cca..0000000 --- a/contiguity/util/checkCSAG.py +++ /dev/null @@ -1,241 +0,0 @@ -# Contiguity - Tool for visualising assemblies -# Copyright (C) 2013-2015 Mitchell Sullivan -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# -# Mitchell Sullivan -# mjsull@gmail.com -# School of Chemistry & Molecular Biosciences -# The University of Queensland -# Brisbane, QLD 4072. -# Australia - - -import sys, subprocess, string - -transtab = string.maketrans('atcgATCG', 'tagcTAGC') -theident = 85 - - -class contig: - def __init__(self, name, forseq, revseq): - self.name = name - self.forseq = forseq - self.revseq = revseq - self.to = [] - self.fr = [] - self.length = len(forseq) - -contigfile = open(sys.argv[1]) -getline = 0 -contigDict = {} -edgelist = [] -templg = open('templg.fa', 'w') -bignodes = set() -count = 0 -first = True -temped = open('tempedge.fa', 'w') -for line in contigfile: - if line.startswith('NODE'): - name = line.split()[1] - seq = line.split()[-1] - revseq = seq[::-1].translate(transtab) - aninstance = contig(name, seq, revseq) - contigDict[name] = aninstance - templg.write('>' + name + '\n' + seq + '\n') - elif line.startswith('EDGE'): - junk, a, dira, b, dirb, overlap = line.split() - if overlap == '.': - overlap = '' - if not 'n' in overlap: - temped.write('>' + str(count) + '\n') - if dira == 'True': - if dirb == 'True': - if overlap.isdigit(): - contigDict[a].to.append((b, True, int(overlap))) - contigDict[b].fr.append((a, False, int(overlap))) - temped.write(contigDict[a].forseq + contigDict[b].forseq[int(overlap):] + '\n') - edgelist.append((a, b, len(contigDict[a].forseq + contigDict[b].forseq[int(overlap):]))) - else: - contigDict[a].to.append((b, True, overlap)) - contigDict[b].fr.append((a, False, overlap[::-1].translate(transtab))) - temped.write(contigDict[a].forseq + overlap + contigDict[b].forseq + '\n') - edgelist.append((a, b, len(contigDict[a].forseq + overlap + contigDict[b].forseq))) - else: - if overlap.isdigit(): - contigDict[a].to.append((b, False, int(overlap))) - contigDict[b].to.append((a, False, int(overlap))) - temped.write(contigDict[a].forseq + contigDict[b].revseq[int(overlap):] + '\n') - edgelist.append((a, b, len(contigDict[a].forseq + contigDict[b].revseq[int(overlap):]))) - else: - contigDict[a].to.append((b, False, overlap)) - contigDict[b].to.append((a, False, overlap[::-1].translate(transtab))) - temped.write(contigDict[a].forseq + overlap + contigDict[b].revseq + '\n') - edgelist.append((a, b, len(contigDict[a].forseq + overlap + contigDict[b].revseq))) - else: - if dirb == 'True': - if overlap.isdigit(): - contigDict[a].fr.append((b, True, int(overlap))) - contigDict[b].fr.append((a, True, int(overlap))) - temped.write(contigDict[a].revseq + contigDict[b].forseq[int(overlap):] + '\n') - edgelist.append((a, b, len(contigDict[a].revseq + contigDict[b].forseq[int(overlap):]))) - else: - contigDict[a].fr.append((b, True, overlap)) - contigDict[b].fr.append((a, True, overlap[::-1].translate(transtab))) - temped.write(contigDict[a].revseq + overlap + contigDict[b].forseq + '\n') - edgelist.append((a, b, len(contigDict[a].revseq + overlap + contigDict[b].forseq))) - else: - if overlap.isdigit(): - contigDict[a].fr.append((b, False, int(overlap))) - contigDict[b].to.append((a, True, int(overlap))) - temped.write(contigDict[a].revseq + contigDict[b].revseq[int(overlap):] + '\n') - edgelist.append((a, b, len(contigDict[a].revseq + contigDict[b].revseq[int(overlap):]))) - else: - contigDict[a].fr.append((b, False, overlap)) - contigDict[b].to.append((a, True, overlap[::-1].translate(transtab))) - temped.write(contigDict[a].revseq + overlap + contigDict[b].revseq + '\n') - edgelist.append((a, b, len(contigDict[a].revseq + overlap + contigDict[b].revseq))) - count += 1 - -templg.close() -temped.close() - - - -first = True -reflist = {} -contiglist = {} -for i in sys.argv[2:]: - tempref = open(i) - for line in tempref: - if line.startswith('>'): - if first: - first = False - else: - reflist[name] = seq - contiglist[name] = [] - name = line[1:].split()[0] - seq = '' - else: - seq += line.rstrip() - tempref.close() -reflist[name] = seq -contiglist[name] = [] - -refout = open('tempref.fa', 'w') -reflen = {} -for i in reflist: - refout.write('>' + i + '\n' + reflist[i] + reflist[i] + '\n') - reflen[i] = len(reflist[i]) -refout.close() - - -subprocess.Popen('makeblastdb -dbtype nucl -out tempdb -in tempref.fa', shell=True).wait() -subprocess.Popen('blastn -task blastn -db tempdb -outfmt 6 -query templg.fa -out query_tempdb1.out', shell=True).wait() -subprocess.Popen('blastn -task blastn -db tempdb -outfmt 6 -query tempedge.fa -out query_tempdb2.out', shell=True).wait() - -bout = open('query_tempdb1.out') -inref = set() - -for line in bout: - query, subject, ident, length, mismatch, indel, qStart, qEnd, rStart, rEnd, eVal, bitScore = line.split() - ident = float(ident) - length = int(length) - qStart = int(qStart) - qEnd = int(qEnd) - rStart = int(rStart) - rEnd = int(rEnd) - if ident >= theident and qStart == 1 and qEnd == contigDict[query].length: - inref.add(query) - contiglist[subject].append((query, min([rStart, rEnd]), max([rStart, rEnd]), rStart < rEnd, ident, length)) -bout.close() -bout = open('query_tempdb2.out') -testset = set() -allconts = set(contigDict) -edges = set() -edgedict ={} - -for i in contiglist: - hitlist = contiglist[i] - hitlist.sort(key=lambda x: x[-2], reverse=True) - hitlist.sort(key=lambda x: x[-1], reverse=True) - newhitlist = [] - for j in hitlist: - getit = True - for k in newhitlist: - if j[1] >= k[1] and j[2] <= k[2]: - getit = False - break - if getit: - newhitlist.append(j) - newhitlist.sort(key=lambda x: x[1]) - hitlist = newhitlist - lasthit = None - for j in hitlist: - if lasthit != None: - if not (j[0], not j[3], lasthit[0], not lasthit[3]) in edges and j[1] < lasthit[2] + 301: - edges.add((lasthit[0], lasthit[3], j[0], j[3])) - edgedict[(lasthit[0], lasthit[3], j[0], j[3])] = [lasthit, j] - lasthit = j - -SP = 0 -for i in edges: - #print i - gotit = False - if i[1]: - for j in contigDict[i[0]].to: - if j[0] == i[2] and j[1] == i[3]: - gotit = True - break - else: - for j in contigDict[i[0]].fr: - if j[0] == i[2] and j[1] == i[3]: - gotit = True - break - if gotit: - SP += 1 - - - - - - -edgeinref = set() -for line in bout: - query, subject, ident, length, mismatch, indel, qStart, qEnd, rStart, rEnd, eVal, bitScore = line.split() - ident = float(ident) - length = int(length) - qStart = int(qStart) - qEnd = int(qEnd) - rStart = int(rStart) - rEnd = int(rEnd) - if ident >= theident and length >= 0.98 * edgelist[int(query)][-1]: - edgeinref.add(int(query)) - - - -TP = 0 -FP = 0 -MA = 0 -for i in range(len(edgelist)): - if edgelist[i][0] in inref and edgelist[i][1] in inref: - if i in edgeinref: - TP += 1 - else: - FP += 1 - else: - MA += 1 - -print TP , FP , SP, len(edges) - SP -print TP * 100.0 / (TP+FP), '% precision' -print SP * 100.0 / len(edges), '% sensitivity' diff --git a/contiguity/util/checkLG.py b/contiguity/util/checkLG.py deleted file mode 100644 index 302eb0f..0000000 --- a/contiguity/util/checkLG.py +++ /dev/null @@ -1,279 +0,0 @@ -# Contiguity - Tool for visualising assemblies -# Copyright (C) 2013-2015 Mitchell Sullivan -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# -# Mitchell Sullivan -# mjsull@gmail.com -# School of Chemistry & Molecular Biosciences -# The University of Queensland -# Brisbane, QLD 4072. -# Australia - - -import sys, subprocess - -theident = 85 -bignodesize = 49 - -class contig: - def __init__(self, name, forseq, revseq): - self.name = name - self.forseq = forseq - self.revseq = revseq - self.to = [] - self.fr = [] - self.length = len(forseq) - -lg = open(sys.argv[1]) -getline = 0 -contigDict = {} -edgelist = [] -templg = open('templg.fa', 'w') -bignodes = set() -count = 0 -for line in lg: - if line.startswith('NODE'): - name = line.split()[1] - getline = 1 - elif getline == 1: - forseq = line.rstrip() - getline = 2 - elif getline == 2: - revseq = line.rstrip() - getline = 0 - aninstance = contig(name, forseq, revseq) - if len(forseq) >= bignodesize: - bignodes.add(name) - templg.write('>' + name + '\n' + forseq + '\n') - contigDict[name] = aninstance - elif line.startswith('ARC'): - to, fr = line.split()[1:3] - if to[0] == '-': - if fr not in contigDict[to[1:]].fr: - contigDict[to[1:]].fr.append(fr) - else: - if fr not in contigDict[to].to: - contigDict[to].to.append(fr) - if to[0] == '-': - to = to[1:] - else: - to = '-' + to - if fr[0] == '-': - if to not in contigDict[fr[1:]].to: - contigDict[fr[1:]].to.append(to) - else: - if to not in contigDict[fr].fr: - contigDict[fr].fr.append(to) -templg.close() -temped = open('tempedge.fa', 'w') -todo = [] -outpaths = [] -for i in bignodes: - for j in contigDict[i].to: - todo.append([i, j]) - for j in contigDict[i].fr: - todo.append(['-' + i, j]) - -while len(todo) != 0: - currpath = todo.pop() - if currpath[-1].replace('-', '') in bignodes: - outpaths.append(currpath) - elif len(currpath) < 300: - if currpath[-1][0] == '-': - for i in contigDict[currpath[-1][1:]].fr: - todo.append(currpath + [i]) - else: - for i in contigDict[currpath[-1]].to: - todo.append(currpath + [i]) - -temppaths = [] -for i in outpaths: - if len(i) > 2: - lall = 0 - for j in i[1:-1]: - if j[0] == '-': - lall += contigDict[j[1:]].length - else: - lall += contigDict[j].length - if lall <= 300: - temppaths.append(i) - else: - temppaths.append(i) - # print i - -count = 0 -for i in contigDict: - contigDict[i].to = [] - contigDict[i].fr = [] -for i in temppaths: - seq = '' - for j in i: - if j[0] == '-': - seq += contigDict[j[1:]].revseq - else: - seq += contigDict[j].forseq - edgelist.append((i[0].replace('-', ''), i[-1].replace('-', ''), len(seq))) - if i[0][0] == '-': - a = i[0][1:] - if i[-1][0] == '-': - b = i[-1][1:] - contigDict[a].fr.append((b, False)) - contigDict[b].to.append((a, True)) - else: - b = i[-1] - contigDict[a].fr.append((b, True)) - contigDict[b].fr.append((a, True)) - else: - a = i[0] - if i[-1][0] == '-': - b = i[-1][1:] - contigDict[a].to.append((b, False)) - contigDict[b].to.append((a, False)) - else: - b = i[-1] - contigDict[a].to.append((b, True)) - contigDict[b].fr.append((a, False)) - temped.write('>' + str(count) + '\n' + seq + '\n') - count += 1 -temped.close() - - - -first = True -reflist = {} -contiglist = {} -for i in sys.argv[2:]: - tempref = open(i) - for line in tempref: - if line.startswith('>'): - if first: - first = False - else: - reflist[name] = seq - contiglist[name] = [] - name = line[1:].split()[0] - seq = '' - else: - seq += line.rstrip() - tempref.close() -reflist[name] = seq -contiglist[name] = [] - -refout = open('tempref.fa', 'w') -reflen = {} -for i in reflist: - refout.write('>' + i + '\n' + reflist[i] + reflist[i] + '\n') - reflen[i] = len(reflist[i]) -refout.close() - - -subprocess.Popen('makeblastdb -dbtype nucl -out tempdb -in tempref.fa', shell=True).wait() -subprocess.Popen('blastn -task blastn -db tempdb -outfmt 6 -query templg.fa -out query_tempdb1.out', shell=True).wait() -subprocess.Popen('blastn -task blastn -db tempdb -outfmt 6 -query tempedge.fa -out query_tempdb2.out', shell=True).wait() - -bout = open('query_tempdb1.out') -inref = set() - -for line in bout: - query, subject, ident, length, mismatch, indel, qStart, qEnd, rStart, rEnd, eVal, bitScore = line.split() - ident = float(ident) - length = int(length) - qStart = int(qStart) - qEnd = int(qEnd) - rStart = int(rStart) - rEnd = int(rEnd) - if ident >= theident and qStart == 1 and qEnd == contigDict[query].length: - inref.add(query) - contiglist[subject].append((query, min([rStart, rEnd]), max([rStart, rEnd]), rStart < rEnd, ident, length)) -bout.close() -bout = open('query_tempdb2.out') -testset = set() -allconts = set(contigDict) -edges = set() -edgedict ={} - -for i in contiglist: - hitlist = contiglist[i] - hitlist.sort(key=lambda x: x[-2], reverse=True) - hitlist.sort(key=lambda x: x[-1], reverse=True) - newhitlist = [] - for j in hitlist: - getit = True - for k in newhitlist: - if j[1] >= k[1] and j[2] <= k[2]: - getit = False - break - if getit: - newhitlist.append(j) - newhitlist.sort(key=lambda x: x[1]) - hitlist = newhitlist - lasthit = None - for j in hitlist: - if lasthit != None: - if not (j[0], not j[3], lasthit[0], not lasthit[3]) in edges and j[1] < lasthit[2] + 301: - edges.add((lasthit[0], lasthit[3], j[0], j[3])) - edgedict[(lasthit[0], lasthit[3], j[0], j[3])] = [lasthit, j] - lasthit = j - -SP = 0 -for i in edges: - #print i - gotit = False - if i[1]: - for j in contigDict[i[0]].to: - if j[0] == i[2] and j[1] == i[3]: - gotit = True - break - else: - for j in contigDict[i[0]].fr: - if j[0] == i[2] and j[1] == i[3]: - gotit = True - break - if gotit: - SP += 1 - - - - - - -edgeinref = set() -for line in bout: - query, subject, ident, length, mismatch, indel, qStart, qEnd, rStart, rEnd, eVal, bitScore = line.split() - ident = float(ident) - length = int(length) - qStart = int(qStart) - qEnd = int(qEnd) - rStart = int(rStart) - rEnd = int(rEnd) - if ident >= theident and length >= 0.98 * edgelist[int(query)][-1]: - edgeinref.add(int(query)) - - - -TP = 0 -FP = 0 -MA = 0 -for i in range(len(edgelist)): - if edgelist[i][0] in inref and edgelist[i][1] in inref: - if i in edgeinref: - TP += 1 - else: - FP += 1 - else: - MA += 1 - -print TP , FP , SP, len(edges) - SP -print TP * 100.0 / (TP+FP), '% precision' -print SP * 100.0 / len(edges), '% sensitivity' diff --git a/contiguity/util/coif.py b/contiguity/util/coif.py deleted file mode 100644 index 3fb7040..0000000 --- a/contiguity/util/coif.py +++ /dev/null @@ -1,955 +0,0 @@ -# COIF - Plasmid detection toolkit -# Copyright (C) 2013-2015 Mitchell Sullivan -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# -# Mitchell Sullivan -# mjsull@gmail.com -# School of Chemistry & Molecular Biosciences -# The University of Queensland -# Brisbane, QLD 4072. -# Australia - - -import networkx -import argparse -import os -import subprocess -import string -import sys -import numpy as np - -transtab = string.maketrans('atcgATCG', 'tagcTAGC') - - -# class containing all information for a contig -class contig: - def __init__(self, name, shortname, sequence, revseq=None, coverage=None): - self.name = name - self.shortname = shortname - self.forseq = sequence.upper() - if revseq == None: - tempseq = self.forseq[::-1] - self.revseq = tempseq.translate(transtab) - else: - self.revseq = revseq - self.length = len(sequence) - self.xlength = None - if self.length >= 1000000000: - self.strlen = str(round(self.length * 1.0 / 1000000000, 2)) + 'Gb' - elif self.length >= 1000000: - self.strlen = str(round(self.length * 1.0 / 1000000, 2)) + 'Mb' - elif self.length >= 1000: - self.strlen = str(self.length / 1000) + 'Kb' - else: - self.strlen = str(self.length) + 'bp' - self.visible = False - self.to = [] - self.fr = [] - if coverage is None: - self.coverage = 'N/A' - else: - self.coverage = round(coverage, 2) - try: - self.coverage = float(name.split('_')[5]) - except: - pass - gcount = self.forseq.count('G') - ccount = self.forseq.count('C') - acount = self.forseq.count('A') - tcount = self.forseq.count('T') - self.gccontent = round((gcount + ccount) * 100.0 / self.length, 2) - try: - self.gcskew = round((gcount - ccount) * 1.0 / (gcount + ccount), 2) - except ZeroDivisionError: - self.gcskew = 0 - try: - self.atskew = round((acount - tcount) * 1.0 / (acount + tcount), 2) - except ZeroDivisionError: - self.atskew = 0 - - -def load_fasta(self): - fasta = open(args.basic) - for line in fasta: - if line.startswith('>'): - if first: - first = False - else: - aninstance = contig(name, name, seq) - contigDict[name] = aninstance - name = line.rstrip()[1:] - seq = '' - else: - seq += line.rstrip() - aninstance = contig(name, name, seq) - contigDict[name] = aninstance - -# load a CAG file -def load_cag(self): - csag = open(args.cag_file) - edgelist = [] - global contigDict - for line in csag: - if line.split()[0] == 'NODE': - if len(line.split()) == 4: - title, entry, name, seq = line.split() - aninstance = contig(entry, name, seq) - else: - title, entry, name, coverage, seq = line.split() - if coverage == 'N/A': - coverage = None - else: - coverage = float(coverage) - aninstance = contig(entry, name, seq, None, coverage) - contigDict[entry] = aninstance - elif line.split()[0] == 'EDGE': - title, n1, d1, n2, d2, overlap = line.split() - if overlap == '.': - overlap = '' - if d1 == 'True': - d1 = True - else: - d1 = False - if d2 == 'True': - d2 = True - else: - d2 = False - if overlap.isdigit(): - overlap = int(overlap) - edgelist.append((n1, d1, n2, d2, overlap)) - for i in edgelist: - contiga, dira, contigb, dirb, overlap = i - if dira and dirb: - contigDict[contiga].to.append((contigb, True, overlap)) - contigDict[contigb].fr.append((contiga, False, overlap)) - elif dira and not dirb: - contigDict[contiga].to.append((contigb, False, overlap)) - contigDict[contigb].to.append((contiga, False, overlap)) - elif not dira and dirb: - contigDict[contiga].fr.append((contigb, True, overlap)) - contigDict[contigb].fr.append((contiga, True, overlap)) - else: - contigDict[contiga].fr.append((contigb, False, overlap)) - contigDict[contigb].to.append((contiga, True, overlap)) - -# find best reference -def predict_blast1(args): - maxgot = 0 - maxgot2 = 0 - bestref2 = None - bestref = None - for i in os.listdir(args.blast1): - reflen = 0 - ref = open(args.blast1 + '/' + i) - for line in ref: - if not line.startswith('>'): - reflen += len(line.rstrip()) - ref.close() - subprocess.Popen('makeblastdb -dbtype nucl -out ' + args.work_dir + '/tempdb -in ' + args.blast1 + '/' + i, shell=True, stdout=subprocess.PIPE).wait() - subprocess.Popen('blastn -db ' + args.work_dir + '/tempdb -outfmt 6 -num_threads 8 -query ' + args.work_dir + '/contigs.fa -out ' + args.work_dir + '/contigs_tempdb.out', shell=True).wait() - blast = open(args.work_dir + '/contigs_tempdb.out') - gotset = set() - querydict = {} - for line in blast: - query, subject, ident, length, mm, indel, qstart, qstop, rstart, rstop, eval, bitscore = line.split() - qstart, qstop, rstart, rstop, length, mm = map(int, [qstart, qstop, rstart, rstop, length, mm]) - eval = float(eval) - if eval <= 0.005: - for j in range(min([rstart, rstop]), max([rstart, rstop]) + 1): - gotset.add(j) - if not query in querydict: - querydict[query] = set() - for j in range(qstart, qstop + 1): - querydict[query].add(j) - blast.close() - aval, bval = 0, 0 - for j in contigDict: - if j in querydict: - aval += len(querydict[j]) - bval += contigDict[j].length - gotset2 = aval * 1.0 / bval - if len(gotset) * 1.0/ reflen + gotset2 >= maxgot: - bestref2 = bestref - maxgot2 = maxgot - bestref = i - maxgot = len(gotset) * 1.0 / reflen + gotset2 - elif len(gotset) * 1.0/reflen + gotset2 >= maxgot2: - bestref2 = i - maxgot2 = len(gotset) * 1.0 /reflen + gotset2 - return bestref2 - -# write CAG to FASTA for perfoming BLAST etc. -def write_fasta_cag(args): - out = open(args.work_dir + '/contigs.fa', 'w') - for i in contigDict: - out.write('>' + i + '\n' + contigDict[i].forseq + '\n') - out.close() - -# use selected reference to predict plasmid contigs -def predict_blast2(args): - subprocess.Popen('makeblastdb -dbtype nucl -out ' + args.work_dir + '/tempdb -in ' + args.blast2, shell=True, stdout=subprocess.PIPE).wait() - subprocess.Popen('blastn -db ' + args.work_dir + '/tempdb -outfmt 6 -num_threads 8 -query ' + args.work_dir + '/contigs.fa -out ' + args.work_dir + '/contigs_tempdb.out', shell=True).wait() - filtered = set() - blast = open(args.work_dir + '/contigs_tempdb.out') - for line in blast: - query, subject, ident, length, mm, indel, qstart, qstop, rstart, rstop, eval, bitscore = line.split() - qstart, qstop, rstart, rstop, length, mm = map(int, [qstart, qstop, rstart, rstop, length, mm]) - ident = float(ident) - if length >= args.min_length and ident >= args.min_ident and length >= contigDict[query].length * args.min_len_fract: - filtered.add(query) - blast.close() - candidates = set() - for i in contigDict: - if not i in filtered: - candidates.add(i) - return candidates - -# turn the contig dictionary into a Directional graph for networkx -def contigDict2nx(): - dg = networkx.DiGraph() - for i in contigDict: - dg.add_node(i) - dg.add_node('-' + i) - for j in contigDict[i].to: - if j[1]: - dg.add_edge(i, j[0]) - else: - dg.add_edge(i, '-' + j[0]) - for j in contigDict[i].fr: - if j[1]: - dg.add_edge('-' + i, j[0]) - else: - - dg.add_edge('-' + i, '-' + j[0]) - return dg - -# Find all simple paths between a list of candidates -def find_paths(dg, candidates, args): - outpaths = [] - listcan = list(candidates) - while len(listcan) > 0: - i = listcan[0] - for j in listcan: - paths = list(networkx.all_simple_paths(dg, i, j, args.max_path_node)) - for k in paths: - pathseq = '' - lastcontig = i - lastcontigdir = True - getit = True - for l in k[1:-1]: - if '-' in l: - contig = l[1:] - contigdir = False - else: - contig = l - contigdir = True - if contig in candidates: - getit = False - break - if lastcontigdir: - for m in contigDict[lastcontig].to: - if m[0] == contig and m[1] == contigdir: - overlap = m[2] - break - else: - for m in contigDict[lastcontig].fr: - if m[0] == contig and m[1] == contigdir: - overlap = m[2] - break - if type(overlap) is int: - if contigdir: - pathseq += contigDict[contig].forseq[overlap:] - else: - pathseq += contigDict[contig].revseq[overlap:] - else: - pathseq += overlap - if contigdir: - pathseq += contigDict[contig].forseq - else: - pathseq += contigDict[contig].revseq - lastcontig = contig - lastcontigdir = contigdir - if k[-1][0] == '-': - contig = k[-1][1:] - contigdir = False - else: - contig = k[-1] - contigdir = True - if lastcontigdir: - for m in contigDict[lastcontig].to: - if m[0] == contig and m[1] == contigdir: - overlap = m[2] - break - else: - for m in contigDict[lastcontig].fr: - if m[0] == contig and m[1] == contigdir: - overlap = m[2] - break - if getit: - if type(overlap) is int: - pathseq = pathseq[:-overlap] - else: - pathseq += overlap - if len(pathseq) <= args.max_path_length: - outpaths.append((k, pathseq)) - paths = list(networkx.all_simple_paths(dg, i, '-' + j, args.max_path_node)) - for k in paths: - pathseq = '' - lastcontig = i - lastcontigdir = True - getit = True - for l in k[1:-1]: - if '-' in l: - contig = l[1:] - contigdir = False - else: - contig = l - contigdir = True - if contig in candidates: - getit = False - break - if lastcontigdir: - for m in contigDict[lastcontig].to: - if m[0] == contig and m[1] == contigdir: - overlap = m[2] - break - else: - for m in contigDict[lastcontig].fr: - if m[0] == contig and m[1] == contigdir: - overlap = m[2] - break - if type(overlap) is int: - if contigdir: - pathseq += contigDict[contig].forseq[overlap:] - else: - pathseq += contigDict[contig].revseq[overlap:] - else: - pathseq += overlap - if contigdir: - pathseq += contigDict[contig].forseq - else: - pathseq += contigDict[contig].revseq - lastcontig = contig - lastcontigdir = contigdir - if k[-1][0] == '-': - contig = k[-1][1:] - contigdir = False - else: - contig = k[-1] - contigdir = True - if lastcontigdir: - for m in contigDict[lastcontig].to: - if m[0] == contig and m[1] == contigdir: - overlap = m[2] - break - else: - for m in contigDict[lastcontig].fr: - if m[0] == contig and m[1] == contigdir: - overlap = m[2] - break - if getit: - if type(overlap) is int: - pathseq = pathseq[:-overlap] - else: - pathseq += overlap - if len(pathseq) <= args.max_path_length: - outpaths.append((k, pathseq)) - paths = list(networkx.all_simple_paths(dg, '-' + i, j, args.max_path_node)) - for k in paths: - pathseq = '' - lastcontig = i - lastcontigdir = False - getit = True - for l in k[1:-1]: - if '-' in l: - contig = l[1:] - contigdir = False - else: - contig = l - contigdir = True - if contig in candidates: - getit = False - break - if lastcontigdir: - for m in contigDict[lastcontig].to: - if m[0] == contig and m[1] == contigdir: - overlap = m[2] - break - else: - for m in contigDict[lastcontig].fr: - if m[0] == contig and m[1] == contigdir: - overlap = m[2] - break - if type(overlap) is int: - if contigdir: - pathseq += contigDict[contig].forseq[overlap:] - else: - pathseq += contigDict[contig].revseq[overlap:] - else: - pathseq += overlap - if contigdir: - pathseq += contigDict[contig].forseq - else: - pathseq += contigDict[contig].revseq - lastcontig = contig - lastcontigdir = contigdir - if k[-1][0] == '-': - contig = k[-1][1:] - contigdir = False - else: - contig = k[-1] - contigdir = True - if lastcontigdir: - for m in contigDict[lastcontig].to: - if m[0] == contig and m[1] == contigdir: - overlap = m[2] - break - else: - for m in contigDict[lastcontig].fr: - if m[0] == contig and m[1] == contigdir: - overlap = m[2] - break - if getit: - if type(overlap) is int: - pathseq = pathseq[:-overlap] - else: - pathseq += overlap - if len(pathseq) <= args.max_path_length: - outpaths.append((k, pathseq)) - paths = list(networkx.all_simple_paths(dg, '-' + i, '-' + j, args.max_path_node)) - for k in paths: - pathseq = '' - lastcontig = i - lastcontigdir = False - getit = True - for l in k[1:-1]: - if '-' in l: - contig = l[1:] - contigdir = False - else: - contig = l - contigdir = True - if contig in candidates: - getit = False - break - if lastcontigdir: - for m in contigDict[lastcontig].to: - if m[0] == contig and m[1] == contigdir: - overlap = m[2] - break - else: - for m in contigDict[lastcontig].fr: - if m[0] == contig and m[1] == contigdir: - overlap = m[2] - break - if type(overlap) is int: - if contigdir: - pathseq += contigDict[contig].forseq[overlap:] - else: - pathseq += contigDict[contig].revseq[overlap:] - else: - pathseq += overlap - if contigdir: - pathseq += contigDict[contig].forseq - else: - pathseq += contigDict[contig].revseq - lastcontig = contig - lastcontigdir = contigdir - if k[-1][0] == '-': - contig = k[-1][1:] - contigdir = False - else: - contig = k[-1] - contigdir = True - if lastcontigdir: - for m in contigDict[lastcontig].to: - if m[0] == contig and m[1] == contigdir: - overlap = m[2] - break - else: - for m in contigDict[lastcontig].fr: - if m[0] == contig and m[1] == contigdir: - overlap = m[2] - break - if getit: - if type(overlap) is int: - pathseq = pathseq[:-overlap] - else: - pathseq += overlap - if len(pathseq) <= args.max_path_length: - outpaths.append((k, pathseq)) - listcan.pop(0) - return outpaths - - - -# check paths with paired end mapping -def check_paths(paths): - pass - - -# given a list of paths find the shortest -def getShortest(paths): - outpaths = {} - for i in paths: - sn = i[0][0] - en = i[0][-1] - if sn in outpaths: - if en in outpaths[sn]: - if len(i[1]) < len(outpaths[sn][en][1]): - outpaths[sn][en] = i - else: - outpaths[sn][en] = i - else: - outpaths[sn] = {en:i} - return outpaths - - - -def predict_blast3(filename): - pass - -# improve predictions -def improve_predict(candContigs): - pass - -# predict small contigs -def predict_small(candContigs): - pass - -# remove similar paths -def remove_dup(paths): - pass - - -# find all simple circuits using a graph of candidates -def find_circuits(canddict): - nx = networkx.DiGraph() - for i in canddict: - for j in canddict[i]: - nx.add_edge(i, j) - if i[0] == '-': - newi = i[1:] - else: - newi = '-' + i - if j[0] == '-': - newj = j[1:] - else: - newj = '-' + j - nx.add_edge(newj, newi) - return networkx.simple_cycles(nx) - - - -def predict_contigs(canddict, plascan): - sys.stdout.write('Trimming contigs not in circuit..\n') - G = networkx.DiGraph() - for i in canddict: - for j in canddict[i]: - G.add_edge(i, j) - if i[0] == '-': - newi = i[1:] - else: - newi = '-' + i - if j[0] == '-': - newj = j[1:] - else: - newj = '-' + j - G.add_edge(newj, newi) - sys.stdout.write(str(len(G.nodes())) + ' initial nodes in ' + str(networkx.number_connected_components(G.to_undirected())) + ' connected components.\n') - notgotemall = True - while notgotemall: - notgotemall = False - for i in plascan: - if i in G: - if len(G[i]) == 0 or len(G['-' + i]) == 0: - notgotemall = True - G.remove_node(i) - G.remove_node('-' + i) - sys.stdout.write(str(len(G.nodes())) + ' remaining nodes.\n') - predset = set() - for i in G.edges(): - if i[0] in canddict and i[1] in canddict[i[0]]: - for j in canddict[i[0]][i[1]][0]: - if j[0] == '-': - predset.add(j[1:]) - else: - predset.add(j) - return predset - -def theil_sen(x,y): - n = len(x) - ord = np.argsort(x) - xs = x[ord] - ys = y[ord] - vec1 = np.zeros( (n,n) ) - for ii in range(n): - for jj in range(n): - vec1[ii,jj] = ys[ii]-ys[jj] - vec2 = np.zeros( (n,n) ) - for ii in range(n): - for jj in range(n): - vec2[ii,jj] = xs[ii]-xs[jj] - v1 = vec1[vec2>0] - v2 = vec2[vec2>0] - slope = np.median( v1/v2 ) - coef = np.zeros( (2,1) ) - b_0 = np.median(y)-slope*np.median(x) - b_1 = slope - res = y-b_1*x-b_0 # residuals - return (b_0,b_1) - - -def predict_cov(args): - plascan = set() - count = 0 - for i in contigDict: - if contigDict[i].length >= args.min_can_length: - count += 1 - x = np.zeros(count) - y = np.zeros(count) - contignamelist = [] - index = 0 - for i in contigDict: - if contigDict[i].length >= args.min_can_length: - x[index] = contigDict[i].gccontent - y[index] = contigDict[i].coverage - contignamelist.append(i) - index += 1 - thec, ther = theil_sen(x, y) - lessthan = [] - for i in range(len(x)): - if x[i] * ther + thec - y[i] >= 0: - lessthan.append(x[i] * ther + thec - y[i]) - lessthansd = (sum(lessthan) * 1.0 / len(lessthan)) ** 0.5 - templessthan = [] - for i in lessthan: - if i <= 4 * lessthansd: - templessthan.append(i) - lessthan = templessthan - lessthan.sort() - thecutoff = lessthan[int(0.95 * len(lessthan))] - out = open(args.work_dir + '/coverage.csv', 'w') - for i in contigDict: - out.write(i + '\t' + str(contigDict[i].coverage) + '\t' + str(contigDict[i].gccontent) + '\t' + str(contigDict[i].length) - + '\t' + str(contigDict[i].gccontent * ther + thec + thecutoff) + '\t' + str(contigDict[i].gccontent * ther + thec) + '\n') - out.write('\n\n\n') - for i in contigDict: - if contigDict[i].length > args.min_can_length: - out.write(i + '\t' + str(contigDict[i].coverage) + '\t' + str(contigDict[i].gccontent) + '\t' + str(contigDict[i].length) - + '\t' + str(contigDict[i].gccontent * ther + thec + thecutoff) + '\t' + str(contigDict[i].gccontent * ther + thec) + '\n') - for i in contigDict: - contigDict[i].predScore = min([2, (contigDict[i].coverage - (contigDict[i].gccontent * ther + thec)) / thecutoff]) - if contigDict[i].length >= args.min_can_length and contigDict[i].coverage >= contigDict[i].gccontent * ther + thec + thecutoff: - if args.filter_high_cov and contigDict[i].coverage <= contigDict[i].gccontent * ther * 2 + thec * 2 - thecutoff: - plascan.add(i) - elif not args.filter_high_cov: - plascan.add(i) - out.close() - return plascan - - - - -# scaffold -def scaffold(candContigs): - pass - -def best_guess(candContigs): - pass - - -def get_pred_qual(args, candidates, initit, onlyplas=False): - first = True - plasconts = set() - chromconts = set() - totalplaslengths = 0 - sharedbp, TPbp, FPbp, TNbp, FNbp, unmappedbp = 0, 0, 0, 0, 0, 0 - for i in args.debug: - tempref = open(args.work_dir + '/tempseq.fa', 'w') - inref = open(i) - for line in inref: - if line.startswith('>'): - tempref.write(line) - seq = '' - else: - seq += line.rstrip() - tempref.write(seq + seq + '\n') - tempref.close() - thelen = len(seq) - gotthem = set() - gotthem2 = set() - subprocess.Popen('makeblastdb -dbtype nucl -out ' + args.work_dir + '/tempdb -in ' + args.work_dir + '/tempseq.fa', shell=True, stdout=subprocess.PIPE).wait() - subprocess.Popen('blastn -db ' + args.work_dir + '/tempdb -outfmt 6 -num_threads 8 -query ' + args.work_dir + '/contigs.fa -out ' + args.work_dir + '/contigs_tempdb.out', shell=True).wait() - blast = open(args.work_dir + '/contigs_tempdb.out') - for line in blast: - query, subject, ident, length, mm, indel, qstart, qstop, rstart, rstop, eval, bitscore = line.split() - qstart, qstop, rstart, rstop, length, mm = map(int, [qstart, qstop, rstart, rstop, length, mm]) - ident = float(ident) - if length >= 0.99 * contigDict[query].length and ident >= 90.0: - if first and not onlyplas: - chromconts.add(query) - else: - plasconts.add(query) - if min([rstart, rstop]) <= thelen: - if query in candidates: - for q in range(min([rstart, rstop]), max([rstart, rstop]) + 1): - gotthem.add(q) - for q in range(min([rstart, rstop]), max([rstart, rstop]) + 1): - gotthem2.add(q) - if not first or onlyplas: - TPbp += len(gotthem) - FNbp += len(gotthem2) - totalplaslengths += thelen - first = False - shared, TP, FP, TN, FN, unmapped = 0, 0, 0, 0, 0, 0 - for i in contigDict: - if i in chromconts and i in plasconts: - shared += 1 - sharedbp += contigDict[i].length - if i in candidates: - TP += 1 - else: - FN += 1 - elif i in chromconts: - if i in candidates: - FP += 1 - FPbp += contigDict[i].length - else: - TN += 1 - TNbp += contigDict[i].length - elif i in plasconts: - if i in candidates: - TP += 1 - else: - FN += 1 - else: - unmapped += 1 - unmappedbp += contigDict[i].length - if onlyplas: - if i in candidates: - FP += 1 - FPbp += contigDict[i].length - else: - TN += 1 - TNbp += contigDict[i].length - if initit: - out = open(args.work_dir + '/degbug.txt', 'w') - else: - out = open(args.work_dir + '/degbug.txt', 'a') - #out.write('candidates\n' + '\t'.join(candidates) + '\n') - if initit: - out.write('predictive power initial set\ntp\tfp\ttn\tfn\tsensitivity\tprecision\tunmapped\tshared\n') - else: - out.write('predictive power final set\ntp\tfp\ttn\tfn\tsensitivity\tprecision\tunmapped\tshared\n') - try: - out.write('\t'.join(map(str, [TP, FP, TN, FN, TP * 1.0 / (TP+FN), TP * 1.0 / (TP+FP), unmapped, shared])) + '\n') - except: - out.write('\t'.join(map(str, [TP, FP, TN, FN, 0, 0, unmapped, shared])) + '\n') - if initit: - out.write('predictive power initial set (bp)\ntp\tfp\ttn\tfn\tsensitivity\tprecision\tunmapped\tshared\tplasass\n') - else: - out.write('predictive power final set (bp)\ntp\tfp\ttn\tfn\tsensitivity\tprecision\tunmapped\tshared\tplasass\n') - try: - out.write('\t'.join(map(str, [TPbp, FPbp, TNbp, FNbp - TPbp, TPbp * 1.0 / FNbp, TPbp * 1.0 / (TPbp+FPbp), unmappedbp, sharedbp, FNbp * 1.0 / totalplaslengths])) + '\n') - except: - out.write('\t'.join(map(str, [TPbp, FPbp, TNbp, FNbp - TPbp, 0, 0, unmappedbp, sharedbp, 0])) + '\n') - out.close() - -# write list of candidates to FASTA file -def write_cand(candidates, outfile): - out = open(args.work_dir + '/' + outfile, 'w') - for i in candidates: - out.write(i + '\n') - out.close() - - -def main(args): - global contigDict - contigDict = {} - if args.basic is None: - load_cag(args) - else: - load_fasta(args) - if os.path.exists(args.work_dir): # create a working directory if it doesn't exist - if not os.path.isdir(args.work_dir): - sys.stderr.write('Working directory is a file not a folder.\n') - sys.exit() - else: - os.makedirs(args.work_dir) - write_fasta_cag(args) # create a FASTA file of the graph file in the working directory - if args.predict_cov: # if flag set predict initial candidates using coverage - sys.stdout.write('Finding candidates.\n') - candidates = predict_cov(args) - sys.stdout.write(str(len(candidates)) + ' candidates found.\n') - elif not args.blast1 is None: - sys.stdout.write('Finding best reference..\n') - bestref = predict_blast1(args) - sys.stdout.write('Using reference ' + bestref + '\n') - args.blast2 = args.blast1 + '/' + bestref - sys.stdout.write('Finding candidates.\n') - candidates = predict_blast2(args) - sys.stdout.write(str(len(candidates)) + ' candidates found.\n') - elif not args.blast_coverage is None: - sys.stdout.write('Finding best reference..\n') - bestref = predict_blast1(args) - sys.stdout.write('Using reference ' + bestref + '\n') - args.blast2 = args.blast1 + '/' + bestref - sys.stdout.write('Finding candidates BLAST.\n') - candidates = predict_blast2(args) - sys.stdout.write(str(len(candidates)) + ' candidates found.\n') - sys.stdout.write('Finding candidates.\n') - candidates2 = predict_cov(args) - sys.stdout.write(str(len(candidates2)) + ' candidates found.\n') - elif not args.blast2 is None: - sys.stdout.write('Finding candidates.\n') - candidates = predict_blast2(args) - sys.stdout.write(str(len(candidates)) + ' candidates found.\n') - if not args.debug is None: - get_pred_qual(args, candidates, True, args.only_plas) - if args.blast_coverage is None: - write_cand(candidates, 'candidates.txt') - else: - write_cand(candidates, 'BLAST_candidates.txt') - write_cand(candidates2, 'coverage_candidates.txt') - if args.basic is None: - sys.stdout.write('Basic mode finished.') - return - sys.stdout.write('Creating graph\n') - dg = contigDict2nx() - sys.stdout.write('Graph created.\nFinding paths between candidates\n') - if args.blast_coverage is None: - paths = find_paths(dg, candidates, args) - sys.stdout.write(str(len(paths)) + ' paths found.\n') - if args.check_paths: # check paths with paired-end reads TODO - pass - sys.stdout.write('Finding shortest paths.\n') - pathDict = getShortest(paths) - count = 0 - for i in pathDict: - count += len(pathDict[i]) - sys.stdout.write(str(count) + ' paths remaining.\nPredicting final set of contigs..\n') - newcandidates = predict_contigs(pathDict, candidates) - sys.stdout.write(str(len(newcandidates)) + ' predicted plasmid contigs.\n') - newcandidates = getcolor(newcandidates, candidates, set(), set()) - else: - paths = find_paths(dg, candidates, args) - sys.stdout.write(str(len(paths)) + 'and ' + str(len(paths2)) + ' paths found.\n') - if args.check_paths: # check paths with paired-end reads TODO - pass - sys.stdout.write('Finding shortest paths BLAST.\n') - pathDict = getShortest(paths) - count = 0 - for i in pathDict: - count += len(pathDict[i]) - sys.stdout.write(str(count) + ' paths remaining.\nPredicting final set of contigs..\n') - newcandidates = predict_contigs(pathDict, candidates) - paths = find_paths(dg, candidates2, args) - if args.check_paths: # check paths with paired-end reads TODO - pass - sys.stdout.write('Finding shortest paths Coverage.\n') - pathDict = getShortest(paths) - count = 0 - for i in pathDict: - count += len(pathDict[i]) - sys.stdout.write(str(count) + ' paths remaining.\nPredicting final set of contigs..\n') - newcandidates2 = predict_contigs(pathDict, candidates) - newcandidates = getcolor(newcandidates, candidates, newcandidates2, candidates2) - if not args.debug is None: - get_pred_qual(args, newcandidates, False, args.only_plas) - sys.stdout.write('Writing predicted contigs to file, thanks for using COIF.\n') - write_cand_color(newcandidates, 'final_candidates.txt') - - - - - - -parser = argparse.ArgumentParser(prog='coif.py', formatter_class=argparse.RawDescriptionHelpFormatter, description=''' -coif.py: A script for identifying plasmid contigs. - -COIF is run on a Contig Adjacency Graph (CAG) generated by Contiguity, for instructions on how to generate -a CAG see Contiguity manual (in this repo) - -Find plasmid contigs by removing chromosomal contigs: -coif.py -c assembly.cag -d working_dir -b2 reference_chromosome.fa - -Identical to above method but chooses best chromosome to use from a folder of chromosomes -coif.py -c assembly.cag -d working_dir -b1 folder_of_reference_genomes - -Find plasmid using coverage of contigs: -coif.py -c assembly.cag -d working_dir -pc - -Predict plasmids using a combination of both coverage and directory of references -coif.py -c assembly.cag -d working_dir -bc folder_of_reference_genomes - -OUTPUT: -COIF outputs a list of initially predicted contigs and a list of predictions in the final set -predictions from the final set will generally be more sensitive and accurate - -For the combination mode all contigs will be listed with a colour based on whether they were -predicted in the intial or final set of both prediction methods - - Coverage -BLAST Not predicted in initial set in final set in both -not predicted #FFFFFF -in initial set -in final set -in both - - -''', epilog="Thanks for using Contiguity") -parser.add_argument('-c', '--cag_file', action='store', help='CAG file of assembled contigs or scaffolds and graph') -parser.add_argument('-b', '--basic', action='store', default=None, help='Only do initial prediction, do not improve predictions with graph information. Uses a FASTA file as input instead of a CAG.') -parser.add_argument('-d', '--work_dir', action='store', help='Working directory') -parser.add_argument('-b1', '--blast1', action='store', default=None, help='Find best reference from folder of references and use it to predict initial set of plasmid contigs.') -parser.add_argument('-b2', '--blast2', action='store', default=None, help='Use reference to remove chromsomal contigs.') -parser.add_argument('-pc', '--predict_cov', action='store_true', default=False, help='Find best reference from folder of references and use it to predict initial set of plasmid contigs.') -parser.add_argument('-bc', '--blast_coverage', action='store', default=None, help='Find best reference from folder of references and use it AND coverage to predict initial set of plasmid contigs.') -parser.add_argument('-db', '--debug', action='store', default=None, nargs='+', help='Give references to report performance of COIF [chromosome plas1 plas2 etc.].') -parser.add_argument('-i', '--min_ident', action='store', type=float, default=80.0, help='Min idenity of hits to draw') -parser.add_argument('-l', '--min_length', action='store', type=int, default=0, help='Min length of hits to draw') -parser.add_argument('-f', '--min_len_fract', action='store', type=float, default=0.1, help='Min length of hits to draw') -parser.add_argument('-mp', '--max_path_length', action='store', type=int, default=15000, help='Max length (bp) of paths') -parser.add_argument('-mn', '--max_path_node', action='store', type=int, default=10, help='Max nodes to search paths') -parser.add_argument('-cp', '--check_paths', action='store_true', default=False, help='Check paths with paired end reads') -parser.add_argument('-op', '--only_plas', action='store_true', default=False, help='only use plasmids for debug mode') -parser.add_argument('-mc', '--min_can_length', action='store', type=int, default=500, help='minimum length of contig for initial predictions') -parser.add_argument('-fh', '--filter_high_cov', action='store_true', default=True, help='Check paths with paired end reads') -# parser.add_argument('-rf', '--read_file', action='store', help='read file') -# parser.add_argument('-o', '--output_folder', action='store', help='output folder') -# parser.add_argument('-k', '--kmer_size', action='store', type=int, default=31, help='k-mer size for finding adjacent contigs [31]') -# parser.add_argument('-max_d', '--max_distance', action='store', type=int, default=300, help='maximum distance apart in the de bruijn graph for contigs to count as adjacent [300]') -# parser.add_argument('-kmer_a', '--kmer_average', action='store', type=int, default=-1, help='All k-mers above half this value will be traversed [auto]') -# parser.add_argument('-kmer_c', '--kmer_cutoff', action='store', type=int, default=-1, help='cutoff for k-mer values [auto]') -# parser.add_argument('-ov', '--overlap', action='store', type=int, default=None, help='minimum overlap to create edge [kmer_size-1]') -# parser.add_argument('-rl', '--min_read_length', action='store', type=int, default=75, help='Minimum read length [75]') -# parser.add_argument('-max_mm', '--max_mismatch', action='store', type=int, default=2, help='maximum number of mismatches to count overlap [2]') -# parser.add_argument('-lo', '--long_overlap_ident', action='store', type=int, default=85, help='minimum percent identity to create an edge where there is a long overlap [85]') -# parser.add_argument('-mp', '--minimum_pairs_edge', action='store', type=int, default=2, help='Minimum pairs to create edge [2]') -# parser.add_argument('-is', '--max_insert_size', action='store', type=int, default=600, help='Upper bound on insert size [600]') -# parser.add_argument('-cl', '--command_line', action='store_true', default=False, help='Run contiguity in command line mode') -# parser.add_argument('-no', '--no_overlap_edges', action='store_true', default=False, help='Don\'t get overlap edges') -# parser.add_argument('-nd', '--no_db_edges', action='store_true', default=False, help='Don\'t get De Bruijn edges') -# parser.add_argument('-np', '--no_paired_edges', action='store_true', default=False, help='Don\'t get paired-end edges') -# parser.add_argument('-km', '--khmer', action='store_false', default=True, help='Don\'t use khmer for De Bruijn graph contruction (not recommended)') -# parser.add_argument('-nt', '--num_threads', action='store', type=int, default=1, help='Number of threads to use for hash table building with khmer and for mapping reads with bowtie') -# parser.add_argument('-ht_s', '--ht_size', action='store', default='2e9', help='Hash table size, for more information check http://khmer.readthedocs.org/en/v1.1/choosing-table-sizes.html') -# parser.add_argument('-ht_n', '--ht_number', action='store', type=int, default=4, help='Hash table number, for more information check http://khmer.readthedocs.org/en/v1.1/choosing-table-sizes.html') - - - -args = parser.parse_args() - -main(args) diff --git a/do_release.sh b/do_release.sh deleted file mode 100755 index 0904b2b..0000000 --- a/do_release.sh +++ /dev/null @@ -1,85 +0,0 @@ -# Release script -# Copyright (C) 2013-2015 Mitchell Jon Stanton-Cook -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -# -# m.stantoncook@gmail.com -# School of Chemistry & Molecular Biosciences -# The University of Queensland -# Brisbane, QLD 4072. -# Australia - - -#VERSION=1.0.3 - -# Perform an install-uninstall cycle -pip uninstall Contiguity -python setup.py install -pip uninstall Contiguity -python setup.py clean - - -# Do all the versioning stuff here.. -bumpversion patch - - -# Clean, test, build the source distribution & pip install it -# Need to get exit statuses here... -python setup.py clean -#python setup.py test -#STATUS=`echo $?` -#if [ $STATUS -eq 0 ]; then -# echo "" -#else -# echo "Tests failed. Will not release" -# exit -#fi - -python setup.py sdist bdist_wheel -pip install dist/Contiguity-$VERSION.tar.gz -STATUS=`echo $?` -if [ $STATUS -eq 0 ]; then - echo "" -else - echo "Package is not pip installable. Will not release" - exit -fi - - -# Docs -# Need to get exit statuses here... -cd docs -make clean -sphinx-apidoc -o API ../Contiguity -mv API/* . -rmdir API -make html -cd .. - -git push -# tag & push the tag to github -GIT=`git status` -CLEAN='# On branch master nothing to commit, working directory clean' -if [ "$s1" == "$s2" ]; then - git tag v$VERSION - git push --tags -else - echo "Git not clean. Will not release" - exit -fi - - -# Upload to PyPI & clean -twine upload -u mscook -p $PYPIPASS dist/* && python setup.py clean diff --git a/docs/Contiguity.rst b/docs/Contiguity.rst deleted file mode 100644 index b9c860c..0000000 --- a/docs/Contiguity.rst +++ /dev/null @@ -1,29 +0,0 @@ -Contiguity package -================== - -Subpackages ------------ - -.. toctree:: - - Contiguity.util - -Submodules ----------- - -Contiguity.Contiguity module ----------------------------- - -.. automodule:: Contiguity.Contiguity - :members: - :undoc-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: Contiguity - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/Contiguity.util.rst b/docs/Contiguity.util.rst deleted file mode 100644 index 9005778..0000000 --- a/docs/Contiguity.util.rst +++ /dev/null @@ -1,38 +0,0 @@ -Contiguity.util package -======================= - -Submodules ----------- - -Contiguity.util.checkCSAG module --------------------------------- - -.. automodule:: Contiguity.util.checkCSAG - :members: - :undoc-members: - :show-inheritance: - -Contiguity.util.checkLG module ------------------------------- - -.. automodule:: Contiguity.util.checkLG - :members: - :undoc-members: - :show-inheritance: - -Contiguity.util.coif module ---------------------------- - -.. automodule:: Contiguity.util.coif - :members: - :undoc-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: Contiguity.util - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/manual/Contiguity_SS.png b/docs/Contiguity_SS.png similarity index 100% rename from docs/manual/Contiguity_SS.png rename to docs/Contiguity_SS.png diff --git a/docs/manual/Contiguity_manual.docx b/docs/Contiguity_manual.docx similarity index 100% rename from docs/manual/Contiguity_manual.docx rename to docs/Contiguity_manual.docx diff --git a/docs/manual/Contiguity_manual.pdf b/docs/Contiguity_manual.pdf similarity index 100% rename from docs/manual/Contiguity_manual.pdf rename to docs/Contiguity_manual.pdf diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index 3008416..0000000 --- a/docs/Makefile +++ /dev/null @@ -1,177 +0,0 @@ -# Makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = sphinx-build -PAPER = -BUILDDIR = _build - -# User-friendly check for sphinx-build -ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) -$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) -endif - -# Internal variables. -PAPEROPT_a4 = -D latex_paper_size=a4 -PAPEROPT_letter = -D latex_paper_size=letter -ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . -# the i18n builder cannot share the environment and doctrees with the others -I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . - -.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext - -help: - @echo "Please use \`make ' where is one of" - @echo " html to make standalone HTML files" - @echo " dirhtml to make HTML files named index.html in directories" - @echo " singlehtml to make a single large HTML file" - @echo " pickle to make pickle files" - @echo " json to make JSON files" - @echo " htmlhelp to make HTML files and a HTML help project" - @echo " qthelp to make HTML files and a qthelp project" - @echo " devhelp to make HTML files and a Devhelp project" - @echo " epub to make an epub" - @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" - @echo " latexpdf to make LaTeX files and run them through pdflatex" - @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" - @echo " text to make text files" - @echo " man to make manual pages" - @echo " texinfo to make Texinfo files" - @echo " info to make Texinfo files and run them through makeinfo" - @echo " gettext to make PO message catalogs" - @echo " changes to make an overview of all changed/added/deprecated items" - @echo " xml to make Docutils-native XML files" - @echo " pseudoxml to make pseudoxml-XML files for display purposes" - @echo " linkcheck to check all external links for integrity" - @echo " doctest to run all doctests embedded in the documentation (if enabled)" - -clean: - rm -rf $(BUILDDIR)/* - -html: - $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." - -dirhtml: - $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." - -singlehtml: - $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml - @echo - @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." - -pickle: - $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle - @echo - @echo "Build finished; now you can process the pickle files." - -json: - $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json - @echo - @echo "Build finished; now you can process the JSON files." - -htmlhelp: - $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp - @echo - @echo "Build finished; now you can run HTML Help Workshop with the" \ - ".hhp project file in $(BUILDDIR)/htmlhelp." - -qthelp: - $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp - @echo - @echo "Build finished; now you can run "qcollectiongenerator" with the" \ - ".qhcp project file in $(BUILDDIR)/qthelp, like this:" - @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Contiguity.qhcp" - @echo "To view the help file:" - @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Contiguity.qhc" - -devhelp: - $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp - @echo - @echo "Build finished." - @echo "To view the help file:" - @echo "# mkdir -p $$HOME/.local/share/devhelp/Contiguity" - @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Contiguity" - @echo "# devhelp" - -epub: - $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub - @echo - @echo "Build finished. The epub file is in $(BUILDDIR)/epub." - -latex: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo - @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." - @echo "Run \`make' in that directory to run these through (pdf)latex" \ - "(use \`make latexpdf' here to do that automatically)." - -latexpdf: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through pdflatex..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -latexpdfja: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through platex and dvipdfmx..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -text: - $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text - @echo - @echo "Build finished. The text files are in $(BUILDDIR)/text." - -man: - $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man - @echo - @echo "Build finished. The manual pages are in $(BUILDDIR)/man." - -texinfo: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo - @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." - @echo "Run \`make' in that directory to run these through makeinfo" \ - "(use \`make info' here to do that automatically)." - -info: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo "Running Texinfo files through makeinfo..." - make -C $(BUILDDIR)/texinfo info - @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." - -gettext: - $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale - @echo - @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." - -changes: - $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes - @echo - @echo "The overview file is in $(BUILDDIR)/changes." - -linkcheck: - $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck - @echo - @echo "Link check complete; look for any errors in the above output " \ - "or in $(BUILDDIR)/linkcheck/output.txt." - -doctest: - $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest - @echo "Testing of doctests in the sources finished, look at the " \ - "results in $(BUILDDIR)/doctest/output.txt." - -xml: - $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml - @echo - @echo "Build finished. The XML files are in $(BUILDDIR)/xml." - -pseudoxml: - $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml - @echo - @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100644 index 25e5ed9..0000000 --- a/docs/conf.py +++ /dev/null @@ -1,271 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Contiguity documentation build configuration file, created by -# sphinx-quickstart on Tue Feb 24 14:23:38 2015. -# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - -import sys -import os - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.insert(0, os.path.abspath('.')) - -# -- General configuration ------------------------------------------------ - -# If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.doctest', - 'sphinx.ext.intersphinx', - 'sphinx.ext.todo', - 'sphinx.ext.coverage', - 'sphinx.ext.mathjax', - 'sphinx.ext.ifconfig', - 'sphinx.ext.viewcode', -] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The suffix of source filenames. -source_suffix = '.rst' - -# The encoding of source files. -#source_encoding = 'utf-8-sig' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = u'Contiguity' -copyright = u'2015, Mitchell Sullivan' - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -version = '1.0.3' -# The full version, including alpha/beta/rc tags. -release = '1.0.3' - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -#language = None - -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: -#today = '' -# Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -exclude_patterns = ['_build'] - -# The reST default role (used for this markup: `text`) to use for all -# documents. -#default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -#add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -#show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - -# A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] - -# If true, keep warnings as "system message" paragraphs in the built documents. -#keep_warnings = False - - -# -- Options for HTML output ---------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -html_theme = 'default' - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -#html_theme_options = {} - -# Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] - -# The name for this set of Sphinx documents. If None, it defaults to -# " v documentation". -#html_title = None - -# A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None - -# The name of an image file (relative to this directory) to place at the top -# of the sidebar. -#html_logo = None - -# The name of an image file (within the static path) to use as favicon of the -# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. -#html_favicon = None - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] - -# Add any extra paths that contain custom files (such as robots.txt or -# .htaccess) here, relative to this directory. These files are copied -# directly to the root of the documentation. -#html_extra_path = [] - -# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, -# using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' - -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -#html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -#html_sidebars = {} - -# Additional templates that should be rendered to pages, maps page names to -# template names. -#html_additional_pages = {} - -# If false, no module index is generated. -#html_domain_indices = True - -# If false, no index is generated. -#html_use_index = True - -# If true, the index is split into individual pages for each letter. -#html_split_index = False - -# If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True - -# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True - -# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True - -# If true, an OpenSearch description file will be output, and all pages will -# contain a tag referring to it. The value of this option must be the -# base URL from which the finished HTML is served. -#html_use_opensearch = '' - -# This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None - -# Output file base name for HTML help builder. -htmlhelp_basename = 'Contiguitydoc' - - -# -- Options for LaTeX output --------------------------------------------- - -latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - ('index', 'Contiguity.tex', u'Contiguity Documentation', - u'Mitchell Sullivan', 'manual'), -] - -# The name of an image file (relative to this directory) to place at the top of -# the title page. -#latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -#latex_use_parts = False - -# If true, show page references after internal links. -#latex_show_pagerefs = False - -# If true, show URL addresses after external links. -#latex_show_urls = False - -# Documents to append as an appendix to all manuals. -#latex_appendices = [] - -# If false, no module index is generated. -#latex_domain_indices = True - - -# -- Options for manual page output --------------------------------------- - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [ - ('index', 'contiguity', u'Contiguity Documentation', - [u'Mitchell Sullivan'], 1) -] - -# If true, show URL addresses after external links. -#man_show_urls = False - - -# -- Options for Texinfo output ------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - ('index', 'Contiguity', u'Contiguity Documentation', - u'Mitchell Sullivan', 'Contiguity', 'One line description of project.', - 'Miscellaneous'), -] - -# Documents to append as an appendix to all manuals. -#texinfo_appendices = [] - -# If false, no module index is generated. -#texinfo_domain_indices = True - -# How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' - -# If true, do not generate a @detailmenu in the "Top" node's menu. -#texinfo_no_detailmenu = False - - -# Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = {'http://docs.python.org/': None} diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index f84905a..0000000 --- a/docs/index.rst +++ /dev/null @@ -1,22 +0,0 @@ -.. Contiguity documentation master file, created by - sphinx-quickstart on Tue Feb 24 14:23:38 2015. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -Welcome to Contiguity's documentation! -====================================== - -Contents: - -.. toctree:: - :maxdepth: 2 - - - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` - diff --git a/docs/modules.rst b/docs/modules.rst deleted file mode 100644 index efdac3d..0000000 --- a/docs/modules.rst +++ /dev/null @@ -1,7 +0,0 @@ -Contiguity -========== - -.. toctree:: - :maxdepth: 4 - - Contiguity diff --git a/examples_files/example1.zip b/examples_files/example1.zip new file mode 100644 index 0000000..f1ebac7 Binary files /dev/null and b/examples_files/example1.zip differ diff --git a/requirements-dev.txt b/requirements-dev.txt deleted file mode 100644 index dbcf58a..0000000 --- a/requirements-dev.txt +++ /dev/null @@ -1,4 +0,0 @@ -Sphinx==1.2.3 -bumpversion==0.5.1 -wheel==0.24.0 -twine==1.4.0 diff --git a/setup.py b/setup.py index 388c130..ef24ddb 100644 --- a/setup.py +++ b/setup.py @@ -1,94 +1,19 @@ -#!/usr/bin/env python +# cx_Freeze setup file -import os import sys -import glob - -# Try and import pip. We'll stop if it is not present -try: - import pip -except ImportError: - print "Installation of Contiguity requires pip. Please install it! See -" - print "http://pip.readthedocs.org/en/latest/installing.html" - sys.exit(1) - -from setuptools import setup - -__title__ = 'Contiguity' -__version__ = '1.0.3' -__description__ = "Tool for visualising assemblies" -__author__ = 'Mitchell Sullivan' -__license__ = 'GPLv3' -__author_email__ = "mjsull@gmail.com" -__url__ = 'https://github.com/BeatsonLab-MicrobialGenomics/Contiguity' - - -# Helper functions -if sys.argv[-1] == 'publish': - print "Please use twine or do_release.sh" - sys.exit() - -if sys.argv[-1] == 'clean': - os.system('rm -rf Contiguity.egg-info build dist') - sys.exit() - -if sys.argv[-1] == 'docs': - os.system('cd docs && make html') - sys.exit() - - -packages = [__title__, ] - -requires = [] -with open('requirements.txt') as fin: - lines = fin.readlines() - for line in lines: - requires.append(line.strip()) - -# Build lists to package the docs -html, sources, static = [], [], [] -html_f = glob.glob('docs/_build/html/*') -accessory = glob.glob('docs/_build/html/*/*') -for f in html_f: - if os.path.isfile(f): - html.append(f) -for f in accessory: - if f.find("_static") != -1: - if os.path.isfile(f): - static.append(f) - elif f.find("_sources"): - if os.path.isfile(f): - sources.append(f) - -setup( - name=__title__, - version=__version__, - description=__description__, - long_description=open('README.rst').read(), - author=__author__, - author_email=__author_email__, - url=__url__, - packages=packages, - test_suite="tests", - package_dir={__title__.lower(): __title__}, - scripts=[__title__.lower()+'/'+__title__], - package_data={}, - data_files=[('', ['LICENSE', 'requirements.txt', 'README.rst']), - ('docs', html), ('docs/_static', static), - ('docs/_sources', sources)], - include_package_data=True, - install_requires=requires, - license=__license__, - zip_safe=False, - classifiers=('Development Status :: 4 - Beta', - 'Environment :: X11 Applications', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved', - 'Natural Language :: English', - 'Operating System :: POSIX :: Linux', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 2 :: Only', - 'Topic :: Scientific/Engineering :: Bio-Informatics', - 'Topic :: Scientific/Engineering :: Visualization',), -) +from cx_Freeze import setup, Executable + +# Dependencies are automatically detected, but it might need fine tuning. +build_exe_options = {"packages": ["khmer"]} + +# GUI applications require a different base on Windows (the default is for a +# console application). +base = None +if sys.platform == "win32": + base = "Win32GUI" + +setup( name = "Contiguity", + version = "1.0.4", + description = "Assembly graph construction and visualisation.", + options = {"build_exe": build_exe_options}, + executables = [Executable("Contiguity.py", base=base)])