From 269f260069c283ba989e76d4804689b3b42fddbc Mon Sep 17 00:00:00 2001 From: Space Robot <> Date: Wed, 28 Nov 2018 17:51:15 +0100 Subject: [PATCH] Convert tabs to spaces --- .editorconfig | 8 +- doc/develop.rst | 30 +- doc/guide.rst | 14 +- doc/recipes.rst | 12 +- setup.py | 164 ++-- src/cutadapt/__main__.py | 1324 +++++++++++++++--------------- src/cutadapt/_align.pyx | 1016 +++++++++++------------ src/cutadapt/adapters.py | 1598 ++++++++++++++++++------------------- src/cutadapt/align.py | 30 +- src/cutadapt/filters.py | 532 ++++++------ src/cutadapt/modifiers.py | 372 ++++----- src/cutadapt/pipeline.py | 1264 ++++++++++++++--------------- src/cutadapt/qualtrim.pyx | 138 ++-- src/cutadapt/report.py | 778 +++++++++--------- src/cutadapt/utils.py | 36 +- tests/conftest.py | 2 +- tests/test_adapters.py | 446 +++++------ tests/test_align.py | 158 ++-- tests/test_commandline.py | 460 +++++------ tests/test_filters.py | 46 +- tests/test_modifiers.py | 60 +- tests/test_paired.py | 626 +++++++-------- tests/test_qualtrim.py | 14 +- tests/test_trim.py | 90 +-- tests/utils.py | 58 +- 25 files changed, 4640 insertions(+), 4636 deletions(-) diff --git a/.editorconfig b/.editorconfig index 3b69b4d4..9796f995 100644 --- a/.editorconfig +++ b/.editorconfig @@ -1,2 +1,6 @@ -[*.py] -indent_style = tab +[*.{py,pyx,rst}] +charset=utf-8 +end_of_line=lf +insert_final_newline=true +indent_style=space +indent_size=4 diff --git a/doc/develop.rst b/doc/develop.rst index 6aa68d77..0cfb55db 100644 --- a/doc/develop.rst +++ b/doc/develop.rst @@ -12,20 +12,20 @@ Development installation For development, make sure that you install Cython and tox. We also recommend using a virtualenv. This sequence of commands should work:: - git clone https://github.com/marcelm/cutadapt.git # or clone your own fork - cd cutadapt - python3 -m venv venv - venv/bin/pip3 install Cython pytest nose tox - venv/bin/pip3 install -e . + git clone https://github.com/marcelm/cutadapt.git # or clone your own fork + cd cutadapt + python3 -m venv venv + venv/bin/pip3 install Cython pytest nose tox + venv/bin/pip3 install -e . Then you can run Cutadapt like this (or activate the virtualenv and omit the ``venv/bin`` part):: - venv/bin/cutadapt --help + venv/bin/cutadapt --help The tests can then be run like this:: - venv/bin/pytest + venv/bin/pytest Or with tox (but then you will need to have binaries for all tested Python versions installed):: @@ -39,8 +39,8 @@ Development installation (without virtualenv) Alternatively, if you do not want to use virtualenv, running the following may work from within the cloned repository:: - python3 setup.py build_ext -i - pytest + python3 setup.py build_ext -i + pytest This requires Cython and pytest to be installed. Avoid this method and use a virtualenv instead if you can. @@ -113,13 +113,13 @@ If this is the first time you attempt to upload a distribution to PyPI, create a configuration file named ``.pypirc`` in your home directory with the following contents:: - [distutils] - index-servers = - pypi + [distutils] + index-servers = + pypi - [pypi] - username=my-user-name - password=my-password + [pypi] + username=my-user-name + password=my-password See also `this blog post about getting started with PyPI `_. In particular, diff --git a/doc/guide.rst b/doc/guide.rst index 71a0cd2e..402242fa 100644 --- a/doc/guide.rst +++ b/doc/guide.rst @@ -1443,10 +1443,10 @@ If you have paired-end data, trim also read 2 with the reverse complement of the “TruSeq Universal Adapter”. 
The full command-line looks as follows:: cutadapt \ - -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \ - -A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT \ - -o trimmed.1.fastq.gz -p trimmed.2.fastq.gz \ - reads.1.fastq.gz reads.2.fastq.gz + -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \ + -A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT \ + -o trimmed.1.fastq.gz -p trimmed.2.fastq.gz \ + reads.1.fastq.gz reads.2.fastq.gz See also the :ref:`section about paired-end adapter trimming above `. @@ -1456,9 +1456,9 @@ be aware that this sequence occurs multiple times in the human genome and it could therefore skew your results very slightly at those loci :: cutadapt \ - -a AGATCGGAAGAGC -A AGATCGGAAGAGC \ - -o trimmed.1.fastq.gz -p trimmed.2.fastq.gz \ - reads.1.fastq.gz reads.2.fastq.gz + -a AGATCGGAAGAGC -A AGATCGGAAGAGC \ + -o trimmed.1.fastq.gz -p trimmed.2.fastq.gz \ + reads.1.fastq.gz reads.2.fastq.gz The adapter sequences can be found in the document `Illumina TruSeq Adapters De-Mystified `__. diff --git a/doc/recipes.rst b/doc/recipes.rst index f0c4760a..83990e54 100644 --- a/doc/recipes.rst +++ b/doc/recipes.rst @@ -18,7 +18,7 @@ but only one 3' adapter, then you have two options. First, you can specify the adapters and also ``--times=2`` (or the short version ``-n 2``). For example:: - cutadapt -g ^TTAAGGCC -g ^AAGCTTA -a TACGGACT -n 2 -o output.fastq input.fastq + cutadapt -g ^TTAAGGCC -g ^AAGCTTA -a TACGGACT -n 2 -o output.fastq input.fastq This instructs Cutadapt to run two rounds of adapter finding and removal. That means that, after the first round and only when an adapter was actually found, @@ -30,7 +30,7 @@ The second option is to not use the ``-n`` option, but to run Cutadapt twice, first removing one adapter and then the other. It is easiest if you use a pipe as in this example:: - cutadapt -g ^TTAAGGCC -g ^AAGCTTA input.fastq | cutadapt -a TACGGACT - > output.fastq + cutadapt -g ^TTAAGGCC -g ^AAGCTTA input.fastq | cutadapt -a TACGGACT - > output.fastq Trim poly-A tails @@ -41,16 +41,16 @@ adapter type (``-a``) with an adapter sequence of many repeated ``A`` nucleotides. Starting with version 1.8 of Cutadapt, you can use the following notation to specify a sequence that consists of 100 ``A``:: - cutadapt -a "A{100}" -o output.fastq input.fastq + cutadapt -a "A{100}" -o output.fastq input.fastq This also works when there are sequencing errors in the poly-A tail. So this read :: - TACGTACGTACGTACGAAATAAAAAAAAAAA + TACGTACGTACGTACGAAATAAAAAAAAAAA will be trimmed to:: - TACGTACGTACGTACG + TACGTACGTACGTACG If for some reason you would like to use a shorter sequence of ``A``, you can do so: The matching algorithm always picks the leftmost match that it can find, @@ -59,7 +59,7 @@ used in the adapter sequence. However, sequencing errors may result in shorter matches than desired. 
For example, using ``-a "A{10}"``, the read above (where the ``AAAT`` is followed by eleven ``A``) would be trimmed to:: - TACGTACGTACGTACGAAAT + TACGTACGTACGTACGAAAT Depending on your application, perhaps a variant of ``-a A{10}N{90}`` is an alternative, forcing the match to be located as much to the left as possible, diff --git a/setup.py b/setup.py index bd8a2b1d..5b02c60d 100644 --- a/setup.py +++ b/setup.py @@ -13,49 +13,49 @@ MIN_CYTHON_VERSION = '0.28' if sys.version_info[:2] < (3, 4): - sys.stdout.write('You need at least Python 3.4\n') - sys.exit(1) + sys.stdout.write('You need at least Python 3.4\n') + sys.exit(1) def no_cythonize(extensions, **_ignore): - """ - Change file extensions from .pyx to .c or .cpp. - - Copied from Cython documentation - """ - for extension in extensions: - sources = [] - for sfile in extension.sources: - path, ext = os.path.splitext(sfile) - if ext in ('.pyx', '.py'): - if extension.language == 'c++': - ext = '.cpp' - else: - ext = '.c' - sfile = path + ext - sources.append(sfile) - extension.sources[:] = sources + """ + Change file extensions from .pyx to .c or .cpp. + + Copied from Cython documentation + """ + for extension in extensions: + sources = [] + for sfile in extension.sources: + path, ext = os.path.splitext(sfile) + if ext in ('.pyx', '.py'): + if extension.language == 'c++': + ext = '.cpp' + else: + ext = '.c' + sfile = path + ext + sources.append(sfile) + extension.sources[:] = sources def check_cython_version(): - """Exit if Cython was not found or is too old""" - try: - from Cython import __version__ as cyversion - except ImportError: - sys.stdout.write( - "ERROR: Cython is not installed. Install at least Cython version " + - str(MIN_CYTHON_VERSION) + " to continue.\n") - sys.exit(1) - if LooseVersion(cyversion) < LooseVersion(MIN_CYTHON_VERSION): - sys.stdout.write( - "ERROR: Your Cython is at version '" + str(cyversion) + - "', but at least version " + str(MIN_CYTHON_VERSION) + " is required.\n") - sys.exit(1) + """Exit if Cython was not found or is too old""" + try: + from Cython import __version__ as cyversion + except ImportError: + sys.stdout.write( + "ERROR: Cython is not installed. Install at least Cython version " + + str(MIN_CYTHON_VERSION) + " to continue.\n") + sys.exit(1) + if LooseVersion(cyversion) < LooseVersion(MIN_CYTHON_VERSION): + sys.stdout.write( + "ERROR: Your Cython is at version '" + str(cyversion) + + "', but at least version " + str(MIN_CYTHON_VERSION) + " is required.\n") + sys.exit(1) extensions = [ - Extension('cutadapt._align', sources=['src/cutadapt/_align.pyx']), - Extension('cutadapt.qualtrim', sources=['src/cutadapt/qualtrim.pyx']), + Extension('cutadapt._align', sources=['src/cutadapt/_align.pyx']), + Extension('cutadapt.qualtrim', sources=['src/cutadapt/qualtrim.pyx']), ] cmdclass = versioneer.get_cmdclass() @@ -64,28 +64,28 @@ def check_cython_version(): class build_ext(versioneer_build_ext): - def run(self): - # If we encounter a PKG-INFO file, then this is likely a .tar.gz/.zip - # file retrieved from PyPI that already includes the pre-cythonized - # extension modules, and then we do not need to run cythonize(). - if os.path.exists('PKG-INFO'): - no_cythonize(extensions) - else: - # Otherwise, this is a 'developer copy' of the code, and then the - # only sensible thing is to require Cython to be installed. 
- check_cython_version() - from Cython.Build import cythonize - self.extensions = cythonize(self.extensions) - versioneer_build_ext.run(self) + def run(self): + # If we encounter a PKG-INFO file, then this is likely a .tar.gz/.zip + # file retrieved from PyPI that already includes the pre-cythonized + # extension modules, and then we do not need to run cythonize(). + if os.path.exists('PKG-INFO'): + no_cythonize(extensions) + else: + # Otherwise, this is a 'developer copy' of the code, and then the + # only sensible thing is to require Cython to be installed. + check_cython_version() + from Cython.Build import cythonize + self.extensions = cythonize(self.extensions) + versioneer_build_ext.run(self) class sdist(versioneer_sdist): - def run(self): - # Make sure the compiled Cython files in the distribution are up-to-date - from Cython.Build import cythonize - check_cython_version() - cythonize(extensions) - versioneer_sdist.run(self) + def run(self): + # Make sure the compiled Cython files in the distribution are up-to-date + from Cython.Build import cythonize + check_cython_version() + cythonize(extensions) + versioneer_sdist.run(self) cmdclass['build_ext'] = build_ext @@ -94,35 +94,35 @@ def run(self): encoding_arg = {'encoding': 'utf-8'} if sys.version > '3' else dict() with open('README.rst', **encoding_arg) as f: - long_description = f.read() + long_description = f.read() setup( - name='cutadapt', - version=versioneer.get_version(), - author='Marcel Martin', - author_email='marcel.martin@scilifelab.se', - url='https://cutadapt.readthedocs.io/', - description='trim adapters from high-throughput sequencing reads', - long_description=long_description, - license='MIT', - cmdclass=cmdclass, - ext_modules=extensions, - package_dir={'': 'src'}, - packages=find_packages('src'), - entry_points={'console_scripts': ['cutadapt = cutadapt.__main__:main']}, - install_requires=['dnaio>=0.3', 'xopen>=0.3.2'], - extras_require={ - 'dev': ['Cython', 'pytest', 'pytest-timeout', 'sphinx', 'sphinx_issues'], - }, - python_requires='>=3', - classifiers=[ - "Development Status :: 5 - Production/Stable", - "Environment :: Console", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: MIT License", - "Natural Language :: English", - "Programming Language :: Cython", - "Programming Language :: Python :: 3", - "Topic :: Scientific/Engineering :: Bio-Informatics" - ], + name='cutadapt', + version=versioneer.get_version(), + author='Marcel Martin', + author_email='marcel.martin@scilifelab.se', + url='https://cutadapt.readthedocs.io/', + description='trim adapters from high-throughput sequencing reads', + long_description=long_description, + license='MIT', + cmdclass=cmdclass, + ext_modules=extensions, + package_dir={'': 'src'}, + packages=find_packages('src'), + entry_points={'console_scripts': ['cutadapt = cutadapt.__main__:main']}, + install_requires=['dnaio>=0.3', 'xopen>=0.3.2'], + extras_require={ + 'dev': ['Cython', 'pytest', 'pytest-timeout', 'sphinx', 'sphinx_issues'], + }, + python_requires='>=3', + classifiers=[ + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Programming Language :: Cython", + "Programming Language :: Python :: 3", + "Topic :: Scientific/Engineering :: Bio-Informatics" + ], ) diff --git a/src/cutadapt/__main__.py b/src/cutadapt/__main__.py index 07d2933f..35f5ae7b 100644 --- a/src/cutadapt/__main__.py +++ 
b/src/cutadapt/__main__.py @@ -67,8 +67,8 @@ from cutadapt import __version__ from cutadapt.adapters import AdapterParser from cutadapt.modifiers import (LengthTagModifier, SuffixRemover, PrefixSuffixAdder, - ZeroCapper, QualityTrimmer, UnconditionalCutter, NEndTrimmer, AdapterCutter, - NextseqQualityTrimmer, Shortener) + ZeroCapper, QualityTrimmer, UnconditionalCutter, NEndTrimmer, AdapterCutter, + NextseqQualityTrimmer, Shortener) from cutadapt.report import print_report, print_minimal_report, redirect_standard_output from cutadapt.pipeline import SingleEndPipeline, PairedEndPipeline, OutputFiles, ParallelPipelineRunner from cutadapt.utils import available_cpu_count @@ -77,708 +77,708 @@ class CutadaptOptionParser(OptionParser): - def get_usage(self): - return self.usage.lstrip().replace('%version', __version__) + def get_usage(self): + return self.usage.lstrip().replace('%version', __version__) - def error(self, msg): - print('Run "cutadapt --help" to see command-line options.', file=sys.stderr) - print('See https://cutadapt.readthedocs.io/ for full documentation.', file=sys.stderr) - self.exit(2, "\n%s: error: %s\n" % (self.get_prog_name(), msg)) + def error(self, msg): + print('Run "cutadapt --help" to see command-line options.', file=sys.stderr) + print('See https://cutadapt.readthedocs.io/ for full documentation.', file=sys.stderr) + self.exit(2, "\n%s: error: %s\n" % (self.get_prog_name(), msg)) class CommandLineError(Exception): - pass + pass class NiceFormatter(logging.Formatter): - """ - Do not prefix "INFO:" to info-level log messages (but do it for all other - levels). + """ + Do not prefix "INFO:" to info-level log messages (but do it for all other + levels). - Based on http://stackoverflow.com/a/9218261/715090 . - """ - def format(self, record): - if record.levelno != logging.INFO: - record.msg = '{}: {}'.format(record.levelname, record.msg) - return super().format(record) + Based on http://stackoverflow.com/a/9218261/715090 . + """ + def format(self, record): + if record.levelno != logging.INFO: + record.msg = '{}: {}'.format(record.levelname, record.msg) + return super().format(record) def setup_logging(stdout=False, quiet=False, debug=False): - """ - Attach handler to the global logger object - """ - # Due to backwards compatibility, logging output is sent to standard output - # instead of standard error if the -o option is used. - stream_handler = logging.StreamHandler(sys.stdout if stdout else sys.stderr) - stream_handler.setFormatter(NiceFormatter()) - # debug overrides quiet - if debug: - level = logging.DEBUG - elif quiet: - level = logging.ERROR - else: - level = logging.INFO - stream_handler.setLevel(level) - logger.setLevel(level) - logger.addHandler(stream_handler) + """ + Attach handler to the global logger object + """ + # Due to backwards compatibility, logging output is sent to standard output + # instead of standard error if the -o option is used. 
+ stream_handler = logging.StreamHandler(sys.stdout if stdout else sys.stderr) + stream_handler.setFormatter(NiceFormatter()) + # debug overrides quiet + if debug: + level = logging.DEBUG + elif quiet: + level = logging.ERROR + else: + level = logging.INFO + stream_handler.setLevel(level) + logger.setLevel(level) + logger.addHandler(stream_handler) def get_option_parser(): - parser = CutadaptOptionParser(usage=__doc__, version=__version__) - - parser.add_option("--debug", action='store_true', default=False, - help="Print debugging information.") - parser.add_option("-f", "--format", - help="Input file format ('fasta' or 'fastq'). Default: auto-detect.") - parser.add_option('-j', '--cores', type=int, default=1, - help='Number of CPU cores to use. Use 0 to auto-detect. Default: %default') - - # Hidden options - parser.add_option("--gc-content", type=float, default=50, # it's a percentage - help=SUPPRESS_HELP) - parser.add_option("--buffer-size", type=int, default=4000000, - help=SUPPRESS_HELP) # buffer size for the reader process when running in parallel - - group = OptionGroup(parser, "Finding adapters", - description="Parameters -a, -g, -b specify adapters to be removed from " - "each read (or from the first read in a pair if data is paired). " - "If specified multiple times, only the best matching adapter is " - "trimmed (but see the --times option). When the special notation " - "'file:FILE' is used, adapter sequences are read from the given " - "FASTA file.") - group.add_option("-a", "--adapter", action="append", default=[], metavar="ADAPTER", - dest="adapters", - help="Sequence of an adapter ligated to the 3' end (paired data: of the " - "first read). The adapter and subsequent bases are trimmed. If a " - "'$' character is appended ('anchoring'), the adapter is only " - "found if it is a suffix of the read.") - group.add_option("-g", "--front", action="append", default=[], metavar="ADAPTER", - help="Sequence of an adapter ligated to the 5' end (paired data: of the " - "first read). The adapter and any preceding bases are trimmed. " - "Partial matches at the 5' end are allowed. If a '^' character is " - "prepended ('anchoring'), the adapter is only found if it is a " - "prefix of the read.") - group.add_option("-b", "--anywhere", action="append", default=[], metavar="ADAPTER", - help="Sequence of an adapter that may be ligated to the 5' or 3' end " - "(paired data: of the first read). Both types of matches as " - "described under -a und -g are allowed. If the first base of the " - "read is part of the match, the behavior is as with -g, otherwise " - "as with -a. This option is mostly for rescuing failed library " - "preparations - do not use if you know which end your adapter was " - "ligated to!") - group.add_option("-e", "--error-rate", type=float, default=0.1, metavar="RATE", - help="Maximum allowed error rate as value between 0 and 1 (no. of " - "errors divided by length of matching region). Default: %default (=10%)") - group.add_option("--no-indels", action='store_false', dest='indels', default=True, - help="Allow only mismatches in alignments. " - "Default: allow both mismatches and indels") - group.add_option("-n", "--times", type=int, metavar="COUNT", default=1, - help="Remove up to COUNT adapters from each read. Default: %default") - group.add_option("-O", "--overlap", type=int, metavar="MINLENGTH", default=3, - help="Require MINLENGTH overlap between read and adapter for an adapter " - "to be found. 
Default: %default") - group.add_option("--match-read-wildcards", action="store_true", default=False, - help="Interpret IUPAC wildcards in reads. Default: %default") - group.add_option("-N", "--no-match-adapter-wildcards", action="store_false", - default=True, dest='match_adapter_wildcards', - help="Do not interpret IUPAC wildcards in adapters.") - group.add_option("--action", choices=('trim', 'mask', 'none'), default='trim', - help="What to do with found adapters: 'trim', 'mask' or 'none'. " - "mask: replace with 'N' characters; " - "none: leave unchanged (useful with " - "--discard-untrimmed). Default: trim") - group.add_option("--no-trim", dest='action', action='store_const', const='none', - help=SUPPRESS_HELP) # Deprecated, use --action=none - group.add_option("--mask-adapter", dest='action', action='store_const', const='mask', - help=SUPPRESS_HELP) # Deprecated, use --action=mask - parser.add_option_group(group) - - group = OptionGroup(parser, "Additional read modifications") - group.add_option("-u", "--cut", action='append', default=[], type=int, metavar="LENGTH", - help="Remove bases from each read (first read only if paired). " - "If LENGTH is positive, remove bases from the beginning. " - "If LENGTH is negative, remove bases from the end. " - "Can be used twice if LENGTHs have different signs. " - "This is applied *before* adapter trimming.") - group.add_option("--nextseq-trim", type=int, default=None, metavar="3'CUTOFF", - help="NextSeq-specific quality trimming (each read). Trims also dark " - "cycles appearing as high-quality G bases.") - group.add_option("-q", "--quality-cutoff", default=None, metavar="[5'CUTOFF,]3'CUTOFF", - help="Trim low-quality bases from 5' and/or 3' ends of each read before " - "adapter removal. Applied to both reads if data is paired. If one " - "value is given, only the 3' end is trimmed. If two " - "comma-separated cutoffs are given, the 5' end is trimmed with " - "the first cutoff, the 3' end with the second.") - group.add_option("--quality-base", type=int, default=33, metavar='N', - help="Assume that quality values in FASTQ are encoded as ascii(quality " - "+ N). This needs to be set to 64 for some old Illumina " - "FASTQ files. Default: %default") - group.add_option("--length", "-l", type=int, default=None, metavar="LENGTH", - help="Shorten reads to LENGTH. Positive values remove bases at the end " - "while negative ones remove bases at the beginning. This and the following modifications " - "are applied after adapter trimming.") - group.add_option("--trim-n", action='store_true', default=False, - help="Trim N's on ends of reads.") - group.add_option("--length-tag", metavar="TAG", - help="Search for TAG followed by a decimal number in the description " - "field of the read. Replace the decimal number with the correct " - "length of the trimmed read. For example, use --length-tag 'length=' " - "to correct fields like 'length=123'.") - group.add_option("--strip-suffix", action='append', default=[], - help="Remove this suffix from read names if present. Can be given multiple times.") - group.add_option("-x", "--prefix", default='', - help="Add this prefix to read names. Use {name} to insert the name of the matching adapter.") - group.add_option("-y", "--suffix", default='', - help="Add this suffix to read names; can also include {name}") - parser.add_option_group(group) - - group = OptionGroup(parser, "Filtering of processed reads", - description="Filters are applied after above read modifications. 
" - "Paired-end reads are always discarded pairwise (see also " - "--pair-filter).") - group.add_option("-m", "--minimum-length", default=None, metavar="LEN[:LEN2]", - help="Discard reads shorter than LEN. Default: 0") - group.add_option("-M", "--maximum-length", default=None, metavar="LEN[:LEN2]", - help="Discard reads longer than LEN. Default: no limit") - group.add_option("--max-n", type=float, default=None, metavar="COUNT", - help="Discard reads with more than COUNT 'N' bases. If COUNT is a number " - "between 0 and 1, it is interpreted as a fraction of the read length.") - group.add_option("--discard-trimmed", "--discard", action='store_true', default=False, - help="Discard reads that contain an adapter. Also use -O to avoid " - "discarding too many randomly matching reads!") - group.add_option("--discard-untrimmed", "--trimmed-only", action='store_true', default=False, - help="Discard reads that do not contain an adapter.") - group.add_option("--discard-casava", action='store_true', default=False, - help="Discard reads that did not pass CASAVA filtering (header has :Y:).") - group.add_option("--zero-cap", "-z", action='store_true', default=False, - help="Change negative quality values to zero.") - parser.add_option_group(group) - - group = OptionGroup(parser, "Output") - group.add_option("--quiet", default=False, action='store_true', - help="Print only error messages.") - group.add_option("--report", choices=('full', 'minimal'), default=None, - help="Which type of report to print: 'full' or 'minimal'. Default: full") - group.add_option("-o", "--output", metavar="FILE", - help="Write trimmed reads to FILE. FASTQ or FASTA format is chosen " - "depending on input. The summary report is sent to standard output. " - "Use '{name}' in FILE to demultiplex reads into multiple " - "files. Default: write to standard output") - group.add_option("--info-file", metavar="FILE", - help="Write information about each read and its adapter matches into FILE. " - "See the documentation for the file format.") - group.add_option("-r", "--rest-file", metavar="FILE", - help="When the adapter matches in the middle of a read, write the " - "rest (after the adapter) to FILE.") - group.add_option("--wildcard-file", metavar="FILE", - help="When the adapter has N wildcard bases, write adapter bases " - "matching wildcard positions to FILE. (Inaccurate with indels.)") - group.add_option("--too-short-output", metavar="FILE", - help="Write reads that are too short (according to length specified by " - "-m) to FILE. Default: discard reads") - group.add_option("--too-long-output", metavar="FILE", - help="Write reads that are too long (according to length specified by " - "-M) to FILE. Default: discard reads") - group.add_option("--untrimmed-output", default=None, metavar="FILE", - help="Write reads that do not contain any adapter to FILE. 
Default: " - "output to same file as trimmed reads") - parser.add_option_group(group) - - group = OptionGroup(parser, "Paired-end options", description="The " - "-A/-G/-B/-U options work like their -a/-b/-g/-u counterparts, but " - "are applied to the second read in each pair.") - group.add_option("-A", dest='adapters2', action='append', default=[], metavar='ADAPTER', - help="3' adapter to be removed from second read in a pair.") - group.add_option("-G", dest='front2', action='append', default=[], metavar='ADAPTER', - help="5' adapter to be removed from second read in a pair.") - group.add_option("-B", dest='anywhere2', action='append', default=[], metavar='ADAPTER', - help="5'/3 adapter to be removed from second read in a pair.") - group.add_option("-U", dest='cut2', action='append', default=[], type=int, metavar="LENGTH", - help="Remove LENGTH bases from second read in a pair.") - group.add_option("-p", "--paired-output", metavar="FILE", - help="Write second read in a pair to FILE.") - # Setting the default for pair_filter to None allows us to find out whether - # the option was used at all. - group.add_option("--pair-filter", metavar='(any|both|first)', default=None, - choices=("any", "both", "first"), - help="Which of the reads in a paired-end read have to match the " - "filtering criterion in order for the pair to be filtered. " - "Default: any") - group.add_option("--interleaved", action='store_true', default=False, - help="Read and write interleaved paired-end reads.") - group.add_option("--untrimmed-paired-output", metavar="FILE", - help="Write second read in a pair to this FILE when no adapter " - "was found. Use with --untrimmed-output. Default: output " - "to same file as trimmed reads") - group.add_option("--too-short-paired-output", metavar="FILE", default=None, - help="Write second read in a pair to this file if pair is too short. " - "Use also --too-short-output.") - group.add_option("--too-long-paired-output", metavar="FILE", default=None, - help="Write second read in a pair to this file if pair is too long. " - "Use also --too-long-output.") - parser.add_option_group(group) - - for opt in ("--colorspace", "-c", "-d", "--double-encode", "-t", "--trim-primer", - "--strip-f3", "--maq", "--bwa", "--no-zero-cap"): - parser.add_option(opt, dest='colorspace', action='store_true', default=False, - help=SUPPRESS_HELP) - parser.set_defaults(colorspace=False) - - return parser + parser = CutadaptOptionParser(usage=__doc__, version=__version__) + + parser.add_option("--debug", action='store_true', default=False, + help="Print debugging information.") + parser.add_option("-f", "--format", + help="Input file format ('fasta' or 'fastq'). Default: auto-detect.") + parser.add_option('-j', '--cores', type=int, default=1, + help='Number of CPU cores to use. Use 0 to auto-detect. Default: %default') + + # Hidden options + parser.add_option("--gc-content", type=float, default=50, # it's a percentage + help=SUPPRESS_HELP) + parser.add_option("--buffer-size", type=int, default=4000000, + help=SUPPRESS_HELP) # buffer size for the reader process when running in parallel + + group = OptionGroup(parser, "Finding adapters", + description="Parameters -a, -g, -b specify adapters to be removed from " + "each read (or from the first read in a pair if data is paired). " + "If specified multiple times, only the best matching adapter is " + "trimmed (but see the --times option). 
When the special notation " + "'file:FILE' is used, adapter sequences are read from the given " + "FASTA file.") + group.add_option("-a", "--adapter", action="append", default=[], metavar="ADAPTER", + dest="adapters", + help="Sequence of an adapter ligated to the 3' end (paired data: of the " + "first read). The adapter and subsequent bases are trimmed. If a " + "'$' character is appended ('anchoring'), the adapter is only " + "found if it is a suffix of the read.") + group.add_option("-g", "--front", action="append", default=[], metavar="ADAPTER", + help="Sequence of an adapter ligated to the 5' end (paired data: of the " + "first read). The adapter and any preceding bases are trimmed. " + "Partial matches at the 5' end are allowed. If a '^' character is " + "prepended ('anchoring'), the adapter is only found if it is a " + "prefix of the read.") + group.add_option("-b", "--anywhere", action="append", default=[], metavar="ADAPTER", + help="Sequence of an adapter that may be ligated to the 5' or 3' end " + "(paired data: of the first read). Both types of matches as " + "described under -a und -g are allowed. If the first base of the " + "read is part of the match, the behavior is as with -g, otherwise " + "as with -a. This option is mostly for rescuing failed library " + "preparations - do not use if you know which end your adapter was " + "ligated to!") + group.add_option("-e", "--error-rate", type=float, default=0.1, metavar="RATE", + help="Maximum allowed error rate as value between 0 and 1 (no. of " + "errors divided by length of matching region). Default: %default (=10%)") + group.add_option("--no-indels", action='store_false', dest='indels', default=True, + help="Allow only mismatches in alignments. " + "Default: allow both mismatches and indels") + group.add_option("-n", "--times", type=int, metavar="COUNT", default=1, + help="Remove up to COUNT adapters from each read. Default: %default") + group.add_option("-O", "--overlap", type=int, metavar="MINLENGTH", default=3, + help="Require MINLENGTH overlap between read and adapter for an adapter " + "to be found. Default: %default") + group.add_option("--match-read-wildcards", action="store_true", default=False, + help="Interpret IUPAC wildcards in reads. Default: %default") + group.add_option("-N", "--no-match-adapter-wildcards", action="store_false", + default=True, dest='match_adapter_wildcards', + help="Do not interpret IUPAC wildcards in adapters.") + group.add_option("--action", choices=('trim', 'mask', 'none'), default='trim', + help="What to do with found adapters: 'trim', 'mask' or 'none'. " + "mask: replace with 'N' characters; " + "none: leave unchanged (useful with " + "--discard-untrimmed). Default: trim") + group.add_option("--no-trim", dest='action', action='store_const', const='none', + help=SUPPRESS_HELP) # Deprecated, use --action=none + group.add_option("--mask-adapter", dest='action', action='store_const', const='mask', + help=SUPPRESS_HELP) # Deprecated, use --action=mask + parser.add_option_group(group) + + group = OptionGroup(parser, "Additional read modifications") + group.add_option("-u", "--cut", action='append', default=[], type=int, metavar="LENGTH", + help="Remove bases from each read (first read only if paired). " + "If LENGTH is positive, remove bases from the beginning. " + "If LENGTH is negative, remove bases from the end. " + "Can be used twice if LENGTHs have different signs. 
" + "This is applied *before* adapter trimming.") + group.add_option("--nextseq-trim", type=int, default=None, metavar="3'CUTOFF", + help="NextSeq-specific quality trimming (each read). Trims also dark " + "cycles appearing as high-quality G bases.") + group.add_option("-q", "--quality-cutoff", default=None, metavar="[5'CUTOFF,]3'CUTOFF", + help="Trim low-quality bases from 5' and/or 3' ends of each read before " + "adapter removal. Applied to both reads if data is paired. If one " + "value is given, only the 3' end is trimmed. If two " + "comma-separated cutoffs are given, the 5' end is trimmed with " + "the first cutoff, the 3' end with the second.") + group.add_option("--quality-base", type=int, default=33, metavar='N', + help="Assume that quality values in FASTQ are encoded as ascii(quality " + "+ N). This needs to be set to 64 for some old Illumina " + "FASTQ files. Default: %default") + group.add_option("--length", "-l", type=int, default=None, metavar="LENGTH", + help="Shorten reads to LENGTH. Positive values remove bases at the end " + "while negative ones remove bases at the beginning. This and the following modifications " + "are applied after adapter trimming.") + group.add_option("--trim-n", action='store_true', default=False, + help="Trim N's on ends of reads.") + group.add_option("--length-tag", metavar="TAG", + help="Search for TAG followed by a decimal number in the description " + "field of the read. Replace the decimal number with the correct " + "length of the trimmed read. For example, use --length-tag 'length=' " + "to correct fields like 'length=123'.") + group.add_option("--strip-suffix", action='append', default=[], + help="Remove this suffix from read names if present. Can be given multiple times.") + group.add_option("-x", "--prefix", default='', + help="Add this prefix to read names. Use {name} to insert the name of the matching adapter.") + group.add_option("-y", "--suffix", default='', + help="Add this suffix to read names; can also include {name}") + parser.add_option_group(group) + + group = OptionGroup(parser, "Filtering of processed reads", + description="Filters are applied after above read modifications. " + "Paired-end reads are always discarded pairwise (see also " + "--pair-filter).") + group.add_option("-m", "--minimum-length", default=None, metavar="LEN[:LEN2]", + help="Discard reads shorter than LEN. Default: 0") + group.add_option("-M", "--maximum-length", default=None, metavar="LEN[:LEN2]", + help="Discard reads longer than LEN. Default: no limit") + group.add_option("--max-n", type=float, default=None, metavar="COUNT", + help="Discard reads with more than COUNT 'N' bases. If COUNT is a number " + "between 0 and 1, it is interpreted as a fraction of the read length.") + group.add_option("--discard-trimmed", "--discard", action='store_true', default=False, + help="Discard reads that contain an adapter. 
Also use -O to avoid " + "discarding too many randomly matching reads!") + group.add_option("--discard-untrimmed", "--trimmed-only", action='store_true', default=False, + help="Discard reads that do not contain an adapter.") + group.add_option("--discard-casava", action='store_true', default=False, + help="Discard reads that did not pass CASAVA filtering (header has :Y:).") + group.add_option("--zero-cap", "-z", action='store_true', default=False, + help="Change negative quality values to zero.") + parser.add_option_group(group) + + group = OptionGroup(parser, "Output") + group.add_option("--quiet", default=False, action='store_true', + help="Print only error messages.") + group.add_option("--report", choices=('full', 'minimal'), default=None, + help="Which type of report to print: 'full' or 'minimal'. Default: full") + group.add_option("-o", "--output", metavar="FILE", + help="Write trimmed reads to FILE. FASTQ or FASTA format is chosen " + "depending on input. The summary report is sent to standard output. " + "Use '{name}' in FILE to demultiplex reads into multiple " + "files. Default: write to standard output") + group.add_option("--info-file", metavar="FILE", + help="Write information about each read and its adapter matches into FILE. " + "See the documentation for the file format.") + group.add_option("-r", "--rest-file", metavar="FILE", + help="When the adapter matches in the middle of a read, write the " + "rest (after the adapter) to FILE.") + group.add_option("--wildcard-file", metavar="FILE", + help="When the adapter has N wildcard bases, write adapter bases " + "matching wildcard positions to FILE. (Inaccurate with indels.)") + group.add_option("--too-short-output", metavar="FILE", + help="Write reads that are too short (according to length specified by " + "-m) to FILE. Default: discard reads") + group.add_option("--too-long-output", metavar="FILE", + help="Write reads that are too long (according to length specified by " + "-M) to FILE. Default: discard reads") + group.add_option("--untrimmed-output", default=None, metavar="FILE", + help="Write reads that do not contain any adapter to FILE. Default: " + "output to same file as trimmed reads") + parser.add_option_group(group) + + group = OptionGroup(parser, "Paired-end options", description="The " + "-A/-G/-B/-U options work like their -a/-b/-g/-u counterparts, but " + "are applied to the second read in each pair.") + group.add_option("-A", dest='adapters2', action='append', default=[], metavar='ADAPTER', + help="3' adapter to be removed from second read in a pair.") + group.add_option("-G", dest='front2', action='append', default=[], metavar='ADAPTER', + help="5' adapter to be removed from second read in a pair.") + group.add_option("-B", dest='anywhere2', action='append', default=[], metavar='ADAPTER', + help="5'/3 adapter to be removed from second read in a pair.") + group.add_option("-U", dest='cut2', action='append', default=[], type=int, metavar="LENGTH", + help="Remove LENGTH bases from second read in a pair.") + group.add_option("-p", "--paired-output", metavar="FILE", + help="Write second read in a pair to FILE.") + # Setting the default for pair_filter to None allows us to find out whether + # the option was used at all. + group.add_option("--pair-filter", metavar='(any|both|first)', default=None, + choices=("any", "both", "first"), + help="Which of the reads in a paired-end read have to match the " + "filtering criterion in order for the pair to be filtered. 
" + "Default: any") + group.add_option("--interleaved", action='store_true', default=False, + help="Read and write interleaved paired-end reads.") + group.add_option("--untrimmed-paired-output", metavar="FILE", + help="Write second read in a pair to this FILE when no adapter " + "was found. Use with --untrimmed-output. Default: output " + "to same file as trimmed reads") + group.add_option("--too-short-paired-output", metavar="FILE", default=None, + help="Write second read in a pair to this file if pair is too short. " + "Use also --too-short-output.") + group.add_option("--too-long-paired-output", metavar="FILE", default=None, + help="Write second read in a pair to this file if pair is too long. " + "Use also --too-long-output.") + parser.add_option_group(group) + + for opt in ("--colorspace", "-c", "-d", "--double-encode", "-t", "--trim-primer", + "--strip-f3", "--maq", "--bwa", "--no-zero-cap"): + parser.add_option(opt, dest='colorspace', action='store_true', default=False, + help=SUPPRESS_HELP) + parser.set_defaults(colorspace=False) + + return parser def parse_cutoffs(s): - """Parse a string INT[,INT] into a two-element list of integers""" - cutoffs = s.split(',') - if len(cutoffs) == 1: - try: - cutoffs = [0, int(cutoffs[0])] - except ValueError as e: - raise CommandLineError("Quality cutoff value not recognized: {}".format(e)) - elif len(cutoffs) == 2: - try: - cutoffs = [int(cutoffs[0]), int(cutoffs[1])] - except ValueError as e: - raise CommandLineError("Quality cutoff value not recognized: {}".format(e)) - else: - raise CommandLineError("Expected one value or two values separated by comma for " - "the quality cutoff") - return cutoffs + """Parse a string INT[,INT] into a two-element list of integers""" + cutoffs = s.split(',') + if len(cutoffs) == 1: + try: + cutoffs = [0, int(cutoffs[0])] + except ValueError as e: + raise CommandLineError("Quality cutoff value not recognized: {}".format(e)) + elif len(cutoffs) == 2: + try: + cutoffs = [int(cutoffs[0]), int(cutoffs[1])] + except ValueError as e: + raise CommandLineError("Quality cutoff value not recognized: {}".format(e)) + else: + raise CommandLineError("Expected one value or two values separated by comma for " + "the quality cutoff") + return cutoffs def parse_lengths(s): - """Parse [INT][:[INT]] into a pair of integers. If a value is omitted, use None - - >>> parse_lengths('25') - (25,) - >>> parse_lengths('17:25') - (17, 25) - >>> parse_lengths('25:') - (25, None) - >>> parse_lengths(':25') - (None, 25) - """ - fields = s.split(':') - if len(fields) not in (1, 2): - raise CommandLineError("Only at most one colon is allowed") - try: - values = tuple(int(f) if f != '' else None for f in fields) - except ValueError as e: - raise CommandLineError("Value not recognized: {}".format(e)) - if len(values) == 2 and values[0] is None and values[1] is None: - raise CommandLineError("Cannot parse {!r}: At least one length needs to be given".format(s)) - return tuple(values) + """Parse [INT][:[INT]] into a pair of integers. 
If a value is omitted, use None + + >>> parse_lengths('25') + (25,) + >>> parse_lengths('17:25') + (17, 25) + >>> parse_lengths('25:') + (25, None) + >>> parse_lengths(':25') + (None, 25) + """ + fields = s.split(':') + if len(fields) not in (1, 2): + raise CommandLineError("Only at most one colon is allowed") + try: + values = tuple(int(f) if f != '' else None for f in fields) + except ValueError as e: + raise CommandLineError("Value not recognized: {}".format(e)) + if len(values) == 2 and values[0] is None and values[1] is None: + raise CommandLineError("Cannot parse {!r}: At least one length needs to be given".format(s)) + return tuple(values) def open_output_files(options, default_outfile, interleaved): - """ - Return an OutputFiles instance. If demultiplex is True, the untrimmed, untrimmed2, out and out2 - attributes are not opened files, but paths (out and out2 with the '{name}' template). - """ - rest_file = info_file = wildcard = None - if options.rest_file is not None: - rest_file = xopen(options.rest_file, 'w') - if options.info_file is not None: - info_file = xopen(options.info_file, 'w') - if options.wildcard_file is not None: - wildcard = xopen(options.wildcard_file, 'w') - - def open2(path1, path2): - file1 = file2 = None - if path1 is not None: - file1 = xopen(path1, 'wb') - if path2 is not None: - file2 = xopen(path2, 'wb') - return file1, file2 - - too_short = too_short2 = None - if options.minimum_length is not None: - too_short, too_short2 = open2(options.too_short_output, options.too_short_paired_output) - - too_long = too_long2 = None - if options.maximum_length is not None: - too_long, too_long2 = open2(options.too_long_output, options.too_long_paired_output) - - if int(options.discard_trimmed) + int(options.discard_untrimmed) + int( - options.untrimmed_output is not None) > 1: - raise CommandLineError("Only one of the --discard-trimmed, --discard-untrimmed " - "and --untrimmed-output options can be used at the same time.") - - demultiplex = options.output is not None and '{name}' in options.output - if options.paired_output is not None and (demultiplex != ('{name}' in options.paired_output)): - raise CommandLineError('When demultiplexing paired-end data, "{name}" must appear in ' - 'both output file names (-o and -p)') - - if demultiplex: - if options.discard_trimmed: - raise CommandLineError("Do not use --discard-trimmed when demultiplexing.") - - out = options.output - untrimmed = options.output.replace('{name}', 'unknown') - if options.untrimmed_output: - untrimmed = options.untrimmed_output - if options.discard_untrimmed: - untrimmed = None - - if options.paired_output is not None: - out2 = options.paired_output - untrimmed2 = options.paired_output.replace('{name}', 'unknown') - if options.untrimmed_paired_output: - untrimmed2 = options.untrimmed_paired_output - if options.discard_untrimmed: - untrimmed2 = None - - else: - untrimmed2 = out2 = None - else: - untrimmed, untrimmed2 = open2(options.untrimmed_output, options.untrimmed_paired_output) - out, out2 = open2(options.output, options.paired_output) - if out is None: - out = default_outfile - - if demultiplex: - assert out is not None and '{name}' in out and (out2 is None or '{name}' in out2) - return OutputFiles( - rest=rest_file, - info=info_file, - wildcard=wildcard, - too_short=too_short, - too_short2=too_short2, - too_long=too_long, - too_long2=too_long2, - untrimmed=untrimmed, - untrimmed2=untrimmed2, - out=out, - out2=out2, - demultiplex=demultiplex, - interleaved=interleaved, - ) + """ + Return an 
OutputFiles instance. If demultiplex is True, the untrimmed, untrimmed2, out and out2 + attributes are not opened files, but paths (out and out2 with the '{name}' template). + """ + rest_file = info_file = wildcard = None + if options.rest_file is not None: + rest_file = xopen(options.rest_file, 'w') + if options.info_file is not None: + info_file = xopen(options.info_file, 'w') + if options.wildcard_file is not None: + wildcard = xopen(options.wildcard_file, 'w') + + def open2(path1, path2): + file1 = file2 = None + if path1 is not None: + file1 = xopen(path1, 'wb') + if path2 is not None: + file2 = xopen(path2, 'wb') + return file1, file2 + + too_short = too_short2 = None + if options.minimum_length is not None: + too_short, too_short2 = open2(options.too_short_output, options.too_short_paired_output) + + too_long = too_long2 = None + if options.maximum_length is not None: + too_long, too_long2 = open2(options.too_long_output, options.too_long_paired_output) + + if int(options.discard_trimmed) + int(options.discard_untrimmed) + int( + options.untrimmed_output is not None) > 1: + raise CommandLineError("Only one of the --discard-trimmed, --discard-untrimmed " + "and --untrimmed-output options can be used at the same time.") + + demultiplex = options.output is not None and '{name}' in options.output + if options.paired_output is not None and (demultiplex != ('{name}' in options.paired_output)): + raise CommandLineError('When demultiplexing paired-end data, "{name}" must appear in ' + 'both output file names (-o and -p)') + + if demultiplex: + if options.discard_trimmed: + raise CommandLineError("Do not use --discard-trimmed when demultiplexing.") + + out = options.output + untrimmed = options.output.replace('{name}', 'unknown') + if options.untrimmed_output: + untrimmed = options.untrimmed_output + if options.discard_untrimmed: + untrimmed = None + + if options.paired_output is not None: + out2 = options.paired_output + untrimmed2 = options.paired_output.replace('{name}', 'unknown') + if options.untrimmed_paired_output: + untrimmed2 = options.untrimmed_paired_output + if options.discard_untrimmed: + untrimmed2 = None + + else: + untrimmed2 = out2 = None + else: + untrimmed, untrimmed2 = open2(options.untrimmed_output, options.untrimmed_paired_output) + out, out2 = open2(options.output, options.paired_output) + if out is None: + out = default_outfile + + if demultiplex: + assert out is not None and '{name}' in out and (out2 is None or '{name}' in out2) + return OutputFiles( + rest=rest_file, + info=info_file, + wildcard=wildcard, + too_short=too_short, + too_short2=too_short2, + too_long=too_long, + too_long2=too_long2, + untrimmed=untrimmed, + untrimmed2=untrimmed2, + out=out, + out2=out2, + demultiplex=demultiplex, + interleaved=interleaved, + ) def determine_paired_mode(options): - """ - Determine the paired-end mode: single-end, paired-end or legacy paired-end. + """ + Determine the paired-end mode: single-end, paired-end or legacy paired-end. - Return False, 'first' or 'both'. + Return False, 'first' or 'both'. 
- False -- single-end - 'first' -- Backwards-compatible "legacy" mode in which read modifications apply only to read 1 - 'both' -- normal paired-end mode in which read modifications apply to read 1 and 2 + False -- single-end + 'first' -- Backwards-compatible "legacy" mode in which read modifications apply only to read 1 + 'both' -- normal paired-end mode in which read modifications apply to read 1 and 2 - Legacy mode is deactivated as soon as any option is used that exists only in cutadapt 1.8 or - later, such as -A/-G/-B/-U/--interleaved/--nextseq-trim. - """ - paired = False - if options.paired_output: - paired = 'first' + Legacy mode is deactivated as soon as any option is used that exists only in cutadapt 1.8 or + later, such as -A/-G/-B/-U/--interleaved/--nextseq-trim. + """ + paired = False + if options.paired_output: + paired = 'first' - # Switch off legacy mode if certain options given - if paired and options.nextseq_trim: - paired = 'both' - if (options.adapters2 or options.front2 or options.anywhere2 or - options.cut2 or options.interleaved or options.pair_filter or - options.too_short_paired_output or options.too_long_paired_output): - paired = 'both' - return paired + # Switch off legacy mode if certain options given + if paired and options.nextseq_trim: + paired = 'both' + if (options.adapters2 or options.front2 or options.anywhere2 or + options.cut2 or options.interleaved or options.pair_filter or + options.too_short_paired_output or options.too_long_paired_output): + paired = 'both' + return paired def determine_interleaved(options, args): - is_interleaved_input = False - is_interleaved_output = False - if options.interleaved: - is_interleaved_input = len(args) == 1 - is_interleaved_output = not options.paired_output - if not is_interleaved_input and not is_interleaved_output: - raise CommandLineError("When --interleaved is used, you cannot provide both two " - "input files and two output files") - return is_interleaved_input, is_interleaved_output + is_interleaved_input = False + is_interleaved_output = False + if options.interleaved: + is_interleaved_input = len(args) == 1 + is_interleaved_output = not options.paired_output + if not is_interleaved_input and not is_interleaved_output: + raise CommandLineError("When --interleaved is used, you cannot provide both two " + "input files and two output files") + return is_interleaved_input, is_interleaved_output def input_files_from_parsed_args(args, paired, interleaved): - """ - Return tuple (input_filename, input_paired_filename) - """ - if len(args) == 0: - raise CommandLineError("You did not provide any input file names. Please give me something to do!") - elif len(args) > 2: - raise CommandLineError( - "You provided {} input file names, but either one or two are expected. 
".format(len(args)) - + "The file names were:\n - " - + "\n - ".join("{!r}".format(p) for p in args) - + "\nHint: If your path contains spaces, you need to enclose it in quotes") - input_filename = args[0] - if paired and not interleaved: - # Two file names required - if len(args) == 1: - raise CommandLineError("When paired-end trimming is enabled via -A/-G/-B/-U" - " or -p, two input files are required.") - else: - input_paired_filename = args[1] - else: - if len(args) == 2: - raise CommandLineError("When trimming single-end data, only one input file name must " - "be given (got two)") - input_paired_filename = None - - return input_filename, input_paired_filename + """ + Return tuple (input_filename, input_paired_filename) + """ + if len(args) == 0: + raise CommandLineError("You did not provide any input file names. Please give me something to do!") + elif len(args) > 2: + raise CommandLineError( + "You provided {} input file names, but either one or two are expected. ".format(len(args)) + + "The file names were:\n - " + + "\n - ".join("{!r}".format(p) for p in args) + + "\nHint: If your path contains spaces, you need to enclose it in quotes") + input_filename = args[0] + if paired and not interleaved: + # Two file names required + if len(args) == 1: + raise CommandLineError("When paired-end trimming is enabled via -A/-G/-B/-U" + " or -p, two input files are required.") + else: + input_paired_filename = args[1] + else: + if len(args) == 2: + raise CommandLineError("When trimming single-end data, only one input file name must " + "be given (got two)") + input_paired_filename = None + + return input_filename, input_paired_filename def pipeline_from_parsed_args(options, paired, pair_filter_mode, is_interleaved_output): - """ - Setup a processing pipeline from parsed command-line options. - - If there are any problems parsing the arguments, a CommandLineError is thrown. 
- - Return an instance of Pipeline (SingleEndPipeline or PairedEndPipeline) - """ - - if not paired: - if options.untrimmed_paired_output: - raise CommandLineError("Option --untrimmed-paired-output can only be used when " - "trimming paired-end reads (with option -p).") - - if paired: - if not is_interleaved_output: - if not options.paired_output: - raise CommandLineError("When paired-end trimming is enabled via -A/-G/-B/-U, " - "a second output file needs to be specified via -p (--paired-output).") - if not options.output: - raise CommandLineError("When you use -p or --paired-output, you must also " - "use the -o option.") - - if bool(options.untrimmed_output) != bool(options.untrimmed_paired_output): - raise CommandLineError("When trimming paired-end reads, you must use either none " - "or both of the --untrimmed-output/--untrimmed-paired-output options.") - if options.too_short_output and not options.too_short_paired_output: - raise CommandLineError("When using --too-short-output with paired-end " - "reads, you also need to use --too-short-paired-output") - if options.too_long_output and not options.too_long_paired_output: - raise CommandLineError("When using --too-long-output with paired-end " - "reads, you also need to use --too-long-paired-output") - - if options.format is not None and options.format.lower() not in ['fasta', 'fastq']: - raise CommandLineError("The input file format must be 'fasta' or 'fastq', " - "not '{}'.".format(options.format)) - - if not (0 <= options.error_rate <= 1.): - raise CommandLineError("The maximum error rate must be between 0 and 1.") - if options.overlap < 1: - raise CommandLineError("The overlap must be at least 1.") - if not (0 <= options.gc_content <= 100): - raise CommandLineError("GC content must be given as percentage between 0 and 100") - if options.action == 'none': - options.action = None - - adapter_parser = AdapterParser( - max_error_rate=options.error_rate, - min_overlap=options.overlap, - read_wildcards=options.match_read_wildcards, - adapter_wildcards=options.match_adapter_wildcards, - indels=options.indels) - - try: - adapters = adapter_parser.parse_multi(options.adapters, options.anywhere, options.front) - adapters2 = adapter_parser.parse_multi(options.adapters2, options.anywhere2, options.front2) - except IOError as e: - if e.errno == errno.ENOENT: - raise CommandLineError(e) - raise - except ValueError as e: - raise CommandLineError(e) - if options.debug: - for adapter in adapters + adapters2: - adapter.enable_debug() - - # Create the processing pipeline. - # If no second-read adapters were given (via -A/-G/-B/-U), we need to - # be backwards compatible and *no modifications* are done to the second read. 
- if paired: - pipeline = PairedEndPipeline(pair_filter_mode, modify_first_read_only=paired == 'first') - else: - pipeline = SingleEndPipeline() - - if options.cut: - if len(options.cut) > 2: - raise CommandLineError("You cannot remove bases from more than two ends.") - if len(options.cut) == 2 and options.cut[0] * options.cut[1] > 0: - raise CommandLineError("You cannot remove bases from the same end twice.") - for cut in options.cut: - if cut != 0: - pipeline.add1(UnconditionalCutter(cut)) - - if options.cut2: - if len(options.cut2) > 2: - raise CommandLineError("You cannot remove bases from more than two ends.") - if len(options.cut2) == 2 and options.cut2[0] * options.cut2[1] > 0: - raise CommandLineError("You cannot remove bases from the same end twice.") - for cut in options.cut2: - if cut != 0: - pipeline.add2(UnconditionalCutter(cut)) - - if options.nextseq_trim is not None: - pipeline.add(NextseqQualityTrimmer(options.nextseq_trim, options.quality_base)) - if options.quality_cutoff is not None: - cutoffs = parse_cutoffs(options.quality_cutoff) - pipeline.add(QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base)) - - if adapters: - adapter_cutter = AdapterCutter(adapters, options.times, options.action) - pipeline.add1(adapter_cutter) - if adapters2: - adapter_cutter2 = AdapterCutter(adapters2, options.times, options.action) - pipeline.add2(adapter_cutter2) - - # Modifiers that apply to both reads of paired-end reads unless in legacy mode - if options.length is not None: - pipeline.add(Shortener(options.length)) - if options.trim_n: - pipeline.add(NEndTrimmer()) - if options.length_tag: - pipeline.add(LengthTagModifier(options.length_tag)) - for suffix in options.strip_suffix: - pipeline.add(SuffixRemover(suffix)) - if options.prefix or options.suffix: - pipeline.add(PrefixSuffixAdder(options.prefix, options.suffix)) - if options.zero_cap: - pipeline.add(ZeroCapper(quality_base=options.quality_base)) - - # Set filtering parameters - # Minimum/maximum length - for attr in 'minimum_length', 'maximum_length': - param = getattr(options, attr) - if param is not None: - lengths = parse_lengths(param) - if not paired and len(lengths) == 2: - raise CommandLineError('Two minimum or maximum lengths given for single-end data') - if paired and len(lengths) == 1: - lengths = (lengths[0], lengths[0]) - setattr(pipeline, attr, lengths) - pipeline.max_n = options.max_n - pipeline.discard_casava = options.discard_casava - pipeline.discard_trimmed = options.discard_trimmed - pipeline.discard_untrimmed = options.discard_untrimmed - - return pipeline + """ + Setup a processing pipeline from parsed command-line options. + + If there are any problems parsing the arguments, a CommandLineError is thrown. 
+ + Return an instance of Pipeline (SingleEndPipeline or PairedEndPipeline) + """ + + if not paired: + if options.untrimmed_paired_output: + raise CommandLineError("Option --untrimmed-paired-output can only be used when " + "trimming paired-end reads (with option -p).") + + if paired: + if not is_interleaved_output: + if not options.paired_output: + raise CommandLineError("When paired-end trimming is enabled via -A/-G/-B/-U, " + "a second output file needs to be specified via -p (--paired-output).") + if not options.output: + raise CommandLineError("When you use -p or --paired-output, you must also " + "use the -o option.") + + if bool(options.untrimmed_output) != bool(options.untrimmed_paired_output): + raise CommandLineError("When trimming paired-end reads, you must use either none " + "or both of the --untrimmed-output/--untrimmed-paired-output options.") + if options.too_short_output and not options.too_short_paired_output: + raise CommandLineError("When using --too-short-output with paired-end " + "reads, you also need to use --too-short-paired-output") + if options.too_long_output and not options.too_long_paired_output: + raise CommandLineError("When using --too-long-output with paired-end " + "reads, you also need to use --too-long-paired-output") + + if options.format is not None and options.format.lower() not in ['fasta', 'fastq']: + raise CommandLineError("The input file format must be 'fasta' or 'fastq', " + "not '{}'.".format(options.format)) + + if not (0 <= options.error_rate <= 1.): + raise CommandLineError("The maximum error rate must be between 0 and 1.") + if options.overlap < 1: + raise CommandLineError("The overlap must be at least 1.") + if not (0 <= options.gc_content <= 100): + raise CommandLineError("GC content must be given as percentage between 0 and 100") + if options.action == 'none': + options.action = None + + adapter_parser = AdapterParser( + max_error_rate=options.error_rate, + min_overlap=options.overlap, + read_wildcards=options.match_read_wildcards, + adapter_wildcards=options.match_adapter_wildcards, + indels=options.indels) + + try: + adapters = adapter_parser.parse_multi(options.adapters, options.anywhere, options.front) + adapters2 = adapter_parser.parse_multi(options.adapters2, options.anywhere2, options.front2) + except IOError as e: + if e.errno == errno.ENOENT: + raise CommandLineError(e) + raise + except ValueError as e: + raise CommandLineError(e) + if options.debug: + for adapter in adapters + adapters2: + adapter.enable_debug() + + # Create the processing pipeline. + # If no second-read adapters were given (via -A/-G/-B/-U), we need to + # be backwards compatible and *no modifications* are done to the second read. 
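The ``--cut`` handling just below encodes the target end in the sign of the value: a positive count removes bases from the 5' end, a negative count from the 3' end. Two values whose product is positive therefore share a sign and target the same end, which is what the ``cut[0] * cut[1] > 0`` test rejects. A minimal standalone sketch (the function name is illustrative)::

    def validate_cuts(cuts):
        # Positive: cut from the 5' end; negative: cut from the 3' end.
        if len(cuts) > 2:
            raise ValueError("You cannot remove bases from more than two ends.")
        if len(cuts) == 2 and cuts[0] * cuts[1] > 0:
            # Same sign on both values: both cuts target the same end.
            raise ValueError("You cannot remove bases from the same end twice.")

    validate_cuts([5, -3])  # ok: one cut per end
    try:
        validate_cuts([5, 3])  # rejected: both cuts from the 5' end
    except ValueError as e:
        print(e)
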
+ if paired: + pipeline = PairedEndPipeline(pair_filter_mode, modify_first_read_only=paired == 'first') + else: + pipeline = SingleEndPipeline() + + if options.cut: + if len(options.cut) > 2: + raise CommandLineError("You cannot remove bases from more than two ends.") + if len(options.cut) == 2 and options.cut[0] * options.cut[1] > 0: + raise CommandLineError("You cannot remove bases from the same end twice.") + for cut in options.cut: + if cut != 0: + pipeline.add1(UnconditionalCutter(cut)) + + if options.cut2: + if len(options.cut2) > 2: + raise CommandLineError("You cannot remove bases from more than two ends.") + if len(options.cut2) == 2 and options.cut2[0] * options.cut2[1] > 0: + raise CommandLineError("You cannot remove bases from the same end twice.") + for cut in options.cut2: + if cut != 0: + pipeline.add2(UnconditionalCutter(cut)) + + if options.nextseq_trim is not None: + pipeline.add(NextseqQualityTrimmer(options.nextseq_trim, options.quality_base)) + if options.quality_cutoff is not None: + cutoffs = parse_cutoffs(options.quality_cutoff) + pipeline.add(QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base)) + + if adapters: + adapter_cutter = AdapterCutter(adapters, options.times, options.action) + pipeline.add1(adapter_cutter) + if adapters2: + adapter_cutter2 = AdapterCutter(adapters2, options.times, options.action) + pipeline.add2(adapter_cutter2) + + # Modifiers that apply to both reads of paired-end reads unless in legacy mode + if options.length is not None: + pipeline.add(Shortener(options.length)) + if options.trim_n: + pipeline.add(NEndTrimmer()) + if options.length_tag: + pipeline.add(LengthTagModifier(options.length_tag)) + for suffix in options.strip_suffix: + pipeline.add(SuffixRemover(suffix)) + if options.prefix or options.suffix: + pipeline.add(PrefixSuffixAdder(options.prefix, options.suffix)) + if options.zero_cap: + pipeline.add(ZeroCapper(quality_base=options.quality_base)) + + # Set filtering parameters + # Minimum/maximum length + for attr in 'minimum_length', 'maximum_length': + param = getattr(options, attr) + if param is not None: + lengths = parse_lengths(param) + if not paired and len(lengths) == 2: + raise CommandLineError('Two minimum or maximum lengths given for single-end data') + if paired and len(lengths) == 1: + lengths = (lengths[0], lengths[0]) + setattr(pipeline, attr, lengths) + pipeline.max_n = options.max_n + pipeline.discard_casava = options.discard_casava + pipeline.discard_trimmed = options.discard_trimmed + pipeline.discard_untrimmed = options.discard_untrimmed + + return pipeline def main(cmdlineargs=None, default_outfile='-'): - """ - Main function that sets up a processing pipeline and runs it. - - default_outfile is the file to which trimmed reads are sent if the ``-o`` - parameter is not used. 
- """ - start_time = time.time() - parser = get_option_parser() - if cmdlineargs is None: - cmdlineargs = sys.argv[1:] - options, args = parser.parse_args(args=cmdlineargs) - # Setup logging only if there are not already any handlers (can happen when - # this function is being called externally such as from unit tests) - if not logging.root.handlers: - setup_logging(stdout=bool(options.output), - quiet=options.quiet or options.report == 'minimal', debug=options.debug) - if options.quiet and options.report: - parser.error("Options --quiet and --report cannot be used at the same time") - - if options.colorspace: - parser.error( - "These colorspace-specific options are no longer supported: " - "--colorspace, -c, -d, --double-encode, -t, --trim-primer, " - "--strip-f3, --maq, --bwa, --no-zero-cap. " - "Use Cutadapt version 1.18 or earlier to work with colorspace data.") - paired = determine_paired_mode(options) - assert paired in (False, 'first', 'both') - - if paired == 'first': - # legacy mode - assert options.pair_filter is None - pair_filter_mode = 'first' - elif options.pair_filter is None: - # default - pair_filter_mode = 'any' - else: - # user-provided behavior - pair_filter_mode = options.pair_filter - - try: - is_interleaved_input, is_interleaved_output = determine_interleaved(options, args) - input_filename, input_paired_filename = input_files_from_parsed_args(args, - paired, is_interleaved_input) - pipeline = pipeline_from_parsed_args(options, paired, pair_filter_mode, is_interleaved_output) - outfiles = open_output_files(options, default_outfile, is_interleaved_output) - except CommandLineError as e: - parser.error(e) - return # avoid IDE warnings below - - if options.cores < 0: - parser.error('Value for --cores cannot be negative') - cores = available_cpu_count() if options.cores == 0 else options.cores - if cores > 1: - if ( - ParallelPipelineRunner.can_output_to(outfiles) - and options.format is None - ): - runner = ParallelPipelineRunner(pipeline, cores, options.buffer_size) - else: - logger.error('Running in parallel is currently not supported for ' - 'the given combination of command-line parameters.\nThese ' - 'options are not supported: --info-file, --rest-file, ' - '--wildcard-file, --untrimmed-output, ' - '--untrimmed-paired-output, --too-short-output, ' - '--too-short-paired-output, --too-long-output, ' - '--too-long-paired-output, --format\n' - 'Omit --cores/-j to continue.') - sys.exit(1) - else: - runner = pipeline - try: - runner.set_input(input_filename, file2=input_paired_filename, - fileformat=options.format, interleaved=is_interleaved_input) - runner.set_output(outfiles) - except (dnaio.UnknownFileFormat, IOError) as e: - parser.error(e) - - implementation = platform.python_implementation() - opt = ' (' + implementation + ')' if implementation != 'CPython' else '' - logger.info("This is cutadapt %s with Python %s%s", __version__, - platform.python_version(), opt) - logger.info("Command line parameters: %s", " ".join(cmdlineargs)) - logger.info("Processing reads on %d core%s in %s mode ...", - cores, 's' if cores > 1 else '', - {False: 'single-end', 'first': 'paired-end legacy', 'both': 'paired-end'}[pipeline.paired]) - - if pipeline.should_warn_legacy: - logger.warning('\n'.join(textwrap.wrap('Legacy mode is ' - 'enabled. Read modification and filtering options *ignore* ' - 'the second read. 
To switch to regular paired-end mode, ' - 'provide the --pair-filter=any option or use any of the ' - '-A/-B/-G/-U/--interleaved options.'))) - - try: - stats = runner.run() - # cProfile.runctx('stats=runner.run()', globals(), locals(), 'profile_main.prof') - runner.close() - except KeyboardInterrupt: - print("Interrupted", file=sys.stderr) - sys.exit(130) - except IOError as e: - if e.errno == errno.EPIPE: - sys.exit(1) - raise - except (dnaio.FileFormatError, dnaio.UnknownFileFormat, EOFError) as e: - sys.exit("cutadapt: error: {}".format(e)) - - elapsed = time.time() - start_time - if not options.quiet: - # send statistics to stderr if result was sent to stdout - stat_file = sys.stderr if options.output is None else None - with redirect_standard_output(stat_file): - if options.report == 'minimal': - print_minimal_report(stats, elapsed, options.gc_content / 100) - else: - print_report(stats, elapsed, options.gc_content / 100) + """ + Main function that sets up a processing pipeline and runs it. + + default_outfile is the file to which trimmed reads are sent if the ``-o`` + parameter is not used. + """ + start_time = time.time() + parser = get_option_parser() + if cmdlineargs is None: + cmdlineargs = sys.argv[1:] + options, args = parser.parse_args(args=cmdlineargs) + # Setup logging only if there are not already any handlers (can happen when + # this function is being called externally such as from unit tests) + if not logging.root.handlers: + setup_logging(stdout=bool(options.output), + quiet=options.quiet or options.report == 'minimal', debug=options.debug) + if options.quiet and options.report: + parser.error("Options --quiet and --report cannot be used at the same time") + + if options.colorspace: + parser.error( + "These colorspace-specific options are no longer supported: " + "--colorspace, -c, -d, --double-encode, -t, --trim-primer, " + "--strip-f3, --maq, --bwa, --no-zero-cap. 
" + "Use Cutadapt version 1.18 or earlier to work with colorspace data.") + paired = determine_paired_mode(options) + assert paired in (False, 'first', 'both') + + if paired == 'first': + # legacy mode + assert options.pair_filter is None + pair_filter_mode = 'first' + elif options.pair_filter is None: + # default + pair_filter_mode = 'any' + else: + # user-provided behavior + pair_filter_mode = options.pair_filter + + try: + is_interleaved_input, is_interleaved_output = determine_interleaved(options, args) + input_filename, input_paired_filename = input_files_from_parsed_args(args, + paired, is_interleaved_input) + pipeline = pipeline_from_parsed_args(options, paired, pair_filter_mode, is_interleaved_output) + outfiles = open_output_files(options, default_outfile, is_interleaved_output) + except CommandLineError as e: + parser.error(e) + return # avoid IDE warnings below + + if options.cores < 0: + parser.error('Value for --cores cannot be negative') + cores = available_cpu_count() if options.cores == 0 else options.cores + if cores > 1: + if ( + ParallelPipelineRunner.can_output_to(outfiles) + and options.format is None + ): + runner = ParallelPipelineRunner(pipeline, cores, options.buffer_size) + else: + logger.error('Running in parallel is currently not supported for ' + 'the given combination of command-line parameters.\nThese ' + 'options are not supported: --info-file, --rest-file, ' + '--wildcard-file, --untrimmed-output, ' + '--untrimmed-paired-output, --too-short-output, ' + '--too-short-paired-output, --too-long-output, ' + '--too-long-paired-output, --format\n' + 'Omit --cores/-j to continue.') + sys.exit(1) + else: + runner = pipeline + try: + runner.set_input(input_filename, file2=input_paired_filename, + fileformat=options.format, interleaved=is_interleaved_input) + runner.set_output(outfiles) + except (dnaio.UnknownFileFormat, IOError) as e: + parser.error(e) + + implementation = platform.python_implementation() + opt = ' (' + implementation + ')' if implementation != 'CPython' else '' + logger.info("This is cutadapt %s with Python %s%s", __version__, + platform.python_version(), opt) + logger.info("Command line parameters: %s", " ".join(cmdlineargs)) + logger.info("Processing reads on %d core%s in %s mode ...", + cores, 's' if cores > 1 else '', + {False: 'single-end', 'first': 'paired-end legacy', 'both': 'paired-end'}[pipeline.paired]) + + if pipeline.should_warn_legacy: + logger.warning('\n'.join(textwrap.wrap('Legacy mode is ' + 'enabled. Read modification and filtering options *ignore* ' + 'the second read. 
To switch to regular paired-end mode, ' + 'provide the --pair-filter=any option or use any of the ' + '-A/-B/-G/-U/--interleaved options.'))) + + try: + stats = runner.run() + # cProfile.runctx('stats=runner.run()', globals(), locals(), 'profile_main.prof') + runner.close() + except KeyboardInterrupt: + print("Interrupted", file=sys.stderr) + sys.exit(130) + except IOError as e: + if e.errno == errno.EPIPE: + sys.exit(1) + raise + except (dnaio.FileFormatError, dnaio.UnknownFileFormat, EOFError) as e: + sys.exit("cutadapt: error: {}".format(e)) + + elapsed = time.time() - start_time + if not options.quiet: + # send statistics to stderr if result was sent to stdout + stat_file = sys.stderr if options.output is None else None + with redirect_standard_output(stat_file): + if options.report == 'minimal': + print_minimal_report(stats, elapsed, options.gc_content / 100) + else: + print_report(stats, elapsed, options.gc_content / 100) if __name__ == '__main__': - main() + main() diff --git a/src/cutadapt/_align.pyx b/src/cutadapt/_align.pyx index c3c6c240..d7679930 100644 --- a/src/cutadapt/_align.pyx +++ b/src/cutadapt/_align.pyx @@ -9,73 +9,73 @@ DEF SEMIGLOBAL = 15 # structure for a DP matrix entry ctypedef struct _Entry: - int cost - int matches # no. of matches in this alignment - int origin # where the alignment originated: negative for positions within seq1, positive for pos. within seq2 + int cost + int matches # no. of matches in this alignment + int origin # where the alignment originated: negative for positions within seq1, positive for pos. within seq2 ctypedef struct _Match: - int origin - int cost - int matches - int ref_stop - int query_stop + int origin + int cost + int matches + int ref_stop + int query_stop def _acgt_table(): - """ - Return a translation table that maps A, C, G, T characters to the lower - four bits of a byte. Other characters (including possibly IUPAC characters) - are mapped to zero. + """ + Return a translation table that maps A, C, G, T characters to the lower + four bits of a byte. Other characters (including possibly IUPAC characters) + are mapped to zero. - Lowercase versions are also translated, and U is treated the same as T. - """ - d = dict(A=1, C=2, G=4, T=8, U=8) - t = bytearray(b'\0') * 256 - for c, v in d.items(): - t[ord(c)] = v - t[ord(c.lower())] = v - return bytes(t) + Lowercase versions are also translated, and U is treated the same as T. + """ + d = dict(A=1, C=2, G=4, T=8, U=8) + t = bytearray(b'\0') * 256 + for c, v in d.items(): + t[ord(c)] = v + t[ord(c.lower())] = v + return bytes(t) def _iupac_table(): - """ - Return a translation table for IUPAC characters. - - The table maps ASCII-encoded IUPAC nucleotide characters to bytes in which - the four least significant bits are used to represent one nucleotide each. - - Whether two characters x and y match can then be checked with the - expression "x & y != 0". - """ - A = 1 - C = 2 - G = 4 - T = 8 - iupac = dict( - X=0, - A=A, - C=C, - G=G, - T=T, - U=T, - R=A|G, - Y=C|T, - S=G|C, - W=A|T, - K=G|T, - M=A|C, - B=C|G|T, - D=A|G|T, - H=A|C|T, - V=A|C|G, - N=A|C|G|T - ) - t = bytearray(b'\0') * 256 - for c, v in iupac.items(): - t[ord(c)] = v - t[ord(c.lower())] = v - return bytes(t) + """ + Return a translation table for IUPAC characters. + + The table maps ASCII-encoded IUPAC nucleotide characters to bytes in which + the four least significant bits are used to represent one nucleotide each. + + Whether two characters x and y match can then be checked with the + expression "x & y != 0". 
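The encoding described above can be tried out directly: each of A, C, G and T occupies one bit, an IUPAC code is the bitwise OR of the bases it stands for, and two encoded characters are compatible exactly when they share a bit. A standalone sketch::

    A, C, G, T = 1, 2, 4, 8
    CODE = {'A': A, 'C': C, 'G': G, 'T': T,
            'R': A | G, 'Y': C | T, 'N': A | C | G | T}

    def compatible(x, y):
        # Shared bit <=> the two IUPAC characters can match.
        return CODE[x] & CODE[y] != 0

    assert compatible('R', 'A')      # R stands for A or G
    assert not compatible('R', 'C')  # C is not among R's bases
    assert compatible('N', 'T')      # N matches every base
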
+ """ + A = 1 + C = 2 + G = 4 + T = 8 + iupac = dict( + X=0, + A=A, + C=C, + G=G, + T=T, + U=T, + R=A|G, + Y=C|T, + S=G|C, + W=A|T, + K=G|T, + M=A|C, + B=C|G|T, + D=A|G|T, + H=A|C|T, + V=A|C|G, + N=A|C|G|T + ) + t = bytearray(b'\0') * 256 + for c, v in iupac.items(): + t[ord(c)] = v + t[ord(c.lower())] = v + return bytes(t) cdef bytes ACGT_TABLE = _acgt_table() @@ -83,460 +83,460 @@ cdef bytes IUPAC_TABLE = _iupac_table() class DPMatrix: - """ - Representation of the dynamic-programming matrix. - - This is used only when debugging is enabled in the Aligner class since the - matrix is normally not stored in full. - - Entries in the matrix may be None, in which case that value was not - computed. - """ - def __init__(self, reference, query): - m = len(reference) - n = len(query) - self._rows = [ [None] * (n+1) for _ in range(m + 1) ] - self.reference = reference - self.query = query - - def set_entry(self, int i, int j, cost): - """ - Set an entry in the dynamic programming matrix. - """ - self._rows[i][j] = cost - - def __str__(self): - """ - Return a representation of the matrix as a string. - """ - rows = [' ' + ' '.join(c.rjust(2) for c in self.query)] - for c, row in zip(' ' + self.reference, self._rows): - r = c + ' ' + ' '.join(' ' if v is None else '{:2d}'.format(v) for v in row) - rows.append(r) - return '\n'.join(rows) + """ + Representation of the dynamic-programming matrix. + + This is used only when debugging is enabled in the Aligner class since the + matrix is normally not stored in full. + + Entries in the matrix may be None, in which case that value was not + computed. + """ + def __init__(self, reference, query): + m = len(reference) + n = len(query) + self._rows = [ [None] * (n+1) for _ in range(m + 1) ] + self.reference = reference + self.query = query + + def set_entry(self, int i, int j, cost): + """ + Set an entry in the dynamic programming matrix. + """ + self._rows[i][j] = cost + + def __str__(self): + """ + Return a representation of the matrix as a string. + """ + rows = [' ' + ' '.join(c.rjust(2) for c in self.query)] + for c, row in zip(' ' + self.reference, self._rows): + r = c + ' ' + ' '.join(' ' if v is None else '{:2d}'.format(v) for v in row) + rows.append(r) + return '\n'.join(rows) cdef class Aligner: - """ - TODO documentation still uses s1 (reference) and s2 (query). - - Locate one string within another by computing an optimal semiglobal - alignment between string1 and string2. - - The alignment uses unit costs, which means that mismatches, insertions and deletions are - counted as one error. - - flags is a bitwise 'or' of the allowed flags. - To allow skipping of a prefix of string1 at no cost, set the - START_WITHIN_SEQ1 flag. - To allow skipping of a prefix of string2 at no cost, set the - START_WITHIN_SEQ2 flag. - If both are set, a prefix of string1 or of string1 is skipped, - never both. - Similarly, set STOP_WITHIN_SEQ1 and STOP_WITHIN_SEQ2 to - allow skipping of suffixes of string1 or string2. Again, when both - flags are set, never suffixes in both strings are skipped. - If all flags are set, this results in standard semiglobal alignment. - - The skipped parts are described with two intervals (start1, stop1), - (start2, stop2). - - For example, an optimal semiglobal alignment of SISSI and MISSISSIPPI looks like this: - - ---SISSI--- - MISSISSIPPI - - start1, stop1 = 0, 5 - start2, stop2 = 3, 8 - (with zero errors) - - The aligned parts are string1[start1:stop1] and string2[start2:stop2]. 
- - The error rate is: errors / length where length is (stop1 - start1). - - An optimal alignment fulfills all of these criteria: - - - its error_rate is at most max_error_rate - - Among those alignments with error_rate <= max_error_rate, the alignment contains - a maximal number of matches (there is no alignment with more matches). - - If there are multiple alignments with the same no. of matches, then one that - has minimal no. of errors is chosen. - - If there are still multiple candidates, choose the alignment that starts at the - leftmost position within the read. - - The alignment itself is not returned, only the tuple - (start1, stop1, start2, stop2, matches, errors), where the first four fields have the - meaning as described, matches is the number of matches and errors is the number of - errors in the alignment. - - It is always the case that at least one of start1 and start2 is zero. - - IUPAC wildcard characters can be allowed in the reference and the query - by setting the appropriate flags. - - If neither flag is set, the full ASCII alphabet is used for comparison. - If any of the flags is set, all non-IUPAC characters in the sequences - compare as 'not equal'. - """ - cdef: - int m - _Entry* column # one column of the DP matrix - double max_error_rate - int flags - int _insertion_cost - int _deletion_cost - int _min_overlap - bint wildcard_ref - bint wildcard_query - bint debug - object _dpmatrix - bytes _reference # TODO rename to translated_reference or so - str str_reference - - START_WITHIN_REFERENCE = 1 - START_WITHIN_QUERY = 2 - STOP_WITHIN_REFERENCE = 4 - STOP_WITHIN_QUERY = 8 - - def __cinit__(self, str reference, double max_error_rate, int flags=SEMIGLOBAL, bint wildcard_ref=False, bint wildcard_query=False): - self.max_error_rate = max_error_rate - self.flags = flags - self.wildcard_ref = wildcard_ref - self.wildcard_query = wildcard_query - self.str_reference = reference - self.reference = reference - self._min_overlap = 1 - self.debug = False - self._dpmatrix = None - self._insertion_cost = 1 - self._deletion_cost = 1 - - property min_overlap: - def __get__(self): - return self._min_overlap - - def __set__(self, int value): - if value < 1: - raise ValueError('Minimum overlap must be at least 1') - self._min_overlap = value - - property indel_cost: - """ - Matches cost 0, mismatches cost 1. Only insertion/deletion costs can be - changed. - """ - def __set__(self, value): - if value < 1: - raise ValueError('Insertion/deletion cost must be at least 1') - self._insertion_cost = value - self._deletion_cost = value - - property reference: - def __get__(self): - return self._reference - - def __set__(self, str reference): - mem = <_Entry*> PyMem_Realloc(self.column, (len(reference) + 1) * sizeof(_Entry)) - if not mem: - raise MemoryError() - self.column = mem - self._reference = reference.encode('ascii') - self.m = len(reference) - if self.wildcard_ref: - self._reference = self._reference.translate(IUPAC_TABLE) - elif self.wildcard_query: - self._reference = self._reference.translate(ACGT_TABLE) - self.str_reference = reference - - property dpmatrix: - """ - The dynamic programming matrix as a DPMatrix object. This attribute is - usually None, unless debugging has been enabled with enable_debug(). - """ - def __get__(self): - return self._dpmatrix - - def enable_debug(self): - """ - Store the dynamic programming matrix while running the locate() method - and make it available in the .dpmatrix attribute. 
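For reference, the class is driven like this (a sketch assuming a development build of this patch makes the extension importable as ``cutadapt._align``); the six-tuple layout follows the ``locate()`` docstring further down::

    from cutadapt._align import Aligner

    aligner = Aligner('SISSI', 0.1)  # default flags: full semiglobal alignment
    result = aligner.locate('MISSISSIPPI')
    if result is not None:  # None means no alignment within the error rate
        refstart, refstop, querystart, querystop, matches, errors = result
        print(refstart, refstop, querystart, querystop, matches, errors)
        # expected: 0 5 3 8 5 0
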
- """ - self.debug = True - - def locate(self, str query): - """ - locate(query) -> (refstart, refstop, querystart, querystop, matches, errors) - - Find the query within the reference associated with this aligner. The - intervals (querystart, querystop) and (refstart, refstop) give the - location of the match. - - That is, the substrings query[querystart:querystop] and - self.reference[refstart:refstop] were found to align best to each other, - with the given number of matches and the given number of errors. - - The alignment itself is not returned. - """ - cdef: - char* s1 = self._reference - bytes query_bytes = query.encode('ascii') - char* s2 = query_bytes - int m = self.m - int n = len(query) - _Entry* column = self.column # Current column of the DP matrix - double max_error_rate = self.max_error_rate - bint start_in_ref = self.flags & START_WITHIN_SEQ1 - bint start_in_query = self.flags & START_WITHIN_SEQ2 - bint stop_in_ref = self.flags & STOP_WITHIN_SEQ1 - bint stop_in_query = self.flags & STOP_WITHIN_SEQ2 - bint compare_ascii = False - - if self.wildcard_query: - query_bytes = query_bytes.translate(IUPAC_TABLE) - s2 = query_bytes - elif self.wildcard_ref: - query_bytes = query_bytes.translate(ACGT_TABLE) - s2 = query_bytes - else: - compare_ascii = True - """ - DP Matrix: - query (j) - ----------> n - | - ref (i) | - | - V - m - """ - cdef int i, j - - # maximum no. of errors - cdef int k = (max_error_rate * m) - - # Determine largest and smallest column we need to compute - cdef int max_n = n - cdef int min_n = 0 - if not start_in_query: - # costs can only get worse after column m - max_n = min(n, m + k) - if not stop_in_query: - min_n = max(0, n - m - k) - - # Fill column min_n. - # - # Four cases: - # not startin1, not startin2: c(i,j) = max(i,j); origin(i, j) = 0 - # startin1, not startin2: c(i,j) = j ; origin(i, j) = min(0, j - i) - # not startin1, startin2: c(i,j) = i ; origin(i, j) = - # startin1, startin2: c(i,j) = min(i,j) - - # TODO (later) - # fill out columns only until 'last' - if not start_in_ref and not start_in_query: - for i in range(m + 1): - column[i].matches = 0 - column[i].cost = max(i, min_n) * self._insertion_cost - column[i].origin = 0 - elif start_in_ref and not start_in_query: - for i in range(m + 1): - column[i].matches = 0 - column[i].cost = min_n * self._insertion_cost - column[i].origin = min(0, min_n - i) - elif not start_in_ref and start_in_query: - for i in range(m + 1): - column[i].matches = 0 - column[i].cost = i * self._insertion_cost - column[i].origin = max(0, min_n - i) - else: - for i in range(m + 1): - column[i].matches = 0 - column[i].cost = min(i, min_n) * self._insertion_cost - column[i].origin = min_n - i - - if self.debug: - self._dpmatrix = DPMatrix(self.str_reference, query) - for i in range(m + 1): - self._dpmatrix.set_entry(i, min_n, column[i].cost) - cdef _Match best - best.ref_stop = m - best.query_stop = n - best.cost = m + n - best.origin = 0 - best.matches = 0 - - # Ukkonen's trick: index of the last cell that is at most k - cdef int last = min(m, k + 1) - if start_in_ref: - last = m - - cdef: - int cost_diag - int cost_deletion - int cost_insertion - int origin, cost, matches - int length - bint characters_equal - # We keep only a single column of the DP matrix in memory. - # To access the diagonal cell to the upper left, - # we store it here before overwriting it. 
- _Entry diag_entry - - with nogil: - # iterate over columns - for j in range(min_n + 1, max_n + 1): - # remember first entry before overwriting - diag_entry = column[0] - - # fill in first entry in this column - if start_in_query: - column[0].origin = j - else: - column[0].cost = j * self._insertion_cost - for i in range(1, last + 1): - if compare_ascii: - characters_equal = (s1[i-1] == s2[j-1]) - else: - characters_equal = (s1[i-1] & s2[j-1]) != 0 - if characters_equal: - # If the characters match, skip computing costs for - # insertion and deletion as they are at least as high. - cost = diag_entry.cost - origin = diag_entry.origin - matches = diag_entry.matches + 1 - else: - # Characters do not match. - cost_diag = diag_entry.cost + 1 - cost_deletion = column[i].cost + self._deletion_cost - cost_insertion = column[i-1].cost + self._insertion_cost - - if cost_diag <= cost_deletion and cost_diag <= cost_insertion: - # MISMATCH - cost = cost_diag - origin = diag_entry.origin - matches = diag_entry.matches - elif cost_insertion <= cost_deletion: - # INSERTION - cost = cost_insertion - origin = column[i-1].origin - matches = column[i-1].matches - else: - # DELETION - cost = cost_deletion - origin = column[i].origin - matches = column[i].matches - - # Remember the current cell for next iteration - diag_entry = column[i] - - column[i].cost = cost - column[i].origin = origin - column[i].matches = matches - if self.debug: - with gil: - for i in range(last + 1): - self._dpmatrix.set_entry(i, j, column[i].cost) - while last >= 0 and column[last].cost > k: - last -= 1 - # last can be -1 here, but will be incremented next. - # TODO if last is -1, can we stop searching? - if last < m: - last += 1 - elif stop_in_query: - # Found a match. If requested, find best match in last row. - # length of the aligned part of the reference - length = m + min(column[m].origin, 0) - cost = column[m].cost - matches = column[m].matches - if length >= self._min_overlap and cost <= length * max_error_rate and (matches > best.matches or (matches == best.matches and cost < best.cost)): - # update - best.matches = matches - best.cost = cost - best.origin = column[m].origin - best.ref_stop = m - best.query_stop = j - if cost == 0 and matches == m: - # exact match, stop early - break - # column finished - - if max_n == n: - first_i = 0 if stop_in_ref else m - # search in last column # TODO last? - for i in range(first_i, m+1): - length = i + min(column[i].origin, 0) - cost = column[i].cost - matches = column[i].matches - if length >= self._min_overlap and cost <= length * max_error_rate and (matches > best.matches or (matches == best.matches and cost < best.cost)): - # update best - best.matches = matches - best.cost = cost - best.origin = column[i].origin - best.ref_stop = i - best.query_stop = n - if best.cost == m + n: - # best.cost was initialized with this value. - # If it is unchanged, no alignment was found that has - # an error rate within the allowed range. - return None - - cdef int start1, start2 - if best.origin >= 0: - start1 = 0 - start2 = best.origin - else: - start1 = -best.origin - start2 = 0 - - assert best.ref_stop - start1 > 0 # Do not return empty alignments. - return (start1, best.ref_stop, start2, best.query_stop, best.matches, best.cost) - - def __dealloc__(self): - PyMem_Free(self.column) + """ + TODO documentation still uses s1 (reference) and s2 (query). + + Locate one string within another by computing an optimal semiglobal + alignment between string1 and string2. 
+ + The alignment uses unit costs, which means that mismatches, insertions and deletions are + counted as one error. + + flags is a bitwise 'or' of the allowed flags. + To allow skipping of a prefix of string1 at no cost, set the + START_WITHIN_SEQ1 flag. + To allow skipping of a prefix of string2 at no cost, set the + START_WITHIN_SEQ2 flag. + If both are set, a prefix of string1 or of string1 is skipped, + never both. + Similarly, set STOP_WITHIN_SEQ1 and STOP_WITHIN_SEQ2 to + allow skipping of suffixes of string1 or string2. Again, when both + flags are set, never suffixes in both strings are skipped. + If all flags are set, this results in standard semiglobal alignment. + + The skipped parts are described with two intervals (start1, stop1), + (start2, stop2). + + For example, an optimal semiglobal alignment of SISSI and MISSISSIPPI looks like this: + + ---SISSI--- + MISSISSIPPI + + start1, stop1 = 0, 5 + start2, stop2 = 3, 8 + (with zero errors) + + The aligned parts are string1[start1:stop1] and string2[start2:stop2]. + + The error rate is: errors / length where length is (stop1 - start1). + + An optimal alignment fulfills all of these criteria: + + - its error_rate is at most max_error_rate + - Among those alignments with error_rate <= max_error_rate, the alignment contains + a maximal number of matches (there is no alignment with more matches). + - If there are multiple alignments with the same no. of matches, then one that + has minimal no. of errors is chosen. + - If there are still multiple candidates, choose the alignment that starts at the + leftmost position within the read. + + The alignment itself is not returned, only the tuple + (start1, stop1, start2, stop2, matches, errors), where the first four fields have the + meaning as described, matches is the number of matches and errors is the number of + errors in the alignment. + + It is always the case that at least one of start1 and start2 is zero. + + IUPAC wildcard characters can be allowed in the reference and the query + by setting the appropriate flags. + + If neither flag is set, the full ASCII alphabet is used for comparison. + If any of the flags is set, all non-IUPAC characters in the sequences + compare as 'not equal'. + """ + cdef: + int m + _Entry* column # one column of the DP matrix + double max_error_rate + int flags + int _insertion_cost + int _deletion_cost + int _min_overlap + bint wildcard_ref + bint wildcard_query + bint debug + object _dpmatrix + bytes _reference # TODO rename to translated_reference or so + str str_reference + + START_WITHIN_REFERENCE = 1 + START_WITHIN_QUERY = 2 + STOP_WITHIN_REFERENCE = 4 + STOP_WITHIN_QUERY = 8 + + def __cinit__(self, str reference, double max_error_rate, int flags=SEMIGLOBAL, bint wildcard_ref=False, bint wildcard_query=False): + self.max_error_rate = max_error_rate + self.flags = flags + self.wildcard_ref = wildcard_ref + self.wildcard_query = wildcard_query + self.str_reference = reference + self.reference = reference + self._min_overlap = 1 + self.debug = False + self._dpmatrix = None + self._insertion_cost = 1 + self._deletion_cost = 1 + + property min_overlap: + def __get__(self): + return self._min_overlap + + def __set__(self, int value): + if value < 1: + raise ValueError('Minimum overlap must be at least 1') + self._min_overlap = value + + property indel_cost: + """ + Matches cost 0, mismatches cost 1. Only insertion/deletion costs can be + changed. 
+ """ + def __set__(self, value): + if value < 1: + raise ValueError('Insertion/deletion cost must be at least 1') + self._insertion_cost = value + self._deletion_cost = value + + property reference: + def __get__(self): + return self._reference + + def __set__(self, str reference): + mem = <_Entry*> PyMem_Realloc(self.column, (len(reference) + 1) * sizeof(_Entry)) + if not mem: + raise MemoryError() + self.column = mem + self._reference = reference.encode('ascii') + self.m = len(reference) + if self.wildcard_ref: + self._reference = self._reference.translate(IUPAC_TABLE) + elif self.wildcard_query: + self._reference = self._reference.translate(ACGT_TABLE) + self.str_reference = reference + + property dpmatrix: + """ + The dynamic programming matrix as a DPMatrix object. This attribute is + usually None, unless debugging has been enabled with enable_debug(). + """ + def __get__(self): + return self._dpmatrix + + def enable_debug(self): + """ + Store the dynamic programming matrix while running the locate() method + and make it available in the .dpmatrix attribute. + """ + self.debug = True + + def locate(self, str query): + """ + locate(query) -> (refstart, refstop, querystart, querystop, matches, errors) + + Find the query within the reference associated with this aligner. The + intervals (querystart, querystop) and (refstart, refstop) give the + location of the match. + + That is, the substrings query[querystart:querystop] and + self.reference[refstart:refstop] were found to align best to each other, + with the given number of matches and the given number of errors. + + The alignment itself is not returned. + """ + cdef: + char* s1 = self._reference + bytes query_bytes = query.encode('ascii') + char* s2 = query_bytes + int m = self.m + int n = len(query) + _Entry* column = self.column # Current column of the DP matrix + double max_error_rate = self.max_error_rate + bint start_in_ref = self.flags & START_WITHIN_SEQ1 + bint start_in_query = self.flags & START_WITHIN_SEQ2 + bint stop_in_ref = self.flags & STOP_WITHIN_SEQ1 + bint stop_in_query = self.flags & STOP_WITHIN_SEQ2 + bint compare_ascii = False + + if self.wildcard_query: + query_bytes = query_bytes.translate(IUPAC_TABLE) + s2 = query_bytes + elif self.wildcard_ref: + query_bytes = query_bytes.translate(ACGT_TABLE) + s2 = query_bytes + else: + compare_ascii = True + """ + DP Matrix: + query (j) + ----------> n + | + ref (i) | + | + V + m + """ + cdef int i, j + + # maximum no. of errors + cdef int k = (max_error_rate * m) + + # Determine largest and smallest column we need to compute + cdef int max_n = n + cdef int min_n = 0 + if not start_in_query: + # costs can only get worse after column m + max_n = min(n, m + k) + if not stop_in_query: + min_n = max(0, n - m - k) + + # Fill column min_n. 
+ # + # Four cases: + # not startin1, not startin2: c(i,j) = max(i,j); origin(i, j) = 0 + # startin1, not startin2: c(i,j) = j ; origin(i, j) = min(0, j - i) + # not startin1, startin2: c(i,j) = i ; origin(i, j) = + # startin1, startin2: c(i,j) = min(i,j) + + # TODO (later) + # fill out columns only until 'last' + if not start_in_ref and not start_in_query: + for i in range(m + 1): + column[i].matches = 0 + column[i].cost = max(i, min_n) * self._insertion_cost + column[i].origin = 0 + elif start_in_ref and not start_in_query: + for i in range(m + 1): + column[i].matches = 0 + column[i].cost = min_n * self._insertion_cost + column[i].origin = min(0, min_n - i) + elif not start_in_ref and start_in_query: + for i in range(m + 1): + column[i].matches = 0 + column[i].cost = i * self._insertion_cost + column[i].origin = max(0, min_n - i) + else: + for i in range(m + 1): + column[i].matches = 0 + column[i].cost = min(i, min_n) * self._insertion_cost + column[i].origin = min_n - i + + if self.debug: + self._dpmatrix = DPMatrix(self.str_reference, query) + for i in range(m + 1): + self._dpmatrix.set_entry(i, min_n, column[i].cost) + cdef _Match best + best.ref_stop = m + best.query_stop = n + best.cost = m + n + best.origin = 0 + best.matches = 0 + + # Ukkonen's trick: index of the last cell that is at most k + cdef int last = min(m, k + 1) + if start_in_ref: + last = m + + cdef: + int cost_diag + int cost_deletion + int cost_insertion + int origin, cost, matches + int length + bint characters_equal + # We keep only a single column of the DP matrix in memory. + # To access the diagonal cell to the upper left, + # we store it here before overwriting it. + _Entry diag_entry + + with nogil: + # iterate over columns + for j in range(min_n + 1, max_n + 1): + # remember first entry before overwriting + diag_entry = column[0] + + # fill in first entry in this column + if start_in_query: + column[0].origin = j + else: + column[0].cost = j * self._insertion_cost + for i in range(1, last + 1): + if compare_ascii: + characters_equal = (s1[i-1] == s2[j-1]) + else: + characters_equal = (s1[i-1] & s2[j-1]) != 0 + if characters_equal: + # If the characters match, skip computing costs for + # insertion and deletion as they are at least as high. + cost = diag_entry.cost + origin = diag_entry.origin + matches = diag_entry.matches + 1 + else: + # Characters do not match. + cost_diag = diag_entry.cost + 1 + cost_deletion = column[i].cost + self._deletion_cost + cost_insertion = column[i-1].cost + self._insertion_cost + + if cost_diag <= cost_deletion and cost_diag <= cost_insertion: + # MISMATCH + cost = cost_diag + origin = diag_entry.origin + matches = diag_entry.matches + elif cost_insertion <= cost_deletion: + # INSERTION + cost = cost_insertion + origin = column[i-1].origin + matches = column[i-1].matches + else: + # DELETION + cost = cost_deletion + origin = column[i].origin + matches = column[i].matches + + # Remember the current cell for next iteration + diag_entry = column[i] + + column[i].cost = cost + column[i].origin = origin + column[i].matches = matches + if self.debug: + with gil: + for i in range(last + 1): + self._dpmatrix.set_entry(i, j, column[i].cost) + while last >= 0 and column[last].cost > k: + last -= 1 + # last can be -1 here, but will be incremented next. + # TODO if last is -1, can we stop searching? + if last < m: + last += 1 + elif stop_in_query: + # Found a match. If requested, find best match in last row. 
+ # length of the aligned part of the reference + length = m + min(column[m].origin, 0) + cost = column[m].cost + matches = column[m].matches + if length >= self._min_overlap and cost <= length * max_error_rate and (matches > best.matches or (matches == best.matches and cost < best.cost)): + # update + best.matches = matches + best.cost = cost + best.origin = column[m].origin + best.ref_stop = m + best.query_stop = j + if cost == 0 and matches == m: + # exact match, stop early + break + # column finished + + if max_n == n: + first_i = 0 if stop_in_ref else m + # search in last column # TODO last? + for i in range(first_i, m+1): + length = i + min(column[i].origin, 0) + cost = column[i].cost + matches = column[i].matches + if length >= self._min_overlap and cost <= length * max_error_rate and (matches > best.matches or (matches == best.matches and cost < best.cost)): + # update best + best.matches = matches + best.cost = cost + best.origin = column[i].origin + best.ref_stop = i + best.query_stop = n + if best.cost == m + n: + # best.cost was initialized with this value. + # If it is unchanged, no alignment was found that has + # an error rate within the allowed range. + return None + + cdef int start1, start2 + if best.origin >= 0: + start1 = 0 + start2 = best.origin + else: + start1 = -best.origin + start2 = 0 + + assert best.ref_stop - start1 > 0 # Do not return empty alignments. + return (start1, best.ref_stop, start2, best.query_stop, best.matches, best.cost) + + def __dealloc__(self): + PyMem_Free(self.column) def compare_prefixes(str ref, str query, bint wildcard_ref=False, bint wildcard_query=False): - """ - Find out whether one string is the prefix of the other one, allowing - IUPAC wildcards in ref and/or query if the appropriate flag is set. - - This is used to find an anchored 5' adapter (type 'FRONT') in the 'no indels' mode. - This is very simple as only the number of errors needs to be counted. - - This function returns a tuple compatible with what Aligner.locate outputs. - """ - cdef: - int m = len(ref) - int n = len(query) - bytes query_bytes = query.encode('ascii') - bytes ref_bytes = ref.encode('ascii') - char* r_ptr - char* q_ptr - int length = min(m, n) - int i, matches = 0 - bint compare_ascii = False - - if wildcard_ref: - ref_bytes = ref_bytes.translate(IUPAC_TABLE) - elif wildcard_query: - ref_bytes = ref_bytes.translate(ACGT_TABLE) - else: - compare_ascii = True - if wildcard_query: - query_bytes = query_bytes.translate(IUPAC_TABLE) - elif wildcard_ref: - query_bytes = query_bytes.translate(ACGT_TABLE) - - if compare_ascii: - for i in range(length): - if ref[i] == query[i]: - matches += 1 - else: - r_ptr = ref_bytes - q_ptr = query_bytes - for i in range(length): - if (r_ptr[i] & q_ptr[i]) != 0: - matches += 1 - - # length - matches = no. of errors - return (0, length, 0, length, matches, length - matches) + """ + Find out whether one string is the prefix of the other one, allowing + IUPAC wildcards in ref and/or query if the appropriate flag is set. + + This is used to find an anchored 5' adapter (type 'FRONT') in the 'no indels' mode. + This is very simple as only the number of errors needs to be counted. + + This function returns a tuple compatible with what Aligner.locate outputs. 
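Because the return value mimics ``Aligner.locate()``, both code paths can be consumed uniformly. Stripped of the wildcard handling, the computation amounts to this pure-Python sketch (the function name is illustrative)::

    def compare_prefixes_plain(ref, query):
        # Compare the overlapping prefix position by position;
        # errors are simply length minus matches.
        length = min(len(ref), len(query))
        matches = sum(1 for i in range(length) if ref[i] == query[i])
        return (0, length, 0, length, matches, length - matches)

    print(compare_prefixes_plain('ADAPTER', 'ADAPTER'))  # (0, 7, 0, 7, 7, 0)
    print(compare_prefixes_plain('ADAPTER', 'ADAPTXR'))  # (0, 7, 0, 7, 6, 1)
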
+ """ + cdef: + int m = len(ref) + int n = len(query) + bytes query_bytes = query.encode('ascii') + bytes ref_bytes = ref.encode('ascii') + char* r_ptr + char* q_ptr + int length = min(m, n) + int i, matches = 0 + bint compare_ascii = False + + if wildcard_ref: + ref_bytes = ref_bytes.translate(IUPAC_TABLE) + elif wildcard_query: + ref_bytes = ref_bytes.translate(ACGT_TABLE) + else: + compare_ascii = True + if wildcard_query: + query_bytes = query_bytes.translate(IUPAC_TABLE) + elif wildcard_ref: + query_bytes = query_bytes.translate(ACGT_TABLE) + + if compare_ascii: + for i in range(length): + if ref[i] == query[i]: + matches += 1 + else: + r_ptr = ref_bytes + q_ptr = query_bytes + for i in range(length): + if (r_ptr[i] & q_ptr[i]) != 0: + matches += 1 + + # length - matches = no. of errors + return (0, length, 0, length, matches, length - matches) diff --git a/src/cutadapt/adapters.py b/src/cutadapt/adapters.py index 28721736..3494b08c 100644 --- a/src/cutadapt/adapters.py +++ b/src/cutadapt/adapters.py @@ -25,836 +25,836 @@ # TODO put this in some kind of "list of pre-defined adapter types" along with the info above WHERE_TO_REMOVE_MAP = { - PREFIX: 'prefix', - FRONT_NOT_INTERNAL: 'prefix', - FRONT: 'prefix', - BACK: 'suffix', - SUFFIX: 'suffix', - BACK_NOT_INTERNAL: 'suffix', - ANYWHERE: 'auto', + PREFIX: 'prefix', + FRONT_NOT_INTERNAL: 'prefix', + FRONT: 'prefix', + BACK: 'suffix', + SUFFIX: 'suffix', + BACK_NOT_INTERNAL: 'suffix', + ANYWHERE: 'auto', } def expand_braces(sequence): - """ - Replace all occurrences of ``x{n}`` (where x is any character) with n - occurrences of x. Raise ValueError if the expression cannot be parsed. - - >>> expand_braces('TGA{5}CT') - 'TGAAAAACT' - """ - # Simple DFA with four states, encoded in prev - result = '' - prev = None - for s in re.split('([{}])', sequence): - if s == '': - continue - if prev is None: - if s == '{': - raise ValueError('"{" must be used after a character') - if s == '}': - raise ValueError('"}" cannot be used here') - prev = s - result += s - elif prev == '{': - prev = int(s) - if not 0 <= prev <= 10000: - raise ValueError('Value {} invalid'.format(prev)) - elif isinstance(prev, int): - if s != '}': - raise ValueError('"}" expected') - result = result[:-1] + result[-1] * prev - prev = None - else: - if s != '{': - raise ValueError('Expected "{"') - prev = '{' - # Check if we are in a non-terminating state - if isinstance(prev, int) or prev == '{': - raise ValueError("Unterminated expression") - return result + """ + Replace all occurrences of ``x{n}`` (where x is any character) with n + occurrences of x. Raise ValueError if the expression cannot be parsed. 
+ + >>> expand_braces('TGA{5}CT') + 'TGAAAAACT' + """ + # Simple DFA with four states, encoded in prev + result = '' + prev = None + for s in re.split('([{}])', sequence): + if s == '': + continue + if prev is None: + if s == '{': + raise ValueError('"{" must be used after a character') + if s == '}': + raise ValueError('"}" cannot be used here') + prev = s + result += s + elif prev == '{': + prev = int(s) + if not 0 <= prev <= 10000: + raise ValueError('Value {} invalid'.format(prev)) + elif isinstance(prev, int): + if s != '}': + raise ValueError('"}" expected') + result = result[:-1] + result[-1] * prev + prev = None + else: + if s != '{': + raise ValueError('Expected "{"') + prev = '{' + # Check if we are in a non-terminating state + if isinstance(prev, int) or prev == '{': + raise ValueError("Unterminated expression") + return result class AdapterParser: - """ - Factory for Adapter classes that all use the same parameters (error rate, - indels etc.). The given **kwargs will be passed to the Adapter constructors. - """ - def __init__(self, **kwargs): - # kwargs: max_error_rate, min_overlap, read_wildcards, adapter_wildcards, indels - self.default_parameters = kwargs - - @staticmethod - def _extract_name(spec): - """ - Parse an adapter specification given as 'name=adapt' into 'name' and 'adapt'. - """ - fields = spec.split('=', 1) - if len(fields) > 1: - name, spec = fields - name = name.strip() - else: - name = None - spec = spec.strip() - return name, spec - - parameters = { - # abbreviations - 'e': 'max_error_rate', - 'error_rate': 'max_error_rate', - 'o': 'min_overlap', - - # allowed parameters - 'max_error_rate': None, - 'min_overlap': None, - 'anywhere': None, - } - - @staticmethod - def _parse_parameters(spec): - """Parse key=value;key=value;key=value into a dict""" - - fields = spec.split(';') - result = dict() - for field in fields: - field = field.strip() - if not field: - continue - key, equals, value = field.partition('=') - if equals == '=' and value == '': - raise ValueError('No value given') - key = key.strip() - if key not in AdapterParser.parameters: - raise KeyError('Unknown parameter {}'.format(key)) - # unabbreviate - while AdapterParser.parameters[key] is not None: - key = AdapterParser.parameters[key] - value = value.strip() - if value == '': - value = True - else: - try: - value = int(value) - except ValueError: - value = float(value) - if key in result: - raise KeyError('Key {} specified twice'.format(key)) - result[key] = value - return result - - @staticmethod - def _parse_not_linked(spec, cmdline_type): - """ - Parse an adapter specification for a non-linked adapter (without '...') - - Allow: - 'back' and ADAPTER - 'back' and ADAPTERX - 'back' and ADAPTER$ - 'front' and ADAPTER - 'front' and XADAPTER - 'front' and ^ADAPTER - 'anywhere' and ADAPTER - """ - error = ValueError( - "You cannot use multiple placement restrictions for an adapter at the same time. 
" - "Choose one of ^ADAPTER, ADAPTER$, XADAPTER or ADAPTERX") - spec, middle, parameters_spec = spec.partition(';') - name, spec = AdapterParser._extract_name(spec) - spec = spec.strip() - - parameters = AdapterParser._parse_parameters(parameters_spec) - - spec = expand_braces(spec) - - # Special case for adapters consisting of only X characters: - # This needs to be supported for backwards-compatibilitity - if len(spec.strip('X')) == 0: - return name, None, spec, {} - - front_restriction = None - if spec.startswith('^'): - front_restriction = 'anchored' - spec = spec[1:] - if spec.upper().startswith('X'): - if front_restriction is not None: - raise error - front_restriction = 'noninternal' - spec = spec.lstrip('xX') - - back_restriction = None - if spec.endswith('$'): - back_restriction = 'anchored' - spec = spec[:-1] - if spec.upper().endswith('X'): - if back_restriction is not None: - raise error - back_restriction = 'noninternal' - spec = spec.rstrip('xX') - - n_placement_restrictions = int(bool(front_restriction)) + int(bool(back_restriction)) - if n_placement_restrictions > 1: - raise error - - if cmdline_type == 'front' and back_restriction: - raise ValueError( - "Allowed placement restrictions for a 5' adapter are XADAPTER and ^ADAPTER") - if cmdline_type == 'back' and front_restriction: - raise ValueError( - "Allowed placement restrictions for a 3' adapter are ADAPTERX and ADAPTER$") - - assert front_restriction is None or back_restriction is None - if front_restriction is not None: - restriction = front_restriction - else: - restriction = back_restriction - - if cmdline_type == 'anywhere' and restriction is not None: - raise ValueError( - "Placement restrictions (with X, ^, $) not supported for 'anywhere' (-b) adapters") - - return name, restriction, spec, parameters - - @staticmethod - def _restriction_to_where(cmdline_type, restriction): - if cmdline_type == 'front': - if restriction is None: - return FRONT - elif restriction == 'anchored': - return PREFIX - elif restriction == 'noninternal': - return FRONT_NOT_INTERNAL - else: - raise ValueError( - 'Value {} for a front restriction not allowed'.format(restriction)) - elif cmdline_type == 'back': - if restriction is None: - return BACK - elif restriction == 'anchored': - return SUFFIX - elif restriction == 'noninternal': - return BACK_NOT_INTERNAL - else: - raise ValueError( - 'Value {} for a back restriction not allowed'.format(restriction)) - else: - assert cmdline_type == 'anywhere' - if restriction is None: - return ANYWHERE - else: - raise ValueError('Not placement may be specified for "anywhere" adapters') - - def _parse(self, spec, cmdline_type='back', name=None): - """ - Parse an adapter specification not using ``file:`` notation and return - an object of an appropriate Adapter class. - - name -- Adapter name if not included as part of the spec. (If spec is - 'name=ADAPTER', name will be 'name'.) - - cmdline_type -- describes which commandline parameter was used (``-a`` - is 'back', ``-b`` is 'anywhere', and ``-g`` is 'front'). - """ - if cmdline_type not in ('front', 'back', 'anywhere'): - raise ValueError('cmdline_type cannot be {!r}'.format(cmdline_type)) - spec1, middle, spec2 = spec.partition('...') - del spec - - # Handle linked adapter - if middle == '...' 
and spec1 and spec2: - if cmdline_type == 'anywhere': - raise ValueError("'anywhere' (-b) adapters may not be linked") - name1, front_restriction, sequence1, parameters1 = self._parse_not_linked(spec1, 'front') - name2, back_restriction, sequence2, parameters2 = self._parse_not_linked(spec2, 'back') - if not name: - name = name1 - - # Automatically anchor the 5' adapter if -a is used - if cmdline_type == 'back' and front_restriction is None: - front_restriction = 'anchored' - - front_anchored = front_restriction is not None - back_anchored = back_restriction is not None - - front_parameters = self.default_parameters.copy() - front_parameters.update(parameters1) - back_parameters = self.default_parameters.copy() - back_parameters.update(parameters2) - - if cmdline_type == 'front': - # -g requires both adapters to be present - front_required = True - back_required = True - else: - # -a requires only the anchored adapters to be present - front_required = front_anchored - back_required = back_anchored - - front_where = self._restriction_to_where('front', front_restriction) - back_where = self._restriction_to_where('back', back_restriction) - front_adapter = Adapter(sequence1, where=front_where, name=None, **front_parameters) - back_adapter = Adapter(sequence2, where=back_where, name=None, **back_parameters) - - return LinkedAdapter( - front_adapter=front_adapter, - back_adapter=back_adapter, - front_required=front_required, - back_required=back_required, - name=name, - ) - - if middle == '...': - if not spec1: - if cmdline_type == 'back': # -a ...ADAPTER - spec = spec2 - else: # -g ...ADAPTER - raise ValueError('Invalid adapter specification') - elif not spec2: - if cmdline_type == 'back': # -a ADAPTER... - cmdline_type = 'front' - spec = '^' + spec1 - else: # -g ADAPTER... - spec = spec1 - else: - assert False, 'This should not happen' - else: - spec = spec1 - - # TODO - specname, restriction, sequence, parameters = self._parse_not_linked(spec, cmdline_type) - del spec - - where = self._restriction_to_where(cmdline_type, restriction) - - if not name: - name = specname - if parameters.get('anywhere', False): - parameters['remove'] = WHERE_TO_REMOVE_MAP[where] - where = ANYWHERE - del parameters['anywhere'] - params = self.default_parameters.copy() - params.update(parameters) - if where in (FRONT, BACK): - adapter_class = BackOrFrontAdapter - else: - adapter_class = Adapter - return adapter_class(sequence=sequence, where=where, name=name, **params) - - def parse(self, spec, cmdline_type='back'): - """ - Parse an adapter specification and yield appropriate Adapter classes. - This works like the _parse_no_file() function above, but also supports the - ``file:`` notation for reading adapters from an external FASTA - file. Since a file can contain multiple adapters, this - function is a generator. - """ - if spec.startswith('file:'): - # read adapter sequences from a file - with FastaReader(spec[5:]) as fasta: - for record in fasta: - name = record.name.split(None, 1)[0] - yield self._parse(record.sequence, cmdline_type, name=name) - else: - yield self._parse(spec, cmdline_type, name=None) - - def parse_multi(self, back, anywhere, front): - """ - Parse all three types of commandline options that can be used to - specify adapters. back, anywhere and front are lists of strings, - corresponding to the respective commandline types (-a, -b, -g). - - Return a list of appropriate Adapter classes. 
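Taken together, the rules in ``_parse_not_linked``, ``_restriction_to_where`` and ``_parse`` above map the command-line syntax to placements roughly as follows (a summary sketch; the constants are the ones defined at the top of this module)::

    # -g ADAPTER   -> FRONT                  -a ADAPTER   -> BACK
    # -g ^ADAPTER  -> PREFIX (anchored)      -a ADAPTER$  -> SUFFIX (anchored)
    # -g XADAPTER  -> FRONT_NOT_INTERNAL     -a ADAPTERX  -> BACK_NOT_INTERNAL
    # -a FIRST...SECOND -> linked adapter; with -a, FIRST is auto-anchored
    # -a ADAPTER...     -> rewritten to -g ^ADAPTER
    # -a ...ADAPTER     -> plain 3' adapter (BACK)

Programmatic use mirrors the command line; a hedged example whose keyword set matches the constructor call shown in ``__main__.py``::

    from cutadapt.adapters import AdapterParser

    parser = AdapterParser(max_error_rate=0.1, min_overlap=3,
                           read_wildcards=False, adapter_wildcards=True, indels=True)
    adapters = parser.parse_multi(
        ['TTAAGGCC...GGCCTTAA'],  # -a (back), here a linked adapter
        [],                       # -b (anywhere)
        ['^AACCGGTT'],            # -g (front), anchored
    )
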
- """ - adapters = [] - for specs, cmdline_type in (back, 'back'), (anywhere, 'anywhere'), (front, 'front'): - for spec in specs: - adapters.extend(self.parse(spec, cmdline_type)) - return adapters + """ + Factory for Adapter classes that all use the same parameters (error rate, + indels etc.). The given **kwargs will be passed to the Adapter constructors. + """ + def __init__(self, **kwargs): + # kwargs: max_error_rate, min_overlap, read_wildcards, adapter_wildcards, indels + self.default_parameters = kwargs + + @staticmethod + def _extract_name(spec): + """ + Parse an adapter specification given as 'name=adapt' into 'name' and 'adapt'. + """ + fields = spec.split('=', 1) + if len(fields) > 1: + name, spec = fields + name = name.strip() + else: + name = None + spec = spec.strip() + return name, spec + + parameters = { + # abbreviations + 'e': 'max_error_rate', + 'error_rate': 'max_error_rate', + 'o': 'min_overlap', + + # allowed parameters + 'max_error_rate': None, + 'min_overlap': None, + 'anywhere': None, + } + + @staticmethod + def _parse_parameters(spec): + """Parse key=value;key=value;key=value into a dict""" + + fields = spec.split(';') + result = dict() + for field in fields: + field = field.strip() + if not field: + continue + key, equals, value = field.partition('=') + if equals == '=' and value == '': + raise ValueError('No value given') + key = key.strip() + if key not in AdapterParser.parameters: + raise KeyError('Unknown parameter {}'.format(key)) + # unabbreviate + while AdapterParser.parameters[key] is not None: + key = AdapterParser.parameters[key] + value = value.strip() + if value == '': + value = True + else: + try: + value = int(value) + except ValueError: + value = float(value) + if key in result: + raise KeyError('Key {} specified twice'.format(key)) + result[key] = value + return result + + @staticmethod + def _parse_not_linked(spec, cmdline_type): + """ + Parse an adapter specification for a non-linked adapter (without '...') + + Allow: + 'back' and ADAPTER + 'back' and ADAPTERX + 'back' and ADAPTER$ + 'front' and ADAPTER + 'front' and XADAPTER + 'front' and ^ADAPTER + 'anywhere' and ADAPTER + """ + error = ValueError( + "You cannot use multiple placement restrictions for an adapter at the same time. 
" + "Choose one of ^ADAPTER, ADAPTER$, XADAPTER or ADAPTERX") + spec, middle, parameters_spec = spec.partition(';') + name, spec = AdapterParser._extract_name(spec) + spec = spec.strip() + + parameters = AdapterParser._parse_parameters(parameters_spec) + + spec = expand_braces(spec) + + # Special case for adapters consisting of only X characters: + # This needs to be supported for backwards-compatibilitity + if len(spec.strip('X')) == 0: + return name, None, spec, {} + + front_restriction = None + if spec.startswith('^'): + front_restriction = 'anchored' + spec = spec[1:] + if spec.upper().startswith('X'): + if front_restriction is not None: + raise error + front_restriction = 'noninternal' + spec = spec.lstrip('xX') + + back_restriction = None + if spec.endswith('$'): + back_restriction = 'anchored' + spec = spec[:-1] + if spec.upper().endswith('X'): + if back_restriction is not None: + raise error + back_restriction = 'noninternal' + spec = spec.rstrip('xX') + + n_placement_restrictions = int(bool(front_restriction)) + int(bool(back_restriction)) + if n_placement_restrictions > 1: + raise error + + if cmdline_type == 'front' and back_restriction: + raise ValueError( + "Allowed placement restrictions for a 5' adapter are XADAPTER and ^ADAPTER") + if cmdline_type == 'back' and front_restriction: + raise ValueError( + "Allowed placement restrictions for a 3' adapter are ADAPTERX and ADAPTER$") + + assert front_restriction is None or back_restriction is None + if front_restriction is not None: + restriction = front_restriction + else: + restriction = back_restriction + + if cmdline_type == 'anywhere' and restriction is not None: + raise ValueError( + "Placement restrictions (with X, ^, $) not supported for 'anywhere' (-b) adapters") + + return name, restriction, spec, parameters + + @staticmethod + def _restriction_to_where(cmdline_type, restriction): + if cmdline_type == 'front': + if restriction is None: + return FRONT + elif restriction == 'anchored': + return PREFIX + elif restriction == 'noninternal': + return FRONT_NOT_INTERNAL + else: + raise ValueError( + 'Value {} for a front restriction not allowed'.format(restriction)) + elif cmdline_type == 'back': + if restriction is None: + return BACK + elif restriction == 'anchored': + return SUFFIX + elif restriction == 'noninternal': + return BACK_NOT_INTERNAL + else: + raise ValueError( + 'Value {} for a back restriction not allowed'.format(restriction)) + else: + assert cmdline_type == 'anywhere' + if restriction is None: + return ANYWHERE + else: + raise ValueError('Not placement may be specified for "anywhere" adapters') + + def _parse(self, spec, cmdline_type='back', name=None): + """ + Parse an adapter specification not using ``file:`` notation and return + an object of an appropriate Adapter class. + + name -- Adapter name if not included as part of the spec. (If spec is + 'name=ADAPTER', name will be 'name'.) + + cmdline_type -- describes which commandline parameter was used (``-a`` + is 'back', ``-b`` is 'anywhere', and ``-g`` is 'front'). + """ + if cmdline_type not in ('front', 'back', 'anywhere'): + raise ValueError('cmdline_type cannot be {!r}'.format(cmdline_type)) + spec1, middle, spec2 = spec.partition('...') + del spec + + # Handle linked adapter + if middle == '...' 
and spec1 and spec2: + if cmdline_type == 'anywhere': + raise ValueError("'anywhere' (-b) adapters may not be linked") + name1, front_restriction, sequence1, parameters1 = self._parse_not_linked(spec1, 'front') + name2, back_restriction, sequence2, parameters2 = self._parse_not_linked(spec2, 'back') + if not name: + name = name1 + + # Automatically anchor the 5' adapter if -a is used + if cmdline_type == 'back' and front_restriction is None: + front_restriction = 'anchored' + + front_anchored = front_restriction is not None + back_anchored = back_restriction is not None + + front_parameters = self.default_parameters.copy() + front_parameters.update(parameters1) + back_parameters = self.default_parameters.copy() + back_parameters.update(parameters2) + + if cmdline_type == 'front': + # -g requires both adapters to be present + front_required = True + back_required = True + else: + # -a requires only the anchored adapters to be present + front_required = front_anchored + back_required = back_anchored + + front_where = self._restriction_to_where('front', front_restriction) + back_where = self._restriction_to_where('back', back_restriction) + front_adapter = Adapter(sequence1, where=front_where, name=None, **front_parameters) + back_adapter = Adapter(sequence2, where=back_where, name=None, **back_parameters) + + return LinkedAdapter( + front_adapter=front_adapter, + back_adapter=back_adapter, + front_required=front_required, + back_required=back_required, + name=name, + ) + + if middle == '...': + if not spec1: + if cmdline_type == 'back': # -a ...ADAPTER + spec = spec2 + else: # -g ...ADAPTER + raise ValueError('Invalid adapter specification') + elif not spec2: + if cmdline_type == 'back': # -a ADAPTER... + cmdline_type = 'front' + spec = '^' + spec1 + else: # -g ADAPTER... + spec = spec1 + else: + assert False, 'This should not happen' + else: + spec = spec1 + + # TODO + specname, restriction, sequence, parameters = self._parse_not_linked(spec, cmdline_type) + del spec + + where = self._restriction_to_where(cmdline_type, restriction) + + if not name: + name = specname + if parameters.get('anywhere', False): + parameters['remove'] = WHERE_TO_REMOVE_MAP[where] + where = ANYWHERE + del parameters['anywhere'] + params = self.default_parameters.copy() + params.update(parameters) + if where in (FRONT, BACK): + adapter_class = BackOrFrontAdapter + else: + adapter_class = Adapter + return adapter_class(sequence=sequence, where=where, name=name, **params) + + def parse(self, spec, cmdline_type='back'): + """ + Parse an adapter specification and yield appropriate Adapter classes. + This works like the _parse_no_file() function above, but also supports the + ``file:`` notation for reading adapters from an external FASTA + file. Since a file can contain multiple adapters, this + function is a generator. + """ + if spec.startswith('file:'): + # read adapter sequences from a file + with FastaReader(spec[5:]) as fasta: + for record in fasta: + name = record.name.split(None, 1)[0] + yield self._parse(record.sequence, cmdline_type, name=name) + else: + yield self._parse(spec, cmdline_type, name=None) + + def parse_multi(self, back, anywhere, front): + """ + Parse all three types of commandline options that can be used to + specify adapters. back, anywhere and front are lists of strings, + corresponding to the respective commandline types (-a, -b, -g). + + Return a list of appropriate Adapter classes. 
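
# Illustrative usage sketch of the parser above (editor's example, not part of
# the tabs-to-spaces change itself). It assumes AdapterParser is importable
# from cutadapt.adapters, as this file suggests; the sequences are made up.
#
#     from cutadapt.adapters import AdapterParser
#
#     parser = AdapterParser(max_error_rate=0.1, min_overlap=3)
#     # parse() is a generator because a spec may be 'file:adapters.fasta'
#     back = list(parser.parse('myadapter=ACGTACGT;e=0.2', cmdline_type='back'))
#     linked = list(parser.parse('ACGT...TTTTTT', cmdline_type='back'))
#     # parse_multi() mirrors the -a/-b/-g command-line options
#     adapters = parser.parse_multi(back=['ACGTACGT'], anywhere=[], front=['^TTAAGGCC'])
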
+ """ + adapters = [] + for specs, cmdline_type in (back, 'back'), (anywhere, 'anywhere'), (front, 'front'): + for spec in specs: + adapters.extend(self.parse(spec, cmdline_type)) + return adapters def returns_defaultdict_int(): - # We need this function to make EndStatistics picklable. - # Even a @staticmethod of EndStatistics is not sufficient - # as that is not picklable before Python 3.5. - return defaultdict(int) + # We need this function to make EndStatistics picklable. + # Even a @staticmethod of EndStatistics is not sufficient + # as that is not picklable before Python 3.5. + return defaultdict(int) class EndStatistics: - """Statistics about the 5' or 3' end""" - - def __init__(self, adapter): - self.where = adapter.where - self.max_error_rate = adapter.max_error_rate - self.sequence = adapter.sequence - self.has_wildcards = adapter.adapter_wildcards - # self.errors[l][e] == n iff n times a sequence of length l matching at e errors was removed - self.errors = defaultdict(returns_defaultdict_int) - self._remove = adapter.remove - self.adjacent_bases = {'A': 0, 'C': 0, 'G': 0, 'T': 0, '': 0} - - def __iadd__(self, other): - if (self.where != other.where or self._remove != other._remove or - self.max_error_rate != other.max_error_rate or self.sequence != other.sequence): - raise RuntimeError('Incompatible EndStatistics, cannot be added') - for base in ('A', 'C', 'G', 'T', ''): - self.adjacent_bases[base] += other.adjacent_bases[base] - for length, error_dict in other.errors.items(): - for errors in error_dict: - self.errors[length][errors] += other.errors[length][errors] - return self - - @property - def lengths(self): - # Python 2.6 has no dict comprehension - d = dict((length, sum(errors.values())) for length, errors in self.errors.items()) - return d - - def random_match_probabilities(self, gc_content): - """ - Estimate probabilities that this adapter end matches a - random sequence. Indels are not taken into account. - - Returns a list p, where p[i] is the probability that - i bases of this adapter match a random sequence with - GC content gc_content. - - The where parameter is necessary for linked adapters to - specify which (front or back) of the two adapters is meant. - """ - seq = self.sequence - # FIXME this is broken for self._remove == 'auto' - if self._remove == 'prefix': - seq = seq[::-1] - allowed_bases = 'CGRYSKMBDHVN' if self.has_wildcards else 'GC' - p = 1 - probabilities = [p] - for i, c in enumerate(seq): - if c in allowed_bases: - p *= gc_content / 2. 
-            else:
-                p *= (1 - gc_content) / 2
-            probabilities.append(p)
-        return probabilities
+    """Statistics about the 5' or 3' end"""
+
+    def __init__(self, adapter):
+        self.where = adapter.where
+        self.max_error_rate = adapter.max_error_rate
+        self.sequence = adapter.sequence
+        self.has_wildcards = adapter.adapter_wildcards
+        # self.errors[l][e] == n iff n times a sequence of length l matching at e errors was removed
+        self.errors = defaultdict(returns_defaultdict_int)
+        self._remove = adapter.remove
+        self.adjacent_bases = {'A': 0, 'C': 0, 'G': 0, 'T': 0, '': 0}
+
+    def __iadd__(self, other):
+        if (self.where != other.where or self._remove != other._remove or
+                self.max_error_rate != other.max_error_rate or self.sequence != other.sequence):
+            raise RuntimeError('Incompatible EndStatistics, cannot be added')
+        for base in ('A', 'C', 'G', 'T', ''):
+            self.adjacent_bases[base] += other.adjacent_bases[base]
+        for length, error_dict in other.errors.items():
+            for errors in error_dict:
+                self.errors[length][errors] += other.errors[length][errors]
+        return self
+
+    @property
+    def lengths(self):
+        d = dict((length, sum(errors.values())) for length, errors in self.errors.items())
+        return d
+
+    def random_match_probabilities(self, gc_content):
+        """
+        Estimate probabilities that this adapter end matches a
+        random sequence. Indels are not taken into account.
+
+        Returns a list p, where p[i] is the probability that
+        i bases of this adapter match a random sequence with
+        GC content gc_content.
+        """
+        seq = self.sequence
+        # FIXME this is broken for self._remove == 'auto'
+        if self._remove == 'prefix':
+            seq = seq[::-1]
+        allowed_bases = 'CGRYSKMBDHVN' if self.has_wildcards else 'GC'
+        p = 1
+        probabilities = [p]
+        for i, c in enumerate(seq):
+            if c in allowed_bases:
+                p *= gc_content / 2.
+            else:
+                p *= (1 - gc_content) / 2
+            probabilities.append(p)
+        return probabilities


 class AdapterStatistics:
-    """
-    Statistics about an adapter. An adapter can work on the 5' end (front)
-    or 3' end (back) of a read, and statistics for that are captured
-    separately.
-    """
-
-    def __init__(self, adapter, adapter2=None, where=None):
-        self.name = adapter.name
-        self.where = where if where is not None else adapter.where
-        self.front = EndStatistics(adapter)
-        if adapter2 is None:
-            self.back = EndStatistics(adapter)
-        else:
-            self.back = EndStatistics(adapter2)
-
-    def __iadd__(self, other):
-        if self.where != other.where:  # TODO self.name != other.name or
-            raise ValueError('incompatible objects')
-        self.front += other.front
-        self.back += other.back
-        return self
+    """
+    Statistics about an adapter. An adapter can work on the 5' end (front)
+    or 3' end (back) of a read, and statistics for that are captured
+    separately.
+    """
+
+    def __init__(self, adapter, adapter2=None, where=None):
+        self.name = adapter.name
+        self.where = where if where is not None else adapter.where
+        self.front = EndStatistics(adapter)
+        if adapter2 is None:
+            self.back = EndStatistics(adapter)
+        else:
+            self.back = EndStatistics(adapter2)
+
+    def __iadd__(self, other):
+        if self.where != other.where:  # TODO self.name != other.name or
+            raise ValueError('incompatible objects')
+        self.front += other.front
+        self.back += other.back
+        return self


 class Match:
-    """
-    Representation of a single adapter matched to a single read.
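
# Worked example of the probability recurrence above (editor's standalone
# sketch): each G/C adapter position matches a random base with probability
# gc_content/2, any other position with (1 - gc_content)/2.
gc_content = 0.5
p = 1.0
probabilities = [p]
for c in 'GCAT':  # a made-up 4 nt adapter
    p *= gc_content / 2 if c in 'GC' else (1 - gc_content) / 2
    probabilities.append(p)
assert probabilities == [1.0, 0.25, 0.0625, 0.015625, 0.00390625]
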
- """ - __slots__ = ['astart', 'astop', 'rstart', 'rstop', 'matches', 'errors', 'remove_before', - 'adapter', 'read', 'length', '_trimmed_read', 'adjacent_base'] - - def __init__(self, astart, astop, rstart, rstop, matches, errors, remove_before, adapter, read): - """ - remove_before -- True: remove bases before adapter. False: remove after - """ - self.astart = astart - self.astop = astop - self.rstart = rstart - self.rstop = rstop - self.matches = matches - self.errors = errors - self.adapter = adapter - self.read = read - if remove_before: - # Compute the trimmed read, assuming it’s a 'front' adapter - self._trimmed_read = read[rstop:] - self.adjacent_base = '' - else: - # Compute the trimmed read, assuming it’s a 'back' adapter - self.adjacent_base = read.sequence[rstart - 1:rstart] - self._trimmed_read = read[:rstart] - self.remove_before = remove_before - # Number of aligned characters in the adapter. If there are - # indels, this may be different from the number of characters - # in the read. - self.length = astop - astart - - def __repr__(self): - return 'Match(astart={}, astop={}, rstart={}, rstop={}, matches={}, errors={})'.format( - self.astart, self.astop, self.rstart, self.rstop, self.matches, self.errors) - - def wildcards(self, wildcard_char='N'): - """ - Return a string that contains, for each wildcard character, - the character that it matches. For example, if the adapter - ATNGNA matches ATCGTA, then the string 'CT' is returned. - - If there are indels, this is not reliable as the full alignment - is not available. - """ - wildcards = [self.read.sequence[self.rstart + i] for i in range(self.length) - if self.adapter.sequence[self.astart + i] == wildcard_char and - self.rstart + i < len(self.read.sequence)] - return ''.join(wildcards) - - def rest(self): - """ - Return the part of the read before this match if this is a - 'front' (5') adapter, - return the part after the match if this is not a 'front' adapter (3'). - This can be an empty string. - """ - if self.remove_before: - return self.read.sequence[:self.rstart] - else: - return self.read.sequence[self.rstop:] - - def get_info_record(self): - seq = self.read.sequence - qualities = self.read.qualities - info = ( - self.read.name, - self.errors, - self.rstart, - self.rstop, - seq[0:self.rstart], - seq[self.rstart:self.rstop], - seq[self.rstop:], - self.adapter.name - ) - if qualities: - info += ( - qualities[0:self.rstart], - qualities[self.rstart:self.rstop], - qualities[self.rstop:] - ) - else: - info += ('', '', '') - - return info - - def trimmed(self): - return self._trimmed_read - - def update_statistics(self, statistics): - """Update AdapterStatistics in place""" - if self.remove_before: - statistics.front.errors[self.rstop][self.errors] += 1 - else: - statistics.back.errors[len(self.read) - len(self._trimmed_read)][self.errors] += 1 - try: - statistics.back.adjacent_bases[self.adjacent_base] += 1 - except KeyError: - statistics.back.adjacent_bases[''] = 1 + """ + Representation of a single adapter matched to a single read. + """ + __slots__ = ['astart', 'astop', 'rstart', 'rstop', 'matches', 'errors', 'remove_before', + 'adapter', 'read', 'length', '_trimmed_read', 'adjacent_base'] + + def __init__(self, astart, astop, rstart, rstop, matches, errors, remove_before, adapter, read): + """ + remove_before -- True: remove bases before adapter. 
False: remove after + """ + self.astart = astart + self.astop = astop + self.rstart = rstart + self.rstop = rstop + self.matches = matches + self.errors = errors + self.adapter = adapter + self.read = read + if remove_before: + # Compute the trimmed read, assuming it’s a 'front' adapter + self._trimmed_read = read[rstop:] + self.adjacent_base = '' + else: + # Compute the trimmed read, assuming it’s a 'back' adapter + self.adjacent_base = read.sequence[rstart - 1:rstart] + self._trimmed_read = read[:rstart] + self.remove_before = remove_before + # Number of aligned characters in the adapter. If there are + # indels, this may be different from the number of characters + # in the read. + self.length = astop - astart + + def __repr__(self): + return 'Match(astart={}, astop={}, rstart={}, rstop={}, matches={}, errors={})'.format( + self.astart, self.astop, self.rstart, self.rstop, self.matches, self.errors) + + def wildcards(self, wildcard_char='N'): + """ + Return a string that contains, for each wildcard character, + the character that it matches. For example, if the adapter + ATNGNA matches ATCGTA, then the string 'CT' is returned. + + If there are indels, this is not reliable as the full alignment + is not available. + """ + wildcards = [self.read.sequence[self.rstart + i] for i in range(self.length) + if self.adapter.sequence[self.astart + i] == wildcard_char and + self.rstart + i < len(self.read.sequence)] + return ''.join(wildcards) + + def rest(self): + """ + Return the part of the read before this match if this is a + 'front' (5') adapter, + return the part after the match if this is not a 'front' adapter (3'). + This can be an empty string. + """ + if self.remove_before: + return self.read.sequence[:self.rstart] + else: + return self.read.sequence[self.rstop:] + + def get_info_record(self): + seq = self.read.sequence + qualities = self.read.qualities + info = ( + self.read.name, + self.errors, + self.rstart, + self.rstop, + seq[0:self.rstart], + seq[self.rstart:self.rstop], + seq[self.rstop:], + self.adapter.name + ) + if qualities: + info += ( + qualities[0:self.rstart], + qualities[self.rstart:self.rstop], + qualities[self.rstop:] + ) + else: + info += ('', '', '') + + return info + + def trimmed(self): + return self._trimmed_read + + def update_statistics(self, statistics): + """Update AdapterStatistics in place""" + if self.remove_before: + statistics.front.errors[self.rstop][self.errors] += 1 + else: + statistics.back.errors[len(self.read) - len(self._trimmed_read)][self.errors] += 1 + try: + statistics.back.adjacent_bases[self.adjacent_base] += 1 + except KeyError: + statistics.back.adjacent_bases[''] = 1 def _generate_adapter_name(_start=[1]): - name = str(_start[0]) - _start[0] += 1 - return name + name = str(_start[0]) + _start[0] += 1 + return name class Adapter: - """ - This class can find a single adapter characterized by sequence, error rate, - type etc. within reads. - - where -- One of the BACK, FRONT, PREFIX, SUFFIX or ANYWHERE constants. - This influences where the adapter is allowed to appear within the - read. - - remove -- describes which part of the read to remove if the adapter was found: - * "prefix" (for a 3' adapter) - * "suffix" (for a 5' adapter) - * "auto" for a 5'/3' mixed adapter (if the match involves the first base of the read, it - is assumed to be a 5' adapter and a 3' otherwise) - * None: One of the above is chosen depending on the 'where' parameter - - sequence -- The adapter sequence as string. Will be converted to uppercase. 
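
# Illustration of Match.wildcards() and trimming (editor's sketch, mirroring
# the ATNGNA/ATCGTA example in the docstring above). The tiny Read class is a
# stand-in for a dnaio record; Adapter and BACK are assumed importable from
# cutadapt.adapters, as the code in this file suggests.
class Read:
    def __init__(self, name, sequence, qualities=None):
        self.name, self.sequence, self.qualities = name, sequence, qualities
    def __len__(self):
        return len(self.sequence)
    def __getitem__(self, key):
        q = self.qualities[key] if self.qualities is not None else None
        return Read(self.name, self.sequence[key], q)

from cutadapt.adapters import Adapter, BACK

adapter = Adapter('ATNGNA', where=BACK, remove='suffix')
match = adapter.match_to(Read('r1', 'GGGGATCGTA'))
assert match is not None
assert match.wildcards() == 'CT'  # the two N positions matched C and T
assert match.trimmed().sequence == 'GGGG'  # 3' adapter: the prefix is kept
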
-    Also, Us will be converted to Ts.
-
-    max_error_rate -- Maximum allowed error rate. The error rate is
-    the number of errors in the alignment divided by the length
-    of the part of the alignment that matches the adapter.
-
-    minimum_overlap -- Minimum length of the part of the alignment
-    that matches the adapter.
-
-    read_wildcards -- Whether IUPAC wildcards in the read are allowed.
-
-    adapter_wildcards -- Whether IUPAC wildcards in the adapter are
-    allowed.
-
-    name -- optional name of the adapter. If not provided, the name is set to a
-    unique number.
-    """
-
-    def __init__(self, sequence, where, remove=None, max_error_rate=0.1, min_overlap=3,
-            read_wildcards=False, adapter_wildcards=True, name=None, indels=True):
-        self._debug = False
-        self.name = _generate_adapter_name() if name is None else name
-        self.sequence = sequence.upper().replace('U', 'T')
-        if not self.sequence:
-            raise ValueError('Sequence is empty')
-        self.where = where
-        if remove not in (None, 'prefix', 'suffix', 'auto'):
-            raise ValueError('remove parameter must be "prefix", "suffix", "auto" or None')
-        self.remove = WHERE_TO_REMOVE_MAP[where] if remove is None else remove
-        self.max_error_rate = max_error_rate
-        self.min_overlap = min(min_overlap, len(self.sequence))
-        iupac = frozenset('XACGTURYSWKMBDHVN')
-        if adapter_wildcards and not set(self.sequence) <= iupac:
-            for c in self.sequence:
-                if c not in iupac:
-                    raise ValueError('Character {!r} in adapter sequence {!r} is '
-                        'not a valid IUPAC code. Use only characters '
-                        'XACGTURYSWKMBDHVN.'.format(c, self.sequence))
-        # Optimization: Use non-wildcard matching if only ACGT is used
-        self.adapter_wildcards = adapter_wildcards and not set(self.sequence) <= set('ACGT')
-        self.read_wildcards = read_wildcards
-
-        self.aligner = align.Aligner(self.sequence, self.max_error_rate,
-            flags=self.where, wildcard_ref=self.adapter_wildcards, wildcard_query=self.read_wildcards)
-        self.aligner.min_overlap = self.min_overlap
-        self.indels = indels
-        if not self.indels:
-            # TODO
-            # When indels are disallowed, an entirely different algorithm
-            # should be used.
-            self.aligner.indel_cost = 100000
-
-    def __repr__(self):
-        return '<Adapter(name={name!r}, sequence={sequence!r}, where={where}, remove={remove}, max_error_rate={max_error_rate}, min_overlap={min_overlap}, read_wildcards={read_wildcards}, adapter_wildcards={adapter_wildcards}, indels={indels})>'.format(**vars(self))
-
-    def enable_debug(self):
-        """
-        Print out the dynamic programming matrix after matching a read to an
-        adapter.
-        """
-        self._debug = True
-        self.aligner.enable_debug()
-
-    def match_to(self, read):
-        """
-        Attempt to match this adapter to the given read.
-
-        Return a Match instance if a match was found;
-        return None if no match was found given the matching criteria (minimum
-        overlap length, maximum error rate).
- """ - read_seq = read.sequence.upper() # temporary copy - pos = -1 - - # try to find an exact match first unless wildcards are allowed - if not self.adapter_wildcards: - if self.where == PREFIX: - pos = 0 if read_seq.startswith(self.sequence) else -1 - elif self.where == SUFFIX: - pos = (len(read_seq) - len(self.sequence)) if read_seq.endswith(self.sequence) else -1 - elif self.where == BACK or self.where == FRONT: - pos = read_seq.find(self.sequence) - # TODO BACK_NOT_INTERNAL, FRONT_NOT_INTERNAL - if pos >= 0: - match_args = ( - 0, len(self.sequence), pos, pos + len(self.sequence), - len(self.sequence), 0) - else: - # try approximate matching - if not self.indels and self.where in (PREFIX, SUFFIX): - if self.where == PREFIX: - alignment = align.compare_prefixes(self.sequence, read_seq, - wildcard_ref=self.adapter_wildcards, wildcard_query=self.read_wildcards) - else: - alignment = align.compare_suffixes(self.sequence, read_seq, - wildcard_ref=self.adapter_wildcards, wildcard_query=self.read_wildcards) - astart, astop, rstart, rstop, matches, errors = alignment - if astop - astart >= self.min_overlap and errors / (astop - astart) <= self.max_error_rate: - match_args = alignment - else: - match_args = None - else: - alignment = self.aligner.locate(read_seq) - if self._debug: - print(self.aligner.dpmatrix) # pragma: no cover - if alignment is None: - match_args = None - else: - astart, astop, rstart, rstop, matches, errors = alignment - match_args = (astart, astop, rstart, rstop, matches, errors) - - if match_args is None: - return None - if self.remove == 'auto': - # guess: if alignment starts at pos 0, it’s a 5' adapter - remove_before = match_args[2] == 0 # index 2 is rstart - else: - remove_before = self.remove == 'prefix' - match = Match(*match_args, remove_before=remove_before, adapter=self, read=read) - - assert match.length >= self.min_overlap - return match - - def __len__(self): - return len(self.sequence) - - def create_statistics(self): - return AdapterStatistics(self) + """ + This class can find a single adapter characterized by sequence, error rate, + type etc. within reads. + + where -- One of the BACK, FRONT, PREFIX, SUFFIX or ANYWHERE constants. + This influences where the adapter is allowed to appear within the + read. + + remove -- describes which part of the read to remove if the adapter was found: + * "prefix" (for a 3' adapter) + * "suffix" (for a 5' adapter) + * "auto" for a 5'/3' mixed adapter (if the match involves the first base of the read, it + is assumed to be a 5' adapter and a 3' otherwise) + * None: One of the above is chosen depending on the 'where' parameter + + sequence -- The adapter sequence as string. Will be converted to uppercase. + Also, Us will be converted to Ts. + + max_error_rate -- Maximum allowed error rate. The error rate is + the number of errors in the alignment divided by the length + of the part of the alignment that matches the adapter. + + minimum_overlap -- Minimum length of the part of the alignment + that matches the adapter. + + read_wildcards -- Whether IUPAC wildcards in the read are allowed. + + adapter_wildcards -- Whether IUPAC wildcards in the adapter are + allowed. + + name -- optional name of the adapter. If not provided, the name is set to a + unique number. 
+    """
+
+    def __init__(self, sequence, where, remove=None, max_error_rate=0.1, min_overlap=3,
+            read_wildcards=False, adapter_wildcards=True, name=None, indels=True):
+        self._debug = False
+        self.name = _generate_adapter_name() if name is None else name
+        self.sequence = sequence.upper().replace('U', 'T')
+        if not self.sequence:
+            raise ValueError('Sequence is empty')
+        self.where = where
+        if remove not in (None, 'prefix', 'suffix', 'auto'):
+            raise ValueError('remove parameter must be "prefix", "suffix", "auto" or None')
+        self.remove = WHERE_TO_REMOVE_MAP[where] if remove is None else remove
+        self.max_error_rate = max_error_rate
+        self.min_overlap = min(min_overlap, len(self.sequence))
+        iupac = frozenset('XACGTURYSWKMBDHVN')
+        if adapter_wildcards and not set(self.sequence) <= iupac:
+            for c in self.sequence:
+                if c not in iupac:
+                    raise ValueError('Character {!r} in adapter sequence {!r} is '
+                        'not a valid IUPAC code. Use only characters '
+                        'XACGTURYSWKMBDHVN.'.format(c, self.sequence))
+        # Optimization: Use non-wildcard matching if only ACGT is used
+        self.adapter_wildcards = adapter_wildcards and not set(self.sequence) <= set('ACGT')
+        self.read_wildcards = read_wildcards
+
+        self.aligner = align.Aligner(self.sequence, self.max_error_rate,
+            flags=self.where, wildcard_ref=self.adapter_wildcards, wildcard_query=self.read_wildcards)
+        self.aligner.min_overlap = self.min_overlap
+        self.indels = indels
+        if not self.indels:
+            # TODO
+            # When indels are disallowed, an entirely different algorithm
+            # should be used.
+            self.aligner.indel_cost = 100000
+
+    def __repr__(self):
+        return '<Adapter(name={name!r}, sequence={sequence!r}, where={where}, remove={remove}, max_error_rate={max_error_rate}, min_overlap={min_overlap}, read_wildcards={read_wildcards}, adapter_wildcards={adapter_wildcards}, indels={indels})>'.format(**vars(self))
+
+    def enable_debug(self):
+        """
+        Print out the dynamic programming matrix after matching a read to an
+        adapter.
+        """
+        self._debug = True
+        self.aligner.enable_debug()
+
+    def match_to(self, read):
+        """
+        Attempt to match this adapter to the given read.
+
+        Return a Match instance if a match was found;
+        return None if no match was found given the matching criteria (minimum
+        overlap length, maximum error rate).
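
# Sketch of the exact-match shortcut described above (editor's illustration,
# reusing the Read stand-in from the earlier sketch): a plain-ACGT 3' adapter
# is first looked up with str.find() before any alignment is attempted.
adapter = Adapter('ACGTACGT', where=BACK, remove='suffix')
match = adapter.match_to(Read('r2', 'TTTTTTTTTTACGTACGTCC'))
assert match is not None and match.errors == 0
assert match.trimmed().sequence == 'TTTTTTTTTT'  # bases before the adapter are kept
assert match.rest() == 'CC'  # bases after the adapter occurrence
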
+ """ + read_seq = read.sequence.upper() # temporary copy + pos = -1 + + # try to find an exact match first unless wildcards are allowed + if not self.adapter_wildcards: + if self.where == PREFIX: + pos = 0 if read_seq.startswith(self.sequence) else -1 + elif self.where == SUFFIX: + pos = (len(read_seq) - len(self.sequence)) if read_seq.endswith(self.sequence) else -1 + elif self.where == BACK or self.where == FRONT: + pos = read_seq.find(self.sequence) + # TODO BACK_NOT_INTERNAL, FRONT_NOT_INTERNAL + if pos >= 0: + match_args = ( + 0, len(self.sequence), pos, pos + len(self.sequence), + len(self.sequence), 0) + else: + # try approximate matching + if not self.indels and self.where in (PREFIX, SUFFIX): + if self.where == PREFIX: + alignment = align.compare_prefixes(self.sequence, read_seq, + wildcard_ref=self.adapter_wildcards, wildcard_query=self.read_wildcards) + else: + alignment = align.compare_suffixes(self.sequence, read_seq, + wildcard_ref=self.adapter_wildcards, wildcard_query=self.read_wildcards) + astart, astop, rstart, rstop, matches, errors = alignment + if astop - astart >= self.min_overlap and errors / (astop - astart) <= self.max_error_rate: + match_args = alignment + else: + match_args = None + else: + alignment = self.aligner.locate(read_seq) + if self._debug: + print(self.aligner.dpmatrix) # pragma: no cover + if alignment is None: + match_args = None + else: + astart, astop, rstart, rstop, matches, errors = alignment + match_args = (astart, astop, rstart, rstop, matches, errors) + + if match_args is None: + return None + if self.remove == 'auto': + # guess: if alignment starts at pos 0, it’s a 5' adapter + remove_before = match_args[2] == 0 # index 2 is rstart + else: + remove_before = self.remove == 'prefix' + match = Match(*match_args, remove_before=remove_before, adapter=self, read=read) + + assert match.length >= self.min_overlap + return match + + def __len__(self): + return len(self.sequence) + + def create_statistics(self): + return AdapterStatistics(self) class BackOrFrontAdapter(Adapter): - """A 5' or 3' adapter. - - This is separate from the Adapter class so that a specialized match_to - method can be implemented that reduces some of the runtime checks. - - TODO The generic Adapter class should become abstract, and the other - adapter types should also get their own classes. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.where == BACK or self.where == FRONT - self._remove_before = self.remove == 'prefix' - - def match_to(self, read): - """ - Attempt to match this adapter to the given read. - - Return a Match instance if a match was found; - return None if no match was found given the matching criteria (minimum - overlap length, maximum error rate). - """ - read_seq = read.sequence.upper() # temporary copy - pos = -1 - - # try to find an exact match first unless wildcards are allowed - if not self.adapter_wildcards: - pos = read_seq.find(self.sequence) - if pos >= 0: - alignment = ( - 0, len(self.sequence), pos, pos + len(self.sequence), - len(self.sequence), 0) - else: - alignment = self.aligner.locate(read_seq) - if self._debug: - print(self.aligner.dpmatrix) # pragma: no cover - if alignment is None: - return None - - match = Match(*alignment, remove_before=self._remove_before, adapter=self, read=read) - return match + """A 5' or 3' adapter. + + This is separate from the Adapter class so that a specialized match_to + method can be implemented that reduces some of the runtime checks. 
+
+    TODO The generic Adapter class should become abstract, and the other
+    adapter types should also get their own classes.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.where == BACK or self.where == FRONT
+        self._remove_before = self.remove == 'prefix'
+
+    def match_to(self, read):
+        """
+        Attempt to match this adapter to the given read.
+
+        Return a Match instance if a match was found;
+        return None if no match was found given the matching criteria (minimum
+        overlap length, maximum error rate).
+        """
+        read_seq = read.sequence.upper()  # temporary copy
+        pos = -1
+
+        # try to find an exact match first unless wildcards are allowed
+        if not self.adapter_wildcards:
+            pos = read_seq.find(self.sequence)
+        if pos >= 0:
+            alignment = (
+                0, len(self.sequence), pos, pos + len(self.sequence),
+                len(self.sequence), 0)
+        else:
+            alignment = self.aligner.locate(read_seq)
+        if self._debug:
+            print(self.aligner.dpmatrix)  # pragma: no cover
+        if alignment is None:
+            return None
+
+        match = Match(*alignment, remove_before=self._remove_before, adapter=self, read=read)
+        return match


 class LinkedMatch:
-    """
-    Represent a match of a LinkedAdapter
-    """
-    def __init__(self, front_match, back_match, adapter):
-        """
-        One of front_match and back_match must be not None!
-        """
-        self.front_match = front_match
-        self.back_match = back_match
-        self.adapter = adapter
-
-    def __repr__(self):
-        return '<LinkedMatch(front_match={!r}, back_match={!r}, adapter={})>'.format(
-            self.front_match, self.back_match, self.adapter)
-
-    @property
-    def matches(self):
-        """Number of matching bases"""
-        m = getattr(self.front_match, 'matches', 0)
-        if self.back_match is not None:
-            m += self.back_match.matches
-        return m
-
-    def trimmed(self):
-        if self.back_match:
-            # back match is relative to front match, so even if a front match exists,
-            # this is correct
-            return self.back_match.trimmed()
-        else:
-            assert self.front_match
-            return self.front_match.trimmed()
-
-    @property
-    def adjacent_base(self):
-        return self.back_match.adjacent_base
-
-    def update_statistics(self, statistics):
-        """Update AdapterStatistics in place"""
-        if self.front_match:
-            statistics.front.errors[self.front_match.rstop][self.front_match.errors] += 1
-        if self.back_match:
-            statistics.back.errors[len(self.back_match.read) - self.back_match.rstart][self.back_match.errors] += 1
+    """
+    Represent a match of a LinkedAdapter
+    """
+    def __init__(self, front_match, back_match, adapter):
+        """
+        One of front_match and back_match must not be None!
+        """
+        self.front_match = front_match
+        self.back_match = back_match
+        self.adapter = adapter
+
+    def __repr__(self):
+        return '<LinkedMatch(front_match={!r}, back_match={!r}, adapter={})>'.format(
+            self.front_match, self.back_match, self.adapter)
+
+    @property
+    def matches(self):
+        """Number of matching bases"""
+        m = getattr(self.front_match, 'matches', 0)
+        if self.back_match is not None:
+            m += self.back_match.matches
+        return m
+
+    def trimmed(self):
+        if self.back_match:
+            # back match is relative to front match, so even if a front match exists,
+            # this is correct
+            return self.back_match.trimmed()
+        else:
+            assert self.front_match
+            return self.front_match.trimmed()
+
+    @property
+    def adjacent_base(self):
+        return self.back_match.adjacent_base
+
+    def update_statistics(self, statistics):
+        """Update AdapterStatistics in place"""
+        if self.front_match:
+            statistics.front.errors[self.front_match.rstop][self.front_match.errors] += 1
+        if self.back_match:
+            statistics.back.errors[len(self.back_match.read) - self.back_match.rstart][self.back_match.errors] += 1


 class LinkedAdapter:
-    """
-    """
-    def __init__(
-        self,
-        front_adapter,
-        back_adapter,
-        front_required,
-        back_required,
-        name,
-    ):
-        """
-        require_both -- require both adapters to match. If not specified, the default is to
-        require only anchored adapters to match.
-        kwargs are passed on to individual Adapter constructors
-        """
-        self._require_front_match = front_required
-        self._require_back_match = back_required
-
-        # The following attributes are needed for the report
-        self.where = LINKED
-        self.name = _generate_adapter_name() if name is None else name
-        self.front_adapter = front_adapter
-        self.back_adapter = back_adapter
-
-    def enable_debug(self):
-        self.front_adapter.enable_debug()
-        self.back_adapter.enable_debug()
-
-    def match_to(self, read):
-        """
-        Match the linked adapters against the given read. Any anchored adapters are
-        required to exist for a successful match. If both adapters are unanchored,
-        both need to match.
-        """
-        front_match = self.front_adapter.match_to(read)
-        if self._require_front_match and front_match is None:
-            return None
-
-        if front_match is not None:
-            # TODO statistics
-            read = front_match.trimmed()
-        back_match = self.back_adapter.match_to(read)
-        if back_match is None and (self._require_back_match or front_match is None):
-            return None
-        return LinkedMatch(front_match, back_match, self)
-
-    def create_statistics(self):
-        return AdapterStatistics(self.front_adapter, self.back_adapter, where=LINKED)
+    """
+    A 5' adapter combined with a 3' adapter in a single specification
+    (ADAPTER1...ADAPTER2).
+    """
+    def __init__(
+        self,
+        front_adapter,
+        back_adapter,
+        front_required,
+        back_required,
+        name,
+    ):
+        """
+        front_required, back_required -- whether a match of the respective
+        adapter is required for the linked adapter to match. The parser sets
+        these such that anchored adapters are required.
+        """
+        self._require_front_match = front_required
+        self._require_back_match = back_required
+
+        # The following attributes are needed for the report
+        self.where = LINKED
+        self.name = _generate_adapter_name() if name is None else name
+        self.front_adapter = front_adapter
+        self.back_adapter = back_adapter
+
+    def enable_debug(self):
+        self.front_adapter.enable_debug()
+        self.back_adapter.enable_debug()
+
+    def match_to(self, read):
+        """
+        Match the linked adapters against the given read. Any anchored adapters are
+        required to exist for a successful match. If both adapters are unanchored,
+        both need to match.
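
# Linked-adapter matching in miniature (editor's sketch, reusing the Read
# stand-in and AdapterParser from the sketches further up; sequences made up):
from cutadapt.adapters import AdapterParser

linked = next(AdapterParser().parse('ACGT...TTTTTT', cmdline_type='back'))
# With -a (cmdline_type='back'), the 5' half is auto-anchored and therefore
# required; the 3' half is optional.
lmatch = linked.match_to(Read('r3', 'ACGT' + 'CCCCCCCC' + 'TTTTTT' + 'GG'))
assert lmatch is not None
assert lmatch.trimmed().sequence == 'CCCCCCCC'  # both adapters removed
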
+ """ + front_match = self.front_adapter.match_to(read) + if self._require_front_match and front_match is None: + return None + + if front_match is not None: + # TODO statistics + read = front_match.trimmed() + back_match = self.back_adapter.match_to(read) + if back_match is None and (self._require_back_match or front_match is None): + return None + return LinkedMatch(front_match, back_match, self) + + def create_statistics(self): + return AdapterStatistics(self.front_adapter, self.back_adapter, where=LINKED) diff --git a/src/cutadapt/align.py b/src/cutadapt/align.py index 99b34233..da199c47 100644 --- a/src/cutadapt/align.py +++ b/src/cutadapt/align.py @@ -1,7 +1,7 @@ __all__ = [ - 'Aligner', - 'compare_prefixes', - 'compare_suffixes', + 'Aligner', + 'compare_prefixes', + 'compare_suffixes', ] from cutadapt._align import Aligner, compare_prefixes @@ -25,19 +25,19 @@ def compare_suffixes(s1, s2, wildcard_ref=False, wildcard_query=False): - """ - Find out whether one string is the suffix of the other one, allowing - mismatches. Used to find an anchored 3' adapter when no indels are allowed. - """ - s1 = s1[::-1] - s2 = s2[::-1] - _, length, _, _, matches, errors = compare_prefixes(s1, s2, wildcard_ref, wildcard_query) - return (len(s1) - length, len(s1), len(s2) - length, len(s2), matches, errors) + """ + Find out whether one string is the suffix of the other one, allowing + mismatches. Used to find an anchored 3' adapter when no indels are allowed. + """ + s1 = s1[::-1] + s2 = s2[::-1] + _, length, _, _, matches, errors = compare_prefixes(s1, s2, wildcard_ref, wildcard_query) + return (len(s1) - length, len(s1), len(s2) - length, len(s2), matches, errors) # convenience function (to avoid having to instantiate an Aligner manually) def locate(reference, query, max_error_rate, flags=SEMIGLOBAL, wildcard_ref=False, - wildcard_query=False, min_overlap=1): - aligner = Aligner(reference, max_error_rate, flags, wildcard_ref, wildcard_query) - aligner.min_overlap = min_overlap - return aligner.locate(query) + wildcard_query=False, min_overlap=1): + aligner = Aligner(reference, max_error_rate, flags, wildcard_ref, wildcard_query) + aligner.min_overlap = min_overlap + return aligner.locate(query) diff --git a/src/cutadapt/filters.py b/src/cutadapt/filters.py index b5a1a6d2..c25be2ce 100644 --- a/src/cutadapt/filters.py +++ b/src/cutadapt/filters.py @@ -22,318 +22,318 @@ class NoFilter: - """ - No filtering, just send each read to the given writer. - """ - def __init__(self, writer): - self.writer = writer - self.written = 0 # no of written reads TODO move to writer - self.written_bp = [0, 0] + """ + No filtering, just send each read to the given writer. + """ + def __init__(self, writer): + self.writer = writer + self.written = 0 # no of written reads TODO move to writer + self.written_bp = [0, 0] - @property - def filtered(self): - return 0 + @property + def filtered(self): + return 0 - def __call__(self, read, matches): - self.writer.write(read) - self.written += 1 - self.written_bp[0] += len(read) - return DISCARD + def __call__(self, read, matches): + self.writer.write(read) + self.written += 1 + self.written_bp[0] += len(read) + return DISCARD class PairedNoFilter: - """ - No filtering, just send each paired-end read to the given writer. 
- """ - def __init__(self, writer): - self.writer = writer - self.written = 0 # no of written reads or read pairs TODO move to writer - self.written_bp = [0, 0] - - @property - def filtered(self): - return 0 - - def __call__(self, read1, read2, matches1, matches2): - self.writer.write(read1, read2) - self.written += 1 - self.written_bp[0] += len(read1) - self.written_bp[1] += len(read2) - return DISCARD + """ + No filtering, just send each paired-end read to the given writer. + """ + def __init__(self, writer): + self.writer = writer + self.written = 0 # no of written reads or read pairs TODO move to writer + self.written_bp = [0, 0] + + @property + def filtered(self): + return 0 + + def __call__(self, read1, read2, matches1, matches2): + self.writer.write(read1, read2) + self.written += 1 + self.written_bp[0] += len(read1) + self.written_bp[1] += len(read2) + return DISCARD class Redirector: - """ - Redirect discarded reads to the given writer. This is for single-end reads. - """ - def __init__(self, writer, filter, filter2=None): - # TODO filter2 should really not be here - self.filtered = 0 - self.writer = writer - self.filter = filter - self.written = 0 # no of written reads TODO move to writer - self.written_bp = [0, 0] - - def __call__(self, read, matches): - if self.filter(read, matches): - self.filtered += 1 - if self.writer is not None: - self.writer.write(read) - self.written += 1 - self.written_bp[0] += len(read) - return DISCARD - return KEEP + """ + Redirect discarded reads to the given writer. This is for single-end reads. + """ + def __init__(self, writer, filter, filter2=None): + # TODO filter2 should really not be here + self.filtered = 0 + self.writer = writer + self.filter = filter + self.written = 0 # no of written reads TODO move to writer + self.written_bp = [0, 0] + + def __call__(self, read, matches): + if self.filter(read, matches): + self.filtered += 1 + if self.writer is not None: + self.writer.write(read) + self.written += 1 + self.written_bp[0] += len(read) + return DISCARD + return KEEP class PairedRedirector: - """ - Redirect paired-end reads matching a filtering criterion to a writer. - Different filtering styles are supported, differing by which of the - two reads in a pair have to fulfill the filtering criterion. - """ - def __init__(self, writer, filter, filter2, pair_filter_mode='any'): - """ - pair_filter_mode -- these values are allowed: - 'any': The pair is discarded if any read matches. - 'both': The pair is discarded if both reads match. - 'first': The pair is discarded if the first read matches - ('legacy' mode, backwards compatibility). With 'first', the - second read is not inspected. 
- """ - if pair_filter_mode not in ('any', 'both', 'first'): - raise ValueError("pair_filter_mode must be 'any', 'both' or 'first'") - self.filtered = 0 - self.writer = writer - self.filter = filter - self.filter2 = filter2 - self.written = 0 # no of written reads or read pairs TODO move to writer - self.written_bp = [0, 0] - if filter2 is None: - self._is_filtered = self._is_filtered_first - elif filter is None: - self._is_filtered = self._is_filtered_second - elif pair_filter_mode == 'any': - self._is_filtered = self._is_filtered_any - elif pair_filter_mode == 'both': - self._is_filtered = self._is_filtered_both - else: - self._is_filtered = self._is_filtered_first - - def _is_filtered_any(self, read1, read2, matches1, matches2): - return self.filter(read1, matches1) or self.filter2(read2, matches2) - - def _is_filtered_both(self, read1, read2, matches1, matches2): - return self.filter(read1, matches1) and self.filter2(read2, matches2) - - def _is_filtered_first(self, read1, read2, matches1, matches2): - return self.filter(read1, matches1) - - def _is_filtered_second(self, read1, read2, matches1, matches2): - return self.filter2(read2, matches2) - - def __call__(self, read1, read2, matches1, matches2): - if self._is_filtered(read1, read2, matches1, matches2): - self.filtered += 1 - if self.writer is not None: - self.writer.write(read1, read2) - self.written += 1 - self.written_bp[0] += len(read1) - self.written_bp[1] += len(read2) - return DISCARD - return KEEP + """ + Redirect paired-end reads matching a filtering criterion to a writer. + Different filtering styles are supported, differing by which of the + two reads in a pair have to fulfill the filtering criterion. + """ + def __init__(self, writer, filter, filter2, pair_filter_mode='any'): + """ + pair_filter_mode -- these values are allowed: + 'any': The pair is discarded if any read matches. + 'both': The pair is discarded if both reads match. + 'first': The pair is discarded if the first read matches + ('legacy' mode, backwards compatibility). With 'first', the + second read is not inspected. 
+        """
+        if pair_filter_mode not in ('any', 'both', 'first'):
+            raise ValueError("pair_filter_mode must be 'any', 'both' or 'first'")
+        self.filtered = 0
+        self.writer = writer
+        self.filter = filter
+        self.filter2 = filter2
+        self.written = 0  # no of written reads or read pairs TODO move to writer
+        self.written_bp = [0, 0]
+        if filter2 is None:
+            self._is_filtered = self._is_filtered_first
+        elif filter is None:
+            self._is_filtered = self._is_filtered_second
+        elif pair_filter_mode == 'any':
+            self._is_filtered = self._is_filtered_any
+        elif pair_filter_mode == 'both':
+            self._is_filtered = self._is_filtered_both
+        else:
+            self._is_filtered = self._is_filtered_first
+
+    def _is_filtered_any(self, read1, read2, matches1, matches2):
+        return self.filter(read1, matches1) or self.filter2(read2, matches2)
+
+    def _is_filtered_both(self, read1, read2, matches1, matches2):
+        return self.filter(read1, matches1) and self.filter2(read2, matches2)
+
+    def _is_filtered_first(self, read1, read2, matches1, matches2):
+        return self.filter(read1, matches1)
+
+    def _is_filtered_second(self, read1, read2, matches1, matches2):
+        return self.filter2(read2, matches2)
+
+    def __call__(self, read1, read2, matches1, matches2):
+        if self._is_filtered(read1, read2, matches1, matches2):
+            self.filtered += 1
+            if self.writer is not None:
+                self.writer.write(read1, read2)
+                self.written += 1
+                self.written_bp[0] += len(read1)
+                self.written_bp[1] += len(read2)
+            return DISCARD
+        return KEEP


 class TooShortReadFilter:
-    def __init__(self, minimum_length):
-        self.minimum_length = minimum_length
+    def __init__(self, minimum_length):
+        self.minimum_length = minimum_length

-    def __call__(self, read, matches):
-        return len(read) < self.minimum_length
+    def __call__(self, read, matches):
+        return len(read) < self.minimum_length


 class TooLongReadFilter:
-    def __init__(self, maximum_length):
-        self.maximum_length = maximum_length
+    def __init__(self, maximum_length):
+        self.maximum_length = maximum_length

-    def __call__(self, read, matches):
-        return len(read) > self.maximum_length
+    def __call__(self, read, matches):
+        return len(read) > self.maximum_length


 class NContentFilter:
-    """
-    Discards a reads that has a number of 'N's over a given threshold. It handles both raw counts
-    of Ns as well as proportions. Note, for raw counts, it is a 'greater than' comparison,
-    so a cutoff of '1' will keep reads with a single N in it.
-    """
-    def __init__(self, count):
-        """
-        Count -- if it is below 1.0, it will be considered a proportion, and above and equal to
-        1 will be considered as discarding reads with a number of N's greater than this cutoff.
-        """
-        assert count >= 0
-        self.is_proportion = count < 1.0
-        self.cutoff = count
-
-    def __call__(self, read, matches):
-        """Return True when the read should be discarded"""
-        n_count = read.sequence.lower().count('n')
-        if self.is_proportion:
-            if len(read) == 0:
-                return False
-            return n_count / len(read) > self.cutoff
-        else:
-            return n_count > self.cutoff
+    """
+    Discards reads that have a number of 'N's above a given threshold. It handles both raw
+    counts of Ns as well as proportions. Note that for raw counts, the comparison is
+    'greater than', so a cutoff of '1' still keeps reads that contain a single N.
+    """
+    def __init__(self, count):
+        """
+        count -- If below 1.0, this is interpreted as the maximum allowed proportion
+        of N's in a read; if 1 or greater, it is the maximum allowed absolute
+        number of N's. Reads exceeding the cutoff are discarded.
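
# Quick behavioral sketch of NContentFilter (editor's illustration; the R
# class is a minimal stand-in providing just .sequence and len()):
class R:
    def __init__(self, sequence):
        self.sequence = sequence
    def __len__(self):
        return len(self.sequence)

from cutadapt.filters import NContentFilter

at_most_2_n = NContentFilter(count=2)      # absolute: discard if more than 2 N's
at_most_20pct = NContentFilter(count=0.2)  # proportion: discard if more than 20% N's
assert at_most_2_n(R('ACGTNN'), []) is False        # exactly 2 N's: kept
assert at_most_2_n(R('ACNNNT'), []) is True         # 3 N's: discarded
assert at_most_20pct(R('ACGTACGTNN'), []) is False  # 2/10 = 20%: kept
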
+ """ + assert count >= 0 + self.is_proportion = count < 1.0 + self.cutoff = count + + def __call__(self, read, matches): + """Return True when the read should be discarded""" + n_count = read.sequence.lower().count('n') + if self.is_proportion: + if len(read) == 0: + return False + return n_count / len(read) > self.cutoff + else: + return n_count > self.cutoff class DiscardUntrimmedFilter: - """ - Return True if read is untrimmed. - """ - def __call__(self, read, matches): - return not matches + """ + Return True if read is untrimmed. + """ + def __call__(self, read, matches): + return not matches class DiscardTrimmedFilter: - """ - Return True if read is trimmed. - """ - def __call__(self, read, matches): - return bool(matches) + """ + Return True if read is trimmed. + """ + def __call__(self, read, matches): + return bool(matches) class CasavaFilter: - """ - Remove reads that fail the CASAVA filter. These have header lines that - look like ``xxxx x:Y:x:x`` (with a ``Y``). Reads that pass the filter - have an ``N`` instead of ``Y``. + """ + Remove reads that fail the CASAVA filter. These have header lines that + look like ``xxxx x:Y:x:x`` (with a ``Y``). Reads that pass the filter + have an ``N`` instead of ``Y``. - Reads with unrecognized headers are kept. - """ - def __call__(self, read, matches): - _, _, right = read.name.partition(' ') - return right[1:4] == ':Y:' # discard if :Y: found + Reads with unrecognized headers are kept. + """ + def __call__(self, read, matches): + _, _, right = read.name.partition(' ') + return right[1:4] == ':Y:' # discard if :Y: found class Demultiplexer: - """ - Demultiplex trimmed reads. Reads are written to different output files - depending on which adapter matches. Files are created when the first read - is written to them. - """ - def __init__(self, path_template, untrimmed_path, qualities): - """ - path_template must contain the string '{name}', which will be replaced - with the name of the adapter to form the final output path. - Reads without an adapter match are written to the file named by - untrimmed_path. - """ - assert '{name}' in path_template - self.template = path_template - self.untrimmed_path = untrimmed_path - self.untrimmed_writer = None - self.writers = dict() - self.written = 0 - self.written_bp = [0, 0] - self.qualities = qualities - - def __call__(self, read, matches): - """ - Write the read to the proper output file according to the most recent match - """ - if matches: - name = matches[-1].adapter.name - if name not in self.writers: - self.writers[name] = dnaio.open(self.template.replace('{name}', name), - mode='w', qualities=self.qualities) - self.written += 1 - self.written_bp[0] += len(read) - self.writers[name].write(read) - else: - if self.untrimmed_writer is None and self.untrimmed_path is not None: - self.untrimmed_writer = dnaio.open(self.untrimmed_path, - mode='w', qualities=self.qualities) - if self.untrimmed_writer is not None: - self.written += 1 - self.written_bp[0] += len(read) - self.untrimmed_writer.write(read) - return DISCARD - - def close(self): - for w in self.writers.values(): - w.close() - if self.untrimmed_writer is not None: - self.untrimmed_writer.close() + """ + Demultiplex trimmed reads. Reads are written to different output files + depending on which adapter matches. Files are created when the first read + is written to them. 
+ """ + def __init__(self, path_template, untrimmed_path, qualities): + """ + path_template must contain the string '{name}', which will be replaced + with the name of the adapter to form the final output path. + Reads without an adapter match are written to the file named by + untrimmed_path. + """ + assert '{name}' in path_template + self.template = path_template + self.untrimmed_path = untrimmed_path + self.untrimmed_writer = None + self.writers = dict() + self.written = 0 + self.written_bp = [0, 0] + self.qualities = qualities + + def __call__(self, read, matches): + """ + Write the read to the proper output file according to the most recent match + """ + if matches: + name = matches[-1].adapter.name + if name not in self.writers: + self.writers[name] = dnaio.open(self.template.replace('{name}', name), + mode='w', qualities=self.qualities) + self.written += 1 + self.written_bp[0] += len(read) + self.writers[name].write(read) + else: + if self.untrimmed_writer is None and self.untrimmed_path is not None: + self.untrimmed_writer = dnaio.open(self.untrimmed_path, + mode='w', qualities=self.qualities) + if self.untrimmed_writer is not None: + self.written += 1 + self.written_bp[0] += len(read) + self.untrimmed_writer.write(read) + return DISCARD + + def close(self): + for w in self.writers.values(): + w.close() + if self.untrimmed_writer is not None: + self.untrimmed_writer.close() class PairedEndDemultiplexer: - """ - Demultiplex trimmed paired-end reads. Reads are written to different output files - depending on which adapter (in read 1) matches. - """ - def __init__(self, path_template, path_paired_template, untrimmed_path, untrimmed_paired_path, - qualities): - """ - The path templates must contain the string '{name}', which will be replaced - with the name of the adapter to form the final output path. - Read pairs without an adapter match are written to the files named by - untrimmed_path. - """ - self._demultiplexer1 = Demultiplexer(path_template, untrimmed_path, qualities) - self._demultiplexer2 = Demultiplexer(path_paired_template, untrimmed_paired_path, - qualities) - - @property - def written(self): - return self._demultiplexer1.written + self._demultiplexer2.written - - @property - def written_bp(self): - return [self._demultiplexer1.written_bp[0], self._demultiplexer2.written_bp[0]] - - def __call__(self, read1, read2, matches1, matches2): - assert read2 is not None - self._demultiplexer1(read1, matches1) - self._demultiplexer2(read2, matches1) - - def close(self): - self._demultiplexer1.close() - self._demultiplexer1.close() + """ + Demultiplex trimmed paired-end reads. Reads are written to different output files + depending on which adapter (in read 1) matches. + """ + def __init__(self, path_template, path_paired_template, untrimmed_path, untrimmed_paired_path, + qualities): + """ + The path templates must contain the string '{name}', which will be replaced + with the name of the adapter to form the final output path. + Read pairs without an adapter match are written to the files named by + untrimmed_path. 
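
# Construction sketch for the demultiplexers above (editor's illustration;
# the paths are made up, and '{name}' is replaced by the adapter name when a
# read is first written):
from cutadapt.filters import Demultiplexer

demux = Demultiplexer('trimmed-{name}.1.fastq.gz', 'untrimmed.1.fastq.gz', qualities=True)
# A read whose last match came from an adapter named 'adapter1' would go to
# trimmed-adapter1.1.fastq.gz; unmatched reads go to the untrimmed file.
# demux.close() closes all lazily opened output files.
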
+        """
+        self._demultiplexer1 = Demultiplexer(path_template, untrimmed_path, qualities)
+        self._demultiplexer2 = Demultiplexer(path_paired_template, untrimmed_paired_path,
+            qualities)
+
+    @property
+    def written(self):
+        return self._demultiplexer1.written + self._demultiplexer2.written
+
+    @property
+    def written_bp(self):
+        return [self._demultiplexer1.written_bp[0], self._demultiplexer2.written_bp[0]]
+
+    def __call__(self, read1, read2, matches1, matches2):
+        assert read2 is not None
+        # Both reads are routed by the adapter found in read 1
+        self._demultiplexer1(read1, matches1)
+        self._demultiplexer2(read2, matches1)
+
+    def close(self):
+        self._demultiplexer1.close()
+        self._demultiplexer2.close()


 class RestFileWriter:
-    def __init__(self, file):
-        self.file = file
+    def __init__(self, file):
+        self.file = file

-    def __call__(self, read, matches):
-        if matches:
-            rest = matches[-1].rest()
-            if len(rest) > 0:
-                print(rest, read.name, file=self.file)
-        return KEEP
+    def __call__(self, read, matches):
+        if matches:
+            rest = matches[-1].rest()
+            if len(rest) > 0:
+                print(rest, read.name, file=self.file)
+        return KEEP


 class WildcardFileWriter:
-    def __init__(self, file):
-        self.file = file
+    def __init__(self, file):
+        self.file = file

-    def __call__(self, read, matches):
-        if matches:
-            print(matches[-1].wildcards(), read.name, file=self.file)
-        return KEEP
+    def __call__(self, read, matches):
+        if matches:
+            print(matches[-1].wildcards(), read.name, file=self.file)
+        return KEEP


 class InfoFileWriter:
-    def __init__(self, file):
-        self.file = file
-
-    def __call__(self, read, matches):
-        if matches:
-            for match in matches:
-                info_record = match.get_info_record()
-                print(*info_record, sep='\t', file=self.file)
-        else:
-            seq = read.sequence
-            qualities = read.qualities if read.qualities is not None else ''
-            print(read.name, -1, seq, qualities, sep='\t', file=self.file)
-
-        return KEEP
+    def __init__(self, file):
+        self.file = file
+
+    def __call__(self, read, matches):
+        if matches:
+            for match in matches:
+                info_record = match.get_info_record()
+                print(*info_record, sep='\t', file=self.file)
+        else:
+            seq = read.sequence
+            qualities = read.qualities if read.qualities is not None else ''
+            print(read.name, -1, seq, qualities, sep='\t', file=self.file)
+
+        return KEEP
diff --git a/src/cutadapt/modifiers.py b/src/cutadapt/modifiers.py
index 48c9ddc8..911333bb 100644
--- a/src/cutadapt/modifiers.py
+++ b/src/cutadapt/modifiers.py
@@ -10,226 +10,226 @@


 class AdapterCutter:
-    """
-    Repeatedly find one of multiple adapters in reads.
-    The number of times the search is repeated is specified by the
-    times parameter.
-    """
-
-    def __init__(self, adapters, times=1, action='trim'):
-        """
-        adapters -- list of Adapter objects
-
-        action -- What to do with a found adapter: None, 'trim', or 'mask'
-        """
-        self.adapters = adapters
-        self.times = times
-        assert action in ('trim', 'mask', None)
-        self.action = action
-        self.with_adapters = 0
-        self.adapter_statistics = OrderedDict((a, a.create_statistics()) for a in adapters)
-
-    def _best_match(self, read):
-        """
-        Find the best matching adapter in the given read.
-
-        Return either a Match instance or None if there are no matches.
-        """
-        # TODO
-        # try to sort adapters by length, longest first, break when current best
-        # match is longer than length of next adapter to try
-        best_match = None
-        for adapter in self.adapters:
-            match = adapter.match_to(read)
-            if match is None:
-                continue
-
-            # the no. of matches determines which adapter fits best
-            if best_match is None or match.matches > best_match.matches:
-                best_match = match
-        return best_match
-
-    def __call__(self, read, matches):
-        """
-        Search for the best-matching adapter in a read, perform the requested action
-        ('trim', 'mask', or None as determined by self.action) it and return the
-        (possibly) modified read.
-
-        *self.times* adapter removal rounds are done. During each round,
-        only the best-matching adapter is trimmed. If no adapter was found in a round,
-        no further rounds are attempted.
-
-        The 'matches' parameter needs to be a list. Every time an adapter is found,
-        a Match object describing the match will be appended to it.
-
-        The read is converted to uppercase before it is compared to the adapter
-        sequences.
-        """
-        trimmed_read = read
-        for _ in range(self.times):
-            match = self._best_match(trimmed_read)
-            if match is None:
-                # if nothing found, attempt no further rounds
-                break
-            matches.append(match)
-            trimmed_read = match.trimmed()
-            match.update_statistics(self.adapter_statistics[match.adapter])
-
-        if not matches:
-            return trimmed_read
-
-        if self.action == 'trim':
-            # read is already trimmed, nothing to do
-            pass
-        elif self.action == 'mask':
-            # add N from last modification
-            masked_sequence = trimmed_read.sequence
-            for match in sorted(matches, reverse=True, key=lambda m: m.astart):
-                ns = 'N' * (len(match.read.sequence) -
-                    len(match.trimmed().sequence))
-                # add N depending on match position
-                if match.remove_before:
-                    masked_sequence = ns + masked_sequence
-                else:
-                    masked_sequence += ns
-            # set masked sequence as sequence with original quality
-            trimmed_read.sequence = masked_sequence
-            trimmed_read.qualities = matches[0].read.qualities
-            assert len(trimmed_read.sequence) == len(read)
-        elif self.action is None:  # --no-trim
-            trimmed_read = read[:]
-
-        self.with_adapters += 1
-        return trimmed_read
+    """
+    Repeatedly find one of multiple adapters in reads.
+    The number of times the search is repeated is specified by the
+    times parameter.
+    """
+
+    def __init__(self, adapters, times=1, action='trim'):
+        """
+        adapters -- list of Adapter objects
+
+        action -- What to do with a found adapter: None, 'trim', or 'mask'
+        """
+        self.adapters = adapters
+        self.times = times
+        assert action in ('trim', 'mask', None)
+        self.action = action
+        self.with_adapters = 0
+        self.adapter_statistics = OrderedDict((a, a.create_statistics()) for a in adapters)
+
+    def _best_match(self, read):
+        """
+        Find the best matching adapter in the given read.
+
+        Return either a Match instance or None if there are no matches.
+        """
+        # TODO
+        # try to sort adapters by length, longest first, break when current best
+        # match is longer than length of next adapter to try
+        best_match = None
+        for adapter in self.adapters:
+            match = adapter.match_to(read)
+            if match is None:
+                continue
+
+            # the no. of matches determines which adapter fits best
+            if best_match is None or match.matches > best_match.matches:
+                best_match = match
+        return best_match
+
+    def __call__(self, read, matches):
+        """
+        Search for the best-matching adapter in a read, perform the requested action
+        ('trim', 'mask', or None, as determined by self.action) and return the
+        (possibly) modified read.
+
+        *self.times* adapter removal rounds are done. During each round,
+        only the best-matching adapter is trimmed. If no adapter was found in a round,
+        no further rounds are attempted.
+
+        The 'matches' parameter needs to be a list. Every time an adapter is found,
+        a Match object describing the match will be appended to it.
+
+        The read is converted to uppercase before it is compared to the adapter
+        sequences.
+        """
+        trimmed_read = read
+        for _ in range(self.times):
+            match = self._best_match(trimmed_read)
+            if match is None:
+                # if nothing found, attempt no further rounds
+                break
+            matches.append(match)
+            trimmed_read = match.trimmed()
+            match.update_statistics(self.adapter_statistics[match.adapter])
+
+        if not matches:
+            return trimmed_read
+
+        if self.action == 'trim':
+            # read is already trimmed, nothing to do
+            pass
+        elif self.action == 'mask':
+            # add N's to cover the trimmed-away parts
+            masked_sequence = trimmed_read.sequence
+            for match in sorted(matches, reverse=True, key=lambda m: m.astart):
+                ns = 'N' * (len(match.read.sequence) -
+                    len(match.trimmed().sequence))
+                # add N depending on match position
+                if match.remove_before:
+                    masked_sequence = ns + masked_sequence
+                else:
+                    masked_sequence += ns
+            # set masked sequence as sequence with original quality
+            trimmed_read.sequence = masked_sequence
+            trimmed_read.qualities = matches[0].read.qualities
+            assert len(trimmed_read.sequence) == len(read)
+        elif self.action is None:  # --no-trim
+            trimmed_read = read[:]
+
+        self.with_adapters += 1
+        return trimmed_read


 class UnconditionalCutter:
-    """
-    A modifier that unconditionally removes the first n or the last n bases from a read.
-
-    If the length is positive, the bases are removed from the beginning of the read.
-    If the length is negative, the bases are removed from the end of the read.
-    """
-    def __init__(self, length):
-        self.length = length
-
-    def __call__(self, read, matches):
-        if self.length > 0:
-            return read[self.length:]
-        elif self.length < 0:
-            return read[:self.length]
+    """
+    A modifier that unconditionally removes the first n or the last n bases from a read.
+
+    If the length is positive, the bases are removed from the beginning of the read.
+    If the length is negative, the bases are removed from the end of the read.
+    """
+    def __init__(self, length):
+        self.length = length
+
+    def __call__(self, read, matches):
+        if self.length > 0:
+            return read[self.length:]
+        elif self.length < 0:
+            return read[:self.length]
+        return read  # length == 0: leave the read unchanged


 class LengthTagModifier:
-    """
-    Replace "length=..." strings in read names.
-    """
-    def __init__(self, length_tag):
-        self.regex = re.compile(r"\b" + length_tag + r"[0-9]*\b")
-        self.length_tag = length_tag
-
-    def __call__(self, read, matches):
-        read = read[:]
-        if read.name.find(self.length_tag) >= 0:
-            read.name = self.regex.sub(self.length_tag + str(len(read.sequence)), read.name)
-        return read
+    """
+    Replace "length=..." strings in read names.
+    """
+    def __init__(self, length_tag):
+        self.regex = re.compile(r"\b" + length_tag + r"[0-9]*\b")
+        self.length_tag = length_tag
+
+    def __call__(self, read, matches):
+        read = read[:]
+        if read.name.find(self.length_tag) >= 0:
+            read.name = self.regex.sub(self.length_tag + str(len(read.sequence)), read.name)
+        return read


 class SuffixRemover:
-    """
-    Remove a given suffix from read names.
+ """ + def __init__(self, suffix): + self.suffix = suffix - def __call__(self, read, matches): - read = read[:] - if read.name.endswith(self.suffix): - read.name = read.name[:-len(self.suffix)] - return read + def __call__(self, read, matches): + read = read[:] + if read.name.endswith(self.suffix): + read.name = read.name[:-len(self.suffix)] + return read class PrefixSuffixAdder: - """ - Add a suffix and a prefix to read names - """ - def __init__(self, prefix, suffix): - self.prefix = prefix - self.suffix = suffix + """ + Add a suffix and a prefix to read names + """ + def __init__(self, prefix, suffix): + self.prefix = prefix + self.suffix = suffix - def __call__(self, read, matches): - read = read[:] - adapter_name = matches[-1].adapter.name if matches else 'no_adapter' - read.name = self.prefix.replace('{name}', adapter_name) + read.name + \ - self.suffix.replace('{name}', adapter_name) - return read + def __call__(self, read, matches): + read = read[:] + adapter_name = matches[-1].adapter.name if matches else 'no_adapter' + read.name = self.prefix.replace('{name}', adapter_name) + read.name + \ + self.suffix.replace('{name}', adapter_name) + return read class ZeroCapper: - """ - Change negative quality values of a read to zero - """ - def __init__(self, quality_base=33): - qb = quality_base - self.zero_cap_trans = str.maketrans(''.join(map(chr, range(qb))), chr(qb) * qb) + """ + Change negative quality values of a read to zero + """ + def __init__(self, quality_base=33): + qb = quality_base + self.zero_cap_trans = str.maketrans(''.join(map(chr, range(qb))), chr(qb) * qb) - def __call__(self, read, matches): - read = read[:] - read.qualities = read.qualities.translate(self.zero_cap_trans) - return read + def __call__(self, read, matches): + read = read[:] + read.qualities = read.qualities.translate(self.zero_cap_trans) + return read class NextseqQualityTrimmer: - def __init__(self, cutoff, base): - self.cutoff = cutoff - self.base = base - self.trimmed_bases = 0 + def __init__(self, cutoff, base): + self.cutoff = cutoff + self.base = base + self.trimmed_bases = 0 - def __call__(self, read, matches): - stop = nextseq_trim_index(read, self.cutoff, self.base) - self.trimmed_bases += len(read) - stop - return read[:stop] + def __call__(self, read, matches): + stop = nextseq_trim_index(read, self.cutoff, self.base) + self.trimmed_bases += len(read) - stop + return read[:stop] class QualityTrimmer: - def __init__(self, cutoff_front, cutoff_back, base): - self.cutoff_front = cutoff_front - self.cutoff_back = cutoff_back - self.base = base - self.trimmed_bases = 0 + def __init__(self, cutoff_front, cutoff_back, base): + self.cutoff_front = cutoff_front + self.cutoff_back = cutoff_back + self.base = base + self.trimmed_bases = 0 - def __call__(self, read, matches): - start, stop = quality_trim_index(read.qualities, self.cutoff_front, self.cutoff_back, self.base) - self.trimmed_bases += len(read) - (stop - start) - return read[start:stop] + def __call__(self, read, matches): + start, stop = quality_trim_index(read.qualities, self.cutoff_front, self.cutoff_back, self.base) + self.trimmed_bases += len(read) - (stop - start) + return read[start:stop] class Shortener: - """Unconditionally shorten a read to the given length + """Unconditionally shorten a read to the given length - If the length is positive, the bases are removed from the end of the read. - If the length is negative, the bases are removed from the beginning of the read. 
- """ - def __init__(self, length): - self.length = length + If the length is positive, the bases are removed from the end of the read. + If the length is negative, the bases are removed from the beginning of the read. + """ + def __init__(self, length): + self.length = length - def __call__(self, read, matches): - if self.length >= 0: - return read[:self.length] - else: - return read[self.length:] + def __call__(self, read, matches): + if self.length >= 0: + return read[:self.length] + else: + return read[self.length:] class NEndTrimmer: - """Trims Ns from the 3' and 5' end of reads""" - def __init__(self): - self.start_trim = re.compile(r'^N+') - self.end_trim = re.compile(r'N+$') - - def __call__(self, read, matches): - sequence = read.sequence - start_cut = self.start_trim.match(sequence) - end_cut = self.end_trim.search(sequence) - start_cut = start_cut.end() if start_cut else 0 - end_cut = end_cut.start() if end_cut else len(read) - return read[start_cut:end_cut] + """Trims Ns from the 3' and 5' end of reads""" + def __init__(self): + self.start_trim = re.compile(r'^N+') + self.end_trim = re.compile(r'N+$') + + def __call__(self, read, matches): + sequence = read.sequence + start_cut = self.start_trim.match(sequence) + end_cut = self.end_trim.search(sequence) + start_cut = start_cut.end() if start_cut else 0 + end_cut = end_cut.start() if end_cut else len(read) + return read[start_cut:end_cut] diff --git a/src/cutadapt/pipeline.py b/src/cutadapt/pipeline.py index f198e210..0b544532 100644 --- a/src/cutadapt/pipeline.py +++ b/src/cutadapt/pipeline.py @@ -14,661 +14,661 @@ from .modifiers import ZeroCapper from .report import Statistics from .filters import (Redirector, PairedRedirector, NoFilter, PairedNoFilter, InfoFileWriter, - RestFileWriter, WildcardFileWriter, TooShortReadFilter, TooLongReadFilter, NContentFilter, - CasavaFilter, DiscardTrimmedFilter, DiscardUntrimmedFilter, Demultiplexer, - PairedEndDemultiplexer) + RestFileWriter, WildcardFileWriter, TooShortReadFilter, TooLongReadFilter, NContentFilter, + CasavaFilter, DiscardTrimmedFilter, DiscardUntrimmedFilter, Demultiplexer, + PairedEndDemultiplexer) logger = logging.getLogger() class OutputFiles: - """ - The attributes are open file-like objects except when demultiplex is True. In that case, - untrimmed, untrimmed2 are file names, and out and out2 are file name templates - containing '{name}'. - If interleaved is True, then out is written interleaved. - Files may also be None. - """ - # TODO interleaving for the other file pairs (too_short, too_long, untrimmed)? - def __init__( - self, - out=None, - out2=None, - untrimmed=None, - untrimmed2=None, - too_short=None, - too_short2=None, - too_long=None, - too_long2=None, - info=None, - rest=None, - wildcard=None, - demultiplex=False, - interleaved=False, - ): - self.out = out - self.out2 = out2 - self.untrimmed = untrimmed - self.untrimmed2 = untrimmed2 - self.too_short = too_short - self.too_short2 = too_short2 - self.too_long = too_long - self.too_long2 = too_long2 - self.info = info - self.rest = rest - self.wildcard = wildcard - self.demultiplex = demultiplex - self.interleaved = interleaved - - def __iter__(self): - yield self.out - yield self.out2 - yield self.untrimmed - yield self.untrimmed2 - yield self.too_short - yield self.too_short2 - yield self.too_long - yield self.too_long2 - yield self.info - yield self.rest - yield self.wildcard + """ + The attributes are open file-like objects except when demultiplex is True. 
In that case, + untrimmed, untrimmed2 are file names, and out and out2 are file name templates + containing '{name}'. + If interleaved is True, then out is written interleaved. + Files may also be None. + """ + # TODO interleaving for the other file pairs (too_short, too_long, untrimmed)? + def __init__( + self, + out=None, + out2=None, + untrimmed=None, + untrimmed2=None, + too_short=None, + too_short2=None, + too_long=None, + too_long2=None, + info=None, + rest=None, + wildcard=None, + demultiplex=False, + interleaved=False, + ): + self.out = out + self.out2 = out2 + self.untrimmed = untrimmed + self.untrimmed2 = untrimmed2 + self.too_short = too_short + self.too_short2 = too_short2 + self.too_long = too_long + self.too_long2 = too_long2 + self.info = info + self.rest = rest + self.wildcard = wildcard + self.demultiplex = demultiplex + self.interleaved = interleaved + + def __iter__(self): + yield self.out + yield self.out2 + yield self.untrimmed + yield self.untrimmed2 + yield self.too_short + yield self.too_short2 + yield self.too_long + yield self.too_long2 + yield self.info + yield self.rest + yield self.wildcard class Pipeline: - """ - Processing pipeline that loops over reads and applies modifiers and filters - """ - should_warn_legacy = False - n_adapters = 0 - - def __init__(self, ): - self._close_files = [] - self._reader = None - self._filters = [] - self._modifiers = [] - self._outfiles = None - self._demultiplexer = None - - # Filter settings - self._minimum_length = None - self._maximum_length = None - self.max_n = None - self.discard_casava = False - self.discard_trimmed = False - self.discard_untrimmed = False - - def set_input(self, file1, file2=None, fileformat=None, interleaved=False): - self._reader = dnaio.open(file1, file2=file2, fileformat=fileformat, - interleaved=interleaved, mode='r') - # Special treatment: Disable zero-capping if no qualities are available - if not self._reader.delivers_qualities: - self._modifiers = [m for m in self._modifiers if not isinstance(m, ZeroCapper)] - - def _open_writer(self, file, file2, **kwargs): - # TODO backwards-incompatible change (?) 
would be to use outfiles.interleaved - # for all outputs - return dnaio.open(file, file2=file2, mode='w', qualities=self.uses_qualities, - **kwargs) - - def set_output(self, outfiles): - self._filters = [] - self._outfiles = outfiles - filter_wrapper = self._filter_wrapper() - - if outfiles.rest: - self._filters.append(RestFileWriter(outfiles.rest)) - if outfiles.info: - self._filters.append(InfoFileWriter(outfiles.info)) - if outfiles.wildcard: - self._filters.append(WildcardFileWriter(outfiles.wildcard)) - - # minimum length and maximum length - for lengths, file1, file2, filter_class in ( - (self._minimum_length, outfiles.too_short, outfiles.too_short2, TooShortReadFilter), - (self._maximum_length, outfiles.too_long, outfiles.too_long2, TooLongReadFilter) - ): - writer = None - if lengths is not None: - if file1: - writer = self._open_writer(file1, file2) - f1 = filter_class(lengths[0]) if lengths[0] is not None else None - if len(lengths) == 2 and lengths[1] is not None: - f2 = filter_class(lengths[1]) - else: - f2 = None - self._filters.append(filter_wrapper(writer, filter=f1, filter2=f2)) - - if self.max_n is not None: - f1 = f2 = NContentFilter(self.max_n) - self._filters.append(filter_wrapper(None, f1, f2)) - - if self.discard_casava: - f1 = f2 = CasavaFilter() - self._filters.append(filter_wrapper(None, f1, f2)) - - if int(self.discard_trimmed) + int(self.discard_untrimmed) + int(outfiles.untrimmed is not None) > 1: - raise ValueError('discard_trimmed, discard_untrimmed and outfiles.untrimmed must not ' - 'be set simultaneously') - - if outfiles.demultiplex: - self._demultiplexer = self._create_demultiplexer(outfiles) - self._filters.append(self._demultiplexer) - else: - # Set up the remaining filters to deal with --discard-trimmed, - # --discard-untrimmed and --untrimmed-output. These options - # are mutually exclusive in order to avoid brain damage. 
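The comment above refers to three options that all claim reads without an adapter match, which is why at most one of them may be active at a time; the check a few lines earlier simply counts how many of the corresponding booleans are set. As a rough illustration of the routing being wired up here, a standalone sketch (the function and parameter names are invented for this example and do not appear in the patch)::

    def route_untrimmed(read, matches, writer=None, discard=False):
        # A read with no adapter match is "untrimmed" and goes to exactly
        # one destination. Returning True stops the filter chain.
        if matches:
            return False  # trimmed read: later filters and writers see it
        if discard:
            return True   # --discard-untrimmed: drop the read
        if writer is not None:
            writer.write(read)  # --untrimmed-output: redirect the read
            return True
        return False  # default: the read stays in the regular output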
- if self.discard_trimmed: - self._filters.append(filter_wrapper(None, DiscardTrimmedFilter(), DiscardTrimmedFilter())) - elif self.discard_untrimmed: - self._filters.append(filter_wrapper(None, DiscardUntrimmedFilter(), DiscardUntrimmedFilter())) - elif outfiles.untrimmed: - untrimmed_writer = self._open_writer(outfiles.untrimmed, outfiles.untrimmed2) - self._filters.append(filter_wrapper(untrimmed_writer, DiscardUntrimmedFilter(), DiscardUntrimmedFilter())) - self._filters.append(self._final_filter(outfiles)) - - def close(self): - for f in self._outfiles: - # TODO do not use hasattr - if f is not None and f is not sys.stdin and f is not sys.stdout and hasattr(f, 'close'): - f.close() - if self._demultiplexer is not None: - self._demultiplexer.close() - - @property - def uses_qualities(self): - return self._reader.delivers_qualities - - def run(self): - (n, total1_bp, total2_bp) = self.process_reads() - # TODO - m = self._modifiers - m2 = getattr(self, '_modifiers2', []) - stats = Statistics() - stats.collect(n, total1_bp, total2_bp, m, m2, self._filters) - return stats - - def process_reads(self): - raise NotImplementedError() - - def _filter_wrapper(self): - raise NotImplementedError() - - def _final_filter(self, outfiles): - raise NotImplementedError() - - def _create_demultiplexer(self, outfiles): - raise NotImplementedError() + """ + Processing pipeline that loops over reads and applies modifiers and filters + """ + should_warn_legacy = False + n_adapters = 0 + + def __init__(self, ): + self._close_files = [] + self._reader = None + self._filters = [] + self._modifiers = [] + self._outfiles = None + self._demultiplexer = None + + # Filter settings + self._minimum_length = None + self._maximum_length = None + self.max_n = None + self.discard_casava = False + self.discard_trimmed = False + self.discard_untrimmed = False + + def set_input(self, file1, file2=None, fileformat=None, interleaved=False): + self._reader = dnaio.open(file1, file2=file2, fileformat=fileformat, + interleaved=interleaved, mode='r') + # Special treatment: Disable zero-capping if no qualities are available + if not self._reader.delivers_qualities: + self._modifiers = [m for m in self._modifiers if not isinstance(m, ZeroCapper)] + + def _open_writer(self, file, file2, **kwargs): + # TODO backwards-incompatible change (?) 
would be to use outfiles.interleaved + # for all outputs + return dnaio.open(file, file2=file2, mode='w', qualities=self.uses_qualities, + **kwargs) + + def set_output(self, outfiles): + self._filters = [] + self._outfiles = outfiles + filter_wrapper = self._filter_wrapper() + + if outfiles.rest: + self._filters.append(RestFileWriter(outfiles.rest)) + if outfiles.info: + self._filters.append(InfoFileWriter(outfiles.info)) + if outfiles.wildcard: + self._filters.append(WildcardFileWriter(outfiles.wildcard)) + + # minimum length and maximum length + for lengths, file1, file2, filter_class in ( + (self._minimum_length, outfiles.too_short, outfiles.too_short2, TooShortReadFilter), + (self._maximum_length, outfiles.too_long, outfiles.too_long2, TooLongReadFilter) + ): + writer = None + if lengths is not None: + if file1: + writer = self._open_writer(file1, file2) + f1 = filter_class(lengths[0]) if lengths[0] is not None else None + if len(lengths) == 2 and lengths[1] is not None: + f2 = filter_class(lengths[1]) + else: + f2 = None + self._filters.append(filter_wrapper(writer, filter=f1, filter2=f2)) + + if self.max_n is not None: + f1 = f2 = NContentFilter(self.max_n) + self._filters.append(filter_wrapper(None, f1, f2)) + + if self.discard_casava: + f1 = f2 = CasavaFilter() + self._filters.append(filter_wrapper(None, f1, f2)) + + if int(self.discard_trimmed) + int(self.discard_untrimmed) + int(outfiles.untrimmed is not None) > 1: + raise ValueError('discard_trimmed, discard_untrimmed and outfiles.untrimmed must not ' + 'be set simultaneously') + + if outfiles.demultiplex: + self._demultiplexer = self._create_demultiplexer(outfiles) + self._filters.append(self._demultiplexer) + else: + # Set up the remaining filters to deal with --discard-trimmed, + # --discard-untrimmed and --untrimmed-output. These options + # are mutually exclusive in order to avoid brain damage. + if self.discard_trimmed: + self._filters.append(filter_wrapper(None, DiscardTrimmedFilter(), DiscardTrimmedFilter())) + elif self.discard_untrimmed: + self._filters.append(filter_wrapper(None, DiscardUntrimmedFilter(), DiscardUntrimmedFilter())) + elif outfiles.untrimmed: + untrimmed_writer = self._open_writer(outfiles.untrimmed, outfiles.untrimmed2) + self._filters.append(filter_wrapper(untrimmed_writer, DiscardUntrimmedFilter(), DiscardUntrimmedFilter())) + self._filters.append(self._final_filter(outfiles)) + + def close(self): + for f in self._outfiles: + # TODO do not use hasattr + if f is not None and f is not sys.stdin and f is not sys.stdout and hasattr(f, 'close'): + f.close() + if self._demultiplexer is not None: + self._demultiplexer.close() + + @property + def uses_qualities(self): + return self._reader.delivers_qualities + + def run(self): + (n, total1_bp, total2_bp) = self.process_reads() + # TODO + m = self._modifiers + m2 = getattr(self, '_modifiers2', []) + stats = Statistics() + stats.collect(n, total1_bp, total2_bp, m, m2, self._filters) + return stats + + def process_reads(self): + raise NotImplementedError() + + def _filter_wrapper(self): + raise NotImplementedError() + + def _final_filter(self, outfiles): + raise NotImplementedError() + + def _create_demultiplexer(self, outfiles): + raise NotImplementedError() class SingleEndPipeline(Pipeline): - """ - Processing pipeline for single-end reads - """ - paired = False - - def __init__(self): - super().__init__() - self._modifiers = [] - - def add(self, modifier): - self._modifiers.append(modifier) - - def add1(self, modifier): - """An alias for the add() function. 
Makes the interface similar to PairedEndPipeline""" - self.add(modifier) - - def process_reads(self): - """Run the pipeline. Return statistics""" - n = 0 # no. of processed reads # TODO turn into attribute - total_bp = 0 - for read in self._reader: - n += 1 - total_bp += len(read.sequence) - matches = [] - for modifier in self._modifiers: - read = modifier(read, matches) - for filter_ in self._filters: - if filter_(read, matches): - break - return (n, total_bp, None) - - def _filter_wrapper(self): - return Redirector - - def _final_filter(self, outfiles): - writer = self._open_writer(outfiles.out, outfiles.out2) - return NoFilter(writer) - - def _create_demultiplexer(self, outfiles): - return Demultiplexer(outfiles.out, outfiles.untrimmed, qualities=self.uses_qualities) - - @property - def minimum_length(self): - return self._minimum_length - - @minimum_length.setter - def minimum_length(self, value): - assert value is None or len(value) == 1 - self._minimum_length = value - - @property - def maximum_length(self): - return self._maximum_length - - @maximum_length.setter - def maximum_length(self, value): - assert value is None or len(value) == 1 - self._maximum_length = value + """ + Processing pipeline for single-end reads + """ + paired = False + + def __init__(self): + super().__init__() + self._modifiers = [] + + def add(self, modifier): + self._modifiers.append(modifier) + + def add1(self, modifier): + """An alias for the add() function. Makes the interface similar to PairedEndPipeline""" + self.add(modifier) + + def process_reads(self): + """Run the pipeline. Return statistics""" + n = 0 # no. of processed reads # TODO turn into attribute + total_bp = 0 + for read in self._reader: + n += 1 + total_bp += len(read.sequence) + matches = [] + for modifier in self._modifiers: + read = modifier(read, matches) + for filter_ in self._filters: + if filter_(read, matches): + break + return (n, total_bp, None) + + def _filter_wrapper(self): + return Redirector + + def _final_filter(self, outfiles): + writer = self._open_writer(outfiles.out, outfiles.out2) + return NoFilter(writer) + + def _create_demultiplexer(self, outfiles): + return Demultiplexer(outfiles.out, outfiles.untrimmed, qualities=self.uses_qualities) + + @property + def minimum_length(self): + return self._minimum_length + + @minimum_length.setter + def minimum_length(self, value): + assert value is None or len(value) == 1 + self._minimum_length = value + + @property + def maximum_length(self): + return self._maximum_length + + @maximum_length.setter + def maximum_length(self, value): + assert value is None or len(value) == 1 + self._maximum_length = value class PairedEndPipeline(Pipeline): - """ - Processing pipeline for paired-end reads. - """ - - def __init__(self, pair_filter_mode, modify_first_read_only=False): - """Setting modify_first_read_only to True enables "legacy mode" - """ - super().__init__() - self._modifiers2 = [] - self._pair_filter_mode = pair_filter_mode - self._modify_first_read_only = modify_first_read_only - self._add_both_called = False - self._should_warn_legacy = False - self._reader = None - - def set_input(self, *args, **kwargs): - super().set_input(*args, **kwargs) - if not self._reader.delivers_qualities: - self._modifiers2 = [m for m in self._modifiers2 if not isinstance(m, ZeroCapper)] - - def add(self, modifier): - """ - Add a modifier for R1 and R2. If modify_first_read_only is True, - the modifier is not added for R2. 
- """ - self._modifiers.append(modifier) - if not self._modify_first_read_only: - modifier2 = copy.copy(modifier) - self._modifiers2.append(modifier2) - else: - self._should_warn_legacy = True - - def add1(self, modifier): - """Add a modifier for R1 only""" - self._modifiers.append(modifier) - - def add2(self, modifier): - """Add a modifier for R2 only""" - assert not self._modify_first_read_only - self._modifiers2.append(modifier) - - def process_reads(self): - n = 0 # no. of processed reads - total1_bp = 0 - total2_bp = 0 - for read1, read2 in self._reader: - n += 1 - total1_bp += len(read1.sequence) - total2_bp += len(read2.sequence) - matches1 = [] - matches2 = [] - for modifier in self._modifiers: - read1 = modifier(read1, matches1) - for modifier in self._modifiers2: - read2 = modifier(read2, matches2) - for filter in self._filters: - # Stop writing as soon as one of the filters was successful. - if filter(read1, read2, matches1, matches2): - break - return (n, total1_bp, total2_bp) - - @property - def should_warn_legacy(self): - return self._should_warn_legacy - - @should_warn_legacy.setter - def should_warn_legacy(self, value): - self._should_warn_legacy = bool(value) - - @property - def paired(self): - return 'first' if self._modify_first_read_only else 'both' - - def _filter_wrapper(self): - return functools.partial(PairedRedirector, pair_filter_mode=self._pair_filter_mode) - - def _final_filter(self, outfiles): - writer = self._open_writer(outfiles.out, outfiles.out2, interleaved=outfiles.interleaved) - return PairedNoFilter(writer) - - def _create_demultiplexer(self, outfiles): - return PairedEndDemultiplexer(outfiles.out, outfiles.out2, - outfiles.untrimmed, outfiles.untrimmed2, qualities=self.uses_qualities) - - @property - def minimum_length(self): - return self._minimum_length - - @minimum_length.setter - def minimum_length(self, value): - assert value is None or len(value) == 2 - self._minimum_length = value - - @property - def maximum_length(self): - return self._maximum_length - - @maximum_length.setter - def maximum_length(self, value): - assert value is None or len(value) == 2 - self._maximum_length = value + """ + Processing pipeline for paired-end reads. + """ + + def __init__(self, pair_filter_mode, modify_first_read_only=False): + """Setting modify_first_read_only to True enables "legacy mode" + """ + super().__init__() + self._modifiers2 = [] + self._pair_filter_mode = pair_filter_mode + self._modify_first_read_only = modify_first_read_only + self._add_both_called = False + self._should_warn_legacy = False + self._reader = None + + def set_input(self, *args, **kwargs): + super().set_input(*args, **kwargs) + if not self._reader.delivers_qualities: + self._modifiers2 = [m for m in self._modifiers2 if not isinstance(m, ZeroCapper)] + + def add(self, modifier): + """ + Add a modifier for R1 and R2. If modify_first_read_only is True, + the modifier is not added for R2. + """ + self._modifiers.append(modifier) + if not self._modify_first_read_only: + modifier2 = copy.copy(modifier) + self._modifiers2.append(modifier2) + else: + self._should_warn_legacy = True + + def add1(self, modifier): + """Add a modifier for R1 only""" + self._modifiers.append(modifier) + + def add2(self, modifier): + """Add a modifier for R2 only""" + assert not self._modify_first_read_only + self._modifiers2.append(modifier) + + def process_reads(self): + n = 0 # no. 
of processed reads + total1_bp = 0 + total2_bp = 0 + for read1, read2 in self._reader: + n += 1 + total1_bp += len(read1.sequence) + total2_bp += len(read2.sequence) + matches1 = [] + matches2 = [] + for modifier in self._modifiers: + read1 = modifier(read1, matches1) + for modifier in self._modifiers2: + read2 = modifier(read2, matches2) + for filter in self._filters: + # Stop writing as soon as one of the filters was successful. + if filter(read1, read2, matches1, matches2): + break + return (n, total1_bp, total2_bp) + + @property + def should_warn_legacy(self): + return self._should_warn_legacy + + @should_warn_legacy.setter + def should_warn_legacy(self, value): + self._should_warn_legacy = bool(value) + + @property + def paired(self): + return 'first' if self._modify_first_read_only else 'both' + + def _filter_wrapper(self): + return functools.partial(PairedRedirector, pair_filter_mode=self._pair_filter_mode) + + def _final_filter(self, outfiles): + writer = self._open_writer(outfiles.out, outfiles.out2, interleaved=outfiles.interleaved) + return PairedNoFilter(writer) + + def _create_demultiplexer(self, outfiles): + return PairedEndDemultiplexer(outfiles.out, outfiles.out2, + outfiles.untrimmed, outfiles.untrimmed2, qualities=self.uses_qualities) + + @property + def minimum_length(self): + return self._minimum_length + + @minimum_length.setter + def minimum_length(self, value): + assert value is None or len(value) == 2 + self._minimum_length = value + + @property + def maximum_length(self): + return self._maximum_length + + @maximum_length.setter + def maximum_length(self, value): + assert value is None or len(value) == 2 + self._maximum_length = value def reader_process(file, file2, connections, queue, buffer_size, stdin_fd): - """ - Read chunks of FASTA or FASTQ data from *file* and send to a worker. - - queue -- a Queue of worker indices. A worker writes its own index into this - queue to notify the reader that it is ready to receive more data. - connections -- a list of Connection objects, one for each worker. - - The function repeatedly - - - reads a chunk from the file - - reads a worker index from the Queue - - sends the chunk to connections[index] - - and finally sends "poison pills" (the value -1) to all connections. - """ - if stdin_fd != -1: - sys.stdin.close() - sys.stdin = os.fdopen(stdin_fd) - try: - with xopen(file, 'rb') as f: - if file2: - with xopen(file2, 'rb') as f2: - for chunk_index, (chunk1, chunk2) in enumerate(dnaio.read_paired_chunks(f, f2, buffer_size)): - # Determine the worker that should get this chunk - worker_index = queue.get() - pipe = connections[worker_index] - pipe.send(chunk_index) - pipe.send_bytes(chunk1) - pipe.send_bytes(chunk2) - else: - for chunk_index, chunk in enumerate(dnaio.read_chunks(f, buffer_size)): - # Determine the worker that should get this chunk - worker_index = queue.get() - pipe = connections[worker_index] - pipe.send(chunk_index) - pipe.send_bytes(chunk) - - # Send poison pills to all workers - for _ in range(len(connections)): - worker_index = queue.get() - connections[worker_index].send(-1) - except Exception as e: - # TODO better send this to a common "something went wrong" Queue - for worker_index in range(len(connections)): - connections[worker_index].send(-2) - connections[worker_index].send((e, traceback.format_exc())) + """ + Read chunks of FASTA or FASTQ data from *file* and send to a worker. + + queue -- a Queue of worker indices. 
A worker writes its own index into this + queue to notify the reader that it is ready to receive more data. + connections -- a list of Connection objects, one for each worker. + + The function repeatedly + + - reads a chunk from the file + - reads a worker index from the Queue + - sends the chunk to connections[index] + + and finally sends "poison pills" (the value -1) to all connections. + """ + if stdin_fd != -1: + sys.stdin.close() + sys.stdin = os.fdopen(stdin_fd) + try: + with xopen(file, 'rb') as f: + if file2: + with xopen(file2, 'rb') as f2: + for chunk_index, (chunk1, chunk2) in enumerate(dnaio.read_paired_chunks(f, f2, buffer_size)): + # Determine the worker that should get this chunk + worker_index = queue.get() + pipe = connections[worker_index] + pipe.send(chunk_index) + pipe.send_bytes(chunk1) + pipe.send_bytes(chunk2) + else: + for chunk_index, chunk in enumerate(dnaio.read_chunks(f, buffer_size)): + # Determine the worker that should get this chunk + worker_index = queue.get() + pipe = connections[worker_index] + pipe.send(chunk_index) + pipe.send_bytes(chunk) + + # Send poison pills to all workers + for _ in range(len(connections)): + worker_index = queue.get() + connections[worker_index].send(-1) + except Exception as e: + # TODO better send this to a common "something went wrong" Queue + for worker_index in range(len(connections)): + connections[worker_index].send(-2) + connections[worker_index].send((e, traceback.format_exc())) class WorkerProcess(Process): - """ - The worker repeatedly reads chunks of data from the read_pipe, runs the pipeline on it - and sends the processed chunks to the write_pipe. - - To notify the reader process that it wants data, it puts its own identifier into the - need_work_queue before attempting to read data from the read_pipe. 
- """ - def __init__(self, id_, pipeline, input_path1, input_path2, - interleaved_input, orig_outfiles, read_pipe, write_pipe, need_work_queue): - super().__init__() - self._id = id_ - self._pipeline = pipeline - self._input_path1 = input_path1 - self._input_path2 = input_path2 - self._interleaved_input = interleaved_input - self._orig_outfiles = orig_outfiles - self._read_pipe = read_pipe - self._write_pipe = write_pipe - self._need_work_queue = need_work_queue - - def run(self): - try: - stats = Statistics() - while True: - # Notify reader that we need data - self._need_work_queue.put(self._id) - chunk_index = self._read_pipe.recv() - if chunk_index == -1: - # reader is done - break - elif chunk_index == -2: - # An exception has occurred in the reader - e, tb_str = self._read_pipe.recv() - logger.error('%s', tb_str) - raise e - - # Setting the .buffer.name attributess below is necessary because - # file format detection uses the file name - data = self._read_pipe.recv_bytes() - input = io.BytesIO(data) - input.name = self._input_path1 - - if self._input_path2: - data = self._read_pipe.recv_bytes() - input2 = io.BytesIO(data) - input2.name = self._input_path2 - else: - input2 = None - output = io.BytesIO() - output.name = self._orig_outfiles.out.name - - if self._orig_outfiles.out2 is not None: - output2 = io.BytesIO() - output2.name = self._orig_outfiles.out2.name - else: - output2 = None - - outfiles = OutputFiles(out=output, out2=output2, interleaved=self._orig_outfiles.interleaved) - self._pipeline.set_input(input, input2, interleaved=self._interleaved_input) - self._pipeline.set_output(outfiles) - (n, bp1, bp2) = self._pipeline.process_reads() - cur_stats = Statistics() - cur_stats.collect(n, bp1, bp2, [], [], self._pipeline._filters) - stats += cur_stats - - output.flush() - processed_chunk = output.getvalue() - - self._write_pipe.send(chunk_index) - self._write_pipe.send_bytes(processed_chunk) - if self._orig_outfiles.out2 is not None: - output2.flush() - processed_chunk2 = output2.getvalue() - self._write_pipe.send_bytes(processed_chunk2) - - m = self._pipeline._modifiers - m2 = getattr(self._pipeline, '_modifiers2', []) - modifier_stats = Statistics() - modifier_stats.collect(0, 0, 0 if self._pipeline.paired else None, m, m2, []) - stats += modifier_stats - self._write_pipe.send(-1) - self._write_pipe.send(stats) - except Exception as e: - self._write_pipe.send(-2) - self._write_pipe.send((e, traceback.format_exc())) + """ + The worker repeatedly reads chunks of data from the read_pipe, runs the pipeline on it + and sends the processed chunks to the write_pipe. + + To notify the reader process that it wants data, it puts its own identifier into the + need_work_queue before attempting to read data from the read_pipe. 
+ """ + def __init__(self, id_, pipeline, input_path1, input_path2, + interleaved_input, orig_outfiles, read_pipe, write_pipe, need_work_queue): + super().__init__() + self._id = id_ + self._pipeline = pipeline + self._input_path1 = input_path1 + self._input_path2 = input_path2 + self._interleaved_input = interleaved_input + self._orig_outfiles = orig_outfiles + self._read_pipe = read_pipe + self._write_pipe = write_pipe + self._need_work_queue = need_work_queue + + def run(self): + try: + stats = Statistics() + while True: + # Notify reader that we need data + self._need_work_queue.put(self._id) + chunk_index = self._read_pipe.recv() + if chunk_index == -1: + # reader is done + break + elif chunk_index == -2: + # An exception has occurred in the reader + e, tb_str = self._read_pipe.recv() + logger.error('%s', tb_str) + raise e + + # Setting the .buffer.name attributess below is necessary because + # file format detection uses the file name + data = self._read_pipe.recv_bytes() + input = io.BytesIO(data) + input.name = self._input_path1 + + if self._input_path2: + data = self._read_pipe.recv_bytes() + input2 = io.BytesIO(data) + input2.name = self._input_path2 + else: + input2 = None + output = io.BytesIO() + output.name = self._orig_outfiles.out.name + + if self._orig_outfiles.out2 is not None: + output2 = io.BytesIO() + output2.name = self._orig_outfiles.out2.name + else: + output2 = None + + outfiles = OutputFiles(out=output, out2=output2, interleaved=self._orig_outfiles.interleaved) + self._pipeline.set_input(input, input2, interleaved=self._interleaved_input) + self._pipeline.set_output(outfiles) + (n, bp1, bp2) = self._pipeline.process_reads() + cur_stats = Statistics() + cur_stats.collect(n, bp1, bp2, [], [], self._pipeline._filters) + stats += cur_stats + + output.flush() + processed_chunk = output.getvalue() + + self._write_pipe.send(chunk_index) + self._write_pipe.send_bytes(processed_chunk) + if self._orig_outfiles.out2 is not None: + output2.flush() + processed_chunk2 = output2.getvalue() + self._write_pipe.send_bytes(processed_chunk2) + + m = self._pipeline._modifiers + m2 = getattr(self._pipeline, '_modifiers2', []) + modifier_stats = Statistics() + modifier_stats.collect(0, 0, 0 if self._pipeline.paired else None, m, m2, []) + stats += modifier_stats + self._write_pipe.send(-1) + self._write_pipe.send(stats) + except Exception as e: + self._write_pipe.send(-2) + self._write_pipe.send((e, traceback.format_exc())) class OrderedChunkWriter: - """ - We may receive chunks of processed data from worker processes - in any order. This class writes them to an output file in - the correct order. - """ - def __init__(self, outfile): - self._chunks = dict() - self._current_index = 0 - self._outfile = outfile - - def write(self, data, chunk_index): - """ - """ - self._chunks[chunk_index] = data - while self._current_index in self._chunks: - self._outfile.write(self._chunks[self._current_index]) - del self._chunks[self._current_index] - self._current_index += 1 - - def wrote_everything(self): - return not self._chunks + """ + We may receive chunks of processed data from worker processes + in any order. This class writes them to an output file in + the correct order. 
+ """ + def __init__(self, outfile): + self._chunks = dict() + self._current_index = 0 + self._outfile = outfile + + def write(self, data, chunk_index): + """ + """ + self._chunks[chunk_index] = data + while self._current_index in self._chunks: + self._outfile.write(self._chunks[self._current_index]) + del self._chunks[self._current_index] + self._current_index += 1 + + def wrote_everything(self): + return not self._chunks class ParallelPipelineRunner: - """ - Run a Pipeline in parallel - - - When set_input() is called, a reader process is spawned. - - When run() is called, as many worker processes as requested are spawned. - - In the main process, results are written to the output files in the correct - order, and statistics are aggregated. - - If a worker needs work, it puts its own index into a Queue() (_need_work_queue). - The reader process listens on this queue and sends the raw data to the - worker that has requested work. For sending the data from reader to worker, - a Connection() is used. There is one such connection for each worker (self._pipes). - - For sending the processed data from the worker to the main process, there - is a second set of connections, again one for each worker. - - When the reader is finished, it sends 'poison pills' to all workers. - When a worker receives this, it sends a poison pill to the main process, - followed by a Statistics object that contains statistics about all the reads - processed by that worker. - """ - - def __init__(self, pipeline, n_workers, buffer_size=4*1024**2): - self._pipeline = pipeline - self._pipes = [] # the workers read from these - self._reader_process = None - self._outfiles = None - self._input_path1 = None - self._input_path2 = None - self._interleaved_input = None - self._n_workers = n_workers - self._need_work_queue = Queue() - self._buffer_size = buffer_size - - def set_input(self, file1, file2=None, fileformat=None, interleaved=False): - if self._reader_process is not None: - raise RuntimeError('Do not call set_input more than once') - assert fileformat is None - self._input_path1 = file1 if type(file1) is str else file1.name - self._input_path2 = file2 if type(file2) is str or file2 is None else file2.name - self._interleaved_input = interleaved - connections = [Pipe(duplex=False) for _ in range(self._n_workers)] - self._pipes, connw = zip(*connections) - try: - fileno = sys.stdin.fileno() - except io.UnsupportedOperation: - # This happens during tests: pytest sets sys.stdin to an object - # that does not have a file descriptor. 
- fileno = -1 - self._reader_process = Process(target=reader_process, args=(file1, file2, connw, - self._need_work_queue, self._buffer_size, fileno)) - self._reader_process.daemon = True - self._reader_process.start() - - @staticmethod - def can_output_to(outfiles): - return ( - outfiles.out is not None - and outfiles.rest is None - and outfiles.info is None - and outfiles.wildcard is None - and outfiles.too_short is None - and outfiles.too_short2 is None - and outfiles.too_long is None - and outfiles.too_long2 is None - and outfiles.untrimmed is None - and outfiles.untrimmed2 is None - and not outfiles.demultiplex - ) - - def set_output(self, outfiles): - if not self.can_output_to(outfiles): - raise ValueError() - self._outfiles = outfiles - - def _start_workers(self): - workers = [] - connections = [] - for index in range(self._n_workers): - conn_r, conn_w = Pipe(duplex=False) - connections.append(conn_r) - worker = WorkerProcess( - index, self._pipeline, - self._input_path1, self._input_path2, - self._interleaved_input, self._outfiles, - self._pipes[index], conn_w, self._need_work_queue) - worker.daemon = True - worker.start() - workers.append(worker) - return workers, connections - - def run(self): - workers, connections = self._start_workers() - writers = [] - for outfile in [self._outfiles.out, self._outfiles.out2]: - if outfile is None: - continue - writers.append(OrderedChunkWriter(outfile)) - stats = None - while connections: - ready_connections = multiprocessing.connection.wait(connections) - for connection in ready_connections: - chunk_index = connection.recv() - if chunk_index == -1: - # the worker is done - cur_stats = connection.recv() - if stats == -2: - # An exception has occurred in the worker (see below, - # this happens only when there is an exception sending - # the statistics) - e, tb_str = connection.recv() - # TODO traceback should only be printed in development - logger.debug('%s', tb_str) - raise e - if stats is None: - stats = cur_stats - else: - stats += cur_stats - connections.remove(connection) - continue - elif chunk_index == -2: - # An exception has occurred in the worker - e, tb_str = connection.recv() - - # TODO traceback should only be printed in development - # We should use the worker's actual traceback object - # here, but traceback objects are not picklable. - logger.debug('%s', tb_str) - raise e - - for writer in writers: - data = connection.recv_bytes() - writer.write(data, chunk_index) - for writer in writers: - assert writer.wrote_everything() - for w in workers: - w.join() - self._reader_process.join() - return stats - - def close(self): - for f in self._outfiles: - # TODO do not use hasattr - if f is not None and f is not sys.stdin and f is not sys.stdout and hasattr(f, 'close'): - f.close() + """ + Run a Pipeline in parallel + + - When set_input() is called, a reader process is spawned. + - When run() is called, as many worker processes as requested are spawned. + - In the main process, results are written to the output files in the correct + order, and statistics are aggregated. + + If a worker needs work, it puts its own index into a Queue() (_need_work_queue). + The reader process listens on this queue and sends the raw data to the + worker that has requested work. For sending the data from reader to worker, + a Connection() is used. There is one such connection for each worker (self._pipes). + + For sending the processed data from the worker to the main process, there + is a second set of connections, again one for each worker. 
+
+    When the reader is finished, it sends 'poison pills' to all workers.
+    When a worker receives this, it sends a poison pill to the main process,
+    followed by a Statistics object that contains statistics about all the reads
+    processed by that worker.
+    """
+
+    def __init__(self, pipeline, n_workers, buffer_size=4*1024**2):
+        self._pipeline = pipeline
+        self._pipes = []  # the workers read from these
+        self._reader_process = None
+        self._outfiles = None
+        self._input_path1 = None
+        self._input_path2 = None
+        self._interleaved_input = None
+        self._n_workers = n_workers
+        self._need_work_queue = Queue()
+        self._buffer_size = buffer_size
+
+    def set_input(self, file1, file2=None, fileformat=None, interleaved=False):
+        if self._reader_process is not None:
+            raise RuntimeError('Do not call set_input more than once')
+        assert fileformat is None
+        self._input_path1 = file1 if type(file1) is str else file1.name
+        self._input_path2 = file2 if type(file2) is str or file2 is None else file2.name
+        self._interleaved_input = interleaved
+        connections = [Pipe(duplex=False) for _ in range(self._n_workers)]
+        self._pipes, connw = zip(*connections)
+        try:
+            fileno = sys.stdin.fileno()
+        except io.UnsupportedOperation:
+            # This happens during tests: pytest sets sys.stdin to an object
+            # that does not have a file descriptor.
+            fileno = -1
+        self._reader_process = Process(target=reader_process, args=(file1, file2, connw,
+            self._need_work_queue, self._buffer_size, fileno))
+        self._reader_process.daemon = True
+        self._reader_process.start()
+
+    @staticmethod
+    def can_output_to(outfiles):
+        return (
+            outfiles.out is not None
+            and outfiles.rest is None
+            and outfiles.info is None
+            and outfiles.wildcard is None
+            and outfiles.too_short is None
+            and outfiles.too_short2 is None
+            and outfiles.too_long is None
+            and outfiles.too_long2 is None
+            and outfiles.untrimmed is None
+            and outfiles.untrimmed2 is None
+            and not outfiles.demultiplex
+        )
+
+    def set_output(self, outfiles):
+        if not self.can_output_to(outfiles):
+            raise ValueError()
+        self._outfiles = outfiles
+
+    def _start_workers(self):
+        workers = []
+        connections = []
+        for index in range(self._n_workers):
+            conn_r, conn_w = Pipe(duplex=False)
+            connections.append(conn_r)
+            worker = WorkerProcess(
+                index, self._pipeline,
+                self._input_path1, self._input_path2,
+                self._interleaved_input, self._outfiles,
+                self._pipes[index], conn_w, self._need_work_queue)
+            worker.daemon = True
+            worker.start()
+            workers.append(worker)
+        return workers, connections
+
+    def run(self):
+        workers, connections = self._start_workers()
+        writers = []
+        for outfile in [self._outfiles.out, self._outfiles.out2]:
+            if outfile is None:
+                continue
+            writers.append(OrderedChunkWriter(outfile))
+        stats = None
+        while connections:
+            ready_connections = multiprocessing.connection.wait(connections)
+            for connection in ready_connections:
+                chunk_index = connection.recv()
+                if chunk_index == -1:
+                    # the worker is done
+                    cur_stats = connection.recv()
+                    if cur_stats == -2:
+                        # An exception has occurred in the worker (see below,
+                        # this happens only when there is an exception sending
+                        # the statistics)
+                        e, tb_str = connection.recv()
+                        # TODO traceback should only be printed in development
+                        logger.debug('%s', tb_str)
+                        raise e
+                    if stats is None:
+                        stats = cur_stats
+                    else:
+                        stats += cur_stats
+                    connections.remove(connection)
+                    continue
+                elif chunk_index == -2:
+                    # An exception has occurred in the worker
+                    e, tb_str = connection.recv()
+
+                    # TODO traceback should only be printed in
development + # We should use the worker's actual traceback object + # here, but traceback objects are not picklable. + logger.debug('%s', tb_str) + raise e + + for writer in writers: + data = connection.recv_bytes() + writer.write(data, chunk_index) + for writer in writers: + assert writer.wrote_everything() + for w in workers: + w.join() + self._reader_process.join() + return stats + + def close(self): + for f in self._outfiles: + # TODO do not use hasattr + if f is not None and f is not sys.stdin and f is not sys.stdout and hasattr(f, 'close'): + f.close() diff --git a/src/cutadapt/qualtrim.pyx b/src/cutadapt/qualtrim.pyx index 098bd310..8112a916 100644 --- a/src/cutadapt/qualtrim.pyx +++ b/src/cutadapt/qualtrim.pyx @@ -4,81 +4,81 @@ Quality trimming. """ def quality_trim_index(str qualities, int cutoff_front, int cutoff_back, int base=33): - """ - Find the positions at which to trim low-quality ends from a nucleotide sequence. - Return tuple (start, stop) that indicates the good-quality segment. + """ + Find the positions at which to trim low-quality ends from a nucleotide sequence. + Return tuple (start, stop) that indicates the good-quality segment. - Qualities are assumed to be ASCII-encoded as chr(qual + base). + Qualities are assumed to be ASCII-encoded as chr(qual + base). - The algorithm is the same as the one used by BWA within the function - 'bwa_trim_read': - - Subtract the cutoff value from all qualities. - - Compute partial sums from all indices to the end of the sequence. - - Trim sequence at the index at which the sum is minimal. - """ - cdef: - int s - int max_qual - int stop = len(qualities) - int start = 0 - int i + The algorithm is the same as the one used by BWA within the function + 'bwa_trim_read': + - Subtract the cutoff value from all qualities. + - Compute partial sums from all indices to the end of the sequence. + - Trim sequence at the index at which the sum is minimal. + """ + cdef: + int s + int max_qual + int stop = len(qualities) + int start = 0 + int i - # find trim position for 5' end - s = 0 - max_qual = 0 - for i in range(len(qualities)): - s += cutoff_front - (ord(qualities[i]) - base) - if s < 0: - break - if s > max_qual: - max_qual = s - start = i + 1 + # find trim position for 5' end + s = 0 + max_qual = 0 + for i in range(len(qualities)): + s += cutoff_front - (ord(qualities[i]) - base) + if s < 0: + break + if s > max_qual: + max_qual = s + start = i + 1 - # same for 3' end - max_qual = 0 - s = 0 - for i in reversed(xrange(len(qualities))): - s += cutoff_back - (ord(qualities[i]) - base) - if s < 0: - break - if s > max_qual: - max_qual = s - stop = i - if start >= stop: - start, stop = 0, 0 - return (start, stop) + # same for 3' end + max_qual = 0 + s = 0 + for i in reversed(xrange(len(qualities))): + s += cutoff_back - (ord(qualities[i]) - base) + if s < 0: + break + if s > max_qual: + max_qual = s + stop = i + if start >= stop: + start, stop = 0, 0 + return (start, stop) def nextseq_trim_index(sequence, int cutoff, int base=33): - """ - Variant of the above quality trimming routine that works on NextSeq data. - With Illumina NextSeq, bases are encoded with two colors. 'No color' (a - dark cycle) usually means that a 'G' was sequenced, but that also occurs - when sequencing falls off the end of the fragment. The read then contains - a run of high-quality G bases in the end. + """ + Variant of the above quality trimming routine that works on NextSeq data. + With Illumina NextSeq, bases are encoded with two colors. 
'No color' (a + dark cycle) usually means that a 'G' was sequenced, but that also occurs + when sequencing falls off the end of the fragment. The read then contains + a run of high-quality G bases in the end. - This routine works as the one above, but counts qualities belonging to 'G' - bases as being equal to cutoff - 1. - """ - bases = sequence.sequence - qualities = sequence.qualities - cdef: - int s = 0 - int max_qual = 0 - int max_i = len(qualities) - int i, q + This routine works as the one above, but counts qualities belonging to 'G' + bases as being equal to cutoff - 1. + """ + bases = sequence.sequence + qualities = sequence.qualities + cdef: + int s = 0 + int max_qual = 0 + int max_i = len(qualities) + int i, q - s = 0 - max_qual = 0 - max_i = len(qualities) - for i in reversed(xrange(max_i)): - q = ord(qualities[i]) - base - if bases[i] == 'G': - q = cutoff - 1 - s += cutoff - q - if s < 0: - break - if s > max_qual: - max_qual = s - max_i = i - return max_i + s = 0 + max_qual = 0 + max_i = len(qualities) + for i in reversed(xrange(max_i)): + q = ord(qualities[i]) - base + if bases[i] == 'G': + q = cutoff - 1 + s += cutoff - q + if s < 0: + break + if s > max_qual: + max_qual = s + max_i = i + return max_i diff --git a/src/cutadapt/report.py b/src/cutadapt/report.py index 6709199e..9e46aecc 100644 --- a/src/cutadapt/report.py +++ b/src/cutadapt/report.py @@ -7,421 +7,421 @@ from .adapters import BACK, BACK_NOT_INTERNAL, FRONT, FRONT_NOT_INTERNAL, PREFIX, SUFFIX, ANYWHERE, LINKED from .modifiers import QualityTrimmer, NextseqQualityTrimmer, AdapterCutter from .filters import (NoFilter, PairedNoFilter, TooShortReadFilter, TooLongReadFilter, - PairedEndDemultiplexer, Demultiplexer, NContentFilter, InfoFileWriter, WildcardFileWriter, - RestFileWriter) + PairedEndDemultiplexer, Demultiplexer, NContentFilter, InfoFileWriter, WildcardFileWriter, + RestFileWriter) def safe_divide(numerator, denominator): - if numerator is None or not denominator: - return 0.0 - else: - return numerator / denominator + if numerator is None or not denominator: + return 0.0 + else: + return numerator / denominator class Statistics: - def __init__(self): - """ - """ - self.paired = None - self.too_short = None - self.too_long = None - self.too_many_n = None - self.did_quality_trimming = None - self.n = 0 - self.written = 0 - self.total_bp = [0, 0] - self.written_bp = [0, 0] - self.with_adapters = [0, 0] - self.quality_trimmed_bp = [0, 0] - self.adapter_stats = [[], []] - - def __iadd__(self, other): - self.n += other.n - self.written += other.written - - if self.paired is None: - self.paired = other.paired - elif self.paired != other.paired: - raise ValueError('Incompatible Statistics: paired is not equal') - if self.did_quality_trimming is None: - self.did_quality_trimming = other.did_quality_trimming - elif self.did_quality_trimming != other.did_quality_trimming: - raise ValueError('Incompatible Statistics: did_quality_trimming is not equal') - - def add_if_not_none(a, b): - if a is None: - return b - if b is None: - return a - return a + b - self.too_short = add_if_not_none(self.too_short, other.too_short) - self.too_long = add_if_not_none(self.too_long, other.too_long) - self.too_many_n = add_if_not_none(self.too_many_n, other.too_many_n) - for i in (0, 1): - self.total_bp[i] += other.total_bp[i] - self.written_bp[i] += other.written_bp[i] - self.with_adapters[i] += other.with_adapters[i] - self.quality_trimmed_bp[i] += other.quality_trimmed_bp[i] - if self.adapter_stats[i] and other.adapter_stats[i]: - 
if len(self.adapter_stats[i]) != len(other.adapter_stats[i]): - raise ValueError('Incompatible Statistics objects (adapter_stats length)') - for j in range(len(self.adapter_stats[i])): - self.adapter_stats[i][j] += other.adapter_stats[i][j] - elif other.adapter_stats[i]: - assert self.adapter_stats[i] == [] - self.adapter_stats[i] = other.adapter_stats[i] - return self - - def collect(self, n, total_bp1, total_bp2, modifiers, modifiers2, writers): - """ - n -- total number of reads - total_bp1 -- number of bases in first reads - total_bp2 -- number of bases in second reads. None for single-end data. - """ - self.n = n - self.total_bp[0] = total_bp1 - if total_bp2 is None: - self.paired = False - else: - self.paired = True - self.total_bp[1] = total_bp2 - - # Collect statistics from writers/filters - for w in writers: - if isinstance(w, (InfoFileWriter, RestFileWriter, WildcardFileWriter)): - pass - elif isinstance(w, (NoFilter, PairedNoFilter, PairedEndDemultiplexer, Demultiplexer)): - self.written += w.written - self.written_bp[0] += w.written_bp[0] - self.written_bp[1] += w.written_bp[1] - elif isinstance(w.filter, TooShortReadFilter): - self.too_short = w.filtered - elif isinstance(w.filter, TooLongReadFilter): - self.too_long = w.filtered - elif isinstance(w.filter, NContentFilter): - self.too_many_n = w.filtered - assert self.written is not None - - # Collect statistics from modifiers - for i, modifiers_list in [(0, modifiers), (1, modifiers2)]: - for modifier in modifiers_list: - if isinstance(modifier, (QualityTrimmer, NextseqQualityTrimmer)): - self.quality_trimmed_bp[i] = modifier.trimmed_bases - self.did_quality_trimming = True - elif isinstance(modifier, AdapterCutter): - self.with_adapters[i] += modifier.with_adapters - self.adapter_stats[i] = list(modifier.adapter_statistics.values()) - - @property - def total(self): - return sum(self.total_bp) - - @property - def quality_trimmed(self): - return sum(self.quality_trimmed_bp) - - @property - def total_written_bp(self): - return sum(self.written_bp) - - @property - def written_fraction(self): - return safe_divide(self.written, self.n) - - @property - def with_adapters_fraction(self): - return [safe_divide(v, self.n) for v in self.with_adapters] - - @property - def quality_trimmed_fraction(self): - return safe_divide(self.quality_trimmed, self.total) - - @property - def total_written_bp_fraction(self): - return safe_divide(self.total_written_bp, self.total) - - @property - def too_short_fraction(self): - return safe_divide(self.too_short, self.n) - - @property - def too_long_fraction(self): - return safe_divide(self.too_long, self.n) - - @property - def too_many_n_fraction(self): - return safe_divide(self.too_many_n, self.n) + def __init__(self): + """ + """ + self.paired = None + self.too_short = None + self.too_long = None + self.too_many_n = None + self.did_quality_trimming = None + self.n = 0 + self.written = 0 + self.total_bp = [0, 0] + self.written_bp = [0, 0] + self.with_adapters = [0, 0] + self.quality_trimmed_bp = [0, 0] + self.adapter_stats = [[], []] + + def __iadd__(self, other): + self.n += other.n + self.written += other.written + + if self.paired is None: + self.paired = other.paired + elif self.paired != other.paired: + raise ValueError('Incompatible Statistics: paired is not equal') + if self.did_quality_trimming is None: + self.did_quality_trimming = other.did_quality_trimming + elif self.did_quality_trimming != other.did_quality_trimming: + raise ValueError('Incompatible Statistics: did_quality_trimming is not 
equal') + + def add_if_not_none(a, b): + if a is None: + return b + if b is None: + return a + return a + b + self.too_short = add_if_not_none(self.too_short, other.too_short) + self.too_long = add_if_not_none(self.too_long, other.too_long) + self.too_many_n = add_if_not_none(self.too_many_n, other.too_many_n) + for i in (0, 1): + self.total_bp[i] += other.total_bp[i] + self.written_bp[i] += other.written_bp[i] + self.with_adapters[i] += other.with_adapters[i] + self.quality_trimmed_bp[i] += other.quality_trimmed_bp[i] + if self.adapter_stats[i] and other.adapter_stats[i]: + if len(self.adapter_stats[i]) != len(other.adapter_stats[i]): + raise ValueError('Incompatible Statistics objects (adapter_stats length)') + for j in range(len(self.adapter_stats[i])): + self.adapter_stats[i][j] += other.adapter_stats[i][j] + elif other.adapter_stats[i]: + assert self.adapter_stats[i] == [] + self.adapter_stats[i] = other.adapter_stats[i] + return self + + def collect(self, n, total_bp1, total_bp2, modifiers, modifiers2, writers): + """ + n -- total number of reads + total_bp1 -- number of bases in first reads + total_bp2 -- number of bases in second reads. None for single-end data. + """ + self.n = n + self.total_bp[0] = total_bp1 + if total_bp2 is None: + self.paired = False + else: + self.paired = True + self.total_bp[1] = total_bp2 + + # Collect statistics from writers/filters + for w in writers: + if isinstance(w, (InfoFileWriter, RestFileWriter, WildcardFileWriter)): + pass + elif isinstance(w, (NoFilter, PairedNoFilter, PairedEndDemultiplexer, Demultiplexer)): + self.written += w.written + self.written_bp[0] += w.written_bp[0] + self.written_bp[1] += w.written_bp[1] + elif isinstance(w.filter, TooShortReadFilter): + self.too_short = w.filtered + elif isinstance(w.filter, TooLongReadFilter): + self.too_long = w.filtered + elif isinstance(w.filter, NContentFilter): + self.too_many_n = w.filtered + assert self.written is not None + + # Collect statistics from modifiers + for i, modifiers_list in [(0, modifiers), (1, modifiers2)]: + for modifier in modifiers_list: + if isinstance(modifier, (QualityTrimmer, NextseqQualityTrimmer)): + self.quality_trimmed_bp[i] = modifier.trimmed_bases + self.did_quality_trimming = True + elif isinstance(modifier, AdapterCutter): + self.with_adapters[i] += modifier.with_adapters + self.adapter_stats[i] = list(modifier.adapter_statistics.values()) + + @property + def total(self): + return sum(self.total_bp) + + @property + def quality_trimmed(self): + return sum(self.quality_trimmed_bp) + + @property + def total_written_bp(self): + return sum(self.written_bp) + + @property + def written_fraction(self): + return safe_divide(self.written, self.n) + + @property + def with_adapters_fraction(self): + return [safe_divide(v, self.n) for v in self.with_adapters] + + @property + def quality_trimmed_fraction(self): + return safe_divide(self.quality_trimmed, self.total) + + @property + def total_written_bp_fraction(self): + return safe_divide(self.total_written_bp, self.total) + + @property + def too_short_fraction(self): + return safe_divide(self.too_short, self.n) + + @property + def too_long_fraction(self): + return safe_divide(self.too_long, self.n) + + @property + def too_many_n_fraction(self): + return safe_divide(self.too_many_n, self.n) ADAPTER_TYPES = { - BACK: "regular 3'", - BACK_NOT_INTERNAL: "non-internal 3'", - FRONT: "regular 5'", - FRONT_NOT_INTERNAL: "non-internal 5'", - PREFIX: "anchored 5'", - SUFFIX: "anchored 3'", - ANYWHERE: "variable 5'/3'", - LINKED: 
"linked", + BACK: "regular 3'", + BACK_NOT_INTERNAL: "non-internal 3'", + FRONT: "regular 5'", + FRONT_NOT_INTERNAL: "non-internal 5'", + PREFIX: "anchored 5'", + SUFFIX: "anchored 3'", + ANYWHERE: "variable 5'/3'", + LINKED: "linked", } def print_error_ranges(adapter_length, error_rate): - print("No. of allowed errors:") - prev = 0 - for errors in range(1, int(error_rate * adapter_length) + 1): - r = int(errors / error_rate) - print("{}-{} bp: {};".format(prev, r - 1, errors - 1), end=' ') - prev = r - if prev == adapter_length: - print("{} bp: {}".format(adapter_length, int(error_rate * adapter_length))) - else: - print("{}-{} bp: {}".format(prev, adapter_length, int(error_rate * adapter_length))) - print() + print("No. of allowed errors:") + prev = 0 + for errors in range(1, int(error_rate * adapter_length) + 1): + r = int(errors / error_rate) + print("{}-{} bp: {};".format(prev, r - 1, errors - 1), end=' ') + prev = r + if prev == adapter_length: + print("{} bp: {}".format(adapter_length, int(error_rate * adapter_length))) + else: + print("{}-{} bp: {}".format(prev, adapter_length, int(error_rate * adapter_length))) + print() def print_histogram(end_statistics, n, gc_content): - """ - Print a histogram. Also, print the no. of reads expected to be - trimmed by chance (assuming a uniform distribution of nucleotides in the reads). - - adapter_statistics -- EndStatistics object - adapter_length -- adapter length - n -- total no. of reads. - """ - d = end_statistics.lengths - errors = end_statistics.errors - - match_probabilities = end_statistics.random_match_probabilities(gc_content=gc_content) - print("length", "count", "expect", "max.err", "error counts", sep="\t") - for length in sorted(d): - # when length surpasses adapter_length, the - # probability does not increase anymore - expect = n * match_probabilities[min(len(end_statistics.sequence), length)] - count = d[length] - max_errors = max(errors[length].keys()) - errs = ' '.join(str(errors[length][e]) for e in range(max_errors+1)) - print( - length, - count, - "{:.1F}".format(expect), - int(end_statistics.max_error_rate*min(length, len(end_statistics.sequence))), - errs, - sep="\t") - print() + """ + Print a histogram. Also, print the no. of reads expected to be + trimmed by chance (assuming a uniform distribution of nucleotides in the reads). + + adapter_statistics -- EndStatistics object + adapter_length -- adapter length + n -- total no. of reads. 
+ """ + d = end_statistics.lengths + errors = end_statistics.errors + + match_probabilities = end_statistics.random_match_probabilities(gc_content=gc_content) + print("length", "count", "expect", "max.err", "error counts", sep="\t") + for length in sorted(d): + # when length surpasses adapter_length, the + # probability does not increase anymore + expect = n * match_probabilities[min(len(end_statistics.sequence), length)] + count = d[length] + max_errors = max(errors[length].keys()) + errs = ' '.join(str(errors[length][e]) for e in range(max_errors+1)) + print( + length, + count, + "{:.1F}".format(expect), + int(end_statistics.max_error_rate*min(length, len(end_statistics.sequence))), + errs, + sep="\t") + print() class AdjacentBaseStatistics: - def __init__(self, bases): - """ - """ - self.bases = bases - self._warnbase = None - total = sum(self.bases.values()) - if total == 0: - self._fractions = None - else: - self._fractions = [] - for base in ['A', 'C', 'G', 'T', '']: - text = base if base != '' else 'none/other' - fraction = 1.0 * self.bases[base] / total - self._fractions.append((text, 1.0 * self.bases[base] / total)) - if fraction > 0.8 and base != '': - self._warnbase = text - if total < 20: - self._warnbase = None - - @property - def should_warn(self): - return self._warnbase is not None - - def print(self): - if not self._fractions: - return False - print('Bases preceding removed adapters:') - for text, fraction in self._fractions: - print(' {}: {:.1%}'.format(text, fraction)) - if self.should_warn: - print('WARNING:') - print(' The adapter is preceded by "{}" extremely often.'.format(self._warnbase)) - print(" The provided adapter sequence could be incomplete at its 3' end.") - print() - return True - print() - return False + def __init__(self, bases): + """ + """ + self.bases = bases + self._warnbase = None + total = sum(self.bases.values()) + if total == 0: + self._fractions = None + else: + self._fractions = [] + for base in ['A', 'C', 'G', 'T', '']: + text = base if base != '' else 'none/other' + fraction = 1.0 * self.bases[base] / total + self._fractions.append((text, 1.0 * self.bases[base] / total)) + if fraction > 0.8 and base != '': + self._warnbase = text + if total < 20: + self._warnbase = None + + @property + def should_warn(self): + return self._warnbase is not None + + def print(self): + if not self._fractions: + return False + print('Bases preceding removed adapters:') + for text, fraction in self._fractions: + print(' {}: {:.1%}'.format(text, fraction)) + if self.should_warn: + print('WARNING:') + print(' The adapter is preceded by "{}" extremely often.'.format(self._warnbase)) + print(" The provided adapter sequence could be incomplete at its 3' end.") + print() + return True + print() + return False @contextmanager def redirect_standard_output(file): - if file is None: - yield - return - old_stdout = sys.stdout - sys.stdout = file - yield - sys.stdout = old_stdout + if file is None: + yield + return + old_stdout = sys.stdout + sys.stdout = file + yield + sys.stdout = old_stdout def print_report(stats, time, gc_content): - """Print report to standard output.""" - if stats.n == 0: - print("No reads processed! 
Either your input file is empty or you used the wrong -f/--format parameter.") - return - print("Finished in {:.2F} s ({:.0F} us/read; {:.2F} M reads/minute).".format( - time, 1E6 * time / stats.n, stats.n / time * 60 / 1E6)) - - report = "\n=== Summary ===\n\n" - if stats.paired: - report += textwrap.dedent("""\ - Total read pairs processed: {o.n:13,d} - Read 1 with adapter: {o.with_adapters[0]:13,d} ({o.with_adapters_fraction[0]:.1%}) - Read 2 with adapter: {o.with_adapters[1]:13,d} ({o.with_adapters_fraction[1]:.1%}) - """) - else: - report += textwrap.dedent("""\ - Total reads processed: {o.n:13,d} - Reads with adapters: {o.with_adapters[0]:13,d} ({o.with_adapters_fraction[0]:.1%}) - """) - if stats.too_short is not None: - report += "{pairs_or_reads} that were too short: {o.too_short:13,d} ({o.too_short_fraction:.1%})\n" - if stats.too_long is not None: - report += "{pairs_or_reads} that were too long: {o.too_long:13,d} ({o.too_long_fraction:.1%})\n" - if stats.too_many_n is not None: - report += "{pairs_or_reads} with too many N: {o.too_many_n:13,d} ({o.too_many_n_fraction:.1%})\n" - - report += textwrap.dedent("""\ - {pairs_or_reads} written (passing filters): {o.written:13,d} ({o.written_fraction:.1%}) - - Total basepairs processed: {o.total:13,d} bp - """) - if stats.paired: - report += " Read 1: {o.total_bp[0]:13,d} bp\n" - report += " Read 2: {o.total_bp[1]:13,d} bp\n" - - if stats.did_quality_trimming: - report += "Quality-trimmed: {o.quality_trimmed:13,d} bp ({o.quality_trimmed_fraction:.1%})\n" - if stats.paired: - report += " Read 1: {o.quality_trimmed_bp[0]:13,d} bp\n" - report += " Read 2: {o.quality_trimmed_bp[1]:13,d} bp\n" - - report += "Total written (filtered): {o.total_written_bp:13,d} bp ({o.total_written_bp_fraction:.1%})\n" - if stats.paired: - report += " Read 1: {o.written_bp[0]:13,d} bp\n" - report += " Read 2: {o.written_bp[1]:13,d} bp\n" - pairs_or_reads = "Pairs" if stats.paired else "Reads" - report = report.format(o=stats, pairs_or_reads=pairs_or_reads) - print(report) - - warning = False - for which_in_pair in (0, 1): - for adapter_statistics in stats.adapter_stats[which_in_pair]: - total_front = sum(adapter_statistics.front.lengths.values()) - total_back = sum(adapter_statistics.back.lengths.values()) - total = total_front + total_back - where = adapter_statistics.where - assert (where in (ANYWHERE, LINKED) - or (where in (BACK, BACK_NOT_INTERNAL, SUFFIX) and total_front == 0) - or (where in (FRONT, FRONT_NOT_INTERNAL, PREFIX) and total_back == 0)), (where, total_front) - - if stats.paired: - extra = 'First read: ' if which_in_pair == 0 else 'Second read: ' - else: - extra = '' - - print("=" * 3, extra + "Adapter", adapter_statistics.name, "=" * 3) - print() - - if where == LINKED: - print("Sequence: {}...{}; Type: linked; Length: {}+{}; " - "5' trimmed: {} times; 3' trimmed: {} times".format( - adapter_statistics.front.sequence, - adapter_statistics.back.sequence, - len(adapter_statistics.front.sequence), - len(adapter_statistics.back.sequence), - total_front, total_back)) - else: - print("Sequence: {}; Type: {}; Length: {}; Trimmed: {} times.". 
- format(adapter_statistics.front.sequence, ADAPTER_TYPES[adapter_statistics.where], - len(adapter_statistics.front.sequence), total)) - if total == 0: - print() - continue - if where == ANYWHERE: - print(total_front, "times, it overlapped the 5' end of a read") - print(total_back, "times, it overlapped the 3' end or was within the read") - print() - print_error_ranges(len(adapter_statistics.front.sequence), adapter_statistics.front.max_error_rate) - print("Overview of removed sequences (5')") - print_histogram(adapter_statistics.front, stats.n, gc_content) - print() - print("Overview of removed sequences (3' or within)") - print_histogram(adapter_statistics.back, stats.n, gc_content) - elif where == LINKED: - print() - print_error_ranges(len(adapter_statistics.front.sequence), adapter_statistics.front.max_error_rate) - print_error_ranges(len(adapter_statistics.back.sequence), adapter_statistics.back.max_error_rate) - print("Overview of removed sequences at 5' end") - print_histogram(adapter_statistics.front, stats.n, gc_content) - print() - print("Overview of removed sequences at 3' end") - print_histogram(adapter_statistics.back, stats.n, gc_content) - elif where in (FRONT, PREFIX, FRONT_NOT_INTERNAL): - print() - print_error_ranges(len(adapter_statistics.front.sequence), adapter_statistics.front.max_error_rate) - print("Overview of removed sequences") - print_histogram(adapter_statistics.front, stats.n, gc_content) - else: - assert where in (BACK, SUFFIX, BACK_NOT_INTERNAL) - print() - print_error_ranges(len(adapter_statistics.back.sequence), adapter_statistics.back.max_error_rate) - base_stats = AdjacentBaseStatistics(adapter_statistics.back.adjacent_bases) - warning = warning or base_stats.print() - print("Overview of removed sequences") - print_histogram(adapter_statistics.back, stats.n, gc_content) - - if warning: - print('WARNING:') - print(' One or more of your adapter sequences may be incomplete.') - print(' Please see the detailed output above.') + """Print report to standard output.""" + if stats.n == 0: + print("No reads processed! 
Either your input file is empty or you used the wrong -f/--format parameter.") + return + print("Finished in {:.2F} s ({:.0F} us/read; {:.2F} M reads/minute).".format( + time, 1E6 * time / stats.n, stats.n / time * 60 / 1E6)) + + report = "\n=== Summary ===\n\n" + if stats.paired: + report += textwrap.dedent("""\ + Total read pairs processed: {o.n:13,d} + Read 1 with adapter: {o.with_adapters[0]:13,d} ({o.with_adapters_fraction[0]:.1%}) + Read 2 with adapter: {o.with_adapters[1]:13,d} ({o.with_adapters_fraction[1]:.1%}) + """) + else: + report += textwrap.dedent("""\ + Total reads processed: {o.n:13,d} + Reads with adapters: {o.with_adapters[0]:13,d} ({o.with_adapters_fraction[0]:.1%}) + """) + if stats.too_short is not None: + report += "{pairs_or_reads} that were too short: {o.too_short:13,d} ({o.too_short_fraction:.1%})\n" + if stats.too_long is not None: + report += "{pairs_or_reads} that were too long: {o.too_long:13,d} ({o.too_long_fraction:.1%})\n" + if stats.too_many_n is not None: + report += "{pairs_or_reads} with too many N: {o.too_many_n:13,d} ({o.too_many_n_fraction:.1%})\n" + + report += textwrap.dedent("""\ + {pairs_or_reads} written (passing filters): {o.written:13,d} ({o.written_fraction:.1%}) + + Total basepairs processed: {o.total:13,d} bp + """) + if stats.paired: + report += " Read 1: {o.total_bp[0]:13,d} bp\n" + report += " Read 2: {o.total_bp[1]:13,d} bp\n" + + if stats.did_quality_trimming: + report += "Quality-trimmed: {o.quality_trimmed:13,d} bp ({o.quality_trimmed_fraction:.1%})\n" + if stats.paired: + report += " Read 1: {o.quality_trimmed_bp[0]:13,d} bp\n" + report += " Read 2: {o.quality_trimmed_bp[1]:13,d} bp\n" + + report += "Total written (filtered): {o.total_written_bp:13,d} bp ({o.total_written_bp_fraction:.1%})\n" + if stats.paired: + report += " Read 1: {o.written_bp[0]:13,d} bp\n" + report += " Read 2: {o.written_bp[1]:13,d} bp\n" + pairs_or_reads = "Pairs" if stats.paired else "Reads" + report = report.format(o=stats, pairs_or_reads=pairs_or_reads) + print(report) + + warning = False + for which_in_pair in (0, 1): + for adapter_statistics in stats.adapter_stats[which_in_pair]: + total_front = sum(adapter_statistics.front.lengths.values()) + total_back = sum(adapter_statistics.back.lengths.values()) + total = total_front + total_back + where = adapter_statistics.where + assert (where in (ANYWHERE, LINKED) + or (where in (BACK, BACK_NOT_INTERNAL, SUFFIX) and total_front == 0) + or (where in (FRONT, FRONT_NOT_INTERNAL, PREFIX) and total_back == 0)), (where, total_front) + + if stats.paired: + extra = 'First read: ' if which_in_pair == 0 else 'Second read: ' + else: + extra = '' + + print("=" * 3, extra + "Adapter", adapter_statistics.name, "=" * 3) + print() + + if where == LINKED: + print("Sequence: {}...{}; Type: linked; Length: {}+{}; " + "5' trimmed: {} times; 3' trimmed: {} times".format( + adapter_statistics.front.sequence, + adapter_statistics.back.sequence, + len(adapter_statistics.front.sequence), + len(adapter_statistics.back.sequence), + total_front, total_back)) + else: + print("Sequence: {}; Type: {}; Length: {}; Trimmed: {} times.". 
+ format(adapter_statistics.front.sequence, ADAPTER_TYPES[adapter_statistics.where], + len(adapter_statistics.front.sequence), total)) + if total == 0: + print() + continue + if where == ANYWHERE: + print(total_front, "times, it overlapped the 5' end of a read") + print(total_back, "times, it overlapped the 3' end or was within the read") + print() + print_error_ranges(len(adapter_statistics.front.sequence), adapter_statistics.front.max_error_rate) + print("Overview of removed sequences (5')") + print_histogram(adapter_statistics.front, stats.n, gc_content) + print() + print("Overview of removed sequences (3' or within)") + print_histogram(adapter_statistics.back, stats.n, gc_content) + elif where == LINKED: + print() + print_error_ranges(len(adapter_statistics.front.sequence), adapter_statistics.front.max_error_rate) + print_error_ranges(len(adapter_statistics.back.sequence), adapter_statistics.back.max_error_rate) + print("Overview of removed sequences at 5' end") + print_histogram(adapter_statistics.front, stats.n, gc_content) + print() + print("Overview of removed sequences at 3' end") + print_histogram(adapter_statistics.back, stats.n, gc_content) + elif where in (FRONT, PREFIX, FRONT_NOT_INTERNAL): + print() + print_error_ranges(len(adapter_statistics.front.sequence), adapter_statistics.front.max_error_rate) + print("Overview of removed sequences") + print_histogram(adapter_statistics.front, stats.n, gc_content) + else: + assert where in (BACK, SUFFIX, BACK_NOT_INTERNAL) + print() + print_error_ranges(len(adapter_statistics.back.sequence), adapter_statistics.back.max_error_rate) + base_stats = AdjacentBaseStatistics(adapter_statistics.back.adjacent_bases) + warning = warning or base_stats.print() + print("Overview of removed sequences") + print_histogram(adapter_statistics.back, stats.n, gc_content) + + if warning: + print('WARNING:') + print(' One or more of your adapter sequences may be incomplete.') + print(' Please see the detailed output above.') def print_minimal_report(stats, time, gc_content): - """Print a minimal tabular report suitable for concatenation""" - - def none(value): - return 0 if value is None else value - - fields = [ - "OK", - stats.n, # reads/pairs in - stats.total, # bases in - none(stats.too_short), # reads/pairs - none(stats.too_long), # reads/pairs - none(stats.too_many_n), # reads/pairs - stats.written, # reads/pairs out - stats.with_adapters[0], # reads - stats.quality_trimmed_bp[0], # bases - stats.written_bp[0], # bases out - ] - if stats.paired: - fields += [ - stats.with_adapters[1], # reads/pairs - stats.quality_trimmed_bp[1], # bases - stats.written_bp[1], # bases - ] - - warning = False - for which_in_pair in (0, 1): - for adapter_statistics in stats.adapter_stats[which_in_pair]: - if adapter_statistics.where in (BACK, SUFFIX, BACK_NOT_INTERNAL): - if AdjacentBaseStatistics(adapter_statistics.back.adjacent_bases).should_warn: - warning = True - break - if warning: - fields[0] = "WARN" - header = [ - 'status', 'in_reads', 'in_bp', 'too_short', 'too_long', 'too_many_n', 'out_reads', - 'w/adapters', 'qualtrim_bp', 'out_bp'] - if stats.paired: - header += ['w/adapters2', 'qualtrim2_bp', 'out2_bp'] - print(*header, sep='\t') - print(*fields, sep='\t') + """Print a minimal tabular report suitable for concatenation""" + + def none(value): + return 0 if value is None else value + + fields = [ + "OK", + stats.n, # reads/pairs in + stats.total, # bases in + none(stats.too_short), # reads/pairs + none(stats.too_long), # reads/pairs + none(stats.too_many_n), 
# reads/pairs + stats.written, # reads/pairs out + stats.with_adapters[0], # reads + stats.quality_trimmed_bp[0], # bases + stats.written_bp[0], # bases out + ] + if stats.paired: + fields += [ + stats.with_adapters[1], # reads/pairs + stats.quality_trimmed_bp[1], # bases + stats.written_bp[1], # bases + ] + + warning = False + for which_in_pair in (0, 1): + for adapter_statistics in stats.adapter_stats[which_in_pair]: + if adapter_statistics.where in (BACK, SUFFIX, BACK_NOT_INTERNAL): + if AdjacentBaseStatistics(adapter_statistics.back.adjacent_bases).should_warn: + warning = True + break + if warning: + fields[0] = "WARN" + header = [ + 'status', 'in_reads', 'in_bp', 'too_short', 'too_long', 'too_many_n', 'out_reads', + 'w/adapters', 'qualtrim_bp', 'out_bp'] + if stats.paired: + header += ['w/adapters2', 'qualtrim2_bp', 'out2_bp'] + print(*header, sep='\t') + print(*fields, sep='\t') diff --git a/src/cutadapt/utils.py b/src/cutadapt/utils.py index 5e3a109f..ff245521 100644 --- a/src/cutadapt/utils.py +++ b/src/cutadapt/utils.py @@ -3,23 +3,23 @@ def available_cpu_count(): - """ - Return the number of available virtual or physical CPUs on this system. - The number of available CPUs can be smaller than the total number of CPUs - when the cpuset(7) mechanism is in use, as is the case on some cluster - systems. + """ + Return the number of available virtual or physical CPUs on this system. + The number of available CPUs can be smaller than the total number of CPUs + when the cpuset(7) mechanism is in use, as is the case on some cluster + systems. - Adapted from http://stackoverflow.com/a/1006301/715090 - """ - try: - with open('/proc/self/status') as f: - status = f.read() - m = re.search(r'(?m)^Cpus_allowed:\s*(.*)$', status) - if m: - res = bin(int(m.group(1).replace(',', ''), 16)).count('1') - if res > 0: - return min(res, multiprocessing.cpu_count()) - except IOError: - pass + Adapted from http://stackoverflow.com/a/1006301/715090 + """ + try: + with open('/proc/self/status') as f: + status = f.read() + m = re.search(r'(?m)^Cpus_allowed:\s*(.*)$', status) + if m: + res = bin(int(m.group(1).replace(',', ''), 16)).count('1') + if res > 0: + return min(res, multiprocessing.cpu_count()) + except IOError: + pass - return multiprocessing.cpu_count() + return multiprocessing.cpu_count() diff --git a/tests/conftest.py b/tests/conftest.py index 15656d72..b6605a07 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,4 +3,4 @@ @pytest.fixture(params=[1, 2]) def cores(request): - return request.param + return request.param diff --git a/tests/test_adapters.py b/tests/test_adapters.py index 3aa0ca02..f638abe4 100644 --- a/tests/test_adapters.py +++ b/tests/test_adapters.py @@ -3,277 +3,277 @@ from dnaio import Sequence from cutadapt.adapters import (Adapter, Match, FRONT, BACK, PREFIX, - expand_braces, LinkedAdapter, AdapterStatistics, AdapterParser, ANYWHERE) + expand_braces, LinkedAdapter, AdapterStatistics, AdapterParser, ANYWHERE) def test_issue_52(): - adapter = Adapter( - sequence='GAACTCCAGTCACNNNNN', - where=BACK, - remove='suffix', - max_error_rate=0.12, - min_overlap=5, - read_wildcards=False, - adapter_wildcards=True) - read = Sequence(name="abc", sequence='CCCCAGAACTACAGTCCCGGC') - am = Match(astart=0, astop=17, rstart=5, rstop=21, matches=15, errors=2, - remove_before=False, adapter=adapter, read=read) - assert am.wildcards() == 'GGC' - """ - The result above should actually be 'CGGC' since the correct - alignment is this one: - - adapter GAACTCCAGTCACNNNNN - mismatches X X - read 
CCCCAGAACTACAGTC-CCGGC - - Since we do not keep the alignment, guessing 'GGC' is the best we - can currently do. - """ + adapter = Adapter( + sequence='GAACTCCAGTCACNNNNN', + where=BACK, + remove='suffix', + max_error_rate=0.12, + min_overlap=5, + read_wildcards=False, + adapter_wildcards=True) + read = Sequence(name="abc", sequence='CCCCAGAACTACAGTCCCGGC') + am = Match(astart=0, astop=17, rstart=5, rstop=21, matches=15, errors=2, + remove_before=False, adapter=adapter, read=read) + assert am.wildcards() == 'GGC' + """ + The result above should actually be 'CGGC' since the correct + alignment is this one: + + adapter GAACTCCAGTCACNNNNN + mismatches X X + read CCCCAGAACTACAGTC-CCGGC + + Since we do not keep the alignment, guessing 'GGC' is the best we + can currently do. + """ def test_issue_80(): - # This issue turned out to not be an actual issue with the alignment - # algorithm. The following alignment is found because it has more matches - # than the 'obvious' one: - # - # TCGTATGCCGTCTTC - # =========X==XX= - # TCGTATGCCCTC--C - # - # This is correct, albeit a little surprising, since an alignment without - # indels would have only two errors. - - adapter = Adapter( - sequence="TCGTATGCCGTCTTC", - where=BACK, - remove='suffix', - max_error_rate=0.2, - min_overlap=3, - read_wildcards=False, - adapter_wildcards=False) - read = Sequence(name="seq2", sequence="TCGTATGCCCTCC") - result = adapter.match_to(read) - assert result.errors == 3, result - assert result.astart == 0, result - assert result.astop == 15, result + # This issue turned out to not be an actual issue with the alignment + # algorithm. The following alignment is found because it has more matches + # than the 'obvious' one: + # + # TCGTATGCCGTCTTC + # =========X==XX= + # TCGTATGCCCTC--C + # + # This is correct, albeit a little surprising, since an alignment without + # indels would have only two errors.
+ + adapter = Adapter( + sequence="TCGTATGCCGTCTTC", + where=BACK, + remove='suffix', + max_error_rate=0.2, + min_overlap=3, + read_wildcards=False, + adapter_wildcards=False) + read = Sequence(name="seq2", sequence="TCGTATGCCCTCC") + result = adapter.match_to(read) + assert result.errors == 3, result + assert result.astart == 0, result + assert result.astop == 15, result def test_str(): - a = Adapter('ACGT', where=BACK, remove='suffix', max_error_rate=0.1) - str(a) - str(a.match_to(Sequence(name='seq', sequence='TTACGT'))) + a = Adapter('ACGT', where=BACK, remove='suffix', max_error_rate=0.1) + str(a) + str(a.match_to(Sequence(name='seq', sequence='TTACGT'))) def test_expand_braces(): - assert expand_braces('') == '' - assert expand_braces('A') == 'A' - assert expand_braces('A{0}') == '' - assert expand_braces('A{1}') == 'A' - assert expand_braces('A{2}') == 'AA' - assert expand_braces('A{2}C') == 'AAC' - assert expand_braces('ACGTN{3}TGACCC') == 'ACGTNNNTGACCC' - assert expand_braces('ACGTN{10}TGACCC') == 'ACGTNNNNNNNNNNTGACCC' - assert expand_braces('ACGTN{3}TGA{4}CCC') == 'ACGTNNNTGAAAACCC' - assert expand_braces('ACGTN{0}TGA{4}CCC') == 'ACGTTGAAAACCC' + assert expand_braces('') == '' + assert expand_braces('A') == 'A' + assert expand_braces('A{0}') == '' + assert expand_braces('A{1}') == 'A' + assert expand_braces('A{2}') == 'AA' + assert expand_braces('A{2}C') == 'AAC' + assert expand_braces('ACGTN{3}TGACCC') == 'ACGTNNNTGACCC' + assert expand_braces('ACGTN{10}TGACCC') == 'ACGTNNNNNNNNNNTGACCC' + assert expand_braces('ACGTN{3}TGA{4}CCC') == 'ACGTNNNTGAAAACCC' + assert expand_braces('ACGTN{0}TGA{4}CCC') == 'ACGTTGAAAACCC' def test_expand_braces_fail(): - for expression in ['{', '}', '{}', '{5', '{1}', 'A{-7}', 'A{', 'A{1', 'N{7', 'AN{7', 'A{4{}', - 'A{4}{3}', 'A{b}', 'A{6X}', 'A{X6}']: - with pytest.raises(ValueError): - expand_braces(expression) + for expression in ['{', '}', '{}', '{5', '{1}', 'A{-7}', 'A{', 'A{1', 'N{7', 'AN{7', 'A{4{}', + 'A{4}{3}', 'A{b}', 'A{6X}', 'A{X6}']: + with pytest.raises(ValueError): + expand_braces(expression) def test_linked_adapter(): - front_adapter = Adapter('AAAA', where=PREFIX, min_overlap=4) - back_adapter = Adapter('TTTT', where=BACK, min_overlap=3) + front_adapter = Adapter('AAAA', where=PREFIX, min_overlap=4) + back_adapter = Adapter('TTTT', where=BACK, min_overlap=3) - linked_adapter = LinkedAdapter( - front_adapter, back_adapter, front_required=True, back_required=False, name='name') - assert linked_adapter.front_adapter.min_overlap == 4 - assert linked_adapter.back_adapter.min_overlap == 3 + linked_adapter = LinkedAdapter( + front_adapter, back_adapter, front_required=True, back_required=False, name='name') + assert linked_adapter.front_adapter.min_overlap == 4 + assert linked_adapter.back_adapter.min_overlap == 3 - sequence = Sequence(name='seq', sequence='AAAACCCCCTTTT') - trimmed = linked_adapter.match_to(sequence).trimmed() - assert trimmed.name == 'seq' - assert trimmed.sequence == 'CCCCC' + sequence = Sequence(name='seq', sequence='AAAACCCCCTTTT') + trimmed = linked_adapter.match_to(sequence).trimmed() + assert trimmed.name == 'seq' + assert trimmed.sequence == 'CCCCC' def test_info_record(): - adapter = Adapter( - sequence='GAACTCCAGTCACNNNNN', - where=BACK, - max_error_rate=0.12, - min_overlap=5, - read_wildcards=False, - adapter_wildcards=True, - name="Foo") - read = Sequence(name="abc", sequence='CCCCAGAACTACAGTCCCGGC') - am = Match(astart=0, astop=17, rstart=5, rstop=21, matches=15, errors=2, remove_before=False, - 
adapter=adapter, read=read) - assert am.get_info_record() == ( - "abc", - 2, - 5, - 21, - 'CCCCA', - 'GAACTACAGTCCCGGC', - '', - 'Foo', - '', - '', - '' - ) + adapter = Adapter( + sequence='GAACTCCAGTCACNNNNN', + where=BACK, + max_error_rate=0.12, + min_overlap=5, + read_wildcards=False, + adapter_wildcards=True, + name="Foo") + read = Sequence(name="abc", sequence='CCCCAGAACTACAGTCCCGGC') + am = Match(astart=0, astop=17, rstart=5, rstop=21, matches=15, errors=2, remove_before=False, + adapter=adapter, read=read) + assert am.get_info_record() == ( + "abc", + 2, + 5, + 21, + 'CCCCA', + 'GAACTACAGTCCCGGC', + '', + 'Foo', + '', + '', + '' + ) def test_random_match_probabilities(): - a = Adapter('A', where=BACK, max_error_rate=0.1).create_statistics() - assert a.back.random_match_probabilities(0.5) == [1, 0.25] - assert a.back.random_match_probabilities(0.2) == [1, 0.4] + a = Adapter('A', where=BACK, max_error_rate=0.1).create_statistics() + assert a.back.random_match_probabilities(0.5) == [1, 0.25] + assert a.back.random_match_probabilities(0.2) == [1, 0.4] - for s in ('ACTG', 'XMWH'): - a = Adapter(s, where=BACK, max_error_rate=0.1).create_statistics() - assert a.back.random_match_probabilities(0.5) == [1, 0.25, 0.25**2, 0.25**3, 0.25**4] - assert a.back.random_match_probabilities(0.2) == [1, 0.4, 0.4*0.1, 0.4*0.1*0.4, 0.4*0.1*0.4*0.1] + for s in ('ACTG', 'XMWH'): + a = Adapter(s, where=BACK, max_error_rate=0.1).create_statistics() + assert a.back.random_match_probabilities(0.5) == [1, 0.25, 0.25**2, 0.25**3, 0.25**4] + assert a.back.random_match_probabilities(0.2) == [1, 0.4, 0.4*0.1, 0.4*0.1*0.4, 0.4*0.1*0.4*0.1] - a = Adapter('GTCA', where=FRONT, max_error_rate=0.1).create_statistics() - assert a.front.random_match_probabilities(0.5) == [1, 0.25, 0.25**2, 0.25**3, 0.25**4] - assert a.front.random_match_probabilities(0.2) == [1, 0.4, 0.4*0.1, 0.4*0.1*0.4, 0.4*0.1*0.4*0.1] + a = Adapter('GTCA', where=FRONT, max_error_rate=0.1).create_statistics() + assert a.front.random_match_probabilities(0.5) == [1, 0.25, 0.25**2, 0.25**3, 0.25**4] + assert a.front.random_match_probabilities(0.2) == [1, 0.4, 0.4*0.1, 0.4*0.1*0.4, 0.4*0.1*0.4*0.1] def test_add_adapter_statistics(): - stats = Adapter('A', name='name', where=BACK, max_error_rate=0.1).create_statistics() - end_stats = stats.back - end_stats.adjacent_bases['A'] = 7 - end_stats.adjacent_bases['C'] = 19 - end_stats.adjacent_bases['G'] = 23 - end_stats.adjacent_bases['T'] = 42 - end_stats.adjacent_bases[''] = 45 - - end_stats.errors[10][0] = 100 - end_stats.errors[10][1] = 11 - end_stats.errors[10][2] = 3 - end_stats.errors[20][0] = 600 - end_stats.errors[20][1] = 66 - end_stats.errors[20][2] = 6 - - stats2 = Adapter('A', name='name', where=BACK, max_error_rate=0.1).create_statistics() - end_stats2 = stats2.back - end_stats2.adjacent_bases['A'] = 43 - end_stats2.adjacent_bases['C'] = 31 - end_stats2.adjacent_bases['G'] = 27 - end_stats2.adjacent_bases['T'] = 8 - end_stats2.adjacent_bases[''] = 5 - end_stats2.errors[10][0] = 234 - end_stats2.errors[10][1] = 14 - end_stats2.errors[10][3] = 5 - end_stats2.errors[15][0] = 90 - end_stats2.errors[15][1] = 17 - end_stats2.errors[15][2] = 2 - - stats += stats2 - r = stats.back - - assert r.adjacent_bases == {'A': 50, 'C': 50, 'G': 50, 'T': 50, '': 50} - assert r.errors == { - 10: {0: 334, 1: 25, 2: 3, 3: 5}, - 15: {0: 90, 1: 17, 2: 2}, - 20: {0: 600, 1: 66, 2: 6}, - } + stats = Adapter('A', name='name', where=BACK, max_error_rate=0.1).create_statistics() + end_stats = stats.back + 
end_stats.adjacent_bases['A'] = 7 + end_stats.adjacent_bases['C'] = 19 + end_stats.adjacent_bases['G'] = 23 + end_stats.adjacent_bases['T'] = 42 + end_stats.adjacent_bases[''] = 45 + + end_stats.errors[10][0] = 100 + end_stats.errors[10][1] = 11 + end_stats.errors[10][2] = 3 + end_stats.errors[20][0] = 600 + end_stats.errors[20][1] = 66 + end_stats.errors[20][2] = 6 + + stats2 = Adapter('A', name='name', where=BACK, max_error_rate=0.1).create_statistics() + end_stats2 = stats2.back + end_stats2.adjacent_bases['A'] = 43 + end_stats2.adjacent_bases['C'] = 31 + end_stats2.adjacent_bases['G'] = 27 + end_stats2.adjacent_bases['T'] = 8 + end_stats2.adjacent_bases[''] = 5 + end_stats2.errors[10][0] = 234 + end_stats2.errors[10][1] = 14 + end_stats2.errors[10][3] = 5 + end_stats2.errors[15][0] = 90 + end_stats2.errors[15][1] = 17 + end_stats2.errors[15][2] = 2 + + stats += stats2 + r = stats.back + + assert r.adjacent_bases == {'A': 50, 'C': 50, 'G': 50, 'T': 50, '': 50} + assert r.errors == { + 10: {0: 334, 1: 25, 2: 3, 3: 5}, + 15: {0: 90, 1: 17, 2: 2}, + 20: {0: 600, 1: 66, 2: 6}, + } def test_issue_265(): - """Crash when accessing the matches property of non-anchored linked adapters""" - s = Sequence('name', 'AAAATTTT') - front_adapter = Adapter('GGG', where=FRONT) - back_adapter = Adapter('TTT', where=BACK) - la = LinkedAdapter(front_adapter, back_adapter, front_required=False, back_required=False, name='name') - assert la.match_to(s).matches == 3 + """Crash when accessing the matches property of non-anchored linked adapters""" + s = Sequence('name', 'AAAATTTT') + front_adapter = Adapter('GGG', where=FRONT) + back_adapter = Adapter('TTT', where=BACK) + la = LinkedAdapter(front_adapter, back_adapter, front_required=False, back_required=False, name='name') + assert la.match_to(s).matches == 3 def test_parse_file_notation(tmpdir): - tmp_path = str(tmpdir.join('adapters.fasta')) - with open(tmp_path, 'w') as f: - f.write(dedent(""">first_name - ADAPTER1 - >second_name - ADAPTER2 - """)) - parser = AdapterParser( - max_error_rate=0.2, min_overlap=4, read_wildcards=False, - adapter_wildcards=False, indels=False) - - adapters = list(parser.parse('file:' + tmp_path, cmdline_type='back')) - assert len(adapters) == 2 - assert adapters[0].name == 'first_name' - assert adapters[0].sequence == 'ADAPTER1' - assert adapters[1].name == 'second_name' - assert adapters[1].sequence == 'ADAPTER2' - for a in adapters: - assert a.max_error_rate == 0.2 - assert a.min_overlap == 4 - assert not a.read_wildcards - assert not a.adapter_wildcards - assert not a.indels + tmp_path = str(tmpdir.join('adapters.fasta')) + with open(tmp_path, 'w') as f: + f.write(dedent(""">first_name + ADAPTER1 + >second_name + ADAPTER2 + """)) + parser = AdapterParser( + max_error_rate=0.2, min_overlap=4, read_wildcards=False, + adapter_wildcards=False, indels=False) + + adapters = list(parser.parse('file:' + tmp_path, cmdline_type='back')) + assert len(adapters) == 2 + assert adapters[0].name == 'first_name' + assert adapters[0].sequence == 'ADAPTER1' + assert adapters[1].name == 'second_name' + assert adapters[1].sequence == 'ADAPTER2' + for a in adapters: + assert a.max_error_rate == 0.2 + assert a.min_overlap == 4 + assert not a.read_wildcards + assert not a.adapter_wildcards + assert not a.indels def test_parse_not_linked(): - p = AdapterParser._parse_not_linked - assert p('A', 'front') == (None, None, 'A', {}) - assert p('A', 'back') == (None, None, 'A', {}) - assert p('A', 'anywhere') == (None, None, 'A', {}) - assert p('^A', 
'front') == (None, 'anchored', 'A', {}) - assert p('XXXA', 'front') == (None, 'noninternal', 'A', {}) - assert p('A$', 'back') == (None, 'anchored', 'A', {}) - assert p('AXXXX', 'back') == (None, 'noninternal', 'A', {}) - assert p('a_name=ADAPT', 'front') == ('a_name', None, 'ADAPT', {}) + p = AdapterParser._parse_not_linked + assert p('A', 'front') == (None, None, 'A', {}) + assert p('A', 'back') == (None, None, 'A', {}) + assert p('A', 'anywhere') == (None, None, 'A', {}) + assert p('^A', 'front') == (None, 'anchored', 'A', {}) + assert p('XXXA', 'front') == (None, 'noninternal', 'A', {}) + assert p('A$', 'back') == (None, 'anchored', 'A', {}) + assert p('AXXXX', 'back') == (None, 'noninternal', 'A', {}) + assert p('a_name=ADAPT', 'front') == ('a_name', None, 'ADAPT', {}) def test_parse_parameters(): - p = AdapterParser._parse_parameters - assert p('e=0.1') == {'max_error_rate': 0.1} - assert p('error_rate=0.1') == {'max_error_rate': 0.1} - assert p('o=5') == {'min_overlap': 5} - assert p('min_overlap=5') == {'min_overlap': 5} - assert p('o=7; e=0.4') == {'min_overlap': 7, 'max_error_rate': 0.4} - assert p('anywhere') == {'anywhere': True} - - with pytest.raises(ValueError): - p('e=hallo') - with pytest.raises(KeyError): - p('bla=0.1') - with pytest.raises(ValueError): - p('e=') + p = AdapterParser._parse_parameters + assert p('e=0.1') == {'max_error_rate': 0.1} + assert p('error_rate=0.1') == {'max_error_rate': 0.1} + assert p('o=5') == {'min_overlap': 5} + assert p('min_overlap=5') == {'min_overlap': 5} + assert p('o=7; e=0.4') == {'min_overlap': 7, 'max_error_rate': 0.4} + assert p('anywhere') == {'anywhere': True} + + with pytest.raises(ValueError): + p('e=hallo') + with pytest.raises(KeyError): + p('bla=0.1') + with pytest.raises(ValueError): + p('e=') def test_parse_with_parameters(): - parser = AdapterParser( - max_error_rate=0.2, min_overlap=4, read_wildcards=False, - adapter_wildcards=False, indels=False) - a = parser._parse('ACGTACGT; e=0.15', 'front') - assert a.max_error_rate == 0.15 - assert a.min_overlap == 4 + parser = AdapterParser( + max_error_rate=0.2, min_overlap=4, read_wildcards=False, + adapter_wildcards=False, indels=False) + a = parser._parse('ACGTACGT; e=0.15', 'front') + assert a.max_error_rate == 0.15 + assert a.min_overlap == 4 - a = parser._parse('ACGTAAAA; o=5; e=0.11', 'back') - assert a.max_error_rate == 0.11 - assert a.min_overlap == 5 + a = parser._parse('ACGTAAAA; o=5; e=0.11', 'back') + assert a.max_error_rate == 0.11 + assert a.min_overlap == 5 - for spec in ('thename=ACG;e=0.15 ...
TGT;e=0.17', 'thename=ACG;e=0.15...TGT;e=0.17'): + a = parser._parse(spec, 'back') + assert isinstance(a, LinkedAdapter) + assert a.front_adapter.max_error_rate == 0.15 + assert a.back_adapter.max_error_rate == 0.17 def test_anywhere_parameter(): - parser = AdapterParser(max_error_rate=0.2, min_overlap=4, read_wildcards=False, - adapter_wildcards=False, indels=True) - adapter = list(parser.parse('CTGAAGTGAAGTACACGGTT;anywhere', 'back'))[0] - assert adapter.remove == 'suffix' - assert adapter.where == ANYWHERE - read = Sequence('foo1', 'TGAAGTACACGGTTAAAAAAAAAA') - from cutadapt.modifiers import AdapterCutter - cutter = AdapterCutter([adapter]) - trimmed_read = cutter(read, []) - assert trimmed_read.sequence == '' + parser = AdapterParser(max_error_rate=0.2, min_overlap=4, read_wildcards=False, + adapter_wildcards=False, indels=True) + adapter = list(parser.parse('CTGAAGTGAAGTACACGGTT;anywhere', 'back'))[0] + assert adapter.remove == 'suffix' + assert adapter.where == ANYWHERE + read = Sequence('foo1', 'TGAAGTACACGGTTAAAAAAAAAA') + from cutadapt.modifiers import AdapterCutter + cutter = AdapterCutter([adapter]) + trimmed_read = cutter(read, []) + assert trimmed_read.sequence == '' diff --git a/tests/test_align.py b/tests/test_align.py index 4805f59c..d9ff34fb 100644 --- a/tests/test_align.py +++ b/tests/test_align.py @@ -1,120 +1,120 @@ from cutadapt.align import (locate, compare_prefixes, compare_suffixes, - Aligner) + Aligner) from cutadapt.adapters import BACK class TestAligner(): - def test(self): - reference = 'CTCCAGCTTAGACATATC' - aligner = Aligner(reference, 0.1, flags=BACK) - aligner.locate('CC') + def test(self): + reference = 'CTCCAGCTTAGACATATC' + aligner = Aligner(reference, 0.1, flags=BACK) + aligner.locate('CC') - def test_100_percent_error_rate(self): - reference = 'GCTTAGACATATC' - aligner = Aligner(reference, 1.0, flags=BACK) - aligner.locate('CAA') + def test_100_percent_error_rate(self): + reference = 'GCTTAGACATATC' + aligner = Aligner(reference, 1.0, flags=BACK) + aligner.locate('CAA') def test_polya(): - s = 'AAAAAAAAAAAAAAAAA' - t = 'ACAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA' - result = locate(s, t, 0.0, BACK) - #start_s, stop_s, start_t, stop_t, matches, cost = result - assert result == (0, len(s), 4, 4 + len(s), len(s), 0) + s = 'AAAAAAAAAAAAAAAAA' + t = 'ACAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA' + result = locate(s, t, 0.0, BACK) + #start_s, stop_s, start_t, stop_t, matches, cost = result + assert result == (0, len(s), 4, 4 + len(s), len(s), 0) # Sequences with IUPAC wildcards # R=A|G, Y=C|T, S=G|C, W=A|T, K=G|T, M=A|C, B=C|G|T, D=A|G|T, H=A|C|T, V=A|C|G, # N=A|C|G|T, X={} WILDCARD_SEQUENCES = [ - 'CCCATTGATC', # original sequence without wildcards - 'CCCRTTRATC', # R=A|G - 'YCCATYGATC', # Y=C|T - 'CSSATTSATC', # S=G|C - 'CCCWWWGATC', # W=A|T - 'CCCATKKATC', # K=G|T - 'CCMATTGMTC', # M=A|C - 'BCCATTBABC', # B=C|G|T - 'BCCATTBABC', # B - 'CCCDTTDADC', # D=A|G|T - 'CHCATHGATC', # H=A|C|T - 'CVCVTTVATC', # V=A|C|G - 'CCNATNGATC', # N=A|C|G|T - 'CCCNTTNATC', # N + 'CCCATTGATC', # original sequence without wildcards + 'CCCRTTRATC', # R=A|G + 'YCCATYGATC', # Y=C|T + 'CSSATTSATC', # S=G|C + 'CCCWWWGATC', # W=A|T + 'CCCATKKATC', # K=G|T + 'CCMATTGMTC', # M=A|C + 'BCCATTBABC', # B=C|G|T + 'BCCATTBABC', # B + 'CCCDTTDADC', # D=A|G|T + 'CHCATHGATC', # H=A|C|T + 'CVCVTTVATC', # V=A|C|G + 'CCNATNGATC', # N=A|C|G|T + 'CCCNTTNATC', # N # 'CCCXTTXATC', # X ] def test_compare_prefixes(): - assert compare_prefixes('AAXAA', 'AAAAATTTTTTTTT') == (0, 5, 0, 5, 4, 1) - assert 
compare_prefixes('AANAA', 'AACAATTTTTTTTT', wildcard_ref=True) == (0, 5, 0, 5, 5, 0) - assert compare_prefixes('AANAA', 'AACAATTTTTTTTT', wildcard_ref=True) == (0, 5, 0, 5, 5, 0) - assert compare_prefixes('XAAAAA', 'AAAAATTTTTTTTT') == (0, 6, 0, 6, 4, 2) + assert compare_prefixes('AAXAA', 'AAAAATTTTTTTTT') == (0, 5, 0, 5, 4, 1) + assert compare_prefixes('AANAA', 'AACAATTTTTTTTT', wildcard_ref=True) == (0, 5, 0, 5, 5, 0) + assert compare_prefixes('AANAA', 'AACAATTTTTTTTT', wildcard_ref=True) == (0, 5, 0, 5, 5, 0) + assert compare_prefixes('XAAAAA', 'AAAAATTTTTTTTT') == (0, 6, 0, 6, 4, 2) - a = WILDCARD_SEQUENCES[0] - for s in WILDCARD_SEQUENCES: - r = s + 'GCCAGGGTTGATTCGGCTGATCTGGCCG' - result = compare_prefixes(a, r, wildcard_query=True) - assert result == (0, 10, 0, 10, 10, 0), result + a = WILDCARD_SEQUENCES[0] + for s in WILDCARD_SEQUENCES: + r = s + 'GCCAGGGTTGATTCGGCTGATCTGGCCG' + result = compare_prefixes(a, r, wildcard_query=True) + assert result == (0, 10, 0, 10, 10, 0), result - result = compare_prefixes(r, a, wildcard_ref=True) - assert result == (0, 10, 0, 10, 10, 0) + result = compare_prefixes(r, a, wildcard_ref=True) + assert result == (0, 10, 0, 10, 10, 0) - for s in WILDCARD_SEQUENCES: - for t in WILDCARD_SEQUENCES: - r = s + 'GCCAGGG' - result = compare_prefixes(s, r, ) - assert result == (0, 10, 0, 10, 10, 0) + for s in WILDCARD_SEQUENCES: + for t in WILDCARD_SEQUENCES: + r = s + 'GCCAGGG' + result = compare_prefixes(s, r, ) + assert result == (0, 10, 0, 10, 10, 0) - result = compare_prefixes(r, s, wildcard_ref=True, wildcard_query=True) - assert result == (0, 10, 0, 10, 10, 0) + result = compare_prefixes(r, s, wildcard_ref=True, wildcard_query=True) + assert result == (0, 10, 0, 10, 10, 0) - r = WILDCARD_SEQUENCES[0] + 'GCCAGG' - for wildc_ref in (False, True): - for wildc_query in (False, True): - result = compare_prefixes('CCCXTTXATC', r, wildcard_ref=wildc_ref, wildcard_query=wildc_query) - assert result == (0, 10, 0, 10, 8, 2) + r = WILDCARD_SEQUENCES[0] + 'GCCAGG' + for wildc_ref in (False, True): + for wildc_query in (False, True): + result = compare_prefixes('CCCXTTXATC', r, wildcard_ref=wildc_ref, wildcard_query=wildc_query) + assert result == (0, 10, 0, 10, 8, 2) def test_compare_suffixes(): - assert compare_suffixes('AAXAA', 'TTTTTTTAAAAA') == (0, 5, 7, 12, 4, 1) - assert compare_suffixes('AANAA', 'TTTTTTTAACAA', wildcard_ref=True) == (0, 5, 7, 12, 5, 0) - assert compare_suffixes('AANAA', 'TTTTTTTAACAA', wildcard_ref=True) == (0, 5, 7, 12, 5, 0) - assert compare_suffixes('AAAAAX', 'TTTTTTTAAAAA') == (0, 6, 6, 12, 4, 2) + assert compare_suffixes('AAXAA', 'TTTTTTTAAAAA') == (0, 5, 7, 12, 4, 1) + assert compare_suffixes('AANAA', 'TTTTTTTAACAA', wildcard_ref=True) == (0, 5, 7, 12, 5, 0) + assert compare_suffixes('AANAA', 'TTTTTTTAACAA', wildcard_ref=True) == (0, 5, 7, 12, 5, 0) + assert compare_suffixes('AAAAAX', 'TTTTTTTAAAAA') == (0, 6, 6, 12, 4, 2) def test_wildcards_in_adapter(): - r = 'CATCTGTCC' + WILDCARD_SEQUENCES[0] + 'GCCAGGGTTGATTCGGCTGATCTGGCCG' - for a in WILDCARD_SEQUENCES: - result = locate(a, r, 0.0, BACK, wildcard_ref=True) - assert result == (0, 10, 9, 19, 10, 0), result + r = 'CATCTGTCC' + WILDCARD_SEQUENCES[0] + 'GCCAGGGTTGATTCGGCTGATCTGGCCG' + for a in WILDCARD_SEQUENCES: + result = locate(a, r, 0.0, BACK, wildcard_ref=True) + assert result == (0, 10, 9, 19, 10, 0), result - a = 'CCCXTTXATC' - result = locate(a, r, 0.0, BACK, wildcard_ref=True) - assert result is None + a = 'CCCXTTXATC' + result = locate(a, r, 0.0, BACK, wildcard_ref=True) + 
assert result is None def test_wildcards_in_read(): - a = WILDCARD_SEQUENCES[0] - for s in WILDCARD_SEQUENCES: - r = 'CATCTGTCC' + s + 'GCCAGGGTTGATTCGGCTGATCTGGCCG' - result = locate(a, r, 0.0, BACK, wildcard_query=True) - if 'X' in s: - assert result is None - else: - assert result == (0, 10, 9, 19, 10, 0), result + a = WILDCARD_SEQUENCES[0] + for s in WILDCARD_SEQUENCES: + r = 'CATCTGTCC' + s + 'GCCAGGGTTGATTCGGCTGATCTGGCCG' + result = locate(a, r, 0.0, BACK, wildcard_query=True) + if 'X' in s: + assert result is None + else: + assert result == (0, 10, 9, 19, 10, 0), result def test_wildcards_in_both(): - for a in WILDCARD_SEQUENCES: - for s in WILDCARD_SEQUENCES: - if 'X' in s or 'X' in a: - continue - r = 'CATCTGTCC' + s + 'GCCAGGGTTGATTCGGCTGATCTGGCCG' - result = locate(a, r, 0.0, BACK, wildcard_ref=True, wildcard_query=True) - assert result == (0, 10, 9, 19, 10, 0), result + for a in WILDCARD_SEQUENCES: + for s in WILDCARD_SEQUENCES: + if 'X' in s or 'X' in a: + continue + r = 'CATCTGTCC' + s + 'GCCAGGGTTGATTCGGCTGATCTGGCCG' + result = locate(a, r, 0.0, BACK, wildcard_ref=True, wildcard_query=True) + assert result == (0, 10, 9, 19, 10, 0), result def test_no_match(): - a = locate('CTGATCTGGCCG', 'AAAAGGG', 0.1, BACK) - assert a is None, a + a = locate('CTGATCTGGCCG', 'AAAAGGG', 0.1, BACK) + assert a is None, a diff --git a/tests/test_commandline.py b/tests/test_commandline.py index c58fa5eb..06b08807 100644 --- a/tests/test_commandline.py +++ b/tests/test_commandline.py @@ -17,539 +17,539 @@ def test_example(): - run('-N -b ADAPTER', 'example.fa', 'example.fa') + run('-N -b ADAPTER', 'example.fa', 'example.fa') def test_small(): - run('-b TTAGACATATCTCCGTCG', 'small.fastq', 'small.fastq') + run('-b TTAGACATATCTCCGTCG', 'small.fastq', 'small.fastq') def test_empty(): - """empty input""" - run('-a TTAGACATATCTCCGTCG', 'empty.fastq', 'empty.fastq') + """empty input""" + run('-a TTAGACATATCTCCGTCG', 'empty.fastq', 'empty.fastq') def test_newlines(): - """DOS/Windows newlines""" - run('-e 0.12 -b TTAGACATATCTCCGTCG', 'dos.fastq', 'dos.fastq') + """DOS/Windows newlines""" + run('-e 0.12 -b TTAGACATATCTCCGTCG', 'dos.fastq', 'dos.fastq') def test_lowercase(): - """lowercase adapter""" - run('-b ttagacatatctccgtcg', 'lowercase.fastq', 'small.fastq') + """lowercase adapter""" + run('-b ttagacatatctccgtcg', 'lowercase.fastq', 'small.fastq') def test_rest(): - """-r/--rest-file""" - with temporary_path('rest.tmp') as rest_tmp: - run(['-b', 'ADAPTER', '-N', '-r', rest_tmp], "rest.fa", "rest.fa") - assert_files_equal(datapath('rest.txt'), rest_tmp) + """-r/--rest-file""" + with temporary_path('rest.tmp') as rest_tmp: + run(['-b', 'ADAPTER', '-N', '-r', rest_tmp], "rest.fa", "rest.fa") + assert_files_equal(datapath('rest.txt'), rest_tmp) def test_restfront(): - with temporary_path("rest.txt") as path: - run(['-g', 'ADAPTER', '-N', '-r', path], "restfront.fa", "rest.fa") - assert_files_equal(datapath('restfront.txt'), path) + with temporary_path("rest.txt") as path: + run(['-g', 'ADAPTER', '-N', '-r', path], "restfront.fa", "rest.fa") + assert_files_equal(datapath('restfront.txt'), path) def test_discard(): - """--discard""" - run("-b TTAGACATATCTCCGTCG --discard", "discard.fastq", "small.fastq") + """--discard""" + run("-b TTAGACATATCTCCGTCG --discard", "discard.fastq", "small.fastq") def test_discard_untrimmed(): - """--discard-untrimmed""" - run('-b CAAGAT --discard-untrimmed',
'discard-untrimmed.fastq', 'small.fastq') @pytest.mark.skip(reason='Regression since switching to dnaio') def test_second_header_retained(cores): - """test if sequence name after the "+" is retained""" - run("--cores {} -e 0.12 -b TTAGACATATCTCCGTCG".format(cores), "plus.fastq", "plus.fastq") + """test if sequence name after the "+" is retained""" + run("--cores {} -e 0.12 -b TTAGACATATCTCCGTCG".format(cores), "plus.fastq", "plus.fastq") @pytest.mark.skip(reason='Regression since switching to dnaio') def test_length_tag_second_header(cores): - """Ensure --length-tag= also modifies the second header line""" - run("--cores {} -a GGCTTC --length-tag=length=".format(cores), - 'SRR2040271_1.fastq', 'SRR2040271_1.fastq') + """Ensure --length-tag= also modifies the second header line""" + run("--cores {} -a GGCTTC --length-tag=length=".format(cores), + 'SRR2040271_1.fastq', 'SRR2040271_1.fastq') def test_extensiontxtgz(): - """automatic recognition of "_sequence.txt.gz" extension""" - run("-b TTAGACATATCTCCGTCG", "s_1_sequence.txt", "s_1_sequence.txt.gz") + """automatic recognition of "_sequence.txt.gz" extension""" + run("-b TTAGACATATCTCCGTCG", "s_1_sequence.txt", "s_1_sequence.txt.gz") def test_format(): - """the -f/--format parameter""" - run("-f fastq -b TTAGACATATCTCCGTCG", "small.fastq", "small.myownextension") + """the -f/--format parameter""" + run("-f fastq -b TTAGACATATCTCCGTCG", "small.fastq", "small.myownextension") def test_minimum_length(): - """-m/--minimum-length""" - run("-m 5 -a TTAGACATATCTCCGTCG", "minlen.fa", "lengths.fa") + """-m/--minimum-length""" + run("-m 5 -a TTAGACATATCTCCGTCG", "minlen.fa", "lengths.fa") def test_too_short(tmpdir): - """--too-short-output""" - too_short_path = str(tmpdir.join('tooshort.fa')) - run("-m 5 -a TTAGACATATCTCCGTCG --too-short-output " + too_short_path, "minlen.fa", "lengths.fa") - assert_files_equal(datapath('tooshort.fa'), too_short_path) + """--too-short-output""" + too_short_path = str(tmpdir.join('tooshort.fa')) + run("-m 5 -a TTAGACATATCTCCGTCG --too-short-output " + too_short_path, "minlen.fa", "lengths.fa") + assert_files_equal(datapath('tooshort.fa'), too_short_path) def test_maximum_length(): - """-M/--maximum-length""" - run("-M 5 -a TTAGACATATCTCCGTCG", "maxlen.fa", "lengths.fa") + """-M/--maximum-length""" + run("-M 5 -a TTAGACATATCTCCGTCG", "maxlen.fa", "lengths.fa") def test_too_long(tmpdir): - """--too-long-output""" - too_long_path = str(tmpdir.join('toolong.fa')) - run("-M 5 -a TTAGACATATCTCCGTCG --too-long-output " + too_long_path, "maxlen.fa", "lengths.fa") - assert_files_equal(datapath('toolong.fa'), too_long_path) + """--too-long-output""" + too_long_path = str(tmpdir.join('toolong.fa')) + run("-M 5 -a TTAGACATATCTCCGTCG --too-long-output " + too_long_path, "maxlen.fa", "lengths.fa") + assert_files_equal(datapath('toolong.fa'), too_long_path) def test_length_tag(): - """454 data; -n and --length-tag""" - run("-n 3 -e 0.1 --length-tag length= " - "-b TGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG " - "-b TCCATCTCATCCCTGCGTGTCCCATCTGTTCCCTCCCTGTCTCA", '454.fa', '454.fa') + """454 data; -n and --length-tag""" + run("-n 3 -e 0.1 --length-tag length= " + "-b TGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG " + "-b TCCATCTCATCCCTGCGTGTCCCATCTGTTCCCTCCCTGTCTCA", '454.fa', '454.fa') @pytest.mark.parametrize("length", list(range(3, 11))) def test_overlap_a(tmpdir, length): - """-O/--overlap with -a""" - adapter = "catatctccg" - record = ">read\nGAGACCATTCCAATG" + adapter[:length] + '\n' - input = tmpdir.join("overlap.fasta") - 
input.write(record) - if length < 7: - expected = record - else: - expected = '>read\nGAGACCATTCCAATG\n' - output = tmpdir.join("overlap-trimmed.fasta") - main(["-O", "7", "-e", "0", "-a", adapter, "-o", str(output), str(input)]) - assert expected == output.read() + """-O/--overlap with -a""" + adapter = "catatctccg" + record = ">read\nGAGACCATTCCAATG" + adapter[:length] + '\n' + input = tmpdir.join("overlap.fasta") + input.write(record) + if length < 7: + expected = record + else: + expected = '>read\nGAGACCATTCCAATG\n' + output = tmpdir.join("overlap-trimmed.fasta") + main(["-O", "7", "-e", "0", "-a", adapter, "-o", str(output), str(input)]) + assert expected == output.read() def test_overlap_b(): - """-O/--overlap with -b""" - run("-O 10 -b TTAGACATATCTCCGTCG", "overlapb.fa", "overlapb.fa") + """-O/--overlap with -b""" + run("-O 10 -b TTAGACATATCTCCGTCG", "overlapb.fa", "overlapb.fa") def test_qualtrim(): - """-q with low qualities""" - run("-q 10 -a XXXXXX", "lowqual.fastq", "lowqual.fastq") + """-q with low qualities""" + run("-q 10 -a XXXXXX", "lowqual.fastq", "lowqual.fastq") def test_qualbase(): - """-q with low qualities, using ascii(quality+64) encoding""" - run("-q 10 --quality-base 64 -a XXXXXX", "illumina64.fastq", "illumina64.fastq") + """-q with low qualities, using ascii(quality+64) encoding""" + run("-q 10 --quality-base 64 -a XXXXXX", "illumina64.fastq", "illumina64.fastq") def test_quality_trim_only(): - """only trim qualities, do not remove adapters""" - run("-q 10 --quality-base 64", "illumina64.fastq", "illumina64.fastq") + """only trim qualities, do not remove adapters""" + run("-q 10 --quality-base 64", "illumina64.fastq", "illumina64.fastq") def test_twoadapters(): - """two adapters""" - run("-a AATTTCAGGAATT -a GTTCTCTAGTTCT", "twoadapters.fasta", "twoadapters.fasta") + """two adapters""" + run("-a AATTTCAGGAATT -a GTTCTCTAGTTCT", "twoadapters.fasta", "twoadapters.fasta") def test_polya(): - """poly-A tails""" - run("-m 24 -O 10 -a AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", "polya.fasta", "polya.fasta") + """poly-A tails""" + run("-m 24 -O 10 -a AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", "polya.fasta", "polya.fasta") def test_polya_brace_notation(): - """poly-A tails""" - run("-m 24 -O 10 -a A{35}", "polya.fasta", "polya.fasta") + """poly-A tails""" + run("-m 24 -O 10 -a A{35}", "polya.fasta", "polya.fasta") # the same as --action=none def test_no_trim(): - run("--no-trim --discard-untrimmed -a CCCTAGTTAAAC", 'no-trim.fastq', 'small.fastq') + run("--no-trim --discard-untrimmed -a CCCTAGTTAAAC", 'no-trim.fastq', 'small.fastq') def test_action_none(): - run("--action=none --discard-untrimmed -a CCCTAGTTAAAC", 'no-trim.fastq', 'small.fastq') + run("--action=none --discard-untrimmed -a CCCTAGTTAAAC", 'no-trim.fastq', 'small.fastq') # the same as --action=mask def test_mask_adapter(): - """mask adapter with N (reads maintain the same length)""" - run("-b CAAG -n 3 --mask-adapter", "anywhere_repeat.fastq", "anywhere_repeat.fastq") + """mask adapter with N (reads maintain the same length)""" + run("-b CAAG -n 3 --mask-adapter", "anywhere_repeat.fastq", "anywhere_repeat.fastq") def test_action_mask(): - """mask adapter with N (reads maintain the same length)""" - run("-b CAAG -n 3 --action=mask", "anywhere_repeat.fastq", "anywhere_repeat.fastq") + """mask adapter with N (reads maintain the same length)""" + run("-b CAAG -n 3 --action=mask", "anywhere_repeat.fastq", "anywhere_repeat.fastq") def test_gz_multiblock(): - """compressed gz file with multiple blocks (created by 
concatenating two .gz files)""" - run("-b TTAGACATATCTCCGTCG", "small.fastq", "multiblock.fastq.gz") + """compressed gz file with multiple blocks (created by concatenating two .gz files)""" + run("-b TTAGACATATCTCCGTCG", "small.fastq", "multiblock.fastq.gz") @pytest.mark.parametrize("opt", ["-y", "--suffix"]) def test_suffix(opt): - """-y/--suffix parameter""" - run([opt, ' {name}', '-e', '0', '-a', 'OnlyT=TTTTTTTT', '-a', 'OnlyG=GGGGGGGG'], "suffix.fastq", "suffix.fastq") + """-y/--suffix parameter""" + run([opt, ' {name}', '-e', '0', '-a', 'OnlyT=TTTTTTTT', '-a', 'OnlyG=GGGGGGGG'], "suffix.fastq", "suffix.fastq") def test_read_wildcard(): - """test wildcards in reads""" - run("--match-read-wildcards -b ACGTACGT", "wildcard.fa", "wildcard.fa") + """test wildcards in reads""" + run("--match-read-wildcards -b ACGTACGT", "wildcard.fa", "wildcard.fa") def test_adapter_wildcard(): - """wildcards in adapter""" - for adapter_type, expected in ( - ("-a", "wildcard_adapter.fa"), - ("-b", "wildcard_adapter_anywhere.fa")): - with temporary_path("wildcardtmp.txt") as wildcardtmp: - run("--wildcard-file {0} {1} ACGTNNNACGT".format(wildcardtmp, adapter_type), - expected, "wildcard_adapter.fa") - with open(wildcardtmp) as wct: - lines = wct.readlines() - lines = [ line.strip() for line in lines ] - assert lines == ['AAA 1', 'GGG 2', 'CCC 3b', 'TTT 4b'] + """wildcards in adapter""" + for adapter_type, expected in ( + ("-a", "wildcard_adapter.fa"), + ("-b", "wildcard_adapter_anywhere.fa")): + with temporary_path("wildcardtmp.txt") as wildcardtmp: + run("--wildcard-file {0} {1} ACGTNNNACGT".format(wildcardtmp, adapter_type), + expected, "wildcard_adapter.fa") + with open(wildcardtmp) as wct: + lines = wct.readlines() + lines = [ line.strip() for line in lines ] + assert lines == ['AAA 1', 'GGG 2', 'CCC 3b', 'TTT 4b'] def test_wildcard_N(): - """test 'N' wildcard matching with no allowed errors""" - run("-e 0 -a GGGGGGG --match-read-wildcards", "wildcardN.fa", "wildcardN.fa") + """test 'N' wildcard matching with no allowed errors""" + run("-e 0 -a GGGGGGG --match-read-wildcards", "wildcardN.fa", "wildcardN.fa") def test_illumina_adapter_wildcard(): - run("-a VCCGAMCYUCKHRKDCUBBCNUWNSGHCGU", "illumina.fastq", "illumina.fastq.gz") + run("-a VCCGAMCYUCKHRKDCUBBCNUWNSGHCGU", "illumina.fastq", "illumina.fastq.gz") def test_adapter_front(): - """test adapter in front""" - run("--front ADAPTER -N", "examplefront.fa", "example.fa") + """test adapter in front""" + run("--front ADAPTER -N", "examplefront.fa", "example.fa") def test_literal_N(): - """test matching literal 'N's""" - run("-N -e 0.2 -a NNNNNNNNNNNNNN", "trimN3.fasta", "trimN3.fasta") + """test matching literal 'N's""" + run("-N -e 0.2 -a NNNNNNNNNNNNNN", "trimN3.fasta", "trimN3.fasta") def test_literal_N2(): - run("-N -O 1 -g NNNNNNNNNNNNNN", "trimN5.fasta", "trimN5.fasta") + run("-N -O 1 -g NNNNNNNNNNNNNN", "trimN5.fasta", "trimN5.fasta") def test_literal_N_brace_notation(): - """test matching literal 'N's""" - run("-N -e 0.2 -a N{14}", "trimN3.fasta", "trimN3.fasta") + """test matching literal 'N's""" + run("-N -e 0.2 -a N{14}", "trimN3.fasta", "trimN3.fasta") def test_literal_N2_brace_notation(): - run("-N -O 1 -g N{14}", "trimN5.fasta", "trimN5.fasta") + run("-N -O 1 -g N{14}", "trimN5.fasta", "trimN5.fasta") def test_anchored_front(): - run("-g ^FRONTADAPT -N", "anchored.fasta", "anchored.fasta") + run("-g ^FRONTADAPT -N", "anchored.fasta", "anchored.fasta") def test_anchored_front_ellipsis_notation(): - run("-a FRONTADAPT... 
-N", "anchored.fasta", "anchored.fasta") + run("-a FRONTADAPT... -N", "anchored.fasta", "anchored.fasta") def test_anchored_back(): - run("-a BACKADAPTER$ -N", "anchored-back.fasta", "anchored-back.fasta") + run("-a BACKADAPTER$ -N", "anchored-back.fasta", "anchored-back.fasta") def test_anchored_back_ellipsis_notation(): - run("-a ...BACKADAPTER$ -N", "anchored-back.fasta", "anchored-back.fasta") + run("-a ...BACKADAPTER$ -N", "anchored-back.fasta", "anchored-back.fasta") def test_anchored_back_no_indels(): - run("-a BACKADAPTER$ -N --no-indels", "anchored-back.fasta", "anchored-back.fasta") + run("-a BACKADAPTER$ -N --no-indels", "anchored-back.fasta", "anchored-back.fasta") def test_no_indels(): - run('-a TTAGACATAT -g GAGATTGCCA --no-indels', 'no_indels.fasta', 'no_indels.fasta') + run('-a TTAGACATAT -g GAGATTGCCA --no-indels', 'no_indels.fasta', 'no_indels.fasta') def test_ellipsis_notation(): - run('-a ...TTAGACATAT -g GAGATTGCCA --no-indels', 'no_indels.fasta', 'no_indels.fasta') + run('-a ...TTAGACATAT -g GAGATTGCCA --no-indels', 'no_indels.fasta', 'no_indels.fasta') def test_issue_46(): - """issue 46 - IndexError with --wildcard-file""" - with temporary_path("wildcardtmp.txt") as wildcardtmp: - run("--anywhere=AACGTN --wildcard-file={0}".format(wildcardtmp), "issue46.fasta", "issue46.fasta") + """issue 46 - IndexError with --wildcard-file""" + with temporary_path("wildcardtmp.txt") as wildcardtmp: + run("--anywhere=AACGTN --wildcard-file={0}".format(wildcardtmp), "issue46.fasta", "issue46.fasta") def test_strip_suffix(): - run("--strip-suffix _sequence -a XXXXXXX", "stripped.fasta", "simple.fasta") + run("--strip-suffix _sequence -a XXXXXXX", "stripped.fasta", "simple.fasta") def test_info_file(): - # The true adapter sequence in the illumina.fastq.gz data set is - # GCCTAACTTCTTAGACTGCCTTAAGGACGT (fourth base is different) - # - with temporary_path("infotmp.txt") as infotmp: - run(["--info-file", infotmp, '-a', 'adapt=GCCGAACTTCTTAGACTGCCTTAAGGACGT'], - "illumina.fastq", "illumina.fastq.gz") - assert_files_equal(cutpath('illumina.info.txt'), infotmp) + # The true adapter sequence in the illumina.fastq.gz data set is + # GCCTAACTTCTTAGACTGCCTTAAGGACGT (fourth base is different) + # + with temporary_path("infotmp.txt") as infotmp: + run(["--info-file", infotmp, '-a', 'adapt=GCCGAACTTCTTAGACTGCCTTAAGGACGT'], + "illumina.fastq", "illumina.fastq.gz") + assert_files_equal(cutpath('illumina.info.txt'), infotmp) def test_info_file_times(): - with temporary_path("infotmp.txt") as infotmp: - run(["--info-file", infotmp, '--times', '2', '-a', 'adapt=GCCGAACTTCTTA', - '-a', 'adapt2=GACTGCCTTAAGGACGT'], "illumina5.fastq", "illumina5.fastq") - assert_files_equal(cutpath('illumina5.info.txt'), infotmp) + with temporary_path("infotmp.txt") as infotmp: + run(["--info-file", infotmp, '--times', '2', '-a', 'adapt=GCCGAACTTCTTA', + '-a', 'adapt2=GACTGCCTTAAGGACGT'], "illumina5.fastq", "illumina5.fastq") + assert_files_equal(cutpath('illumina5.info.txt'), infotmp) def test_info_file_fasta(): - with temporary_path("infotmp.txt") as infotmp: - # Just make sure that it runs - run(['--info-file', infotmp, '-a', 'TTAGACATAT', '-g', 'GAGATTGCCA', '--no-indels'], 'no_indels.fasta', 'no_indels.fasta') + with temporary_path("infotmp.txt") as infotmp: + # Just make sure that it runs + run(['--info-file', infotmp, '-a', 'TTAGACATAT', '-g', 'GAGATTGCCA', '--no-indels'], 'no_indels.fasta', 'no_indels.fasta') def test_named_adapter(): - run("-a MY_ADAPTER=GCCGAACTTCTTAGACTGCCTTAAGGACGT", "illumina.fastq", 
"illumina.fastq.gz") + run("-a MY_ADAPTER=GCCGAACTTCTTAGACTGCCTTAAGGACGT", "illumina.fastq", "illumina.fastq.gz") def test_adapter_with_u(): - run("-a GCCGAACUUCUUAGACUGCCUUAAGGACGU", "illumina.fastq", "illumina.fastq.gz") + run("-a GCCGAACUUCUUAGACUGCCUUAAGGACGU", "illumina.fastq", "illumina.fastq.gz") def test_bzip2(): - run('-b TTAGACATATCTCCGTCG', 'small.fastq', 'small.fastq.bz2') + run('-b TTAGACATATCTCCGTCG', 'small.fastq', 'small.fastq.bz2') if sys.version_info[:2] >= (3, 3): - def test_bzip2_multiblock(): - run('-b TTAGACATATCTCCGTCG', 'small.fastq', 'multiblock.fastq.bz2') + def test_bzip2_multiblock(): + run('-b TTAGACATATCTCCGTCG', 'small.fastq', 'multiblock.fastq.bz2') try: - import lzma + import lzma - def test_xz(): - run('-b TTAGACATATCTCCGTCG', 'small.fastq', 'small.fastq.xz') + def test_xz(): + run('-b TTAGACATATCTCCGTCG', 'small.fastq', 'small.fastq.xz') except ImportError: - pass + pass def test_no_args(): - with pytest.raises(SystemExit): - with redirect_stderr(): - main([]) + with pytest.raises(SystemExit): + with redirect_stderr(): + main([]) def test_two_fastqs(): - with pytest.raises(SystemExit): - with redirect_stderr(): - main([datapath('paired.1.fastq'), datapath('paired.2.fastq')]) + with pytest.raises(SystemExit): + with redirect_stderr(): + main([datapath('paired.1.fastq'), datapath('paired.2.fastq')]) def test_anchored_no_indels(): - """anchored 5' adapter, mismatches only (no indels)""" - run('-g ^TTAGACATAT --no-indels -e 0.1', 'anchored_no_indels.fasta', 'anchored_no_indels.fasta') + """anchored 5' adapter, mismatches only (no indels)""" + run('-g ^TTAGACATAT --no-indels -e 0.1', 'anchored_no_indels.fasta', 'anchored_no_indels.fasta') def test_anchored_no_indels_wildcard_read(): - """anchored 5' adapter, mismatches only (no indels), but wildcards in the read count as matches""" - run('-g ^TTAGACATAT --match-read-wildcards --no-indels -e 0.1', 'anchored_no_indels_wildcard.fasta', 'anchored_no_indels.fasta') + """anchored 5' adapter, mismatches only (no indels), but wildcards in the read count as matches""" + run('-g ^TTAGACATAT --match-read-wildcards --no-indels -e 0.1', 'anchored_no_indels_wildcard.fasta', 'anchored_no_indels.fasta') def test_anchored_no_indels_wildcard_adapt(): - """anchored 5' adapter, mismatches only (no indels), but wildcards in the adapter count as matches""" - run('-g ^TTAGACANAT --no-indels -e 0.1', 'anchored_no_indels.fasta', 'anchored_no_indels.fasta') + """anchored 5' adapter, mismatches only (no indels), but wildcards in the adapter count as matches""" + run('-g ^TTAGACANAT --no-indels -e 0.1', 'anchored_no_indels.fasta', 'anchored_no_indels.fasta') def test_non_iupac_characters(): - with pytest.raises(SystemExit): - with redirect_stderr(): - main(['-a', 'ZACGT', datapath('small.fastq')]) + with pytest.raises(SystemExit): + with redirect_stderr(): + main(['-a', 'ZACGT', datapath('small.fastq')]) def test_unconditional_cut_front(): - run('-u 5', 'unconditional-front.fastq', 'small.fastq') + run('-u 5', 'unconditional-front.fastq', 'small.fastq') def test_unconditional_cut_back(): - run('-u -5', 'unconditional-back.fastq', 'small.fastq') + run('-u -5', 'unconditional-back.fastq', 'small.fastq') def test_unconditional_cut_both(): - run('-u -5 -u 5', 'unconditional-both.fastq', 'small.fastq') + run('-u -5 -u 5', 'unconditional-both.fastq', 'small.fastq') def test_untrimmed_output(): - with temporary_path('untrimmed.tmp.fastq') as tmp: - run(['-a', 'TTAGACATATCTCCGTCG', '--untrimmed-output', tmp], 'small.trimmed.fastq', 
'small.fastq') - assert_files_equal(cutpath('small.untrimmed.fastq'), tmp) + with temporary_path('untrimmed.tmp.fastq') as tmp: + run(['-a', 'TTAGACATATCTCCGTCG', '--untrimmed-output', tmp], 'small.trimmed.fastq', 'small.fastq') + assert_files_equal(cutpath('small.untrimmed.fastq'), tmp) def test_adapter_file(): - run('-a file:' + datapath('adapter.fasta'), 'illumina.fastq', 'illumina.fastq.gz') + run('-a file:' + datapath('adapter.fasta'), 'illumina.fastq', 'illumina.fastq.gz') def test_adapter_file_5p_anchored(): - run('-N -g file:' + datapath('prefix-adapter.fasta'), 'anchored.fasta', 'anchored.fasta') + run('-N -g file:' + datapath('prefix-adapter.fasta'), 'anchored.fasta', 'anchored.fasta') def test_adapter_file_3p_anchored(): - run('-N -a file:' + datapath('suffix-adapter.fasta'), 'anchored-back.fasta', 'anchored-back.fasta') + run('-N -a file:' + datapath('suffix-adapter.fasta'), 'anchored-back.fasta', 'anchored-back.fasta') def test_adapter_file_5p_anchored_no_indels(): - run('-N --no-indels -g file:' + datapath('prefix-adapter.fasta'), 'anchored.fasta', 'anchored.fasta') + run('-N --no-indels -g file:' + datapath('prefix-adapter.fasta'), 'anchored.fasta', 'anchored.fasta') def test_adapter_file_3p_anchored_no_indels(): - run('-N --no-indels -a file:' + datapath('suffix-adapter.fasta'), 'anchored-back.fasta', 'anchored-back.fasta') + run('-N --no-indels -a file:' + datapath('suffix-adapter.fasta'), 'anchored-back.fasta', 'anchored-back.fasta') def test_demultiplex(): - tempdir = tempfile.mkdtemp(prefix='cutadapt-tests.') - multiout = os.path.join(tempdir, 'tmp-demulti.{name}.fasta') - params = ['-a', 'first=AATTTCAGGAATT', '-a', 'second=GTTCTCTAGTTCT', '-o', multiout, datapath('twoadapters.fasta')] - assert main(params) is None - assert_files_equal(cutpath('twoadapters.first.fasta'), multiout.format(name='first')) - assert_files_equal(cutpath('twoadapters.second.fasta'), multiout.format(name='second')) - assert_files_equal(cutpath('twoadapters.unknown.fasta'), multiout.format(name='unknown')) - shutil.rmtree(tempdir) + tempdir = tempfile.mkdtemp(prefix='cutadapt-tests.') + multiout = os.path.join(tempdir, 'tmp-demulti.{name}.fasta') + params = ['-a', 'first=AATTTCAGGAATT', '-a', 'second=GTTCTCTAGTTCT', '-o', multiout, datapath('twoadapters.fasta')] + assert main(params) is None + assert_files_equal(cutpath('twoadapters.first.fasta'), multiout.format(name='first')) + assert_files_equal(cutpath('twoadapters.second.fasta'), multiout.format(name='second')) + assert_files_equal(cutpath('twoadapters.unknown.fasta'), multiout.format(name='unknown')) + shutil.rmtree(tempdir) def test_max_n(): - run('--max-n 0', 'maxn0.fasta', 'maxn.fasta') - run('--max-n 1', 'maxn1.fasta', 'maxn.fasta') - run('--max-n 2', 'maxn2.fasta', 'maxn.fasta') - run('--max-n 0.2', 'maxn0.2.fasta', 'maxn.fasta') - run('--max-n 0.4', 'maxn0.4.fasta', 'maxn.fasta') + run('--max-n 0', 'maxn0.fasta', 'maxn.fasta') + run('--max-n 1', 'maxn1.fasta', 'maxn.fasta') + run('--max-n 2', 'maxn2.fasta', 'maxn.fasta') + run('--max-n 0.2', 'maxn0.2.fasta', 'maxn.fasta') + run('--max-n 0.4', 'maxn0.4.fasta', 'maxn.fasta') def test_quiet_is_quiet(): - captured_standard_output = StringIO() - captured_standard_error = StringIO() - old_stdout = sys.stdout - old_stderr = sys.stderr - try: - sys.stdout = captured_standard_output - sys.stderr = captured_standard_error - main(['-o', '/dev/null', '--quiet', '-a', 'XXXX', datapath('illumina.fastq.gz')]) - finally: - sys.stdout = old_stdout - sys.stderr = old_stderr - assert 
captured_standard_output.getvalue() == '' - assert captured_standard_error.getvalue() == '' + captured_standard_output = StringIO() + captured_standard_error = StringIO() + old_stdout = sys.stdout + old_stderr = sys.stderr + try: + sys.stdout = captured_standard_output + sys.stderr = captured_standard_error + main(['-o', '/dev/null', '--quiet', '-a', 'XXXX', datapath('illumina.fastq.gz')]) + finally: + sys.stdout = old_stdout + sys.stderr = old_stderr + assert captured_standard_output.getvalue() == '' + assert captured_standard_error.getvalue() == '' def test_x_brace_notation(): - main(['-o', '/dev/null', '--quiet', '-a', 'X{5}', datapath('small.fastq')]) + main(['-o', '/dev/null', '--quiet', '-a', 'X{5}', datapath('small.fastq')]) def test_nextseq(): - run('--nextseq-trim 22', 'nextseq.fastq', 'nextseq.fastq') + run('--nextseq-trim 22', 'nextseq.fastq', 'nextseq.fastq') def test_linked(): - run('-a AAAAAAAAAA...TTTTTTTTTT', 'linked.fasta', 'linked.fasta') + run('-a AAAAAAAAAA...TTTTTTTTTT', 'linked.fasta', 'linked.fasta') def test_linked_explicitly_anchored(): - run('-a ^AAAAAAAAAA...TTTTTTTTTT', 'linked.fasta', 'linked.fasta') + run('-a ^AAAAAAAAAA...TTTTTTTTTT', 'linked.fasta', 'linked.fasta') def test_linked_multiple(): - run('-a AAAAAAAAAA...TTTTTTTTTT -a AAAAAAAAAA...GCGCGCGCGC', 'linked.fasta', 'linked.fasta') + run('-a AAAAAAAAAA...TTTTTTTTTT -a AAAAAAAAAA...GCGCGCGCGC', 'linked.fasta', 'linked.fasta') def test_linked_both_anchored(): - run('-a AAAAAAAAAA...TTTTT$', 'linked-anchored.fasta', 'linked.fasta') + run('-a AAAAAAAAAA...TTTTT$', 'linked-anchored.fasta', 'linked.fasta') def test_linked_5p_not_anchored(): - run('-g AAAAAAAAAA...TTTTTTTTTT', 'linked-not-anchored.fasta', 'linked.fasta') + run('-g AAAAAAAAAA...TTTTTTTTTT', 'linked-not-anchored.fasta', 'linked.fasta') def test_linked_discard_untrimmed(): - run('-a AAAAAAAAAA...TTTTTTTTTT --discard-untrimmed', 'linked-discard.fasta', 'linked.fasta') + run('-a AAAAAAAAAA...TTTTTTTTTT --discard-untrimmed', 'linked-discard.fasta', 'linked.fasta') def test_linked_discard_untrimmed_g(): - run('-g AAAAAAAAAA...TTTTTTTTTT --discard-untrimmed', 'linked-discard-g.fasta', 'linked.fasta') + run('-g AAAAAAAAAA...TTTTTTTTTT --discard-untrimmed', 'linked-discard-g.fasta', 'linked.fasta') def test_linked_anywhere(): - with pytest.raises(SystemExit): - with redirect_stderr(): - main(['-b', 'AAA...TTT', datapath('linked.fasta')]) + with pytest.raises(SystemExit): + with redirect_stderr(): + main(['-b', 'AAA...TTT', datapath('linked.fasta')]) def test_anywhere_anchored_5p(): - with pytest.raises(SystemExit): - with redirect_stderr(): - main(['-b', '^AAA', datapath('small.fastq')]) + with pytest.raises(SystemExit): + with redirect_stderr(): + main(['-b', '^AAA', datapath('small.fastq')]) def test_anywhere_anchored_3p(): - with pytest.raises(SystemExit): - with redirect_stderr(): - main(['-b', 'TTT$', datapath('small.fastq')]) + with pytest.raises(SystemExit): + with redirect_stderr(): + main(['-b', 'TTT$', datapath('small.fastq')]) def test_fasta(): - run('-a TTAGACATATCTCCGTCG', 'small.fasta', 'small.fastq') + run('-a TTAGACATATCTCCGTCG', 'small.fasta', 'small.fastq') def test_fasta_no_trim(): - run([], 'small-no-trim.fasta', 'small.fastq') + run([], 'small-no-trim.fasta', 'small.fastq') def test_length(): - run('--length 5', 'shortened.fastq', 'small.fastq') + run('--length 5', 'shortened.fastq', 'small.fastq') def test_negative_length(): - run('--length -5', 'shortened-negative.fastq', 'small.fastq') + run('--length -5', 
'shortened-negative.fastq', 'small.fastq') def test_run_cutadapt_process(): - subprocess.check_call(['cutadapt', '--version']) + subprocess.check_call(['cutadapt', '--version']) @pytest.mark.timeout(0.5) def test_issue_296(tmpdir): - # Hang when using both --no-trim and --info-file together - info_path = str(tmpdir.join('info.txt')) - reads_path = str(tmpdir.join('reads.fasta')) - out_path = str(tmpdir.join('out.fasta')) - with open(reads_path, 'w') as f: - f.write('>read\nCACAAA\n') - main(['--info-file', info_path, '--no-trim', '-g', 'TTTCAC', '-o', out_path, reads_path]) - # Output should be unchanged because of --no-trim - assert_files_equal(reads_path, out_path) + # Hang when using both --no-trim and --info-file together + info_path = str(tmpdir.join('info.txt')) + reads_path = str(tmpdir.join('reads.fasta')) + out_path = str(tmpdir.join('out.fasta')) + with open(reads_path, 'w') as f: + f.write('>read\nCACAAA\n') + main(['--info-file', info_path, '--no-trim', '-g', 'TTTCAC', '-o', out_path, reads_path]) + # Output should be unchanged because of --no-trim + assert_files_equal(reads_path, out_path) def test_xadapter(): - run('-g XTCCGAATAGA', 'xadapter.fasta', 'xadapterx.fasta') + run('-g XTCCGAATAGA', 'xadapter.fasta', 'xadapterx.fasta') def test_adapterx(): - run('-a TCCGAATAGAX', 'adapterx.fasta', 'xadapterx.fasta') + run('-a TCCGAATAGAX', 'adapterx.fasta', 'xadapterx.fasta') def test_discard_casava(): - run('--discard-casava', 'casava.fastq', 'casava.fastq') + run('--discard-casava', 'casava.fastq', 'casava.fastq') def test_underscore(): - """File name ending in _fastq.gz (issue #275)""" - run('-b TTAGACATATCTCCGTCG', 'small.fastq', 'underscore_fastq.gz') + """File name ending in _fastq.gz (issue #275)""" + run('-b TTAGACATATCTCCGTCG', 'small.fastq', 'underscore_fastq.gz') def test_cores_autodetect(): - # Just make sure that it runs; functionality is not tested - run('--cores 0 -b TTAGACATATCTCCGTCG', 'small.fastq', 'underscore_fastq.gz') + # Just make sure that it runs; functionality is not tested + run('--cores 0 -b TTAGACATATCTCCGTCG', 'small.fastq', 'underscore_fastq.gz') def test_write_compressed_fastq(cores, tmpdir): - main(['--cores', str(cores), '-o', str(tmpdir.join('out.fastq.gz')), datapath('small.fastq')]) + main(['--cores', str(cores), '-o', str(tmpdir.join('out.fastq.gz')), datapath('small.fastq')]) diff --git a/tests/test_filters.py b/tests/test_filters.py index 8fe98257..d584cc70 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -8,33 +8,33 @@ @mark.parametrize('seq,count,expected', [ - ('AAA', 0, KEEP), - ('AAA', 1, KEEP), - ('AAACCTTGGN', 1, KEEP), - ('AAACNNNCTTGGN', 0.5, KEEP), - ('NNNNNN', 1, DISCARD), - ('ANAAAA', 1 / 6, KEEP), - ('ANAAAA', 0, DISCARD), + ('AAA', 0, KEEP), + ('AAA', 1, KEEP), + ('AAACCTTGGN', 1, KEEP), + ('AAACNNNCTTGGN', 0.5, KEEP), + ('NNNNNN', 1, DISCARD), + ('ANAAAA', 1 / 6, KEEP), + ('ANAAAA', 0, DISCARD), ]) def test_ncontentfilter(seq, count, expected): - # third parameter is True if read should be discarded - filter_ = NContentFilter(count=count) - _seq = Sequence('read1', seq, qualities='#'*len(seq)) - assert filter_(_seq, []) == expected + # third parameter is True if read should be discarded + filter_ = NContentFilter(count=count) + _seq = Sequence('read1', seq, qualities='#'*len(seq)) + assert filter_(_seq, []) == expected @mark.parametrize('seq1,seq2,count,expected', [ - ('AAA', 'AAA', 0, KEEP), - ('AAAN', 'AAA', 0, DISCARD), - ('AAA', 'AANA', 0, DISCARD), - ('ANAA', 'AANA', 1, KEEP), + ('AAA', 'AAA', 0, KEEP), + 
('AAAN', 'AAA', 0, DISCARD), + ('AAA', 'AANA', 0, DISCARD), + ('ANAA', 'AANA', 1, KEEP), ]) def test_ncontentfilter_paired(seq1, seq2, count, expected): - filter_ = NContentFilter(count=count) - filter_legacy = PairedRedirector(None, filter_, filter_, pair_filter_mode='first') - filter_any = PairedRedirector(None, filter_, filter_, pair_filter_mode='any') - read1 = Sequence('read1', seq1, qualities='#'*len(seq1)) - read2 = Sequence('read1', seq2, qualities='#'*len(seq2)) - assert filter_legacy(read1, read2, [], []) == filter_(read1, []) - # discard entire pair if one of the reads fulfills criteria - assert filter_any(read1, read2, [], []) == expected + filter_ = NContentFilter(count=count) + filter_legacy = PairedRedirector(None, filter_, filter_, pair_filter_mode='first') + filter_any = PairedRedirector(None, filter_, filter_, pair_filter_mode='any') + read1 = Sequence('read1', seq1, qualities='#'*len(seq1)) + read2 = Sequence('read1', seq2, qualities='#'*len(seq2)) + assert filter_legacy(read1, read2, [], []) == filter_(read1, []) + # discard entire pair if one of the reads fulfills criteria + assert filter_any(read1, read2, [], []) == expected diff --git a/tests/test_modifiers.py b/tests/test_modifiers.py index 7bd9d2e9..15316cfe 100644 --- a/tests/test_modifiers.py +++ b/tests/test_modifiers.py @@ -1,51 +1,51 @@ from dnaio import Sequence from cutadapt.modifiers import (UnconditionalCutter, NEndTrimmer, QualityTrimmer, - Shortener) + Shortener) def test_unconditional_cutter(): - uc = UnconditionalCutter(length=5) - s = 'abcdefg' - assert UnconditionalCutter(length=2)(s, []) == 'cdefg' - assert UnconditionalCutter(length=-2)(s, []) == 'abcde' - assert UnconditionalCutter(length=100)(s, []) == '' - assert UnconditionalCutter(length=-100)(s, []) == '' + uc = UnconditionalCutter(length=5) + s = 'abcdefg' + assert UnconditionalCutter(length=2)(s, []) == 'cdefg' + assert UnconditionalCutter(length=-2)(s, []) == 'abcde' + assert UnconditionalCutter(length=100)(s, []) == '' + assert UnconditionalCutter(length=-100)(s, []) == '' def test_nend_trimmer(): - trimmer = NEndTrimmer() - seqs = ['NNNNAAACCTTGGNNN', 'NNNNAAACNNNCTTGGNNN', 'NNNNNN'] - trims = ['AAACCTTGG', 'AAACNNNCTTGG', ''] - for seq, trimmed in zip(seqs, trims): - _seq = Sequence('read1', seq, qualities='#'*len(seq)) - _trimmed = Sequence('read1', trimmed, qualities='#'*len(trimmed)) - assert trimmer(_seq, []) == _trimmed + trimmer = NEndTrimmer() + seqs = ['NNNNAAACCTTGGNNN', 'NNNNAAACNNNCTTGGNNN', 'NNNNNN'] + trims = ['AAACCTTGG', 'AAACNNNCTTGG', ''] + for seq, trimmed in zip(seqs, trims): + _seq = Sequence('read1', seq, qualities='#'*len(seq)) + _trimmed = Sequence('read1', trimmed, qualities='#'*len(trimmed)) + assert trimmer(_seq, []) == _trimmed def test_quality_trimmer(): - read = Sequence('read1', 'ACGTTTACGTA', '##456789###') + read = Sequence('read1', 'ACGTTTACGTA', '##456789###') - qt = QualityTrimmer(10, 10, 33) - assert qt(read, []) == Sequence('read1', 'GTTTAC', '456789') + qt = QualityTrimmer(10, 10, 33) + assert qt(read, []) == Sequence('read1', 'GTTTAC', '456789') - qt = QualityTrimmer(0, 10, 33) - assert qt(read, []) == Sequence('read1', 'ACGTTTAC', '##456789') + qt = QualityTrimmer(0, 10, 33) + assert qt(read, []) == Sequence('read1', 'ACGTTTAC', '##456789') - qt = QualityTrimmer(10, 0, 33) - assert qt(read, []) == Sequence('read1', 'GTTTACGTA', '456789###') + qt = QualityTrimmer(10, 0, 33) + assert qt(read, []) == Sequence('read1', 'GTTTACGTA', '456789###') def test_shortener(): - read = Sequence('read1', 
'ACGTTTACGTA', '##456789###') + read = Sequence('read1', 'ACGTTTACGTA', '##456789###') - shortener = Shortener(0) - assert shortener(read, []) == Sequence('read1', '', '') + shortener = Shortener(0) + assert shortener(read, []) == Sequence('read1', '', '') - shortener = Shortener(1) - assert shortener(read, []) == Sequence('read1', 'A', '#') + shortener = Shortener(1) + assert shortener(read, []) == Sequence('read1', 'A', '#') - shortener = Shortener(5) - assert shortener(read, []) == Sequence('read1', 'ACGTT', '##456') + shortener = Shortener(5) + assert shortener(read, []) == Sequence('read1', 'ACGTT', '##456') - shortener = Shortener(100) - assert shortener(read, []) == read + shortener = Shortener(100) + assert shortener(read, []) == read diff --git a/tests/test_paired.py b/tests/test_paired.py index 28bb9cb8..6c6293e8 100644 --- a/tests/test_paired.py +++ b/tests/test_paired.py @@ -10,425 +10,425 @@ def run_paired(params, in1, in2, expected1, expected2, cores): - if type(params) is str: - params = params.split() - params += ['--cores', str(cores), '--buffer-size=512'] - with temporary_path('tmp1-' + expected1) as p1: - with temporary_path('tmp2-' + expected2) as p2: - params += ['-o', p1, '-p', p2] - params += [datapath(in1), datapath(in2)] - assert main(params) is None - assert_files_equal(cutpath(expected1), p1) - assert_files_equal(cutpath(expected2), p2) + if type(params) is str: + params = params.split() + params += ['--cores', str(cores), '--buffer-size=512'] + with temporary_path('tmp1-' + expected1) as p1: + with temporary_path('tmp2-' + expected2) as p2: + params += ['-o', p1, '-p', p2] + params += [datapath(in1), datapath(in2)] + assert main(params) is None + assert_files_equal(cutpath(expected1), p1) + assert_files_equal(cutpath(expected2), p2) def run_interleaved(params, inpath1, inpath2=None, expected1=None, expected2=None, cores=1): - """ - Interleaved input or output (or both) - """ - assert not (inpath1 and inpath2 and expected1 and expected2) - assert not (expected2 and not expected1) - assert not (inpath2 and not inpath1) - if type(params) is str: - params = params.split() - params += ['--interleaved', '--cores', str(cores), '--buffer-size=512'] - with temporary_path('tmp1-' + expected1) as tmp1: - params += ['-o', tmp1] - paths = [datapath(inpath1)] - if inpath2: - paths += [datapath(inpath2)] - if expected2: - with temporary_path('tmp2-' + expected2) as tmp2: - params += ['-p', tmp2] - assert main(params + paths) is None - assert_files_equal(cutpath(expected2), tmp2) - else: - assert main(params + paths) is None - assert_files_equal(cutpath(expected1), tmp1) + """ + Interleaved input or output (or both) + """ + assert not (inpath1 and inpath2 and expected1 and expected2) + assert not (expected2 and not expected1) + assert not (inpath2 and not inpath1) + if type(params) is str: + params = params.split() + params += ['--interleaved', '--cores', str(cores), '--buffer-size=512'] + with temporary_path('tmp1-' + expected1) as tmp1: + params += ['-o', tmp1] + paths = [datapath(inpath1)] + if inpath2: + paths += [datapath(inpath2)] + if expected2: + with temporary_path('tmp2-' + expected2) as tmp2: + params += ['-p', tmp2] + assert main(params + paths) is None + assert_files_equal(cutpath(expected2), tmp2) + else: + assert main(params + paths) is None + assert_files_equal(cutpath(expected1), tmp1) def test_paired_separate(): - """test separate trimming of paired-end reads""" - run('-a TTAGACATAT', 'paired-separate.1.fastq', 'paired.1.fastq') - run('-a CAGTGGAGTA', 
'paired-separate.2.fastq', 'paired.2.fastq') + """test separate trimming of paired-end reads""" + run('-a TTAGACATAT', 'paired-separate.1.fastq', 'paired.1.fastq') + run('-a CAGTGGAGTA', 'paired-separate.2.fastq', 'paired.2.fastq') def test_paired_end_legacy(cores): - """--paired-output, not using -A/-B/-G""" - # the -m 14 filters out one read, which should then also be filtered out in the second output file - # -q 10 should not change anything: qualities in file 1 are high enough, - # qualities in file 2 should not be inspected. - run_paired( - '-a TTAGACATAT -m 14 -q 10', - in1='paired.1.fastq', in2='paired.2.fastq', - expected1='paired.m14.1.fastq', expected2='paired.m14.2.fastq', - cores=cores - ) + """--paired-output, not using -A/-B/-G""" + # the -m 14 filters out one read, which should then also be filtered out in the second output file + # -q 10 should not change anything: qualities in file 1 are high enough, + # qualities in file 2 should not be inspected. + run_paired( + '-a TTAGACATAT -m 14 -q 10', + in1='paired.1.fastq', in2='paired.2.fastq', + expected1='paired.m14.1.fastq', expected2='paired.m14.2.fastq', + cores=cores + ) def test_untrimmed_paired_output(): - with temporary_path("tmp-untrimmed.1.fastq") as untrimmed1: - with temporary_path("tmp-untrimmed.2.fastq") as untrimmed2: - run_paired( - ['-a', 'TTAGACATAT', - '--untrimmed-output', untrimmed1, - '--untrimmed-paired-output', untrimmed2], - in1='paired.1.fastq', in2='paired.2.fastq', - expected1='paired-trimmed.1.fastq', expected2='paired-trimmed.2.fastq', - cores=1 - ) - assert_files_equal(cutpath('paired-untrimmed.1.fastq'), untrimmed1) - assert_files_equal(cutpath('paired-untrimmed.2.fastq'), untrimmed2) + with temporary_path("tmp-untrimmed.1.fastq") as untrimmed1: + with temporary_path("tmp-untrimmed.2.fastq") as untrimmed2: + run_paired( + ['-a', 'TTAGACATAT', + '--untrimmed-output', untrimmed1, + '--untrimmed-paired-output', untrimmed2], + in1='paired.1.fastq', in2='paired.2.fastq', + expected1='paired-trimmed.1.fastq', expected2='paired-trimmed.2.fastq', + cores=1 + ) + assert_files_equal(cutpath('paired-untrimmed.1.fastq'), untrimmed1) + assert_files_equal(cutpath('paired-untrimmed.2.fastq'), untrimmed2) def test_explicit_format_with_paired(): - # Use --format=fastq with input files whose extension is .txt - with temporary_path("paired.1.txt") as txt1: - with temporary_path("paired.2.txt") as txt2: - shutil.copyfile(datapath("paired.1.fastq"), txt1) - shutil.copyfile(datapath("paired.2.fastq"), txt2) - run_paired( - '--format=fastq -a TTAGACATAT -m 14', - in1=txt1, in2=txt2, - expected1='paired.m14.1.fastq', - expected2='paired.m14.2.fastq', - cores=1 - ) + # Use --format=fastq with input files whose extension is .txt + with temporary_path("paired.1.txt") as txt1: + with temporary_path("paired.2.txt") as txt2: + shutil.copyfile(datapath("paired.1.fastq"), txt1) + shutil.copyfile(datapath("paired.2.fastq"), txt2) + run_paired( + '--format=fastq -a TTAGACATAT -m 14', + in1=txt1, in2=txt2, + expected1='paired.m14.1.fastq', + expected2='paired.m14.2.fastq', + cores=1 + ) def test_no_trimming_legacy(): - # make sure that this doesn't divide by zero - main([ - '-a', 'XXXXX', '-o', '/dev/null', '-p', '/dev/null', - datapath('paired.1.fastq'), datapath('paired.2.fastq')]) + # make sure that this doesn't divide by zero + main([ + '-a', 'XXXXX', '-o', '/dev/null', '-p', '/dev/null', + datapath('paired.1.fastq'), datapath('paired.2.fastq')]) def test_no_trimming(): - # make sure that this doesn't divide by zero - main([ - 
'-a', 'XXXXX', '-A', 'XXXXX', '-o', '/dev/null', '-p', '/dev/null', - datapath('paired.1.fastq'), datapath('paired.2.fastq')]) + # make sure that this doesn't divide by zero + main([ + '-a', 'XXXXX', '-A', 'XXXXX', '-o', '/dev/null', '-p', '/dev/null', + datapath('paired.1.fastq'), datapath('paired.2.fastq')]) def test_missing_file(tmpdir): - with redirect_stderr(): - with pytest.raises(SystemExit): - main(['--paired-output', str(tmpdir.join('out.fastq')), datapath('paired.1.fastq')]) + with redirect_stderr(): + with pytest.raises(SystemExit): + main(['--paired-output', str(tmpdir.join('out.fastq')), datapath('paired.1.fastq')]) def test_first_too_short(tmpdir, cores): - # Create a truncated file in which the last read is missing - trunc1 = tmpdir.join("truncated.1.fastq") - with open(datapath('paired.1.fastq')) as f: - lines = f.readlines() - lines = lines[:-4] - trunc1.write(''.join(lines)) - - with redirect_stderr(): - with pytest.raises(SystemExit): - main([ - '-o', '/dev/null', - '--paired-output', str(tmpdir.join('out.fastq')), - '--cores', str(cores), - str(trunc1), datapath('paired.2.fastq') - ]) + # Create a truncated file in which the last read is missing + trunc1 = tmpdir.join("truncated.1.fastq") + with open(datapath('paired.1.fastq')) as f: + lines = f.readlines() + lines = lines[:-4] + trunc1.write(''.join(lines)) + + with redirect_stderr(): + with pytest.raises(SystemExit): + main([ + '-o', '/dev/null', + '--paired-output', str(tmpdir.join('out.fastq')), + '--cores', str(cores), + str(trunc1), datapath('paired.2.fastq') + ]) def test_second_too_short(tmpdir, cores): - # Create a truncated file in which the last read is missing - trunc2 = tmpdir.join("truncated.2.fastq") - with open(datapath('paired.2.fastq')) as f: - lines = f.readlines() - lines = lines[:-4] - trunc2.write(''.join(lines)) - - with redirect_stderr(): - with pytest.raises(SystemExit): - main([ - '-o', '/dev/null', - '--paired-output', str(tmpdir.join('out.fastq')), - '--cores', str(cores), - datapath('paired.1.fastq'), str(trunc2) - ]) + # Create a truncated file in which the last read is missing + trunc2 = tmpdir.join("truncated.2.fastq") + with open(datapath('paired.2.fastq')) as f: + lines = f.readlines() + lines = lines[:-4] + trunc2.write(''.join(lines)) + + with redirect_stderr(): + with pytest.raises(SystemExit): + main([ + '-o', '/dev/null', + '--paired-output', str(tmpdir.join('out.fastq')), + '--cores', str(cores), + datapath('paired.1.fastq'), str(trunc2) + ]) def test_unmatched_read_names(tmpdir, cores): - # Create a file in which reads 2 and 1 are swapped - with open(datapath('paired.1.fastq')) as f: - lines = f.readlines() - lines = lines[0:4] + lines[8:12] + lines[4:8] + lines[12:] - swapped = tmpdir.join("swapped.1.fastq") + # Create a file in which reads 2 and 1 are swapped + with open(datapath('paired.1.fastq')) as f: + lines = f.readlines() + lines = lines[0:4] + lines[8:12] + lines[4:8] + lines[12:] + swapped = tmpdir.join("swapped.1.fastq") - swapped.write(''.join(lines)) + swapped.write(''.join(lines)) - with pytest.raises(SystemExit): - main([ - '-o', str(tmpdir.join('out1.fastq')), - '--paired-output', str(tmpdir.join('out2.fastq')), - '--cores', str(cores), - str(swapped), datapath('paired.2.fastq') - ]) + with pytest.raises(SystemExit): + main([ + '-o', str(tmpdir.join('out1.fastq')), + '--paired-output', str(tmpdir.join('out2.fastq')), + '--cores', str(cores), + str(swapped), datapath('paired.2.fastq') + ]) def test_p_without_o(cores): - """Option -p given but -o missing""" - with 
pytest.raises(SystemExit): - main('-a XX -p /dev/null'.split() - + ['--cores', str(cores)] - + [datapath('paired.1.fastq'), datapath('paired.2.fastq')]) + """Option -p given but -o missing""" + with pytest.raises(SystemExit): + main('-a XX -p /dev/null'.split() + + ['--cores', str(cores)] + + [datapath('paired.1.fastq'), datapath('paired.2.fastq')]) def test_paired_but_only_one_input_file(cores): - """Option -p given but only one input file""" - with pytest.raises(SystemExit): - main('-a XX -o /dev/null -p /dev/null'.split() - + ['--cores', str(cores)] - + [datapath('paired.1.fastq')]) + """Option -p given but only one input file""" + with pytest.raises(SystemExit): + main('-a XX -o /dev/null -p /dev/null'.split() + + ['--cores', str(cores)] + + [datapath('paired.1.fastq')]) def test_legacy_minlength(cores): - """Ensure -m is not applied to second read in a pair in legacy mode""" - run_paired( - '-a XXX -m 27', - in1='paired.1.fastq', in2='paired.2.fastq', - expected1='paired-m27.1.fastq', expected2='paired-m27.2.fastq', - cores=cores - ) + """Ensure -m is not applied to second read in a pair in legacy mode""" + run_paired( + '-a XXX -m 27', + in1='paired.1.fastq', in2='paired.2.fastq', + expected1='paired-m27.1.fastq', expected2='paired-m27.2.fastq', + cores=cores + ) def test_paired_end(cores): - """single-pass paired-end with -m""" - run_paired( - '-a TTAGACATAT -A CAGTGGAGTA -m 14', - in1='paired.1.fastq', in2='paired.2.fastq', - expected1='paired.1.fastq', expected2='paired.2.fastq', - cores=cores - ) + """single-pass paired-end with -m""" + run_paired( + '-a TTAGACATAT -A CAGTGGAGTA -m 14', + in1='paired.1.fastq', in2='paired.2.fastq', + expected1='paired.1.fastq', expected2='paired.2.fastq', + cores=cores + ) def test_paired_anchored_back_no_indels(): - run_paired( - '-a BACKADAPTER$ -A BACKADAPTER$ -N --no-indels', - in1='anchored-back.fasta', in2='anchored-back.fasta', - expected1='anchored-back.fasta', expected2="anchored-back.fasta", - cores=1 - ) + run_paired( + '-a BACKADAPTER$ -A BACKADAPTER$ -N --no-indels', + in1='anchored-back.fasta', in2='anchored-back.fasta', + expected1='anchored-back.fasta', expected2="anchored-back.fasta", + cores=1 + ) def test_paired_end_qualtrim(cores): - """single-pass paired-end with -q and -m""" - run_paired( - '-q 20 -a TTAGACATAT -A CAGTGGAGTA -m 14 -M 90', - in1='paired.1.fastq', in2='paired.2.fastq', - expected1='pairedq.1.fastq', expected2='pairedq.2.fastq', - cores=cores - ) + """single-pass paired-end with -q and -m""" + run_paired( + '-q 20 -a TTAGACATAT -A CAGTGGAGTA -m 14 -M 90', + in1='paired.1.fastq', in2='paired.2.fastq', + expected1='pairedq.1.fastq', expected2='pairedq.2.fastq', + cores=cores + ) def test_paired_end_qualtrim_swapped(cores): - """single-pass paired-end with -q and -m, but files swapped""" - run_paired( - '-q 20 -a CAGTGGAGTA -A TTAGACATAT -m 14', - in1='paired.2.fastq', in2='paired.1.fastq', - expected1='pairedq.2.fastq', expected2='pairedq.1.fastq', - cores=cores - ) + """single-pass paired-end with -q and -m, but files swapped""" + run_paired( + '-q 20 -a CAGTGGAGTA -A TTAGACATAT -m 14', + in1='paired.2.fastq', in2='paired.1.fastq', + expected1='pairedq.2.fastq', expected2='pairedq.1.fastq', + cores=cores + ) def test_paired_end_cut(cores): - run_paired( - '-u 3 -u -1 -U 4 -U -2', - in1='paired.1.fastq', in2='paired.2.fastq', - expected1='pairedu.1.fastq', expected2='pairedu.2.fastq', - cores=cores - ) + run_paired( + '-u 3 -u -1 -U 4 -U -2', + in1='paired.1.fastq', in2='paired.2.fastq', + 
expected1='pairedu.1.fastq', expected2='pairedu.2.fastq', + cores=cores + ) def test_paired_end_upper_a_only(cores): - run_paired( - '-A CAGTGGAGTA', - in1='paired.1.fastq', in2='paired.2.fastq', - expected1='paired-onlyA.1.fastq', expected2='paired-onlyA.2.fastq', - cores=cores - ) + run_paired( + '-A CAGTGGAGTA', + in1='paired.1.fastq', in2='paired.2.fastq', + expected1='paired-onlyA.1.fastq', expected2='paired-onlyA.2.fastq', + cores=cores + ) def test_discard_untrimmed(cores): - # issue #146 - # the first adapter is a sequence cut out from the first read - run_paired( - '-a CTCCAGCTTAGACATATC -A XXXXXXXX --discard-untrimmed', - in1='paired.1.fastq', in2='paired.2.fastq', - expected1='empty.fastq', expected2='empty.fastq', - cores=cores - ) + # issue #146 + # the first adapter is a sequence cut out from the first read + run_paired( + '-a CTCCAGCTTAGACATATC -A XXXXXXXX --discard-untrimmed', + in1='paired.1.fastq', in2='paired.2.fastq', + expected1='empty.fastq', expected2='empty.fastq', + cores=cores + ) def test_discard_trimmed(cores): - run_paired( - '-A C -O 1 --discard-trimmed', # applies everywhere - in1='paired.1.fastq', in2='paired.2.fastq', - expected1='empty.fastq', expected2='empty.fastq', - cores=cores - ) + run_paired( + '-A C -O 1 --discard-trimmed', # applies everywhere + in1='paired.1.fastq', in2='paired.2.fastq', + expected1='empty.fastq', expected2='empty.fastq', + cores=cores + ) def test_interleaved_in_and_out(cores): - """Single-pass interleaved paired-end with -q and -m""" - run_interleaved( - '-q 20 -a TTAGACATAT -A CAGTGGAGTA -m 14 -M 90', - inpath1='interleaved.fastq', expected1='interleaved.fastq', - cores=cores - ) + """Single-pass interleaved paired-end with -q and -m""" + run_interleaved( + '-q 20 -a TTAGACATAT -A CAGTGGAGTA -m 14 -M 90', + inpath1='interleaved.fastq', expected1='interleaved.fastq', + cores=cores + ) def test_interleaved_in(cores): - """Interleaved input, two files output""" - run_interleaved( - '-q 20 -a TTAGACATAT -A CAGTGGAGTA -m 14 -M 90', - inpath1='interleaved.fastq', - expected1='pairedq.1.fastq', expected2='pairedq.2.fastq', - cores=cores - ) + """Interleaved input, two files output""" + run_interleaved( + '-q 20 -a TTAGACATAT -A CAGTGGAGTA -m 14 -M 90', + inpath1='interleaved.fastq', + expected1='pairedq.1.fastq', expected2='pairedq.2.fastq', + cores=cores + ) def test_interleaved_out(cores): - """Two files input, interleaved output""" - run_interleaved( - '-q 20 -a TTAGACATAT -A CAGTGGAGTA -m 14 -M 90', - inpath1='paired.1.fastq', inpath2='paired.2.fastq', - expected1='interleaved.fastq', - cores=cores - ) + """Two files input, interleaved output""" + run_interleaved( + '-q 20 -a TTAGACATAT -A CAGTGGAGTA -m 14 -M 90', + inpath1='paired.1.fastq', inpath2='paired.2.fastq', + expected1='interleaved.fastq', + cores=cores + ) def test_interleaved_neither_nor(): - """Option --interleaved used, but pairs of files given for input and output""" - with temporary_path("temp-paired.1.fastq") as p1: - with temporary_path("temp-paired.2.fastq") as p2: - params = '-a XX --interleaved'.split() - with redirect_stderr(): - params += ['-o', p1, '-p1', p2, 'paired.1.fastq', 'paired.2.fastq'] - with pytest.raises(SystemExit): - main(params) + """Option --interleaved used, but pairs of files given for input and output""" + with temporary_path("temp-paired.1.fastq") as p1: + with temporary_path("temp-paired.2.fastq") as p2: + params = '-a XX --interleaved'.split() + with redirect_stderr(): + params += ['-o', p1, '-p1', p2, 'paired.1.fastq', 
'paired.2.fastq'] + with pytest.raises(SystemExit): + main(params) def test_pair_filter_both(cores): - run_paired( - '--pair-filter=both -a TTAGACATAT -A GGAGTA -m 14', - in1='paired.1.fastq', in2='paired.2.fastq', - expected1='paired-filterboth.1.fastq', expected2='paired-filterboth.2.fastq', - cores=cores - ) + run_paired( + '--pair-filter=both -a TTAGACATAT -A GGAGTA -m 14', + in1='paired.1.fastq', in2='paired.2.fastq', + expected1='paired-filterboth.1.fastq', expected2='paired-filterboth.2.fastq', + cores=cores + ) def test_pair_filter_first(cores): - run_paired( - '--pair-filter=first -a TTAGACATAT -A GGAGTA -m 14', - in1='paired.1.fastq', in2='paired.2.fastq', - expected1='paired-filterfirst.1.fastq', expected2='paired-filterfirst.2.fastq', - cores=cores - ) + run_paired( + '--pair-filter=first -a TTAGACATAT -A GGAGTA -m 14', + in1='paired.1.fastq', in2='paired.2.fastq', + expected1='paired-filterfirst.1.fastq', expected2='paired-filterfirst.2.fastq', + cores=cores + ) def test_too_short_paired_output(): - with temporary_path("temp-too-short.1.fastq") as p1: - with temporary_path("temp-too-short.2.fastq") as p2: - run_paired( - '-a TTAGACATAT -A CAGTGGAGTA -m 14 --too-short-output ' - '{0} --too-short-paired-output {1}'.format(p1, p2), - in1='paired.1.fastq', in2='paired.2.fastq', - expected1='paired.1.fastq', expected2='paired.2.fastq', - cores=1 - ) - assert_files_equal(cutpath('paired-too-short.1.fastq'), p1) - assert_files_equal(cutpath('paired-too-short.2.fastq'), p2) + with temporary_path("temp-too-short.1.fastq") as p1: + with temporary_path("temp-too-short.2.fastq") as p2: + run_paired( + '-a TTAGACATAT -A CAGTGGAGTA -m 14 --too-short-output ' + '{0} --too-short-paired-output {1}'.format(p1, p2), + in1='paired.1.fastq', in2='paired.2.fastq', + expected1='paired.1.fastq', expected2='paired.2.fastq', + cores=1 + ) + assert_files_equal(cutpath('paired-too-short.1.fastq'), p1) + assert_files_equal(cutpath('paired-too-short.2.fastq'), p2) def test_too_long_output(): - with temporary_path('temp-too-long.1.fastq') as p1: - with temporary_path('temp-too-long.2.fastq') as p2: - run_paired( - '-a TTAGACATAT -A CAGTGGAGTA -M 14 --too-long-output ' - '{0} --too-long-paired-output {1}'.format(p1, p2), - in1='paired.1.fastq', in2='paired.2.fastq', - expected1='paired-too-short.1.fastq', expected2='paired-too-short.2.fastq', - cores=1 - ) - assert_files_equal(cutpath('paired.1.fastq'), p1) - assert_files_equal(cutpath('paired.2.fastq'), p2) + with temporary_path('temp-too-long.1.fastq') as p1: + with temporary_path('temp-too-long.2.fastq') as p2: + run_paired( + '-a TTAGACATAT -A CAGTGGAGTA -M 14 --too-long-output ' + '{0} --too-long-paired-output {1}'.format(p1, p2), + in1='paired.1.fastq', in2='paired.2.fastq', + expected1='paired-too-short.1.fastq', expected2='paired-too-short.2.fastq', + cores=1 + ) + assert_files_equal(cutpath('paired.1.fastq'), p1) + assert_files_equal(cutpath('paired.2.fastq'), p2) def test_too_short_output_paired_option_missing(): - with temporary_path('temp-too-short.1.fastq') as p1: - with pytest.raises(SystemExit): - run_paired( - '-a TTAGACATAT -A CAGTGGAGTA -m 14 --too-short-output ' - '{0}'.format(p1), - in1='paired.1.fastq', in2='paired.2.fastq', - expected1='paired.1.fastq', expected2='paired.2.fastq', - cores=1 - ) + with temporary_path('temp-too-short.1.fastq') as p1: + with pytest.raises(SystemExit): + run_paired( + '-a TTAGACATAT -A CAGTGGAGTA -m 14 --too-short-output ' + '{0}'.format(p1), + in1='paired.1.fastq', in2='paired.2.fastq', + 
expected1='paired.1.fastq', expected2='paired.2.fastq', + cores=1 + ) def test_nextseq_paired(cores): - run_paired('--nextseq-trim 22', in1='nextseq.fastq', in2='nextseq.fastq', - expected1='nextseq.fastq', expected2='nextseq.fastq', - cores=cores) + run_paired('--nextseq-trim 22', in1='nextseq.fastq', in2='nextseq.fastq', + expected1='nextseq.fastq', expected2='nextseq.fastq', + cores=cores) def test_paired_demultiplex(): - tempdir = tempfile.mkdtemp(prefix='cutadapt-tests.') - multiout1 = os.path.join(tempdir, 'demultiplexed.{name}.1.fastq') - multiout2 = os.path.join(tempdir, 'demultiplexed.{name}.2.fastq') - params = [ - '-a', 'first=AACATTAGACA', '-a', 'second=CATTAGACATATCGG', - '-A', 'ignored=CAGTGGAGTA', '-A', 'alsoignored=AATAACAGTGGAGTA', - '-o', multiout1, '-p', multiout2, - datapath('paired.1.fastq'), datapath('paired.2.fastq')] - assert main(params) is None - assert_files_equal(cutpath('demultiplexed.first.1.fastq'), multiout1.format(name='first')) - assert_files_equal(cutpath('demultiplexed.second.1.fastq'), multiout1.format(name='second')) - assert_files_equal(cutpath('demultiplexed.unknown.1.fastq'), multiout1.format(name='unknown')) - assert_files_equal(cutpath('demultiplexed.first.2.fastq'), multiout2.format(name='first')) - assert_files_equal(cutpath('demultiplexed.second.2.fastq'), multiout2.format(name='second')) - assert_files_equal(cutpath('demultiplexed.unknown.2.fastq'), multiout2.format(name='unknown')) - shutil.rmtree(tempdir) + tempdir = tempfile.mkdtemp(prefix='cutadapt-tests.') + multiout1 = os.path.join(tempdir, 'demultiplexed.{name}.1.fastq') + multiout2 = os.path.join(tempdir, 'demultiplexed.{name}.2.fastq') + params = [ + '-a', 'first=AACATTAGACA', '-a', 'second=CATTAGACATATCGG', + '-A', 'ignored=CAGTGGAGTA', '-A', 'alsoignored=AATAACAGTGGAGTA', + '-o', multiout1, '-p', multiout2, + datapath('paired.1.fastq'), datapath('paired.2.fastq')] + assert main(params) is None + assert_files_equal(cutpath('demultiplexed.first.1.fastq'), multiout1.format(name='first')) + assert_files_equal(cutpath('demultiplexed.second.1.fastq'), multiout1.format(name='second')) + assert_files_equal(cutpath('demultiplexed.unknown.1.fastq'), multiout1.format(name='unknown')) + assert_files_equal(cutpath('demultiplexed.first.2.fastq'), multiout2.format(name='first')) + assert_files_equal(cutpath('demultiplexed.second.2.fastq'), multiout2.format(name='second')) + assert_files_equal(cutpath('demultiplexed.unknown.2.fastq'), multiout2.format(name='unknown')) + shutil.rmtree(tempdir) @pytest.mark.parametrize('name_op,l1,l2,m', list(product( - (('m', lambda x, y: x >= y), ('M', lambda x, y: x <= y)), - range(1, 5), - range(1, 5), - [(2, 3), (2, None), (None, 3)] + (('m', lambda x, y: x >= y), ('M', lambda x, y: x <= y)), + range(1, 5), + range(1, 5), + [(2, 3), (2, None), (None, 3)] ))) def test_separate_minmaxlength(tmpdir, name_op, l1, l2, m): - """Separate minimum lengths for R1 and R2""" - m1, m2 = m - name, func = name_op - inpath = str(tmpdir.join('separate_minlength.fasta')) - expected = str(tmpdir.join('separate_minlength_expected.fasta')) - outpath = str(tmpdir.join('out.fasta')) - record = '>r{}:{}\n{}\n'.format(l1, l2, 'A' * l1) - record += '>r{}:{}\n{}'.format(l1, l2, 'A' * l2) - with open(inpath, 'w') as f: - print(record, file=f) - with open(expected, 'w') as f: - if (m1 is None or func(l1, m1)) and (m2 is None or func(l2, m2)): - print(record, file=f) - - assert os.path.exists(inpath) - assert os.path.exists(expected) - if m1 is None: - m1 = '' - if m2 is None: - m2 = '' - - 
main(['--interleaved', '-o', outpath, '-' + name, '{}:{}'.format(m1, m2), inpath]) - assert_files_equal(expected, outpath) + """Separate minimum lengths for R1 and R2""" + m1, m2 = m + name, func = name_op + inpath = str(tmpdir.join('separate_minlength.fasta')) + expected = str(tmpdir.join('separate_minlength_expected.fasta')) + outpath = str(tmpdir.join('out.fasta')) + record = '>r{}:{}\n{}\n'.format(l1, l2, 'A' * l1) + record += '>r{}:{}\n{}'.format(l1, l2, 'A' * l2) + with open(inpath, 'w') as f: + print(record, file=f) + with open(expected, 'w') as f: + if (m1 is None or func(l1, m1)) and (m2 is None or func(l2, m2)): + print(record, file=f) + + assert os.path.exists(inpath) + assert os.path.exists(expected) + if m1 is None: + m1 = '' + if m2 is None: + m2 = '' + + main(['--interleaved', '-o', outpath, '-' + name, '{}:{}'.format(m1, m2), inpath]) + assert_files_equal(expected, outpath) def test_separate_minlength_single(): - """Using separate minlengths for single-end data""" - with pytest.raises(SystemExit): - main(['-m', '5:7', datapath('small.fastq')]) + """Using separate minlengths for single-end data""" + with pytest.raises(SystemExit): + main(['-m', '5:7', datapath('small.fastq')]) diff --git a/tests/test_qualtrim.py b/tests/test_qualtrim.py index 631ac105..4bfcb1a4 100644 --- a/tests/test_qualtrim.py +++ b/tests/test_qualtrim.py @@ -3,10 +3,10 @@ def test_nextseq_trim(): - s = Sequence('n', '', '') - assert nextseq_trim_index(s, cutoff=22) == 0 - s = Sequence('n', - 'TCTCGTATGCCGTCTTATGCTTGAAAAAAAAAAGGGGGGGGGGGGGGGGGNNNNNNNNNNNGGNGG', - 'AA//EAEE//A6///E//A//EA/EEEEEEAEA//EEEEEEEEEEEEEEE###########EE#EA' - ) - assert nextseq_trim_index(s, cutoff=22) == 33 + s = Sequence('n', '', '') + assert nextseq_trim_index(s, cutoff=22) == 0 + s = Sequence('n', + 'TCTCGTATGCCGTCTTATGCTTGAAAAAAAAAAGGGGGGGGGGGGGGGGGNNNNNNNNNNNGGNGG', + 'AA//EAEE//A6///E//A//EA/EEEEEEAEA//EEEEEEEEEEEEEEE###########EE#EA' + ) + assert nextseq_trim_index(s, cutoff=22) == 33 diff --git a/tests/test_trim.py b/tests/test_trim.py index df9b4373..6a1712d0 100644 --- a/tests/test_trim.py +++ b/tests/test_trim.py @@ -4,58 +4,58 @@ def test_statistics(): - read = Sequence('name', 'AAAACCCCAAAA') - adapters = [Adapter('CCCC', BACK, max_error_rate=0.1)] - cutter = AdapterCutter(adapters, times=3) - trimmed_read = cutter(read, []) - # TODO make this a lot simpler - trimmed_bp = 0 - for adapter in adapters: - for d in (cutter.adapter_statistics[adapter].front.lengths, - cutter.adapter_statistics[adapter].back.lengths): - trimmed_bp += sum(seqlen * count for (seqlen, count) in d.items()) - assert trimmed_bp <= len(read), trimmed_bp + read = Sequence('name', 'AAAACCCCAAAA') + adapters = [Adapter('CCCC', BACK, max_error_rate=0.1)] + cutter = AdapterCutter(adapters, times=3) + trimmed_read = cutter(read, []) + # TODO make this a lot simpler + trimmed_bp = 0 + for adapter in adapters: + for d in (cutter.adapter_statistics[adapter].front.lengths, + cutter.adapter_statistics[adapter].back.lengths): + trimmed_bp += sum(seqlen * count for (seqlen, count) in d.items()) + assert trimmed_bp <= len(read), trimmed_bp def test_end_trim_with_mismatch(): - """ - Test the not-so-obvious case where an adapter of length 13 is trimmed from - the end of a sequence with overlap 9 and there is one deletion. - In this case the algorithm starts with 10 bases of the adapter to get - the hit and so the match is considered good. An insertion or substitution - at the same spot is not a match. 
- """ - adapter = Adapter('TCGATCGATCGAT', BACK, max_error_rate=0.1) + """ + Test the not-so-obvious case where an adapter of length 13 is trimmed from + the end of a sequence with overlap 9 and there is one deletion. + In this case the algorithm starts with 10 bases of the adapter to get + the hit and so the match is considered good. An insertion or substitution + at the same spot is not a match. + """ + adapter = Adapter('TCGATCGATCGAT', BACK, max_error_rate=0.1) - read = Sequence('foo1', 'AAAAAAAAAAATCGTCGATC') - cutter = AdapterCutter([adapter], times=1) - trimmed_read = cutter(read, []) + read = Sequence('foo1', 'AAAAAAAAAAATCGTCGATC') + cutter = AdapterCutter([adapter], times=1) + trimmed_read = cutter(read, []) - assert trimmed_read.sequence == 'AAAAAAAAAAA' - assert cutter.adapter_statistics[adapter].back.lengths == {9: 1} - # We see 1 error at length 9 even though the number of allowed mismatches at - # length 9 is 0. - assert cutter.adapter_statistics[adapter].back.errors[9][1] == 1 + assert trimmed_read.sequence == 'AAAAAAAAAAA' + assert cutter.adapter_statistics[adapter].back.lengths == {9: 1} + # We see 1 error at length 9 even though the number of allowed mismatches at + # length 9 is 0. + assert cutter.adapter_statistics[adapter].back.errors[9][1] == 1 - read = Sequence('foo2', 'AAAAAAAAAAATCGAACGA') - cutter = AdapterCutter([adapter], times=1) - trimmed_read = cutter(read, []) + read = Sequence('foo2', 'AAAAAAAAAAATCGAACGA') + cutter = AdapterCutter([adapter], times=1) + trimmed_read = cutter(read, []) - assert trimmed_read.sequence == read.sequence - assert cutter.adapter_statistics[adapter].back.lengths == {} + assert trimmed_read.sequence == read.sequence + assert cutter.adapter_statistics[adapter].back.lengths == {} def test_anywhere_with_errors(): - adapter = Adapter('CCGCATTTAG', ANYWHERE, max_error_rate=0.1) - for seq, expected_trimmed in ( - ('AACCGGTTccgcatttagGATC', 'AACCGGTT'), - ('AACCGGTTccgcgtttagGATC', 'AACCGGTT'), # one mismatch - ('AACCGGTTccgcatttag', 'AACCGGTT'), - ('ccgcatttagAACCGGTT', 'AACCGGTT'), - ('ccgtatttagAACCGGTT', 'AACCGGTT'), # one mismatch - ('ccgatttagAACCGGTT', 'AACCGGTT'), # one deletion - ): - read = Sequence('foo', seq) - cutter = AdapterCutter([adapter], times=1) - trimmed_read = cutter(read, []) - assert trimmed_read.sequence == expected_trimmed + adapter = Adapter('CCGCATTTAG', ANYWHERE, max_error_rate=0.1) + for seq, expected_trimmed in ( + ('AACCGGTTccgcatttagGATC', 'AACCGGTT'), + ('AACCGGTTccgcgtttagGATC', 'AACCGGTT'), # one mismatch + ('AACCGGTTccgcatttag', 'AACCGGTT'), + ('ccgcatttagAACCGGTT', 'AACCGGTT'), + ('ccgtatttagAACCGGTT', 'AACCGGTT'), # one mismatch + ('ccgatttagAACCGGTT', 'AACCGGTT'), # one deletion + ): + read = Sequence('foo', seq) + cutter = AdapterCutter([adapter], times=1) + trimmed_read = cutter(read, []) + assert trimmed_read.sequence == expected_trimmed diff --git a/tests/utils.py b/tests/utils.py index bb5a8982..0edaf54a 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -10,51 +10,51 @@ @contextmanager def redirect_stderr(): - """Send stderr to stdout. Nose doesn't capture stderr, yet.""" - old_stderr = sys.stderr - sys.stderr = sys.stdout - yield - sys.stderr = old_stderr + """Send stderr to stdout. 
Nose doesn't capture stderr, yet.""" + old_stderr = sys.stderr + sys.stderr = sys.stdout + yield + sys.stderr = old_stderr @contextmanager def temporary_path(name): - tempdir = mkdtemp(prefix='cutadapt-tests.') - path = os.path.join(tempdir, name) - try: - yield path - finally: - rmtree(tempdir) + tempdir = mkdtemp(prefix='cutadapt-tests.') + path = os.path.join(tempdir, name) + try: + yield path + finally: + rmtree(tempdir) def datapath(path): - return os.path.join(os.path.dirname(__file__), 'data', path) + return os.path.join(os.path.dirname(__file__), 'data', path) def cutpath(path): - return os.path.join(os.path.dirname(__file__), 'cut', path) + return os.path.join(os.path.dirname(__file__), 'cut', path) class FilesDifferent(Exception): - pass + pass def assert_files_equal(path1, path2): - try: - subprocess.check_output(['diff', '-u', path1, path2], stderr=subprocess.STDOUT) - except subprocess.CalledProcessError as e: - raise FilesDifferent('\n' + e.output.decode()) from None + try: + subprocess.check_output(['diff', '-u', path1, path2], stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + raise FilesDifferent('\n' + e.output.decode()) from None def run(params, expected, inpath, inpath2=None): - if type(params) is str: - params = params.split() - with temporary_path(expected) as tmp_fastaq: - params += ['-o', tmp_fastaq] # TODO not parallelizable - params += [datapath(inpath)] - if inpath2: - params += [datapath(inpath2)] - assert main(params) is None - # TODO redirect standard output - assert_files_equal(cutpath(expected), tmp_fastaq) - # TODO diff log files + if type(params) is str: + params = params.split() + with temporary_path(expected) as tmp_fastaq: + params += ['-o', tmp_fastaq] # TODO not parallelizable + params += [datapath(inpath)] + if inpath2: + params += [datapath(inpath2)] + assert main(params) is None + # TODO redirect standard output + assert_files_equal(cutpath(expected), tmp_fastaq) + # TODO diff log files
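A note on the test harness: the ``run`` helper above drives most of the single-end tests in this patch. It splits a parameter string, appends ``-o`` with a temporary output path and the input path(s) from ``tests/data/``, calls ``main()``, and compares the result against the pre-computed file of the same name in ``tests/cut/`` (``assert_files_equal`` shells out to ``diff -u`` and raises ``FilesDifferent`` on a mismatch). A typical call, mirroring ``test_small`` from ``tests/test_commandline.py`` (the import line is an assumption about how the test modules load the helper)::

    from utils import run

    def test_small():
        # Expected output: tests/cut/small.fastq; input: tests/data/small.fastq
        run('-b TTAGACATATCTCCGTCG', 'small.fastq', 'small.fastq')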
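The paired-end filter tests distinguish two pair-filter modes: ``first`` (the legacy behaviour) bases the decision on read 1 alone, while ``any`` discards the pair as soon as either mate fulfills the criterion. A condensed sketch of the ``any`` case, with import paths assumed to match what ``tests/test_filters.py`` uses::

    from dnaio import Sequence
    from cutadapt.filters import DISCARD, NContentFilter, PairedRedirector

    filter_ = NContentFilter(count=0)  # discard any read containing an 'N'
    pair_any = PairedRedirector(None, filter_, filter_, pair_filter_mode='any')
    r1 = Sequence('read', 'AAAA', '####')  # clean
    r2 = Sequence('read', 'AANA', '####')  # contains one 'N'
    # One failing mate is enough to discard the whole pair in 'any' mode.
    assert pair_any(r1, r2, [], []) == DISCARD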
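``test_separate_minmaxlength`` exercises the colon notation that applies separate ``-m``/``-M`` length thresholds to the two reads of a pair; either side of the colon may be left empty to leave that read unconstrained, and (per ``test_separate_minlength_single``) using the notation on single-end data is an error. A minimal invocation sketch (file names are placeholders; ``main`` is assumed to be importable as the test modules do)::

    from cutadapt.__main__ import main

    # R1 must be at least 2 bp and R2 at least 3 bp; '-m 2:' would
    # constrain R1 only. Input is an interleaved paired-end file.
    main(['--interleaved', '-m', '2:3', '-o', 'out.fasta', 'in.fasta'])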
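Finally, because this patch is a purely mechanical whitespace conversion, it can be reproduced and re-checked with a few lines of standard-library Python. A minimal sketch, assuming four spaces per tab stop and a ``*.py`` glob; the script is illustrative rather than part of the repository, and since ``str.expandtabs`` also expands tabs inside string literals, the result should still be reviewed::

    #!/usr/bin/env python3
    """Expand tabs to spaces in Python sources and verify none remain."""
    import sys
    from pathlib import Path

    def detab(path, tabsize=4):
        # expandtabs() pads each tab to the next multiple of `tabsize`
        # columns; the column counter resets at every newline.
        text = path.read_text(encoding='utf-8')
        path.write_text(text.expandtabs(tabsize), encoding='utf-8')

    def find_tabs(root):
        # Yield (path, line number) for every remaining tab character.
        for path in Path(root).rglob('*.py'):
            for lineno, line in enumerate(path.read_text(encoding='utf-8').splitlines(), 1):
                if '\t' in line:
                    yield path, lineno

    if __name__ == '__main__':
        for path in Path('.').rglob('*.py'):
            detab(path)
        offenders = list(find_tabs('.'))
        for path, lineno in offenders:
            print('{0}:{1}: tab character remains'.format(path, lineno))
        sys.exit(1 if offenders else 0)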