diff --git a/.gitignore b/.gitignore index d5a6dd6..f7bf712 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +.vscode/ +htslib/ .cproject .project .*.swp diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..6530047 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,49 @@ +FROM phusion/baseimage:jammy-1.0.1 + +ARG HTSLIB_VERSION=1.17 + +ENV TERM=xterm-256color \ + HTSLIB_URL=https://github.com/samtools/htslib/releases/download/${HTSLIB_VERSION}/htslib-${HTSLIB_VERSION}.tar.bz2 + +COPY . minimap2_index_modifier + +# install deps and cleanup apt garbage +RUN set -eux; \ + apt-get update; \ + apt-get install -y \ + wget \ + make \ + autoconf \ + gcc \ + libcurl4-openssl-dev \ + zlib1g-dev \ + libbz2-dev \ + liblzma-dev \ + bzip2 \ + && rm -rf /var/lib/apt/lists/*; + +#install htslib +RUN set -eux; \ +mkdir temp; \ +cd temp; \ +\ +wget ${HTSLIB_URL}; \ +tar -xf htslib-${HTSLIB_VERSION}.tar.bz2; \ +cd htslib-${HTSLIB_VERSION}; \ +\ +autoreconf -i; \ +./configure; \ +make; \ +make install; \ +cd ../../; \ +rm -rf temp; + +#install minimap2_index_modifier +RUN set -eux; \ +cd minimap2_index_modifier; \ +make; \ +cp minimap2 /usr/local/bin; \ +cd ../; \ +rm -rf minimap2_index_modifier; + +ENV LD_LIBRARY_PATH=/usr/local/lib diff --git a/Makefile b/Makefile index 4118616..08a38a1 100644 --- a/Makefile +++ b/Makefile @@ -2,11 +2,11 @@ CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra CPPFLAGS= -DHAVE_KALLOC INCLUDES= OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o \ - lchain.o align.o hit.o seed.o map.o format.o pe.o esterr.o splitidx.o \ + lchain.o align.o hit.o seed.o linked_vcf_list.o map.o format.o pe.o esterr.o splitidx.o \ ksw2_ll_sse.o PROG= minimap2 PROG_EXTRA= sdust minimap2-lite -LIBS= -lm -lz -lpthread +LIBS= -lm -lz -lpthread -lhts ifeq ($(arm_neon),) # if arm_neon is not defined ifeq ($(sse2only),) # if sse2only is not defined @@ -53,7 +53,7 @@ minimap2-lite:example.o libminimap2.a libminimap2.a:$(OBJS) $(AR) -csru $@ $(OBJS) -sdust:sdust.c kalloc.o kalloc.h kdq.h kvec.h kseq.h ketopt.h sdust.h +sdust:sdust.c kalloc.o kalloc.h kdq.h kvec.h kseq.h ketopt.h sdust.h linked_vcf_list.h $(CC) -D_SDUST_MAIN $(CFLAGS) $< kalloc.o -o $@ -lz # SSE-specific targets on x86/x86_64 @@ -128,5 +128,6 @@ options.o: mmpriv.h minimap.h bseq.h kseq.h pe.o: mmpriv.h minimap.h bseq.h kseq.h kvec.h kalloc.h ksort.h sdust.o: kalloc.h kdq.h kvec.h sdust.h seed.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h ksort.h -sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h kseq.h +linked_vcf_list.o: mmpriv.h minimap.h linked_vcf_list.h +sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h kseq.h linked_vcf_list.h splitidx.o: mmpriv.h minimap.h bseq.h kseq.h diff --git a/README.md b/README.md index 13c4938..fc64f5d 100644 --- a/README.md +++ b/README.md @@ -1,403 +1,53 @@ -[![GitHub Downloads](https://img.shields.io/github/downloads/lh3/minimap2/total.svg?style=social&logo=github&label=Download)](https://github.com/lh3/minimap2/releases) -[![BioConda Install](https://img.shields.io/conda/dn/bioconda/minimap2.svg?style=flag&label=BioConda%20install)](https://anaconda.org/bioconda/minimap2) -[![PyPI](https://img.shields.io/pypi/v/mappy.svg?style=flat)](https://pypi.python.org/pypi/mappy) -[![Build Status](https://github.com/lh3/minimap2/actions/workflows/ci.yaml/badge.svg)](https://github.com/lh3/minimap2/actions) -## Getting Started -```sh -git clone https://github.com/lh3/minimap2 -cd minimap2 && make -# long sequences against a reference genome -./minimap2 -a test/MT-human.fa test/MT-orang.fa > test.sam -# create an index first and then map -./minimap2 -x map-ont -d MT-human-ont.mmi test/MT-human.fa -./minimap2 -a MT-human-ont.mmi test/MT-orang.fa > test.sam -# use presets (no test data) -./minimap2 -ax map-pb ref.fa pacbio.fq.gz > aln.sam # PacBio CLR genomic reads -./minimap2 -ax map-ont ref.fa ont.fq.gz > aln.sam # Oxford Nanopore genomic reads -./minimap2 -ax map-hifi ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.19 or later) -./minimap2 -ax asm20 ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.18 or earlier) -./minimap2 -ax sr ref.fa read1.fa read2.fa > aln.sam # short genomic paired-end reads -./minimap2 -ax splice ref.fa rna-reads.fa > aln.sam # spliced long reads (strand unknown) -./minimap2 -ax splice -uf -k14 ref.fa reads.fa > aln.sam # noisy Nanopore Direct RNA-seq -./minimap2 -ax splice:hq -uf ref.fa query.fa > aln.sam # Final PacBio Iso-seq or traditional cDNA -./minimap2 -ax splice --junc-bed anno.bed12 ref.fa query.fa > aln.sam # prioritize on annotated junctions -./minimap2 -cx asm5 asm1.fa asm2.fa > aln.paf # intra-species asm-to-asm alignment -./minimap2 -x ava-pb reads.fa reads.fa > overlaps.paf # PacBio read overlap -./minimap2 -x ava-ont reads.fa reads.fa > overlaps.paf # Nanopore read overlap -# man page for detailed command line options -man ./minimap2.1 -``` - -## Table of Contents - -- [Getting Started](#started) -- [Users' Guide](#uguide) - - [Installation](#install) - - [General usage](#general) - - [Use cases](#cases) - - [Map long noisy genomic reads](#map-long-genomic) - - [Map long mRNA/cDNA reads](#map-long-splice) - - [Find overlaps between long reads](#long-overlap) - - [Map short accurate genomic reads](#short-genomic) - - [Full genome/assembly alignment](#full-genome) - - [Advanced features](#advanced) - - [Working with >65535 CIGAR operations](#long-cigar) - - [The cs optional tag](#cs) - - [Working with the PAF format](#paftools) - - [Algorithm overview](#algo) - - [Getting help](#help) - - [Citing minimap2](#cite) -- [Developers' Guide](#dguide) -- [Limitations](#limit) - -## Users' Guide - -Minimap2 is a versatile sequence alignment program that aligns DNA or mRNA -sequences against a large reference database. Typical use cases include: (1) -mapping PacBio or Oxford Nanopore genomic reads to the human genome; (2) -finding overlaps between long reads with error rate up to ~15%; (3) -splice-aware alignment of PacBio Iso-Seq or Nanopore cDNA or Direct RNA reads -against a reference genome; (4) aligning Illumina single- or paired-end reads; -(5) assembly-to-assembly alignment; (6) full-genome alignment between two -closely related species with divergence below ~15%. +Minimap2_index_modifier +======================= +Minimap2_index_modifier is a fork of alignment tool [minimap2](https://github.com/lh3/minimap2). +Unlike the original tool, this can use the variants defined in the VCF file when generating the index, for more accurate alignment. -For ~10kb noisy reads sequences, minimap2 is tens of times faster than -mainstream long-read mappers such as BLASR, BWA-MEM, NGMLR and GMAP. It is more -accurate on simulated long reads and produces biologically meaningful alignment -ready for downstream analyses. For >100bp Illumina short reads, minimap2 is -three times as fast as BWA-MEM and Bowtie2, and as accurate on simulated data. -Detailed evaluations are available from the [minimap2 paper][doi] or the -[preprint][preprint]. -### Installation - -Minimap2 is optimized for x86-64 CPUs. You can acquire precompiled binaries from -the [release page][release] with: -```sh -curl -L https://github.com/lh3/minimap2/releases/download/v2.24/minimap2-2.24_x64-linux.tar.bz2 | tar -jxvf - -./minimap2-2.24_x64-linux/minimap2 +Minimap2_index_modifier can be used in the same way as the original minimap2. +To create a modified index use additional parameter `--vcf-file-with-variants `. +```bash +minimap2 -d index.mmi --vcf-file-with-variants input.vcf.gz reference.fasta ``` -If you want to compile from the source, you need to have a C compiler, GNU make -and zlib development files installed. Then type `make` in the source code -directory to compile. If you see compilation errors, try `make sse2only=1` -to disable SSE4 code, which will make minimap2 slightly slower. -Minimap2 also works with ARM CPUs supporting the NEON instruction sets. To -compile for 32 bit ARM architectures (such as ARMv7), use `make arm_neon=1`. To -compile for for 64 bit ARM architectures (such as ARMv8), use `make arm_neon=1 -aarch64=1`. +Use flag `--parse-haplotype` if your VCF contains phased haplotypes. -Minimap2 can use [SIMD Everywhere (SIMDe)][simde] library for porting -implementation to the different SIMD instruction sets. To compile using SIMDe, -use `make -f Makefile.simde`. To compile for ARM CPUs, use `Makefile.simde` -with the ARM related command lines given above. +## Contents +* [Installation](#installation) + * [Compiling from source](#compiling-from-source) + * [Docker](#docker) + * [Dockerhub](#dockerhub) +* [Pre-built indexes](#pre-built-indexes) +* [Tests](#tests) -### General usage +## Installation +### Compiling from source +To compile from source, use this version of tools: -Without any options, minimap2 takes a reference database and a query sequence -file as input and produce approximate mapping, without base-level alignment -(i.e. coordinates are only approximate and no CIGAR in output), in the [PAF format][paf]: -```sh -minimap2 ref.fa query.fq > approx-mapping.paf -``` -You can ask minimap2 to generate CIGAR at the `cg` tag of PAF with: -```sh -minimap2 -c ref.fa query.fq > alignment.paf -``` -or to output alignments in the [SAM format][sam]: -```sh -minimap2 -a ref.fa query.fq > alignment.sam -``` -Minimap2 seamlessly works with gzip'd FASTA and FASTQ formats as input. You -don't need to convert between FASTA and FASTQ or decompress gzip'd files first. +* GCC/G++ 11.4.0+ +* HTSlib v1.17 -For the human reference genome, minimap2 takes a few minutes to generate a -minimizer index for the reference before mapping. To reduce indexing time, you -can optionally save the index with option **-d** and replace the reference -sequence file with the index file on the minimap2 command line: -```sh -minimap2 -d ref.mmi ref.fa # indexing -minimap2 -a ref.mmi reads.fq > alignment.sam # alignment +Command to compile: +```bash +cd minimap2_index_modifier && make ``` -***Importantly***, it should be noted that once you build the index, indexing -parameters such as **-k**, **-w**, **-H** and **-I** can't be changed during -mapping. If you are running minimap2 for different data types, you will -probably need to keep multiple indexes generated with different parameters. -This makes minimap2 different from BWA which always uses the same index -regardless of query data types. -### Use cases - -Minimap2 uses the same base algorithm for all applications. However, due to the -different data types it supports (e.g. short vs long reads; DNA vs mRNA reads), -minimap2 needs to be tuned for optimal performance and accuracy. It is usually -recommended to choose a preset with option **-x**, which sets multiple -parameters at the same time. The default setting is the same as `map-ont`. - -#### Map long noisy genomic reads - -```sh -minimap2 -ax map-pb ref.fa pacbio-reads.fq > aln.sam # for PacBio CLR reads -minimap2 -ax map-ont ref.fa ont-reads.fq > aln.sam # for Oxford Nanopore reads +### Docker +Clone this repository and build a Docker image as follows. +```bash +docker build -t minimap2_index_modifier:2.24 . ``` -The difference between `map-pb` and `map-ont` is that `map-pb` uses -homopolymer-compressed (HPC) minimizers as seeds, while `map-ont` uses ordinary -minimizers as seeds. Emperical evaluation suggests HPC minimizers improve -performance and sensitivity when aligning PacBio CLR reads, but hurt when aligning -Nanopore reads. - -#### Map long mRNA/cDNA reads -```sh -minimap2 -ax splice:hq -uf ref.fa iso-seq.fq > aln.sam # PacBio Iso-seq/traditional cDNA -minimap2 -ax splice ref.fa nanopore-cdna.fa > aln.sam # Nanopore 2D cDNA-seq -minimap2 -ax splice -uf -k14 ref.fa direct-rna.fq > aln.sam # Nanopore Direct RNA-seq -minimap2 -ax splice --splice-flank=no SIRV.fa SIRV-seq.fa # mapping against SIRV control +### Dockerhub +Clone Docker image from Dockerhub. +```bash +docker pull egorguga/minimap2_index_modifier:2.24 ``` -There are different long-read RNA-seq technologies, including tranditional -full-length cDNA, EST, PacBio Iso-seq, Nanopore 2D cDNA-seq and Direct RNA-seq. -They produce data of varying quality and properties. By default, `-x splice` -assumes the read orientation relative to the transcript strand is unknown. It -tries two rounds of alignment to infer the orientation and write the strand to -the `ts` SAM/PAF tag if possible. For Iso-seq, Direct RNA-seq and tranditional -full-length cDNAs, it would be desired to apply `-u f` to force minimap2 to -consider the forward transcript strand only. This speeds up alignment with -slight improvement to accuracy. For noisy Nanopore Direct RNA-seq reads, it is -recommended to use a smaller k-mer size for increased sensitivity to the first -or the last exons. - -Minimap2 rates an alignment by the score of the max-scoring sub-segment, -*excluding* introns, and marks the best alignment as primary in SAM. When a -spliced gene also has unspliced pseudogenes, minimap2 does not intentionally -prefer spliced alignment, though in practice it more often marks the spliced -alignment as the primary. By default, minimap2 outputs up to five secondary -alignments (i.e. likely pseudogenes in the context of RNA-seq mapping). This -can be tuned with option **-N**. - -For long RNA-seq reads, minimap2 may produce chimeric alignments potentially -caused by gene fusions/structural variations or by an intron longer than the -max intron length **-G** (200k by default). For now, it is not recommended to -apply an excessively large **-G** as this slows down minimap2 and sometimes -leads to false alignments. - -It is worth noting that by default `-x splice` prefers GT[A/G]..[C/T]AG -over GT[C/T]..[A/G]AG, and then over other splicing signals. Considering -one additional base improves the junction accuracy for noisy reads, but -reduces the accuracy when aligning against the widely used SIRV control data. -This is because SIRV does not honor the evolutionarily conservative splicing -signal. If you are studying SIRV, you may apply `--splice-flank=no` to let -minimap2 only model GT..AG, ignoring the additional base. - -Since v2.17, minimap2 can optionally take annotated genes as input and -prioritize on annotated splice junctions. To use this feature, you can -```sh -paftools.js gff2bed anno.gff > anno.bed -minimap2 -ax splice --junc-bed anno.bed ref.fa query.fa > aln.sam -``` -Here, `anno.gff` is the gene annotation in the GTF or GFF3 format (`gff2bed` -automatically tests the format). The output of `gff2bed` is in the 12-column -BED format, or the BED12 format. With the `--junc-bed` option, minimap2 adds a -bonus score (tuned by `--junc-bonus`) if an aligned junction matches a junction -in the annotation. Option `--junc-bed` also takes 5-column BED, including the -strand field. In this case, each line indicates an oriented junction. - -#### Find overlaps between long reads - -```sh -minimap2 -x ava-pb reads.fq reads.fq > ovlp.paf # PacBio CLR read overlap -minimap2 -x ava-ont reads.fq reads.fq > ovlp.paf # Oxford Nanopore read overlap -``` -Similarly, `ava-pb` uses HPC minimizers while `ava-ont` uses ordinary -minimizers. It is usually not recommended to perform base-level alignment in -the overlapping mode because it is slow and may produce false positive -overlaps. However, if performance is not a concern, you may try to add `-a` or -`-c` anyway. - -#### Map short accurate genomic reads - -```sh -minimap2 -ax sr ref.fa reads-se.fq > aln.sam # single-end alignment -minimap2 -ax sr ref.fa read1.fq read2.fq > aln.sam # paired-end alignment -minimap2 -ax sr ref.fa reads-interleaved.fq > aln.sam # paired-end alignment -``` -When two read files are specified, minimap2 reads from each file in turn and -merge them into an interleaved stream internally. Two reads are considered to -be paired if they are adjacent in the input stream and have the same name (with -the `/[0-9]` suffix trimmed if present). Single- and paired-end reads can be -mixed. - -Minimap2 does not work well with short spliced reads. There are many capable -RNA-seq mappers for short reads. - -#### Full genome/assembly alignment - -```sh -minimap2 -ax asm5 ref.fa asm.fa > aln.sam # assembly to assembly/ref alignment -``` -For cross-species full-genome alignment, the scoring system needs to be tuned -according to the sequence divergence. - -### Advanced features - -#### Working with >65535 CIGAR operations - -Due to a design flaw, BAM does not work with CIGAR strings with >65535 -operations (SAM and CRAM work). However, for ultra-long nanopore reads minimap2 -may align ~1% of read bases with long CIGARs beyond the capability of BAM. If -you convert such SAM/CRAM to BAM, Picard and recent samtools will throw an -error and abort. Older samtools and other tools may create corrupted BAM. - -To avoid this issue, you can add option `-L` at the minimap2 command line. -This option moves a long CIGAR to the `CG` tag and leaves a fully clipped CIGAR -at the SAM CIGAR column. Current tools that don't read CIGAR (e.g. merging and -sorting) still work with such BAM records; tools that read CIGAR will -effectively ignore these records. It has been decided that future tools -will seamlessly recognize long-cigar records generated by option `-L`. - -**TL;DR**: if you work with ultra-long reads and use tools that only process -BAM files, please add option `-L`. - -#### The cs optional tag - -The `cs` SAM/PAF tag encodes bases at mismatches and INDELs. It matches regular -expression `/(:[0-9]+|\*[a-z][a-z]|[=\+\-][A-Za-z]+)+/`. Like CIGAR, `cs` -consists of series of operations. Each leading character specifies the -operation; the following sequence is the one involved in the operation. - -The `cs` tag is enabled by command line option `--cs`. The following alignment, -for example: -```txt -CGATCGATAAATAGAGTAG---GAATAGCA -|||||| |||||||||| |||| ||| -CGATCG---AATAGAGTAGGTCGAATtGCA -``` -is represented as `:6-ata:10+gtc:4*at:3`, where `:[0-9]+` represents an -identical block, `-ata` represents a deletion, `+gtc` an insertion and `*at` -indicates reference base `a` is substituted with a query base `t`. It is -similar to the `MD` SAM tag but is standalone and easier to parse. - -If `--cs=long` is used, the `cs` string also contains identical sequences in -the alignment. The above example will become -`=CGATCG-ata=AATAGAGTAG+gtc=GAAT*at=GCA`. The long form of `cs` encodes both -reference and query sequences in one string. The `cs` tag also encodes intron -positions and splicing signals (see the [minimap2 manpage][manpage-cs] for -details). - -#### Working with the PAF format - -Minimap2 also comes with a (java)script [paftools.js](misc/paftools.js) that -processes alignments in the PAF format. It calls variants from -assembly-to-reference alignment, lifts over BED files based on alignment, -converts between formats and provides utilities for various evaluations. For -details, please see [misc/README.md](misc/README.md). - -### Algorithm overview - -In the following, minimap2 command line options have a dash ahead and are -highlighted in bold. The description may help to tune minimap2 parameters. - -1. Read **-I** [=*4G*] reference bases, extract (**-k**,**-w**)-minimizers and - index them in a hash table. - -2. Read **-K** [=*200M*] query bases. For each query sequence, do step 3 - through 7: - -3. For each (**-k**,**-w**)-minimizer on the query, check against the reference - index. If a reference minimizer is not among the top **-f** [=*2e-4*] most - frequent, collect its the occurrences in the reference, which are called - *seeds*. - -4. Sort seeds by position in the reference. Chain them with dynamic - programming. Each chain represents a potential mapping. For read - overlapping, report all chains and then go to step 8. For reference mapping, - do step 5 through 7: - -5. Let *P* be the set of primary mappings, which is an empty set initially. For - each chain from the best to the worst according to their chaining scores: if - on the query, the chain overlaps with a chain in *P* by **--mask-level** - [=*0.5*] or higher fraction of the shorter chain, mark the chain as - *secondary* to the chain in *P*; otherwise, add the chain to *P*. - -6. Retain all primary mappings. Also retain up to **-N** [=*5*] top secondary - mappings if their chaining scores are higher than **-p** [=*0.8*] of their - corresponding primary mappings. - -7. If alignment is requested, filter out an internal seed if it potentially - leads to both a long insertion and a long deletion. Extend from the - left-most seed. Perform global alignments between internal seeds. Split the - chain if the accumulative score along the global alignment drops by **-z** - [=*400*], disregarding long gaps. Extend from the right-most seed. Output - chains and their alignments. - -8. If there are more query sequences in the input, go to step 2 until no more - queries are left. - -9. If there are more reference sequences, reopen the query file from the start - and go to step 1; otherwise stop. - -### Getting help - -Manpage [minimap2.1][manpage] provides detailed description of minimap2 -command line options and optional tags. The [FAQ](FAQ.md) page answers several -frequently asked questions. If you encounter bugs or have further questions or -requests, you can raise an issue at the [issue page][issue]. There is not a -specific mailing list for the time being. - -### Citing minimap2 - -If you use minimap2 in your work, please cite: - -> Li, H. (2018). Minimap2: pairwise alignment for nucleotide sequences. -> *Bioinformatics*, **34**:3094-3100. [doi:10.1093/bioinformatics/bty191][doi] - -## Developers' Guide - -Minimap2 is not only a command line tool, but also a programming library. -It provides C APIs to build/load index and to align sequences against the -index. File [example.c](example.c) demonstrates typical uses of C APIs. Header -file [minimap.h](minimap.h) gives more detailed API documentation. Minimap2 -aims to keep APIs in this header stable. File [mmpriv.h](mmpriv.h) contains -additional private APIs which may be subjected to changes frequently. - -This repository also provides Python bindings to a subset of C APIs. File -[python/README.rst](python/README.rst) gives the full documentation; -[python/minimap2.py](python/minimap2.py) shows an example. This Python -extension, mappy, is also [available from PyPI][mappypypi] via `pip install -mappy` or [from BioConda][mappyconda] via `conda install -c bioconda mappy`. - -## Limitations - -* Minimap2 may produce suboptimal alignments through long low-complexity - regions where seed positions may be suboptimal. This should not be a big - concern because even the optimal alignment may be wrong in such regions. - -* Minimap2 requires SSE2 instructions on x86 CPUs or NEON on ARM CPUs. It is - possible to add non-SIMD support, but it would make minimap2 slower by - several times. - -* Minimap2 does not work with a single query or database sequence ~2 - billion bases or longer (2,147,483,647 to be exact). The total length of all - sequences can well exceed this threshold. - -* Minimap2 often misses small exons. - +## Pre-built indexes +This [link](https://nextcloud.ispras.ru/index.php/s/wcb9PpZyr8Gb5CC) contains pre-built modified indexes for next references: +* GRCh38 [(GCA_000001405.15)](https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/references/GRCh38/) +* GRCh37 [(hs37d5)](https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/references/GRCh37/) -[paf]: https://github.com/lh3/miniasm/blob/master/PAF.md -[sam]: https://samtools.github.io/hts-specs/SAMv1.pdf -[minimap]: https://github.com/lh3/minimap -[smartdenovo]: https://github.com/ruanjue/smartdenovo -[longislnd]: https://www.ncbi.nlm.nih.gov/pubmed/27667791 -[gaba]: https://github.com/ocxtal/libgaba -[ksw2]: https://github.com/lh3/ksw2 -[preprint]: https://arxiv.org/abs/1708.01492 -[release]: https://github.com/lh3/minimap2/releases -[mappypypi]: https://pypi.python.org/pypi/mappy -[mappyconda]: https://anaconda.org/bioconda/mappy -[issue]: https://github.com/lh3/minimap2/issues -[k8]: https://github.com/attractivechaos/k8 -[manpage]: https://lh3.github.io/minimap2/minimap2.html -[manpage-cs]: https://lh3.github.io/minimap2/minimap2.html#10 -[doi]: https://doi.org/10.1093/bioinformatics/bty191 -[smide]: https://github.com/nemequ/simde -[unimap]: https://github.com/lh3/unimap +## Tests +See [test/tests.md](test/tests.md) for more details. diff --git a/index.c b/index.c index 0d8a2ae..d7133e4 100644 --- a/index.c +++ b/index.c @@ -15,6 +15,12 @@ #include "kvec.h" #include "khash.h" +#include +#include +#include +#include +#include + #define idx_hash(a) ((a)>>1) #define idx_eq(a, b) ((a)>>1 == (b)>>1) KHASH_INIT(idx, uint64_t, uint64_t, 1, idx_hash, idx_eq) @@ -263,7 +269,7 @@ static void worker_post(void *g, long i, int tid) kfree(0, b->a.a); b->a.n = b->a.m = 0, b->a.a = 0; } - + static void mm_idx_post(mm_idx_t *mi, int n_threads) { kt_for(n_threads, worker_post, mi, 1<b); @@ -282,6 +288,7 @@ typedef struct { uint64_t batch_size, sum_len; mm_bseq_file_t *fp; mm_idx_t *mi; + char * vcf_with_variants; } pipeline_t; typedef struct { @@ -355,15 +362,68 @@ static void *worker_pipeline(void *shared, int step, void *in) } else free(s); } else if (step == 1) { // step 1: compute sketch step_t *s = (step_t*)in; + for (i = 0; i < s->n_seq; ++i) { + //printf("SEQ %d %s\n", s->n_seq, s->seq[i].name); mm_bseq1_t *t = &s->seq[i]; if (t->l_seq > 0) mm_sketch(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->mi->flag&MM_I_HPC, &s->a); else if (mm_verbose >= 2) fprintf(stderr, "[WARNING] the length database sequence '%s' is 0\n", t->name); + + if(p->vcf_with_variants && strcmp(p->vcf_with_variants, "")) { + mm_idx_manipulate_phased(p->mi, p->vcf_with_variants, &s->a, s->seq[i].name); + } + free(t->seq); free(t->name); } + free(s->seq); s->seq = 0; + + // sort by minimizer + radix_sort_128x(s->a.a, s->a.a + s->a.n); + + // stay only unique + //#### why 16, sizeof(mm128_v)? + mm128_t *tmp = (mm128_t*)calloc(s->a.n, 16); + + + size_t ptr_p, ptr_tmp, beg_tmp, end_tmp; + tmp[0].x = s->a.a[0].x; + tmp[0].y = s->a.a[0].y; + beg_tmp = 0; + end_tmp = 1; + + for (ptr_p = 1; ptr_p < s->a.n; ptr_p++) { + if(s->a.a[ptr_p].x == tmp[beg_tmp].x) { + int exist = 0; + for (ptr_tmp = beg_tmp; ptr_tmp < end_tmp; ptr_tmp++) { + if (s->a.a[ptr_p].y == tmp[ptr_tmp].y && s->a.a[ptr_p].x == tmp[ptr_tmp].x) { + exist = 1; + break; + } + } + if (exist == 0) { + tmp[end_tmp].x = s->a.a[ptr_p].x; + tmp[end_tmp].y = s->a.a[ptr_p].y; + end_tmp++; + } + } + else { + tmp[end_tmp].x = s->a.a[ptr_p].x; + tmp[end_tmp].y = s->a.a[ptr_p].y; + beg_tmp = end_tmp; + end_tmp++; + } + } + + tmp = (mm128_t *) realloc(tmp, sizeof(mm128_t) * end_tmp); + + //#### do we need realloc s->a.a ? + memcpy(s->a.a, tmp, end_tmp * 16); + free(tmp); + s->a.n = end_tmp; + return s; } else if (step == 2) { // dispatch sketch to buckets step_t *s = (step_t*)in; @@ -373,7 +433,7 @@ static void *worker_pipeline(void *shared, int step, void *in) return 0; } -mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int flag, int mini_batch_size, int n_threads, uint64_t batch_size) +mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int flag, int mini_batch_size, int n_threads, uint64_t batch_size, char * vcf_with_variants) { pipeline_t pl; if (fp == 0 || mm_bseq_eof(fp)) return 0; @@ -382,8 +442,10 @@ mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int flag, int mini pl.batch_size = batch_size; pl.fp = fp; pl.mi = mm_idx_init(w, k, b, flag); + pl.vcf_with_variants = vcf_with_variants; kt_pipeline(n_threads < 3? n_threads : 3, worker_pipeline, &pl, 3); + if (mm_verbose >= 3) fprintf(stderr, "[M::%s::%.3f*%.2f] collected minimizers\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0)); @@ -394,13 +456,13 @@ mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int flag, int mini return pl.mi; } -mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads) // a simpler interface; deprecated +mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads, char * vcf_with_variants) // a simpler interface; deprecated { mm_bseq_file_t *fp; mm_idx_t *mi; fp = mm_bseq_open(fn); if (fp == 0) return 0; - mi = mm_idx_gen(fp, w, k, 14, flag, 1<<18, n_threads, UINT64_MAX); + mi = mm_idx_gen(fp, w, k, 14, flag, 1<<18, n_threads, UINT64_MAX, vcf_with_variants); mm_bseq_close(fp); return mi; } @@ -459,6 +521,51 @@ mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const cha * index I/O * *************/ +void mm_idx_to_txt(FILE *fp, const mm_idx_t *mi) +{ + uint64_t sum_len = 0; + uint32_t x[5], i; + x[0] = mi->w, x[1] = mi->k, x[2] = mi->b, x[3] = mi->n_seq, x[4] = mi->flag; + fprintf(fp, "%s\n", MM_IDX_MAGIC); + fprintf(fp, "%u %u %u %u %u\n", x[0], x[1], x[2], x[3], x[4]); + + for (i = 0; i < mi->n_seq; ++i) { + if (mi->seq[i].name) { + uint8_t l = strlen(mi->seq[i].name); + fprintf(fp, "%u\n", l); + fprintf(fp, "%s\n", mi->seq[i].name); + } else { + fprintf(fp, "0\n"); + } + fprintf(fp, "%u\n", mi->seq[i].len); + sum_len += mi->seq[i].len; + } + for (i = 0; i < 1<b; ++i) { + mm_idx_bucket_t *b = &mi->B[i]; + khint_t k; + idxhash_t *h = (idxhash_t*)b->h; + uint32_t size = h? h->size : 0; + fprintf(fp, "%i\n", b->n); + for (int j = 0; j < b->n; j++) + fprintf(fp, "\t%lu\n", b->p[j]); + fprintf(fp, "%u\n", size); + if (size == 0) + continue; + for (k = 0; k < kh_end(h); ++k) { + uint64_t x[3]; + if (!kh_exist(h, k)) + continue; + x[0] = kh_key(h, k), x[1] = kh_val(h, k); + fprintf(fp, "%lu\t%lu\n", x[0], x[1]); + } + } + if (!(mi->flag & MM_I_NO_SEQ)) { + for (int i = 0; i < (sum_len + 7) / 8; i++) + fprintf(fp, "%u\n", mi->S[i]); + } + fflush(fp); +} + void mm_idx_dump(FILE *fp, const mm_idx_t *mi) { uint64_t sum_len = 0; @@ -500,6 +607,67 @@ void mm_idx_dump(FILE *fp, const mm_idx_t *mi) fflush(fp); } +mm_idx_t *mm_idx_load_from_txt(FILE *fp) +{ + char magic[4]; + uint32_t x[5], i; + uint64_t sum_len = 0; + mm_idx_t *mi; + + fscanf(fp, "%s\n", &magic); + + if (strncmp(magic, MM_IDX_MAGIC, 4) != 0) return 0; + fscanf(fp, "%u %u %u %u %u\n", &x[0], &x[1], &x[2], &x[3], &x[4]); + + mi = mm_idx_init(x[0], x[1], x[2], x[4]); + mi->n_seq = x[3]; + mi->seq = (mm_idx_seq_t*)kcalloc(mi->km, mi->n_seq, sizeof(mm_idx_seq_t)); + for (i = 0; i < mi->n_seq; ++i) { + uint8_t l; + mm_idx_seq_t *s = &mi->seq[i]; + fscanf(fp, "%u\n", &l); + if (l) { + s->name = (char*)kmalloc(mi->km, l + 1); + fscanf(fp, "%s\n", s->name); + s->name[l] = 0; + } + fscanf(fp, "%u\n", &s->len); + s->offset = sum_len; + s->is_alt = 0; + sum_len += s->len; + } + for (i = 0; i < 1<b; ++i) { + mm_idx_bucket_t *b = &mi->B[i]; + uint32_t j, size; + khint_t k; + idxhash_t *h; + fscanf(fp, "%i\n", &b->n); + b->p = (uint64_t*)malloc(b->n * 8); + for (int j = 0; j < b->n; j++) { + fscanf(fp, "\t%lu\n", &b->p[j]); + } + fscanf(fp, "%u\n", &size); + if (size == 0) continue; + b->h = h = kh_init(idx); + kh_resize(idx, h, size); + for (j = 0; j < size; ++j) { + uint64_t x[2]; + int absent; + fscanf(fp, "%lu\t%lu\n", &x[0], &x[1]); + k = kh_put(idx, h, x[0], &absent); + assert(absent); + kh_val(h, k) = x[1]; + } + } + if (!(mi->flag & MM_I_NO_SEQ)) { + mi->S = (uint32_t*)malloc((sum_len + 7) / 8 * 4); + for (int i = 0; i < (sum_len + 7) / 8; i++) + fscanf(fp, "%u\n", &mi->S[i]); + } + return mi; +} + + mm_idx_t *mm_idx_load(FILE *fp) { char magic[4]; @@ -605,7 +773,7 @@ void mm_idx_reader_close(mm_idx_reader_t *r) free(r); } -mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads) +mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads, char * vcf_with_variants) { mm_idx_t *mi; if (r->is_idx) { @@ -613,9 +781,13 @@ mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads) if (mi && mm_verbose >= 2 && (mi->k != r->opt.k || mi->w != r->opt.w || (mi->flag&MM_I_HPC) != (r->opt.flag&MM_I_HPC))) fprintf(stderr, "[WARNING]\033[1;31m Indexing parameters (-k, -w or -H) overridden by parameters used in the prebuilt index.\033[0m\n"); } else - mi = mm_idx_gen(r->fp.seq, r->opt.w, r->opt.k, r->opt.bucket_bits, r->opt.flag, r->opt.mini_batch_size, n_threads, r->opt.batch_size); + { + mi = mm_idx_gen(r->fp.seq, r->opt.w, r->opt.k, r->opt.bucket_bits, r->opt.flag, r->opt.mini_batch_size, n_threads, r->opt.batch_size, vcf_with_variants); + } if (mi) { - if (r->fp_out) mm_idx_dump(r->fp_out, mi); + if (r->fp_out) { + mm_idx_dump(r->fp_out, mi); + } mi->index = r->n_parts++; } return mi; diff --git a/linked_vcf_list.c b/linked_vcf_list.c new file mode 100644 index 0000000..2cfc550 --- /dev/null +++ b/linked_vcf_list.c @@ -0,0 +1,323 @@ +// Copyright 2023 karpulevich +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include +#include "mmpriv.h" +#include "minimap.h" +#include "linked_vcf_list.h" + +struct node *head = NULL; +struct node *current = NULL; + +// display the list +void printList(){ + struct node *p = head; + printf("\n["); + + //start from the beginning + while(p != NULL) { + printf(" %lu ",p->pos); + p = p->next; + } + printf("]"); +} + +// display the GT list +void printGTList(bcf_hdr_t *hdr){ + struct node *p = head; + + + //start from the beginning + while(p != NULL) { + int32_t *gt_arr = NULL, ngt_arr = 0; + int ngt; + ngt = bcf_get_genotypes(hdr, p->rec, >_arr, &ngt_arr); + if ( ngt > 0 ) { + int i, nsmpl = bcf_hdr_nsamples(hdr); + printf("%s %lu REF:%s ALT:%s\n", bcf_hdr_id2name(hdr, p->CHR_ID), p->pos, p->REF, p->ALT); + + int max_ploidy = ngt/nsmpl; + for (i=0; inext; + } +} + +int ifexists(char* z, const int u, const int s, char* v) +{ + for (int i = 0; i < u; i++) + if (!strncmp(&z[i * s], v, s)) { + return 1; + } + return 0; +} + +void calculate_haplotypes(mm_idx_t * mi, bcf_hdr_t *hdr, struct node *window_start_pointer, struct node *current_pointer, unsigned long curr_pos, mm128_v *p) { + //Even though vcf uses 1-based indexing (i.e. first base is base 1), htslib internally uses 0-based indexing (i.e. bcf1_t::pos is 0 based). + //http://wresch.github.io/2014/11/18/process-vcf-file-with-htslib.html + //so we need use (pos + 1) + struct node *c_pointer = current_pointer; + struct node *w_start_pointer = window_start_pointer; + + int MAX_SNP = 10; + + char *gt_array = NULL; + int arr_i, arr_j, arr_len; + + if (!(mi->flag & MM_PARSE_HT)) { + int snp_num = 0; + while (w_start_pointer != c_pointer) { + w_start_pointer = w_start_pointer->next; + snp_num += 1; + } + if (snp_num == MAX_SNP) { + return; + } + arr_i = (int) pow(2, snp_num); + arr_j = snp_num + 1; + gt_array = (char *) calloc(arr_i * arr_j, sizeof(char)); + for (unsigned i = 0; i < (unsigned) pow(2, snp_num); i++) { + int k = 0; + for (int j = 1 << (snp_num - 1); j > 0; j = j / 2) { + gt_array[i * arr_j + k] = (i & j) ? '1' : '0'; + k++; + } + } + arr_len = arr_i; + } + else { + MAX_SNP = 100; + int sample_num = bcf_hdr_nsamples(hdr); + int snp_num = 0; + arr_i = sample_num * 2; + arr_j = MAX_SNP + 1; + gt_array = (char *) calloc(arr_i * arr_j, sizeof(char)); + while (w_start_pointer != c_pointer) { + int32_t *gt_arr = NULL, ngt_arr = 0; + int ngt = bcf_get_genotypes(hdr, w_start_pointer->rec, >_arr, &ngt_arr); + + if (ngt > 0) { + int max_ploidy = ngt / sample_num; + + for (int i = 0; i < sample_num; i++) { + int32_t *ptr = gt_arr + i * max_ploidy; + gt_array[i * max_ploidy * arr_j + snp_num] = bcf_gt_allele(ptr[0]) + '0'; + gt_array[(i * max_ploidy + 1) * arr_j + snp_num] = bcf_gt_allele(ptr[1]) + '0'; + } + free(gt_arr); + } + + w_start_pointer = w_start_pointer->next; + snp_num += 1; + if (snp_num > MAX_SNP) { + free(gt_array); + return; + } + } + + //remove non unique strings + //https://subscription.packtpub.com/book/programming/9781838641108/1/ch01lvl1sec06/finding-the-unique-elements-in-an-array + int k = 1; + for (int i = 1; i < sample_num * 2; i++) { + if (!ifexists(gt_array, k, arr_j, >_array[i * arr_j])) { + if (i != k) memcpy(>_array[k * arr_j], >_array[i * arr_j], arr_j * sizeof(char)); + k++; + } + } + arr_len = k; + } + + //prepare SNP combinations + //Array format: + //REF_arr char array - REF (for control) + //ALT_arr char array - ALT + //POS_all ulong array - positions + //CHR - chromosome + //N_SNP - length + const char * CHR = bcf_hdr_id2name(hdr, window_start_pointer->CHR_ID); + + for(int i = 0; iREF; + ALT_arr[N_SNP] = local_w_start_pointer->ALT; + POS_all[N_SNP] = (unsigned long)(local_w_start_pointer->pos + 1); + N_SNP += 1; + } + + local_snp_num += 1; + local_w_start_pointer = local_w_start_pointer->next; + } + if (N_SNP > 0) { + add_variants(mi, CHR, REF_arr, ALT_arr, POS_all, N_SNP, curr_pos + 1, p); + } + } + free(gt_array); +} + +void handleGTList(mm_idx_t * mi, bcf_hdr_t *hdr, mm128_v *p){ + struct node *window_start_pointer = head; + struct node *current_pointer = head; + struct node *window_end_pointer = head; + + int gap = mi->k - 1; + + if (head == NULL) + return; + + while(window_end_pointer->next != NULL) + { + while(window_end_pointer != NULL && window_end_pointer->pos > (current_pointer->pos - gap)){ + window_end_pointer = window_end_pointer->next; + } + + while(window_start_pointer->pos > (current_pointer->pos + gap)){ + window_start_pointer = window_start_pointer->next; + } + + //calculate start and end pointers + calculate_haplotypes(mi, hdr, window_start_pointer, window_end_pointer, current_pointer->pos, p); + + current_pointer = current_pointer->next; + window_end_pointer = current_pointer; + } + + //last SNP batch + while(window_start_pointer->pos > current_pointer->pos + gap){ + window_start_pointer = window_start_pointer->next; + } + + calculate_haplotypes(mi, hdr, window_start_pointer, window_end_pointer->next, current_pointer->pos, p); +} + +//insertion at the beginning +void insertatbegin(unsigned long pos, bcf1_t *rec, int CHR_ID, char * REF, char * ALT){ + //create a link + struct node *lk = (struct node*) malloc(sizeof(struct node)); + lk->pos = pos; + lk->rec = rec; + lk->CHR_ID = CHR_ID; + lk->REF = REF; + lk->ALT = ALT; + + // point it to old first node + lk->next = head; + + //point first to new first node + head = lk; +} + +void insertatend(unsigned long pos, bcf1_t *rec, int CHR_ID, char * REF, char * ALT){ + //create a link + struct node *lk = (struct node*) malloc(sizeof(struct node)); + lk->pos = pos; + lk->rec = rec; + lk->CHR_ID = CHR_ID; + lk->REF = REF; + lk->ALT = ALT; + + struct node *linkedlist = head; + + // point it to old first node + while(linkedlist->next != NULL) + linkedlist = linkedlist->next; + + //point first to new first node + linkedlist->next = lk; +} +void insertafternode(struct node *list, unsigned long pos, bcf1_t *rec, int CHR_ID, char * REF, char * ALT){ + struct node *lk = (struct node*) malloc(sizeof(struct node)); + lk->pos = pos; + lk->rec = rec; + lk->CHR_ID = CHR_ID; + lk->REF = REF; + lk->ALT = ALT; + + lk->next = list->next; + list->next = lk; +} + +void deleteatbegin(){ + struct node *temp = head; + head = temp->next; + free(temp->ALT); + free(temp->REF); + bcf_destroy(temp->rec); + free(temp); +} + +void deleteList(){ + while (head != NULL) + { + deleteatbegin(); + } +} + +bool isListEmpty(){ + return head == NULL; +} + +void deletenode(int key){ + struct node *temp = head, *prev; + if (temp != NULL && temp->pos == key) { + head = temp->next; + free(temp); + return; + } + + // Find the key to be deleted + while (temp != NULL && temp->pos != key) { + prev = temp; + temp = temp->next; + } + + // If the key is not present + if (temp == NULL) return; + + // Remove the node + prev->next = temp->next; + free(temp); +} + +int searchlist(int key){ + struct node *temp = head; + while(temp != NULL) { + if (temp->pos == key) { + return 1; + } + temp=temp->next; + } + return 0; +} \ No newline at end of file diff --git a/linked_vcf_list.h b/linked_vcf_list.h new file mode 100644 index 0000000..0504865 --- /dev/null +++ b/linked_vcf_list.h @@ -0,0 +1,29 @@ +// Copyright 2023 karpulevich +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +struct node { + unsigned long pos; + bcf1_t *rec; + int CHR_ID; + char * REF; + char * ALT; + struct node *next; +}; + +bool isListEmpty(); +void handleGTList(mm_idx_t * mi, bcf_hdr_t *hdr, mm128_v *p); +void insertatbegin(unsigned long pos, bcf1_t *rec, int CHR_ID, char * REF, char * ALT); +void deleteList(); diff --git a/main.c b/main.c index 0be9933..93dbf04 100644 --- a/main.c +++ b/main.c @@ -78,6 +78,8 @@ static ko_longopt_t long_options[] = { { "chain-skip-scale",ko_required_argument,351 }, { "print-chains", ko_no_argument, 352 }, { "no-hash-name", ko_no_argument, 353 }, + { "vcf-file-with-variants",ko_required_argument,354 }, + { "parse-haplotype",ko_no_argument, 355 }, { "help", ko_no_argument, 'h' }, { "max-intron-len", ko_required_argument, 'G' }, { "version", ko_no_argument, 'V' }, @@ -127,6 +129,7 @@ int main(int argc, char *argv[]) mm_idxopt_t ipt; int i, c, n_threads = 3, n_parts, old_best_n = -1; char *fnw = 0, *rg = 0, *junc_bed = 0, *s, *alt_list = 0; + char *vcf_with_variants = 0; FILE *fp_help = stderr; mm_idx_reader_t *idx_rdr; mm_idx_t *mi; @@ -237,6 +240,8 @@ int main(int argc, char *argv[]) else if (c == 350) opt.q_occ_frac = atof(o.arg); // --q-occ-frac else if (c == 352) mm_dbg_flag |= MM_DBG_PRINT_CHAIN; // --print-chains else if (c == 353) opt.flag |= MM_F_NO_HASH_NAME; // --no-hash-name + else if (c == 354) vcf_with_variants = o.arg; // --vcf-file-with-variants + else if (c == 355) ipt.flag |= MM_PARSE_HT; // --parse-haplotype else if (c == 330) { fprintf(stderr, "[WARNING] \033[1;31m --lj-min-ratio has been deprecated.\033[0m\n"); } else if (c == 314) { // --frag @@ -324,6 +329,8 @@ int main(int argc, char *argv[]) fprintf(fp_help, " -w INT minimizer window size [%d]\n", ipt.w); fprintf(fp_help, " -I NUM split index for every ~NUM input bases [4G]\n"); fprintf(fp_help, " -d FILE dump index to FILE []\n"); + fprintf(fp_help, " --vcf-file-with-variants FILE pass VCF FILE to modify index []\n"); + fprintf(fp_help, " --parse-haplotype parse haplotypes from VCF to generate real SNP combinations, otherwise use all.\n"); fprintf(fp_help, " Mapping:\n"); fprintf(fp_help, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%g]\n", opt.mid_occ_frac); fprintf(fp_help, " -g NUM stop chain enlongation if there are no minimizers in INT-bp [%d]\n", opt.max_gap); @@ -386,7 +393,7 @@ int main(int argc, char *argv[]) } if (opt.best_n == 0 && (opt.flag&MM_F_CIGAR) && mm_verbose >= 2) fprintf(stderr, "[WARNING]\033[1;31m `-N 0' reduces alignment accuracy. Please use --secondary=no to suppress secondary alignments.\033[0m\n"); - while ((mi = mm_idx_reader_read(idx_rdr, n_threads)) != 0) { + while ((mi = mm_idx_reader_read(idx_rdr, n_threads, vcf_with_variants)) != 0) { int ret; if ((opt.flag & MM_F_CIGAR) && (mi->flag & MM_I_NO_SEQ)) { fprintf(stderr, "[ERROR] the prebuilt index doesn't contain sequences.\n"); diff --git a/minimap.h b/minimap.h index 13e12e0..828de04 100644 --- a/minimap.h +++ b/minimap.h @@ -44,6 +44,7 @@ #define MM_I_HPC 0x1 #define MM_I_NO_SEQ 0x2 #define MM_I_NO_NAME 0x4 +#define MM_PARSE_HT 0x8 #define MM_IDX_MAGIC "MMI\2" @@ -247,7 +248,7 @@ mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, cons * * @return an index on success; NULL if reaching the end of the input file */ -mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads); +mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads, char * vcf_with_variants); /** * Destroy/deallocate an index reader @@ -401,7 +402,7 @@ int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uin // deprecated APIs for backward compatibility void mm_mapopt_init(mm_mapopt_t *opt); -mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads); +mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads, char * vcf_with_variants); #ifdef __cplusplus } diff --git a/mmpriv.h b/mmpriv.h index 2f5034b..254e694 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -6,6 +6,12 @@ #include "bseq.h" #include "kseq.h" +#include +#include +#include +#include +#include + #define MM_PARENT_UNSET (-1) #define MM_PARENT_TMP_PRI (-2) @@ -60,6 +66,10 @@ void radix_sort_64(uint64_t *beg, uint64_t *end); uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p); +void mm_idx_manipulate(mm_idx_t * mi, char *vcf_with_variants); +void mm_idx_manipulate_phased(mm_idx_t * mi, char * fname, mm128_v *p, char * contig_name); +void add_variants(mm_idx_t * mi, const char * CHR, char ** REF_arr, char ** ALT_arr, unsigned long * POS_all, int N_SNP, unsigned long curr_pos, mm128_v *p); +void add_indel(mm_idx_t * mi, const char * CHR, char * REF, char * ALT, unsigned long curr_pos, unsigned long indel_pos, mm128_v *p, const char * original_ref_seq); mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos); void mm_seed_mz_flt(void *km, mm128_v *mv, int32_t q_occ_max, float q_occ_frac); diff --git a/sketch.c b/sketch.c index f830693..2d898a8 100644 --- a/sketch.c +++ b/sketch.c @@ -6,6 +6,14 @@ #include "kvec.h" #include "mmpriv.h" +#include "linked_vcf_list.h" + +#include +#include +#include +#include +#include + unsigned char seq_nt4_table[256] = { 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, @@ -81,7 +89,7 @@ void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, i mm128_t buf[256], min = { UINT64_MAX, UINT64_MAX }; tiny_queue_t tq; - assert(len > 0 && (w > 0 && w < 256) && (k > 0 && k <= 28)); // 56 bits for k-mer; could use long k-mers, but 28 enough in practice + assert(len > 0 && (w > 0 && w < 256) && (k > 0 && k <= 128)); // 56 bits for k-mer; could use long k-mers, but 28 enough in practice memset(buf, 0xff, w * 16); memset(&tq, 0, sizeof(tiny_queue_t)); kv_resize(mm128_t, km, *p, p->n + len/w); @@ -141,3 +149,364 @@ void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, i if (min.x != UINT64_MAX) kv_push(mm128_t, km, *p, min); } + +void read_vcf(mm_idx_t * mi, char * fname, mm128_v *p, char * contig_name) +{ + int ret; + + kstring_t str = {0,0,0}; + + //open vcf file + htsFile *fp = hts_open(fname,"rb"); + + //read header + bcf_hdr_t *hdr = bcf_hdr_read(fp); + bcf1_t *rec = bcf_init(); + + tbx_t *idx = tbx_index_load(fname); + + if(!idx) { + //printf("Null index\n"); + return; + } + + //printf("%s\n", contig_name); + hts_itr_t *itr = tbx_itr_querys(idx, contig_name); + + if(!itr) { + //printf("Null iterator for contig_name %s\n", contig_name); + return; + } + + while ((ret = tbx_itr_next(fp, idx, itr, &str)) > 0) { + vcf_parse(&str, hdr, rec); + bcf_unpack(rec, BCF_UN_STR); + bcf_unpack(rec, BCF_UN_INFO); + + bcf1_t *rec_tmp = bcf_dup(rec); + char * REF = (char *)calloc(mi->k + 1, sizeof(char)); + strncpy(REF, rec->d.allele[0], mi->k + 1); + REF[mi->k + 1] = '\0'; + char * ALT = (char *)calloc(mi->k + 1, sizeof(char)); + strncpy(ALT, rec->d.allele[1], mi->k + 1); + ALT[mi->k + 1] = '\0'; + + insertatbegin((unsigned long)rec_tmp->pos, rec_tmp, rec_tmp->rid, REF, ALT); + bcf_empty(rec); + + } + + if(!isListEmpty()){ + //printf("CHR = %s;\n", contig_name); + handleGTList(mi, hdr, p); + + deleteList(); + } + + bcf_itr_destroy(itr); + tbx_destroy(idx); + bcf_hdr_destroy(hdr); + + if ( (ret=hts_close(fp)) ) + { + fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret); + exit(ret); + } +} + +void mm_idx_manipulate_phased(mm_idx_t * mi, char * fname, mm128_v *p, char * contig_name) { + read_vcf(mi, fname, p, contig_name); +} + +//REF - REF (for control) +//ALT - ALT +//curr_pos ulong - position +//CHR - chromosome +void add_indel(mm_idx_t * mi, const char * CHR, char * REF, char * ALT, unsigned long curr_pos, unsigned long indel_pos, mm128_v *p, const char * original_ref_seq) +{ + const char *contig_name = CHR; + const unsigned long center_position = curr_pos; + //const unsigned long position = indel_pos; + const unsigned long position = curr_pos; + + //Find seq + uint64_t contig_offset = 0; + int seq_num = -1; + for (int i = 0; i < mi->n_seq; i++) { + if (strcmp(contig_name, mi->seq[i].name) == 0) { + contig_offset = mi->seq[i].offset; + seq_num = i; + } + } + // Error if no contigs in fasta + if(seq_num == -1) { + printf("ERROR Contig %s id not found in reference\n", contig_name); + return; + } + + int SIDE_SIZE = (mi->k - 1) + mi->w; + // Calculate number of chunks: + // side chunks: take k-mer size, subtract 1 and add window size + // divided by chunk size and multiplied by 2 as it has 2 sides, and one for center + int SEQ_CHUNK_NUMBER = SIDE_SIZE / 8 * 2 + 1; + // add extra two side chunks if (mi->k - 1 + 10) is not a multiple of 8 + int EXTRA_GAP = (8 - SIDE_SIZE % 8) % 8; + SEQ_CHUNK_NUMBER = (EXTRA_GAP) ? SEQ_CHUNK_NUMBER + 2 : SEQ_CHUNK_NUMBER; + + int ref_len = strlen(REF); + int alt_len = strlen(ALT); + + if(ref_len == 1 && alt_len > 1 && alt_len < mi->k) { + char * new_ref_seq; + new_ref_seq = (char*)malloc(sizeof(char) * (SEQ_CHUNK_NUMBER * 8 + 1 + (alt_len - 1))); + memcpy(new_ref_seq, original_ref_seq, SEQ_CHUNK_NUMBER * 8 + 1); + + memcpy(new_ref_seq, original_ref_seq, EXTRA_GAP + SIDE_SIZE + (contig_offset + position - 1) % 8); + + new_ref_seq[EXTRA_GAP + SIDE_SIZE + (contig_offset + position - 1) % 8] = '\0'; + new_ref_seq = strcat(new_ref_seq, ALT); + new_ref_seq = strcat(new_ref_seq, &original_ref_seq[EXTRA_GAP + SIDE_SIZE + (contig_offset + position - 1) % 8 + 1]); + + //Finds minimizer in window + mm128_v minimizer_array = {0, 0, 0}; + mm_sketch(0, &new_ref_seq[EXTRA_GAP + (contig_offset + center_position - 1) % 8], SIDE_SIZE * 2 + 1 + (alt_len - 1), mi->w, mi->k, + 0, mi->flag & MM_I_HPC, &minimizer_array); + + for (int i = 0; i < minimizer_array.n; i++) { + if (minimizer_array.a[i].y < SIDE_SIZE * 2) continue; + if (minimizer_array.a[i].y > (SIDE_SIZE + mi->k) * 2 - 1 + (alt_len - 1) * 2) continue; + minimizer_array.a[i].y = ((uint64_t)seq_num << 32) + (center_position - SIDE_SIZE - 1 + minimizer_array.a[i].y / 2) * 2 + + (minimizer_array.a[i].y % 2); + + kv_push(mm128_t, 0, *p, minimizer_array.a[i]); + } + } else if (ref_len > 1 && alt_len == 1 && ref_len < mi->k) { + int EXT_CHUNK_COUNT = (ref_len - 2) / 8 + 1; + char * original_ref_seq_ext = (char*)malloc(sizeof(char) * (8 * EXT_CHUNK_COUNT + 1)); + original_ref_seq_ext[8 * EXT_CHUNK_COUNT] = '\0'; + for (int i = 0; i < EXT_CHUNK_COUNT; i++) { + uint32_t tmp_seq; + for (int i = 0; i < EXT_CHUNK_COUNT; i++) { + if ( + // Out of bounds + (contig_offset == 0 && (position - 1) / 8 + (SEQ_CHUNK_NUMBER / 2) + i + 1 < 0) || + ((contig_offset + position - 1) / 8 + (SEQ_CHUNK_NUMBER / 2) + i + 1) * 8 >= contig_offset + mi->seq[seq_num].len || + ((contig_offset + position - 1) / 8 + (SEQ_CHUNK_NUMBER / 2) + i + 2) * 8 <= contig_offset + ) + tmp_seq = 1145324612; // ALL N + else { + tmp_seq = mi->S[(contig_offset + position - 1) / 8 + (SEQ_CHUNK_NUMBER / 2) + i + 1]; + // At left bound + if (((contig_offset + position - 1) / 8 + (SEQ_CHUNK_NUMBER / 2) + i + 1) * 8 < contig_offset && + ((contig_offset + position - 1) / 8 + (SEQ_CHUNK_NUMBER / 2) + i + 2) * 8 > contig_offset) { + if (contig_offset % 8 == 0) + tmp_seq = 1145324612; // ALL N + else { + tmp_seq = tmp_seq >> (4 * (contig_offset % 8)); + for (int j = 0; j < contig_offset % 8; j++) + tmp_seq = (tmp_seq << 4) + 4; + } + } + // At right bound + if (((contig_offset + position - 1) / 8 + (SEQ_CHUNK_NUMBER / 2) + i + 1) * 8 < contig_offset + mi->seq[seq_num].len && + ((contig_offset + position - 1) / 8 + (SEQ_CHUNK_NUMBER / 2) + i + 2) * 8 > contig_offset + mi->seq[seq_num].len) { + tmp_seq = tmp_seq << (4 * (8 - (contig_offset + mi->seq[seq_num].len) % 8)); + for (int j = 0; j < 8 - (contig_offset + mi->seq[seq_num].len) % 8; j++) + tmp_seq = (tmp_seq >> 4) | 1073741824; // FIRST N + } + } + } + + for (int j = 0; j < 8; j++) { + uint32_t tmp = tmp_seq % 16; + switch (tmp) { + case 0: + original_ref_seq_ext[i * 8 + j] = 'A'; + break; + case 1: + original_ref_seq_ext[i * 8 + j] = 'C'; + break; + case 2: + original_ref_seq_ext[i * 8 + j] = 'G'; + break; + case 3: + original_ref_seq_ext[i * 8 + j] = 'T'; + break; + case 4: + original_ref_seq_ext[i * 8 + j] = 'N'; + } + tmp_seq = tmp_seq / 16; + } + } + + //Create new window + char * new_ref_seq = (char*)malloc(sizeof(char) * (SEQ_CHUNK_NUMBER * 8 + 1 - ref_len + 1 + EXT_CHUNK_COUNT * 8)); + memcpy(new_ref_seq, original_ref_seq, EXTRA_GAP + SIDE_SIZE + (contig_offset + position - 1) % 8); + new_ref_seq[EXTRA_GAP + SIDE_SIZE + (contig_offset + position - 1) % 8] = ALT[0]; + new_ref_seq[EXTRA_GAP + SIDE_SIZE + (contig_offset + position - 1) % 8 + 1] = '\0'; + new_ref_seq = strcat(new_ref_seq, &original_ref_seq[EXTRA_GAP + SIDE_SIZE + (contig_offset + position - 1) % 8 + ref_len]); + new_ref_seq = strcat(new_ref_seq, original_ref_seq_ext); + + //Finds minimizer in window + mm128_v minimizer_array = {0, 0, 0}; + mm_sketch(0, &new_ref_seq[(contig_offset + center_position - 1) % 8], SIDE_SIZE * 2 + 1, mi->w, mi->k, + 0, mi->flag & MM_I_HPC, &minimizer_array); + + + free(original_ref_seq_ext); + free(new_ref_seq); + + for (int i = 0; i < minimizer_array.n; i++) { + if (minimizer_array.a[i].y < SIDE_SIZE * 2 + 2) continue; // TODO change 50 to 48 for similarity with SNPS (it will take extra calculation but no changes) + if (minimizer_array.a[i].y > (SIDE_SIZE + mi->k) * 2 - 1) continue; + minimizer_array.a[i].y = ((uint64_t)seq_num << 32) + (center_position - SIDE_SIZE - 1 + minimizer_array.a[i].y / 2) * 2 + + (minimizer_array.a[i].y % 2) + (ref_len - 1) * 2; + kv_push(mm128_t, 0, *p, minimizer_array.a[i]); + } + } +} + +//Array format: +//REF_arr char array - REF (for control) +//ALT_arr char array - ALT +//POS_all ulong array - positions +//CHR - chromosome +//N_SNP - length +void add_variants(mm_idx_t * mi, const char * CHR, char ** REF_arr, char ** ALT_arr, unsigned long * POS_all, int N_SNP, unsigned long curr_pos, mm128_v *p) +{ + if (N_SNP == 0) + return; + + const char *snp_contig_name = CHR; + const unsigned long snp_position = curr_pos; + + //Find seq + uint64_t contig_offset; + int seq_num = -1; + for (int i = 0; i < mi->n_seq; i++) { + if (strcmp(snp_contig_name, mi->seq[i].name) == 0) { + contig_offset = mi->seq[i].offset; + seq_num = i; + } + } + //Error if no contigs in fasta + if(seq_num == -1) { + printf("ERROR Contig %s id not found in reference\n", snp_contig_name); + return; + } + + int SIDE_SIZE = (mi->k - 1) + mi->w; + // Calculate number of chunks: + // side chunks: take k-mer size, subtract 1 and add window size + // divided by chunk size and multiplied by 2 as it has 2 sides, and one for center + int SEQ_CHUNK_NUMBER = SIDE_SIZE / 8 * 2 + 1; + // add extra two side chunks if (mi->k - 1 + 10) is not a multiple of 8 + int EXTRA_GAP = (8 - SIDE_SIZE % 8) % 8; + SEQ_CHUNK_NUMBER = (EXTRA_GAP) ? SEQ_CHUNK_NUMBER + 2 : SEQ_CHUNK_NUMBER; + uint32_t seq[SEQ_CHUNK_NUMBER]; + + for (int i = 0; i < SEQ_CHUNK_NUMBER; i++) { + if ( + // Out of bounds + (contig_offset == 0 && (snp_position - 1) / 8 - (SEQ_CHUNK_NUMBER / 2) + i < 0) || + ((contig_offset + snp_position - 1) / 8 - (SEQ_CHUNK_NUMBER / 2) + i) * 8 >= contig_offset + mi->seq[seq_num].len || + ((contig_offset + snp_position - 1) / 8 - (SEQ_CHUNK_NUMBER / 2) + i + 1) * 8 <= contig_offset + ) + seq[i] = 1145324612; // ALL N + else { + seq[i] = mi->S[(contig_offset + snp_position - 1) / 8 - (SEQ_CHUNK_NUMBER / 2) + i]; + // At left bound + if (((contig_offset + snp_position - 1) / 8 - (SEQ_CHUNK_NUMBER / 2) + i) * 8 < contig_offset && + ((contig_offset + snp_position - 1) / 8 - (SEQ_CHUNK_NUMBER / 2) + i + 1) * 8 > contig_offset) { + if (contig_offset % 8 == 0) + seq[i] = 1145324612; // ALL N + else { + seq[i] = seq[i] >> (4 * (contig_offset % 8)); + for (int j = 0; j < contig_offset % 8; j++) + seq[i] = (seq[i] << 4) + 4; + } + } + // At right bound + if (((contig_offset + snp_position - 1) / 8 - (SEQ_CHUNK_NUMBER / 2) + i) * 8 < contig_offset + mi->seq[seq_num].len && + ((contig_offset + snp_position - 1) / 8 - (SEQ_CHUNK_NUMBER / 2) + i + 1) * 8 > contig_offset + mi->seq[seq_num].len) { + seq[i] = seq[i] << (4 * (8 - (contig_offset + mi->seq[seq_num].len) % 8)); + for (int j = 0; j < 8 - (contig_offset + mi->seq[seq_num].len) % 8; j++) + seq[i] = (seq[i] >> 4) | 1073741824; // FIRST N + } + } + } + + char original_ref_seq[SEQ_CHUNK_NUMBER * 8 + 1]; + original_ref_seq[SEQ_CHUNK_NUMBER * 8] = '\0'; + for (int i = 0; i < SEQ_CHUNK_NUMBER; i++) { + uint32_t tmp_seq = seq[i]; + for (int j = 0; j < 8; j++) { + uint32_t tmp = tmp_seq % 16; + switch (tmp) { + case 0: + original_ref_seq[i * 8 + j] = 'A'; + break; + case 1: + original_ref_seq[i * 8 + j] = 'C'; + break; + case 2: + original_ref_seq[i * 8 + j] = 'G'; + break; + case 3: + original_ref_seq[i * 8 + j] = 'T'; + break; + case 4: + original_ref_seq[i * 8 + j] = 'N'; + } + tmp_seq = tmp_seq / 16; + } + } + + + char new_ref_seq[SEQ_CHUNK_NUMBER * 8 + 1]; + memcpy(new_ref_seq, original_ref_seq, SEQ_CHUNK_NUMBER * 8 + 1); + + int has_indel = -1; + int indel_count = 0; + for (int i = N_SNP - 1; i >= 0; i--) { + //add single SNP + if ((strlen(REF_arr[i]) == 1) && (strlen(ALT_arr[i]) == 1)) { + new_ref_seq[(EXTRA_GAP + SIDE_SIZE + (contig_offset + snp_position - 1) % 8) + + (POS_all[i] - snp_position)] = ALT_arr[i][0];// - 'A' + 'a'; + } else { + has_indel = i; + indel_count++; + } + } + if ((indel_count == 1) && (POS_all[has_indel] == curr_pos)) { + if ((strlen(REF_arr[has_indel]) > mi->k) || (strlen(ALT_arr[has_indel]) > mi->k)) { + return; + } + return; + if ((strlen(REF_arr[has_indel]) > 1) && (strlen(ALT_arr[has_indel]) == 1)) { + add_indel(mi, CHR, REF_arr[has_indel], ALT_arr[has_indel], curr_pos, POS_all[has_indel], p, new_ref_seq); + return; + } + if ((strlen(REF_arr[has_indel]) == 1) && (strlen(ALT_arr[has_indel]) > 1)) { + add_indel(mi, CHR, REF_arr[has_indel], ALT_arr[has_indel], curr_pos, POS_all[has_indel], p, new_ref_seq); + return; + } + } + if (indel_count == N_SNP) { + return; + } + + //Finds minimizer in window + mm128_v minimizer_array = {0, 0, 0}; + mm_sketch(0, &new_ref_seq[EXTRA_GAP + (contig_offset + snp_position - 1) % 8], SIDE_SIZE * 2 + 1, mi->w, mi->k, + 0, mi->flag & MM_I_HPC, &minimizer_array); + + for (int i = 0; i < minimizer_array.n; i++) { + if (minimizer_array.a[i].y < SIDE_SIZE * 2) continue; + if (minimizer_array.a[i].y > (SIDE_SIZE + mi->k) * 2 - 1) continue; + minimizer_array.a[i].y = ((uint64_t)seq_num << 32) + (snp_position - SIDE_SIZE - 1 + minimizer_array.a[i].y / 2) * 2 + + (minimizer_array.a[i].y % 2); + + kv_push(mm128_t, 0, *p, minimizer_array.a[i]); + } +} diff --git a/test/README.md b/test/README.md new file mode 100644 index 0000000..01ee755 --- /dev/null +++ b/test/README.md @@ -0,0 +1,32 @@ +## Tests for minimap2_index_modiifer +There are three tests to check base functionality. + +### Common test +This test create two index files: regular and modified. +```bash +minimap2 -d test/test.mni test/test.fasta +minimap2 -d test/test.modified.mni --vcf-file-with-variants test/test_long_chr1_not_the_same.vcf.gz test/test.fasta +diff test/test.mni test/test.modified.mni +``` + +Modified index file should contains extra string and some strings should be mismatched. + +### Empty VCF test +If input VCF file contains no variants from reference modified index would be same as regular one. +```bash +minimap2 -d test/test.mni test/test.fasta +minimap2 -d test/test.modified.mni --vcf-file-with-variants test/empty.vcf.gz test/test.fasta +diff test/test.mni test/test.modified.mni +``` + +Index files should be the same (empty output after third command). + +### Pseudo-variants test +In this test `test/test_long_chr1_the_same.vcf.gz` contains pseudo-variants (like A -> A, C -> C, etc). This variants would be processed with no effect. +```bash +minimap2 -d test/test.mni test/test.fasta +minimap2 -d test/test.modified.mni --vcf-file-with-variants test/test_long_chr1_the_same.vcf.gz test/test.fasta +diff test/test.mni test/test.modified.mni +``` + +Index files should be the same. diff --git a/test/empty.vcf.gz b/test/empty.vcf.gz new file mode 100644 index 0000000..2c6e5b9 Binary files /dev/null and b/test/empty.vcf.gz differ diff --git a/test/empty.vcf.gz.tbi b/test/empty.vcf.gz.tbi new file mode 100644 index 0000000..b9e0990 Binary files /dev/null and b/test/empty.vcf.gz.tbi differ diff --git a/test/test.1.fq b/test/test.1.fq new file mode 100644 index 0000000..09c5d87 --- /dev/null +++ b/test/test.1.fq @@ -0,0 +1,36 @@ +@111\1 +TTTTTTTTTTTTTTATTTTATTTTTTTCACACAAACACACACACACACAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@112\1 +TTTTTTTATTTTATTTTTTTTTCACACAAACACACACACACACAAAAAAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@113\1 +TTTTTTTTTTATTTTATTTTTTTTCACACAAACACACACACACACAAAAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@114\1 +AAAAAAAAAAAAAAAAAAAAAAAAATAAAAACGTACGACGTACGTTTTTT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@115\1 +AAAAAAAAAAAAAAAAAAAAAAATAAAAACGTACGACGTACGTTTTTTTT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@116\1 +AAAAAAAAAAATAAAAACGTACGACGTACGTTTTTTTTTTTTTTTATTTT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@117\1 +ACACACAAAAAAATATATATATATATATATAAATATATTATATATATATA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@118\1 +ATATATATATAAATATATATATATATATATATATATATATATATATATAT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@119\1 +TATATATATATATATATATATATATATATATATAAATATATATATATAAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/test/test.2.fq b/test/test.2.fq new file mode 100644 index 0000000..6874907 --- /dev/null +++ b/test/test.2.fq @@ -0,0 +1,36 @@ +@111\2 +AAAATAAAAACGTACGACGTACGTTTTTTTTTTTTTTTATTTTATTTTTT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@112\2 +AAAAAAATAAAAACGTACGACGTACGTTTTTTTTTTTTTTTATTTTATTT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@113\2 +TAAAAAAAAAAAAAATAAAAACGTACGACGTACGTTTTTTTTTTTTTTTA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@114\2 +TTTTATTTTATTTTTTTGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@115\2 +TTTTTTGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTATATATGGGGG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@116\2 +GGGGGGGGGTTTTTTTTTTTTTTATATATGGGGGGGGTTTTTTTAAAAAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@117\2 +ATATATATATATATAAATATATATATATAAAAAAAAAAAAAAAATAAAAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@118\2 +AATATATATATATATATATATATATATATATATATATATATAAATATATA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@119\2 +GGGGGGGGGGGGGGGTTTTTTTTTTTTTTATATATGGGGGGGGTTTTTTT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ \ No newline at end of file diff --git a/test/test.fasta b/test/test.fasta new file mode 100644 index 0000000..b9d552f --- /dev/null +++ b/test/test.fasta @@ -0,0 +1,12 @@ +>chr1 +NNNNNNNNNNNTTTTTTTTTTTTTTTTTTTATTTTATTTTTTTTCACACAAACACACACACACACAAAAAAATATATATATATATATATATATATATATATATATATATATATATATATATATATATATATAAATATATAATATATAAAAAAAAAAAAAATAAAAACGTACGACGTACGTTTTTTTTTTTTTTTTTTTTATTTTTTTGGGGGGGGAGGGGGGGGGGTTTTTTTTTTTTTTATATATGGGGGGGGTTTTTTTAAAAAAAAATTTTTTTGGGGGGGGAGGGGGGGGGGTTTTTTTTTTTTTTNNNNNNNNNN +>chr2 +NNNNNNNNNNNTTTTTTTTTTTTTTTTTTTATTTTATTTTTTTTCACACAAACACACACACACACAAAAA +AATATATATATATATATATATATATATATATATATATATATATATATATATATATATATATAAATATATA +ATATATAAAAAAAAAAAAAATAAAAACGTACGACGTACGTTTTTTTTTTTTTTTTTTTTATTTTTTTGGG +GGGGGGGGGGGGGGGGTTTTTTTTTTTTTTATATATGGGGGGGGTTTTTTTAAAAAAAAANNNNNNNNNN +>chr3 +NNNNAGTAGTCTTTTTTTTTTTTTTTTTTTATTTTATTTTTTTTCACACAAACACACACACACACAAAAA +AATATATATATATATATATATATATATATATATATATATATATATATATATATATATATATAAATATATA +ATATATAAAAAAAAAAAAAATAAAAACGTACGACGTACGTTTTTTTTTTTTTTTTTTTTATTTTTTTGGG +GGGGGGGGGGGGGGGGTTTTTTTTTTTTTTATATATGGGGGGGGTTTTTTTAAAAAAAAAGGGGGGGGGG diff --git a/test/test.vcf b/test/test.vcf new file mode 100644 index 0000000..c009525 --- /dev/null +++ b/test/test.vcf @@ -0,0 +1,5 @@ +chr3 277 . G A X +chr3 6 . G C X +chr2 195 . T A X +chr2 146 . T A X +chr2 91 . T A X diff --git a/test/test_long_chr1.vcf.gz b/test/test_long_chr1.vcf.gz new file mode 100644 index 0000000..59fd971 Binary files /dev/null and b/test/test_long_chr1.vcf.gz differ diff --git a/test/test_long_chr1.vcf.gz.tbi b/test/test_long_chr1.vcf.gz.tbi new file mode 100644 index 0000000..25b601d Binary files /dev/null and b/test/test_long_chr1.vcf.gz.tbi differ diff --git a/test/test_long_chr1_not_the_same.vcf.gz b/test/test_long_chr1_not_the_same.vcf.gz new file mode 100644 index 0000000..20647e5 Binary files /dev/null and b/test/test_long_chr1_not_the_same.vcf.gz differ diff --git a/test/test_long_chr1_not_the_same.vcf.gz.tbi b/test/test_long_chr1_not_the_same.vcf.gz.tbi new file mode 100644 index 0000000..1cdb036 Binary files /dev/null and b/test/test_long_chr1_not_the_same.vcf.gz.tbi differ diff --git a/test/test_long_chr1_the_same.vcf.gz b/test/test_long_chr1_the_same.vcf.gz new file mode 100644 index 0000000..d92ccfe Binary files /dev/null and b/test/test_long_chr1_the_same.vcf.gz differ diff --git a/test/test_long_chr1_the_same.vcf.gz.tbi b/test/test_long_chr1_the_same.vcf.gz.tbi new file mode 100644 index 0000000..3fe3431 Binary files /dev/null and b/test/test_long_chr1_the_same.vcf.gz.tbi differ