diff --git a/.gitignore b/.gitignore
index d5a6dd6..f7bf712 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+.vscode/
+htslib/
.cproject
.project
.*.swp
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..6530047
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,49 @@
+FROM phusion/baseimage:jammy-1.0.1
+
+ARG HTSLIB_VERSION=1.17
+
+ENV TERM=xterm-256color \
+ HTSLIB_URL=https://github.com/samtools/htslib/releases/download/${HTSLIB_VERSION}/htslib-${HTSLIB_VERSION}.tar.bz2
+
+COPY . minimap2_index_modifier
+
+# install deps and cleanup apt garbage
+RUN set -eux; \
+ apt-get update; \
+ apt-get install -y \
+ wget \
+ make \
+ autoconf \
+ gcc \
+ libcurl4-openssl-dev \
+ zlib1g-dev \
+ libbz2-dev \
+ liblzma-dev \
+ bzip2 \
+ && rm -rf /var/lib/apt/lists/*;
+
+#install htslib
+RUN set -eux; \
+mkdir temp; \
+cd temp; \
+\
+wget ${HTSLIB_URL}; \
+tar -xf htslib-${HTSLIB_VERSION}.tar.bz2; \
+cd htslib-${HTSLIB_VERSION}; \
+\
+autoreconf -i; \
+./configure; \
+make; \
+make install; \
+cd ../../; \
+rm -rf temp;
+
+#install minimap2_index_modifier
+RUN set -eux; \
+cd minimap2_index_modifier; \
+make; \
+cp minimap2 /usr/local/bin; \
+cd ../; \
+rm -rf minimap2_index_modifier;
+
+ENV LD_LIBRARY_PATH=/usr/local/lib
diff --git a/Makefile b/Makefile
index 4118616..08a38a1 100644
--- a/Makefile
+++ b/Makefile
@@ -2,11 +2,11 @@ CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra
CPPFLAGS= -DHAVE_KALLOC
INCLUDES=
OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o \
- lchain.o align.o hit.o seed.o map.o format.o pe.o esterr.o splitidx.o \
+ lchain.o align.o hit.o seed.o linked_vcf_list.o map.o format.o pe.o esterr.o splitidx.o \
ksw2_ll_sse.o
PROG= minimap2
PROG_EXTRA= sdust minimap2-lite
-LIBS= -lm -lz -lpthread
+LIBS= -lm -lz -lpthread -lhts
ifeq ($(arm_neon),) # if arm_neon is not defined
ifeq ($(sse2only),) # if sse2only is not defined
@@ -53,7 +53,7 @@ minimap2-lite:example.o libminimap2.a
libminimap2.a:$(OBJS)
$(AR) -csru $@ $(OBJS)
-sdust:sdust.c kalloc.o kalloc.h kdq.h kvec.h kseq.h ketopt.h sdust.h
+sdust:sdust.c kalloc.o kalloc.h kdq.h kvec.h kseq.h ketopt.h sdust.h linked_vcf_list.h
$(CC) -D_SDUST_MAIN $(CFLAGS) $< kalloc.o -o $@ -lz
# SSE-specific targets on x86/x86_64
@@ -128,5 +128,6 @@ options.o: mmpriv.h minimap.h bseq.h kseq.h
pe.o: mmpriv.h minimap.h bseq.h kseq.h kvec.h kalloc.h ksort.h
sdust.o: kalloc.h kdq.h kvec.h sdust.h
seed.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h ksort.h
-sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h kseq.h
+linked_vcf_list.o: mmpriv.h minimap.h linked_vcf_list.h
+sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h kseq.h linked_vcf_list.h
splitidx.o: mmpriv.h minimap.h bseq.h kseq.h
diff --git a/README.md b/README.md
index 13c4938..fc64f5d 100644
--- a/README.md
+++ b/README.md
@@ -1,403 +1,53 @@
-[](https://github.com/lh3/minimap2/releases)
-[](https://anaconda.org/bioconda/minimap2)
-[](https://pypi.python.org/pypi/mappy)
-[](https://github.com/lh3/minimap2/actions)
-## Getting Started
-```sh
-git clone https://github.com/lh3/minimap2
-cd minimap2 && make
-# long sequences against a reference genome
-./minimap2 -a test/MT-human.fa test/MT-orang.fa > test.sam
-# create an index first and then map
-./minimap2 -x map-ont -d MT-human-ont.mmi test/MT-human.fa
-./minimap2 -a MT-human-ont.mmi test/MT-orang.fa > test.sam
-# use presets (no test data)
-./minimap2 -ax map-pb ref.fa pacbio.fq.gz > aln.sam # PacBio CLR genomic reads
-./minimap2 -ax map-ont ref.fa ont.fq.gz > aln.sam # Oxford Nanopore genomic reads
-./minimap2 -ax map-hifi ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.19 or later)
-./minimap2 -ax asm20 ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.18 or earlier)
-./minimap2 -ax sr ref.fa read1.fa read2.fa > aln.sam # short genomic paired-end reads
-./minimap2 -ax splice ref.fa rna-reads.fa > aln.sam # spliced long reads (strand unknown)
-./minimap2 -ax splice -uf -k14 ref.fa reads.fa > aln.sam # noisy Nanopore Direct RNA-seq
-./minimap2 -ax splice:hq -uf ref.fa query.fa > aln.sam # Final PacBio Iso-seq or traditional cDNA
-./minimap2 -ax splice --junc-bed anno.bed12 ref.fa query.fa > aln.sam # prioritize on annotated junctions
-./minimap2 -cx asm5 asm1.fa asm2.fa > aln.paf # intra-species asm-to-asm alignment
-./minimap2 -x ava-pb reads.fa reads.fa > overlaps.paf # PacBio read overlap
-./minimap2 -x ava-ont reads.fa reads.fa > overlaps.paf # Nanopore read overlap
-# man page for detailed command line options
-man ./minimap2.1
-```
-
-## Table of Contents
-
-- [Getting Started](#started)
-- [Users' Guide](#uguide)
- - [Installation](#install)
- - [General usage](#general)
- - [Use cases](#cases)
- - [Map long noisy genomic reads](#map-long-genomic)
- - [Map long mRNA/cDNA reads](#map-long-splice)
- - [Find overlaps between long reads](#long-overlap)
- - [Map short accurate genomic reads](#short-genomic)
- - [Full genome/assembly alignment](#full-genome)
- - [Advanced features](#advanced)
- - [Working with >65535 CIGAR operations](#long-cigar)
- - [The cs optional tag](#cs)
- - [Working with the PAF format](#paftools)
- - [Algorithm overview](#algo)
- - [Getting help](#help)
- - [Citing minimap2](#cite)
-- [Developers' Guide](#dguide)
-- [Limitations](#limit)
-
-## Users' Guide
-
-Minimap2 is a versatile sequence alignment program that aligns DNA or mRNA
-sequences against a large reference database. Typical use cases include: (1)
-mapping PacBio or Oxford Nanopore genomic reads to the human genome; (2)
-finding overlaps between long reads with error rate up to ~15%; (3)
-splice-aware alignment of PacBio Iso-Seq or Nanopore cDNA or Direct RNA reads
-against a reference genome; (4) aligning Illumina single- or paired-end reads;
-(5) assembly-to-assembly alignment; (6) full-genome alignment between two
-closely related species with divergence below ~15%.
+Minimap2_index_modifier
+=======================
+Minimap2_index_modifier is a fork of alignment tool [minimap2](https://github.com/lh3/minimap2).
+Unlike the original tool, this can use the variants defined in the VCF file when generating the index, for more accurate alignment.
-For ~10kb noisy reads sequences, minimap2 is tens of times faster than
-mainstream long-read mappers such as BLASR, BWA-MEM, NGMLR and GMAP. It is more
-accurate on simulated long reads and produces biologically meaningful alignment
-ready for downstream analyses. For >100bp Illumina short reads, minimap2 is
-three times as fast as BWA-MEM and Bowtie2, and as accurate on simulated data.
-Detailed evaluations are available from the [minimap2 paper][doi] or the
-[preprint][preprint].
-### Installation
-
-Minimap2 is optimized for x86-64 CPUs. You can acquire precompiled binaries from
-the [release page][release] with:
-```sh
-curl -L https://github.com/lh3/minimap2/releases/download/v2.24/minimap2-2.24_x64-linux.tar.bz2 | tar -jxvf -
-./minimap2-2.24_x64-linux/minimap2
+Minimap2_index_modifier can be used in the same way as the original minimap2.
+To create a modified index use additional parameter `--vcf-file-with-variants `.
+```bash
+minimap2 -d index.mmi --vcf-file-with-variants input.vcf.gz reference.fasta
```
-If you want to compile from the source, you need to have a C compiler, GNU make
-and zlib development files installed. Then type `make` in the source code
-directory to compile. If you see compilation errors, try `make sse2only=1`
-to disable SSE4 code, which will make minimap2 slightly slower.
-Minimap2 also works with ARM CPUs supporting the NEON instruction sets. To
-compile for 32 bit ARM architectures (such as ARMv7), use `make arm_neon=1`. To
-compile for for 64 bit ARM architectures (such as ARMv8), use `make arm_neon=1
-aarch64=1`.
+Use flag `--parse-haplotype` if your VCF contains phased haplotypes.
-Minimap2 can use [SIMD Everywhere (SIMDe)][simde] library for porting
-implementation to the different SIMD instruction sets. To compile using SIMDe,
-use `make -f Makefile.simde`. To compile for ARM CPUs, use `Makefile.simde`
-with the ARM related command lines given above.
+## Contents
+* [Installation](#installation)
+ * [Compiling from source](#compiling-from-source)
+ * [Docker](#docker)
+ * [Dockerhub](#dockerhub)
+* [Pre-built indexes](#pre-built-indexes)
+* [Tests](#tests)
-### General usage
+## Installation
+### Compiling from source
+To compile from source, use this version of tools:
-Without any options, minimap2 takes a reference database and a query sequence
-file as input and produce approximate mapping, without base-level alignment
-(i.e. coordinates are only approximate and no CIGAR in output), in the [PAF format][paf]:
-```sh
-minimap2 ref.fa query.fq > approx-mapping.paf
-```
-You can ask minimap2 to generate CIGAR at the `cg` tag of PAF with:
-```sh
-minimap2 -c ref.fa query.fq > alignment.paf
-```
-or to output alignments in the [SAM format][sam]:
-```sh
-minimap2 -a ref.fa query.fq > alignment.sam
-```
-Minimap2 seamlessly works with gzip'd FASTA and FASTQ formats as input. You
-don't need to convert between FASTA and FASTQ or decompress gzip'd files first.
+* GCC/G++ 11.4.0+
+* HTSlib v1.17
-For the human reference genome, minimap2 takes a few minutes to generate a
-minimizer index for the reference before mapping. To reduce indexing time, you
-can optionally save the index with option **-d** and replace the reference
-sequence file with the index file on the minimap2 command line:
-```sh
-minimap2 -d ref.mmi ref.fa # indexing
-minimap2 -a ref.mmi reads.fq > alignment.sam # alignment
+Command to compile:
+```bash
+cd minimap2_index_modifier && make
```
-***Importantly***, it should be noted that once you build the index, indexing
-parameters such as **-k**, **-w**, **-H** and **-I** can't be changed during
-mapping. If you are running minimap2 for different data types, you will
-probably need to keep multiple indexes generated with different parameters.
-This makes minimap2 different from BWA which always uses the same index
-regardless of query data types.
-### Use cases
-
-Minimap2 uses the same base algorithm for all applications. However, due to the
-different data types it supports (e.g. short vs long reads; DNA vs mRNA reads),
-minimap2 needs to be tuned for optimal performance and accuracy. It is usually
-recommended to choose a preset with option **-x**, which sets multiple
-parameters at the same time. The default setting is the same as `map-ont`.
-
-#### Map long noisy genomic reads
-
-```sh
-minimap2 -ax map-pb ref.fa pacbio-reads.fq > aln.sam # for PacBio CLR reads
-minimap2 -ax map-ont ref.fa ont-reads.fq > aln.sam # for Oxford Nanopore reads
+### Docker
+Clone this repository and build a Docker image as follows.
+```bash
+docker build -t minimap2_index_modifier:2.24 .
```
-The difference between `map-pb` and `map-ont` is that `map-pb` uses
-homopolymer-compressed (HPC) minimizers as seeds, while `map-ont` uses ordinary
-minimizers as seeds. Emperical evaluation suggests HPC minimizers improve
-performance and sensitivity when aligning PacBio CLR reads, but hurt when aligning
-Nanopore reads.
-
-#### Map long mRNA/cDNA reads
-```sh
-minimap2 -ax splice:hq -uf ref.fa iso-seq.fq > aln.sam # PacBio Iso-seq/traditional cDNA
-minimap2 -ax splice ref.fa nanopore-cdna.fa > aln.sam # Nanopore 2D cDNA-seq
-minimap2 -ax splice -uf -k14 ref.fa direct-rna.fq > aln.sam # Nanopore Direct RNA-seq
-minimap2 -ax splice --splice-flank=no SIRV.fa SIRV-seq.fa # mapping against SIRV control
+### Dockerhub
+Clone Docker image from Dockerhub.
+```bash
+docker pull egorguga/minimap2_index_modifier:2.24
```
-There are different long-read RNA-seq technologies, including tranditional
-full-length cDNA, EST, PacBio Iso-seq, Nanopore 2D cDNA-seq and Direct RNA-seq.
-They produce data of varying quality and properties. By default, `-x splice`
-assumes the read orientation relative to the transcript strand is unknown. It
-tries two rounds of alignment to infer the orientation and write the strand to
-the `ts` SAM/PAF tag if possible. For Iso-seq, Direct RNA-seq and tranditional
-full-length cDNAs, it would be desired to apply `-u f` to force minimap2 to
-consider the forward transcript strand only. This speeds up alignment with
-slight improvement to accuracy. For noisy Nanopore Direct RNA-seq reads, it is
-recommended to use a smaller k-mer size for increased sensitivity to the first
-or the last exons.
-
-Minimap2 rates an alignment by the score of the max-scoring sub-segment,
-*excluding* introns, and marks the best alignment as primary in SAM. When a
-spliced gene also has unspliced pseudogenes, minimap2 does not intentionally
-prefer spliced alignment, though in practice it more often marks the spliced
-alignment as the primary. By default, minimap2 outputs up to five secondary
-alignments (i.e. likely pseudogenes in the context of RNA-seq mapping). This
-can be tuned with option **-N**.
-
-For long RNA-seq reads, minimap2 may produce chimeric alignments potentially
-caused by gene fusions/structural variations or by an intron longer than the
-max intron length **-G** (200k by default). For now, it is not recommended to
-apply an excessively large **-G** as this slows down minimap2 and sometimes
-leads to false alignments.
-
-It is worth noting that by default `-x splice` prefers GT[A/G]..[C/T]AG
-over GT[C/T]..[A/G]AG, and then over other splicing signals. Considering
-one additional base improves the junction accuracy for noisy reads, but
-reduces the accuracy when aligning against the widely used SIRV control data.
-This is because SIRV does not honor the evolutionarily conservative splicing
-signal. If you are studying SIRV, you may apply `--splice-flank=no` to let
-minimap2 only model GT..AG, ignoring the additional base.
-
-Since v2.17, minimap2 can optionally take annotated genes as input and
-prioritize on annotated splice junctions. To use this feature, you can
-```sh
-paftools.js gff2bed anno.gff > anno.bed
-minimap2 -ax splice --junc-bed anno.bed ref.fa query.fa > aln.sam
-```
-Here, `anno.gff` is the gene annotation in the GTF or GFF3 format (`gff2bed`
-automatically tests the format). The output of `gff2bed` is in the 12-column
-BED format, or the BED12 format. With the `--junc-bed` option, minimap2 adds a
-bonus score (tuned by `--junc-bonus`) if an aligned junction matches a junction
-in the annotation. Option `--junc-bed` also takes 5-column BED, including the
-strand field. In this case, each line indicates an oriented junction.
-
-#### Find overlaps between long reads
-
-```sh
-minimap2 -x ava-pb reads.fq reads.fq > ovlp.paf # PacBio CLR read overlap
-minimap2 -x ava-ont reads.fq reads.fq > ovlp.paf # Oxford Nanopore read overlap
-```
-Similarly, `ava-pb` uses HPC minimizers while `ava-ont` uses ordinary
-minimizers. It is usually not recommended to perform base-level alignment in
-the overlapping mode because it is slow and may produce false positive
-overlaps. However, if performance is not a concern, you may try to add `-a` or
-`-c` anyway.
-
-#### Map short accurate genomic reads
-
-```sh
-minimap2 -ax sr ref.fa reads-se.fq > aln.sam # single-end alignment
-minimap2 -ax sr ref.fa read1.fq read2.fq > aln.sam # paired-end alignment
-minimap2 -ax sr ref.fa reads-interleaved.fq > aln.sam # paired-end alignment
-```
-When two read files are specified, minimap2 reads from each file in turn and
-merge them into an interleaved stream internally. Two reads are considered to
-be paired if they are adjacent in the input stream and have the same name (with
-the `/[0-9]` suffix trimmed if present). Single- and paired-end reads can be
-mixed.
-
-Minimap2 does not work well with short spliced reads. There are many capable
-RNA-seq mappers for short reads.
-
-#### Full genome/assembly alignment
-
-```sh
-minimap2 -ax asm5 ref.fa asm.fa > aln.sam # assembly to assembly/ref alignment
-```
-For cross-species full-genome alignment, the scoring system needs to be tuned
-according to the sequence divergence.
-
-### Advanced features
-
-#### Working with >65535 CIGAR operations
-
-Due to a design flaw, BAM does not work with CIGAR strings with >65535
-operations (SAM and CRAM work). However, for ultra-long nanopore reads minimap2
-may align ~1% of read bases with long CIGARs beyond the capability of BAM. If
-you convert such SAM/CRAM to BAM, Picard and recent samtools will throw an
-error and abort. Older samtools and other tools may create corrupted BAM.
-
-To avoid this issue, you can add option `-L` at the minimap2 command line.
-This option moves a long CIGAR to the `CG` tag and leaves a fully clipped CIGAR
-at the SAM CIGAR column. Current tools that don't read CIGAR (e.g. merging and
-sorting) still work with such BAM records; tools that read CIGAR will
-effectively ignore these records. It has been decided that future tools
-will seamlessly recognize long-cigar records generated by option `-L`.
-
-**TL;DR**: if you work with ultra-long reads and use tools that only process
-BAM files, please add option `-L`.
-
-#### The cs optional tag
-
-The `cs` SAM/PAF tag encodes bases at mismatches and INDELs. It matches regular
-expression `/(:[0-9]+|\*[a-z][a-z]|[=\+\-][A-Za-z]+)+/`. Like CIGAR, `cs`
-consists of series of operations. Each leading character specifies the
-operation; the following sequence is the one involved in the operation.
-
-The `cs` tag is enabled by command line option `--cs`. The following alignment,
-for example:
-```txt
-CGATCGATAAATAGAGTAG---GAATAGCA
-|||||| |||||||||| |||| |||
-CGATCG---AATAGAGTAGGTCGAATtGCA
-```
-is represented as `:6-ata:10+gtc:4*at:3`, where `:[0-9]+` represents an
-identical block, `-ata` represents a deletion, `+gtc` an insertion and `*at`
-indicates reference base `a` is substituted with a query base `t`. It is
-similar to the `MD` SAM tag but is standalone and easier to parse.
-
-If `--cs=long` is used, the `cs` string also contains identical sequences in
-the alignment. The above example will become
-`=CGATCG-ata=AATAGAGTAG+gtc=GAAT*at=GCA`. The long form of `cs` encodes both
-reference and query sequences in one string. The `cs` tag also encodes intron
-positions and splicing signals (see the [minimap2 manpage][manpage-cs] for
-details).
-
-#### Working with the PAF format
-
-Minimap2 also comes with a (java)script [paftools.js](misc/paftools.js) that
-processes alignments in the PAF format. It calls variants from
-assembly-to-reference alignment, lifts over BED files based on alignment,
-converts between formats and provides utilities for various evaluations. For
-details, please see [misc/README.md](misc/README.md).
-
-### Algorithm overview
-
-In the following, minimap2 command line options have a dash ahead and are
-highlighted in bold. The description may help to tune minimap2 parameters.
-
-1. Read **-I** [=*4G*] reference bases, extract (**-k**,**-w**)-minimizers and
- index them in a hash table.
-
-2. Read **-K** [=*200M*] query bases. For each query sequence, do step 3
- through 7:
-
-3. For each (**-k**,**-w**)-minimizer on the query, check against the reference
- index. If a reference minimizer is not among the top **-f** [=*2e-4*] most
- frequent, collect its the occurrences in the reference, which are called
- *seeds*.
-
-4. Sort seeds by position in the reference. Chain them with dynamic
- programming. Each chain represents a potential mapping. For read
- overlapping, report all chains and then go to step 8. For reference mapping,
- do step 5 through 7:
-
-5. Let *P* be the set of primary mappings, which is an empty set initially. For
- each chain from the best to the worst according to their chaining scores: if
- on the query, the chain overlaps with a chain in *P* by **--mask-level**
- [=*0.5*] or higher fraction of the shorter chain, mark the chain as
- *secondary* to the chain in *P*; otherwise, add the chain to *P*.
-
-6. Retain all primary mappings. Also retain up to **-N** [=*5*] top secondary
- mappings if their chaining scores are higher than **-p** [=*0.8*] of their
- corresponding primary mappings.
-
-7. If alignment is requested, filter out an internal seed if it potentially
- leads to both a long insertion and a long deletion. Extend from the
- left-most seed. Perform global alignments between internal seeds. Split the
- chain if the accumulative score along the global alignment drops by **-z**
- [=*400*], disregarding long gaps. Extend from the right-most seed. Output
- chains and their alignments.
-
-8. If there are more query sequences in the input, go to step 2 until no more
- queries are left.
-
-9. If there are more reference sequences, reopen the query file from the start
- and go to step 1; otherwise stop.
-
-### Getting help
-
-Manpage [minimap2.1][manpage] provides detailed description of minimap2
-command line options and optional tags. The [FAQ](FAQ.md) page answers several
-frequently asked questions. If you encounter bugs or have further questions or
-requests, you can raise an issue at the [issue page][issue]. There is not a
-specific mailing list for the time being.
-
-### Citing minimap2
-
-If you use minimap2 in your work, please cite:
-
-> Li, H. (2018). Minimap2: pairwise alignment for nucleotide sequences.
-> *Bioinformatics*, **34**:3094-3100. [doi:10.1093/bioinformatics/bty191][doi]
-
-## Developers' Guide
-
-Minimap2 is not only a command line tool, but also a programming library.
-It provides C APIs to build/load index and to align sequences against the
-index. File [example.c](example.c) demonstrates typical uses of C APIs. Header
-file [minimap.h](minimap.h) gives more detailed API documentation. Minimap2
-aims to keep APIs in this header stable. File [mmpriv.h](mmpriv.h) contains
-additional private APIs which may be subjected to changes frequently.
-
-This repository also provides Python bindings to a subset of C APIs. File
-[python/README.rst](python/README.rst) gives the full documentation;
-[python/minimap2.py](python/minimap2.py) shows an example. This Python
-extension, mappy, is also [available from PyPI][mappypypi] via `pip install
-mappy` or [from BioConda][mappyconda] via `conda install -c bioconda mappy`.
-
-## Limitations
-
-* Minimap2 may produce suboptimal alignments through long low-complexity
- regions where seed positions may be suboptimal. This should not be a big
- concern because even the optimal alignment may be wrong in such regions.
-
-* Minimap2 requires SSE2 instructions on x86 CPUs or NEON on ARM CPUs. It is
- possible to add non-SIMD support, but it would make minimap2 slower by
- several times.
-
-* Minimap2 does not work with a single query or database sequence ~2
- billion bases or longer (2,147,483,647 to be exact). The total length of all
- sequences can well exceed this threshold.
-
-* Minimap2 often misses small exons.
-
+## Pre-built indexes
+This [link](https://nextcloud.ispras.ru/index.php/s/wcb9PpZyr8Gb5CC) contains pre-built modified indexes for next references:
+* GRCh38 [(GCA_000001405.15)](https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/references/GRCh38/)
+* GRCh37 [(hs37d5)](https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/references/GRCh37/)
-[paf]: https://github.com/lh3/miniasm/blob/master/PAF.md
-[sam]: https://samtools.github.io/hts-specs/SAMv1.pdf
-[minimap]: https://github.com/lh3/minimap
-[smartdenovo]: https://github.com/ruanjue/smartdenovo
-[longislnd]: https://www.ncbi.nlm.nih.gov/pubmed/27667791
-[gaba]: https://github.com/ocxtal/libgaba
-[ksw2]: https://github.com/lh3/ksw2
-[preprint]: https://arxiv.org/abs/1708.01492
-[release]: https://github.com/lh3/minimap2/releases
-[mappypypi]: https://pypi.python.org/pypi/mappy
-[mappyconda]: https://anaconda.org/bioconda/mappy
-[issue]: https://github.com/lh3/minimap2/issues
-[k8]: https://github.com/attractivechaos/k8
-[manpage]: https://lh3.github.io/minimap2/minimap2.html
-[manpage-cs]: https://lh3.github.io/minimap2/minimap2.html#10
-[doi]: https://doi.org/10.1093/bioinformatics/bty191
-[smide]: https://github.com/nemequ/simde
-[unimap]: https://github.com/lh3/unimap
+## Tests
+See [test/tests.md](test/tests.md) for more details.
diff --git a/index.c b/index.c
index 0d8a2ae..d7133e4 100644
--- a/index.c
+++ b/index.c
@@ -15,6 +15,12 @@
#include "kvec.h"
#include "khash.h"
+#include
+#include
+#include
+#include
+#include
+
#define idx_hash(a) ((a)>>1)
#define idx_eq(a, b) ((a)>>1 == (b)>>1)
KHASH_INIT(idx, uint64_t, uint64_t, 1, idx_hash, idx_eq)
@@ -263,7 +269,7 @@ static void worker_post(void *g, long i, int tid)
kfree(0, b->a.a);
b->a.n = b->a.m = 0, b->a.a = 0;
}
-
+
static void mm_idx_post(mm_idx_t *mi, int n_threads)
{
kt_for(n_threads, worker_post, mi, 1<b);
@@ -282,6 +288,7 @@ typedef struct {
uint64_t batch_size, sum_len;
mm_bseq_file_t *fp;
mm_idx_t *mi;
+ char * vcf_with_variants;
} pipeline_t;
typedef struct {
@@ -355,15 +362,68 @@ static void *worker_pipeline(void *shared, int step, void *in)
} else free(s);
} else if (step == 1) { // step 1: compute sketch
step_t *s = (step_t*)in;
+
for (i = 0; i < s->n_seq; ++i) {
+ //printf("SEQ %d %s\n", s->n_seq, s->seq[i].name);
mm_bseq1_t *t = &s->seq[i];
if (t->l_seq > 0)
mm_sketch(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->mi->flag&MM_I_HPC, &s->a);
else if (mm_verbose >= 2)
fprintf(stderr, "[WARNING] the length database sequence '%s' is 0\n", t->name);
+
+ if(p->vcf_with_variants && strcmp(p->vcf_with_variants, "")) {
+ mm_idx_manipulate_phased(p->mi, p->vcf_with_variants, &s->a, s->seq[i].name);
+ }
+
free(t->seq); free(t->name);
}
+
free(s->seq); s->seq = 0;
+
+ // sort by minimizer
+ radix_sort_128x(s->a.a, s->a.a + s->a.n);
+
+ // stay only unique
+ //#### why 16, sizeof(mm128_v)?
+ mm128_t *tmp = (mm128_t*)calloc(s->a.n, 16);
+
+
+ size_t ptr_p, ptr_tmp, beg_tmp, end_tmp;
+ tmp[0].x = s->a.a[0].x;
+ tmp[0].y = s->a.a[0].y;
+ beg_tmp = 0;
+ end_tmp = 1;
+
+ for (ptr_p = 1; ptr_p < s->a.n; ptr_p++) {
+ if(s->a.a[ptr_p].x == tmp[beg_tmp].x) {
+ int exist = 0;
+ for (ptr_tmp = beg_tmp; ptr_tmp < end_tmp; ptr_tmp++) {
+ if (s->a.a[ptr_p].y == tmp[ptr_tmp].y && s->a.a[ptr_p].x == tmp[ptr_tmp].x) {
+ exist = 1;
+ break;
+ }
+ }
+ if (exist == 0) {
+ tmp[end_tmp].x = s->a.a[ptr_p].x;
+ tmp[end_tmp].y = s->a.a[ptr_p].y;
+ end_tmp++;
+ }
+ }
+ else {
+ tmp[end_tmp].x = s->a.a[ptr_p].x;
+ tmp[end_tmp].y = s->a.a[ptr_p].y;
+ beg_tmp = end_tmp;
+ end_tmp++;
+ }
+ }
+
+ tmp = (mm128_t *) realloc(tmp, sizeof(mm128_t) * end_tmp);
+
+ //#### do we need realloc s->a.a ?
+ memcpy(s->a.a, tmp, end_tmp * 16);
+ free(tmp);
+ s->a.n = end_tmp;
+
return s;
} else if (step == 2) { // dispatch sketch to buckets
step_t *s = (step_t*)in;
@@ -373,7 +433,7 @@ static void *worker_pipeline(void *shared, int step, void *in)
return 0;
}
-mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int flag, int mini_batch_size, int n_threads, uint64_t batch_size)
+mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int flag, int mini_batch_size, int n_threads, uint64_t batch_size, char * vcf_with_variants)
{
pipeline_t pl;
if (fp == 0 || mm_bseq_eof(fp)) return 0;
@@ -382,8 +442,10 @@ mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int flag, int mini
pl.batch_size = batch_size;
pl.fp = fp;
pl.mi = mm_idx_init(w, k, b, flag);
+ pl.vcf_with_variants = vcf_with_variants;
kt_pipeline(n_threads < 3? n_threads : 3, worker_pipeline, &pl, 3);
+
if (mm_verbose >= 3)
fprintf(stderr, "[M::%s::%.3f*%.2f] collected minimizers\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0));
@@ -394,13 +456,13 @@ mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int flag, int mini
return pl.mi;
}
-mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads) // a simpler interface; deprecated
+mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads, char * vcf_with_variants) // a simpler interface; deprecated
{
mm_bseq_file_t *fp;
mm_idx_t *mi;
fp = mm_bseq_open(fn);
if (fp == 0) return 0;
- mi = mm_idx_gen(fp, w, k, 14, flag, 1<<18, n_threads, UINT64_MAX);
+ mi = mm_idx_gen(fp, w, k, 14, flag, 1<<18, n_threads, UINT64_MAX, vcf_with_variants);
mm_bseq_close(fp);
return mi;
}
@@ -459,6 +521,51 @@ mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const cha
* index I/O *
*************/
+void mm_idx_to_txt(FILE *fp, const mm_idx_t *mi)
+{
+ uint64_t sum_len = 0;
+ uint32_t x[5], i;
+ x[0] = mi->w, x[1] = mi->k, x[2] = mi->b, x[3] = mi->n_seq, x[4] = mi->flag;
+ fprintf(fp, "%s\n", MM_IDX_MAGIC);
+ fprintf(fp, "%u %u %u %u %u\n", x[0], x[1], x[2], x[3], x[4]);
+
+ for (i = 0; i < mi->n_seq; ++i) {
+ if (mi->seq[i].name) {
+ uint8_t l = strlen(mi->seq[i].name);
+ fprintf(fp, "%u\n", l);
+ fprintf(fp, "%s\n", mi->seq[i].name);
+ } else {
+ fprintf(fp, "0\n");
+ }
+ fprintf(fp, "%u\n", mi->seq[i].len);
+ sum_len += mi->seq[i].len;
+ }
+ for (i = 0; i < 1<b; ++i) {
+ mm_idx_bucket_t *b = &mi->B[i];
+ khint_t k;
+ idxhash_t *h = (idxhash_t*)b->h;
+ uint32_t size = h? h->size : 0;
+ fprintf(fp, "%i\n", b->n);
+ for (int j = 0; j < b->n; j++)
+ fprintf(fp, "\t%lu\n", b->p[j]);
+ fprintf(fp, "%u\n", size);
+ if (size == 0)
+ continue;
+ for (k = 0; k < kh_end(h); ++k) {
+ uint64_t x[3];
+ if (!kh_exist(h, k))
+ continue;
+ x[0] = kh_key(h, k), x[1] = kh_val(h, k);
+ fprintf(fp, "%lu\t%lu\n", x[0], x[1]);
+ }
+ }
+ if (!(mi->flag & MM_I_NO_SEQ)) {
+ for (int i = 0; i < (sum_len + 7) / 8; i++)
+ fprintf(fp, "%u\n", mi->S[i]);
+ }
+ fflush(fp);
+}
+
void mm_idx_dump(FILE *fp, const mm_idx_t *mi)
{
uint64_t sum_len = 0;
@@ -500,6 +607,67 @@ void mm_idx_dump(FILE *fp, const mm_idx_t *mi)
fflush(fp);
}
+mm_idx_t *mm_idx_load_from_txt(FILE *fp)
+{
+ char magic[4];
+ uint32_t x[5], i;
+ uint64_t sum_len = 0;
+ mm_idx_t *mi;
+
+ fscanf(fp, "%s\n", &magic);
+
+ if (strncmp(magic, MM_IDX_MAGIC, 4) != 0) return 0;
+ fscanf(fp, "%u %u %u %u %u\n", &x[0], &x[1], &x[2], &x[3], &x[4]);
+
+ mi = mm_idx_init(x[0], x[1], x[2], x[4]);
+ mi->n_seq = x[3];
+ mi->seq = (mm_idx_seq_t*)kcalloc(mi->km, mi->n_seq, sizeof(mm_idx_seq_t));
+ for (i = 0; i < mi->n_seq; ++i) {
+ uint8_t l;
+ mm_idx_seq_t *s = &mi->seq[i];
+ fscanf(fp, "%u\n", &l);
+ if (l) {
+ s->name = (char*)kmalloc(mi->km, l + 1);
+ fscanf(fp, "%s\n", s->name);
+ s->name[l] = 0;
+ }
+ fscanf(fp, "%u\n", &s->len);
+ s->offset = sum_len;
+ s->is_alt = 0;
+ sum_len += s->len;
+ }
+ for (i = 0; i < 1<b; ++i) {
+ mm_idx_bucket_t *b = &mi->B[i];
+ uint32_t j, size;
+ khint_t k;
+ idxhash_t *h;
+ fscanf(fp, "%i\n", &b->n);
+ b->p = (uint64_t*)malloc(b->n * 8);
+ for (int j = 0; j < b->n; j++) {
+ fscanf(fp, "\t%lu\n", &b->p[j]);
+ }
+ fscanf(fp, "%u\n", &size);
+ if (size == 0) continue;
+ b->h = h = kh_init(idx);
+ kh_resize(idx, h, size);
+ for (j = 0; j < size; ++j) {
+ uint64_t x[2];
+ int absent;
+ fscanf(fp, "%lu\t%lu\n", &x[0], &x[1]);
+ k = kh_put(idx, h, x[0], &absent);
+ assert(absent);
+ kh_val(h, k) = x[1];
+ }
+ }
+ if (!(mi->flag & MM_I_NO_SEQ)) {
+ mi->S = (uint32_t*)malloc((sum_len + 7) / 8 * 4);
+ for (int i = 0; i < (sum_len + 7) / 8; i++)
+ fscanf(fp, "%u\n", &mi->S[i]);
+ }
+ return mi;
+}
+
+
mm_idx_t *mm_idx_load(FILE *fp)
{
char magic[4];
@@ -605,7 +773,7 @@ void mm_idx_reader_close(mm_idx_reader_t *r)
free(r);
}
-mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads)
+mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads, char * vcf_with_variants)
{
mm_idx_t *mi;
if (r->is_idx) {
@@ -613,9 +781,13 @@ mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads)
if (mi && mm_verbose >= 2 && (mi->k != r->opt.k || mi->w != r->opt.w || (mi->flag&MM_I_HPC) != (r->opt.flag&MM_I_HPC)))
fprintf(stderr, "[WARNING]\033[1;31m Indexing parameters (-k, -w or -H) overridden by parameters used in the prebuilt index.\033[0m\n");
} else
- mi = mm_idx_gen(r->fp.seq, r->opt.w, r->opt.k, r->opt.bucket_bits, r->opt.flag, r->opt.mini_batch_size, n_threads, r->opt.batch_size);
+ {
+ mi = mm_idx_gen(r->fp.seq, r->opt.w, r->opt.k, r->opt.bucket_bits, r->opt.flag, r->opt.mini_batch_size, n_threads, r->opt.batch_size, vcf_with_variants);
+ }
if (mi) {
- if (r->fp_out) mm_idx_dump(r->fp_out, mi);
+ if (r->fp_out) {
+ mm_idx_dump(r->fp_out, mi);
+ }
mi->index = r->n_parts++;
}
return mi;
diff --git a/linked_vcf_list.c b/linked_vcf_list.c
new file mode 100644
index 0000000..2cfc550
--- /dev/null
+++ b/linked_vcf_list.c
@@ -0,0 +1,323 @@
+// Copyright 2023 karpulevich
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+#include
+#include
+#include
+
+#include
+#include "mmpriv.h"
+#include "minimap.h"
+#include "linked_vcf_list.h"
+
+struct node *head = NULL;
+struct node *current = NULL;
+
+// display the list
+void printList(){
+ struct node *p = head;
+ printf("\n[");
+
+ //start from the beginning
+ while(p != NULL) {
+ printf(" %lu ",p->pos);
+ p = p->next;
+ }
+ printf("]");
+}
+
+// display the GT list
+void printGTList(bcf_hdr_t *hdr){
+ struct node *p = head;
+
+
+ //start from the beginning
+ while(p != NULL) {
+ int32_t *gt_arr = NULL, ngt_arr = 0;
+ int ngt;
+ ngt = bcf_get_genotypes(hdr, p->rec, >_arr, &ngt_arr);
+ if ( ngt > 0 ) {
+ int i, nsmpl = bcf_hdr_nsamples(hdr);
+ printf("%s %lu REF:%s ALT:%s\n", bcf_hdr_id2name(hdr, p->CHR_ID), p->pos, p->REF, p->ALT);
+
+ int max_ploidy = ngt/nsmpl;
+ for (i=0; inext;
+ }
+}
+
+int ifexists(char* z, const int u, const int s, char* v)
+{
+ for (int i = 0; i < u; i++)
+ if (!strncmp(&z[i * s], v, s)) {
+ return 1;
+ }
+ return 0;
+}
+
+void calculate_haplotypes(mm_idx_t * mi, bcf_hdr_t *hdr, struct node *window_start_pointer, struct node *current_pointer, unsigned long curr_pos, mm128_v *p) {
+ //Even though vcf uses 1-based indexing (i.e. first base is base 1), htslib internally uses 0-based indexing (i.e. bcf1_t::pos is 0 based).
+ //http://wresch.github.io/2014/11/18/process-vcf-file-with-htslib.html
+ //so we need use (pos + 1)
+ struct node *c_pointer = current_pointer;
+ struct node *w_start_pointer = window_start_pointer;
+
+ int MAX_SNP = 10;
+
+ char *gt_array = NULL;
+ int arr_i, arr_j, arr_len;
+
+ if (!(mi->flag & MM_PARSE_HT)) {
+ int snp_num = 0;
+ while (w_start_pointer != c_pointer) {
+ w_start_pointer = w_start_pointer->next;
+ snp_num += 1;
+ }
+ if (snp_num == MAX_SNP) {
+ return;
+ }
+ arr_i = (int) pow(2, snp_num);
+ arr_j = snp_num + 1;
+ gt_array = (char *) calloc(arr_i * arr_j, sizeof(char));
+ for (unsigned i = 0; i < (unsigned) pow(2, snp_num); i++) {
+ int k = 0;
+ for (int j = 1 << (snp_num - 1); j > 0; j = j / 2) {
+ gt_array[i * arr_j + k] = (i & j) ? '1' : '0';
+ k++;
+ }
+ }
+ arr_len = arr_i;
+ }
+ else {
+ MAX_SNP = 100;
+ int sample_num = bcf_hdr_nsamples(hdr);
+ int snp_num = 0;
+ arr_i = sample_num * 2;
+ arr_j = MAX_SNP + 1;
+ gt_array = (char *) calloc(arr_i * arr_j, sizeof(char));
+ while (w_start_pointer != c_pointer) {
+ int32_t *gt_arr = NULL, ngt_arr = 0;
+ int ngt = bcf_get_genotypes(hdr, w_start_pointer->rec, >_arr, &ngt_arr);
+
+ if (ngt > 0) {
+ int max_ploidy = ngt / sample_num;
+
+ for (int i = 0; i < sample_num; i++) {
+ int32_t *ptr = gt_arr + i * max_ploidy;
+ gt_array[i * max_ploidy * arr_j + snp_num] = bcf_gt_allele(ptr[0]) + '0';
+ gt_array[(i * max_ploidy + 1) * arr_j + snp_num] = bcf_gt_allele(ptr[1]) + '0';
+ }
+ free(gt_arr);
+ }
+
+ w_start_pointer = w_start_pointer->next;
+ snp_num += 1;
+ if (snp_num > MAX_SNP) {
+ free(gt_array);
+ return;
+ }
+ }
+
+ //remove non unique strings
+ //https://subscription.packtpub.com/book/programming/9781838641108/1/ch01lvl1sec06/finding-the-unique-elements-in-an-array
+ int k = 1;
+ for (int i = 1; i < sample_num * 2; i++) {
+ if (!ifexists(gt_array, k, arr_j, >_array[i * arr_j])) {
+ if (i != k) memcpy(>_array[k * arr_j], >_array[i * arr_j], arr_j * sizeof(char));
+ k++;
+ }
+ }
+ arr_len = k;
+ }
+
+ //prepare SNP combinations
+ //Array format:
+ //REF_arr char array - REF (for control)
+ //ALT_arr char array - ALT
+ //POS_all ulong array - positions
+ //CHR - chromosome
+ //N_SNP - length
+ const char * CHR = bcf_hdr_id2name(hdr, window_start_pointer->CHR_ID);
+
+ for(int i = 0; iREF;
+ ALT_arr[N_SNP] = local_w_start_pointer->ALT;
+ POS_all[N_SNP] = (unsigned long)(local_w_start_pointer->pos + 1);
+ N_SNP += 1;
+ }
+
+ local_snp_num += 1;
+ local_w_start_pointer = local_w_start_pointer->next;
+ }
+ if (N_SNP > 0) {
+ add_variants(mi, CHR, REF_arr, ALT_arr, POS_all, N_SNP, curr_pos + 1, p);
+ }
+ }
+ free(gt_array);
+}
+
+void handleGTList(mm_idx_t * mi, bcf_hdr_t *hdr, mm128_v *p){
+ struct node *window_start_pointer = head;
+ struct node *current_pointer = head;
+ struct node *window_end_pointer = head;
+
+ int gap = mi->k - 1;
+
+ if (head == NULL)
+ return;
+
+ while(window_end_pointer->next != NULL)
+ {
+ while(window_end_pointer != NULL && window_end_pointer->pos > (current_pointer->pos - gap)){
+ window_end_pointer = window_end_pointer->next;
+ }
+
+ while(window_start_pointer->pos > (current_pointer->pos + gap)){
+ window_start_pointer = window_start_pointer->next;
+ }
+
+ //calculate start and end pointers
+ calculate_haplotypes(mi, hdr, window_start_pointer, window_end_pointer, current_pointer->pos, p);
+
+ current_pointer = current_pointer->next;
+ window_end_pointer = current_pointer;
+ }
+
+ //last SNP batch
+ while(window_start_pointer->pos > current_pointer->pos + gap){
+ window_start_pointer = window_start_pointer->next;
+ }
+
+ calculate_haplotypes(mi, hdr, window_start_pointer, window_end_pointer->next, current_pointer->pos, p);
+}
+
+//insertion at the beginning
+void insertatbegin(unsigned long pos, bcf1_t *rec, int CHR_ID, char * REF, char * ALT){
+ //create a link
+ struct node *lk = (struct node*) malloc(sizeof(struct node));
+ lk->pos = pos;
+ lk->rec = rec;
+ lk->CHR_ID = CHR_ID;
+ lk->REF = REF;
+ lk->ALT = ALT;
+
+ // point it to old first node
+ lk->next = head;
+
+ //point first to new first node
+ head = lk;
+}
+
+void insertatend(unsigned long pos, bcf1_t *rec, int CHR_ID, char * REF, char * ALT){
+ //create a link
+ struct node *lk = (struct node*) malloc(sizeof(struct node));
+ lk->pos = pos;
+ lk->rec = rec;
+ lk->CHR_ID = CHR_ID;
+ lk->REF = REF;
+ lk->ALT = ALT;
+
+ struct node *linkedlist = head;
+
+ // point it to old first node
+ while(linkedlist->next != NULL)
+ linkedlist = linkedlist->next;
+
+ //point first to new first node
+ linkedlist->next = lk;
+}
+void insertafternode(struct node *list, unsigned long pos, bcf1_t *rec, int CHR_ID, char * REF, char * ALT){
+ struct node *lk = (struct node*) malloc(sizeof(struct node));
+ lk->pos = pos;
+ lk->rec = rec;
+ lk->CHR_ID = CHR_ID;
+ lk->REF = REF;
+ lk->ALT = ALT;
+
+ lk->next = list->next;
+ list->next = lk;
+}
+
+void deleteatbegin(){
+ struct node *temp = head;
+ head = temp->next;
+ free(temp->ALT);
+ free(temp->REF);
+ bcf_destroy(temp->rec);
+ free(temp);
+}
+
+void deleteList(){
+ while (head != NULL)
+ {
+ deleteatbegin();
+ }
+}
+
+bool isListEmpty(){
+ return head == NULL;
+}
+
+void deletenode(int key){
+ struct node *temp = head, *prev;
+ if (temp != NULL && temp->pos == key) {
+ head = temp->next;
+ free(temp);
+ return;
+ }
+
+ // Find the key to be deleted
+ while (temp != NULL && temp->pos != key) {
+ prev = temp;
+ temp = temp->next;
+ }
+
+ // If the key is not present
+ if (temp == NULL) return;
+
+ // Remove the node
+ prev->next = temp->next;
+ free(temp);
+}
+
+int searchlist(int key){
+ struct node *temp = head;
+ while(temp != NULL) {
+ if (temp->pos == key) {
+ return 1;
+ }
+ temp=temp->next;
+ }
+ return 0;
+}
\ No newline at end of file
diff --git a/linked_vcf_list.h b/linked_vcf_list.h
new file mode 100644
index 0000000..0504865
--- /dev/null
+++ b/linked_vcf_list.h
@@ -0,0 +1,29 @@
+// Copyright 2023 karpulevich
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+
+struct node {
+ unsigned long pos;
+ bcf1_t *rec;
+ int CHR_ID;
+ char * REF;
+ char * ALT;
+ struct node *next;
+};
+
+bool isListEmpty();
+void handleGTList(mm_idx_t * mi, bcf_hdr_t *hdr, mm128_v *p);
+void insertatbegin(unsigned long pos, bcf1_t *rec, int CHR_ID, char * REF, char * ALT);
+void deleteList();
diff --git a/main.c b/main.c
index 0be9933..93dbf04 100644
--- a/main.c
+++ b/main.c
@@ -78,6 +78,8 @@ static ko_longopt_t long_options[] = {
{ "chain-skip-scale",ko_required_argument,351 },
{ "print-chains", ko_no_argument, 352 },
{ "no-hash-name", ko_no_argument, 353 },
+ { "vcf-file-with-variants",ko_required_argument,354 },
+ { "parse-haplotype",ko_no_argument, 355 },
{ "help", ko_no_argument, 'h' },
{ "max-intron-len", ko_required_argument, 'G' },
{ "version", ko_no_argument, 'V' },
@@ -127,6 +129,7 @@ int main(int argc, char *argv[])
mm_idxopt_t ipt;
int i, c, n_threads = 3, n_parts, old_best_n = -1;
char *fnw = 0, *rg = 0, *junc_bed = 0, *s, *alt_list = 0;
+ char *vcf_with_variants = 0;
FILE *fp_help = stderr;
mm_idx_reader_t *idx_rdr;
mm_idx_t *mi;
@@ -237,6 +240,8 @@ int main(int argc, char *argv[])
else if (c == 350) opt.q_occ_frac = atof(o.arg); // --q-occ-frac
else if (c == 352) mm_dbg_flag |= MM_DBG_PRINT_CHAIN; // --print-chains
else if (c == 353) opt.flag |= MM_F_NO_HASH_NAME; // --no-hash-name
+ else if (c == 354) vcf_with_variants = o.arg; // --vcf-file-with-variants
+ else if (c == 355) ipt.flag |= MM_PARSE_HT; // --parse-haplotype
else if (c == 330) {
fprintf(stderr, "[WARNING] \033[1;31m --lj-min-ratio has been deprecated.\033[0m\n");
} else if (c == 314) { // --frag
@@ -324,6 +329,8 @@ int main(int argc, char *argv[])
fprintf(fp_help, " -w INT minimizer window size [%d]\n", ipt.w);
fprintf(fp_help, " -I NUM split index for every ~NUM input bases [4G]\n");
fprintf(fp_help, " -d FILE dump index to FILE []\n");
+ fprintf(fp_help, " --vcf-file-with-variants FILE pass VCF FILE to modify index []\n");
+ fprintf(fp_help, " --parse-haplotype parse haplotypes from VCF to generate real SNP combinations, otherwise use all.\n");
fprintf(fp_help, " Mapping:\n");
fprintf(fp_help, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%g]\n", opt.mid_occ_frac);
fprintf(fp_help, " -g NUM stop chain enlongation if there are no minimizers in INT-bp [%d]\n", opt.max_gap);
@@ -386,7 +393,7 @@ int main(int argc, char *argv[])
}
if (opt.best_n == 0 && (opt.flag&MM_F_CIGAR) && mm_verbose >= 2)
fprintf(stderr, "[WARNING]\033[1;31m `-N 0' reduces alignment accuracy. Please use --secondary=no to suppress secondary alignments.\033[0m\n");
- while ((mi = mm_idx_reader_read(idx_rdr, n_threads)) != 0) {
+ while ((mi = mm_idx_reader_read(idx_rdr, n_threads, vcf_with_variants)) != 0) {
int ret;
if ((opt.flag & MM_F_CIGAR) && (mi->flag & MM_I_NO_SEQ)) {
fprintf(stderr, "[ERROR] the prebuilt index doesn't contain sequences.\n");
diff --git a/minimap.h b/minimap.h
index 13e12e0..828de04 100644
--- a/minimap.h
+++ b/minimap.h
@@ -44,6 +44,7 @@
#define MM_I_HPC 0x1
#define MM_I_NO_SEQ 0x2
#define MM_I_NO_NAME 0x4
+#define MM_PARSE_HT 0x8
#define MM_IDX_MAGIC "MMI\2"
@@ -247,7 +248,7 @@ mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, cons
*
* @return an index on success; NULL if reaching the end of the input file
*/
-mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads);
+mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads, char * vcf_with_variants);
/**
* Destroy/deallocate an index reader
@@ -401,7 +402,7 @@ int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uin
// deprecated APIs for backward compatibility
void mm_mapopt_init(mm_mapopt_t *opt);
-mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads);
+mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads, char * vcf_with_variants);
#ifdef __cplusplus
}
diff --git a/mmpriv.h b/mmpriv.h
index 2f5034b..254e694 100644
--- a/mmpriv.h
+++ b/mmpriv.h
@@ -6,6 +6,12 @@
#include "bseq.h"
#include "kseq.h"
+#include
+#include
+#include
+#include
+#include
+
#define MM_PARENT_UNSET (-1)
#define MM_PARENT_TMP_PRI (-2)
@@ -60,6 +66,10 @@ void radix_sort_64(uint64_t *beg, uint64_t *end);
uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p);
+void mm_idx_manipulate(mm_idx_t * mi, char *vcf_with_variants);
+void mm_idx_manipulate_phased(mm_idx_t * mi, char * fname, mm128_v *p, char * contig_name);
+void add_variants(mm_idx_t * mi, const char * CHR, char ** REF_arr, char ** ALT_arr, unsigned long * POS_all, int N_SNP, unsigned long curr_pos, mm128_v *p);
+void add_indel(mm_idx_t * mi, const char * CHR, char * REF, char * ALT, unsigned long curr_pos, unsigned long indel_pos, mm128_v *p, const char * original_ref_seq);
mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos);
void mm_seed_mz_flt(void *km, mm128_v *mv, int32_t q_occ_max, float q_occ_frac);
diff --git a/sketch.c b/sketch.c
index f830693..2d898a8 100644
--- a/sketch.c
+++ b/sketch.c
@@ -6,6 +6,14 @@
#include "kvec.h"
#include "mmpriv.h"
+#include "linked_vcf_list.h"
+
+#include
+#include
+#include
+#include
+#include
+
unsigned char seq_nt4_table[256] = {
0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
@@ -81,7 +89,7 @@ void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, i
mm128_t buf[256], min = { UINT64_MAX, UINT64_MAX };
tiny_queue_t tq;
- assert(len > 0 && (w > 0 && w < 256) && (k > 0 && k <= 28)); // 56 bits for k-mer; could use long k-mers, but 28 enough in practice
+ assert(len > 0 && (w > 0 && w < 256) && (k > 0 && k <= 128)); // 56 bits for k-mer; could use long k-mers, but 28 enough in practice
memset(buf, 0xff, w * 16);
memset(&tq, 0, sizeof(tiny_queue_t));
kv_resize(mm128_t, km, *p, p->n + len/w);
@@ -141,3 +149,364 @@ void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, i
if (min.x != UINT64_MAX)
kv_push(mm128_t, km, *p, min);
}
+
+void read_vcf(mm_idx_t * mi, char * fname, mm128_v *p, char * contig_name)
+{
+ int ret;
+
+ kstring_t str = {0,0,0};
+
+ //open vcf file
+ htsFile *fp = hts_open(fname,"rb");
+
+ //read header
+ bcf_hdr_t *hdr = bcf_hdr_read(fp);
+ bcf1_t *rec = bcf_init();
+
+ tbx_t *idx = tbx_index_load(fname);
+
+ if(!idx) {
+ //printf("Null index\n");
+ return;
+ }
+
+ //printf("%s\n", contig_name);
+ hts_itr_t *itr = tbx_itr_querys(idx, contig_name);
+
+ if(!itr) {
+ //printf("Null iterator for contig_name %s\n", contig_name);
+ return;
+ }
+
+ while ((ret = tbx_itr_next(fp, idx, itr, &str)) > 0) {
+ vcf_parse(&str, hdr, rec);
+ bcf_unpack(rec, BCF_UN_STR);
+ bcf_unpack(rec, BCF_UN_INFO);
+
+ bcf1_t *rec_tmp = bcf_dup(rec);
+ char * REF = (char *)calloc(mi->k + 1, sizeof(char));
+ strncpy(REF, rec->d.allele[0], mi->k + 1);
+ REF[mi->k + 1] = '\0';
+ char * ALT = (char *)calloc(mi->k + 1, sizeof(char));
+ strncpy(ALT, rec->d.allele[1], mi->k + 1);
+ ALT[mi->k + 1] = '\0';
+
+ insertatbegin((unsigned long)rec_tmp->pos, rec_tmp, rec_tmp->rid, REF, ALT);
+ bcf_empty(rec);
+
+ }
+
+ if(!isListEmpty()){
+ //printf("CHR = %s;\n", contig_name);
+ handleGTList(mi, hdr, p);
+
+ deleteList();
+ }
+
+ bcf_itr_destroy(itr);
+ tbx_destroy(idx);
+ bcf_hdr_destroy(hdr);
+
+ if ( (ret=hts_close(fp)) )
+ {
+ fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret);
+ exit(ret);
+ }
+}
+
+void mm_idx_manipulate_phased(mm_idx_t * mi, char * fname, mm128_v *p, char * contig_name) {
+ read_vcf(mi, fname, p, contig_name);
+}
+
+//REF - REF (for control)
+//ALT - ALT
+//curr_pos ulong - position
+//CHR - chromosome
+void add_indel(mm_idx_t * mi, const char * CHR, char * REF, char * ALT, unsigned long curr_pos, unsigned long indel_pos, mm128_v *p, const char * original_ref_seq)
+{
+ const char *contig_name = CHR;
+ const unsigned long center_position = curr_pos;
+ //const unsigned long position = indel_pos;
+ const unsigned long position = curr_pos;
+
+ //Find seq
+ uint64_t contig_offset = 0;
+ int seq_num = -1;
+ for (int i = 0; i < mi->n_seq; i++) {
+ if (strcmp(contig_name, mi->seq[i].name) == 0) {
+ contig_offset = mi->seq[i].offset;
+ seq_num = i;
+ }
+ }
+ // Error if no contigs in fasta
+ if(seq_num == -1) {
+ printf("ERROR Contig %s id not found in reference\n", contig_name);
+ return;
+ }
+
+ int SIDE_SIZE = (mi->k - 1) + mi->w;
+ // Calculate number of chunks:
+ // side chunks: take k-mer size, subtract 1 and add window size
+ // divided by chunk size and multiplied by 2 as it has 2 sides, and one for center
+ int SEQ_CHUNK_NUMBER = SIDE_SIZE / 8 * 2 + 1;
+ // add extra two side chunks if (mi->k - 1 + 10) is not a multiple of 8
+ int EXTRA_GAP = (8 - SIDE_SIZE % 8) % 8;
+ SEQ_CHUNK_NUMBER = (EXTRA_GAP) ? SEQ_CHUNK_NUMBER + 2 : SEQ_CHUNK_NUMBER;
+
+ int ref_len = strlen(REF);
+ int alt_len = strlen(ALT);
+
+ if(ref_len == 1 && alt_len > 1 && alt_len < mi->k) {
+ char * new_ref_seq;
+ new_ref_seq = (char*)malloc(sizeof(char) * (SEQ_CHUNK_NUMBER * 8 + 1 + (alt_len - 1)));
+ memcpy(new_ref_seq, original_ref_seq, SEQ_CHUNK_NUMBER * 8 + 1);
+
+ memcpy(new_ref_seq, original_ref_seq, EXTRA_GAP + SIDE_SIZE + (contig_offset + position - 1) % 8);
+
+ new_ref_seq[EXTRA_GAP + SIDE_SIZE + (contig_offset + position - 1) % 8] = '\0';
+ new_ref_seq = strcat(new_ref_seq, ALT);
+ new_ref_seq = strcat(new_ref_seq, &original_ref_seq[EXTRA_GAP + SIDE_SIZE + (contig_offset + position - 1) % 8 + 1]);
+
+ //Finds minimizer in window
+ mm128_v minimizer_array = {0, 0, 0};
+ mm_sketch(0, &new_ref_seq[EXTRA_GAP + (contig_offset + center_position - 1) % 8], SIDE_SIZE * 2 + 1 + (alt_len - 1), mi->w, mi->k,
+ 0, mi->flag & MM_I_HPC, &minimizer_array);
+
+ for (int i = 0; i < minimizer_array.n; i++) {
+ if (minimizer_array.a[i].y < SIDE_SIZE * 2) continue;
+ if (minimizer_array.a[i].y > (SIDE_SIZE + mi->k) * 2 - 1 + (alt_len - 1) * 2) continue;
+ minimizer_array.a[i].y = ((uint64_t)seq_num << 32) + (center_position - SIDE_SIZE - 1 + minimizer_array.a[i].y / 2) * 2 +
+ (minimizer_array.a[i].y % 2);
+
+ kv_push(mm128_t, 0, *p, minimizer_array.a[i]);
+ }
+ } else if (ref_len > 1 && alt_len == 1 && ref_len < mi->k) {
+ int EXT_CHUNK_COUNT = (ref_len - 2) / 8 + 1;
+ char * original_ref_seq_ext = (char*)malloc(sizeof(char) * (8 * EXT_CHUNK_COUNT + 1));
+ original_ref_seq_ext[8 * EXT_CHUNK_COUNT] = '\0';
+ for (int i = 0; i < EXT_CHUNK_COUNT; i++) {
+ uint32_t tmp_seq;
+ for (int i = 0; i < EXT_CHUNK_COUNT; i++) {
+ if (
+ // Out of bounds
+ (contig_offset == 0 && (position - 1) / 8 + (SEQ_CHUNK_NUMBER / 2) + i + 1 < 0) ||
+ ((contig_offset + position - 1) / 8 + (SEQ_CHUNK_NUMBER / 2) + i + 1) * 8 >= contig_offset + mi->seq[seq_num].len ||
+ ((contig_offset + position - 1) / 8 + (SEQ_CHUNK_NUMBER / 2) + i + 2) * 8 <= contig_offset
+ )
+ tmp_seq = 1145324612; // ALL N
+ else {
+ tmp_seq = mi->S[(contig_offset + position - 1) / 8 + (SEQ_CHUNK_NUMBER / 2) + i + 1];
+ // At left bound
+ if (((contig_offset + position - 1) / 8 + (SEQ_CHUNK_NUMBER / 2) + i + 1) * 8 < contig_offset &&
+ ((contig_offset + position - 1) / 8 + (SEQ_CHUNK_NUMBER / 2) + i + 2) * 8 > contig_offset) {
+ if (contig_offset % 8 == 0)
+ tmp_seq = 1145324612; // ALL N
+ else {
+ tmp_seq = tmp_seq >> (4 * (contig_offset % 8));
+ for (int j = 0; j < contig_offset % 8; j++)
+ tmp_seq = (tmp_seq << 4) + 4;
+ }
+ }
+ // At right bound
+ if (((contig_offset + position - 1) / 8 + (SEQ_CHUNK_NUMBER / 2) + i + 1) * 8 < contig_offset + mi->seq[seq_num].len &&
+ ((contig_offset + position - 1) / 8 + (SEQ_CHUNK_NUMBER / 2) + i + 2) * 8 > contig_offset + mi->seq[seq_num].len) {
+ tmp_seq = tmp_seq << (4 * (8 - (contig_offset + mi->seq[seq_num].len) % 8));
+ for (int j = 0; j < 8 - (contig_offset + mi->seq[seq_num].len) % 8; j++)
+ tmp_seq = (tmp_seq >> 4) | 1073741824; // FIRST N
+ }
+ }
+ }
+
+ for (int j = 0; j < 8; j++) {
+ uint32_t tmp = tmp_seq % 16;
+ switch (tmp) {
+ case 0:
+ original_ref_seq_ext[i * 8 + j] = 'A';
+ break;
+ case 1:
+ original_ref_seq_ext[i * 8 + j] = 'C';
+ break;
+ case 2:
+ original_ref_seq_ext[i * 8 + j] = 'G';
+ break;
+ case 3:
+ original_ref_seq_ext[i * 8 + j] = 'T';
+ break;
+ case 4:
+ original_ref_seq_ext[i * 8 + j] = 'N';
+ }
+ tmp_seq = tmp_seq / 16;
+ }
+ }
+
+ //Create new window
+ char * new_ref_seq = (char*)malloc(sizeof(char) * (SEQ_CHUNK_NUMBER * 8 + 1 - ref_len + 1 + EXT_CHUNK_COUNT * 8));
+ memcpy(new_ref_seq, original_ref_seq, EXTRA_GAP + SIDE_SIZE + (contig_offset + position - 1) % 8);
+ new_ref_seq[EXTRA_GAP + SIDE_SIZE + (contig_offset + position - 1) % 8] = ALT[0];
+ new_ref_seq[EXTRA_GAP + SIDE_SIZE + (contig_offset + position - 1) % 8 + 1] = '\0';
+ new_ref_seq = strcat(new_ref_seq, &original_ref_seq[EXTRA_GAP + SIDE_SIZE + (contig_offset + position - 1) % 8 + ref_len]);
+ new_ref_seq = strcat(new_ref_seq, original_ref_seq_ext);
+
+ //Finds minimizer in window
+ mm128_v minimizer_array = {0, 0, 0};
+ mm_sketch(0, &new_ref_seq[(contig_offset + center_position - 1) % 8], SIDE_SIZE * 2 + 1, mi->w, mi->k,
+ 0, mi->flag & MM_I_HPC, &minimizer_array);
+
+
+ free(original_ref_seq_ext);
+ free(new_ref_seq);
+
+ for (int i = 0; i < minimizer_array.n; i++) {
+ if (minimizer_array.a[i].y < SIDE_SIZE * 2 + 2) continue; // TODO change 50 to 48 for similarity with SNPS (it will take extra calculation but no changes)
+ if (minimizer_array.a[i].y > (SIDE_SIZE + mi->k) * 2 - 1) continue;
+ minimizer_array.a[i].y = ((uint64_t)seq_num << 32) + (center_position - SIDE_SIZE - 1 + minimizer_array.a[i].y / 2) * 2 +
+ (minimizer_array.a[i].y % 2) + (ref_len - 1) * 2;
+ kv_push(mm128_t, 0, *p, minimizer_array.a[i]);
+ }
+ }
+}
+
+//Array format:
+//REF_arr char array - REF (for control)
+//ALT_arr char array - ALT
+//POS_all ulong array - positions
+//CHR - chromosome
+//N_SNP - length
+void add_variants(mm_idx_t * mi, const char * CHR, char ** REF_arr, char ** ALT_arr, unsigned long * POS_all, int N_SNP, unsigned long curr_pos, mm128_v *p)
+{
+ if (N_SNP == 0)
+ return;
+
+ const char *snp_contig_name = CHR;
+ const unsigned long snp_position = curr_pos;
+
+ //Find seq
+ uint64_t contig_offset;
+ int seq_num = -1;
+ for (int i = 0; i < mi->n_seq; i++) {
+ if (strcmp(snp_contig_name, mi->seq[i].name) == 0) {
+ contig_offset = mi->seq[i].offset;
+ seq_num = i;
+ }
+ }
+ //Error if no contigs in fasta
+ if(seq_num == -1) {
+ printf("ERROR Contig %s id not found in reference\n", snp_contig_name);
+ return;
+ }
+
+ int SIDE_SIZE = (mi->k - 1) + mi->w;
+ // Calculate number of chunks:
+ // side chunks: take k-mer size, subtract 1 and add window size
+ // divided by chunk size and multiplied by 2 as it has 2 sides, and one for center
+ int SEQ_CHUNK_NUMBER = SIDE_SIZE / 8 * 2 + 1;
+ // add extra two side chunks if (mi->k - 1 + 10) is not a multiple of 8
+ int EXTRA_GAP = (8 - SIDE_SIZE % 8) % 8;
+ SEQ_CHUNK_NUMBER = (EXTRA_GAP) ? SEQ_CHUNK_NUMBER + 2 : SEQ_CHUNK_NUMBER;
+ uint32_t seq[SEQ_CHUNK_NUMBER];
+
+ for (int i = 0; i < SEQ_CHUNK_NUMBER; i++) {
+ if (
+ // Out of bounds
+ (contig_offset == 0 && (snp_position - 1) / 8 - (SEQ_CHUNK_NUMBER / 2) + i < 0) ||
+ ((contig_offset + snp_position - 1) / 8 - (SEQ_CHUNK_NUMBER / 2) + i) * 8 >= contig_offset + mi->seq[seq_num].len ||
+ ((contig_offset + snp_position - 1) / 8 - (SEQ_CHUNK_NUMBER / 2) + i + 1) * 8 <= contig_offset
+ )
+ seq[i] = 1145324612; // ALL N
+ else {
+ seq[i] = mi->S[(contig_offset + snp_position - 1) / 8 - (SEQ_CHUNK_NUMBER / 2) + i];
+ // At left bound
+ if (((contig_offset + snp_position - 1) / 8 - (SEQ_CHUNK_NUMBER / 2) + i) * 8 < contig_offset &&
+ ((contig_offset + snp_position - 1) / 8 - (SEQ_CHUNK_NUMBER / 2) + i + 1) * 8 > contig_offset) {
+ if (contig_offset % 8 == 0)
+ seq[i] = 1145324612; // ALL N
+ else {
+ seq[i] = seq[i] >> (4 * (contig_offset % 8));
+ for (int j = 0; j < contig_offset % 8; j++)
+ seq[i] = (seq[i] << 4) + 4;
+ }
+ }
+ // At right bound
+ if (((contig_offset + snp_position - 1) / 8 - (SEQ_CHUNK_NUMBER / 2) + i) * 8 < contig_offset + mi->seq[seq_num].len &&
+ ((contig_offset + snp_position - 1) / 8 - (SEQ_CHUNK_NUMBER / 2) + i + 1) * 8 > contig_offset + mi->seq[seq_num].len) {
+ seq[i] = seq[i] << (4 * (8 - (contig_offset + mi->seq[seq_num].len) % 8));
+ for (int j = 0; j < 8 - (contig_offset + mi->seq[seq_num].len) % 8; j++)
+ seq[i] = (seq[i] >> 4) | 1073741824; // FIRST N
+ }
+ }
+ }
+
+ char original_ref_seq[SEQ_CHUNK_NUMBER * 8 + 1];
+ original_ref_seq[SEQ_CHUNK_NUMBER * 8] = '\0';
+ for (int i = 0; i < SEQ_CHUNK_NUMBER; i++) {
+ uint32_t tmp_seq = seq[i];
+ for (int j = 0; j < 8; j++) {
+ uint32_t tmp = tmp_seq % 16;
+ switch (tmp) {
+ case 0:
+ original_ref_seq[i * 8 + j] = 'A';
+ break;
+ case 1:
+ original_ref_seq[i * 8 + j] = 'C';
+ break;
+ case 2:
+ original_ref_seq[i * 8 + j] = 'G';
+ break;
+ case 3:
+ original_ref_seq[i * 8 + j] = 'T';
+ break;
+ case 4:
+ original_ref_seq[i * 8 + j] = 'N';
+ }
+ tmp_seq = tmp_seq / 16;
+ }
+ }
+
+
+ char new_ref_seq[SEQ_CHUNK_NUMBER * 8 + 1];
+ memcpy(new_ref_seq, original_ref_seq, SEQ_CHUNK_NUMBER * 8 + 1);
+
+ int has_indel = -1;
+ int indel_count = 0;
+ for (int i = N_SNP - 1; i >= 0; i--) {
+ //add single SNP
+ if ((strlen(REF_arr[i]) == 1) && (strlen(ALT_arr[i]) == 1)) {
+ new_ref_seq[(EXTRA_GAP + SIDE_SIZE + (contig_offset + snp_position - 1) % 8) +
+ (POS_all[i] - snp_position)] = ALT_arr[i][0];// - 'A' + 'a';
+ } else {
+ has_indel = i;
+ indel_count++;
+ }
+ }
+ if ((indel_count == 1) && (POS_all[has_indel] == curr_pos)) {
+ if ((strlen(REF_arr[has_indel]) > mi->k) || (strlen(ALT_arr[has_indel]) > mi->k)) {
+ return;
+ }
+ return;
+ if ((strlen(REF_arr[has_indel]) > 1) && (strlen(ALT_arr[has_indel]) == 1)) {
+ add_indel(mi, CHR, REF_arr[has_indel], ALT_arr[has_indel], curr_pos, POS_all[has_indel], p, new_ref_seq);
+ return;
+ }
+ if ((strlen(REF_arr[has_indel]) == 1) && (strlen(ALT_arr[has_indel]) > 1)) {
+ add_indel(mi, CHR, REF_arr[has_indel], ALT_arr[has_indel], curr_pos, POS_all[has_indel], p, new_ref_seq);
+ return;
+ }
+ }
+ if (indel_count == N_SNP) {
+ return;
+ }
+
+ //Finds minimizer in window
+ mm128_v minimizer_array = {0, 0, 0};
+ mm_sketch(0, &new_ref_seq[EXTRA_GAP + (contig_offset + snp_position - 1) % 8], SIDE_SIZE * 2 + 1, mi->w, mi->k,
+ 0, mi->flag & MM_I_HPC, &minimizer_array);
+
+ for (int i = 0; i < minimizer_array.n; i++) {
+ if (minimizer_array.a[i].y < SIDE_SIZE * 2) continue;
+ if (minimizer_array.a[i].y > (SIDE_SIZE + mi->k) * 2 - 1) continue;
+ minimizer_array.a[i].y = ((uint64_t)seq_num << 32) + (snp_position - SIDE_SIZE - 1 + minimizer_array.a[i].y / 2) * 2 +
+ (minimizer_array.a[i].y % 2);
+
+ kv_push(mm128_t, 0, *p, minimizer_array.a[i]);
+ }
+}
diff --git a/test/README.md b/test/README.md
new file mode 100644
index 0000000..01ee755
--- /dev/null
+++ b/test/README.md
@@ -0,0 +1,32 @@
+## Tests for minimap2_index_modiifer
+There are three tests to check base functionality.
+
+### Common test
+This test create two index files: regular and modified.
+```bash
+minimap2 -d test/test.mni test/test.fasta
+minimap2 -d test/test.modified.mni --vcf-file-with-variants test/test_long_chr1_not_the_same.vcf.gz test/test.fasta
+diff test/test.mni test/test.modified.mni
+```
+
+Modified index file should contains extra string and some strings should be mismatched.
+
+### Empty VCF test
+If input VCF file contains no variants from reference modified index would be same as regular one.
+```bash
+minimap2 -d test/test.mni test/test.fasta
+minimap2 -d test/test.modified.mni --vcf-file-with-variants test/empty.vcf.gz test/test.fasta
+diff test/test.mni test/test.modified.mni
+```
+
+Index files should be the same (empty output after third command).
+
+### Pseudo-variants test
+In this test `test/test_long_chr1_the_same.vcf.gz` contains pseudo-variants (like A -> A, C -> C, etc). This variants would be processed with no effect.
+```bash
+minimap2 -d test/test.mni test/test.fasta
+minimap2 -d test/test.modified.mni --vcf-file-with-variants test/test_long_chr1_the_same.vcf.gz test/test.fasta
+diff test/test.mni test/test.modified.mni
+```
+
+Index files should be the same.
diff --git a/test/empty.vcf.gz b/test/empty.vcf.gz
new file mode 100644
index 0000000..2c6e5b9
Binary files /dev/null and b/test/empty.vcf.gz differ
diff --git a/test/empty.vcf.gz.tbi b/test/empty.vcf.gz.tbi
new file mode 100644
index 0000000..b9e0990
Binary files /dev/null and b/test/empty.vcf.gz.tbi differ
diff --git a/test/test.1.fq b/test/test.1.fq
new file mode 100644
index 0000000..09c5d87
--- /dev/null
+++ b/test/test.1.fq
@@ -0,0 +1,36 @@
+@111\1
+TTTTTTTTTTTTTTATTTTATTTTTTTCACACAAACACACACACACACAA
++
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+@112\1
+TTTTTTTATTTTATTTTTTTTTCACACAAACACACACACACACAAAAAAA
++
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+@113\1
+TTTTTTTTTTATTTTATTTTTTTTCACACAAACACACACACACACAAAAA
++
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+@114\1
+AAAAAAAAAAAAAAAAAAAAAAAAATAAAAACGTACGACGTACGTTTTTT
++
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+@115\1
+AAAAAAAAAAAAAAAAAAAAAAATAAAAACGTACGACGTACGTTTTTTTT
++
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+@116\1
+AAAAAAAAAAATAAAAACGTACGACGTACGTTTTTTTTTTTTTTTATTTT
++
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+@117\1
+ACACACAAAAAAATATATATATATATATATAAATATATTATATATATATA
++
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+@118\1
+ATATATATATAAATATATATATATATATATATATATATATATATATATAT
++
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+@119\1
+TATATATATATATATATATATATATATATATATAAATATATATATATAAA
++
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/test/test.2.fq b/test/test.2.fq
new file mode 100644
index 0000000..6874907
--- /dev/null
+++ b/test/test.2.fq
@@ -0,0 +1,36 @@
+@111\2
+AAAATAAAAACGTACGACGTACGTTTTTTTTTTTTTTTATTTTATTTTTT
++
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+@112\2
+AAAAAAATAAAAACGTACGACGTACGTTTTTTTTTTTTTTTATTTTATTT
++
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+@113\2
+TAAAAAAAAAAAAAATAAAAACGTACGACGTACGTTTTTTTTTTTTTTTA
++
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+@114\2
+TTTTATTTTATTTTTTTGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTT
++
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+@115\2
+TTTTTTGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTATATATGGGGG
++
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+@116\2
+GGGGGGGGGTTTTTTTTTTTTTTATATATGGGGGGGGTTTTTTTAAAAAA
++
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+@117\2
+ATATATATATATATAAATATATATATATAAAAAAAAAAAAAAAATAAAAA
++
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+@118\2
+AATATATATATATATATATATATATATATATATATATATATAAATATATA
++
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+@119\2
+GGGGGGGGGGGGGGGTTTTTTTTTTTTTTATATATGGGGGGGGTTTTTTT
++
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
\ No newline at end of file
diff --git a/test/test.fasta b/test/test.fasta
new file mode 100644
index 0000000..b9d552f
--- /dev/null
+++ b/test/test.fasta
@@ -0,0 +1,12 @@
+>chr1
+NNNNNNNNNNNTTTTTTTTTTTTTTTTTTTATTTTATTTTTTTTCACACAAACACACACACACACAAAAAAATATATATATATATATATATATATATATATATATATATATATATATATATATATATATATAAATATATAATATATAAAAAAAAAAAAAATAAAAACGTACGACGTACGTTTTTTTTTTTTTTTTTTTTATTTTTTTGGGGGGGGAGGGGGGGGGGTTTTTTTTTTTTTTATATATGGGGGGGGTTTTTTTAAAAAAAAATTTTTTTGGGGGGGGAGGGGGGGGGGTTTTTTTTTTTTTTNNNNNNNNNN
+>chr2
+NNNNNNNNNNNTTTTTTTTTTTTTTTTTTTATTTTATTTTTTTTCACACAAACACACACACACACAAAAA
+AATATATATATATATATATATATATATATATATATATATATATATATATATATATATATATAAATATATA
+ATATATAAAAAAAAAAAAAATAAAAACGTACGACGTACGTTTTTTTTTTTTTTTTTTTTATTTTTTTGGG
+GGGGGGGGGGGGGGGGTTTTTTTTTTTTTTATATATGGGGGGGGTTTTTTTAAAAAAAAANNNNNNNNNN
+>chr3
+NNNNAGTAGTCTTTTTTTTTTTTTTTTTTTATTTTATTTTTTTTCACACAAACACACACACACACAAAAA
+AATATATATATATATATATATATATATATATATATATATATATATATATATATATATATATAAATATATA
+ATATATAAAAAAAAAAAAAATAAAAACGTACGACGTACGTTTTTTTTTTTTTTTTTTTTATTTTTTTGGG
+GGGGGGGGGGGGGGGGTTTTTTTTTTTTTTATATATGGGGGGGGTTTTTTTAAAAAAAAAGGGGGGGGGG
diff --git a/test/test.vcf b/test/test.vcf
new file mode 100644
index 0000000..c009525
--- /dev/null
+++ b/test/test.vcf
@@ -0,0 +1,5 @@
+chr3 277 . G A X
+chr3 6 . G C X
+chr2 195 . T A X
+chr2 146 . T A X
+chr2 91 . T A X
diff --git a/test/test_long_chr1.vcf.gz b/test/test_long_chr1.vcf.gz
new file mode 100644
index 0000000..59fd971
Binary files /dev/null and b/test/test_long_chr1.vcf.gz differ
diff --git a/test/test_long_chr1.vcf.gz.tbi b/test/test_long_chr1.vcf.gz.tbi
new file mode 100644
index 0000000..25b601d
Binary files /dev/null and b/test/test_long_chr1.vcf.gz.tbi differ
diff --git a/test/test_long_chr1_not_the_same.vcf.gz b/test/test_long_chr1_not_the_same.vcf.gz
new file mode 100644
index 0000000..20647e5
Binary files /dev/null and b/test/test_long_chr1_not_the_same.vcf.gz differ
diff --git a/test/test_long_chr1_not_the_same.vcf.gz.tbi b/test/test_long_chr1_not_the_same.vcf.gz.tbi
new file mode 100644
index 0000000..1cdb036
Binary files /dev/null and b/test/test_long_chr1_not_the_same.vcf.gz.tbi differ
diff --git a/test/test_long_chr1_the_same.vcf.gz b/test/test_long_chr1_the_same.vcf.gz
new file mode 100644
index 0000000..d92ccfe
Binary files /dev/null and b/test/test_long_chr1_the_same.vcf.gz differ
diff --git a/test/test_long_chr1_the_same.vcf.gz.tbi b/test/test_long_chr1_the_same.vcf.gz.tbi
new file mode 100644
index 0000000..3fe3431
Binary files /dev/null and b/test/test_long_chr1_the_same.vcf.gz.tbi differ