Skip to content

Difference from original tool #2

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 32 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
4b9ae8d
Initial
EKarpulevich Mar 20, 2023
c49e5ef
Add some constants
EKarpulevich Mar 20, 2023
89f4adc
add function headers in mmpriv.h and delete unused mm_sketch_syncmer
Mar 22, 2023
5993d46
Set rid as seq_num
Mar 22, 2023
7c56163
Added support for diferent kmers, fix #1
Mar 22, 2023
1102e28
added test fastq and fasta files waiting for vcf file
Mar 22, 2023
188ac91
Test vcf added
Mar 24, 2023
b8f3f3b
Added flags --modified-index-file and --vcf-file-with-variants
EKarpulevich May 15, 2023
35b1d08
Added unique SNP combination calculation
EKarpulevich May 18, 2023
31c61c2
Add read vcf procedures
EKarpulevich May 23, 2023
e2fa97c
Minor changes to avoid segfault on incorrect data
EKarpulevich May 24, 2023
a42acfe
Modify linked list walkthrough
EKarpulevich May 24, 2023
ce21d66
Fix minimazer creating params
EKarpulevich May 26, 2023
ad961f2
New minimizers also added to index
EKarpulevich May 30, 2023
20c1281
Tests added
EKarpulevich May 30, 2023
3dfe241
Added repeated minimizers removal
May 30, 2023
616b6bd
Accelerated calculation of unique occurrences of the minimizer
EKarpulevich May 31, 2023
900a4b7
set uint64_t for seq_num in add_variants
Jun 13, 2023
f1cc4a5
Remove indels from SNPs
EKarpulevich Jun 22, 2023
b0c489c
Add insertions
EKarpulevich Jun 22, 2023
473be10
Fix warnings
EKarpulevich Jun 27, 2023
f53fc36
Limit insertion length
EKarpulevich Jun 28, 2023
c86b111
Deletions added
Jun 30, 2023
ae28ae6
Added cobinations for SNPs and deletions.
Jul 4, 2023
e4e4e8d
Add big indels in combinations
Jul 4, 2023
af9a2e9
Added ability to not parse haplogroups but use all combinations
Aug 25, 2023
d95eea8
Pre-release changes
Nov 22, 2023
03f09b1
README updated
Jan 10, 2024
8f05dee
Dev (#1)
EgorGuga Apr 1, 2024
e3b76ef
Added dockerhub link in README.md
EgorGuga Apr 4, 2024
a8b8b6c
Update README.md
EgorGuga May 13, 2024
17d0b65
Update README.md
EgorGuga May 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
.vscode/
htslib/
.cproject
.project
.*.swp
Expand Down
49 changes: 49 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
FROM phusion/baseimage:jammy-1.0.1

ARG HTSLIB_VERSION=1.17

ENV TERM=xterm-256color \
HTSLIB_URL=https://github.com/samtools/htslib/releases/download/${HTSLIB_VERSION}/htslib-${HTSLIB_VERSION}.tar.bz2

COPY . minimap2_index_modifier

# install deps and cleanup apt garbage
RUN set -eux; \
apt-get update; \
apt-get install -y \
wget \
make \
autoconf \
gcc \
libcurl4-openssl-dev \
zlib1g-dev \
libbz2-dev \
liblzma-dev \
bzip2 \
&& rm -rf /var/lib/apt/lists/*;

#install htslib
RUN set -eux; \
mkdir temp; \
cd temp; \
\
wget ${HTSLIB_URL}; \
tar -xf htslib-${HTSLIB_VERSION}.tar.bz2; \
cd htslib-${HTSLIB_VERSION}; \
\
autoreconf -i; \
./configure; \
make; \
make install; \
cd ../../; \
rm -rf temp;

#install minimap2_index_modifier
RUN set -eux; \
cd minimap2_index_modifier; \
make; \
cp minimap2 /usr/local/bin; \
cd ../; \
rm -rf minimap2_index_modifier;

ENV LD_LIBRARY_PATH=/usr/local/lib
9 changes: 5 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@ CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra
CPPFLAGS= -DHAVE_KALLOC
INCLUDES=
OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o \
lchain.o align.o hit.o seed.o map.o format.o pe.o esterr.o splitidx.o \
lchain.o align.o hit.o seed.o linked_vcf_list.o map.o format.o pe.o esterr.o splitidx.o \
ksw2_ll_sse.o
PROG= minimap2
PROG_EXTRA= sdust minimap2-lite
LIBS= -lm -lz -lpthread
LIBS= -lm -lz -lpthread -lhts

ifeq ($(arm_neon),) # if arm_neon is not defined
ifeq ($(sse2only),) # if sse2only is not defined
Expand Down Expand Up @@ -53,7 +53,7 @@ minimap2-lite:example.o libminimap2.a
libminimap2.a:$(OBJS)
$(AR) -csru $@ $(OBJS)

sdust:sdust.c kalloc.o kalloc.h kdq.h kvec.h kseq.h ketopt.h sdust.h
sdust:sdust.c kalloc.o kalloc.h kdq.h kvec.h kseq.h ketopt.h sdust.h linked_vcf_list.h
$(CC) -D_SDUST_MAIN $(CFLAGS) $< kalloc.o -o $@ -lz

# SSE-specific targets on x86/x86_64
Expand Down Expand Up @@ -128,5 +128,6 @@ options.o: mmpriv.h minimap.h bseq.h kseq.h
pe.o: mmpriv.h minimap.h bseq.h kseq.h kvec.h kalloc.h ksort.h
sdust.o: kalloc.h kdq.h kvec.h sdust.h
seed.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h ksort.h
sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h kseq.h
linked_vcf_list.o: mmpriv.h minimap.h linked_vcf_list.h
sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h kseq.h linked_vcf_list.h
splitidx.o: mmpriv.h minimap.h bseq.h kseq.h
426 changes: 38 additions & 388 deletions README.md

Large diffs are not rendered by default.

186 changes: 179 additions & 7 deletions index.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@
#include "kvec.h"
#include "khash.h"

#include <htslib/hts.h>
#include <htslib/vcf.h>
#include <htslib/tbx.h>
#include <htslib/kstring.h>
#include <htslib/kseq.h>

#define idx_hash(a) ((a)>>1)
#define idx_eq(a, b) ((a)>>1 == (b)>>1)
KHASH_INIT(idx, uint64_t, uint64_t, 1, idx_hash, idx_eq)
Expand Down Expand Up @@ -263,7 +269,7 @@ static void worker_post(void *g, long i, int tid)
kfree(0, b->a.a);
b->a.n = b->a.m = 0, b->a.a = 0;
}

static void mm_idx_post(mm_idx_t *mi, int n_threads)
{
kt_for(n_threads, worker_post, mi, 1<<mi->b);
Expand All @@ -282,6 +288,7 @@ typedef struct {
uint64_t batch_size, sum_len;
mm_bseq_file_t *fp;
mm_idx_t *mi;
char * vcf_with_variants;
} pipeline_t;

typedef struct {
Expand Down Expand Up @@ -355,15 +362,68 @@ static void *worker_pipeline(void *shared, int step, void *in)
} else free(s);
} else if (step == 1) { // step 1: compute sketch
step_t *s = (step_t*)in;

for (i = 0; i < s->n_seq; ++i) {
//printf("SEQ %d %s\n", s->n_seq, s->seq[i].name);
mm_bseq1_t *t = &s->seq[i];
if (t->l_seq > 0)
mm_sketch(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->mi->flag&MM_I_HPC, &s->a);
else if (mm_verbose >= 2)
fprintf(stderr, "[WARNING] the length database sequence '%s' is 0\n", t->name);

if(p->vcf_with_variants && strcmp(p->vcf_with_variants, "")) {
mm_idx_manipulate_phased(p->mi, p->vcf_with_variants, &s->a, s->seq[i].name);
}

free(t->seq); free(t->name);
}

free(s->seq); s->seq = 0;

// sort by minimizer
radix_sort_128x(s->a.a, s->a.a + s->a.n);

// stay only unique
//#### why 16, sizeof(mm128_v)?
mm128_t *tmp = (mm128_t*)calloc(s->a.n, 16);


size_t ptr_p, ptr_tmp, beg_tmp, end_tmp;
tmp[0].x = s->a.a[0].x;
tmp[0].y = s->a.a[0].y;
beg_tmp = 0;
end_tmp = 1;

for (ptr_p = 1; ptr_p < s->a.n; ptr_p++) {
if(s->a.a[ptr_p].x == tmp[beg_tmp].x) {
int exist = 0;
for (ptr_tmp = beg_tmp; ptr_tmp < end_tmp; ptr_tmp++) {
if (s->a.a[ptr_p].y == tmp[ptr_tmp].y && s->a.a[ptr_p].x == tmp[ptr_tmp].x) {
exist = 1;
break;
}
}
if (exist == 0) {
tmp[end_tmp].x = s->a.a[ptr_p].x;
tmp[end_tmp].y = s->a.a[ptr_p].y;
end_tmp++;
}
}
else {
tmp[end_tmp].x = s->a.a[ptr_p].x;
tmp[end_tmp].y = s->a.a[ptr_p].y;
beg_tmp = end_tmp;
end_tmp++;
}
}

tmp = (mm128_t *) realloc(tmp, sizeof(mm128_t) * end_tmp);

//#### do we need realloc s->a.a ?
memcpy(s->a.a, tmp, end_tmp * 16);
free(tmp);
s->a.n = end_tmp;

return s;
} else if (step == 2) { // dispatch sketch to buckets
step_t *s = (step_t*)in;
Expand All @@ -373,7 +433,7 @@ static void *worker_pipeline(void *shared, int step, void *in)
return 0;
}

mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int flag, int mini_batch_size, int n_threads, uint64_t batch_size)
mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int flag, int mini_batch_size, int n_threads, uint64_t batch_size, char * vcf_with_variants)
{
pipeline_t pl;
if (fp == 0 || mm_bseq_eof(fp)) return 0;
Expand All @@ -382,8 +442,10 @@ mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int flag, int mini
pl.batch_size = batch_size;
pl.fp = fp;
pl.mi = mm_idx_init(w, k, b, flag);
pl.vcf_with_variants = vcf_with_variants;

kt_pipeline(n_threads < 3? n_threads : 3, worker_pipeline, &pl, 3);

if (mm_verbose >= 3)
fprintf(stderr, "[M::%s::%.3f*%.2f] collected minimizers\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0));

Expand All @@ -394,13 +456,13 @@ mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int flag, int mini
return pl.mi;
}

mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads) // a simpler interface; deprecated
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads, char * vcf_with_variants) // a simpler interface; deprecated
{
mm_bseq_file_t *fp;
mm_idx_t *mi;
fp = mm_bseq_open(fn);
if (fp == 0) return 0;
mi = mm_idx_gen(fp, w, k, 14, flag, 1<<18, n_threads, UINT64_MAX);
mi = mm_idx_gen(fp, w, k, 14, flag, 1<<18, n_threads, UINT64_MAX, vcf_with_variants);
mm_bseq_close(fp);
return mi;
}
Expand Down Expand Up @@ -459,6 +521,51 @@ mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const cha
* index I/O *
*************/

void mm_idx_to_txt(FILE *fp, const mm_idx_t *mi)
{
uint64_t sum_len = 0;
uint32_t x[5], i;
x[0] = mi->w, x[1] = mi->k, x[2] = mi->b, x[3] = mi->n_seq, x[4] = mi->flag;
fprintf(fp, "%s\n", MM_IDX_MAGIC);
fprintf(fp, "%u %u %u %u %u\n", x[0], x[1], x[2], x[3], x[4]);

for (i = 0; i < mi->n_seq; ++i) {
if (mi->seq[i].name) {
uint8_t l = strlen(mi->seq[i].name);
fprintf(fp, "%u\n", l);
fprintf(fp, "%s\n", mi->seq[i].name);
} else {
fprintf(fp, "0\n");
}
fprintf(fp, "%u\n", mi->seq[i].len);
sum_len += mi->seq[i].len;
}
for (i = 0; i < 1<<mi->b; ++i) {
mm_idx_bucket_t *b = &mi->B[i];
khint_t k;
idxhash_t *h = (idxhash_t*)b->h;
uint32_t size = h? h->size : 0;
fprintf(fp, "%i\n", b->n);
for (int j = 0; j < b->n; j++)
fprintf(fp, "\t%lu\n", b->p[j]);
fprintf(fp, "%u\n", size);
if (size == 0)
continue;
for (k = 0; k < kh_end(h); ++k) {
uint64_t x[3];
if (!kh_exist(h, k))
continue;
x[0] = kh_key(h, k), x[1] = kh_val(h, k);
fprintf(fp, "%lu\t%lu\n", x[0], x[1]);
}
}
if (!(mi->flag & MM_I_NO_SEQ)) {
for (int i = 0; i < (sum_len + 7) / 8; i++)
fprintf(fp, "%u\n", mi->S[i]);
}
fflush(fp);
}

void mm_idx_dump(FILE *fp, const mm_idx_t *mi)
{
uint64_t sum_len = 0;
Expand Down Expand Up @@ -500,6 +607,67 @@ void mm_idx_dump(FILE *fp, const mm_idx_t *mi)
fflush(fp);
}

mm_idx_t *mm_idx_load_from_txt(FILE *fp)
{
char magic[4];
uint32_t x[5], i;
uint64_t sum_len = 0;
mm_idx_t *mi;

fscanf(fp, "%s\n", &magic);

if (strncmp(magic, MM_IDX_MAGIC, 4) != 0) return 0;
fscanf(fp, "%u %u %u %u %u\n", &x[0], &x[1], &x[2], &x[3], &x[4]);

mi = mm_idx_init(x[0], x[1], x[2], x[4]);
mi->n_seq = x[3];
mi->seq = (mm_idx_seq_t*)kcalloc(mi->km, mi->n_seq, sizeof(mm_idx_seq_t));
for (i = 0; i < mi->n_seq; ++i) {
uint8_t l;
mm_idx_seq_t *s = &mi->seq[i];
fscanf(fp, "%u\n", &l);
if (l) {
s->name = (char*)kmalloc(mi->km, l + 1);
fscanf(fp, "%s\n", s->name);
s->name[l] = 0;
}
fscanf(fp, "%u\n", &s->len);
s->offset = sum_len;
s->is_alt = 0;
sum_len += s->len;
}
for (i = 0; i < 1<<mi->b; ++i) {
mm_idx_bucket_t *b = &mi->B[i];
uint32_t j, size;
khint_t k;
idxhash_t *h;
fscanf(fp, "%i\n", &b->n);
b->p = (uint64_t*)malloc(b->n * 8);
for (int j = 0; j < b->n; j++) {
fscanf(fp, "\t%lu\n", &b->p[j]);
}
fscanf(fp, "%u\n", &size);
if (size == 0) continue;
b->h = h = kh_init(idx);
kh_resize(idx, h, size);
for (j = 0; j < size; ++j) {
uint64_t x[2];
int absent;
fscanf(fp, "%lu\t%lu\n", &x[0], &x[1]);
k = kh_put(idx, h, x[0], &absent);
assert(absent);
kh_val(h, k) = x[1];
}
}
if (!(mi->flag & MM_I_NO_SEQ)) {
mi->S = (uint32_t*)malloc((sum_len + 7) / 8 * 4);
for (int i = 0; i < (sum_len + 7) / 8; i++)
fscanf(fp, "%u\n", &mi->S[i]);
}
return mi;
}


mm_idx_t *mm_idx_load(FILE *fp)
{
char magic[4];
Expand Down Expand Up @@ -605,17 +773,21 @@ void mm_idx_reader_close(mm_idx_reader_t *r)
free(r);
}

mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads)
mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads, char * vcf_with_variants)
{
mm_idx_t *mi;
if (r->is_idx) {
mi = mm_idx_load(r->fp.idx);
if (mi && mm_verbose >= 2 && (mi->k != r->opt.k || mi->w != r->opt.w || (mi->flag&MM_I_HPC) != (r->opt.flag&MM_I_HPC)))
fprintf(stderr, "[WARNING]\033[1;31m Indexing parameters (-k, -w or -H) overridden by parameters used in the prebuilt index.\033[0m\n");
} else
mi = mm_idx_gen(r->fp.seq, r->opt.w, r->opt.k, r->opt.bucket_bits, r->opt.flag, r->opt.mini_batch_size, n_threads, r->opt.batch_size);
{
mi = mm_idx_gen(r->fp.seq, r->opt.w, r->opt.k, r->opt.bucket_bits, r->opt.flag, r->opt.mini_batch_size, n_threads, r->opt.batch_size, vcf_with_variants);
}
if (mi) {
if (r->fp_out) mm_idx_dump(r->fp_out, mi);
if (r->fp_out) {
mm_idx_dump(r->fp_out, mi);
}
mi->index = r->n_parts++;
}
return mi;
Expand Down
Loading
Loading