From a46b6db6844c2d213965d7450a7eb0d2588d88e3 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Fri, 11 Apr 2025 17:46:25 +0200
Subject: [PATCH 1/6] mtmd : add more api around mtmd_image_tokens

---
 examples/llava/mtmd.cpp | 39 ++++++++++++++++++++++++++++++++++-----
 examples/llava/mtmd.h   | 23 ++++++++++++++++++++---
 2 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 114c274bc1250..98d660a643809 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -166,15 +166,36 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
     return output;
 }
 
-void mtmd_input_chunks_free(mtmd_input_chunks * chunks) {
-    for (auto & chunk : *chunks) {
-        if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE && chunk.tokens_image) {
-            delete chunk.tokens_image;
+void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
+    if (image_tokens) {
+        delete image_tokens;
+    }
+}
+
+void mtmd_input_chunks_free(mtmd_input_chunks * chunks, bool free_images) {
+    if (free_images) {
+        for (auto & chunk : *chunks) {
+            if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE && chunk.tokens_image) {
+                mtmd_image_tokens_free(chunk.tokens_image);
+                chunk.tokens_image = nullptr;
+            }
         }
     }
     delete chunks;
 }
 
+size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
+    return image_tokens->n_tokens();
+}
+
+size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) {
+    return image_tokens->nx;
+}
+
+size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
+    return image_tokens->ny;
+}
+
 int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
     int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
     ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
@@ -289,7 +310,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
                 LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
             }
 
-            int32_t n_tokens = chunk.tokens_image->n_tokens();
+            int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image);
             float * embd = mtmd_get_output_embd(ctx);
             decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
             int64_t t1 = ggml_time_ms();
@@ -339,3 +360,11 @@ int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & outp
     std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
     return 0;
 }
+
+bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
+    projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
+    if (proj_type == PROJECTOR_TYPE_GEMMA3) {
+        return true;
+    }
+    return false;
+}
diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h
index 598f6947bb092..ca3fb6fdc7960 100644
--- a/examples/llava/mtmd.h
+++ b/examples/llava/mtmd.h
@@ -81,13 +81,20 @@ MTMD_API void mtmd_free(mtmd_context * ctx);
 //   2. (image tokens)
 //   3. "\ndescribe it in detail."
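
[Reviewer note, not part of the patch] A minimal caller-side sketch of the API after PATCH 1/6, using only the functions declared in mtmd.h above; the surrounding setup (ctx, text, bitmaps) and error handling are assumed to exist elsewhere:

    mtmd_input_chunks * chunks = mtmd_tokenize(ctx, text, bitmaps);
    if (chunks != nullptr) {
        for (auto & chunk : *chunks) {
            if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
                // image geometry is now queried through accessors instead of
                // reaching into the opaque mtmd_image_tokens struct
                size_t nx = mtmd_image_tokens_get_nx(chunk.tokens_image);
                size_t ny = mtmd_image_tokens_get_ny(chunk.tokens_image);
                size_t n_img_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image); // == nx * ny
            }
        }
        // free_images = true also releases the per-chunk mtmd_image_tokens
        mtmd_input_chunks_free(chunks, true);
    }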
 // number of bitmaps must be equal to the number of image markers in the prompt
+// the returned value must be freed using mtmd_input_chunks_free()
 // this function is thread-safe (shared ctx)
 MTMD_API mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
                                            const mtmd_input_text & text,
                                            const std::vector<mtmd_bitmap> & bitmaps);
 
-// free image chunk data
-MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks);
+// if free_images = true, free the image tokens ; otherwise, you must free them using mtmd_image_free()
+MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks, bool free_images);
+
+// access mtmd_image_tokens
+MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
+MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
+MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
+MTMD_API void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);
 
 // returns 0 on success
 MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
@@ -96,6 +103,11 @@ MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
 // get output embeddings from the last encode pass
 MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 
+// whether we need to set non-causal mask before llama_decode
+MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
+
+
+
 //
 // helper functions (can be implemented based on other functions)
 //
@@ -133,10 +145,15 @@ struct mtmd_context_deleter {
 };
 using mtmd_context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
 
 struct mtmd_input_chunks_deleter {
-    void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); }
+    void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val, true); }
 };
 using mtmd_input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;
 
+struct mtmd_image_tokens_deleter {
+    void operator()(mtmd_image_tokens * val) { mtmd_image_tokens_free(val); }
+};
+using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens, mtmd_image_tokens_deleter>;
+
 #else
 
 static_assert(false && "C header is not yet supported by this library");

From 7ac0b7b7b0433eacd8c9cabf3734f092637e6212 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Fri, 11 Apr 2025 22:17:47 +0200
Subject: [PATCH 2/6] mtmd : ability to calc image hash

---
 examples/llava/gemma3-cli.cpp |  1 +
 examples/llava/mtmd.cpp       | 29 ++++++++++++++++++++++++++++-
 examples/llava/mtmd.h         | 12 ++++++++----
 3 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index 91a07e2a8f40d..b200d8f111918 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -89,6 +89,7 @@ struct gemma3_context {
         ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{
             /* use_gpu */   true,
             /* timings */   true,
+            /* hash */      false,
             /* n_threads */ params.cpuparams.n_threads,
             /* verbosity */ GGML_LOG_LEVEL_INFO,
         }));
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 98d660a643809..1691a71bf27fc 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -16,15 +16,22 @@ struct mtmd_context {
     struct clip_ctx * ctx_clip;
     const struct llama_model * text_model;
     std::vector<float> image_embd_v; // image embedding vector
+
     bool print_timings;
     int n_threads;
     std::string image_marker;
+    bool calc_image_hash;
 
     // TODO @ngxson : add timings
 
     mtmd_context(const char * mmproj_fname,
                  const llama_model * text_model,
-                 const mtmd_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
+                 const mtmd_context_params & ctx_params) :
+        print_timings  (ctx_params.print_timings),
+        n_threads      (ctx_params.n_threads),
+        image_marker   (ctx_params.image_marker),
+        calc_image_hash(ctx_params.calc_image_hash)
+    {
         clip_context_params ctx_clip_params;
         ctx_clip_params.use_gpu = ctx_params.use_gpu;
         ctx_clip_params.verbosity = ctx_params.verbosity;
@@ -49,6 +56,7 @@ struct mtmd_image_tokens {
     uint32_t ny; // number of tokens in y direction
     uint32_t n_tokens() const { return nx * ny; }
     clip_image_f32_batch batch_f32; // preprocessed image patches
+    size_t image_hash = 0; // hash of the image, useful for KV cache tracking
 };
 
 mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
@@ -88,6 +96,16 @@ static std::vector<llama_token> mtmd_tokenize_text_internal(
     return result;
 }
 
+static uint64_t hash_vector_float(const std::vector<float> & vec) {
+    uint64_t seed = vec.size();
+    std::hash<float> hasher;
+    for (float val : vec) {
+        // inspired by boost::hash_combine
+        seed ^= hasher(val) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+    }
+    return seed;
+}
+
 mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
                                   const mtmd_input_text & text,
                                   const std::vector<mtmd_bitmap> & bitmaps) {
@@ -153,6 +171,11 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
             image_tokens->ny = 1; // TODO
             image_tokens->batch_f32 = std::move(batch_f32);
 
+            // optionally calculate the hash
+            if (ctx->calc_image_hash) {
+                image_tokens->image_hash = hash_vector_float(image_tokens->batch_f32.entries[0]->buf);
+            }
+
             mtmd_input_chunk chunk{
                 MTMD_INPUT_CHUNK_TYPE_IMAGE,
                 {},
@@ -196,6 +219,10 @@ size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
     return image_tokens->ny;
 }
 
+uint64_t mtmd_image_tokens_get_hash(const mtmd_image_tokens * image_tokens) {
+    return image_tokens->image_hash;
+}
+
 int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
     int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
     ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h
index ca3fb6fdc7960..cadcfa16fdceb 100644
--- a/examples/llava/mtmd.h
+++ b/examples/llava/mtmd.h
@@ -52,6 +52,9 @@ using mtmd_input_chunks = std::vector<mtmd_input_chunk>;
 struct mtmd_context_params {
     bool use_gpu = true;
     bool print_timings = true;
+    // calc_image_hash is useful for tracking KV cache
+    // if not set, mtmd_image_tokens_get_hash will return 0
+    bool calc_image_hash = false;
     int n_threads = 4;
     enum ggml_log_level verbosity = GGML_LOG_LEVEL_INFO;
     const char * image_marker = "<__image__>";
@@ -91,10 +94,11 @@ MTMD_API mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
 MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks, bool free_images);
 
 // access mtmd_image_tokens
-MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
-MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
-MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
-MTMD_API void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);
+MTMD_API size_t   mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
+MTMD_API size_t   mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
+MTMD_API size_t   mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
+MTMD_API uint64_t mtmd_image_tokens_get_hash(const mtmd_image_tokens * image_tokens);
+MTMD_API void     mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);
 
 // returns 0 on success
 MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
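
[Reviewer note, not part of the patch] A sketch of how the hash introduced in PATCH 2/6 could be used for KV cache tracking. Only the mtmd_* calls come from the header above; the cache map and its policy are illustrative:

    // opt in when creating the context, otherwise the hash stays 0
    mtmd_context_params params;
    params.calc_image_hash = true;

    // later, while walking the tokenized chunks (needs <map>):
    std::map<uint64_t, llama_pos> evaluated_images; // hypothetical bookkeeping
    uint64_t h = mtmd_image_tokens_get_hash(chunk.tokens_image);
    if (h != 0 && evaluated_images.count(h) > 0) {
        // same preprocessed image as a previous turn -> its KV cache entries could be reused
    }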
From 58c47674aac9704cfbc2f8e44ebbbe318edc432e Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sat, 12 Apr 2025 10:34:12 +0200
Subject: [PATCH 3/6] shared_ptr for mtmd_image_tokens

---
 examples/llava/gemma3-cli.cpp | 11 +++----
 examples/llava/mtmd.cpp       | 56 +++++++++++++++--------------------
 examples/llava/mtmd.h         | 32 +++++++++----------
 3 files changed, 44 insertions(+), 55 deletions(-)

diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index b200d8f111918..34296c87132b0 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -185,18 +185,19 @@ static int eval_message(gemma3_context & ctx, common_chat_msg & msg, std::vector
     text.text          = formatted_chat.prompt;
     text.add_special   = add_bos;
     text.parse_special = true;
-    mtmd_input_chunks_ptr chunks(mtmd_tokenize(ctx.ctx_vision.get(), text, bitmaps));
-    if (chunks == nullptr) {
-        LOG_ERR("Unable to tokenize prompt\n");
+    mtmd_input_chunks chunks;
+    int32_t res = mtmd_tokenize(ctx.ctx_vision.get(), chunks, text, bitmaps);
+    if (res != 0) {
+        LOG_ERR("Unable to tokenize prompt, res = %d\n", res);
         return 1;
     }
 
-    if (mtmd_helper_eval(ctx.ctx_vision.get(), ctx.lctx, chunks.get(), ctx.n_past, 0, ctx.n_batch)) {
+    if (mtmd_helper_eval(ctx.ctx_vision.get(), ctx.lctx, chunks, ctx.n_past, 0, ctx.n_batch)) {
         LOG_ERR("Unable to eval prompt\n");
         return 1;
     }
 
-    ctx.n_past += mtmd_helper_get_n_tokens(chunks.get());
+    ctx.n_past += mtmd_helper_get_n_tokens(chunks);
 
     return 0;
 }
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 1691a71bf27fc..44e48c7270368 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -106,10 +106,10 @@ static uint64_t hash_vector_float(const std::vector<float> & vec) {
     return seed;
 }
 
-mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
-                                  const mtmd_input_text & text,
-                                  const std::vector<mtmd_bitmap> & bitmaps) {
-    mtmd_input_chunks * output = new mtmd_input_chunks;
+int32_t mtmd_tokenize(mtmd_context * ctx,
+                      std::vector<mtmd_input_chunk> & output,
+                      const mtmd_input_text & text,
+                      const std::vector<mtmd_bitmap> & bitmaps) {
     auto vocab = llama_model_get_vocab(ctx->text_model);
 
     std::string prompt_modified(text.text);
@@ -124,8 +124,8 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
     }
 
     std::vector<std::string> parts = string_split_str(text.text, ctx->image_marker);
-    output->clear();
-    output->reserve(parts.size());
+    output.clear();
+    output.reserve(parts.size());
 
     size_t i_img = 0;
 
@@ -141,14 +141,14 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
             std::move(tokens),
             {},
         };
-        output->emplace_back(std::move(chunk));
+        output.emplace_back(std::move(chunk));
 
         if (&parts.back() != &part) {
             // add image token to middle of 2 parts
 
             if (i_img >= bitmaps.size()) {
                 LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
-                return nullptr;
+                return 1;
             }
 
             // shim layer
@@ -163,10 +163,10 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
             bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), &batch_f32);
             if (!ok) {
                 LOG_ERR("Unable to preprocess image\n");
-                return nullptr;
+                return 2;
             }
 
-            mtmd_image_tokens * image_tokens = new mtmd_image_tokens;
+            mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
             image_tokens->nx = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
             image_tokens->ny = 1; // TODO
             image_tokens->batch_f32 = std::move(batch_f32);
@@ -179,14 +179,14 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
             mtmd_input_chunk chunk{
                 MTMD_INPUT_CHUNK_TYPE_IMAGE,
                 {},
-                image_tokens,
+                std::move(image_tokens),
             };
-            output->emplace_back(std::move(chunk));
+            output.emplace_back(std::move(chunk));
             i_img++;
         }
     }
 
-    return output;
+    return 0;
 }
 
 void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
@@ -195,18 +195,6 @@ void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
     }
 }
 
-void mtmd_input_chunks_free(mtmd_input_chunks * chunks, bool free_images) {
-    if (free_images) {
-        for (auto & chunk : *chunks) {
-            if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE && chunk.tokens_image) {
-                mtmd_image_tokens_free(chunk.tokens_image);
-                chunk.tokens_image = nullptr;
-            }
-        }
-    }
-    delete chunks;
-}
-
 size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
     return image_tokens->n_tokens();
 }
@@ -238,9 +226,9 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
     return ctx->image_embd_v.data();
 }
 
-size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks) {
+size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) {
     size_t n_tokens = 0;
-    for (auto & chunk : *chunks) {
+    for (auto & chunk : chunks) {
         if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
             n_tokens += chunk.tokens_text.size();
         } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
@@ -289,7 +277,7 @@ struct decode_embd_batch {
 
 int32_t mtmd_helper_eval(mtmd_context * ctx,
         llama_context * lctx,
-        mtmd_input_chunks * chunks,
+        mtmd_input_chunks & chunks,
         llama_pos pos0,
        llama_seq_id seq_id,
         int32_t n_batch) {
@@ -297,8 +285,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
     llama_pos n_past = pos0;
     llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
 
-    for (auto & chunk : *chunks) {
-        bool is_last = &chunk == &chunks->back();
+    for (auto & chunk : chunks) {
+        bool is_last = &chunk == &chunks.back();
         if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
             // TODO @ngxson : may need to split into smaller batches
             text_batch.n_tokens = chunk.tokens_text.size();
@@ -327,7 +315,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
             if (ctx->print_timings) {
                 LOG_INF("encoding image...\n");
             }
-            ret = mtmd_encode(ctx, chunk.tokens_image);
+            ret = mtmd_encode(ctx, chunk.tokens_image.get());
             if (ret != 0) {
                 LOG_ERR("failed to encode image\n");
                 llama_batch_free(text_batch);
@@ -337,7 +325,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
                 LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
             }
 
-            int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image);
+            int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
             float * embd = mtmd_get_output_embd(ctx);
             decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
             int64_t t1 = ggml_time_ms();
@@ -395,3 +383,7 @@ bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
     }
     return false;
 }
+
+void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
+    mtmd_image_tokens_free(val);
+}
diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h
index cadcfa16fdceb..f07814a56208c 100644
--- a/examples/llava/mtmd.h
+++ b/examples/llava/mtmd.h
@@ -41,10 +41,15 @@ struct mtmd_bitmap {
     std::vector<unsigned char> data;
 };
 
+struct mtmd_image_tokens_deleter {
+    void operator()(mtmd_image_tokens * val); // forward declaration
+};
+using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens, mtmd_image_tokens_deleter>;
+
 struct mtmd_input_chunk {
     mtmd_input_chunk_type type;
     std::vector<llama_token> tokens_text;
-    mtmd_image_tokens * tokens_image = nullptr;
+    mtmd_image_tokens_ptr tokens_image;
 };
 
 using mtmd_input_chunks = std::vector<mtmd_input_chunk>;
@@ -84,15 +89,16 @@ MTMD_API void mtmd_free(mtmd_context * ctx);
 //   2. (image tokens)
 //   3. "\ndescribe it in detail."
 // number of bitmaps must be equal to the number of image markers in the prompt
-// the returned value must be freed using mtmd_input_chunks_free()
 // this function is thread-safe (shared ctx)
-MTMD_API mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
+// return values:
+//   0 on success
+//   1 on number of images not matching the number of markers
+//   2 on image preprocessing error
+MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
+                               std::vector<mtmd_input_chunk> & output,
                                const mtmd_input_text & text,
                                const std::vector<mtmd_bitmap> & bitmaps);
 
-// if free_images = true, free the image tokens ; otherwise, you must free them using mtmd_image_free()
-MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks, bool free_images);
-
 // access mtmd_image_tokens
 MTMD_API size_t   mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
 MTMD_API size_t   mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
@@ -117,7 +123,7 @@ MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
 //
 
 // helper to count the total number of tokens from a list of chunks, useful to keep track of n_past
-MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks);
+MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks);
 
 // helper function that automatically:
 // 1. run llama_decode() on text chunks
@@ -126,7 +132,7 @@ MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks);
 // otherwise, returns 0 on success
 MTMD_API int32_t mtmd_helper_eval(mtmd_context * ctx,
                                   llama_context * lctx,
-                                  mtmd_input_chunks * chunks,
+                                  mtmd_input_chunks & chunks,
                                   llama_pos pos0,
                                   llama_seq_id seq_id,
                                   int32_t n_batch);
@@ -148,16 +154,6 @@ struct mtmd_context_deleter {
 };
 using mtmd_context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
 
-struct mtmd_input_chunks_deleter {
-    void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val, true); }
-};
-using mtmd_input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;
-
-struct mtmd_image_tokens_deleter {
-    void operator()(mtmd_image_tokens * val) { mtmd_image_tokens_free(val); }
-};
-using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens, mtmd_image_tokens_deleter>;
-
 #else
 
 static_assert(false && "C header is not yet supported by this library");
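
[Reviewer note, not part of the patch] After PATCH 3/6 the tokenizer reports errors through its return value and ownership of the image tokens moves into the chunks themselves, so the caller no longer frees anything explicitly. A sketch mirroring the gemma3-cli.cpp change above (lctx, n_past and n_batch assumed from the caller):

    mtmd_input_chunks chunks; // plain std::vector, lives on the stack
    int32_t res = mtmd_tokenize(ctx, chunks, text, bitmaps);
    if (res != 0) {
        // 1 = not enough bitmaps for the image markers, 2 = image preprocessing failed
        return res;
    }
    if (mtmd_helper_eval(ctx, lctx, chunks, n_past, 0, n_batch) != 0) {
        return 1;
    }
    n_past += mtmd_helper_get_n_tokens(chunks);
    // the chunks destructor releases each mtmd_image_tokens_ptr automatically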
From d3c3e20c424b02fedbef8d2fdddd0061c6255348 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sat, 12 Apr 2025 11:03:38 +0200
Subject: [PATCH 4/6] move hash to user-define ID (fixed)

---
 examples/llava/gemma3-cli.cpp |  1 -
 examples/llava/mtmd.cpp       | 25 +++++--------------------
 examples/llava/mtmd.h         | 14 ++++++--------
 3 files changed, 11 insertions(+), 29 deletions(-)

diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index 34296c87132b0..de206c85ae80c 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -89,7 +89,6 @@ struct gemma3_context {
         ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{
             /* use_gpu */   true,
             /* timings */   true,
-            /* hash */      false,
             /* n_threads */ params.cpuparams.n_threads,
             /* verbosity */ GGML_LOG_LEVEL_INFO,
         }));
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 44e48c7270368..0898439d11d48 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -29,8 +29,7 @@ struct mtmd_context {
                  const mtmd_context_params & ctx_params) :
         print_timings  (ctx_params.print_timings),
         n_threads      (ctx_params.n_threads),
-        image_marker   (ctx_params.image_marker),
-        calc_image_hash(ctx_params.calc_image_hash)
+        image_marker   (ctx_params.image_marker)
     {
         clip_context_params ctx_clip_params;
         ctx_clip_params.use_gpu = ctx_params.use_gpu;
         ctx_clip_params.verbosity = ctx_params.verbosity;
@@ -56,7 +55,7 @@ struct mtmd_image_tokens {
     uint32_t ny; // number of tokens in y direction
     uint32_t n_tokens() const { return nx * ny; }
     clip_image_f32_batch batch_f32; // preprocessed image patches
-    size_t image_hash = 0; // hash of the image, useful for KV cache tracking
+    std::string id; // optional user-defined ID, useful for KV cache tracking
 };
 
 mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
@@ -96,16 +95,6 @@ static std::vector<llama_token> mtmd_tokenize_text_internal(
     return result;
 }
 
-static uint64_t hash_vector_float(const std::vector<float> & vec) {
-    uint64_t seed = vec.size();
-    std::hash<float> hasher;
-    for (float val : vec) {
-        // inspired by boost::hash_combine
-        seed ^= hasher(val) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
-    }
-    return seed;
-}
-
 int32_t mtmd_tokenize(mtmd_context * ctx,
                       std::vector<mtmd_input_chunk> & output,
                       const mtmd_input_text & text,
@@ -170,11 +159,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
             image_tokens->nx = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
             image_tokens->ny = 1; // TODO
             image_tokens->batch_f32 = std::move(batch_f32);
-
-            // optionally calculate the hash
-            if (ctx->calc_image_hash) {
-                image_tokens->image_hash = hash_vector_float(image_tokens->batch_f32.entries[0]->buf);
-            }
+            image_tokens->id = bitmaps[i_img].id; // optional
 
             mtmd_input_chunk chunk{
                 MTMD_INPUT_CHUNK_TYPE_IMAGE,
                 {},
@@ -207,8 +192,8 @@ size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
     return image_tokens->ny;
 }
 
-uint64_t mtmd_image_tokens_get_hash(const mtmd_image_tokens * image_tokens) {
-    return image_tokens->image_hash;
+std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
+    return image_tokens->id;
 }
 
 int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h
index f07814a56208c..78be192dd6eb6 100644
--- a/examples/llava/mtmd.h
+++ b/examples/llava/mtmd.h
@@ -39,6 +39,7 @@ struct mtmd_bitmap {
     uint32_t nx;
     uint32_t ny;
     std::vector<unsigned char> data;
+    std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
 };
 
 struct mtmd_image_tokens_deleter {
@@ -57,9 +58,6 @@ using mtmd_input_chunks = std::vector<mtmd_input_chunk>;
 struct mtmd_context_params {
     bool use_gpu = true;
     bool print_timings = true;
-    // calc_image_hash is useful for tracking KV cache
-    // if not set, mtmd_image_tokens_get_hash will return 0
-    bool calc_image_hash = false;
     int n_threads = 4;
     enum ggml_log_level verbosity = GGML_LOG_LEVEL_INFO;
     const char * image_marker = "<__image__>";
@@ -100,11 +98,11 @@ MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
                                const std::vector<mtmd_bitmap> & bitmaps);
 
 // access mtmd_image_tokens
-MTMD_API size_t   mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
-MTMD_API size_t   mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
-MTMD_API size_t   mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
-MTMD_API uint64_t mtmd_image_tokens_get_hash(const mtmd_image_tokens * image_tokens);
-MTMD_API void     mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);
+MTMD_API size_t      mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
+MTMD_API size_t      mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
+MTMD_API size_t      mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
+MTMD_API std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens);
+MTMD_API void        mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);
 
 // returns 0 on success
 MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
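
[Reviewer note, not part of the patch] With PATCH 4/6 the library no longer hashes anything; the caller decides what identifies an image. A sketch, assuming the application wants to keep a hash-based scheme; hash_of() stands in for whatever fingerprint it prefers:

    mtmd_bitmap bitmap;
    mtmd_helper_bitmap_init_from_file("image.png", bitmap);
    bitmap.id = hash_of(bitmap.data); // any stable string works, e.g. a hex digest

    // ... after mtmd_tokenize() the id travels with the image chunk ...
    std::string id = mtmd_image_tokens_get_id(chunk.tokens_image.get());
    // compare against ids of previously evaluated images to decide on KV cache reuse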
From cd5dc6b860ea76db9e9ed93b60461dff7fea4ff1 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sun, 13 Apr 2025 23:39:03 +0200
Subject: [PATCH 5/6] fix prompt_modified

---
 examples/llava/mtmd.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 0898439d11d48..fe6d769095011 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -112,7 +112,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
         string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
     }
 
-    std::vector<std::string> parts = string_split_str(text.text, ctx->image_marker);
+    std::vector<std::string> parts = string_split_str(prompt_modified, ctx->image_marker);
     output.clear();
     output.reserve(parts.size());
 

From dbb257c14a3fb37c3500f2ae9ce12a59539a3683 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Mon, 14 Apr 2025 19:57:03 +0200
Subject: [PATCH 6/6] rm redundant data member

---
 examples/llava/mtmd.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index fe6d769095011..3fd5bebc6a7d5 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -20,16 +20,15 @@ struct mtmd_context {
     bool print_timings;
     int n_threads;
     std::string image_marker;
-    bool calc_image_hash;
 
     // TODO @ngxson : add timings
 
     mtmd_context(const char * mmproj_fname,
                  const llama_model * text_model,
                  const mtmd_context_params & ctx_params) :
-        print_timings  (ctx_params.print_timings),
-        n_threads      (ctx_params.n_threads),
-        image_marker   (ctx_params.image_marker)
+        print_timings(ctx_params.print_timings),
+        n_threads    (ctx_params.n_threads),
+        image_marker (ctx_params.image_marker)
     {
         clip_context_params ctx_clip_params;
         ctx_clip_params.use_gpu = ctx_params.use_gpu;
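
[Reviewer note, not part of the series] Nothing in the hunks above shows mtmd_helper_eval() toggling the attention mask itself, so a caller honouring mtmd_decode_use_non_causal() from PATCH 1/6 might wrap the image decode like this; llama_set_causal_attn() is the existing toggle in llama.h:

    if (mtmd_decode_use_non_causal(ctx_vision)) {
        llama_set_causal_attn(lctx, false); // image embeddings attend bidirectionally (e.g. Gemma 3)
    }
    // ... llama_decode() over the image embedding batch ...
    llama_set_causal_attn(lctx, true); // restore causal attention for the following text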