From a46b6db6844c2d213965d7450a7eb0d2588d88e3 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Fri, 11 Apr 2025 17:46:25 +0200
Subject: [PATCH 1/6] mtmd : add more api around mtmd_image_tokens

---
 examples/llava/mtmd.cpp | 39 ++++++++++++++++++++++++++++++++++-----
 examples/llava/mtmd.h   | 23 ++++++++++++++++++++---
 2 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 114c274bc1250..98d660a643809 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -166,15 +166,36 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
     return output;
 }
 
-void mtmd_input_chunks_free(mtmd_input_chunks * chunks) {
-    for (auto & chunk : *chunks) {
-        if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE && chunk.tokens_image) {
-            delete chunk.tokens_image;
+void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
+    if (image_tokens) {
+        delete image_tokens;
+    }
+}
+
+void mtmd_input_chunks_free(mtmd_input_chunks * chunks, bool free_images) {
+    if (free_images) {
+        for (auto & chunk : *chunks) {
+            if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE && chunk.tokens_image) {
+                mtmd_image_tokens_free(chunk.tokens_image);
+                chunk.tokens_image = nullptr;
+            }
         }
     }
     delete chunks;
 }
 
+size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
+    return image_tokens->n_tokens();
+}
+
+size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) {
+    return image_tokens->nx;
+}
+
+size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
+    return image_tokens->ny;
+}
+
 int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
     int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
     ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
@@ -289,7 +310,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
                 LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
             }
 
-            int32_t n_tokens = chunk.tokens_image->n_tokens();
+            int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image);
             float * embd = mtmd_get_output_embd(ctx);
             decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
             int64_t t1 = ggml_time_ms();
@@ -339,3 +360,11 @@ int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & outp
     std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
     return 0;
 }
+
+bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
+    projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
+    if (proj_type == PROJECTOR_TYPE_GEMMA3) {
+        return true;
+    }
+    return false;
+}
diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h
index 598f6947bb092..ca3fb6fdc7960 100644
--- a/examples/llava/mtmd.h
+++ b/examples/llava/mtmd.h
@@ -81,13 +81,20 @@ MTMD_API void mtmd_free(mtmd_context * ctx);
 //   2. (image tokens)
 //   3. "\ndescribe it in detail."
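
[Reviewer note, not part of the patch] A minimal caller-side sketch of the API after PATCH 1/6, using only the functions declared in mtmd.h above; the surrounding setup (ctx, text, bitmaps) and error handling are assumed to exist elsewhere:

    mtmd_input_chunks * chunks = mtmd_tokenize(ctx, text, bitmaps);
    if (chunks != nullptr) {
        for (auto & chunk : *chunks) {
            if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
                // image geometry is now queried through accessors instead of
                // reaching into the opaque mtmd_image_tokens struct
                size_t nx = mtmd_image_tokens_get_nx(chunk.tokens_image);
                size_t ny = mtmd_image_tokens_get_ny(chunk.tokens_image);
                size_t n_img_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image); // == nx * ny
            }
        }
        // free_images = true also releases the per-chunk mtmd_image_tokens
        mtmd_input_chunks_free(chunks, true);
    }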
 // number of bitmaps must be equal to the number of image markers in the prompt
+// the returned value must be freed using mtmd_input_chunks_free()
 // this function is thread-safe (shared ctx)
 MTMD_API mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
                                            const mtmd_input_text & text,
                                            const std::vector<mtmd_bitmap> & bitmaps);
 
-// free image chunk data
-MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks);
+// if free_images = true, free the image tokens ; otherwise, you must free them using mtmd_image_free()
+MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks, bool free_images);
+
+// access mtmd_image_tokens
+MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
+MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
+MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
+MTMD_API void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);
 
 // returns 0 on success
 MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
@@ -96,6 +103,11 @@ MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
 // get output embeddings from the last encode pass
 MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 
+// whether we need to set non-causal mask before llama_decode
+MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
+
+
+
 //
 // helper functions (can be implemented based on other functions)
 //
@@ -133,10 +145,15 @@ struct mtmd_context_deleter {
 };
 using mtmd_context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
 
 struct mtmd_input_chunks_deleter {
-    void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); }
+    void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val, true); }
 };
 using mtmd_input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;
 
+struct mtmd_image_tokens_deleter {
+    void operator()(mtmd_image_tokens * val) { mtmd_image_tokens_free(val); }
+};
+using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens, mtmd_image_tokens_deleter>;
+
 #else
 
 static_assert(false && "C header is not yet supported by this library");

From 7ac0b7b7b0433eacd8c9cabf3734f092637e6212 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Fri, 11 Apr 2025 22:17:47 +0200
Subject: [PATCH 2/6] mtmd : ability to calc image hash

---
 examples/llava/gemma3-cli.cpp |  1 +
 examples/llava/mtmd.cpp       | 29 ++++++++++++++++++++++++++++-
 examples/llava/mtmd.h         | 12 ++++++++----
 3 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index 91a07e2a8f40d..b200d8f111918 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -89,6 +89,7 @@ struct gemma3_context {
         ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{
             /* use_gpu */   true,
             /* timings */   true,
+            /* hash */      false,
             /* n_threads */ params.cpuparams.n_threads,
             /* verbosity */ GGML_LOG_LEVEL_INFO,
         }));
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 98d660a643809..1691a71bf27fc 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -16,15 +16,22 @@ struct mtmd_context {
     struct clip_ctx * ctx_clip;
     const struct llama_model * text_model;
     std::vector<float> image_embd_v; // image embedding vector
+
     bool print_timings;
     int n_threads;
     std::string image_marker;
+    bool calc_image_hash;
 
     // TODO @ngxson : add timings
 
     mtmd_context(const char * mmproj_fname,
                  const llama_model * text_model,
-                 const mtmd_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
+                 const mtmd_context_params & ctx_params) :
+        print_timings  (ctx_params.print_timings),
+        n_threads      (ctx_params.n_threads),
+        image_marker   (ctx_params.image_marker),
+        calc_image_hash(ctx_params.calc_image_hash)
+    {
         clip_context_params ctx_clip_params;
         ctx_clip_params.use_gpu = ctx_params.use_gpu;
         ctx_clip_params.verbosity = ctx_params.verbosity;
@@ -49,6 +56,7 @@ struct mtmd_image_tokens {
     uint32_t ny; // number of tokens in y direction
     uint32_t n_tokens() const { return nx * ny; }
     clip_image_f32_batch batch_f32; // preprocessed image patches
+    size_t image_hash = 0; // hash of the image, useful for KV cache tracking
 };
 
 mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
@@ -88,6 +96,16 @@ static std::vector<llama_token> mtmd_tokenize_text_internal(
     return result;
 }
 
+static uint64_t hash_vector_float(const std::vector<float> & vec) {
+    uint64_t seed = vec.size();
+    std::hash<float> hasher;
+    for (float val : vec) {
+        // inspired by boost::hash_combine
+        seed ^= hasher(val) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+    }
+    return seed;
+}
+
 mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
                                   const mtmd_input_text & text,
                                   const std::vector<mtmd_bitmap> & bitmaps) {
@@ -153,6 +171,11 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
             image_tokens->ny = 1; // TODO
             image_tokens->batch_f32 = std::move(batch_f32);
 
+            // optionally calculate the hash
+            if (ctx->calc_image_hash) {
+                image_tokens->image_hash = hash_vector_float(image_tokens->batch_f32.entries[0]->buf);
+            }
+
             mtmd_input_chunk chunk{
                 MTMD_INPUT_CHUNK_TYPE_IMAGE,
                 {},
@@ -196,6 +219,10 @@ size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
     return image_tokens->ny;
 }
 
+uint64_t mtmd_image_tokens_get_hash(const mtmd_image_tokens * image_tokens) {
+    return image_tokens->image_hash;
+}
+
 int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
     int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
     ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h
index ca3fb6fdc7960..cadcfa16fdceb 100644
--- a/examples/llava/mtmd.h
+++ b/examples/llava/mtmd.h
@@ -52,6 +52,9 @@ using mtmd_input_chunks = std::vector<mtmd_input_chunk>;
 struct mtmd_context_params {
     bool use_gpu = true;
     bool print_timings = true;
+    // calc_image_hash is useful for tracking KV cache
+    // if not set, mtmd_image_tokens_get_hash will return 0
+    bool calc_image_hash = false;
     int n_threads = 4;
     enum ggml_log_level verbosity = GGML_LOG_LEVEL_INFO;
     const char * image_marker = "<__image__>";
@@ -91,10 +94,11 @@ MTMD_API mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
 MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks, bool free_images);
 
 // access mtmd_image_tokens
-MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
-MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
-MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
-MTMD_API void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);
+MTMD_API size_t   mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
+MTMD_API size_t   mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
+MTMD_API size_t   mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
+MTMD_API uint64_t mtmd_image_tokens_get_hash(const mtmd_image_tokens * image_tokens);
+MTMD_API void     mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);
 
 // returns 0 on success
 MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
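
[Reviewer note, not part of the patch] A sketch of how the hash introduced in PATCH 2/6 could be used for KV cache tracking. Only the mtmd_* calls come from the header above; the cache map and its policy are illustrative:

    // opt in when creating the context, otherwise the hash stays 0
    mtmd_context_params params;
    params.calc_image_hash = true;

    // later, while walking the tokenized chunks (needs <map>):
    std::map<uint64_t, llama_pos> evaluated_images; // hypothetical bookkeeping
    uint64_t h = mtmd_image_tokens_get_hash(chunk.tokens_image);
    if (h != 0 && evaluated_images.count(h) > 0) {
        // same preprocessed image as a previous turn -> its KV cache entries could be reused
    }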
From 58c47674aac9704cfbc2f8e44ebbbe318edc432e Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sat, 12 Apr 2025 10:34:12 +0200
Subject: [PATCH 3/6] shared_ptr for mtmd_image_tokens

---
 examples/llava/gemma3-cli.cpp | 11 +++----
 examples/llava/mtmd.cpp       | 56 +++++++++++++++--------------------
 examples/llava/mtmd.h         | 32 +++++++++----------
 3 files changed, 44 insertions(+), 55 deletions(-)

diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index b200d8f111918..34296c87132b0 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -185,18 +185,19 @@ static int eval_message(gemma3_context & ctx, common_chat_msg & msg, std::vector
     text.text          = formatted_chat.prompt;
     text.add_special   = add_bos;
     text.parse_special = true;
-    mtmd_input_chunks_ptr chunks(mtmd_tokenize(ctx.ctx_vision.get(), text, bitmaps));
-    if (chunks == nullptr) {
-        LOG_ERR("Unable to tokenize prompt\n");
+    mtmd_input_chunks chunks;
+    int32_t res = mtmd_tokenize(ctx.ctx_vision.get(), chunks, text, bitmaps);
+    if (res != 0) {
+        LOG_ERR("Unable to tokenize prompt, res = %d\n", res);
         return 1;
     }
 
-    if (mtmd_helper_eval(ctx.ctx_vision.get(), ctx.lctx, chunks.get(), ctx.n_past, 0, ctx.n_batch)) {
+    if (mtmd_helper_eval(ctx.ctx_vision.get(), ctx.lctx, chunks, ctx.n_past, 0, ctx.n_batch)) {
         LOG_ERR("Unable to eval prompt\n");
         return 1;
     }
 
-    ctx.n_past += mtmd_helper_get_n_tokens(chunks.get());
+    ctx.n_past += mtmd_helper_get_n_tokens(chunks);
 
     return 0;
 }
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 1691a71bf27fc..44e48c7270368 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -106,10 +106,10 @@ static uint64_t hash_vector_float(const std::vector<float> & vec) {
     return seed;
 }
 
-mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
-                                  const mtmd_input_text & text,
-                                  const std::vector<mtmd_bitmap> & bitmaps) {
-    mtmd_input_chunks * output = new mtmd_input_chunks;
+int32_t mtmd_tokenize(mtmd_context * ctx,
+                      std::vector<mtmd_input_chunk> & output,
+                      const mtmd_input_text & text,
+                      const std::vector<mtmd_bitmap> & bitmaps) {
     auto vocab = llama_model_get_vocab(ctx->text_model);
 
     std::string prompt_modified(text.text);
@@ -124,8 +124,8 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
     }
 
     std::vector<std::string> parts = string_split_str(text.text, ctx->image_marker);
-    output->clear();
-    output->reserve(parts.size());
+    output.clear();
+    output.reserve(parts.size());
 
     size_t i_img = 0;
 
@@ -141,14 +141,14 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
             std::move(tokens),
             {},
         };
-        output->emplace_back(std::move(chunk));
+        output.emplace_back(std::move(chunk));
 
         if (&parts.back() != &part) {
             // add image token to middle of 2 parts
 
             if (i_img >= bitmaps.size()) {
                 LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
-                return nullptr;
+                return 1;
             }
 
             // shim layer
@@ -163,10 +163,10 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
             bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), &batch_f32);
             if (!ok) {
                 LOG_ERR("Unable to preprocess image\n");
-                return nullptr;
+                return 2;
             }
 
-            mtmd_image_tokens * image_tokens = new mtmd_image_tokens;
+            mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
             image_tokens->nx = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
             image_tokens->ny = 1; // TODO
             image_tokens->batch_f32 = std::move(batch_f32);
@@ -179,14 +179,14 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
             mtmd_input_chunk chunk{
                 MTMD_INPUT_CHUNK_TYPE_IMAGE,
                 {},
-                image_tokens,
+                std::move(image_tokens),
             };
-            output->emplace_back(std::move(chunk));
+            output.emplace_back(std::move(chunk));
             i_img++;
         }
     }
 
-    return output;
+    return 0;
 }
 
 void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
@@ -195,18 +195,6 @@ void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
     }
 }
 
-void mtmd_input_chunks_free(mtmd_input_chunks * chunks, bool free_images) {
-    if (free_images) {
-        for (auto & chunk : *chunks) {
-            if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE && chunk.tokens_image) {
-                mtmd_image_tokens_free(chunk.tokens_image);
-                chunk.tokens_image = nullptr;
-            }
-        }
-    }
-    delete chunks;
-}
-
 size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
     return image_tokens->n_tokens();
 }
@@ -238,9 +226,9 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
     return ctx->image_embd_v.data();
 }
 
-size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks) {
+size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) {
     size_t n_tokens = 0;
-    for (auto & chunk : *chunks) {
+    for (auto & chunk : chunks) {
         if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
             n_tokens += chunk.tokens_text.size();
         } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
@@ -289,7 +277,7 @@ struct decode_embd_batch {
 
 int32_t mtmd_helper_eval(mtmd_context * ctx,
         llama_context * lctx,
-        mtmd_input_chunks * chunks,
+        mtmd_input_chunks & chunks,
         llama_pos pos0,
        llama_seq_id seq_id,
         int32_t n_batch) {
@@ -297,8 +285,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
     llama_pos n_past = pos0;
     llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
 
-    for (auto & chunk : *chunks) {
-        bool is_last = &chunk == &chunks->back();
+    for (auto & chunk : chunks) {
+        bool is_last = &chunk == &chunks.back();
         if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
             // TODO @ngxson : may need to split into smaller batches
             text_batch.n_tokens = chunk.tokens_text.size();
@@ -327,7 +315,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
             if (ctx->print_timings) {
                 LOG_INF("encoding image...\n");
             }
-            ret = mtmd_encode(ctx, chunk.tokens_image);
+            ret = mtmd_encode(ctx, chunk.tokens_image.get());
             if (ret != 0) {
                 LOG_ERR("failed to encode image\n");
                 llama_batch_free(text_batch);
@@ -337,7 +325,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
                 LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
             }
 
-            int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image);
+            int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
             float * embd = mtmd_get_output_embd(ctx);
             decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
             int64_t t1 = ggml_time_ms();
@@ -395,3 +383,7 @@ bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
     }
     return false;
 }
+
+void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
+    mtmd_image_tokens_free(val);
+}
diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h
index cadcfa16fdceb..f07814a56208c 100644
--- a/examples/llava/mtmd.h
+++ b/examples/llava/mtmd.h
@@ -41,10 +41,15 @@ struct mtmd_bitmap {
     std::vector<unsigned char> data;
 };
 
+struct mtmd_image_tokens_deleter {
+    void operator()(mtmd_image_tokens * val); // forward declaration
+};
+using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens, mtmd_image_tokens_deleter>;
+
 struct mtmd_input_chunk {
     mtmd_input_chunk_type type;
     std::vector<llama_token> tokens_text;
-    mtmd_image_tokens * tokens_image = nullptr;
+    mtmd_image_tokens_ptr tokens_image;
 };
 
 using mtmd_input_chunks = std::vector<mtmd_input_chunk>;
@@ -84,15 +89,16 @@ MTMD_API void mtmd_free(mtmd_context * ctx);
 //   2. (image tokens)
 //   3. "\ndescribe it in detail."
 // number of bitmaps must be equal to the number of image markers in the prompt
-// the returned value must be freed using mtmd_input_chunks_free()
 // this function is thread-safe (shared ctx)
-MTMD_API mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
+// return values:
+//   0 on success
+//   1 on number of images not matching the number of markers
+//   2 on image preprocessing error
+MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
+                               std::vector<mtmd_input_chunk> & output,
                                const mtmd_input_text & text,
                                const std::vector<mtmd_bitmap> & bitmaps);
 
-// if free_images = true, free the image tokens ; otherwise, you must free them using mtmd_image_free()
-MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks, bool free_images);
-
 // access mtmd_image_tokens
 MTMD_API size_t   mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
 MTMD_API size_t   mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
@@ -117,7 +123,7 @@ MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
 //
 
 // helper to count the total number of tokens from a list of chunks, useful to keep track of n_past
-MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks);
+MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks);
 
 // helper function that automatically:
 // 1. run llama_decode() on text chunks
@@ -126,7 +132,7 @@ MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks);
 // otherwise, returns 0 on success
 MTMD_API int32_t mtmd_helper_eval(mtmd_context * ctx,
                                   llama_context * lctx,
-                                  mtmd_input_chunks * chunks,
+                                  mtmd_input_chunks & chunks,
                                   llama_pos pos0,
                                   llama_seq_id seq_id,
                                   int32_t n_batch);
@@ -148,16 +154,6 @@ struct mtmd_context_deleter {
 };
 using mtmd_context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
 
-struct mtmd_input_chunks_deleter {
-    void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val, true); }
-};
-using mtmd_input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;
-
-struct mtmd_image_tokens_deleter {
-    void operator()(mtmd_image_tokens * val) { mtmd_image_tokens_free(val); }
-};
-using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens, mtmd_image_tokens_deleter>;
-
 #else
 
 static_assert(false && "C header is not yet supported by this library");
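
[Reviewer note, not part of the patch] After PATCH 3/6 the tokenizer reports errors through its return value and ownership of the image tokens moves into the chunks themselves, so the caller no longer frees anything explicitly. A sketch mirroring the gemma3-cli.cpp change above (lctx, n_past and n_batch assumed from the caller):

    mtmd_input_chunks chunks; // plain std::vector, lives on the stack
    int32_t res = mtmd_tokenize(ctx, chunks, text, bitmaps);
    if (res != 0) {
        // 1 = not enough bitmaps for the image markers, 2 = image preprocessing failed
        return res;
    }
    if (mtmd_helper_eval(ctx, lctx, chunks, n_past, 0, n_batch) != 0) {
        return 1;
    }
    n_past += mtmd_helper_get_n_tokens(chunks);
    // the chunks destructor releases each mtmd_image_tokens_ptr automatically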
From d3c3e20c424b02fedbef8d2fdddd0061c6255348 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sat, 12 Apr 2025 11:03:38 +0200
Subject: [PATCH 4/6] move hash to user-define ID (fixed)

---
 examples/llava/gemma3-cli.cpp |  1 -
 examples/llava/mtmd.cpp       | 25 +++++--------------------
 examples/llava/mtmd.h         | 14 ++++++--------
 3 files changed, 11 insertions(+), 29 deletions(-)

diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index 34296c87132b0..de206c85ae80c 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -89,7 +89,6 @@ struct gemma3_context {
         ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{
             /* use_gpu */   true,
             /* timings */   true,
-            /* hash */      false,
             /* n_threads */ params.cpuparams.n_threads,
             /* verbosity */ GGML_LOG_LEVEL_INFO,
         }));
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 44e48c7270368..0898439d11d48 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -29,8 +29,7 @@ struct mtmd_context {
                  const mtmd_context_params & ctx_params) :
         print_timings  (ctx_params.print_timings),
         n_threads      (ctx_params.n_threads),
-        image_marker   (ctx_params.image_marker),
-        calc_image_hash(ctx_params.calc_image_hash)
+        image_marker   (ctx_params.image_marker)
     {
         clip_context_params ctx_clip_params;
         ctx_clip_params.use_gpu = ctx_params.use_gpu;
         ctx_clip_params.verbosity = ctx_params.verbosity;
@@ -56,7 +55,7 @@ struct mtmd_image_tokens {
     uint32_t ny; // number of tokens in y direction
     uint32_t n_tokens() const { return nx * ny; }
     clip_image_f32_batch batch_f32; // preprocessed image patches
-    size_t image_hash = 0; // hash of the image, useful for KV cache tracking
+    std::string id; // optional user-defined ID, useful for KV cache tracking
 };
 
 mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
@@ -96,16 +95,6 @@ static std::vector<llama_token> mtmd_tokenize_text_internal(
     return result;
 }
 
-static uint64_t hash_vector_float(const std::vector<float> & vec) {
-    uint64_t seed = vec.size();
-    std::hash<float> hasher;
-    for (float val : vec) {
-        // inspired by boost::hash_combine
-        seed ^= hasher(val) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
-    }
-    return seed;
-}
-
 int32_t mtmd_tokenize(mtmd_context * ctx,
                       std::vector<mtmd_input_chunk> & output,
                       const mtmd_input_text & text,
@@ -170,11 +159,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
             image_tokens->nx = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
             image_tokens->ny = 1; // TODO
             image_tokens->batch_f32 = std::move(batch_f32);
-
-            // optionally calculate the hash
-            if (ctx->calc_image_hash) {
-                image_tokens->image_hash = hash_vector_float(image_tokens->batch_f32.entries[0]->buf);
-            }
+            image_tokens->id = bitmaps[i_img].id; // optional
 
             mtmd_input_chunk chunk{
                 MTMD_INPUT_CHUNK_TYPE_IMAGE,
                 {},
@@ -207,8 +192,8 @@ size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
     return image_tokens->ny;
 }
 
-uint64_t mtmd_image_tokens_get_hash(const mtmd_image_tokens * image_tokens) {
-    return image_tokens->image_hash;
+std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
+    return image_tokens->id;
 }
 
 int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h
index f07814a56208c..78be192dd6eb6 100644
--- a/examples/llava/mtmd.h
+++ b/examples/llava/mtmd.h
@@ -39,6 +39,7 @@ struct mtmd_bitmap {
     uint32_t nx;
     uint32_t ny;
     std::vector<unsigned char> data;
+    std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
 };
 
 struct mtmd_image_tokens_deleter {
@@ -57,9 +58,6 @@ using mtmd_input_chunks = std::vector<mtmd_input_chunk>;
 struct mtmd_context_params {
     bool use_gpu = true;
     bool print_timings = true;
-    // calc_image_hash is useful for tracking KV cache
-    // if not set, mtmd_image_tokens_get_hash will return 0
-    bool calc_image_hash = false;
     int n_threads = 4;
     enum ggml_log_level verbosity = GGML_LOG_LEVEL_INFO;
     const char * image_marker = "<__image__>";
@@ -100,11 +98,11 @@ MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
                                const std::vector<mtmd_bitmap> & bitmaps);
 
 // access mtmd_image_tokens
-MTMD_API size_t   mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
-MTMD_API size_t   mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
-MTMD_API size_t   mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
-MTMD_API uint64_t mtmd_image_tokens_get_hash(const mtmd_image_tokens * image_tokens);
-MTMD_API void     mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);
+MTMD_API size_t      mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
+MTMD_API size_t      mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
+MTMD_API size_t      mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
+MTMD_API std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens);
+MTMD_API void        mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);
 
 // returns 0 on success
 MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
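
[Reviewer note, not part of the patch] With PATCH 4/6 the library no longer hashes anything; the caller decides what identifies an image. A sketch, assuming the application wants to keep a hash-based scheme; hash_of() stands in for whatever fingerprint it prefers:

    mtmd_bitmap bitmap;
    mtmd_helper_bitmap_init_from_file("image.png", bitmap);
    bitmap.id = hash_of(bitmap.data); // any stable string works, e.g. a hex digest

    // ... after mtmd_tokenize() the id travels with the image chunk ...
    std::string id = mtmd_image_tokens_get_id(chunk.tokens_image.get());
    // compare against ids of previously evaluated images to decide on KV cache reuse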
From cd5dc6b860ea76db9e9ed93b60461dff7fea4ff1 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sun, 13 Apr 2025 23:39:03 +0200
Subject: [PATCH 5/6] fix prompt_modified

---
 examples/llava/mtmd.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 0898439d11d48..fe6d769095011 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -112,7 +112,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
         string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
     }
 
-    std::vector<std::string> parts = string_split_str(text.text, ctx->image_marker);
+    std::vector<std::string> parts = string_split_str(prompt_modified, ctx->image_marker);
     output.clear();
     output.reserve(parts.size());
 

From dbb257c14a3fb37c3500f2ae9ce12a59539a3683 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Mon, 14 Apr 2025 19:57:03 +0200
Subject: [PATCH 6/6] rm redundant data member

---
 examples/llava/mtmd.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index fe6d769095011..3fd5bebc6a7d5 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -20,16 +20,15 @@ struct mtmd_context {
     bool print_timings;
     int n_threads;
     std::string image_marker;
-    bool calc_image_hash;
 
     // TODO @ngxson : add timings
 
     mtmd_context(const char * mmproj_fname,
                  const llama_model * text_model,
                  const mtmd_context_params & ctx_params) :
-        print_timings  (ctx_params.print_timings),
-        n_threads      (ctx_params.n_threads),
-        image_marker   (ctx_params.image_marker)
+        print_timings(ctx_params.print_timings),
+        n_threads    (ctx_params.n_threads),
+        image_marker (ctx_params.image_marker)
     {
         clip_context_params ctx_clip_params;
         ctx_clip_params.use_gpu = ctx_params.use_gpu;
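
[Reviewer note, not part of the series] Nothing in the hunks above shows mtmd_helper_eval() toggling the attention mask itself, so a caller honouring mtmd_decode_use_non_causal() from PATCH 1/6 might wrap the image decode like this; llama_set_causal_attn() is the existing toggle in llama.h:

    if (mtmd_decode_use_non_causal(ctx_vision)) {
        llama_set_causal_attn(lctx, false); // image embeddings attend bidirectionally (e.g. Gemma 3)
    }
    // ... llama_decode() over the image embedding batch ...
    llama_set_causal_attn(lctx, true); // restore causal attention for the following text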