feat: support incrementing ref image index (omni-kontext) (leejet#755)

stduhpf · leejet · web-flow · commit c587a43c9930 · 2025-09-07T22:35:16.000+08:00
* kontext: support ref image indices

* lora: support x_embedder

* update help message

* Support for negative indices

* support for OmniControl (offsets at index 0)

* c++11 compat

* add --increase-ref-index option

* simplify the logic and fix some issues

* update README.md

* remove unused variable

---------

Co-authored-by: leejet <leejet714@gmail.com>
diff --git a/README.md b/README.md
@@ -319,6 +319,7 @@ arguments:
   -i, --end-img [IMAGE]              path to the end image, required by flf2v
   --control-image [IMAGE]            path to image condition, control net
   -r, --ref-image [PATH]             reference image for Flux Kontext models (can be used multiple times)
+  --increase-ref-index               automatically increase the indices of reference images based on the order they are listed (starting with 1).
   -o, --output OUTPUT                path to write result image to (default: ./output.png)
   -p, --prompt [PROMPT]              the prompt to render
   -n, --negative-prompt PROMPT       the negative prompt (default: "")
diff --git a/diffusion_model.hpp b/diffusion_model.hpp
@@ -16,6 +16,7 @@ struct DiffusionModel {
                          struct ggml_tensor* y,
                          struct ggml_tensor* guidance,
                          std::vector<ggml_tensor*> ref_latents     = {},
+                         bool increase_ref_index                   = false,
                          int num_video_frames                      = -1,
                          std::vector<struct ggml_tensor*> controls = {},
                          float control_strength                    = 0.f,
@@ -77,6 +78,7 @@ struct UNetModel : public DiffusionModel {
                  struct ggml_tensor* y,
                  struct ggml_tensor* guidance,
                  std::vector<ggml_tensor*> ref_latents     = {},
+                 bool increase_ref_index                   = false,
                  int num_video_frames                      = -1,
                  std::vector<struct ggml_tensor*> controls = {},
                  float control_strength                    = 0.f,
@@ -133,6 +135,7 @@ struct MMDiTModel : public DiffusionModel {
                  struct ggml_tensor* y,
                  struct ggml_tensor* guidance,
                  std::vector<ggml_tensor*> ref_latents     = {},
+                 bool increase_ref_index                   = false,
                  int num_video_frames                      = -1,
                  std::vector<struct ggml_tensor*> controls = {},
                  float control_strength                    = 0.f,
@@ -191,13 +194,14 @@ struct FluxModel : public DiffusionModel {
                  struct ggml_tensor* y,
                  struct ggml_tensor* guidance,
                  std::vector<ggml_tensor*> ref_latents     = {},
+                 bool increase_ref_index                   = false,
                  int num_video_frames                      = -1,
                  std::vector<struct ggml_tensor*> controls = {},
                  float control_strength                    = 0.f,
                  struct ggml_tensor** output               = NULL,
                  struct ggml_context* output_ctx           = NULL,
                  std::vector<int> skip_layers              = std::vector<int>()) {
-        return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, ref_latents, output, output_ctx, skip_layers);
+        return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, ref_latents, increase_ref_index, output, output_ctx, skip_layers);
     }
 };
 
@@ -250,6 +254,7 @@ struct WanModel : public DiffusionModel {
                  struct ggml_tensor* y,
                  struct ggml_tensor* guidance,
                  std::vector<ggml_tensor*> ref_latents     = {},
+                 bool increase_ref_index                   = false,
                  int num_video_frames                      = -1,
                  std::vector<struct ggml_tensor*> controls = {},
                  float control_strength                    = 0.f,
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
@@ -74,6 +74,7 @@ struct SDParams {
     std::string mask_image_path;
     std::string control_image_path;
     std::vector<std::string> ref_image_paths;
+    bool increase_ref_index = false;
 
     std::string prompt;
     std::string negative_prompt;
@@ -156,6 +157,7 @@ void print_params(SDParams params) {
     for (auto& path : params.ref_image_paths) {
         printf("        %s\n", path.c_str());
     };
+    printf("    increase_ref_index:                %s\n", params.increase_ref_index ? "true" : "false");
     printf("    offload_params_to_cpu:             %s\n", params.offload_params_to_cpu ? "true" : "false");
     printf("    clip_on_cpu:                       %s\n", params.clip_on_cpu ? "true" : "false");
     printf("    control_net_cpu:                   %s\n", params.control_net_cpu ? "true" : "false");
@@ -222,6 +224,7 @@ void print_usage(int argc, const char* argv[]) {
     printf("  -i, --end-img [IMAGE]              path to the end image, required by flf2v\n");
     printf("  --control-image [IMAGE]            path to image condition, control net\n");
     printf("  -r, --ref-image [PATH]             reference image for Flux Kontext models (can be used multiple times) \n");
+    printf("  --increase-ref-index               automatically increase the indices of references images based on the order they are listed (starting with 1).\n");
     printf("  -o, --output OUTPUT                path to write result image to (default: ./output.png)\n");
     printf("  -p, --prompt [PROMPT]              the prompt to render\n");
     printf("  -n, --negative-prompt PROMPT       the negative prompt (default: \"\")\n");
@@ -536,6 +539,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         {"", "--color", "", true, &params.color},
         {"", "--chroma-disable-dit-mask", "", false, &params.chroma_use_dit_mask},
         {"", "--chroma-enable-t5-mask", "", true, &params.chroma_use_t5_mask},
+        {"", "--increase-ref-index", "", true, &params.increase_ref_index},
     };
 
     auto on_mode_arg = [&](int argc, const char** argv, int index) {
@@ -1207,6 +1211,7 @@ int main(int argc, const char* argv[]) {
             init_image,
             ref_images.data(),
             (int)ref_images.size(),
+            params.increase_ref_index,
             mask_image,
             params.width,
             params.height,
diff --git a/flux.hpp b/flux.hpp
@@ -960,6 +960,7 @@ namespace Flux {
                                         struct ggml_tensor* y,
                                         struct ggml_tensor* guidance,
                                         std::vector<ggml_tensor*> ref_latents = {},
+                                        bool increase_ref_index               = false,
                                         std::vector<int> skip_layers          = {}) {
             GGML_ASSERT(x->ne[3] == 1);
             struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false);
@@ -999,6 +1000,7 @@ namespace Flux {
                                             x->ne[3],
                                             context->ne[1],
                                             ref_latents,
+                                            increase_ref_index,
                                             flux_params.theta,
                                             flux_params.axes_dim);
             int pos_len = pe_vec.size() / flux_params.axes_dim_sum / 2;
@@ -1035,6 +1037,7 @@ namespace Flux {
                      struct ggml_tensor* y,
                      struct ggml_tensor* guidance,
                      std::vector<ggml_tensor*> ref_latents = {},
+                     bool increase_ref_index               = false,
                      struct ggml_tensor** output           = NULL,
                      struct ggml_context* output_ctx       = NULL,
                      std::vector<int> skip_layers          = std::vector<int>()) {
@@ -1044,7 +1047,7 @@ namespace Flux {
             // y: [N, adm_in_channels] or [1, adm_in_channels]
             // guidance: [N, ]
             auto get_graph = [&]() -> struct ggml_cgraph* {
-                return build_graph(x, timesteps, context, c_concat, y, guidance, ref_latents, skip_layers);
+                return build_graph(x, timesteps, context, c_concat, y, guidance, ref_latents, increase_ref_index, skip_layers);
             };
 
             GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
@@ -1084,7 +1087,7 @@ namespace Flux {
                 struct ggml_tensor* out = NULL;
 
                 int t0 = ggml_time_ms();
-                compute(8, x, timesteps, context, NULL, y, guidance, {}, &out, work_ctx);
+                compute(8, x, timesteps, context, NULL, y, guidance, {}, false, &out, work_ctx);
                 int t1 = ggml_time_ms();
 
                 print_ggml_tensor(out);
diff --git a/lora.hpp b/lora.hpp
@@ -58,6 +58,7 @@ struct LoraModel : public GGMLRunner {
         {"x_block.attn.proj", "attn.to_out.0"},
         {"x_block.attn2.proj", "attn2.to_out.0"},
         // flux
+        {"img_in", "x_embedder"},
         // singlestream
         {"linear2", "proj_out"},
         {"modulation.lin", "norm.linear"},
diff --git a/rope.hpp b/rope.hpp
@@ -156,25 +156,33 @@ struct Rope {
                                                         int patch_size,
                                                         int bs,
                                                         int context_len,
-                                                        std::vector<ggml_tensor*> ref_latents) {
+                                                        std::vector<ggml_tensor*> ref_latents,
+                                                        bool increase_ref_index) {
         auto txt_ids = gen_txt_ids(bs, context_len);
         auto img_ids = gen_img_ids(h, w, patch_size, bs);
 
         auto ids               = concat_ids(txt_ids, img_ids, bs);
         uint64_t curr_h_offset = 0;
         uint64_t curr_w_offset = 0;
+        int index              = 1;
         for (ggml_tensor* ref : ref_latents) {
             uint64_t h_offset = 0;
             uint64_t w_offset = 0;
-            if (ref->ne[1] + curr_h_offset > ref->ne[0] + curr_w_offset) {
-                w_offset = curr_w_offset;
-            } else {
-                h_offset = curr_h_offset;
+            if (!increase_ref_index) {
+                if (ref->ne[1] + curr_h_offset > ref->ne[0] + curr_w_offset) {
+                    w_offset = curr_w_offset;
+                } else {
+                    h_offset = curr_h_offset;
+                }
             }
 
-            auto ref_ids = gen_img_ids(ref->ne[1], ref->ne[0], patch_size, bs, 1, h_offset, w_offset);
+            auto ref_ids = gen_img_ids(ref->ne[1], ref->ne[0], patch_size, bs, index, h_offset, w_offset);
             ids          = concat_ids(ids, ref_ids, bs);
 
+            if (increase_ref_index) {
+                index++;
+            }
+
             curr_h_offset = std::max(curr_h_offset, ref->ne[1] + h_offset);
             curr_w_offset = std::max(curr_w_offset, ref->ne[0] + w_offset);
         }
@@ -188,9 +196,10 @@ struct Rope {
                                           int bs,
                                           int context_len,
                                           std::vector<ggml_tensor*> ref_latents,
+                                          bool increase_ref_index,
                                           int theta,
                                           const std::vector<int>& axes_dim) {
-        std::vector<std::vector<float>> ids = gen_flux_ids(h, w, patch_size, bs, context_len, ref_latents);
+        std::vector<std::vector<float>> ids = gen_flux_ids(h, w, patch_size, bs, context_len, ref_latents, increase_ref_index);
         return embed_nd(ids, bs, theta, axes_dim);
     }
 
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
@@ -775,7 +775,7 @@ class StableDiffusionGGML {
 
         int64_t t0              = ggml_time_ms();
         struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t);
-        diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, {}, -1, {}, 0.f, &out);
+        diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, {}, false, -1, {}, 0.f, &out);
         diffusion_model->free_compute_buffer();
 
         double result = 0.f;
@@ -1032,6 +1032,7 @@ class StableDiffusionGGML {
                         int start_merge_step,
                         SDCondition id_cond,
                         std::vector<ggml_tensor*> ref_latents = {},
+                        bool increase_ref_index               = false,
                         ggml_tensor* denoise_mask             = nullptr) {
         std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);
 
@@ -1126,6 +1127,7 @@ class StableDiffusionGGML {
                                               cond.c_vector,
                                               guidance_tensor,
                                               ref_latents,
+                                              increase_ref_index,
                                               -1,
                                               controls,
                                               control_strength,
@@ -1139,6 +1141,7 @@ class StableDiffusionGGML {
                                               id_cond.c_vector,
                                               guidance_tensor,
                                               ref_latents,
+                                              increase_ref_index,
                                               -1,
                                               controls,
                                               control_strength,
@@ -1160,6 +1163,7 @@ class StableDiffusionGGML {
                                               uncond.c_vector,
                                               guidance_tensor,
                                               ref_latents,
+                                              increase_ref_index,
                                               -1,
                                               controls,
                                               control_strength,
@@ -1177,6 +1181,7 @@ class StableDiffusionGGML {
                                               img_cond.c_vector,
                                               guidance_tensor,
                                               ref_latents,
+                                              increase_ref_index,
                                               -1,
                                               controls,
                                               control_strength,
@@ -1198,6 +1203,7 @@ class StableDiffusionGGML {
                                               cond.c_vector,
                                               guidance_tensor,
                                               ref_latents,
+                                              increase_ref_index,
                                               -1,
                                               controls,
                                               control_strength,
@@ -1710,6 +1716,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
              "\n"
              "batch_count: %d\n"
              "ref_images_count: %d\n"
+             "increase_ref_index: %s\n"
              "control_strength: %.2f\n"
              "style_strength: %.2f\n"
              "normalize_input: %s\n"
@@ -1724,6 +1731,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
              sd_img_gen_params->seed,
              sd_img_gen_params->batch_count,
              sd_img_gen_params->ref_images_count,
+             BOOL_STR(sd_img_gen_params->increase_ref_index),
              sd_img_gen_params->control_strength,
              sd_img_gen_params->style_strength,
              BOOL_STR(sd_img_gen_params->normalize_input),
@@ -1797,6 +1805,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
                                     bool normalize_input,
                                     std::string input_id_images_path,
                                     std::vector<ggml_tensor*> ref_latents,
+                                    bool increase_ref_index,
                                     ggml_tensor* concat_latent = NULL,
                                     ggml_tensor* denoise_mask  = NULL) {
     if (seed < 0) {
@@ -2054,6 +2063,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
                                                      start_merge_step,
                                                      id_cond,
                                                      ref_latents,
+                                                     increase_ref_index,
                                                      denoise_mask);
         // print_ggml_tensor(x_0);
         int64_t sampling_end = ggml_time_ms();
@@ -2304,7 +2314,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
         LOG_INFO("EDIT mode");
     }
 
-    std::vector<struct ggml_tensor*> ref_latents;
+    std::vector<ggml_tensor*> ref_latents;
     for (int i = 0; i < sd_img_gen_params->ref_images_count; i++) {
         ggml_tensor* img = ggml_new_tensor_4d(work_ctx,
                                               GGML_TYPE_F32,
@@ -2359,6 +2369,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
                                                         sd_img_gen_params->normalize_input,
                                                         sd_img_gen_params->input_id_images_path,
                                                         ref_latents,
+                                                        sd_img_gen_params->increase_ref_index,
                                                         concat_latent,
                                                         denoise_mask);
 
diff --git a/stable-diffusion.h b/stable-diffusion.h
@@ -182,6 +182,7 @@ typedef struct {
     sd_image_t init_image;
     sd_image_t* ref_images;
     int ref_images_count;
+    bool increase_ref_index;
     sd_image_t mask_image;
     int width;
     int height;