44 changes: 29 additions & 15 deletions src/cpp/src/rag/text_embedding_pipeline.cpp
@@ -211,32 +211,47 @@ class TextEmbeddingPipeline::TextEmbeddingPipelineImpl {

auto model = core.read_model(models_path / "openvino_model.xml", {}, properties);

const bool should_reshape = m_config.batch_size.has_value() || m_config.max_length.has_value();
if (should_reshape) {
reshape_model(model);
}

if (device == "NPU") {
OPENVINO_ASSERT(!model->is_dynamic(),
"NPU device does not support dynamic shapes. In order to fix model shape, set batch_size, "
"max_length and pad_to_max_length in the configuration.");
}

model = apply_postprocessing(model, m_config);

bool is_seq_len_fixed = true;
if (m_config.max_length) {
m_tokenization_params.insert({max_length.name(), *m_config.max_length});
} else {
is_seq_len_fixed = false;
}

if (m_config.pad_to_max_length) {
m_tokenization_params.insert({pad_to_max_length.name(), *m_config.pad_to_max_length});
is_seq_len_fixed &= m_config.pad_to_max_length.value();
} else {
is_seq_len_fixed = false;
}

if (m_config.padding_side) {
m_tokenization_params.insert({padding_side.name(), *m_config.padding_side});
}

ov::CompiledModel compiled_model = core.compile_model(model, device, properties);
bool should_reshape_non_npu =
(device != "NPU" && (m_config.batch_size.has_value() || m_config.max_length.has_value()));
bool should_reshape_npu = (device == "NPU" && m_config.batch_size.has_value() && is_seq_len_fixed);
if (should_reshape_non_npu || should_reshape_npu) {
reshape_model(model);
}

ov::CompiledModel compiled_model;
if (device == "NPU" && model->is_dynamic()) {
bool is_padding_on_left = m_config.padding_side.has_value() && m_config.padding_side.value() == "left";
if (is_padding_on_left && is_seq_len_fixed &&
config.pooling_type != TextEmbeddingPipeline::PoolingType::MEAN) {
OPENVINO_THROW("Padding on left is only supported for the mean post-processing type");
}

auto kv_pos = ov::genai::utils::get_kv_axes_pos(model);
utils::KVDesc kv_desc;
std::tie(compiled_model, kv_desc) =
utils::compile_decoder_for_npu_text_embedding(model, properties, kv_pos, m_config);
} else {
model = apply_postprocessing(model, m_config);
compiled_model = core.compile_model(model, device, properties);
}

utils::print_compiled_model_properties(compiled_model, "text embedding model");
m_request = compiled_model.create_infer_request();
@@ -383,7 +398,6 @@ class TextEmbeddingPipeline::TextEmbeddingPipelineImpl {

std::vector<std::vector<float>> result;
const auto shape = last_hidden_state.get_shape();

const size_t batch_size = shape[0];
const size_t hidden_size = shape[1];

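For reference, a minimal usage sketch (not part of this diff) of a configuration that exercises the new NPU handling above: setting `max_length` together with `pad_to_max_length` fixes the sequence length, so the model is either reshaped statically (when `batch_size` is also given) or compiled through the NPUW decoder path with `max_length` as the maximum prompt length; left padding is accepted with a fixed sequence length only for MEAN pooling. The Python constructor call, model path, and `embed_documents` call are assumptions; the `Config` fields mirror the ones used in `ov_utils.py` below.

```python
import openvino_genai

# Hypothetical sketch: fix the sequence length so the NPU path in this PR
# can either reshape the model or pass max_length as MAX_PROMPT_LEN to NPUW.
config = openvino_genai.TextEmbeddingPipeline.Config()
config.pooling_type = openvino_genai.TextEmbeddingPipeline.PoolingType.MEAN
config.max_length = 512           # fixed sequence length...
config.pad_to_max_length = True   # ...only when padding to max_length is enabled
config.padding_side = "left"      # left padding with a fixed length needs MEAN pooling
config.normalize = True

# Constructor signature and model path are assumed for illustration.
pipeline = openvino_genai.TextEmbeddingPipeline("models/bge-small-en-v1.5", "NPU", config)
embeddings = pipeline.embed_documents(["OpenVINO GenAI text embedding on NPU"])
```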
167 changes: 129 additions & 38 deletions src/cpp/src/utils.cpp
@@ -116,6 +116,25 @@ void update_npu_config_whisper(ov::AnyMap& config,
update_config(config, {"NPUW_LLM_PREFILL_HINT", "STATIC"});
}

void update_npu_config_text_embedding(ov::AnyMap& config,
const ov::genai::utils::KVAxesPosition& kv_pos,
const ov::genai::utils::KVDesc& kv_desc,
const std::string& post_type,
const bool is_to_normalize) {
update_config(config, {"NPU_USE_NPUW", "YES"});
update_config(config, {"NPUW_LLM", "YES"});
update_config(config, {"NPUW_LLM_BATCH_DIM", kv_pos.batch});
update_config(config, {"NPUW_LLM_SEQ_LEN_DIM", kv_pos.seq_len});

update_config(config, {"NPUW_LLM_MAX_PROMPT_LEN", kv_desc.max_prompt_len});
update_config(config, {"NPUW_LLM_MIN_RESPONSE_LEN", kv_desc.min_response_len});
update_config(config, {"NPUW_LLM_SHARED_HEAD", "NO"});

update_config(config, {"NPUW_TEXT_EMBED", "YES"});
update_config(config, {"NPUW_TEXT_EMBED_POST_TYPE", post_type});
update_config(config, {"NPUW_TEXT_EMBED_NORMALIZE", is_to_normalize});
}

inline bool is_paged_attention_available() {
#if defined(OPENVINO_ARCH_X86_64) || defined(OPENVINO_ARCH_ARM64)
return true;
@@ -130,6 +149,8 @@ namespace ov {
namespace genai {
namespace utils {

enum class ModelType { Default, Whisper, TextEmbedding };

Tensor init_attention_mask(const Tensor& input_ids) {
auto shape = input_ids.get_shape();
auto attention_mask = ov::Tensor{input_ids.get_element_type(), shape};
@@ -570,11 +591,84 @@ void print_scheduler_config_info(const SchedulerConfig &scheduler_config) {
std::cout << scheduler_config.to_string() << std::endl;
}

std::pair<ov::CompiledModel, KVDesc>
compile_decoder_for_npu(const std::shared_ptr<ov::Model>& model,
const ov::AnyMap& config,
const KVAxesPosition& kv_pos,
const bool is_whisper) {
void import_npu_model(ov::CompiledModel& compiled,
KVDesc& kv_desc,
const ov::AnyMap& config,
const std::string& blob_path) {
if (!std::filesystem::exists(blob_path)) {
OPENVINO_THROW("Blob file is not found at: " + blob_path);
}
std::ifstream fin(blob_path, std::ios::in | std::ios::binary);
if (!fin.is_open()) {
OPENVINO_THROW("Blob file can't be opened: " + blob_path);
}
compiled = ov::genai::utils::singleton_core().import_model(fin, "NPU", config);
kv_desc.max_prompt_len = compiled.get_property("NPUW_LLM_MAX_PROMPT_LEN").as<uint32_t>();
kv_desc.min_response_len = compiled.get_property("NPUW_LLM_MIN_RESPONSE_LEN").as<uint32_t>();
}

void export_npu_model(ov::CompiledModel& compiled, const std::string& blob_path) {
// Check the path is full
const int EXT_SIZE = 5; // ".blob"
if (blob_path.size() < EXT_SIZE) {
OPENVINO_THROW("Please provide a full path to blob file in BLOB_PATH: " + blob_path);
}
if (strncmp(&blob_path[blob_path.size() - EXT_SIZE], ".blob", EXT_SIZE) != 0) {
OPENVINO_THROW("Please provide a full path to blob file in BLOB_PATH: " + blob_path);
}
std::ofstream fout(blob_path, std::ios::out | std::ios::binary);
if (!fout.is_open()) {
OPENVINO_THROW("Blob file can't be exported to: " + blob_path);
}
compiled.export_model(fout);
}

void get_npu_model_config(ov::AnyMap& properties,
const KVAxesPosition& kv_pos,
KVDesc& kv_desc,
const bool is_whisper) {
if (is_whisper) {
kv_desc.max_prompt_len = pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(4u);
// kvcache size for Whisper = 448u (MAX_PROMPT_LEN + MIN_RESPONSE_LEN)
kv_desc.min_response_len = pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(444u);
update_npu_config_whisper(properties, kv_pos, kv_desc);
} else {
kv_desc.max_prompt_len = pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(1024u);
kv_desc.min_response_len = pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(128u);
update_npu_config(properties, kv_pos, kv_desc);
}
}

std::string get_post_type_string(const TextEmbeddingPipeline::Config& config) {
std::string post_type;
if (config.pooling_type == TextEmbeddingPipeline::PoolingType::CLS) {
post_type = "cls";
} else if (config.pooling_type == TextEmbeddingPipeline::PoolingType::MEAN) {
post_type = "mean";
} else {
post_type = "last_token";
}
return post_type;
}

void get_npu_text_embedding_config(ov::AnyMap& properties,
const KVAxesPosition& kv_pos,
KVDesc& kv_desc,
const TextEmbeddingPipeline::Config& text_embed_config) {
if (text_embed_config.max_length.has_value()) {
kv_desc.max_prompt_len = text_embed_config.max_length.value();
} else {
kv_desc.max_prompt_len = pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(1024u);
}
kv_desc.min_response_len = kv_desc.max_prompt_len;
update_npu_config_text_embedding(properties, kv_pos, kv_desc, get_post_type_string(text_embed_config), text_embed_config.normalize);
}

std::pair<ov::CompiledModel, KVDesc> compile_decoder_for_npu_impl(const std::shared_ptr<ov::Model>& model,
const ov::AnyMap& config,
const KVAxesPosition& kv_pos,
ModelType model_type,
const TextEmbeddingPipeline::Config& text_embed_config = {}) {
ov::CompiledModel compiled;
ov::AnyMap properties = config;
KVDesc kv_desc;
@@ -584,49 +678,46 @@ compile_decoder_for_npu(const std::shared_ptr<ov::Model>& model,
const bool do_import = (!blob_path.empty() && !export_blob);

if (do_import) {
if (!std::filesystem::exists(blob_path)) {
OPENVINO_THROW("Blob file is not found at: " + blob_path);
}
std::ifstream fin(blob_path, std::ios::in | std::ios::binary);
if (!fin.is_open()) {
OPENVINO_THROW("Blob file can't be opened: " + blob_path);
}
compiled = ov::genai::utils::singleton_core().import_model(fin, "NPU", config);
kv_desc.max_prompt_len = compiled.get_property("NPUW_LLM_MAX_PROMPT_LEN").as<uint32_t>();
kv_desc.min_response_len = compiled.get_property("NPUW_LLM_MIN_RESPONSE_LEN").as<uint32_t>();
import_npu_model(compiled, kv_desc, properties, blob_path);
} else {
if (is_whisper) {
kv_desc.max_prompt_len = pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(4u);
// kvcache size for Whisper = 448u (MAX_PROMPT_LEN + MIN_RESPONSE_LEN)
kv_desc.min_response_len = pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(444u);
update_npu_config_whisper(properties, kv_pos, kv_desc);
} else {
kv_desc.max_prompt_len = pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(1024u);
kv_desc.min_response_len = pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(128u);
update_npu_config(properties, kv_pos, kv_desc);
switch (model_type) {
case ModelType::TextEmbedding:
get_npu_text_embedding_config(properties, kv_pos, kv_desc, text_embed_config);
break;
case ModelType::Whisper:
get_npu_model_config(properties, kv_pos, kv_desc, true);
break;
case ModelType::Default:
default:
get_npu_model_config(properties, kv_pos, kv_desc, false);
break;
}

compiled = ov::genai::utils::singleton_core().compile_model(model, "NPU", properties);
// Also export compiled model if required
if (export_blob) {
if (blob_path.empty()) {
blob_path = "openvino_model.blob";
}
// Check the path is full
const int EXT_SIZE = 5; // ".blob"
if (blob_path.size() < EXT_SIZE) {
OPENVINO_THROW("Please provide a full path to blob file in BLOB_PATH: " + blob_path);
}
if (strncmp(".blob", &blob_path[blob_path.size() - EXT_SIZE], EXT_SIZE) != 0) {
OPENVINO_THROW("Please provide a full path to blob file in BLOB_PATH: " + blob_path);
}
std::ofstream fout(blob_path, std::ios::out | std::ios::binary);
if (!fout.is_open()) {
OPENVINO_THROW("Blob file can't be exported to: " + blob_path);
}
compiled.export_model(fout);
export_npu_model(compiled, blob_path);
}
}
return { compiled, kv_desc };

return {compiled, kv_desc};
}

std::pair<ov::CompiledModel, KVDesc> compile_decoder_for_npu(const std::shared_ptr<ov::Model>& model,
const ov::AnyMap& config,
const KVAxesPosition& kv_pos,
const bool is_whisper) {
return compile_decoder_for_npu_impl(model, config, kv_pos, is_whisper ? ModelType::Whisper : ModelType::Default);
}

std::pair<ov::CompiledModel, KVDesc> compile_decoder_for_npu_text_embedding(const std::shared_ptr<ov::Model>& model,
const ov::AnyMap& config,
const KVAxesPosition& kv_pos,
const TextEmbeddingPipeline::Config& text_embed_config) {
return compile_decoder_for_npu_impl(model, config, kv_pos, ModelType::TextEmbedding, text_embed_config);
}

std::optional<ov::Any> pop_option(ov::AnyMap& config, const std::string& option_name) {
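The refactor above splits blob handling into `import_npu_model` and `export_npu_model`, keyed off the `BLOB_PATH` and export options and requiring the path to end in ".blob". A hedged caching sketch, assuming the text embedding pipeline forwards the same `EXPORT_BLOB`/`BLOB_PATH` properties used by the NPU LLM path and that the Python constructor accepts properties as keyword arguments:

```python
import openvino_genai

config = openvino_genai.TextEmbeddingPipeline.Config()
config.max_length = 512
config.pad_to_max_length = True

blob = "models/bge-small-en-v1.5/openvino_model.blob"  # must end in ".blob"

# First run: compile on NPU and export the compiled blob (property names assumed).
pipeline = openvino_genai.TextEmbeddingPipeline(
    "models/bge-small-en-v1.5", "NPU", config, EXPORT_BLOB="YES", BLOB_PATH=blob)

# Later runs: import the blob instead of recompiling (EXPORT_BLOB omitted).
pipeline = openvino_genai.TextEmbeddingPipeline(
    "models/bge-small-en-v1.5", "NPU", config, BLOB_PATH=blob)
```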
6 changes: 6 additions & 0 deletions src/cpp/src/utils.hpp
@@ -9,6 +9,7 @@

#include "openvino/genai/llm_pipeline.hpp"
#include "openvino/genai/visual_language/pipeline.hpp"
#include "openvino/genai/rag/text_embedding_pipeline.hpp"
#include "openvino/runtime/core.hpp"

#include "openvino/genai/generation_handle.hpp"
@@ -196,6 +197,11 @@ std::pair<ov::CompiledModel, KVDesc> compile_decoder_for_npu(const std::shared_p
const KVAxesPosition& kv_pos,
const bool is_whisper = false);

std::pair<ov::CompiledModel, KVDesc> compile_decoder_for_npu_text_embedding(const std::shared_ptr<ov::Model>& model,
const ov::AnyMap& config,
const KVAxesPosition& kv_pos,
const ov::genai::TextEmbeddingPipeline::Config& text_embed_config);

/// @brief SharedOptional is a wrapper around a reference to an existing object and an optional shared alternative value.
/// The difference from std::optional is that the default state is not empty and contains a reference to an existing object outside the class.
/// Another difference is that the alternative value is shared between all instances of SharedOptional like std::shared_ptr.
1 change: 1 addition & 0 deletions tools/llm_bench/benchmark.py
@@ -210,6 +210,7 @@ def get_argprser():
help="Pooling type CLS or MEAN for encoders, LAST_TOKEN for decoders. "
"Different post-processing is applied depending on the padding side. Applicable only for text embeddings")
parser.add_argument("--embedding_normalize", action="store_true", help="Normalize embeddings. Applicable only for text embeddings")
parser.add_argument("--embedding_pad_to_max_length", action="store_true", help="Pad embeddings. Applicable only for text embeddings")
parser.add_argument("--embedding_max_length", type=int, default=None,
help="Max length for text embeddings. Input text will be padded or truncated to specified value")
parser.add_argument("--embedding_padding_side", choices=["left", "right"], default=None,
1 change: 1 addition & 0 deletions tools/llm_bench/llm_bench_utils/model_utils.py
@@ -136,6 +136,7 @@ def analyze_args(args):
model_args['emb_normalize'] = args.embedding_normalize
model_args["emb_max_length"] = args.embedding_max_length
model_args["emb_padding_side"] = args.embedding_padding_side
model_args["emb_pad_to_max_length"] = args.embedding_pad_to_max_length
model_args['rerank_max_length'] = args.reranking_max_length
model_args["rerank_top_n"] = args.reranking_top_n
model_args["rerank_texts"] = args.texts
5 changes: 3 additions & 2 deletions tools/llm_bench/llm_bench_utils/ov_utils.py
@@ -666,7 +666,7 @@ def create_genai_text_embed_model(model_path, device, memory_data_collector, **k

pooling_type = kwargs.get("emb_pooling_type")
max_length = kwargs.get("emb_max_length")
padding_side = kwargs.get("embedding_padding_side")
padding_side = kwargs.get("emb_padding_side")
@sbalandi argument names were not aligned, so the option was lost. Consider reviewing the arguments and potentially introducing types.
@mengweiguo thanks!
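Following the suggestion above, a sketch of what "introduce types" could look like for the embedding options, so the argparse names, the emb_* keys in model_utils.py, and the lookups in ov_utils.py cannot drift apart. The dataclass, the helper name, and the `embedding_pooling_type` attribute are illustrative; the other flag names come from benchmark.py in this PR.

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class EmbeddingArgs:
    """Typed container replacing the loosely matched emb_* dict keys."""
    pooling_type: Optional[str] = None   # "cls" | "mean" | "last_token"
    normalize: bool = False
    max_length: Optional[int] = None
    pad_to_max_length: bool = False
    padding_side: Optional[str] = None   # "left" | "right"

def collect_embedding_args(args) -> EmbeddingArgs:
    # Single translation point from argparse to the typed options.
    return EmbeddingArgs(
        pooling_type=args.embedding_pooling_type,   # flag name assumed
        normalize=args.embedding_normalize,
        max_length=args.embedding_max_length,
        pad_to_max_length=args.embedding_pad_to_max_length,
        padding_side=args.embedding_padding_side,
    )
```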


config = openvino_genai.TextEmbeddingPipeline.Config()
if pooling_type is not None:
@@ -678,7 +678,8 @@ def create_genai_text_embed_model(model_path, device, memory_data_collector, **k
config.pooling_type = openvino_genai.TextEmbeddingPipeline.PoolingType.CLS
if max_length is not None:
config.max_length = max_length
config.pad_to_max_length = True

config.pad_to_max_length = kwargs.get("emb_pad_to_max_length", False)
config.normalize = kwargs.get("emb_normalize", False)
if padding_side:
config.padding_side = padding_side