Add NPUW_TEXT_EMBED_NORMALIZE option to L2-normalize the text embedding output

mengweiguo · mengweiguo · commit f81e8807c692 · 2025-12-05T14:20:01.000+08:00
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -1101,15 +1101,6 @@ std::optional<NPUDesc> extract_npu_descriptor(const std::shared_ptr<const ov::IP
     return std::make_optional(std::move(desc));
 }
 
-std::optional<ov::Any> pop_option(ov::AnyMap& config, const std::string& option_name) {
-    if (auto it = config.find(option_name); it != config.end()) {
-        std::optional<ov::Any> found = std::make_optional(it->second);
-        config.erase(it);
-        return found;
-    }
-    return std::nullopt;
-}
-
 void apply_weights_bank_name(ov::AnyMap& config, const std::string& bank_name) {
     auto it = config.find("NPUW_WEIGHTS_BANK");
     if (it != config.end()) {
@@ -1537,8 +1528,7 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
             ov::npuw::util::prepare_text_embedding_model(kvcache_model, seq_len_dim);
         }
 
-        auto post_type = pop_option(other_props, std::string("NPUW_TEXT_EMBED_POST_TYPE"));
-        ov::npuw::util::create_text_embedding_post_model(kvcache_model, text_embedding_post_model, post_type);
+        ov::npuw::util::create_text_embedding_post_model(kvcache_model, text_embedding_post_model, other_props);
     } else {
         LOG_DEBUG("Transform kvcache model from stateful to stateless.");
         ov::pass::StatefulToStateless().run_on_model(kvcache_model);
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model_utils.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model_utils.cpp
@@ -647,6 +647,15 @@ class CachePositionInput : public ov::pass::MatcherPass {
 #    pragma GCC diagnostic pop
 #endif
 
+std::optional<ov::Any> ov::npuw::util::pop_option(ov::AnyMap& config, const std::string& option_name) {
+    if (auto it = config.find(option_name); it != config.end()) {
+        std::optional<ov::Any> found = std::make_optional(it->second);
+        config.erase(it);
+        return found;
+    }
+    return std::nullopt;
+}
+
 bool ov::npuw::util::has_input(const std::shared_ptr<ov::Model>& model, const std::string& name) {
     auto inputs = model->inputs();
     auto it = std::find_if(inputs.begin(), inputs.end(), [&](const auto& port) {
@@ -837,11 +846,21 @@ std::shared_ptr<ov::op::Op> get_last_token_pooling_op(std::shared_ptr<ov::Model>
     return std::make_shared<op::v8::Gather>(last_hidden_state_node, subtract, one, 1);
 }
 
+std::shared_ptr<ov::op::Op> normalize_output(std::shared_ptr<ov::op::Op> last_hidden_state_node) {
+    using namespace ov;
+
+    auto axis_const = std::make_shared<op::v0::Constant>(ov::element::i32, ov::Shape{1}, std::vector{1});
+    return std::make_shared<op::v0::NormalizeL2>(last_hidden_state_node,
+                                                 axis_const,
+                                                 static_cast<float>(1e-7),
+                                                 op::EpsMode::MAX);
+}
+
 }  // namespace
 
 void ov::npuw::util::create_text_embedding_post_model(std::shared_ptr<ov::Model> model,
                                                       std::shared_ptr<ov::Model>& post_model,
-                                                      std::optional<ov::Any>& post_type_any) {
+                                                      ov::AnyMap& config) {
     auto output_node = model->outputs()[0];
     auto input_param =
         std::make_shared<ov::op::v0::Parameter>(output_node.get_element_type(), output_node.get_partial_shape());
@@ -850,7 +869,9 @@ void ov::npuw::util::create_text_embedding_post_model(std::shared_ptr<ov::Model>
     auto attention_mask = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::PartialShape{-1, -1});
     set_node_name(attention_mask, "attention_mask");
 
-    auto post_type = post_type_any.value_or(std::string("last_token")).as<std::string>();
+    auto post_type_opt = pop_option(config, std::string("NPUW_TEXT_EMBED_POST_TYPE"));
+    auto post_type = post_type_opt.value_or(std::string("last_token")).as<std::string>();
+
     std::shared_ptr<ov::op::Op> post_output;
     if (post_type == "cls") {
         post_output = get_cls_pooling_op(input_param);
@@ -859,9 +880,14 @@ void ov::npuw::util::create_text_embedding_post_model(std::shared_ptr<ov::Model>
     } else if (post_type == "last_token") {
         post_output = get_last_token_pooling_op(model, input_param, attention_mask);
     }
-
     OPENVINO_ASSERT(post_output != nullptr);
 
+    auto is_to_normalize_opt = pop_option(config, std::string("NPUW_TEXT_EMBED_NORMALIZE"));
+    auto is_to_normalize = is_to_normalize_opt.value_or(true).as<bool>();
+    if (is_to_normalize) {
+        post_output = normalize_output(post_output);
+    }
+
     auto result_node = std::make_shared<ov::op::v0::Result>(post_output);
     post_model =
         std::make_shared<ov::Model>(ov::OutputVector{result_node}, ov::ParameterVector{input_param, attention_mask});
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model_utils.hpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model_utils.hpp
@@ -33,14 +33,16 @@ bool optimize_value_tensors(std::shared_ptr<ov::Model> model, bool isPrefill);
 void prepare_text_embedding_model(std::shared_ptr<ov::Model> model, uint32_t seq_len_dim);
 void create_text_embedding_post_model(std::shared_ptr<ov::Model> model,
                                       std::shared_ptr<ov::Model>& post_model,
-                                      std::optional<ov::Any>& post_type);
+                                      ov::AnyMap& config);
 
 std::shared_ptr<ov::Model> prepare_whisper_prefill_model(std::shared_ptr<ov::Model>& model,
                                                          const uint32_t& max_prompt_size,
                                                          const uint32_t& lhs_seq_size);
 
 std::shared_ptr<ov::Model> prepare_whisper_kvcache_model(std::shared_ptr<ov::Model>& model);
 
+std::optional<ov::Any> pop_option(ov::AnyMap& config, const std::string& option_name);
+
 // clang-format off
 }  // namespace ov
 // clang-format on