44 changes: 29 additions & 15 deletions src/cpp/src/rag/text_embedding_pipeline.cpp
@@ -211,32 +211,47 @@ class TextEmbeddingPipeline::TextEmbeddingPipelineImpl {

auto model = core.read_model(models_path / "openvino_model.xml", {}, properties);

const bool should_reshape = m_config.batch_size.has_value() || m_config.max_length.has_value();
if (should_reshape) {
reshape_model(model);
}

if (device == "NPU") {
OPENVINO_ASSERT(!model->is_dynamic(),
"NPU device does not support dynamic shapes. In order to fix model shape, set batch_size, "
"max_length and pad_to_max_length in the configuration.");
}

model = apply_postprocessing(model, m_config);

bool is_seq_len_fixed = true;
if (m_config.max_length) {
m_tokenization_params.insert({max_length.name(), *m_config.max_length});
} else {
is_seq_len_fixed = false;
}

if (m_config.pad_to_max_length) {
m_tokenization_params.insert({pad_to_max_length.name(), *m_config.pad_to_max_length});
is_seq_len_fixed &= m_config.pad_to_max_length.value();
} else {
is_seq_len_fixed = false;
}

if (m_config.padding_side) {
m_tokenization_params.insert({padding_side.name(), *m_config.padding_side});
}

ov::CompiledModel compiled_model = core.compile_model(model, device, properties);
bool should_reshape_non_npu =
(device != "NPU" && (m_config.batch_size.has_value() || m_config.max_length.has_value()));
bool should_reshape_npu = (device == "NPU" && m_config.batch_size.has_value() && is_seq_len_fixed);
if (should_reshape_non_npu || should_reshape_npu) {
reshape_model(model);
}

ov::CompiledModel compiled_model;
if (device == "NPU" && model->is_dynamic()) {
bool is_padding_on_left = m_config.padding_side.has_value() && m_config.padding_side.value() == "left";
if (is_padding_on_left && is_seq_len_fixed &&
config.pooling_type != TextEmbeddingPipeline::PoolingType::MEAN) {
OPENVINO_THROW("Padding on left is only supported for the mean post-processing type");
}

auto kv_pos = ov::genai::utils::get_kv_axes_pos(model);
utils::KVDesc kv_desc;
std::tie(compiled_model, kv_desc) =
utils::compile_decoder_for_npu_text_embedding(model, properties, kv_pos, m_config);
} else {
model = apply_postprocessing(model, m_config);
compiled_model = core.compile_model(model, device, properties);
}

utils::print_compiled_model_properties(compiled_model, "text embedding model");
m_request = compiled_model.create_infer_request();
@@ -383,7 +398,6 @@ class TextEmbeddingPipeline::TextEmbeddingPipelineImpl {

std::vector<std::vector<float>> result;
const auto shape = last_hidden_state.get_shape();

const size_t batch_size = shape[0];
const size_t hidden_size = shape[1];

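For reference, a minimal usage sketch (not part of this diff) of a configuration that exercises the new NPU handling above: setting `max_length` together with `pad_to_max_length` fixes the sequence length, so the model is either reshaped statically (when `batch_size` is also given) or compiled through the NPUW decoder path with `max_length` as the maximum prompt length; left padding is accepted with a fixed sequence length only for MEAN pooling. The Python constructor call, model path, and `embed_documents` call are assumptions; the `Config` fields mirror the ones used in `ov_utils.py` below.

```python
import openvino_genai

# Hypothetical sketch: fix the sequence length so the NPU path in this PR
# can either reshape the model or pass max_length as MAX_PROMPT_LEN to NPUW.
config = openvino_genai.TextEmbeddingPipeline.Config()
config.pooling_type = openvino_genai.TextEmbeddingPipeline.PoolingType.MEAN
config.max_length = 512           # fixed sequence length...
config.pad_to_max_length = True   # ...only when padding to max_length is enabled
config.padding_side = "left"      # left padding with a fixed length needs MEAN pooling
config.normalize = True

# Constructor signature and model path are assumed for illustration.
pipeline = openvino_genai.TextEmbeddingPipeline("models/bge-small-en-v1.5", "NPU", config)
embeddings = pipeline.embed_documents(["OpenVINO GenAI text embedding on NPU"])
```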
167 changes: 129 additions & 38 deletions src/cpp/src/utils.cpp
@@ -116,6 +116,25 @@ void update_npu_config_whisper(ov::AnyMap& config,
update_config(config, {"NPUW_LLM_PREFILL_HINT", "STATIC"});
}

void update_npu_config_text_embedding(ov::AnyMap& config,
const ov::genai::utils::KVAxesPosition& kv_pos,
const ov::genai::utils::KVDesc& kv_desc,
const std::string& post_type,
const bool is_to_normalize) {
update_config(config, {"NPU_USE_NPUW", "YES"});
update_config(config, {"NPUW_LLM", "YES"});
update_config(config, {"NPUW_LLM_BATCH_DIM", kv_pos.batch});
update_config(config, {"NPUW_LLM_SEQ_LEN_DIM", kv_pos.seq_len});

update_config(config, {"NPUW_LLM_MAX_PROMPT_LEN", kv_desc.max_prompt_len});
update_config(config, {"NPUW_LLM_MIN_RESPONSE_LEN", kv_desc.min_response_len});
update_config(config, {"NPUW_LLM_SHARED_HEAD", "NO"});

update_config(config, {"NPUW_TEXT_EMBED", "YES"});
update_config(config, {"NPUW_TEXT_EMBED_POST_TYPE", post_type});
update_config(config, {"NPUW_TEXT_EMBED_NORMALIZE", is_to_normalize});
}

inline bool is_paged_attention_available() {
#if defined(OPENVINO_ARCH_X86_64) || defined(OPENVINO_ARCH_ARM64)
return true;
@@ -130,6 +149,8 @@ namespace ov {
namespace genai {
namespace utils {

enum class ModelType { Default, Whisper, TextEmbedding };

Tensor init_attention_mask(const Tensor& input_ids) {
auto shape = input_ids.get_shape();
auto attention_mask = ov::Tensor{input_ids.get_element_type(), shape};
@@ -570,11 +591,84 @@ void print_scheduler_config_info(const SchedulerConfig &scheduler_config) {
std::cout << scheduler_config.to_string() << std::endl;
}

std::pair<ov::CompiledModel, KVDesc>
compile_decoder_for_npu(const std::shared_ptr<ov::Model>& model,
const ov::AnyMap& config,
const KVAxesPosition& kv_pos,
const bool is_whisper) {
void import_npu_model(ov::CompiledModel& compiled,
KVDesc& kv_desc,
const ov::AnyMap& config,
const std::string& blob_path) {
if (!std::filesystem::exists(blob_path)) {
OPENVINO_THROW("Blob file is not found at: " + blob_path);
}
std::ifstream fin(blob_path, std::ios::in | std::ios::binary);
if (!fin.is_open()) {
OPENVINO_THROW("Blob file can't be opened: " + blob_path);
}
compiled = ov::genai::utils::singleton_core().import_model(fin, "NPU", config);
kv_desc.max_prompt_len = compiled.get_property("NPUW_LLM_MAX_PROMPT_LEN").as<uint32_t>();
kv_desc.min_response_len = compiled.get_property("NPUW_LLM_MIN_RESPONSE_LEN").as<uint32_t>();
}

void export_npu_model(ov::CompiledModel& compiled, const std::string& blob_path) {
// Check the path is full
const int EXT_SIZE = 5; // ".blob"
if (blob_path.size() < EXT_SIZE) {
OPENVINO_THROW("Please provide a full path to blob file in BLOB_PATH: " + blob_path);
}
if (strncmp(&blob_path[blob_path.size() - EXT_SIZE], ".blob", EXT_SIZE) != 0) {
OPENVINO_THROW("Please provide a full path to blob file in BLOB_PATH: " + blob_path);
}
std::ofstream fout(blob_path, std::ios::out | std::ios::binary);
if (!fout.is_open()) {
OPENVINO_THROW("Blob file can't be exported to: " + blob_path);
}
compiled.export_model(fout);
}

void get_npu_model_config(ov::AnyMap& properties,
const KVAxesPosition& kv_pos,
KVDesc& kv_desc,
const bool is_whisper) {
if (is_whisper) {
kv_desc.max_prompt_len = pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(4u);
// kvcache size for Whisper = 448u (MAX_PROMPT_LEN + MIN_RESPONSE_LEN)
kv_desc.min_response_len = pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(444u);
update_npu_config_whisper(properties, kv_pos, kv_desc);
} else {
kv_desc.max_prompt_len = pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(1024u);
kv_desc.min_response_len = pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(128u);
update_npu_config(properties, kv_pos, kv_desc);
}
}

std::string get_post_type_string(const TextEmbeddingPipeline::Config& config) {
std::string post_type;
if (config.pooling_type == TextEmbeddingPipeline::PoolingType::CLS) {
post_type = "cls";
} else if (config.pooling_type == TextEmbeddingPipeline::PoolingType::MEAN) {
post_type = "mean";
} else {
post_type = "last_token";
}
return post_type;
}

void get_npu_text_embedding_config(ov::AnyMap& properties,
const KVAxesPosition& kv_pos,
KVDesc& kv_desc,
const TextEmbeddingPipeline::Config& text_embed_config) {
if (text_embed_config.max_length.has_value()) {
kv_desc.max_prompt_len = text_embed_config.max_length.value();
} else {
kv_desc.max_prompt_len = pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(1024u);
}
kv_desc.min_response_len = kv_desc.max_prompt_len;
update_npu_config_text_embedding(properties, kv_pos, kv_desc, get_post_type_string(text_embed_config), text_embed_config.normalize);
}

std::pair<ov::CompiledModel, KVDesc> compile_decoder_for_npu_impl(const std::shared_ptr<ov::Model>& model,
const ov::AnyMap& config,
const KVAxesPosition& kv_pos,
ModelType model_type,
const TextEmbeddingPipeline::Config& text_embed_config = {}) {
ov::CompiledModel compiled;
ov::AnyMap properties = config;
KVDesc kv_desc;
@@ -584,49 +678,46 @@ compile_decoder_for_npu(const std::shared_ptr<ov::Model>& model,
const bool do_import = (!blob_path.empty() && !export_blob);

if (do_import) {
if (!std::filesystem::exists(blob_path)) {
OPENVINO_THROW("Blob file is not found at: " + blob_path);
}
std::ifstream fin(blob_path, std::ios::in | std::ios::binary);
if (!fin.is_open()) {
OPENVINO_THROW("Blob file can't be opened: " + blob_path);
}
compiled = ov::genai::utils::singleton_core().import_model(fin, "NPU", config);
kv_desc.max_prompt_len = compiled.get_property("NPUW_LLM_MAX_PROMPT_LEN").as<uint32_t>();
kv_desc.min_response_len = compiled.get_property("NPUW_LLM_MIN_RESPONSE_LEN").as<uint32_t>();
import_npu_model(compiled, kv_desc, properties, blob_path);
} else {
if (is_whisper) {
kv_desc.max_prompt_len = pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(4u);
// kvcache size for Whisper = 448u (MAX_PROMPT_LEN + MIN_RESPONSE_LEN)
kv_desc.min_response_len = pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(444u);
update_npu_config_whisper(properties, kv_pos, kv_desc);
} else {
kv_desc.max_prompt_len = pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(1024u);
kv_desc.min_response_len = pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(128u);
update_npu_config(properties, kv_pos, kv_desc);
switch (model_type) {
case ModelType::TextEmbedding:
get_npu_text_embedding_config(properties, kv_pos, kv_desc, text_embed_config);
break;
case ModelType::Whisper:
get_npu_model_config(properties, kv_pos, kv_desc, true);
break;
case ModelType::Default:
default:
get_npu_model_config(properties, kv_pos, kv_desc, false);
break;
}

compiled = ov::genai::utils::singleton_core().compile_model(model, "NPU", properties);
// Also export compiled model if required
if (export_blob) {
if (blob_path.empty()) {
blob_path = "openvino_model.blob";
}
// Check the path is full
const int EXT_SIZE = 5; // ".blob"
if (blob_path.size() < EXT_SIZE) {
OPENVINO_THROW("Please provide a full path to blob file in BLOB_PATH: " + blob_path);
}
if (strncmp(".blob", &blob_path[blob_path.size() - EXT_SIZE], EXT_SIZE) != 0) {
OPENVINO_THROW("Please provide a full path to blob file in BLOB_PATH: " + blob_path);
}
std::ofstream fout(blob_path, std::ios::out | std::ios::binary);
if (!fout.is_open()) {
OPENVINO_THROW("Blob file can't be exported to: " + blob_path);
}
compiled.export_model(fout);
export_npu_model(compiled, blob_path);
}
}
return { compiled, kv_desc };

return {compiled, kv_desc};
}

std::pair<ov::CompiledModel, KVDesc> compile_decoder_for_npu(const std::shared_ptr<ov::Model>& model,
const ov::AnyMap& config,
const KVAxesPosition& kv_pos,
const bool is_whisper) {
return compile_decoder_for_npu_impl(model, config, kv_pos, is_whisper ? ModelType::Whisper : ModelType::Default);
}

std::pair<ov::CompiledModel, KVDesc> compile_decoder_for_npu_text_embedding(const std::shared_ptr<ov::Model>& model,
const ov::AnyMap& config,
const KVAxesPosition& kv_pos,
const TextEmbeddingPipeline::Config& text_embed_config) {
return compile_decoder_for_npu_impl(model, config, kv_pos, ModelType::TextEmbedding, text_embed_config);
}

std::optional<ov::Any> pop_option(ov::AnyMap& config, const std::string& option_name) {
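The refactor above splits blob handling into `import_npu_model` and `export_npu_model`, keyed off the `BLOB_PATH` and export options and requiring the path to end in ".blob". A hedged caching sketch, assuming the text embedding pipeline forwards the same `EXPORT_BLOB`/`BLOB_PATH` properties used by the NPU LLM path and that the Python constructor accepts properties as keyword arguments:

```python
import openvino_genai

config = openvino_genai.TextEmbeddingPipeline.Config()
config.max_length = 512
config.pad_to_max_length = True

blob = "models/bge-small-en-v1.5/openvino_model.blob"  # must end in ".blob"

# First run: compile on NPU and export the compiled blob (property names assumed).
pipeline = openvino_genai.TextEmbeddingPipeline(
    "models/bge-small-en-v1.5", "NPU", config, EXPORT_BLOB="YES", BLOB_PATH=blob)

# Later runs: import the blob instead of recompiling (EXPORT_BLOB omitted).
pipeline = openvino_genai.TextEmbeddingPipeline(
    "models/bge-small-en-v1.5", "NPU", config, BLOB_PATH=blob)
```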
6 changes: 6 additions & 0 deletions src/cpp/src/utils.hpp
@@ -9,6 +9,7 @@

#include "openvino/genai/llm_pipeline.hpp"
#include "openvino/genai/visual_language/pipeline.hpp"
#include "openvino/genai/rag/text_embedding_pipeline.hpp"
#include "openvino/runtime/core.hpp"

#include "openvino/genai/generation_handle.hpp"
@@ -196,6 +197,11 @@ std::pair<ov::CompiledModel, KVDesc> compile_decoder_for_npu(const std::shared_p
const KVAxesPosition& kv_pos,
const bool is_whisper = false);

std::pair<ov::CompiledModel, KVDesc> compile_decoder_for_npu_text_embedding(const std::shared_ptr<ov::Model>& model,
const ov::AnyMap& config,
const KVAxesPosition& kv_pos,
const ov::genai::TextEmbeddingPipeline::Config& text_embed_config);

/// @brief SharedOptional is a wrapper around a reference to an existing object and an optional shared alternative value.
/// The difference from std::optional is that the default state is not empty and contains a reference to an existing object outside the class.
/// Another difference is that the alternative value is shared between all instances of SharedOptional like std::shared_ptr.
1 change: 1 addition & 0 deletions tools/llm_bench/benchmark.py
@@ -210,6 +210,7 @@ def get_argprser():
help="Pooling type CLS or MEAN for encoders, LAST_TOKEN for decoders. "
"Different post-processing is applied depending on the padding side. Applicable only for text embeddings")
parser.add_argument("--embedding_normalize", action="store_true", help="Normalize embeddings. Applicable only for text embeddings")
parser.add_argument("--embedding_pad_to_max_length", action="store_true", help="Pad embeddings. Applicable only for text embeddings")
parser.add_argument("--embedding_max_length", type=int, default=None,
help="Max length for text embeddings. Input text will be padded or truncated to specified value")
parser.add_argument("--embedding_padding_side", choices=["left", "right"], default=None,
1 change: 1 addition & 0 deletions tools/llm_bench/llm_bench_utils/model_utils.py
@@ -136,6 +136,7 @@ def analyze_args(args):
model_args['emb_normalize'] = args.embedding_normalize
model_args["emb_max_length"] = args.embedding_max_length
model_args["emb_padding_side"] = args.embedding_padding_side
model_args["emb_pad_to_max_length"] = args.embedding_pad_to_max_length
model_args['rerank_max_length'] = args.reranking_max_length
model_args["rerank_top_n"] = args.reranking_top_n
model_args["rerank_texts"] = args.texts
5 changes: 3 additions & 2 deletions tools/llm_bench/llm_bench_utils/ov_utils.py
@@ -666,7 +666,7 @@ def create_genai_text_embed_model(model_path, device, memory_data_collector, **k

pooling_type = kwargs.get("emb_pooling_type")
max_length = kwargs.get("emb_max_length")
padding_side = kwargs.get("embedding_padding_side")
padding_side = kwargs.get("emb_padding_side")
@sbalandi argument names were not aligned, so the option was lost. Consider reviewing the arguments and potentially introducing types.
@mengweiguo thanks!
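Following the suggestion above, a sketch of what "introduce types" could look like for the embedding options, so the argparse names, the emb_* keys in model_utils.py, and the lookups in ov_utils.py cannot drift apart. The dataclass, the helper name, and the `embedding_pooling_type` attribute are illustrative; the other flag names come from benchmark.py in this PR.

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class EmbeddingArgs:
    """Typed container replacing the loosely matched emb_* dict keys."""
    pooling_type: Optional[str] = None   # "cls" | "mean" | "last_token"
    normalize: bool = False
    max_length: Optional[int] = None
    pad_to_max_length: bool = False
    padding_side: Optional[str] = None   # "left" | "right"

def collect_embedding_args(args) -> EmbeddingArgs:
    # Single translation point from argparse to the typed options.
    return EmbeddingArgs(
        pooling_type=args.embedding_pooling_type,   # flag name assumed
        normalize=args.embedding_normalize,
        max_length=args.embedding_max_length,
        pad_to_max_length=args.embedding_pad_to_max_length,
        padding_side=args.embedding_padding_side,
    )
```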


config = openvino_genai.TextEmbeddingPipeline.Config()
if pooling_type is not None:
@@ -678,7 +678,8 @@ def create_genai_text_embed_model(model_path, device, memory_data_collector, **k
config.pooling_type = openvino_genai.TextEmbeddingPipeline.PoolingType.CLS
if max_length is not None:
config.max_length = max_length
config.pad_to_max_length = True

config.pad_to_max_length = kwargs.get("emb_pad_to_max_length", False)
config.normalize = kwargs.get("emb_normalize", False)
if padding_side:
config.padding_side = padding_side