Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions CMakeLists.txt
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -28,20 +28,20 @@ if(USE_NPU)
if(DEVICE_TYPE STREQUAL "USE_A3")
message("downloading a3 arm xllm kernels")
file(DOWNLOAD
"https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.6.0/xllm_kernels-1.3.1-Linux.a3.arm.rpm"
"https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.6.0/xllm_kernels-1.3.2-Linux.a3.arm.rpm"
"${CMAKE_BINARY_DIR}/xllm_kernels.rpm"
)
else()
if(DEVICE_ARCH STREQUAL "ARM")
message("downloading a2 arm xllm_kernels")
file(DOWNLOAD
"https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.6.0/xllm_kernels-1.3.1-Linux.a2.arm.rpm"
"https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.6.0/xllm_kernels-1.3.2-Linux.a2.arm.rpm"
"${CMAKE_BINARY_DIR}/xllm_kernels.rpm"
)
else()
message("downloading a2 x86 xllm_kernels")
file(DOWNLOAD
"https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.6.0/xllm_kernels-1.3.1-Linux.a2.x86.rpm"
"https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.6.0/xllm_kernels-1.3.2-Linux.a2.x86.rpm"
"${CMAKE_BINARY_DIR}/xllm_kernels.rpm"
)
endif()
Expand Down
2 changes: 1 addition & 1 deletion third_party/xllm_ops
Submodule xllm_ops updated from 2cda9b to 797a0c
6 changes: 6 additions & 0 deletions xllm/core/framework/hf_model_loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,12 @@ bool HFModelLoader::load_image_preprocessor_args(
image_prerocess_data["norm_std"].get<std::vector<double>>();
}

args_.mm_image_shortest_edge() =
image_preprocess_reader.value_or<int>("size.shortest_edge", 0);

args_.mm_image_longest_edge() =
image_preprocess_reader.value_or<int>("size.longest_edge", 0);

args_.mm_image_min_pixels() =
image_preprocess_reader.value_or<int>("min_pixels", 0);

Expand Down
13 changes: 13 additions & 0 deletions xllm/core/framework/model/model_args.h
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -242,12 +242,15 @@ struct ModelArgs {

PROPERTY(int, mm_window_size) = 0;
PROPERTY(std::vector<int64_t>, mm_fullatt_block_indexes);
PROPERTY(std::vector<int64_t>, mm_deepstack_visual_indexes);
PROPERTY(int, mm_tokens_per_second) = 0;
PROPERTY(int, mm_temporal_patch_size) = 0;

// VLM model projector's mm_projector_type
PROPERTY(std::string, mm_projector_type);

// VLM model's mm_num_position_embeddings
PROPERTY(int64_t, mm_num_position_embeddings);
// VLM model projector's mm_projector_hidden_act
PROPERTY(std::string, mm_projector_hidden_act);

Expand Down Expand Up @@ -284,6 +287,9 @@ struct ModelArgs {
PROPERTY(int, mm_image_min_pixels) = 0;
PROPERTY(int, mm_image_max_pixels) = 0;

PROPERTY(int64_t, mm_image_shortest_edge) = 0;
PROPERTY(int64_t, mm_image_longest_edge) = 0;

PROPERTY(int, mm_image_patch_size) = 0;
PROPERTY(int, mm_image_temporal_patch_size) = 0;
PROPERTY(int, mm_image_merge_size) = 0;
Expand Down Expand Up @@ -447,6 +453,11 @@ inline std::ostream& operator<<(std::ostream& os, const ModelArgs& args) {
os << index << ",";
}
os << "]";
os << ", mm_deepstack_visual_indexes: [";
for (auto& index : args.mm_deepstack_visual_indexes()) {
os << index << ",";
}
os << "]";
os << ", mm_tokens_per_second: " << args.mm_tokens_per_second();
os << ", mm_temporal_patch_size: " << args.mm_temporal_patch_size();
os << ", mm_projector_type: " << args.mm_projector_type();
Expand Down Expand Up @@ -474,6 +485,8 @@ inline std::ostream& operator<<(std::ostream& os, const ModelArgs& args) {
os << std << ", ";
}
os << "]";
os << ", mm_image_shortest_edge: " << args.mm_image_shortest_edge();
os << ", mm_image_longest_edge: " << args.mm_image_longest_edge();
os << ", mm_image_min_pixels: " << args.mm_image_min_pixels();
os << ", mm_image_max_pixels: " << args.mm_image_max_pixels();
os << ", mm_image_patch_size: " << args.mm_image_patch_size();
Expand Down
8 changes: 8 additions & 0 deletions xllm/core/framework/model/model_input_params.h
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ struct ModelInputParams {

params.input_embedding = safe_to(input_embedding, device);

params.deep_stacks = deep_stacks;
params.visual_pos_masks = visual_pos_masks;

params.mm_data = MMData::to(mm_data, device);
params.dp_global_token_nums = dp_global_token_nums;
params.prefill_seq_len = prefill_seq_len;
Expand Down Expand Up @@ -149,6 +152,11 @@ struct ModelInputParams {
// multimodal
MMData mm_data;

// deep_stack for Qwen3-VL
mutable std::vector<torch::Tensor> deep_stacks;
// visual pos mask for Qwen3-VL
mutable torch::Tensor visual_pos_masks;

// num tokens of all workers, mainly used for dp case
std::vector<int32_t> dp_global_token_nums;
// whether the kv-cache is empty for all sequences, mainly used for dp case
Expand Down
2 changes: 1 addition & 1 deletion xllm/core/framework/quant_args.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ struct QuantArgs {
PROPERTY(std::string, quant_method);

PROPERTY(std::string, quantize_type);
PROPERTY(std::string, torch_dtype);
PROPERTY(std::string, torch_dtype) = "bfloat16";
// quantization bits
PROPERTY(int64_t, bits) = 0;

Expand Down
1 change: 1 addition & 0 deletions xllm/core/layers/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ cc_library(
multi_head_attention.h
qwen2_decoder_layer.h
qwen2dot5_vision_decode_layer.h
qwen3_vision_encode_layer.h
qwen3_decoder_layer.h
qwen3_moe_decoder_layer.h
rms_norm.h
Expand Down
Empty file modified xllm/core/layers/base_layer.cpp
100644 → 100755
Empty file.
2 changes: 2 additions & 0 deletions xllm/core/layers/npu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ cc_library(
npu_pos_embedding_impl.h
npu_lm_head_impl.h
npu_qwen2dot5_vision_encoder_layer_impl.h
npu_qwen3_vision_encoder_layer_impl.h
npu_qwen3_moe_decoder_layer_impl.h
# atb_parallel_linear.h
npu_block_copy_impl.h
Expand All @@ -29,6 +30,7 @@ cc_library(
npu_pos_embedding_impl.cpp
npu_lm_head_impl.cpp
npu_qwen2dot5_vision_encoder_layer_impl.cpp
npu_qwen3_vision_encoder_layer_impl.cpp
npu_qwen3_moe_decoder_layer_impl.cpp
# atb_parallel_linear.cpp
npu_block_copy_impl.cpp
Expand Down
6 changes: 1 addition & 5 deletions xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ void NpuQwen3MoeDecoderLayerImpl::initialize_mlp_parameters(
const ModelArgs& args,
const ParallelArgs& parallel_args) {
param.hasSharedExpert = (args.n_shared_experts() > 0);
param.hasSharedExpertGate = true;
param.hasSharedExpertGate = false;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

other Qwen models will be affected by this modification.

param.processLogits = "normalization";
param.numOfSelectedExperts = {args.num_experts_per_tok()};

Expand Down Expand Up @@ -492,7 +492,6 @@ void NpuQwen3MoeDecoderLayerImpl::process_expert_weights(
const int local_index = expert_index % num_experts_per_partition_;
const bool is_sharded = shard_map.count(index);

std::lock_guard<std::mutex> lock(experts_mutex_);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

as above

torch::Tensor tmp_tensor = is_sharded
? get_sharded_tensor(state_dict,
name,
Expand All @@ -517,8 +516,6 @@ void NpuQwen3MoeDecoderLayerImpl::process_mlp_common_weights(
const int index = get_mapped_index(name, weight_mapping);
const bool is_sharded = shard_map.count(index);

std::lock_guard<std::mutex> lock(shared_experts_mutex_);

torch::Tensor tmp_tensor = is_sharded
? get_sharded_tensor(state_dict,
name,
Expand Down Expand Up @@ -650,7 +647,6 @@ void NpuQwen3MoeDecoderLayerImpl::verify_loaded_weights(

void NpuQwen3MoeDecoderLayerImpl::merge_loaded_weights() {
merge_experts_weights();

at_weight_tensors_[IN_QKV_WEIGHT_0] =
torch::cat({at_weight_tensors_[IN_QKV_WEIGHT_0],
at_weight_tensors_[IN_QKV_WEIGHT_1],
Expand Down
Loading