
Commit ceaa6a2

Merge remote-tracking branch 'upstream/master' into mitmul/add-plamo2
2 parents: 2d76b21 + 704bb7a


51 files changed: +2536 −1083 lines

.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
@@ -342,7 +342,7 @@ jobs:
           cd build
           export GGML_VK_VISIBLE_DEVICES=0
           # This is using llvmpipe and runs slower than other backends
-          ctest -L main --verbose --timeout 3600
+          ctest -L main --verbose --timeout 4200

   ubuntu-22-cmake-hip:
     runs-on: ubuntu-22.04

common/CMakeLists.txt

Lines changed: 3 additions & 3 deletions
@@ -112,13 +112,13 @@ if (LLAMA_LLGUIDANCE)

     ExternalProject_Add(llguidance_ext
         GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-        # v0.7.20 (+ fix to build on GCC 15):
-        GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
+        # v1.0.1:
+        GIT_TAG d795912fedc7d393de740177ea9ea761e7905774
         PREFIX ${CMAKE_BINARY_DIR}/llguidance
         SOURCE_DIR ${LLGUIDANCE_SRC}
         BUILD_IN_SOURCE TRUE
         CONFIGURE_COMMAND ""
-        BUILD_COMMAND cargo build --release
+        BUILD_COMMAND cargo build --release --package llguidance
         INSTALL_COMMAND ""
         BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
         UPDATE_COMMAND ""

common/arg.cpp

Lines changed: 7 additions & 0 deletions
@@ -2734,6 +2734,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.public_path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
+    add_opt(common_arg(
+        {"--api-prefix"}, "PREFIX",
+        string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
+        [](common_params & params, const std::string & value) {
+            params.api_prefix = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
     add_opt(common_arg(
         {"--no-webui"},
         string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -370,6 +370,7 @@ struct common_params {

     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
+    std::string api_prefix = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;

convert_hf_to_gguf.py

Lines changed: 296 additions & 5 deletions
Large diffs are not rendered by default.

convert_hf_to_gguf_update.py

Lines changed: 7 additions & 0 deletions
@@ -128,6 +128,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
     {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
     {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
+    {"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
@@ -137,6 +138,12 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
     {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
     {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
+    {"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
+    # falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes
+    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", "chkhsh": "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"},
+    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},
+    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
+    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
 ]

docs/development/HOWTO-add-model.md

Lines changed: 11 additions & 9 deletions
@@ -83,20 +83,22 @@ NOTE: Tensor names must end with `.weight` or `.bias` suffixes, that is the conv

 ### 2. Define the model architecture in `llama.cpp`

-The model params and tensors layout must be defined in `llama.cpp`:
-1. Define a new `llm_arch`
-2. Define the tensors layout in `LLM_TENSOR_NAMES`
-3. Add any non-standard metadata in `llm_load_hparams`
-4. Create the tensors for inference in `llm_load_tensors`
-5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
+The model params and tensors layout must be defined in `llama.cpp` source files:
+1. Define a new `llm_arch` enum value in `src/llama-arch.h`.
+2. In `src/llama-arch.cpp`:
+    - Add the architecture name to the `LLM_ARCH_NAMES` map.
+    - Add the tensor mappings to the `LLM_TENSOR_NAMES` map.
+3. Add any non-standard metadata loading in the `llama_model_loader` constructor in `src/llama-model-loader.cpp`.
+4. If the model has a RoPE operation, add a case for the architecture in `llama_model_rope_type` function in `src/llama-model.cpp`.

 NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.

 ### 3. Build the GGML graph implementation

-This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
-
-Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`.
+This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `src/llama-model.cpp`.
+Create a new struct that inherits from `llm_graph_context` and implement the graph-building logic in its constructor.
+Have a look at existing implementations like `llm_build_llama`, `llm_build_dbrx` or `llm_build_bert`.
+Then, in the `llama_model::build_graph` method, add a case for your architecture to instantiate your new graph-building struct.

 Some `ggml` backends do not support all operations. Backend implementations can be added in a separate PR.
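
As a minimal illustration of step 2 in the updated guide (a sketch only: the architecture name `mymodel` and the tensor list are hypothetical, and the map types are assumed to match the existing entries in `src/llama-arch.cpp`):

// src/llama-arch.h — add the new enum value (name is hypothetical)
enum llm_arch {
    // ... existing architectures ...
    LLM_ARCH_MYMODEL,
    LLM_ARCH_UNKNOWN,
};

// src/llama-arch.cpp — register the name and the tensor mappings
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    // ... existing entries ...
    { LLM_ARCH_MYMODEL, "mymodel" },
};

static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
    // ... existing entries ...
    {
        LLM_ARCH_MYMODEL,
        {
            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
            { LLM_TENSOR_OUTPUT,      "output" },
            // ... remaining tensors of the architecture ...
        },
    },
};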

ggml/include/ggml.h

Lines changed: 14 additions & 1 deletion
@@ -495,7 +495,7 @@ extern "C" {
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
         GGML_OP_POOL_2D_BACK,
-        GGML_OP_UPSCALE, // nearest interpolate
+        GGML_OP_UPSCALE,
         GGML_OP_PAD,
         GGML_OP_PAD_REFLECT_1D,
         GGML_OP_ROLL,
@@ -1297,6 +1297,19 @@ extern "C" {
             struct ggml_tensor * a,
             float s);

+    // x = s * a + b
+    GGML_API struct ggml_tensor * ggml_scale_bias(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            float s,
+            float b);
+
+    GGML_API struct ggml_tensor * ggml_scale_bias_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            float s,
+            float b);
+
     // b -> view(a,offset,nb1,nb2,3), return modified a
     GGML_API struct ggml_tensor * ggml_set(
             struct ggml_context * ctx,
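
For reference, a minimal sketch of how the new op could be exercised on the CPU backend. This is not part of the commit; it assumes the CPU helper `ggml_graph_compute_with_ctx` from `ggml-cpu.h` and uses a generously sized context buffer.

// y = s*a + b via the new ggml_scale_bias (illustrative only)
#include <stdio.h>

#include "ggml.h"
#include "ggml-cpu.h"   // ggml_graph_compute_with_ctx

int main(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,   // rough guess, plenty for this toy graph
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    for (int i = 0; i < 4; ++i) {
        ((float *) a->data)[i] = (float) i;                       // a = [0, 1, 2, 3]
    }

    struct ggml_tensor * y = ggml_scale_bias(ctx, a, 2.0f, 1.0f); // y = 2*a + 1

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, y);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

    for (int i = 0; i < 4; ++i) {
        printf("%g ", ((float *) y->data)[i]);                    // expected: 1 3 5 7
    }
    printf("\n");

    ggml_free(ctx);
    return 0;
}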

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 4 additions & 1 deletion
@@ -2188,7 +2188,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_MUL:
         case GGML_OP_DIV:
         case GGML_OP_RMS_NORM:
-        case GGML_OP_SCALE:
         case GGML_OP_SQR:
         case GGML_OP_SQRT:
         case GGML_OP_CLAMP:
@@ -2210,6 +2209,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_PAD_REFLECT_1D:
         case GGML_OP_COUNT_EQUAL:
             return true;
+        case GGML_OP_SCALE:
+            float bias;
+            memcpy(&bias, (float*)op->op_params + 1, sizeof(float));
+            return bias == 0.0f; // TODO: support bias != 0.0f
         case GGML_OP_SOFT_MAX:
             // TODO: support broadcast
             // ref: https://github.com/ggml-org/llama.cpp/pull/14435

ggml/src/ggml-cpu/ops.cpp

Lines changed: 20 additions & 8 deletions
@@ -4643,9 +4643,11 @@ static void ggml_compute_forward_scale_f32(
     GGML_ASSERT(ggml_is_contiguous(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));

-    // scale factor
-    float v;
-    memcpy(&v, dst->op_params, sizeof(float));
+    float s; // scale factor
+    float b; // bias
+
+    memcpy(&s, (float *) dst->op_params + 0, sizeof(float));
+    memcpy(&b, (float *) dst->op_params + 1, sizeof(float));

     const int ith = params->ith;
     const int nth = params->nth;
@@ -4664,12 +4666,22 @@ static void ggml_compute_forward_scale_f32(

     const size_t nb1 = dst->nb[1];

-    for (int i1 = ir0; i1 < ir1; i1++) {
-        if (dst->data != src0->data) {
-            // src0 is same shape as dst => same indices
-            memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float));
+    if (b == 0.0f) {
+        for (int i1 = ir0; i1 < ir1; i1++) {
+            if (dst->data != src0->data) {
+                // src0 is same shape as dst => same indices
+                // TODO: add x parameter to ggml_vec_scale_f32 and remove this memcpy
+                memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float));
+            }
+            ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), s);
+        }
+    } else {
+        for (int i1 = ir0; i1 < ir1; i1++) {
+            ggml_vec_mad1_f32(nc,
+                    (float *) ((char *) dst->data + i1*nb1),
+                    (float *) ((char *) src0->data + i1*nb1),
+                    s, b);
         }
-        ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v);
     }
 }
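
In effect, `GGML_OP_SCALE` now computes `dst = s*src0 + b` row by row: the existing `ggml_vec_scale_f32` fast path is kept for `b == 0`, and the biased case goes through `ggml_vec_mad1_f32`. A scalar reference for what that helper is assumed to compute, inferred from the call site above (the real routine is SIMD-accelerated):

// Assumed semantics of ggml_vec_mad1_f32, written as a plain scalar loop:
// y[i] = s * x[i] + b   for i in [0, n)
static void vec_mad1_f32_ref(const int n, float * y, const float * x, const float s, const float b) {
    for (int i = 0; i < n; ++i) {
        y[i] = s * x[i] + b;
    }
}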
