-# Convert a ChatGLM model checkpoint to a InferLLM compatible file
+# Convert a ChatGLM model checkpoint to an InferLLM-compatible file
 #
 # Load the model using Torch
 # Iterate over all variables and write them to a binary file.

 # - Name (char[name_length])
 # - Data (int8_t[len])
 #
+#
 # By default, the bigger matrices are converted to 16-bit floats.
 # This can be disabled by adding the "use-f32" CLI argument.
 #
 # At the start of the ggml file we write the model parameters
 # and vocabulary.
-#
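To make the tensor record layout described above concrete, here is a rough sketch of how a single record could be read back. It is illustrative only: it mirrors the `struct.pack` calls in `dump_tensor` further down (n_dims, name length, GGML type, then the dims, the name, and the raw data), it skips the model-parameter and vocabulary header, and it only handles fp32 payloads. `read_tensor_record` is a hypothetical helper, not part of InferLLM.

```python
import struct
import numpy as np

def read_tensor_record(f):
    # one record header: (n_dims, name_length, ggml_type) packed as three int32s
    n_dims, name_len, ggml_type = struct.unpack("iii", f.read(12))
    dims = struct.unpack("i" * n_dims, f.read(4 * n_dims))
    name = f.read(name_len).decode("utf-8")
    if ggml_type != 0:  # 0 == F32 in the GGMLType enum below; other payloads are not decoded here
        raise NotImplementedError("only the fp32 layout is sketched here")
    count = int(np.prod(dims))
    data = np.frombuffer(f.read(4 * count), dtype=np.float32).reshape(dims)
    return name, data
```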
|
 import sys
 import json
 import struct
+from enum import Enum
 import numpy as np
 import torch
 import argparse
 import tempfile
-from transformers import AutoTokenizer, AutoModel
+from transformers import AutoTokenizer, AutoModel, AutoConfig
 from sentencepiece import SentencePieceProcessor

 # parse arguments
 parser = argparse.ArgumentParser(description="Convert a ChatGLM model to an InferLLM-compatible file")
 parser.add_argument("-o", "--outfile", type=str, help="the output file")
 parser.add_argument("-v", "--version", type=int, default=1, help="the ChatGLM model version (1, 2 or 3)")
+parser.add_argument("-q", "--quantization", type=int, default=32, help="quantization bits (32, 16, 8 or 4)")
 args = parser.parse_args()

 # output in the same directory as the model
 model_out_path = args.outfile

-hparams = {
-    "embd_size": 4096,
-    "n_heads": 32,
-    "n_layers": 28,
-    "fc_hidden": 16384,
-}
-dtype = 0
+class GGMLType(Enum):
+    # src: https://github.com/li-plus/chatglm.cpp/blob/04910ce72a5d22087ec6e404dbefd73c1ccf2700/chatglm_cpp/convert.py#L32
+    F32 = 0
+    F16 = 1
+    QInt4 = 2
+    # QUInt4 = 3
+    QInt8 = 4
+
+alignment_size = 32
+bits = args.quantization
+if bits == 32:
+    dtype = GGMLType.F32
+elif bits == 16:
+    dtype = GGMLType.F16
+    raise NotImplementedError(f"the kernel does not support {bits}-bit output yet")
+elif bits == 8:
+    dtype = GGMLType.QInt8
+elif bits == 4:
+    dtype = GGMLType.QInt4
+else:
+    raise NotImplementedError(f"Unknown quantization bits: {bits}")
+
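With the new flag, a conversion run would look something like `python convert.py -o chatglm2-q8.bin -v 2 -q 8` (the script and output file names here are placeholders, not taken from the repository). Note that `-q 16` is currently rejected by the branch above, so the practical choices are 32, 8 and 4.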
 version = args.version
 if version == 1:
     model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).float().state_dict()
     auto_tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+    config = AutoConfig.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
 elif version == 2:
     model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True).float().state_dict()
     auto_tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
+    config = AutoConfig.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
+elif version == 3:
+    model = AutoModel.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True).float().state_dict()
+    auto_tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True)
+    config = AutoConfig.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True)

 _, vocab_file = tempfile.mkstemp()
 auto_tokenizer.save_vocabulary(vocab_file)
 tokenizer = SentencePieceProcessor(vocab_file)

+hparams = {
+    "embd_size": config.hidden_size,
+    "n_heads": config.num_attention_heads,
+    "n_layers": config.num_layers,
+    "fc_hidden": 16384,  # ChatGLM-6b default; overwritten from config for v2/v3 below
+}
 hparams.update({"vocab_size": tokenizer.vocab_size()})

-if version == 2:
-    hparams.update({"multi_qeury": 1})
-    hparams.update({"attention_patition": 2})
-    hparams.update({"fc_hidden": 13696})
+if version > 1:
+    hparams.update({"multi_query": 1 if config.multi_query_attention else 0})
+    hparams.update({"attention_partition": config.multi_query_group_num})
+    hparams.update({"fc_hidden": config.ffn_hidden_size})
+

 print(hparams)
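For orientation, with `-v 2` the dictionary printed above should come out roughly as in the sketch below. The numbers are the hard-coded defaults this commit removes in favour of `AutoConfig`, and `vocab_size` is whatever the tokenizer reports, so treat it as an illustration rather than a guarantee.

```python
# approximate contents of hparams for THUDM/chatglm2-6b
{
    "embd_size": 4096,                        # config.hidden_size
    "n_heads": 32,                            # config.num_attention_heads
    "n_layers": 28,                           # config.num_layers
    "fc_hidden": 13696,                       # config.ffn_hidden_size
    "vocab_size": tokenizer.vocab_size(),     # reported by the SentencePiece model
    "multi_query": 1,                         # config.multi_query_attention
    "attention_partition": 2,                 # config.multi_query_group_num
}
```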
|
|
 param_byte += struct.pack("i", hparams["n_layers"])
 param_byte += struct.pack("i", hparams["fc_hidden"])
 param_byte += struct.pack("i", hparams["vocab_size"])
-if version == 2:
+if version > 1:
     param_byte += struct.pack("i", hparams["multi_query"])
     param_byte += struct.pack("i", hparams["attention_partition"])
|
|
 # seek to the end of the file
 fout.seek(0, 2)

-for k, v in model.items():
-    name = k
-    shape = v.shape
+
+
+GGML_QK8_0 = 32
+GGML_QK4_0 = 32
+GGML_QK4_1 = 32
+
+
+GGML_MEM_ALIGN = 16
+
+def float32Toint8(tensor):
+    # reinterpret the raw bytes of a float32 tensor as int8 (the last dim grows by a factor of 4)
+    oriShape = tensor.shape
+    newLastElement = oriShape[-1] * 4
+    newShape = oriShape[:-1] + (newLastElement,)
+    tensor_bytes = tensor.numpy().tobytes()
+    return torch.tensor(np.frombuffer(tensor_bytes, dtype=np.int8)).view(newShape)
+
+def offset(tensor, alignment):
+    # number of bytes occupied by the tensor
+    num_bytes = tensor.element_size() * tensor.nelement()
+    # number of padding bytes needed to reach the next aligned address
+    padding = (alignment - (num_bytes % alignment)) % alignment
+    return num_bytes + padding, padding
+
+def quantize_q8_0(tensor: torch.Tensor) -> torch.Tensor:
+    """
+    src: https://github.com/li-plus/chatglm.cpp/blob/04910ce72a5d22087ec6e404dbefd73c1ccf2700/chatglm_cpp/convert.py#L51
+    """
+    # equivalent to ggml_quantize_q8_0 in ggml.c
+    if len(tensor.shape) == 1:
+        tensor = tensor.unsqueeze(0)
+    assert tensor.shape[1] % GGML_QK8_0 == 0
+    tensor = tensor.view(-1, GGML_QK8_0)
+    scale = tensor.abs().max(dim=-1, keepdim=True).values / ((1 << 7) - 1)
+    tensor = (tensor / scale).round().clamp(min=-128, max=127).type(torch.int8)
+    # add scale into each block
+    tensor = torch.cat((float32Toint8(scale.float()), tensor), dim=-1)
+    return tensor
+
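As a sanity check on the block layout produced by `quantize_q8_0` (each 32-value block is stored as its fp32 scale reinterpreted as four int8 bytes via `float32Toint8`, followed by the 32 quantized int8 weights), a round-trip helper might look like the sketch below. `dequantize_q8_0` is hypothetical and only for illustration; it is not part of the converter or of InferLLM.

```python
def dequantize_q8_0(blocks: torch.Tensor) -> torch.Tensor:
    # blocks: int8 tensor of shape (n_blocks, 4 + GGML_QK8_0), as returned by quantize_q8_0
    scale_bytes = blocks[:, :4].contiguous().numpy().tobytes()
    scales = torch.tensor(np.frombuffer(scale_bytes, dtype=np.float32).copy()).view(-1, 1)
    return blocks[:, 4:].float() * scales

# round-trip check: the reconstruction should match the input to within one quantization step
x = torch.randn(4, GGML_QK8_0)
x_hat = dequantize_q8_0(quantize_q8_0(x))
print((x - x_hat).abs().max())  # at most about x.abs().max() / 254
```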
+def quantize_quint4(tensor: torch.Tensor) -> torch.Tensor:
+    """
+    src: https://github.com/li-plus/chatglm.cpp/blob/04910ce72a5d22087ec6e404dbefd73c1ccf2700/chatglm_cpp/convert.py#L62
+    """
+    # equivalent to ggml_quantize_q4_0 in ggml.c
+    if len(tensor.shape) == 1:
+        tensor = tensor.unsqueeze(0)
+    assert tensor.shape[1] % GGML_QK4_0 == 0
+    tensor = tensor.view(-1, GGML_QK4_0)
+    abs_max_indices = tensor.abs().max(dim=-1, keepdim=True).indices
+    max_values = torch.take_along_dim(tensor, abs_max_indices, dim=-1)
+    scale = max_values / -8
+    tensor = (tensor / scale + 8).round().clamp(min=0, max=15).char()
+    # compress two int4 weights into an int8
+    tensor = tensor[:, :16] | (tensor[:, 16:] << 4).type(torch.int8)
+    # add scale into each block
+    tensor = torch.cat((float32Toint8(scale.float()), tensor), dim=-1)
+    return tensor
+
+
+def quantize_qint4(tensor: torch.Tensor) -> torch.Tensor:
+    """
+    src: https://github.com/li-plus/chatglm.cpp/blob/04910ce72a5d22087ec6e404dbefd73c1ccf2700/chatglm_cpp/convert.py#L62
+    """
+    # equivalent to ggml_quantize_q4_0 in ggml.c
+    if len(tensor.shape) == 1:
+        tensor = tensor.unsqueeze(0)
+    assert tensor.shape[1] % GGML_QK4_0 == 0
+    tensor = tensor.view(-1, GGML_QK4_0)
+    abs_max_indices = tensor.abs().max(dim=-1, keepdim=True).indices
+    max_values = torch.take_along_dim(tensor, abs_max_indices, dim=-1)
+    scale = max_values / -8
+    tensor = (tensor / scale).round().clamp(min=-8, max=7).char()
+    # compress two int4 weights into an int8
+    tensor = tensor[:, :16] | (tensor[:, 16:] << 4).type(torch.int8)
+    # add scale into each block
+    tensor = torch.cat((float32Toint8(scale.float()), tensor), dim=-1)
+    return tensor
+
+def dump_tensor(f, name: str, tensor: torch.Tensor, ggml_type: GGMLType):
+    assert tensor.dtype == torch.float32
+    shape = tensor.shape

     # skip layers.X.attention.inner_attention.rope.freqs
-    if name[-5:] == "freqs":
-        continue
+    if name[-5:] == "freqs" or name[-4:] == "freq":
+        return
+

-    print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)
     if name.endswith("query_key_value.weight") or name.endswith("attention.query_key_value.bias"):
         if version == 1:
-            v = v.reshape(32, 3, -1).transpose(0, 1).reshape(-1, 4096)
+            tensor = tensor.reshape(32, 3, -1).transpose(0, 1).reshape(-1, 4096)
+    dshape = tensor.shape
+    sname = name.encode('utf-8')

-    data = v.numpy().squeeze()
-    n_dims = len(data.shape)

-    dshape = data.shape
-    sname = name.encode('utf-8')
-    print("write tensor: ", name, " to file :", fout.tell())
-    fout.write(struct.pack("iii", n_dims, len(sname), dtype))
+    if "layernorm" not in name:
+        # tensor data
+        if ggml_type == GGMLType.F32:
+            tensor = tensor.float()
+        elif ggml_type == GGMLType.F16:
+            tensor = tensor.half()
+        elif ggml_type == GGMLType.QInt8:
+            tensor = quantize_q8_0(tensor)
+        elif ggml_type == GGMLType.QInt4:
+            tensor = quantize_qint4(tensor)
+        else:
+            raise NotImplementedError(f"Cannot dump tensor as GGML type {ggml_type}")
+    else:
+        # layernorm weights stay in fp32 regardless of the requested quantization
+        tensor = tensor.float()
+        ggml_type = GGMLType.F32
+
+    n_dims = len(dshape)  # logical (pre-quantization) shape written to the record header
+    print("Processing variable: " + name + " with shape: ", dshape, " and type: ", ggml_type.value)
+    f.write(struct.pack("iii", n_dims, len(sname), ggml_type.value))
     for i in range(n_dims):
-        fout.write(struct.pack("i", dshape[i]))
-    fout.write(sname)
+        f.write(struct.pack("i", dshape[i]))
+    f.write(sname)
+    print("write tensor:", name, "to file:", f.tell())
+
+    tensor.numpy().tofile(f)
+    # pad quantized tensors so the next record starts at an aligned offset
+    if ggml_type == GGMLType.QInt8 or ggml_type == GGMLType.QInt4:
+        length, paddingSize = offset(tensor, alignment_size)
+        if paddingSize > 0:
+            paddingTensor = torch.zeros(paddingSize, dtype=torch.int8)  # padding is counted in bytes
+            paddingTensor.numpy().tofile(f)
+            print("write paddingTensor:", name, "paddingSize:", paddingSize, "to file:", f.tell())
+
+for k, v in model.items():
+    dump_tensor(fout, k, v, dtype)

-    # data
-    data.tofile(fout)

 # I hope this deallocates the memory ..
 model = None
|
|