@@ -1,11 +1,9 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os
-from typing import Dict, List, Optional
+from typing import List, Optional
 
 import json
 import torch
-import transformers
-from packaging import version
 
 from swift.llm import get_model_tokenizer, get_template
 from swift.utils import (check_json_format, get_logger, get_main, get_model_info, push_to_ms_hub, seed_everything,
@@ -66,67 +64,8 @@ def _get_dataset(*args, **kwargs):
 
 def awq_model_quantize(awq_model, tokenizer, batch_size) -> None:
 
-    def _llama_rotary_emb_forward(self, x, position_ids):
-        with torch.no_grad():
-            if 'dynamic' in self.rope_type:
-                self._dynamic_frequency_update(position_ids, device=x.device)
-
-            # Core RoPE block
-            inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
-            position_ids_expanded = position_ids[:, None, :].float()
-            # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
-            device_type = x.device.type
-            device_type = device_type if isinstance(device_type, str) and device_type != 'mps' else 'cpu'
-            with torch.autocast(device_type=device_type, enabled=False):
-                inv_freq_expanded = inv_freq_expanded.to(position_ids_expanded.device)
-                freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
-                emb = torch.cat((freqs, freqs), dim=-1)
-                cos = emb.cos()
-                sin = emb.sin()
-
-            # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
-            cos = cos * self.attention_scaling
-            sin = sin * self.attention_scaling
-
-        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
-
-    @torch.no_grad()
-    def _module_forward(self, x: torch.Tensor, module: torch.nn.Module, module_kwargs: Dict) -> torch.Tensor:
-        # The original code of awq.AwqQuantizer._module_forward has a bug with n_parallel_calib_samples
-        if self.n_parallel_calib_samples is None:
-            # runs through all samples at once
-            module_output = module(x, **module_kwargs)
-            if isinstance(module_output, tuple):
-                module_output = module_output[0]
-        else:
-            # memory efficiently runs through all calibration samples
-            # but only n_parallel_calib_samples at a time
-            module_output = []
-            partitioned_inputs = torch.split(x, self.n_parallel_calib_samples)
-            for idx, x_partial in enumerate(partitioned_inputs):
-                tmp_module_kwargs = {**module_kwargs}
-                if tmp_module_kwargs.get('attention_mask'):
-                    tmp_module_kwargs['attention_mask'] = tmp_module_kwargs['attention_mask'][idx:idx + self.
-                                                                                              n_parallel_calib_samples]
-                partial_output = module(x_partial, **tmp_module_kwargs)
-
-                if isinstance(partial_output, tuple):
-                    partial_output = partial_output[0]
-
-                module_output.append(partial_output.cpu())
-
-            module_output = torch.cat(module_output, dim=0)
-
-        return module_output
-
-    import awq
     from awq.quantize import quantizer
     from transformers import AwqConfig
-    if version.parse(awq.__version__) >= version.parse('0.2.6'):
-        quantizer.AwqQuantizer._module_forward = _module_forward
-
-    if version.parse(transformers.__version__) >= version.parse('4.43.0'):
-        transformers.models.llama.modeling_llama.LlamaRotaryEmbedding.forward = _llama_rotary_emb_forward
 
     assert _args is not None
     logger.info(f'Quantization dataset: {_args.dataset}')
@@ -257,7 +196,6 @@ def llm_export(args: ExportArguments) -> None:
         model.config.quantization_config.pop('dataset', None)
         gptq_quantizer.save(model, args.quant_output_dir)
     elif args.quant_method == 'bnb':
-        args.quant_device_map = 'auto'  # cannot use cpu on bnb
         args.quantization_bit = args.quant_bits
         args.bnb_4bit_compute_dtype, args.load_in_4bit, args.load_in_8bit = args.select_bnb()
         model, template = prepare_model_template(args, device_map=args.quant_device_map, verbose=False)