"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ "
\n",
+ " [10/10 00:31, Epoch 0/1]\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | Step | \n",
+ " Training Loss | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 1 | \n",
+ " 1.517500 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1.428600 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1.369000 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 1.747600 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 1.629700 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 1.517800 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 1.584500 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 2.080300 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 1.817600 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " 1.349400 | \n",
+ "
\n",
+ " \n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from trl import SFTTrainer\n",
+ "from transformers import TrainingArguments\n",
+ "import os\n",
+ "\n",
+ "tokenizer.pad_token = tokenizer.eos_token\n",
+ "\n",
+ "# prepare model for training... \n",
+ "dtype = model.base_model.model.model.embed_tokens.parameters().__next__().dtype\n",
+ "model.base_model.model.model.embed_tokens.to(torch.float32)\n",
+ "model.base_model.model.lm_head.to(torch.float32)\n",
+ "for param in model.parameters():\n",
+ " if param.requires_grad:\n",
+ " param.data = param.data.to(torch.float32)\n",
+ "\n",
+ "\n",
+ "trainer = SFTTrainer(\n",
+ " model = model,\n",
+ " tokenizer = tokenizer,\n",
+ " train_dataset = dataset,\n",
+ " eval_dataset = None,\n",
+ " dataset_text_field = \"text\",\n",
+ " max_seq_length = 1024,\n",
+ " dataset_num_proc = 2,\n",
+ " packing = False, # Can make training 5x faster for short sequences.\n",
+ " args = TrainingArguments(\n",
+ " gradient_checkpointing=False,\n",
+ " per_device_train_batch_size = 1,\n",
+ " gradient_accumulation_steps = 1,\n",
+ " warmup_steps = 0,\n",
+ " max_steps = 10,\n",
+ " #num_train_epochs = 1,\n",
+ " learning_rate = 2e-3,\n",
+ " fp16 = not torch.cuda.is_bf16_supported(),\n",
+ " bf16 = torch.cuda.is_bf16_supported(),\n",
+ " logging_steps = 1,\n",
+ " optim = \"adamw_8bit\",\n",
+ " weight_decay = 0.01,\n",
+ " #lr_scheduler_type = \"linear\",\n",
+ " seed = 3407,\n",
+ " output_dir = \"outputs\",\n",
+ " ),\n",
+ ")\n",
+ "\n",
+ "with torch.cuda.amp.autocast():\n",
+ " trainer_stats = trainer.train()\n",
+ "\n",
+ "# prepare model for inference \n",
+ "model.base_model.model.model.embed_tokens.to(dtype)\n",
+ "model.base_model.model.lm_head.to(dtype)\n",
+ "\n",
+ "for param in model.parameters():\n",
+ " if param.requires_grad:\n",
+ " param.data = param.data.to(torch.float16)"
+ ]
+ },
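+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The casts above are a round trip: embeddings, LM head, and trainable parameters go to float32 for the optimizer, then back to the original dtype afterwards. A minimal sanity check (assuming the base model was loaded in half precision) is sketched below."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Hedged sanity check: after restoring dtypes, no parameter should be left\n",
+ "# in float32 (assumes the base model was loaded in fp16/bf16).\n",
+ "leftover = [n for n, p in model.named_parameters() if p.dtype == torch.float32]\n",
+ "print(len(leftover), leftover[:5])"
+ ]
+ },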
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "tensor(0.2426, dtype=torch.float16)\n"
+ ]
+ }
+ ],
+ "source": [
+ "from matplotlib import pyplot as plt\n",
+ "w_after = model.base_model.model.model.embed_tokens.new_embeddings.weight.detach().cpu().clone()\n",
+ "print(((w_before - w_after)**2).sum())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Test saving and loading"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/dlabdata1/wendler/.rlllm/lib/python3.11/site-packages/peft/utils/save_and_load.py:154: UserWarning: Could not find a config file in /dlabscratch1/public/llm_weights/llama2_hf/Llama-2-7b-hf/ - will assume that the vocabulary was not modified.\n",
+ " warnings.warn(\n"
+ ]
+ }
+ ],
+ "source": [
+ "surgeon.save(model, '/dlabscratch1/tmp/peft_test')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "PeftModelForCausalLM(\n",
+ " (base_model): LoraModel(\n",
+ " (model): LlamaForCausalLM(\n",
+ " (model): LlamaModel(\n",
+ " (embed_tokens): Embedding(32000, 4096)\n",
+ " (layers): ModuleList(\n",
+ " (0-31): 32 x LlamaDecoderLayer(\n",
+ " (self_attn): LlamaSdpaAttention(\n",
+ " (q_proj): lora.Linear(\n",
+ " (base_layer): Linear(in_features=4096, out_features=4096, bias=False)\n",
+ " (lora_dropout): ModuleDict(\n",
+ " (default): Identity()\n",
+ " )\n",
+ " (lora_A): ModuleDict(\n",
+ " (default): Linear(in_features=4096, out_features=64, bias=False)\n",
+ " )\n",
+ " (lora_B): ModuleDict(\n",
+ " (default): Linear(in_features=64, out_features=4096, bias=False)\n",
+ " )\n",
+ " (lora_embedding_A): ParameterDict()\n",
+ " (lora_embedding_B): ParameterDict()\n",
+ " )\n",
+ " (k_proj): lora.Linear(\n",
+ " (base_layer): Linear(in_features=4096, out_features=4096, bias=False)\n",
+ " (lora_dropout): ModuleDict(\n",
+ " (default): Identity()\n",
+ " )\n",
+ " (lora_A): ModuleDict(\n",
+ " (default): Linear(in_features=4096, out_features=64, bias=False)\n",
+ " )\n",
+ " (lora_B): ModuleDict(\n",
+ " (default): Linear(in_features=64, out_features=4096, bias=False)\n",
+ " )\n",
+ " (lora_embedding_A): ParameterDict()\n",
+ " (lora_embedding_B): ParameterDict()\n",
+ " )\n",
+ " (v_proj): lora.Linear(\n",
+ " (base_layer): Linear(in_features=4096, out_features=4096, bias=False)\n",
+ " (lora_dropout): ModuleDict(\n",
+ " (default): Identity()\n",
+ " )\n",
+ " (lora_A): ModuleDict(\n",
+ " (default): Linear(in_features=4096, out_features=64, bias=False)\n",
+ " )\n",
+ " (lora_B): ModuleDict(\n",
+ " (default): Linear(in_features=64, out_features=4096, bias=False)\n",
+ " )\n",
+ " (lora_embedding_A): ParameterDict()\n",
+ " (lora_embedding_B): ParameterDict()\n",
+ " )\n",
+ " (o_proj): lora.Linear(\n",
+ " (base_layer): Linear(in_features=4096, out_features=4096, bias=False)\n",
+ " (lora_dropout): ModuleDict(\n",
+ " (default): Identity()\n",
+ " )\n",
+ " (lora_A): ModuleDict(\n",
+ " (default): Linear(in_features=4096, out_features=64, bias=False)\n",
+ " )\n",
+ " (lora_B): ModuleDict(\n",
+ " (default): Linear(in_features=64, out_features=4096, bias=False)\n",
+ " )\n",
+ " (lora_embedding_A): ParameterDict()\n",
+ " (lora_embedding_B): ParameterDict()\n",
+ " )\n",
+ " (rotary_emb): LlamaRotaryEmbedding()\n",
+ " )\n",
+ " (mlp): LlamaMLP(\n",
+ " (gate_proj): lora.Linear(\n",
+ " (base_layer): Linear(in_features=4096, out_features=11008, bias=False)\n",
+ " (lora_dropout): ModuleDict(\n",
+ " (default): Identity()\n",
+ " )\n",
+ " (lora_A): ModuleDict(\n",
+ " (default): Linear(in_features=4096, out_features=64, bias=False)\n",
+ " )\n",
+ " (lora_B): ModuleDict(\n",
+ " (default): Linear(in_features=64, out_features=11008, bias=False)\n",
+ " )\n",
+ " (lora_embedding_A): ParameterDict()\n",
+ " (lora_embedding_B): ParameterDict()\n",
+ " )\n",
+ " (up_proj): lora.Linear(\n",
+ " (base_layer): Linear(in_features=4096, out_features=11008, bias=False)\n",
+ " (lora_dropout): ModuleDict(\n",
+ " (default): Identity()\n",
+ " )\n",
+ " (lora_A): ModuleDict(\n",
+ " (default): Linear(in_features=4096, out_features=64, bias=False)\n",
+ " )\n",
+ " (lora_B): ModuleDict(\n",
+ " (default): Linear(in_features=64, out_features=11008, bias=False)\n",
+ " )\n",
+ " (lora_embedding_A): ParameterDict()\n",
+ " (lora_embedding_B): ParameterDict()\n",
+ " )\n",
+ " (down_proj): lora.Linear(\n",
+ " (base_layer): Linear(in_features=11008, out_features=4096, bias=False)\n",
+ " (lora_dropout): ModuleDict(\n",
+ " (default): Identity()\n",
+ " )\n",
+ " (lora_A): ModuleDict(\n",
+ " (default): Linear(in_features=11008, out_features=64, bias=False)\n",
+ " )\n",
+ " (lora_B): ModuleDict(\n",
+ " (default): Linear(in_features=64, out_features=4096, bias=False)\n",
+ " )\n",
+ " (lora_embedding_A): ParameterDict()\n",
+ " (lora_embedding_B): ParameterDict()\n",
+ " )\n",
+ " (act_fn): SiLU()\n",
+ " )\n",
+ " (input_layernorm): LlamaRMSNorm()\n",
+ " (post_attention_layernorm): LlamaRMSNorm()\n",
+ " )\n",
+ " )\n",
+ " (norm): LlamaRMSNorm()\n",
+ " )\n",
+ " (lm_head): Linear(in_features=4096, out_features=32000, bias=False)\n",
+ " )\n",
+ " )\n",
+ ")"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import gc\n",
+ "model.cpu()\n",
+ "gc.collect()\n",
+ "for i in range(torch.cuda.device_count()):\n",
+ " torch.cuda.set_device(i) \n",
+ " torch.cuda.empty_cache() "
+ ]
+ },
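+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Optionally, report per-device memory after the cache flush to confirm the GPUs are actually free; this is a small sketch using the standard `torch.cuda` counters."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Print allocated/reserved memory per visible GPU (illustrative check).\n",
+ "for i in range(torch.cuda.device_count()):\n",
+ " alloc = torch.cuda.memory_allocated(i) / 2**30\n",
+ " reserved = torch.cuda.memory_reserved(i) / 2**30\n",
+ " print(f\"cuda:{i}: {alloc:.2f} GiB allocated, {reserved:.2f} GiB reserved\")"
+ ]
+ },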
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "96dc671298af4c36b29b86cc0219d4d1",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Loading checkpoint shards: 0%| | 0/2 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "surgeon2 = PeftModelEmbeddingSurgeon.load('/dlabscratch1/tmp/peft_test', device_map='auto', torch_dtype=torch.float16)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model2 = surgeon2.get_surgeried_model()"
+ ]
+ },
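+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a round-trip check, the reloaded `new_embeddings` should match the weights trained above. This sketch assumes `model2` exposes the same module path as `model`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Compare the trained embedding rows with the reloaded ones (assumed path).\n",
+ "w_loaded = model2.base_model.model.model.embed_tokens.new_embeddings.weight.detach().cpu()\n",
+ "print(((w_after.to(w_loaded.dtype) - w_loaded) ** 2).sum())"
+ ]
+ },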
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "model.embed_tokens.old_embeddings.weight cuda:0\n",
+ "model.embed_tokens.new_embeddings.weight cuda:0\n",
+ "model.layers.0.self_attn.q_proj.base_layer.weight cuda:0\n",
+ "model.layers.0.self_attn.q_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.0.self_attn.q_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.0.self_attn.k_proj.base_layer.weight cuda:0\n",
+ "model.layers.0.self_attn.k_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.0.self_attn.k_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.0.self_attn.v_proj.base_layer.weight cuda:0\n",
+ "model.layers.0.self_attn.v_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.0.self_attn.v_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.0.self_attn.o_proj.base_layer.weight cuda:0\n",
+ "model.layers.0.self_attn.o_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.0.self_attn.o_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.0.mlp.gate_proj.base_layer.weight cuda:0\n",
+ "model.layers.0.mlp.gate_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.0.mlp.gate_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.0.mlp.up_proj.base_layer.weight cuda:0\n",
+ "model.layers.0.mlp.up_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.0.mlp.up_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.0.mlp.down_proj.base_layer.weight cuda:0\n",
+ "model.layers.0.mlp.down_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.0.mlp.down_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.0.input_layernorm.weight cuda:0\n",
+ "model.layers.0.post_attention_layernorm.weight cuda:0\n",
+ "model.layers.1.self_attn.q_proj.base_layer.weight cuda:0\n",
+ "model.layers.1.self_attn.q_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.1.self_attn.q_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.1.self_attn.k_proj.base_layer.weight cuda:0\n",
+ "model.layers.1.self_attn.k_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.1.self_attn.k_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.1.self_attn.v_proj.base_layer.weight cuda:0\n",
+ "model.layers.1.self_attn.v_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.1.self_attn.v_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.1.self_attn.o_proj.base_layer.weight cuda:0\n",
+ "model.layers.1.self_attn.o_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.1.self_attn.o_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.1.mlp.gate_proj.base_layer.weight cuda:0\n",
+ "model.layers.1.mlp.gate_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.1.mlp.gate_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.1.mlp.up_proj.base_layer.weight cuda:0\n",
+ "model.layers.1.mlp.up_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.1.mlp.up_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.1.mlp.down_proj.base_layer.weight cuda:0\n",
+ "model.layers.1.mlp.down_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.1.mlp.down_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.1.input_layernorm.weight cuda:0\n",
+ "model.layers.1.post_attention_layernorm.weight cuda:0\n",
+ "model.layers.2.self_attn.q_proj.base_layer.weight cuda:0\n",
+ "model.layers.2.self_attn.q_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.2.self_attn.q_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.2.self_attn.k_proj.base_layer.weight cuda:0\n",
+ "model.layers.2.self_attn.k_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.2.self_attn.k_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.2.self_attn.v_proj.base_layer.weight cuda:0\n",
+ "model.layers.2.self_attn.v_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.2.self_attn.v_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.2.self_attn.o_proj.base_layer.weight cuda:0\n",
+ "model.layers.2.self_attn.o_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.2.self_attn.o_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.2.mlp.gate_proj.base_layer.weight cuda:0\n",
+ "model.layers.2.mlp.gate_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.2.mlp.gate_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.2.mlp.up_proj.base_layer.weight cuda:0\n",
+ "model.layers.2.mlp.up_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.2.mlp.up_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.2.mlp.down_proj.base_layer.weight cuda:0\n",
+ "model.layers.2.mlp.down_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.2.mlp.down_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.2.input_layernorm.weight cuda:0\n",
+ "model.layers.2.post_attention_layernorm.weight cuda:0\n",
+ "model.layers.3.self_attn.q_proj.base_layer.weight cuda:0\n",
+ "model.layers.3.self_attn.q_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.3.self_attn.q_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.3.self_attn.k_proj.base_layer.weight cuda:0\n",
+ "model.layers.3.self_attn.k_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.3.self_attn.k_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.3.self_attn.v_proj.base_layer.weight cuda:0\n",
+ "model.layers.3.self_attn.v_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.3.self_attn.v_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.3.self_attn.o_proj.base_layer.weight cuda:0\n",
+ "model.layers.3.self_attn.o_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.3.self_attn.o_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.3.mlp.gate_proj.base_layer.weight cuda:0\n",
+ "model.layers.3.mlp.gate_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.3.mlp.gate_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.3.mlp.up_proj.base_layer.weight cuda:0\n",
+ "model.layers.3.mlp.up_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.3.mlp.up_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.3.mlp.down_proj.base_layer.weight cuda:0\n",
+ "model.layers.3.mlp.down_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.3.mlp.down_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.3.input_layernorm.weight cuda:0\n",
+ "model.layers.3.post_attention_layernorm.weight cuda:0\n",
+ "model.layers.4.self_attn.q_proj.base_layer.weight cuda:0\n",
+ "model.layers.4.self_attn.q_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.4.self_attn.q_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.4.self_attn.k_proj.base_layer.weight cuda:0\n",
+ "model.layers.4.self_attn.k_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.4.self_attn.k_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.4.self_attn.v_proj.base_layer.weight cuda:0\n",
+ "model.layers.4.self_attn.v_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.4.self_attn.v_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.4.self_attn.o_proj.base_layer.weight cuda:0\n",
+ "model.layers.4.self_attn.o_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.4.self_attn.o_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.4.mlp.gate_proj.base_layer.weight cuda:0\n",
+ "model.layers.4.mlp.gate_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.4.mlp.gate_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.4.mlp.up_proj.base_layer.weight cuda:0\n",
+ "model.layers.4.mlp.up_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.4.mlp.up_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.4.mlp.down_proj.base_layer.weight cuda:0\n",
+ "model.layers.4.mlp.down_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.4.mlp.down_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.4.input_layernorm.weight cuda:0\n",
+ "model.layers.4.post_attention_layernorm.weight cuda:0\n",
+ "model.layers.5.self_attn.q_proj.base_layer.weight cuda:0\n",
+ "model.layers.5.self_attn.q_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.5.self_attn.q_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.5.self_attn.k_proj.base_layer.weight cuda:0\n",
+ "model.layers.5.self_attn.k_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.5.self_attn.k_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.5.self_attn.v_proj.base_layer.weight cuda:0\n",
+ "model.layers.5.self_attn.v_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.5.self_attn.v_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.5.self_attn.o_proj.base_layer.weight cuda:0\n",
+ "model.layers.5.self_attn.o_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.5.self_attn.o_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.5.mlp.gate_proj.base_layer.weight cuda:0\n",
+ "model.layers.5.mlp.gate_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.5.mlp.gate_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.5.mlp.up_proj.base_layer.weight cuda:0\n",
+ "model.layers.5.mlp.up_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.5.mlp.up_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.5.mlp.down_proj.base_layer.weight cuda:0\n",
+ "model.layers.5.mlp.down_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.5.mlp.down_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.5.input_layernorm.weight cuda:0\n",
+ "model.layers.5.post_attention_layernorm.weight cuda:0\n",
+ "model.layers.6.self_attn.q_proj.base_layer.weight cuda:0\n",
+ "model.layers.6.self_attn.q_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.6.self_attn.q_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.6.self_attn.k_proj.base_layer.weight cuda:0\n",
+ "model.layers.6.self_attn.k_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.6.self_attn.k_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.6.self_attn.v_proj.base_layer.weight cuda:0\n",
+ "model.layers.6.self_attn.v_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.6.self_attn.v_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.6.self_attn.o_proj.base_layer.weight cuda:0\n",
+ "model.layers.6.self_attn.o_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.6.self_attn.o_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.6.mlp.gate_proj.base_layer.weight cuda:0\n",
+ "model.layers.6.mlp.gate_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.6.mlp.gate_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.6.mlp.up_proj.base_layer.weight cuda:0\n",
+ "model.layers.6.mlp.up_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.6.mlp.up_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.6.mlp.down_proj.base_layer.weight cuda:0\n",
+ "model.layers.6.mlp.down_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.6.mlp.down_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.6.input_layernorm.weight cuda:0\n",
+ "model.layers.6.post_attention_layernorm.weight cuda:0\n",
+ "model.layers.7.self_attn.q_proj.base_layer.weight cuda:0\n",
+ "model.layers.7.self_attn.q_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.7.self_attn.q_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.7.self_attn.k_proj.base_layer.weight cuda:0\n",
+ "model.layers.7.self_attn.k_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.7.self_attn.k_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.7.self_attn.v_proj.base_layer.weight cuda:0\n",
+ "model.layers.7.self_attn.v_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.7.self_attn.v_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.7.self_attn.o_proj.base_layer.weight cuda:0\n",
+ "model.layers.7.self_attn.o_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.7.self_attn.o_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.7.mlp.gate_proj.base_layer.weight cuda:0\n",
+ "model.layers.7.mlp.gate_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.7.mlp.gate_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.7.mlp.up_proj.base_layer.weight cuda:0\n",
+ "model.layers.7.mlp.up_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.7.mlp.up_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.7.mlp.down_proj.base_layer.weight cuda:0\n",
+ "model.layers.7.mlp.down_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.7.mlp.down_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.7.input_layernorm.weight cuda:0\n",
+ "model.layers.7.post_attention_layernorm.weight cuda:0\n",
+ "model.layers.8.self_attn.q_proj.base_layer.weight cuda:0\n",
+ "model.layers.8.self_attn.q_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.8.self_attn.q_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.8.self_attn.k_proj.base_layer.weight cuda:0\n",
+ "model.layers.8.self_attn.k_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.8.self_attn.k_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.8.self_attn.v_proj.base_layer.weight cuda:0\n",
+ "model.layers.8.self_attn.v_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.8.self_attn.v_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.8.self_attn.o_proj.base_layer.weight cuda:0\n",
+ "model.layers.8.self_attn.o_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.8.self_attn.o_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.8.mlp.gate_proj.base_layer.weight cuda:0\n",
+ "model.layers.8.mlp.gate_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.8.mlp.gate_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.8.mlp.up_proj.base_layer.weight cuda:0\n",
+ "model.layers.8.mlp.up_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.8.mlp.up_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.8.mlp.down_proj.base_layer.weight cuda:0\n",
+ "model.layers.8.mlp.down_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.8.mlp.down_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.8.input_layernorm.weight cuda:0\n",
+ "model.layers.8.post_attention_layernorm.weight cuda:0\n",
+ "model.layers.9.self_attn.q_proj.base_layer.weight cuda:0\n",
+ "model.layers.9.self_attn.q_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.9.self_attn.q_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.9.self_attn.k_proj.base_layer.weight cuda:0\n",
+ "model.layers.9.self_attn.k_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.9.self_attn.k_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.9.self_attn.v_proj.base_layer.weight cuda:0\n",
+ "model.layers.9.self_attn.v_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.9.self_attn.v_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.9.self_attn.o_proj.base_layer.weight cuda:0\n",
+ "model.layers.9.self_attn.o_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.9.self_attn.o_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.9.mlp.gate_proj.base_layer.weight cuda:0\n",
+ "model.layers.9.mlp.gate_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.9.mlp.gate_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.9.mlp.up_proj.base_layer.weight cuda:0\n",
+ "model.layers.9.mlp.up_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.9.mlp.up_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.9.mlp.down_proj.base_layer.weight cuda:0\n",
+ "model.layers.9.mlp.down_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.9.mlp.down_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.9.input_layernorm.weight cuda:0\n",
+ "model.layers.9.post_attention_layernorm.weight cuda:0\n",
+ "model.layers.10.self_attn.q_proj.base_layer.weight cuda:0\n",
+ "model.layers.10.self_attn.q_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.10.self_attn.q_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.10.self_attn.k_proj.base_layer.weight cuda:0\n",
+ "model.layers.10.self_attn.k_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.10.self_attn.k_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.10.self_attn.v_proj.base_layer.weight cuda:0\n",
+ "model.layers.10.self_attn.v_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.10.self_attn.v_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.10.self_attn.o_proj.base_layer.weight cuda:0\n",
+ "model.layers.10.self_attn.o_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.10.self_attn.o_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.10.mlp.gate_proj.base_layer.weight cuda:0\n",
+ "model.layers.10.mlp.gate_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.10.mlp.gate_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.10.mlp.up_proj.base_layer.weight cuda:0\n",
+ "model.layers.10.mlp.up_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.10.mlp.up_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.10.mlp.down_proj.base_layer.weight cuda:0\n",
+ "model.layers.10.mlp.down_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.10.mlp.down_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.10.input_layernorm.weight cuda:0\n",
+ "model.layers.10.post_attention_layernorm.weight cuda:0\n",
+ "model.layers.11.self_attn.q_proj.base_layer.weight cuda:0\n",
+ "model.layers.11.self_attn.q_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.11.self_attn.q_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.11.self_attn.k_proj.base_layer.weight cuda:0\n",
+ "model.layers.11.self_attn.k_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.11.self_attn.k_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.11.self_attn.v_proj.base_layer.weight cuda:0\n",
+ "model.layers.11.self_attn.v_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.11.self_attn.v_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.11.self_attn.o_proj.base_layer.weight cuda:0\n",
+ "model.layers.11.self_attn.o_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.11.self_attn.o_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.11.mlp.gate_proj.base_layer.weight cuda:0\n",
+ "model.layers.11.mlp.gate_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.11.mlp.gate_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.11.mlp.up_proj.base_layer.weight cuda:0\n",
+ "model.layers.11.mlp.up_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.11.mlp.up_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.11.mlp.down_proj.base_layer.weight cuda:0\n",
+ "model.layers.11.mlp.down_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.11.mlp.down_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.11.input_layernorm.weight cuda:0\n",
+ "model.layers.11.post_attention_layernorm.weight cuda:0\n",
+ "model.layers.12.self_attn.q_proj.base_layer.weight cuda:0\n",
+ "model.layers.12.self_attn.q_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.12.self_attn.q_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.12.self_attn.k_proj.base_layer.weight cuda:0\n",
+ "model.layers.12.self_attn.k_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.12.self_attn.k_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.12.self_attn.v_proj.base_layer.weight cuda:0\n",
+ "model.layers.12.self_attn.v_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.12.self_attn.v_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.12.self_attn.o_proj.base_layer.weight cuda:0\n",
+ "model.layers.12.self_attn.o_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.12.self_attn.o_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.12.mlp.gate_proj.base_layer.weight cuda:0\n",
+ "model.layers.12.mlp.gate_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.12.mlp.gate_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.12.mlp.up_proj.base_layer.weight cuda:0\n",
+ "model.layers.12.mlp.up_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.12.mlp.up_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.12.mlp.down_proj.base_layer.weight cuda:0\n",
+ "model.layers.12.mlp.down_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.12.mlp.down_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.12.input_layernorm.weight cuda:0\n",
+ "model.layers.12.post_attention_layernorm.weight cuda:0\n",
+ "model.layers.13.self_attn.q_proj.base_layer.weight cuda:0\n",
+ "model.layers.13.self_attn.q_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.13.self_attn.q_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.13.self_attn.k_proj.base_layer.weight cuda:0\n",
+ "model.layers.13.self_attn.k_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.13.self_attn.k_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.13.self_attn.v_proj.base_layer.weight cuda:0\n",
+ "model.layers.13.self_attn.v_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.13.self_attn.v_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.13.self_attn.o_proj.base_layer.weight cuda:0\n",
+ "model.layers.13.self_attn.o_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.13.self_attn.o_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.13.mlp.gate_proj.base_layer.weight cuda:0\n",
+ "model.layers.13.mlp.gate_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.13.mlp.gate_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.13.mlp.up_proj.base_layer.weight cuda:0\n",
+ "model.layers.13.mlp.up_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.13.mlp.up_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.13.mlp.down_proj.base_layer.weight cuda:0\n",
+ "model.layers.13.mlp.down_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.13.mlp.down_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.13.input_layernorm.weight cuda:0\n",
+ "model.layers.13.post_attention_layernorm.weight cuda:0\n",
+ "model.layers.14.self_attn.q_proj.base_layer.weight cuda:0\n",
+ "model.layers.14.self_attn.q_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.14.self_attn.q_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.14.self_attn.k_proj.base_layer.weight cuda:0\n",
+ "model.layers.14.self_attn.k_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.14.self_attn.k_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.14.self_attn.v_proj.base_layer.weight cuda:0\n",
+ "model.layers.14.self_attn.v_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.14.self_attn.v_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.14.self_attn.o_proj.base_layer.weight cuda:0\n",
+ "model.layers.14.self_attn.o_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.14.self_attn.o_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.14.mlp.gate_proj.base_layer.weight cuda:0\n",
+ "model.layers.14.mlp.gate_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.14.mlp.gate_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.14.mlp.up_proj.base_layer.weight cuda:0\n",
+ "model.layers.14.mlp.up_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.14.mlp.up_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.14.mlp.down_proj.base_layer.weight cuda:0\n",
+ "model.layers.14.mlp.down_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.14.mlp.down_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.14.input_layernorm.weight cuda:0\n",
+ "model.layers.14.post_attention_layernorm.weight cuda:0\n",
+ "model.layers.15.self_attn.q_proj.base_layer.weight cuda:0\n",
+ "model.layers.15.self_attn.q_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.15.self_attn.q_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.15.self_attn.k_proj.base_layer.weight cuda:0\n",
+ "model.layers.15.self_attn.k_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.15.self_attn.k_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.15.self_attn.v_proj.base_layer.weight cuda:0\n",
+ "model.layers.15.self_attn.v_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.15.self_attn.v_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.15.self_attn.o_proj.base_layer.weight cuda:0\n",
+ "model.layers.15.self_attn.o_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.15.self_attn.o_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.15.mlp.gate_proj.base_layer.weight cuda:0\n",
+ "model.layers.15.mlp.gate_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.15.mlp.gate_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.15.mlp.up_proj.base_layer.weight cuda:0\n",
+ "model.layers.15.mlp.up_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.15.mlp.up_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.15.mlp.down_proj.base_layer.weight cuda:0\n",
+ "model.layers.15.mlp.down_proj.lora_A.default.weight cuda:0\n",
+ "model.layers.15.mlp.down_proj.lora_B.default.weight cuda:0\n",
+ "model.layers.15.input_layernorm.weight cuda:0\n",
+ "model.layers.15.post_attention_layernorm.weight cuda:0\n",
+ "model.layers.16.self_attn.q_proj.base_layer.weight cuda:1\n",
+ "model.layers.16.self_attn.q_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.16.self_attn.q_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.16.self_attn.k_proj.base_layer.weight cuda:1\n",
+ "model.layers.16.self_attn.k_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.16.self_attn.k_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.16.self_attn.v_proj.base_layer.weight cuda:1\n",
+ "model.layers.16.self_attn.v_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.16.self_attn.v_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.16.self_attn.o_proj.base_layer.weight cuda:1\n",
+ "model.layers.16.self_attn.o_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.16.self_attn.o_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.16.mlp.gate_proj.base_layer.weight cuda:1\n",
+ "model.layers.16.mlp.gate_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.16.mlp.gate_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.16.mlp.up_proj.base_layer.weight cuda:1\n",
+ "model.layers.16.mlp.up_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.16.mlp.up_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.16.mlp.down_proj.base_layer.weight cuda:1\n",
+ "model.layers.16.mlp.down_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.16.mlp.down_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.16.input_layernorm.weight cuda:1\n",
+ "model.layers.16.post_attention_layernorm.weight cuda:1\n",
+ "model.layers.17.self_attn.q_proj.base_layer.weight cuda:1\n",
+ "model.layers.17.self_attn.q_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.17.self_attn.q_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.17.self_attn.k_proj.base_layer.weight cuda:1\n",
+ "model.layers.17.self_attn.k_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.17.self_attn.k_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.17.self_attn.v_proj.base_layer.weight cuda:1\n",
+ "model.layers.17.self_attn.v_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.17.self_attn.v_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.17.self_attn.o_proj.base_layer.weight cuda:1\n",
+ "model.layers.17.self_attn.o_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.17.self_attn.o_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.17.mlp.gate_proj.base_layer.weight cuda:1\n",
+ "model.layers.17.mlp.gate_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.17.mlp.gate_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.17.mlp.up_proj.base_layer.weight cuda:1\n",
+ "model.layers.17.mlp.up_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.17.mlp.up_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.17.mlp.down_proj.base_layer.weight cuda:1\n",
+ "model.layers.17.mlp.down_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.17.mlp.down_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.17.input_layernorm.weight cuda:1\n",
+ "model.layers.17.post_attention_layernorm.weight cuda:1\n",
+ "model.layers.18.self_attn.q_proj.base_layer.weight cuda:1\n",
+ "model.layers.18.self_attn.q_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.18.self_attn.q_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.18.self_attn.k_proj.base_layer.weight cuda:1\n",
+ "model.layers.18.self_attn.k_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.18.self_attn.k_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.18.self_attn.v_proj.base_layer.weight cuda:1\n",
+ "model.layers.18.self_attn.v_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.18.self_attn.v_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.18.self_attn.o_proj.base_layer.weight cuda:1\n",
+ "model.layers.18.self_attn.o_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.18.self_attn.o_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.18.mlp.gate_proj.base_layer.weight cuda:1\n",
+ "model.layers.18.mlp.gate_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.18.mlp.gate_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.18.mlp.up_proj.base_layer.weight cuda:1\n",
+ "model.layers.18.mlp.up_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.18.mlp.up_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.18.mlp.down_proj.base_layer.weight cuda:1\n",
+ "model.layers.18.mlp.down_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.18.mlp.down_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.18.input_layernorm.weight cuda:1\n",
+ "model.layers.18.post_attention_layernorm.weight cuda:1\n",
+ "model.layers.19.self_attn.q_proj.base_layer.weight cuda:1\n",
+ "model.layers.19.self_attn.q_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.19.self_attn.q_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.19.self_attn.k_proj.base_layer.weight cuda:1\n",
+ "model.layers.19.self_attn.k_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.19.self_attn.k_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.19.self_attn.v_proj.base_layer.weight cuda:1\n",
+ "model.layers.19.self_attn.v_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.19.self_attn.v_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.19.self_attn.o_proj.base_layer.weight cuda:1\n",
+ "model.layers.19.self_attn.o_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.19.self_attn.o_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.19.mlp.gate_proj.base_layer.weight cuda:1\n",
+ "model.layers.19.mlp.gate_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.19.mlp.gate_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.19.mlp.up_proj.base_layer.weight cuda:1\n",
+ "model.layers.19.mlp.up_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.19.mlp.up_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.19.mlp.down_proj.base_layer.weight cuda:1\n",
+ "model.layers.19.mlp.down_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.19.mlp.down_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.19.input_layernorm.weight cuda:1\n",
+ "model.layers.19.post_attention_layernorm.weight cuda:1\n",
+ "model.layers.20.self_attn.q_proj.base_layer.weight cuda:1\n",
+ "model.layers.20.self_attn.q_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.20.self_attn.q_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.20.self_attn.k_proj.base_layer.weight cuda:1\n",
+ "model.layers.20.self_attn.k_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.20.self_attn.k_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.20.self_attn.v_proj.base_layer.weight cuda:1\n",
+ "model.layers.20.self_attn.v_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.20.self_attn.v_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.20.self_attn.o_proj.base_layer.weight cuda:1\n",
+ "model.layers.20.self_attn.o_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.20.self_attn.o_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.20.mlp.gate_proj.base_layer.weight cuda:1\n",
+ "model.layers.20.mlp.gate_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.20.mlp.gate_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.20.mlp.up_proj.base_layer.weight cuda:1\n",
+ "model.layers.20.mlp.up_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.20.mlp.up_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.20.mlp.down_proj.base_layer.weight cuda:1\n",
+ "model.layers.20.mlp.down_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.20.mlp.down_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.20.input_layernorm.weight cuda:1\n",
+ "model.layers.20.post_attention_layernorm.weight cuda:1\n",
+ "model.layers.21.self_attn.q_proj.base_layer.weight cuda:1\n",
+ "model.layers.21.self_attn.q_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.21.self_attn.q_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.21.self_attn.k_proj.base_layer.weight cuda:1\n",
+ "model.layers.21.self_attn.k_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.21.self_attn.k_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.21.self_attn.v_proj.base_layer.weight cuda:1\n",
+ "model.layers.21.self_attn.v_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.21.self_attn.v_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.21.self_attn.o_proj.base_layer.weight cuda:1\n",
+ "model.layers.21.self_attn.o_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.21.self_attn.o_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.21.mlp.gate_proj.base_layer.weight cuda:1\n",
+ "model.layers.21.mlp.gate_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.21.mlp.gate_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.21.mlp.up_proj.base_layer.weight cuda:1\n",
+ "model.layers.21.mlp.up_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.21.mlp.up_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.21.mlp.down_proj.base_layer.weight cuda:1\n",
+ "model.layers.21.mlp.down_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.21.mlp.down_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.21.input_layernorm.weight cuda:1\n",
+ "model.layers.21.post_attention_layernorm.weight cuda:1\n",
+ "model.layers.22.self_attn.q_proj.base_layer.weight cuda:1\n",
+ "model.layers.22.self_attn.q_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.22.self_attn.q_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.22.self_attn.k_proj.base_layer.weight cuda:1\n",
+ "model.layers.22.self_attn.k_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.22.self_attn.k_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.22.self_attn.v_proj.base_layer.weight cuda:1\n",
+ "model.layers.22.self_attn.v_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.22.self_attn.v_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.22.self_attn.o_proj.base_layer.weight cuda:1\n",
+ "model.layers.22.self_attn.o_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.22.self_attn.o_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.22.mlp.gate_proj.base_layer.weight cuda:1\n",
+ "model.layers.22.mlp.gate_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.22.mlp.gate_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.22.mlp.up_proj.base_layer.weight cuda:1\n",
+ "model.layers.22.mlp.up_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.22.mlp.up_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.22.mlp.down_proj.base_layer.weight cuda:1\n",
+ "model.layers.22.mlp.down_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.22.mlp.down_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.22.input_layernorm.weight cuda:1\n",
+ "model.layers.22.post_attention_layernorm.weight cuda:1\n",
+ "model.layers.23.self_attn.q_proj.base_layer.weight cuda:1\n",
+ "model.layers.23.self_attn.q_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.23.self_attn.q_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.23.self_attn.k_proj.base_layer.weight cuda:1\n",
+ "model.layers.23.self_attn.k_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.23.self_attn.k_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.23.self_attn.v_proj.base_layer.weight cuda:1\n",
+ "model.layers.23.self_attn.v_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.23.self_attn.v_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.23.self_attn.o_proj.base_layer.weight cuda:1\n",
+ "model.layers.23.self_attn.o_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.23.self_attn.o_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.23.mlp.gate_proj.base_layer.weight cuda:1\n",
+ "model.layers.23.mlp.gate_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.23.mlp.gate_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.23.mlp.up_proj.base_layer.weight cuda:1\n",
+ "model.layers.23.mlp.up_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.23.mlp.up_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.23.mlp.down_proj.base_layer.weight cuda:1\n",
+ "model.layers.23.mlp.down_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.23.mlp.down_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.23.input_layernorm.weight cuda:1\n",
+ "model.layers.23.post_attention_layernorm.weight cuda:1\n",
+ "model.layers.24.self_attn.q_proj.base_layer.weight cuda:1\n",
+ "model.layers.24.self_attn.q_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.24.self_attn.q_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.24.self_attn.k_proj.base_layer.weight cuda:1\n",
+ "model.layers.24.self_attn.k_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.24.self_attn.k_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.24.self_attn.v_proj.base_layer.weight cuda:1\n",
+ "model.layers.24.self_attn.v_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.24.self_attn.v_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.24.self_attn.o_proj.base_layer.weight cuda:1\n",
+ "model.layers.24.self_attn.o_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.24.self_attn.o_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.24.mlp.gate_proj.base_layer.weight cuda:1\n",
+ "model.layers.24.mlp.gate_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.24.mlp.gate_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.24.mlp.up_proj.base_layer.weight cuda:1\n",
+ "model.layers.24.mlp.up_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.24.mlp.up_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.24.mlp.down_proj.base_layer.weight cuda:1\n",
+ "model.layers.24.mlp.down_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.24.mlp.down_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.24.input_layernorm.weight cuda:1\n",
+ "model.layers.24.post_attention_layernorm.weight cuda:1\n",
+ "model.layers.25.self_attn.q_proj.base_layer.weight cuda:1\n",
+ "model.layers.25.self_attn.q_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.25.self_attn.q_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.25.self_attn.k_proj.base_layer.weight cuda:1\n",
+ "model.layers.25.self_attn.k_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.25.self_attn.k_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.25.self_attn.v_proj.base_layer.weight cuda:1\n",
+ "model.layers.25.self_attn.v_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.25.self_attn.v_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.25.self_attn.o_proj.base_layer.weight cuda:1\n",
+ "model.layers.25.self_attn.o_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.25.self_attn.o_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.25.mlp.gate_proj.base_layer.weight cuda:1\n",
+ "model.layers.25.mlp.gate_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.25.mlp.gate_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.25.mlp.up_proj.base_layer.weight cuda:1\n",
+ "model.layers.25.mlp.up_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.25.mlp.up_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.25.mlp.down_proj.base_layer.weight cuda:1\n",
+ "model.layers.25.mlp.down_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.25.mlp.down_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.25.input_layernorm.weight cuda:1\n",
+ "model.layers.25.post_attention_layernorm.weight cuda:1\n",
+ "model.layers.26.self_attn.q_proj.base_layer.weight cuda:1\n",
+ "model.layers.26.self_attn.q_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.26.self_attn.q_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.26.self_attn.k_proj.base_layer.weight cuda:1\n",
+ "model.layers.26.self_attn.k_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.26.self_attn.k_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.26.self_attn.v_proj.base_layer.weight cuda:1\n",
+ "model.layers.26.self_attn.v_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.26.self_attn.v_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.26.self_attn.o_proj.base_layer.weight cuda:1\n",
+ "model.layers.26.self_attn.o_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.26.self_attn.o_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.26.mlp.gate_proj.base_layer.weight cuda:1\n",
+ "model.layers.26.mlp.gate_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.26.mlp.gate_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.26.mlp.up_proj.base_layer.weight cuda:1\n",
+ "model.layers.26.mlp.up_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.26.mlp.up_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.26.mlp.down_proj.base_layer.weight cuda:1\n",
+ "model.layers.26.mlp.down_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.26.mlp.down_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.26.input_layernorm.weight cuda:1\n",
+ "model.layers.26.post_attention_layernorm.weight cuda:1\n",
+ "model.layers.27.self_attn.q_proj.base_layer.weight cuda:1\n",
+ "model.layers.27.self_attn.q_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.27.self_attn.q_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.27.self_attn.k_proj.base_layer.weight cuda:1\n",
+ "model.layers.27.self_attn.k_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.27.self_attn.k_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.27.self_attn.v_proj.base_layer.weight cuda:1\n",
+ "model.layers.27.self_attn.v_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.27.self_attn.v_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.27.self_attn.o_proj.base_layer.weight cuda:1\n",
+ "model.layers.27.self_attn.o_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.27.self_attn.o_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.27.mlp.gate_proj.base_layer.weight cuda:1\n",
+ "model.layers.27.mlp.gate_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.27.mlp.gate_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.27.mlp.up_proj.base_layer.weight cuda:1\n",
+ "model.layers.27.mlp.up_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.27.mlp.up_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.27.mlp.down_proj.base_layer.weight cuda:1\n",
+ "model.layers.27.mlp.down_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.27.mlp.down_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.27.input_layernorm.weight cuda:1\n",
+ "model.layers.27.post_attention_layernorm.weight cuda:1\n",
+ "model.layers.28.self_attn.q_proj.base_layer.weight cuda:1\n",
+ "model.layers.28.self_attn.q_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.28.self_attn.q_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.28.self_attn.k_proj.base_layer.weight cuda:1\n",
+ "model.layers.28.self_attn.k_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.28.self_attn.k_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.28.self_attn.v_proj.base_layer.weight cuda:1\n",
+ "model.layers.28.self_attn.v_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.28.self_attn.v_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.28.self_attn.o_proj.base_layer.weight cuda:1\n",
+ "model.layers.28.self_attn.o_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.28.self_attn.o_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.28.mlp.gate_proj.base_layer.weight cuda:1\n",
+ "model.layers.28.mlp.gate_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.28.mlp.gate_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.28.mlp.up_proj.base_layer.weight cuda:1\n",
+ "model.layers.28.mlp.up_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.28.mlp.up_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.28.mlp.down_proj.base_layer.weight cuda:1\n",
+ "model.layers.28.mlp.down_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.28.mlp.down_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.28.input_layernorm.weight cuda:1\n",
+ "model.layers.28.post_attention_layernorm.weight cuda:1\n",
+ "model.layers.29.self_attn.q_proj.base_layer.weight cuda:1\n",
+ "model.layers.29.self_attn.q_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.29.self_attn.q_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.29.self_attn.k_proj.base_layer.weight cuda:1\n",
+ "model.layers.29.self_attn.k_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.29.self_attn.k_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.29.self_attn.v_proj.base_layer.weight cuda:1\n",
+ "model.layers.29.self_attn.v_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.29.self_attn.v_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.29.self_attn.o_proj.base_layer.weight cuda:1\n",
+ "model.layers.29.self_attn.o_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.29.self_attn.o_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.29.mlp.gate_proj.base_layer.weight cuda:1\n",
+ "model.layers.29.mlp.gate_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.29.mlp.gate_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.29.mlp.up_proj.base_layer.weight cuda:1\n",
+ "model.layers.29.mlp.up_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.29.mlp.up_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.29.mlp.down_proj.base_layer.weight cuda:1\n",
+ "model.layers.29.mlp.down_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.29.mlp.down_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.29.input_layernorm.weight cuda:1\n",
+ "model.layers.29.post_attention_layernorm.weight cuda:1\n",
+ "model.layers.30.self_attn.q_proj.base_layer.weight cuda:1\n",
+ "model.layers.30.self_attn.q_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.30.self_attn.q_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.30.self_attn.k_proj.base_layer.weight cuda:1\n",
+ "model.layers.30.self_attn.k_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.30.self_attn.k_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.30.self_attn.v_proj.base_layer.weight cuda:1\n",
+ "model.layers.30.self_attn.v_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.30.self_attn.v_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.30.self_attn.o_proj.base_layer.weight cuda:1\n",
+ "model.layers.30.self_attn.o_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.30.self_attn.o_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.30.mlp.gate_proj.base_layer.weight cuda:1\n",
+ "model.layers.30.mlp.gate_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.30.mlp.gate_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.30.mlp.up_proj.base_layer.weight cuda:1\n",
+ "model.layers.30.mlp.up_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.30.mlp.up_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.30.mlp.down_proj.base_layer.weight cuda:1\n",
+ "model.layers.30.mlp.down_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.30.mlp.down_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.30.input_layernorm.weight cuda:1\n",
+ "model.layers.30.post_attention_layernorm.weight cuda:1\n",
+ "model.layers.31.self_attn.q_proj.base_layer.weight cuda:1\n",
+ "model.layers.31.self_attn.q_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.31.self_attn.q_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.31.self_attn.k_proj.base_layer.weight cuda:1\n",
+ "model.layers.31.self_attn.k_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.31.self_attn.k_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.31.self_attn.v_proj.base_layer.weight cuda:1\n",
+ "model.layers.31.self_attn.v_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.31.self_attn.v_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.31.self_attn.o_proj.base_layer.weight cuda:1\n",
+ "model.layers.31.self_attn.o_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.31.self_attn.o_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.31.mlp.gate_proj.base_layer.weight cuda:1\n",
+ "model.layers.31.mlp.gate_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.31.mlp.gate_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.31.mlp.up_proj.base_layer.weight cuda:1\n",
+ "model.layers.31.mlp.up_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.31.mlp.up_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.31.mlp.down_proj.base_layer.weight cuda:1\n",
+ "model.layers.31.mlp.down_proj.lora_A.default.weight cuda:1\n",
+ "model.layers.31.mlp.down_proj.lora_B.default.weight cuda:1\n",
+ "model.layers.31.input_layernorm.weight cuda:1\n",
+ "model.layers.31.post_attention_layernorm.weight cuda:1\n",
+ "model.norm.weight cuda:1\n",
+ "lm_head.layer.weight cuda:1\n",
+ "lm_head.new_embeddings.weight cuda:1\n"
+ ]
+ }
+ ],
+ "source": [
+ "for name, param in model2.named_parameters():\n",
+ " print(name, param.device)"
+ ]
+ },
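+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The per-parameter listing above is verbose; a compact alternative is to count parameters per device, sketched below with `collections.Counter`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from collections import Counter\n",
+ "# Summarize the device_map='auto' placement: number of parameter tensors per device.\n",
+ "print(Counter(str(p.device) for _, p in model2.named_parameters()))"
+ ]
+ },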
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import gc\n",
+ "model2.cpu()\n",
+ "surgeon2.extended_embedding.cpu()\n",
+ "surgeon2.extended_unembedding.cpu()\n",
+ "surgeon2.backup_embed_tokens.cpu()\n",
+ "surgeon2.backup_lm_head.cpu()\n",
+ "surgeon.extended_embedding.cpu()\n",
+ "surgeon.extended_unembedding.cpu()\n",
+ "surgeon.backup_embed_tokens.cpu()\n",
+ "surgeon.backup_lm_head.cpu()\n",
+ "gc.collect()\n",
+ "for i in range(torch.cuda.device_count()):\n",
+ " torch.cuda.set_device(i) \n",
+ " torch.cuda.empty_cache() "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "PeftModelForCausalLM(\n",
+ " (base_model): LoraModel(\n",
+ " (model): LlamaForCausalLM(\n",
+ " (model): LlamaModel(\n",
+ " (embed_tokens): Embedding(32000, 4096)\n",
+ " (layers): ModuleList(\n",
+ " (0-31): 32 x LlamaDecoderLayer(\n",
+ " (self_attn): LlamaSdpaAttention(\n",
+ " (q_proj): lora.Linear(\n",
+ " (base_layer): Linear(in_features=4096, out_features=4096, bias=False)\n",
+ " (lora_dropout): ModuleDict(\n",
+ " (default): Identity()\n",
+ " )\n",
+ " (lora_A): ModuleDict(\n",
+ " (default): Linear(in_features=4096, out_features=64, bias=False)\n",
+ " )\n",
+ " (lora_B): ModuleDict(\n",
+ " (default): Linear(in_features=64, out_features=4096, bias=False)\n",
+ " )\n",
+ " (lora_embedding_A): ParameterDict()\n",
+ " (lora_embedding_B): ParameterDict()\n",
+ " )\n",
+ " (k_proj): lora.Linear(\n",
+ " (base_layer): Linear(in_features=4096, out_features=4096, bias=False)\n",
+ " (lora_dropout): ModuleDict(\n",
+ " (default): Identity()\n",
+ " )\n",
+ " (lora_A): ModuleDict(\n",
+ " (default): Linear(in_features=4096, out_features=64, bias=False)\n",
+ " )\n",
+ " (lora_B): ModuleDict(\n",
+ " (default): Linear(in_features=64, out_features=4096, bias=False)\n",
+ " )\n",
+ " (lora_embedding_A): ParameterDict()\n",
+ " (lora_embedding_B): ParameterDict()\n",
+ " )\n",
+ " (v_proj): lora.Linear(\n",
+ " (base_layer): Linear(in_features=4096, out_features=4096, bias=False)\n",
+ " (lora_dropout): ModuleDict(\n",
+ " (default): Identity()\n",
+ " )\n",
+ " (lora_A): ModuleDict(\n",
+ " (default): Linear(in_features=4096, out_features=64, bias=False)\n",
+ " )\n",
+ " (lora_B): ModuleDict(\n",
+ " (default): Linear(in_features=64, out_features=4096, bias=False)\n",
+ " )\n",
+ " (lora_embedding_A): ParameterDict()\n",
+ " (lora_embedding_B): ParameterDict()\n",
+ " )\n",
+ " (o_proj): lora.Linear(\n",
+ " (base_layer): Linear(in_features=4096, out_features=4096, bias=False)\n",
+ " (lora_dropout): ModuleDict(\n",
+ " (default): Identity()\n",
+ " )\n",
+ " (lora_A): ModuleDict(\n",
+ " (default): Linear(in_features=4096, out_features=64, bias=False)\n",
+ " )\n",
+ " (lora_B): ModuleDict(\n",
+ " (default): Linear(in_features=64, out_features=4096, bias=False)\n",
+ " )\n",
+ " (lora_embedding_A): ParameterDict()\n",
+ " (lora_embedding_B): ParameterDict()\n",
+ " )\n",
+ " (rotary_emb): LlamaRotaryEmbedding()\n",
+ " )\n",
+ " (mlp): LlamaMLP(\n",
+ " (gate_proj): lora.Linear(\n",
+ " (base_layer): Linear(in_features=4096, out_features=11008, bias=False)\n",
+ " (lora_dropout): ModuleDict(\n",
+ " (default): Identity()\n",
+ " )\n",
+ " (lora_A): ModuleDict(\n",
+ " (default): Linear(in_features=4096, out_features=64, bias=False)\n",
+ " )\n",
+ " (lora_B): ModuleDict(\n",
+ " (default): Linear(in_features=64, out_features=11008, bias=False)\n",
+ " )\n",
+ " (lora_embedding_A): ParameterDict()\n",
+ " (lora_embedding_B): ParameterDict()\n",
+ " )\n",
+ " (up_proj): lora.Linear(\n",
+ " (base_layer): Linear(in_features=4096, out_features=11008, bias=False)\n",
+ " (lora_dropout): ModuleDict(\n",
+ " (default): Identity()\n",
+ " )\n",
+ " (lora_A): ModuleDict(\n",
+ " (default): Linear(in_features=4096, out_features=64, bias=False)\n",
+ " )\n",
+ " (lora_B): ModuleDict(\n",
+ " (default): Linear(in_features=64, out_features=11008, bias=False)\n",
+ " )\n",
+ " (lora_embedding_A): ParameterDict()\n",
+ " (lora_embedding_B): ParameterDict()\n",
+ " )\n",
+ " (down_proj): lora.Linear(\n",
+ " (base_layer): Linear(in_features=11008, out_features=4096, bias=False)\n",
+ " (lora_dropout): ModuleDict(\n",
+ " (default): Identity()\n",
+ " )\n",
+ " (lora_A): ModuleDict(\n",
+ " (default): Linear(in_features=11008, out_features=64, bias=False)\n",
+ " )\n",
+ " (lora_B): ModuleDict(\n",
+ " (default): Linear(in_features=64, out_features=4096, bias=False)\n",
+ " )\n",
+ " (lora_embedding_A): ParameterDict()\n",
+ " (lora_embedding_B): ParameterDict()\n",
+ " )\n",
+ " (act_fn): SiLU()\n",
+ " )\n",
+ " (input_layernorm): LlamaRMSNorm()\n",
+ " (post_attention_layernorm): LlamaRMSNorm()\n",
+ " )\n",
+ " )\n",
+ " (norm): LlamaRMSNorm()\n",
+ " )\n",
+ " (lm_head): Linear(in_features=4096, out_features=32000, bias=False)\n",
+ " )\n",
+ " )\n",
+ ")"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "model"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.8"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}