edit save.py to fix gguf saving breaks. #1649

Open
wants to merge 2 commits into main
67 changes: 38 additions & 29 deletions unsloth/save.py
@@ -254,7 +254,7 @@ def unsloth_save_model(
# First check for a token!
if push_to_hub:
from huggingface_hub import whoami
try:
try:
username = whoami(token = token)["name"]
except:
raise RuntimeError(
@@ -385,7 +385,7 @@ def unsloth_save_model(
else:
internal_model = model
pass

# Cannot be converted properly!
if (save_method == "merged_4bit") or (save_method == "lora") or (
not hasattr(model, "model") or \
@@ -481,7 +481,7 @@ def unsloth_save_model(
gb_found = re.match("([0-9]{1,})[\s]{0,}GB", max_shard_size, flags = re.IGNORECASE)
mb_found = re.match("([0-9]{1,})[\s]{0,}MB", max_shard_size, flags = re.IGNORECASE)
if gb_found: sharded_ram_usage = int(gb_found.group(1)) * 1024 * 1024 * 1024
elif mb_found: sharded_ram_usage = int(mb_found.group(1)) * 1024 * 1024
elif mb_found: sharded_ram_usage = int(mb_found.group(1)) * 1024 * 1024
elif type(max_shard_size) is int:
sharded_ram_usage = sharded_ram_usage
pass
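
Note: as context for the hunk above, max_shard_size accepts strings such as "5GB" or "500MB" and is converted into a byte count by the two regexes. A self-contained sketch of that parsing (the sample value and the 5 GB fallback are illustrative assumptions, not values from this PR):

```python
import re

max_shard_size = "5GB"                        # illustrative input; an int byte count is also accepted
sharded_ram_usage = 5 * 1024 * 1024 * 1024    # assumed fallback when nothing matches

gb_found = re.match(r"([0-9]{1,})[\s]{0,}GB", max_shard_size, flags = re.IGNORECASE)
mb_found = re.match(r"([0-9]{1,})[\s]{0,}MB", max_shard_size, flags = re.IGNORECASE)

if gb_found:
    sharded_ram_usage = int(gb_found.group(1)) * 1024 * 1024 * 1024   # GB -> bytes
elif mb_found:
    sharded_ram_usage = int(mb_found.group(1)) * 1024 * 1024          # MB -> bytes
elif isinstance(max_shard_size, int):
    sharded_ram_usage = max_shard_size                                # already a byte count

print(sharded_ram_usage)  # 5368709120
```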
@@ -612,7 +612,7 @@ def unsloth_save_model(
# Edit save_pretrained_settings
# [TODO] _create_repo has errors due to **kwargs getting accepted
save_pretrained_settings["state_dict"] = state_dict

# commit_description does not seem to work?
what_to_delete = ("use_temp_dir", "commit_message", "create_pr", "revision", "commit_description", "tags",) \
if not push_to_hub else \
@@ -665,7 +665,7 @@ def unsloth_save_model(

# Revert back padding side
tokenizer.padding_side = old_padding_side

print(" Done.")
else:
print()
@@ -877,10 +877,15 @@ def install_llama_cpp_old(version = -10):
pass

# Check if successful
if not os.path.exists("llama.cpp/quantize") and not os.path.exists("llama.cpp/llama-quantize"):
if not (
os.path.exists("llama.cpp/llama-quantize.exe") or
os.path.exists("llama.cpp/llama-quantize") or
os.path.exists("llama.cpp/quantize.exe") or
os.path.exists("llama.cpp/quantize")
):
raise RuntimeError(
"Unsloth: The file 'llama.cpp/llama-quantize' or `llama.cpp/quantize` does not exist.\n"\
"But we expect this file to exist! Maybe the llama.cpp developers changed the name?"
"But we expect this file to exist! Maybe the llama.cpp developers changed the name or check extension of the llama-quantize file."
)
pass
pass
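
Note: the new check accepts both the old and new names of the quantize binary plus their Windows .exe variants. A minimal standalone sketch of that probing logic (hypothetical helper, not part of this PR):

```python
import os

# Candidate locations, newest naming first: llama.cpp renamed "quantize" to
# "llama-quantize", and Windows builds add an ".exe" suffix.
QUANTIZE_CANDIDATES = (
    "llama.cpp/llama-quantize.exe",
    "llama.cpp/llama-quantize",
    "llama.cpp/quantize.exe",
    "llama.cpp/quantize",
)

def find_quantize_binary():
    """Return the first quantize binary that exists, or None if the build failed."""
    for path in QUANTIZE_CANDIDATES:
        if os.path.exists(path):
            return path
    return None

# if find_quantize_binary() is None: raise RuntimeError("llama-quantize not found")
```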
@@ -957,7 +962,7 @@ def save_to_gguf(
else:
raise TypeError("Unsloth: quantization_method can only be a string or a list of strings")
pass

# Check if bfloat16 is supported
if model_dtype == "bf16" and not torch.cuda.is_bf16_supported():
logger.warning(
@@ -973,7 +978,7 @@
pass

# Check I quants
for quant_method in quantization_method:
for quant_method in quantization_method:
if quant_method.startswith("iq2"):
raise RuntimeError("Unsloth: Currently iq2 type quantizations aren't supported yet - sorry!")
pass
@@ -1026,9 +1031,9 @@ def save_to_gguf(
pass

# Determine whether the system already has llama.cpp installed and the scripts are executable
quantize_location = get_executable(["llama-quantize", "quantize"])
quantize_location = get_executable(["llama-quantize", "quantize", "llama-quantize.exe", "quantize.exe"])
convert_location = get_executable(["convert-hf-to-gguf.py", "convert_hf_to_gguf.py"])

error = 0
if quantize_location is not None and convert_location is not None:
print("Unsloth: llama.cpp found in the system. We shall skip installation.")
@@ -1062,14 +1067,18 @@ def save_to_gguf(
# and llama.cpp/main changed to llama.cpp/llama-cli
# See https://github.com/ggerganov/llama.cpp/pull/7809
quantize_location = None
if os.path.exists("llama.cpp/quantize"):
if os.path.exists("llama.cpp/quantize.exe"):
quantize_location = "llama.cpp/quantize.exe"
elif os.path.exists("llama.cpp/quantize"):
quantize_location = "llama.cpp/quantize"
elif os.path.exists("llama.cpp/llama-quantize.exe"):
quantize_location = "llama.cpp/llama-quantize.exe"
elif os.path.exists("llama.cpp/llama-quantize"):
quantize_location = "llama.cpp/llama-quantize"
else:
raise RuntimeError(
"Unsloth: The file 'llama.cpp/llama-quantize' or 'llama.cpp/quantize' does not exist.\n"\
"But we expect this file to exist! Maybe the llama.cpp developers changed the name?"
"Unsloth: The file ('llama.cpp/llama-quantize' or 'llama.cpp/llama-quantize.exe' if you are on Windows WSL) or 'llama.cpp/quantize' does not exist.\n"\
"But we expect this file to exist! Maybe the llama.cpp developers changed the name or check extension of the llama-quantize file."
)
pass

@@ -1150,7 +1159,7 @@ def save_to_gguf(
# Concurrency from https://rentry.org/llama-cpp-conversions#merging-loras-into-a-model

final_location = str((Path(model_directory) / f"unsloth.{first_conversion.upper()}.gguf").absolute())

print(f"Unsloth: [1] Converting model at {model_directory} into {first_conversion} GGUF format.\n"\
f"The output location will be {final_location}\n"\
"This might take 3 minutes...")
@@ -1217,7 +1226,7 @@ def save_to_gguf(

command = f"./{quantize_location} {full_precision_location} "\
f"{final_location} {quant_method} {n_cpus}"

try_execute([command,], force_complete = True)

# Check if quantization succeeded!
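
Note: try_execute is a helper in this file that shells out and streams the output; the quantize step amounts to a command along these lines (all values below are illustrative, not taken from this PR):

```python
import subprocess

quantize_location       = "llama.cpp/llama-quantize"        # or a .exe / system-wide binary
full_precision_location = "merged_model/unsloth.F16.gguf"    # illustrative
final_location          = "merged_model/unsloth.Q4_K_M.gguf" # illustrative
quant_method, n_cpus    = "q4_k_m", 8

# Same shape as the f-string command handed to try_execute(...).
subprocess.run(
    [f"./{quantize_location}", full_precision_location, final_location, quant_method, str(n_cpus)],
    check = True,
)
```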
@@ -1378,7 +1387,7 @@ def _determine_username(save_directory, old_username, token):
save_directory = save_directory.lstrip("./")
if "/" not in save_directory:
from huggingface_hub import whoami
try:
try:
username = whoami(token = token)["name"]
if type(old_username) is str and username != old_username:
username = old_username
@@ -1412,7 +1421,7 @@ def create_huggingface_repo(
repo_type = "model",
exist_ok = False,
private = private,
)
)

# Create model card
from huggingface_hub import ModelCard
@@ -1453,7 +1462,7 @@ def upload_to_huggingface(
repo_type = "model",
exist_ok = False,
private = private,
)
)

# Create model card
from huggingface_hub import ModelCard
@@ -1527,7 +1536,7 @@ def fix_tokenizer_bos_token(tokenizer):
# Check if BOS added already, then warn
fix_bos_token = False
chat_template = getattr(tokenizer, "chat_template", None)

if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)):
if chat_template is not None and \
(
@@ -1546,7 +1555,7 @@ def fix_tokenizer_bos_token(tokenizer):
new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\}[\s]{0,}\}", "", chat_template)
# Remove {{bos_token +
new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\+[\s]{0,}", "", new_chat_template)

tokenizer.chat_template = new_chat_template

pass
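
Note: the two substitutions above strip a hard-coded BOS token out of the chat template so it is not added twice. A small self-contained demonstration with a made-up template:

```python
import re

# Made-up chat template that bakes in the BOS token.
chat_template = "{{ bos_token }}{% for m in messages %}{{ m['content'] }}{% endfor %}"

# Remove "{{ bos_token }}", then any leftover "{{ bos_token +" prefix.
new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\}[\s]{0,}\}", "", chat_template)
new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\+[\s]{0,}", "", new_chat_template)

print(new_chat_template)
# {% for m in messages %}{{ m['content'] }}{% endfor %}
```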
@@ -1580,7 +1589,7 @@ def create_ollama_modelfile(tokenizer, gguf_location):
modelfile = modelfile\
.replace(FILE_LOCATION_REPLACER, "{__FILE_LOCATION__}")\
.replace(EOS_TOKEN_REPLACER, "{__EOS_TOKEN__}")

if "__EOS_TOKEN__" in modelfile:
modelfile = modelfile.format(
__FILE_LOCATION__ = gguf_location,
@@ -1591,7 +1600,7 @@ def create_ollama_modelfile(tokenizer, gguf_location):
__FILE_LOCATION__ = gguf_location,
)
pass

modelfile = modelfile\
.replace("⚫@✅#🦥", "{")\
.replace("⚡@🦥#⛵", "}")\
@@ -1733,7 +1742,7 @@ def unsloth_save_pretrained_gguf(

# Save to GGUF
all_file_locations, want_full_precision = save_to_gguf(
model_type, model_dtype, is_sentencepiece_model,
model_type, model_dtype, is_sentencepiece_model,
new_save_directory, quantization_method, first_conversion, makefile,
)

@@ -1911,7 +1920,7 @@ def unsloth_push_to_hub_gguf(

# Save to GGUF
all_file_locations, want_full_precision = save_to_gguf(
model_type, model_dtype, is_sentencepiece_model,
model_type, model_dtype, is_sentencepiece_model,
new_save_directory, quantization_method, first_conversion, makefile,
)

@@ -1928,7 +1937,7 @@ def unsloth_push_to_hub_gguf(

# If not needing full precision, skip the first
if not want_full_precision: all_file_locations = all_file_locations[1:]

for file_location in all_file_locations:
print("Unsloth: Uploading GGUF to Huggingface Hub...")
username = upload_to_huggingface(
@@ -2044,8 +2053,8 @@ def unsloth_convert_lora_to_ggml_and_push_to_hub(

def unsloth_convert_lora_to_ggml_and_save_locally(
self,
save_directory: str, # Added parameter for the folder name
tokenizer,
save_directory: str, # Added parameter for the folder name
tokenizer,
temporary_location: str = "_unsloth_temporary_saved_buffers",
maximum_memory_usage: float = 0.85,
):
@@ -2162,7 +2171,7 @@ def unsloth_generic_save_pretrained_merged(
tags : List[str] = None,
temporary_location : str = "_unsloth_temporary_saved_buffers",
maximum_memory_usage : float = 0.75,
):
):
"""
Same as .push_to_hub(...) except 4bit weights are auto
converted to float16 with as few overhead as possible.