diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py
index aff69510d636..6e98ee0f1493 100644
--- a/.circleci/create_circleci_config.py
+++ b/.circleci/create_circleci_config.py
@@ -16,10 +16,9 @@
import argparse
import copy
import os
-import random
from dataclasses import dataclass
-from typing import Any, Dict, List, Optional
-import glob
+from typing import Any, Optional
+
import yaml
@@ -30,6 +29,7 @@
"RUN_PIPELINE_TESTS": False,
# will be adjusted in `CircleCIJob.to_dict`.
"RUN_FLAKY": True,
+ "DISABLE_SAFETENSORS_CONVERSION": True,
}
# Disable the use of {"s": None} as the output is way too long, making navigation on CircleCI impractical
COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "vvv": None, "rsfE":None}
@@ -82,15 +82,15 @@ def to_dict(self):
@dataclass
class CircleCIJob:
name: str
- additional_env: Dict[str, Any] = None
- docker_image: List[Dict[str, str]] = None
- install_steps: List[str] = None
+ additional_env: dict[str, Any] = None
+ docker_image: list[dict[str, str]] = None
+ install_steps: list[str] = None
marker: Optional[str] = None
parallelism: Optional[int] = 0
pytest_num_workers: int = 8
- pytest_options: Dict[str, Any] = None
+ pytest_options: dict[str, Any] = None
resource_class: Optional[str] = "xlarge"
- tests_to_run: Optional[List[str]] = None
+ tests_to_run: Optional[list[str]] = None
num_test_files_per_worker: Optional[int] = 10
# This should be only used for doctest job!
command_timeout: Optional[int] = None
@@ -130,6 +130,12 @@ def __post_init__(self):
def to_dict(self):
env = COMMON_ENV_VARIABLES.copy()
+ if self.job_name != "tests_hub":
+ # fmt: off
+ # not critical
+ env.update({"HF_TOKEN": "".join(["h", "f", "_", "H", "o", "d", "V", "u", "M", "q", "b", "R", "m", "t", "b", "z", "F", "Q", "O", "Q", "A", "J", "G", "D", "l", "V", "Q", "r", "R", "N", "w", "D", "M", "V", "C", "s", "d"])})
+ # fmt: on
+
# Do not run tests decorated by @is_flaky on pull requests
env['RUN_FLAKY'] = os.environ.get("CIRCLE_PULL_REQUEST", "") == ""
env.update(self.additional_env)
@@ -149,7 +155,7 @@ def to_dict(self):
# Examples special case: we need to download NLTK files in advance to avoid concurrency issues
timeout_cmd = f"timeout {self.command_timeout} " if self.command_timeout else ""
marker_cmd = f"-m '{self.marker}'" if self.marker is not None else ""
- junit_flags = f" -p no:warning -o junit_family=xunit1 --junitxml=test-results/junit.xml"
+ junit_flags = " -p no:warning -o junit_family=xunit1 --junitxml=test-results/junit.xml"
joined_flaky_patterns = "|".join(FLAKY_TEST_FAILURE_PATTERNS)
repeat_on_failure_flags = f"--reruns 5 --reruns-delay 2 --only-rerun '({joined_flaky_patterns})'"
parallel = f' << pipeline.parameters.{self.job_name}_parallelism >> '
@@ -180,6 +186,7 @@ def to_dict(self):
# During the CircleCI docker image build, we might (or might not) have already downloaded the data.
# If it's done already, the files are inside the directory `/test_data/`.
{"run": {"name": "fetch hub objects before pytest", "command": "cp -r /test_data/* . 2>/dev/null || true; python3 utils/fetch_hub_objects_for_ci.py"}},
+ {"run": {"name": "download and unzip hub cache", "command": 'curl -L -o huggingface-cache.tar.gz https://huggingface.co/datasets/hf-internal-testing/hf_hub_cache/resolve/main/huggingface-cache.tar.gz && apt-get install pigz && tar --use-compress-program="pigz -d -p 8" -xf huggingface-cache.tar.gz && mv -n hub/* /root/.cache/huggingface/hub/ && ls -la /root/.cache/huggingface/hub/'}},
{"run": {
"name": "Run tests",
"command": f"({timeout_cmd} python3 -m pytest {marker_cmd} -n {self.pytest_num_workers} {junit_flags} {repeat_on_failure_flags} {' '.join(pytest_flags)} $(cat splitted_tests.txt) | tee tests_output.txt)"}
@@ -200,9 +207,9 @@ def to_dict(self):
fi"""
},
},
- {"run": {"name": "Expand to show skipped tests", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --skip"}},
- {"run": {"name": "Failed tests: show reasons", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --fail"}},
- {"run": {"name": "Errors", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --errors"}},
+ {"run": {"name": "Expand to show skipped tests", "when": "always", "command": "python3 .circleci/parse_test_outputs.py --file tests_output.txt --skip"}},
+ {"run": {"name": "Failed tests: show reasons", "when": "always", "command": "python3 .circleci/parse_test_outputs.py --file tests_output.txt --fail"}},
+ {"run": {"name": "Errors", "when": "always", "command": "python3 .circleci/parse_test_outputs.py --file tests_output.txt --errors"}},
{"store_test_results": {"path": "test-results"}},
{"store_artifacts": {"path": "test-results/junit.xml"}},
{"store_artifacts": {"path": "reports"}},
diff --git a/.circleci/parse_test_outputs.py b/.circleci/parse_test_outputs.py
index a69da1a3eafb..c58447155859 100644
--- a/.circleci/parse_test_outputs.py
+++ b/.circleci/parse_test_outputs.py
@@ -1,5 +1,6 @@
-import re
import argparse
+import re
+
def parse_pytest_output(file_path):
skipped_tests = {}
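
The CircleCI steps above call this script with `--skip`, `--fail`, and `--errors`; its body is elided from this hunk, so the following is only a simplified illustration of parsing pytest's `-rs` short summary for skip reasons, not the repository's implementation:

```python
# Simplified illustration (not the repository's parser): group pytest "-rs" summary
# lines of the form 'SKIPPED [1] tests/foo.py:10: reason' by their skip reason.
import re
from collections import Counter

def count_skip_reasons(file_path: str) -> Counter:
    reasons = Counter()
    pattern = re.compile(r"^SKIPPED \[\d+\] \S+: (.+)$")
    with open(file_path) as fh:
        for line in fh:
            match = pattern.match(line.strip())
            if match:
                reasons[match.group(1)] += 1
    return reasons
```
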
diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml
index 78e96e9b3386..30ac3b4c9512 100644
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@@ -61,6 +61,7 @@ body:
- Big Model Inference: @SunMarc
- quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber
- kernels: @MekkCyber @drbh
+ - peft: @BenjaminBossan @githubnemo
Devices/Backends:
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index aa1e881122c1..de4ed57873ef 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -39,20 +39,23 @@ members/contributors who may be interested in your PR.
Models:
-- text models: @ArthurZucker
-- vision models: @amyeroberts, @qubvel
-- speech models: @eustlb
+- text models: @ArthurZucker @Cyrilvallez
+- vision models: @yonigozlan @molbap
+- audio models: @eustlb @ebezzam @vasqu
+- multimodal models: @zucchini-nlp
- graph models: @clefourrier
Library:
-- flax: @gante and @Rocketknight1
- generate: @zucchini-nlp (visual-language models) or @gante (all others)
+- continuous batching: @remi-or @ArthurZucker @McPatate
- pipelines: @Rocketknight1
-- tensorflow: @gante and @Rocketknight1
-- tokenizers: @ArthurZucker
-- trainer: @zach-huggingface, @SunMarc and @qgallouedec
-- chat templates: @Rocketknight1
+- tokenizers: @ArthurZucker and @itazap
+- trainer: @zach-huggingface @SunMarc
+- attention: @vasqu @ArthurZucker @CyrilVallez
+- model loading (from pretrained, etc): @CyrilVallez
+- distributed: @3outeille @ArthurZucker @S1ro1
+- CIs: @ydshieh
Integrations:
@@ -60,20 +63,17 @@ Integrations:
- ray/raytune: @richardliaw, @amogkam
- Big Model Inference: @SunMarc
- quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber
+- kernels: @MekkCyber @drbh
+- peft: @BenjaminBossan @githubnemo
-Documentation: @stevhliu
-
-HF projects:
+Devices/Backends:
-- accelerate: [different repo](https://github.com/huggingface/accelerate)
-- datasets: [different repo](https://github.com/huggingface/datasets)
-- diffusers: [different repo](https://github.com/huggingface/diffusers)
-- rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)
+- AMD ROCm: @ivarflakstad
+- Intel XPU: @IlyasMoutawwakil
+- Ascend NPU: @ivarflakstad
-Maintained examples (not research project or legacy):
+Documentation: @stevhliu
-- Flax: @Rocketknight1
-- PyTorch: See Models above and tag the person corresponding to the modality of the example.
-- TensorFlow: @Rocketknight1
+Research projects are not maintained and should be taken as is.
-->
diff --git a/.github/scripts/assign_reviewers.py b/.github/scripts/assign_reviewers.py
index 02966204ea32..18567203596f 100644
--- a/.github/scripts/assign_reviewers.py
+++ b/.github/scripts/assign_reviewers.py
@@ -13,14 +13,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import os
-import github
import json
-from github import Github
+import os
import re
from collections import Counter
from pathlib import Path
+import github
+from github import Github
+
+
def pattern_to_regex(pattern):
if pattern.startswith("/"):
start_anchor = True
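
The hunk above only reorders imports, and `pattern_to_regex` itself is truncated here. As a rough, hedged sketch (not the repository's implementation) of what such a conversion can look like, a CODEOWNERS-style glob can be turned into a regex along these lines:

```python
# Simplified sketch (not the repository's implementation): convert a CODEOWNERS-style
# glob into a regex. A leading "/" anchors at the repository root, "**" crosses path
# segments, and "*" stays within a single segment.
import re

def simple_pattern_to_regex(pattern: str) -> re.Pattern:
    start_anchor = pattern.startswith("/")
    body = re.escape(pattern.lstrip("/"))
    body = body.replace(r"\*\*", ".*").replace(r"\*", "[^/]*")
    return re.compile(("^" if start_anchor else "(^|/)") + body)

assert simple_pattern_to_regex("/src/transformers/models/*/processing*").search(
    "src/transformers/models/bert/processing_bert.py"
)
```
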
diff --git a/.github/scripts/codeowners_for_review_action b/.github/scripts/codeowners_for_review_action
index 7325b0f570cc..f6c4b65a1e22 100644
--- a/.github/scripts/codeowners_for_review_action
+++ b/.github/scripts/codeowners_for_review_action
@@ -7,8 +7,8 @@ docs/ @stevhliu
/docker/ @ydshieh @ArthurZucker
# More high-level globs catch cases when specific rules later don't apply
-/src/transformers/models/*/processing* @molbap @yonigozlan @qubvel
-/src/transformers/models/*/image_processing* @qubvel
+/src/transformers/models/*/processing* @molbap @yonigozlan
+/src/transformers/models/*/image_processing* @yonigozlan
/src/transformers/models/*/image_processing_*_fast* @yonigozlan
# Owners of subsections of the library
@@ -186,65 +186,65 @@ trainer_utils.py @zach-huggingface @SunMarc
/src/transformers/models/zamba/mod*_zamba* @ArthurZucker
# Vision models
-/src/transformers/models/beit/mod*_beit* @amyeroberts @qubvel
-/src/transformers/models/bit/mod*_bit* @amyeroberts @qubvel
-/src/transformers/models/conditional_detr/mod*_conditional_detr* @amyeroberts @qubvel
-/src/transformers/models/convnext/mod*_convnext* @amyeroberts @qubvel
-/src/transformers/models/convnextv2/mod*_convnextv2* @amyeroberts @qubvel
-/src/transformers/models/cvt/mod*_cvt* @amyeroberts @qubvel
-/src/transformers/models/deformable_detr/mod*_deformable_detr* @amyeroberts @qubvel
-/src/transformers/models/deit/mod*_deit* @amyeroberts @qubvel
-/src/transformers/models/depth_anything/mod*_depth_anything* @amyeroberts @qubvel
-/src/transformers/models/depth_anything_v2/mod*_depth_anything_v2* @amyeroberts @qubvel
-/src/transformers/models/deta/mod*_deta* @amyeroberts @qubvel
-/src/transformers/models/detr/mod*_detr* @amyeroberts @qubvel
-/src/transformers/models/dinat/mod*_dinat* @amyeroberts @qubvel
-/src/transformers/models/dinov2/mod*_dinov2* @amyeroberts @qubvel
-/src/transformers/models/dinov2_with_registers/mod*_dinov2_with_registers* @amyeroberts @qubvel
-/src/transformers/models/dit/mod*_dit* @amyeroberts @qubvel
-/src/transformers/models/dpt/mod*_dpt* @amyeroberts @qubvel
-/src/transformers/models/efficientformer/mod*_efficientformer* @amyeroberts @qubvel
-/src/transformers/models/efficientnet/mod*_efficientnet* @amyeroberts @qubvel
-/src/transformers/models/focalnet/mod*_focalnet* @amyeroberts @qubvel
-/src/transformers/models/glpn/mod*_glpn* @amyeroberts @qubvel
-/src/transformers/models/hiera/mod*_hiera* @amyeroberts @qubvel
-/src/transformers/models/ijepa/mod*_ijepa* @amyeroberts @qubvel
-/src/transformers/models/imagegpt/mod*_imagegpt* @amyeroberts @qubvel
-/src/transformers/models/levit/mod*_levit* @amyeroberts @qubvel
-/src/transformers/models/mask2former/mod*_mask2former* @amyeroberts @qubvel
-/src/transformers/models/maskformer/mod*_maskformer* @amyeroberts @qubvel
-/src/transformers/models/mobilenet_v1/mod*_mobilenet_v1* @amyeroberts @qubvel
-/src/transformers/models/mobilenet_v2/mod*_mobilenet_v2* @amyeroberts @qubvel
-/src/transformers/models/mobilevit/mod*_mobilevit* @amyeroberts @qubvel
-/src/transformers/models/mobilevitv2/mod*_mobilevitv2* @amyeroberts @qubvel
-/src/transformers/models/nat/mod*_nat* @amyeroberts @qubvel
-/src/transformers/models/poolformer/mod*_poolformer* @amyeroberts @qubvel
-/src/transformers/models/pvt/mod*_pvt* @amyeroberts @qubvel
-/src/transformers/models/pvt_v2/mod*_pvt_v2* @amyeroberts @qubvel
-/src/transformers/models/regnet/mod*_regnet* @amyeroberts @qubvel
-/src/transformers/models/resnet/mod*_resnet* @amyeroberts @qubvel
-/src/transformers/models/rt_detr/mod*_rt_detr* @amyeroberts @qubvel
-/src/transformers/models/segformer/mod*_segformer* @amyeroberts @qubvel
-/src/transformers/models/seggpt/mod*_seggpt* @amyeroberts @qubvel
-/src/transformers/models/superpoint/mod*_superpoint* @amyeroberts @qubvel
-/src/transformers/models/swiftformer/mod*_swiftformer* @amyeroberts @qubvel
-/src/transformers/models/swin/mod*_swin* @amyeroberts @qubvel
-/src/transformers/models/swinv2/mod*_swinv2* @amyeroberts @qubvel
-/src/transformers/models/swin2sr/mod*_swin2sr* @amyeroberts @qubvel
-/src/transformers/models/table_transformer/mod*_table_transformer* @amyeroberts @qubvel
-/src/transformers/models/textnet/mod*_textnet* @amyeroberts @qubvel
-/src/transformers/models/timm_wrapper/mod*_timm_wrapper* @amyeroberts @qubvel
-/src/transformers/models/upernet/mod*_upernet* @amyeroberts @qubvel
-/src/transformers/models/van/mod*_van* @amyeroberts @qubvel
-/src/transformers/models/vit/mod*_vit* @amyeroberts @qubvel
-/src/transformers/models/vit_hybrid/mod*_vit_hybrid* @amyeroberts @qubvel
-/src/transformers/models/vitdet/mod*_vitdet* @amyeroberts @qubvel
-/src/transformers/models/vit_mae/mod*_vit_mae* @amyeroberts @qubvel
-/src/transformers/models/vitmatte/mod*_vitmatte* @amyeroberts @qubvel
-/src/transformers/models/vit_msn/mod*_vit_msn* @amyeroberts @qubvel
-/src/transformers/models/vitpose/mod*_vitpose* @amyeroberts @qubvel
-/src/transformers/models/yolos/mod*_yolos* @amyeroberts @qubvel
-/src/transformers/models/zoedepth/mod*_zoedepth* @amyeroberts @qubvel
+/src/transformers/models/beit/mod*_beit* @yonigozlan @molbap
+/src/transformers/models/bit/mod*_bit* @yonigozlan @molbap
+/src/transformers/models/conditional_detr/mod*_conditional_detr* @yonigozlan @molbap
+/src/transformers/models/convnext/mod*_convnext* @yonigozlan @molbap
+/src/transformers/models/convnextv2/mod*_convnextv2* @yonigozlan @molbap
+/src/transformers/models/cvt/mod*_cvt* @yonigozlan @molbap
+/src/transformers/models/deformable_detr/mod*_deformable_detr* @yonigozlan @molbap
+/src/transformers/models/deit/mod*_deit* @yonigozlan @molbap
+/src/transformers/models/depth_anything/mod*_depth_anything* @yonigozlan @molbap
+/src/transformers/models/depth_anything_v2/mod*_depth_anything_v2* @yonigozlan @molbap
+/src/transformers/models/deta/mod*_deta* @yonigozlan @molbap
+/src/transformers/models/detr/mod*_detr* @yonigozlan @molbap
+/src/transformers/models/dinat/mod*_dinat* @yonigozlan @molbap
+/src/transformers/models/dinov2/mod*_dinov2* @yonigozlan @molbap
+/src/transformers/models/dinov2_with_registers/mod*_dinov2_with_registers* @yonigozlan @molbap
+/src/transformers/models/dit/mod*_dit* @yonigozlan @molbap
+/src/transformers/models/dpt/mod*_dpt* @yonigozlan @molbap
+/src/transformers/models/efficientformer/mod*_efficientformer* @yonigozlan @molbap
+/src/transformers/models/efficientnet/mod*_efficientnet* @yonigozlan @molbap
+/src/transformers/models/focalnet/mod*_focalnet* @yonigozlan @molbap
+/src/transformers/models/glpn/mod*_glpn* @yonigozlan @molbap
+/src/transformers/models/hiera/mod*_hiera* @yonigozlan @molbap
+/src/transformers/models/ijepa/mod*_ijepa* @yonigozlan @molbap
+/src/transformers/models/imagegpt/mod*_imagegpt* @yonigozlan @molbap
+/src/transformers/models/levit/mod*_levit* @yonigozlan @molbap
+/src/transformers/models/mask2former/mod*_mask2former* @yonigozlan @molbap
+/src/transformers/models/maskformer/mod*_maskformer* @yonigozlan @molbap
+/src/transformers/models/mobilenet_v1/mod*_mobilenet_v1* @yonigozlan @molbap
+/src/transformers/models/mobilenet_v2/mod*_mobilenet_v2* @yonigozlan @molbap
+/src/transformers/models/mobilevit/mod*_mobilevit* @yonigozlan @molbap
+/src/transformers/models/mobilevitv2/mod*_mobilevitv2* @yonigozlan @molbap
+/src/transformers/models/nat/mod*_nat* @yonigozlan @molbap
+/src/transformers/models/poolformer/mod*_poolformer* @yonigozlan @molbap
+/src/transformers/models/pvt/mod*_pvt* @yonigozlan @molbap
+/src/transformers/models/pvt_v2/mod*_pvt_v2* @yonigozlan @molbap
+/src/transformers/models/regnet/mod*_regnet* @yonigozlan @molbap
+/src/transformers/models/resnet/mod*_resnet* @yonigozlan @molbap
+/src/transformers/models/rt_detr/mod*_rt_detr* @yonigozlan @molbap
+/src/transformers/models/segformer/mod*_segformer* @yonigozlan @molbap
+/src/transformers/models/seggpt/mod*_seggpt* @yonigozlan @molbap
+/src/transformers/models/superpoint/mod*_superpoint* @yonigozlan @molbap
+/src/transformers/models/swiftformer/mod*_swiftformer* @yonigozlan @molbap
+/src/transformers/models/swin/mod*_swin* @yonigozlan @molbap
+/src/transformers/models/swinv2/mod*_swinv2* @yonigozlan @molbap
+/src/transformers/models/swin2sr/mod*_swin2sr* @yonigozlan @molbap
+/src/transformers/models/table_transformer/mod*_table_transformer* @yonigozlan @molbap
+/src/transformers/models/textnet/mod*_textnet* @yonigozlan @molbap
+/src/transformers/models/timm_wrapper/mod*_timm_wrapper* @yonigozlan @molbap
+/src/transformers/models/upernet/mod*_upernet* @yonigozlan @molbap
+/src/transformers/models/van/mod*_van* @yonigozlan @molbap
+/src/transformers/models/vit/mod*_vit* @yonigozlan @molbap
+/src/transformers/models/vit_hybrid/mod*_vit_hybrid* @yonigozlan @molbap
+/src/transformers/models/vitdet/mod*_vitdet* @yonigozlan @molbap
+/src/transformers/models/vit_mae/mod*_vit_mae* @yonigozlan @molbap
+/src/transformers/models/vitmatte/mod*_vitmatte* @yonigozlan @molbap
+/src/transformers/models/vit_msn/mod*_vit_msn* @yonigozlan @molbap
+/src/transformers/models/vitpose/mod*_vitpose* @yonigozlan @molbap
+/src/transformers/models/yolos/mod*_yolos* @yonigozlan @molbap
+/src/transformers/models/zoedepth/mod*_zoedepth* @yonigozlan @molbap
# Audio models
/src/transformers/models/audio_spectrogram_transformer/mod*_audio_spectrogram_transformer* @eustlb
@@ -304,7 +304,7 @@ trainer_utils.py @zach-huggingface @SunMarc
/src/transformers/models/donut/mod*_donut* @zucchini-nlp
/src/transformers/models/flava/mod*_flava* @zucchini-nlp
/src/transformers/models/git/mod*_git* @zucchini-nlp
-/src/transformers/models/grounding_dino/mod*_grounding_dino* @qubvel
+/src/transformers/models/grounding_dino/mod*_grounding_dino* @yonigozlan
/src/transformers/models/groupvit/mod*_groupvit* @zucchini-nlp
/src/transformers/models/idefics/mod*_idefics* @zucchini-nlp
/src/transformers/models/idefics2/mod*_idefics2* @zucchini-nlp
@@ -326,10 +326,10 @@ trainer_utils.py @zach-huggingface @SunMarc
/src/transformers/models/mgp_str/mod*_mgp_str* @zucchini-nlp
/src/transformers/models/mllama/mod*_mllama* @zucchini-nlp
/src/transformers/models/nougat/mod*_nougat* @NielsRogge
-/src/transformers/models/omdet_turbo/mod*_omdet_turbo* @qubvel @yonigozlan
+/src/transformers/models/omdet_turbo/mod*_omdet_turbo* @yonigozlan
/src/transformers/models/oneformer/mod*_oneformer* @zucchini-nlp
-/src/transformers/models/owlvit/mod*_owlvit* @qubvel
-/src/transformers/models/owlv2/mod*_owlv2* @qubvel
+/src/transformers/models/owlvit/mod*_owlvit* @yonigozlan
+/src/transformers/models/owlv2/mod*_owlv2* @yonigozlan
/src/transformers/models/paligemma/mod*_paligemma* @zucchini-nlp @molbap
/src/transformers/models/perceiver/mod*_perceiver* @zucchini-nlp
/src/transformers/models/pix2struct/mod*_pix2struct* @zucchini-nlp
diff --git a/.github/workflows/benchmark_v2.yml b/.github/workflows/benchmark_v2.yml
new file mode 100644
index 000000000000..fc9e07635185
--- /dev/null
+++ b/.github/workflows/benchmark_v2.yml
@@ -0,0 +1,85 @@
+name: Benchmark v2 Framework
+
+on:
+ workflow_call:
+ inputs:
+ runner:
+ description: 'GH Actions runner group to use'
+ required: true
+ type: string
+ container_image:
+ description: 'Docker image to use'
+ required: true
+ type: string
+ container_options:
+ description: 'Container options to use'
+ required: true
+ type: string
+ commit_sha:
+ description: 'Commit SHA to benchmark'
+ required: false
+ type: string
+ default: ''
+ run_id:
+ description: 'Custom run ID for organizing results (auto-generated if not provided)'
+ required: false
+ type: string
+ default: ''
+ benchmark_repo_id:
+ description: 'HuggingFace Dataset to upload results to (e.g., "org/benchmark-results")'
+ required: false
+ type: string
+ default: ''
+
+env:
+ HF_HOME: /mnt/cache
+ TRANSFORMERS_IS_CI: yes
+ # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access.
+ # This token is created under the bot `hf-transformers-bot`.
+ HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+
+jobs:
+ benchmark-v2:
+ name: Benchmark v2
+ runs-on: ${{ inputs.runner }}
+ if: |
+ (github.event_name == 'pull_request' && contains( github.event.pull_request.labels.*.name, 'run-benchmark')) ||
+ (github.event_name == 'schedule')
+ container:
+ image: ${{ inputs.container_image }}
+ options: ${{ inputs.container_options }}
+ steps:
+ - name: Get repo
+ uses: actions/checkout@v4
+ with:
+ ref: ${{ inputs.commit_sha || github.sha }}
+
+ - name: Install benchmark dependencies
+ run: |
+ python3 -m pip install -r benchmark_v2/requirements.txt
+
+ - name: Reinstall transformers in edit mode
+ run: |
+ python3 -m pip uninstall -y transformers
+ python3 -m pip install -e ".[torch]"
+
+ - name: Show installed libraries and their versions
+ run: |
+ python3 -m pip list
+ python3 -c "import torch; print(f'PyTorch version: {torch.__version__}')"
+ python3 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
+ python3 -c "import torch; print(f'CUDA device count: {torch.cuda.device_count()}')" || true
+ nvidia-smi || true
+
+ - name: Run benchmark v2
+ working-directory: benchmark_v2
+ run: |
+ echo "Running benchmarks"
+ python3 run_benchmarks.py \
+ --commit-id '${{ inputs.commit_sha || github.sha }}' \
+ --run-id '${{ inputs.run_id }}' \
+ --push-to-hub '${{ inputs.benchmark_repo_id}}' \
+ --token '${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}' \
+ --log-level INFO
+ env:
+ HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
\ No newline at end of file
diff --git a/.github/workflows/benchmark_v2_a10_caller.yml b/.github/workflows/benchmark_v2_a10_caller.yml
new file mode 100644
index 000000000000..6573d398b000
--- /dev/null
+++ b/.github/workflows/benchmark_v2_a10_caller.yml
@@ -0,0 +1,21 @@
+name: Benchmark v2 Scheduled Runner - A10 Single-GPU
+
+on:
+ schedule:
+ # Run daily at 16:30 UTC
+ - cron: "30 16 * * *"
+ pull_request:
+ types: [ opened, labeled, reopened, synchronize ]
+
+jobs:
+ benchmark-v2-default:
+ name: Benchmark v2 - Default Models
+ uses: ./.github/workflows/benchmark_v2.yml
+ with:
+ runner: aws-g5-4xlarge-cache-use1-public-80
+ container_image: huggingface/transformers-pytorch-gpu
+ container_options: --gpus all --privileged --ipc host --shm-size "16gb"
+ commit_sha: ${{ github.sha }}
+ run_id: ${{ github.run_id }}
+ benchmark_repo_id: hf-internal-testing/transformers-daily-benchmarks
+ secrets: inherit
\ No newline at end of file
diff --git a/.github/workflows/benchmark_v2_mi325_caller.yml b/.github/workflows/benchmark_v2_mi325_caller.yml
new file mode 100644
index 000000000000..ed403148e596
--- /dev/null
+++ b/.github/workflows/benchmark_v2_mi325_caller.yml
@@ -0,0 +1,21 @@
+name: Benchmark v2 Scheduled Runner - MI325 Single-GPU
+
+on:
+ schedule:
+ # Run daily at 16:30 UTC
+ - cron: "30 16 * * *"
+ pull_request:
+ types: [ opened, labeled, reopened, synchronize ]
+
+jobs:
+ benchmark-v2-default:
+ name: Benchmark v2 - Default Models
+ uses: ./.github/workflows/benchmark_v2.yml
+ with:
+ runner: amd-mi325-ci-1gpu
+ container_image: huggingface/transformers-pytorch-amd-gpu
+ container_options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache
+ commit_sha: ${{ github.sha }}
+ run_id: ${{ github.run_id }}
+ benchmark_repo_id: hf-internal-testing/transformers-daily-benchmarks
+ secrets: inherit
\ No newline at end of file
diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml
index fe1f18f42b99..b53c6a4671f0 100644
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@@ -5,6 +5,7 @@ on:
branches:
- build_ci_docker_image*
repository_dispatch:
+ workflow_dispatch:
workflow_call:
inputs:
image_postfix:
@@ -221,7 +222,7 @@ jobs:
latest-pytorch-amd:
name: "Latest PyTorch (AMD) [dev]"
runs-on:
- group: aws-general-8-plus
+ group: aws-highcpu-32-priv
steps:
-
name: Set up Docker Buildx
diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml
index c55638ded149..28982d04eb46 100644
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yml
@@ -16,8 +16,20 @@ jobs:
commit_sha: ${{ github.sha }}
package: transformers
notebook_folder: transformers_doc
- languages: ar de en es fr hi it ko pt tr zh ja te
+ languages: en
custom_container: huggingface/transformers-doc-builder
secrets:
token: ${{ secrets.HUGGINGFACE_PUSH }}
hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
+
+ build_other_lang:
+ uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
+ with:
+ commit_sha: ${{ github.sha }}
+ package: transformers
+ notebook_folder: transformers_doc
+ languages: ar de es fr hi it ja ko pt zh
+ custom_container: huggingface/transformers-doc-builder
+ secrets:
+ token: ${{ secrets.HUGGINGFACE_PUSH }}
+ hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
\ No newline at end of file
diff --git a/.github/workflows/model_jobs.yml b/.github/workflows/model_jobs.yml
index 5da145c2b006..83f818fcda3b 100644
--- a/.github/workflows/model_jobs.yml
+++ b/.github/workflows/model_jobs.yml
@@ -128,28 +128,47 @@ jobs:
echo "machine_type=$machine_type" >> $GITHUB_ENV
echo "machine_type=$machine_type" >> $GITHUB_OUTPUT
+ - name: Create report directory if it doesn't exist
+ shell: bash
+ run: |
+ mkdir -p /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
+ echo "dummy" > /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/dummy.txt
+ ls -la /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
+
- name: Run all tests on GPU
working-directory: /transformers
- run: python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
+ run: |
+ script -q -c "PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports tests/${{ matrix.folders }}" test_outputs.txt
+ ls -la
+ # Extract the exit code from the output file
+ EXIT_CODE=$(tail -1 test_outputs.txt | grep -o 'COMMAND_EXIT_CODE="[0-9]*"' | cut -d'"' -f2)
+ exit ${EXIT_CODE:-1}
- name: Failure short reports
if: ${{ failure() }}
+ # This step is only to show information on Github Actions log.
+ # Always mark this step as successful, even if the report directory or the file `failures_short.txt` in it doesn't exist
continue-on-error: true
- run: cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/failures_short.txt
+ run: cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/failures_short.txt
- - name: Run test
- shell: bash
+ - name: Captured information
+ if: ${{ failure() }}
+ continue-on-error: true
+ run: |
+ cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/captured_info.txt
+
+ - name: Copy test_outputs.txt
+ if: ${{ always() }}
+ continue-on-error: true
run: |
- mkdir -p /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
- echo "hello" > /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/hello.txt
- echo "${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports"
+ cp /transformers/test_outputs.txt /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
- name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
- path: /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
+ path: /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
collated_reports:
name: Collated Reports
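
For context on the reworked "Run all tests on GPU" step above: pytest is wrapped in `script` so its output is captured to `test_outputs.txt`, and the real exit code is recovered from the `COMMAND_EXIT_CODE="N"` marker on the file's last line. Below is a hedged Python equivalent of that `tail`/`grep`/`cut` pipeline; the file name matches the step, and the default of 1 mirrors `${EXIT_CODE:-1}`:

```python
# Hedged sketch mirroring the tail | grep | cut pipeline in the workflow step above:
# read the last line of the `script` typescript and extract COMMAND_EXIT_CODE="N",
# falling back to 1 (treat a missing marker as failure) like `${EXIT_CODE:-1}` does.
import re

def exit_code_from_typescript(path: str = "test_outputs.txt", default: int = 1) -> int:
    with open(path, encoding="utf-8", errors="replace") as fh:
        lines = fh.readlines()
    last_line = lines[-1] if lines else ""
    match = re.search(r'COMMAND_EXIT_CODE="(\d+)"', last_line)
    return int(match.group(1)) if match else default
```
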
diff --git a/.github/workflows/pr_build_doc_with_comment.yml b/.github/workflows/pr_build_doc_with_comment.yml
index ec43c5b2cf96..59aa22eef1ec 100644
--- a/.github/workflows/pr_build_doc_with_comment.yml
+++ b/.github/workflows/pr_build_doc_with_comment.yml
@@ -14,7 +14,7 @@ permissions: {}
jobs:
get-pr-number:
name: Get PR number
- if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad", "stevhliu", "ebezzam"]'), github.actor) && (startsWith(github.event.comment.body, 'build-doc')) }}
+ if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "eustlb", "MekkCyber", "vasqu", "ivarflakstad", "stevhliu", "ebezzam", "itazap"]'), github.actor) && (startsWith(github.event.comment.body, 'build-doc')) }}
uses: ./.github/workflows/get-pr-number.yml
get-pr-info:
diff --git a/.github/workflows/self-comment-ci.yml b/.github/workflows/self-comment-ci.yml
index f1c93aab5a86..e485973dcb05 100644
--- a/.github/workflows/self-comment-ci.yml
+++ b/.github/workflows/self-comment-ci.yml
@@ -29,7 +29,7 @@ jobs:
runs-on: ubuntu-22.04
name: Get PR number
# For security: only allow team members to run
- if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad", "stevhliu", "ebezzam", "remi-or"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
+ if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "eustlb", "MekkCyber", "vasqu", "ivarflakstad", "stevhliu", "ebezzam", "remi-or", "itazap"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
outputs:
PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
steps:
diff --git a/.github/workflows/self-scheduled-amd-mi325-caller.yml b/.github/workflows/self-scheduled-amd-mi325-caller.yml
index 8c2bad414bcf..510b3f6e2c78 100644
--- a/.github/workflows/self-scheduled-amd-mi325-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi325-caller.yml
@@ -20,7 +20,7 @@ jobs:
with:
job: run_models_gpu
slack_report_channel: "#amd-hf-ci"
- runner_scale_set: amd-mi325-ci
+ runner_group: amd-mi325
docker: huggingface/transformers-pytorch-amd-gpu
ci_event: Scheduled CI (AMD) - mi325
report_repo_id: optimum-amd/transformers_daily_ci
@@ -33,7 +33,7 @@ jobs:
with:
job: run_pipelines_torch_gpu
slack_report_channel: "#amd-hf-ci"
- runner_scale_set: amd-mi325-ci
+ runner_group: amd-mi325
docker: huggingface/transformers-pytorch-amd-gpu
ci_event: Scheduled CI (AMD) - mi325
report_repo_id: optimum-amd/transformers_daily_ci
@@ -46,7 +46,7 @@ jobs:
with:
job: run_examples_gpu
slack_report_channel: "#amd-hf-ci"
- runner_scale_set: amd-mi325-ci
+ runner_group: amd-mi325
docker: huggingface/transformers-pytorch-amd-gpu
ci_event: Scheduled CI (AMD) - mi325
report_repo_id: optimum-amd/transformers_daily_ci
@@ -59,7 +59,7 @@ jobs:
with:
job: run_torch_cuda_extensions_gpu
slack_report_channel: "#amd-hf-ci"
- runner_scale_set: amd-mi325-ci
+ runner_group: amd-mi325
docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
ci_event: Scheduled CI (AMD) - mi325
report_repo_id: optimum-amd/transformers_daily_ci
diff --git a/.github/workflows/self-scheduled-amd-mi355-caller.yml b/.github/workflows/self-scheduled-amd-mi355-caller.yml
index d7061f433569..1b5dbe96ad97 100644
--- a/.github/workflows/self-scheduled-amd-mi355-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi355-caller.yml
@@ -3,7 +3,7 @@ name: Self-hosted runner scale set (AMD mi355 scheduled CI caller)
# Note: For every job in this workflow, the name of the runner scale set is finalized in the runner yaml i.e. huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml
# For example, 1gpu : amd-mi355-ci-1gpu
# 2gpu : amd-mi355-ci-2gpu
-
+
on:
workflow_run:
workflows: ["Self-hosted runner (AMD scheduled CI caller)"]
@@ -20,7 +20,7 @@ jobs:
with:
job: run_models_gpu
slack_report_channel: "#amd-hf-ci"
- runner_scale_set: amd-mi355-ci
+ runner_group: hfc-amd-mi355
docker: huggingface/testing-rocm7.0-preview
ci_event: Scheduled CI (AMD) - mi355
report_repo_id: hf-transformers-bot/transformers-ci-dummy
@@ -32,7 +32,7 @@ jobs:
with:
job: run_pipelines_torch_gpu
slack_report_channel: "#amd-hf-ci"
- runner_scale_set: amd-mi355-ci
+ runner_group: hfc-amd-mi355
docker: huggingface/testing-rocm7.0-preview
ci_event: Scheduled CI (AMD) - mi355
report_repo_id: hf-transformers-bot/transformers-ci-dummy
@@ -44,7 +44,7 @@ jobs:
with:
job: run_examples_gpu
slack_report_channel: "#amd-hf-ci"
- runner_scale_set: amd-mi355-ci
+ runner_group: hfc-amd-mi355
docker: huggingface/testing-rocm7.0-preview
ci_event: Scheduled CI (AMD) - mi355
report_repo_id: hf-transformers-bot/transformers-ci-dummy
@@ -53,10 +53,10 @@ jobs:
deepspeed-ci:
name: DeepSpeed CI
uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
- with:
+ with:
job: run_torch_cuda_extensions_gpu
slack_report_channel: "#amd-hf-ci"
- runner_scale_set: amd-mi355-ci
+ runner_group: hfc-amd-mi355
docker: huggingface/testing-rocm7.0-preview
ci_event: Scheduled CI (AMD) - mi355
report_repo_id: hf-transformers-bot/transformers-ci-dummy
diff --git a/.gitignore b/.gitignore
index cdf189505dc7..b59797c2188b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,7 @@ tests/fixtures/cached_*_text.txt
logs/
lightning_logs/
lang_code_data/
+reports/
# Distribution / packaging
.Python
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 7728546633b9..ea62fd545882 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -278,13 +278,14 @@ are working on it).
useful to avoid duplicated work, and to differentiate it from PRs ready to be merged.
☐ Make sure existing tests pass.
☐ If adding a new feature, also add tests for it.
- - If you are adding a new model, make sure you use
+
+- If you are adding a new model, make sure you use
`ModelTester.all_model_classes = (MyModel, MyModelWithLMHead,...)` to trigger the common tests.
- - If you are adding new `@slow` tests, make sure they pass using
+- If you are adding new `@slow` tests, make sure they pass using
`RUN_SLOW=1 python -m pytest tests/models/my_new_model/test_my_new_model.py`.
- - If you are adding a new tokenizer, write tests and make sure
+- If you are adding a new tokenizer, write tests and make sure
`RUN_SLOW=1 python -m pytest tests/models/{your_model_name}/test_tokenization_{your_model_name}.py` passes.
- - CircleCI does not run the slow tests, but GitHub Actions does every night!
+- CircleCI does not run the slow tests, but GitHub Actions does every night!
☐ All public methods must have informative docstrings (see
[`modeling_bert.py`](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py)
@@ -340,6 +341,7 @@ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/t
```
Like the slow tests, there are other environment variables available which are not enabled by default during testing:
+
- `RUN_CUSTOM_TOKENIZERS`: Enables tests for custom tokenizers.
More environment variables and additional information can be found in the [testing_utils.py](https://github.com/huggingface/transformers/blob/main/src/transformers/testing_utils.py).
diff --git a/ISSUES.md b/ISSUES.md
index 9c96162647bc..c87bd9fc2c3f 100644
--- a/ISSUES.md
+++ b/ISSUES.md
@@ -38,7 +38,6 @@ In particular all "Please explain" questions or objectively very user-specific f
* "How to train T5 on De->En translation?"
-
## The GitHub Issues
Everything which hints at a bug should be opened as an [issue](https://github.com/huggingface/transformers/issues).
@@ -247,7 +246,6 @@ You are not required to read the following guidelines before opening an issue. H
Try not to use italics and bold text too much as these often make the text more difficult to read.
-
12. If you are cross-referencing a specific comment in a given thread or another issue, always link to that specific comment, rather than using the issue link. If you do the latter it can be very hard to find which specific comment you're referring to.
To get the link to the specific comment do not copy the url from the location bar of your browser, but instead, click the `...` icon in the upper right corner of the comment and then select "Copy Link".
@@ -257,7 +255,6 @@ You are not required to read the following guidelines before opening an issue. H
1. https://github.com/huggingface/transformers/issues/9257
2. https://github.com/huggingface/transformers/issues/9257#issuecomment-749945162
-
13. If you are replying to a last comment, it's totally fine to make your reply with just your comment in it. The readers can follow the information flow here.
But if you're replying to a comment that was made several comments back, it's always good practice to quote just the relevant lines you're replying to. The `>` is used for quoting, or you can always use the menu to do so. For example your editor box will look like:
diff --git a/README.md b/README.md
index 5d782bcea78e..f01a2bcc6e52 100644
--- a/README.md
+++ b/README.md
@@ -48,9 +48,11 @@ limitations under the License.
తెలుగు |
Français |
Deutsch |
+ Italiano |
Tiếng Việt |
العربية |
اردو |
+ বাংলা |
+Transformers acts as the model-definition framework for state-of-the-art machine learning models in text, computer
+vision, audio, video, and multimodal model, for both inference and training.
-Transformers acts as the model-definition framework for state-of-the-art machine learning models in text, computer
-vision, audio, video, and multimodal model, for both inference and training.
-
-It centralizes the model definition so that this definition is agreed upon across the ecosystem. `transformers` is the
-pivot across frameworks: if a model definition is supported, it will be compatible with the majority of training
+It centralizes the model definition so that this definition is agreed upon across the ecosystem. `transformers` is the
+pivot across frameworks: if a model definition is supported, it will be compatible with the majority of training
frameworks (Axolotl, Unsloth, DeepSpeed, FSDP, PyTorch-Lightning, ...), inference engines (vLLM, SGLang, TGI, ...),
and adjacent modeling libraries (llama.cpp, mlx, ...) which leverage the model definition from `transformers`.
@@ -110,10 +111,10 @@ git clone https://github.com/huggingface/transformers.git
cd transformers
# pip
-pip install .[torch]
+pip install '.[torch]'
# uv
-uv pip install .[torch]
+uv pip install '.[torch]'
```
## Quickstart
@@ -193,7 +194,6 @@ pipeline("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.pn
-
Transformers acts as the model-definition framework for state-of-the-art machine learning models in text, computer
vision, audio, video, and multimodal model, for both inference and training.
@@ -35,6 +34,10 @@ There are over 1M+ Transformers [model checkpoints](https://huggingface.co/model
Explore the [Hub](https://huggingface.com/) today to find a model and use Transformers to help you get started right away.
+Explore the [Models Timeline](./models_timeline) to discover the latest text, vision, audio and multimodal model architectures in Transformers.
+
+
+
## Features
Transformers provides everything you need for inference or training with state-of-the-art pretrained models. Some of the main features include:
@@ -61,4 +64,4 @@ Transformers is designed for developers and machine learning engineers and resea
## Learn
-If you're new to Transformers or want to learn more about transformer models, we recommend starting with the [LLM course](https://huggingface.co/learn/llm-course/chapter1/1?fw=pt). This comprehensive course covers everything from the fundamentals of how transformer models work to practical applications across various tasks. You'll learn the complete workflow, from curating high-quality datasets to fine-tuning large language models and implementing reasoning capabilities. The course contains both theoretical and hands-on exercises to build a solid foundational knowledge of transformer models as you learn.
\ No newline at end of file
+If you're new to Transformers or want to learn more about transformer models, we recommend starting with the [LLM course](https://huggingface.co/learn/llm-course/chapter1/1?fw=pt). This comprehensive course covers everything from the fundamentals of how transformer models work to practical applications across various tasks. You'll learn the complete workflow, from curating high-quality datasets to fine-tuning large language models and implementing reasoning capabilities. The course contains both theoretical and hands-on exercises to build a solid foundational knowledge of transformer models as you learn.
diff --git a/docs/source/en/internal/file_utils.md b/docs/source/en/internal/file_utils.md
index 31fbc5b88110..63db5756a622 100644
--- a/docs/source/en/internal/file_utils.md
+++ b/docs/source/en/internal/file_utils.md
@@ -20,7 +20,6 @@ This page lists all of Transformers general utility functions that are found in
Most of those are only useful if you are studying the general code in the library.
-
## Enums and namedtuples
[[autodoc]] utils.ExplicitEnum
diff --git a/docs/source/en/internal/generation_utils.md b/docs/source/en/internal/generation_utils.md
index d47eba82d8cc..87b0111ff053 100644
--- a/docs/source/en/internal/generation_utils.md
+++ b/docs/source/en/internal/generation_utils.md
@@ -65,7 +65,6 @@ values. Here, for instance, it has two keys that are `sequences` and `scores`.
We document here all output types.
-
[[autodoc]] generation.GenerateDecoderOnlyOutput
[[autodoc]] generation.GenerateEncoderDecoderOutput
@@ -74,13 +73,11 @@ We document here all output types.
[[autodoc]] generation.GenerateBeamEncoderDecoderOutput
-
## LogitsProcessor
A [`LogitsProcessor`] can be used to modify the prediction scores of a language model head for
generation.
-
[[autodoc]] AlternatingCodebooksLogitsProcessor
- __call__
@@ -174,8 +171,6 @@ generation.
[[autodoc]] WatermarkLogitsProcessor
- __call__
-
-
## StoppingCriteria
A [`StoppingCriteria`] can be used to change when to stop generation (other than EOS token). Please note that this is exclusively available to our PyTorch implementations.
@@ -300,7 +295,6 @@ A [`Constraint`] can be used to force the generation to include specific tokens
- to_legacy_cache
- from_legacy_cache
-
## Watermark Utils
[[autodoc]] WatermarkingConfig
diff --git a/docs/source/en/internal/import_utils.md b/docs/source/en/internal/import_utils.md
index 0d76c2bbe33a..4a9915378a1f 100644
--- a/docs/source/en/internal/import_utils.md
+++ b/docs/source/en/internal/import_utils.md
@@ -22,8 +22,8 @@ worked around. We don't want for all users of `transformers` to have to install
we therefore mark those as soft dependencies rather than hard dependencies.
The transformers toolkit is not made to error-out on import of a model that has a specific dependency; instead, an
-object for which you are lacking a dependency will error-out when calling any method on it. As an example, if
-`torchvision` isn't installed, the fast image processors will not be available.
+object for which you are lacking a dependency will error-out when calling any method on it. As an example, if
+`torchvision` isn't installed, the fast image processors will not be available.
This object is still importable:
@@ -60,7 +60,7 @@ PyTorch dependency
**Tokenizers**: All files starting with `tokenization_` and ending with `_fast` have an automatic `tokenizers` dependency
-**Vision**: All files starting with `image_processing_` have an automatic dependency to the `vision` dependency group;
+**Vision**: All files starting with `image_processing_` have an automatic dependency to the `vision` dependency group;
at the time of writing, this only contains the `pillow` dependency.
**Vision + Torch + Torchvision**: All files starting with `image_processing_` and ending with `_fast` have an automatic
@@ -71,7 +71,7 @@ All of these automatic dependencies are added on top of the explicit dependencie
### Explicit Object Dependencies
We add a method called `requires` that is used to explicitly specify the dependencies of a given object. As an
-example, the `Trainer` class has two hard dependencies: `torch` and `accelerate`. Here is how we specify these
+example, the `Trainer` class has two hard dependencies: `torch` and `accelerate`. Here is how we specify these
required dependencies:
```python
diff --git a/docs/source/en/internal/model_debugging_utils.md b/docs/source/en/internal/model_debugging_utils.md
index 262113575f42..553a5ce56845 100644
--- a/docs/source/en/internal/model_debugging_utils.md
+++ b/docs/source/en/internal/model_debugging_utils.md
@@ -21,10 +21,8 @@ provides for it.
Most of those are only useful if you are adding new models in the library.
-
## Model addition debuggers
-
### Model addition debugger - context manager for model adders
This context manager is a power user tool intended for model adders. It tracks all forward calls within a model forward
@@ -72,7 +70,6 @@ with model_addition_debugger_context(
```
-
### Reading results
The debugger generates two files from the forward call, both with the same base name, but ending either with
@@ -221,9 +218,9 @@ path reference to the associated `.safetensors` file. Each tensor is written to
the state dictionary. File names are constructed using the `module_path` as a prefix with a few possible postfixes that
are built recursively.
-* Module inputs are denoted with the `_inputs` and outputs by `_outputs`.
-* `list` and `tuple` instances, such as `args` or function return values, will be postfixed with `_{index}`.
-* `dict` instances will be postfixed with `_{key}`.
+* Module inputs are denoted with the `_inputs` and outputs by `_outputs`.
+* `list` and `tuple` instances, such as `args` or function return values, will be postfixed with `_{index}`.
+* `dict` instances will be postfixed with `_{key}`.
### Comparing between implementations
@@ -231,10 +228,8 @@ Once the forward passes of two models have been traced by the debugger, one can
below: we can see slight differences between these two implementations' key projection layer. Inputs are mostly
identical, but not quite. Looking through the file differences makes it easier to pinpoint which layer is wrong.
-

-
### Limitations and scope
This feature will only work for torch-based models, and would require more work and case-by-case approach for say
@@ -254,13 +249,14 @@ layers.
This small util is a power user tool intended for model adders and maintainers. It lists all test methods
existing in `test_modeling_common.py`, inherited by all model tester classes, and scans the repository to measure
-how many tests are being skipped and for which models.
+how many tests are being skipped and for which models.
### Rationale
When porting models to transformers, tests fail as they should, and sometimes `test_modeling_common` feels irreconcilable with the peculiarities of our brand new model. But how can we be sure we're not breaking everything by adding a seemingly innocent skip?
This utility:
+
- scans all test_modeling_common methods
- looks for times where a method is skipped
- returns a summary json you can load as a DataFrame/inspect
@@ -269,8 +265,7 @@ This utility:

-
-### Usage
+### Usage
You can run the skipped test analyzer in two ways:
@@ -286,7 +281,7 @@ python utils/scan_skipped_tests.py --output_dir path/to/output
**Example output:**
-```
+```text
🔬 Parsing 331 model test files once each...
📝 Aggregating 224 tests...
(224/224) test_update_candidate_strategy_with_matches_1es_3d_is_nonecodet_schedule_fa_kwargs
diff --git a/docs/source/en/internal/pipelines_utils.md b/docs/source/en/internal/pipelines_utils.md
index 6ea6de9a61b8..23856e5639c3 100644
--- a/docs/source/en/internal/pipelines_utils.md
+++ b/docs/source/en/internal/pipelines_utils.md
@@ -20,7 +20,6 @@ This page lists all the utility functions the library provides for pipelines.
Most of those are only useful if you are studying the code of the models in the library.
-
## Argument handling
[[autodoc]] pipelines.ArgumentHandler
diff --git a/docs/source/en/jan.md b/docs/source/en/jan.md
index ff580496c81b..95309f46cd04 100644
--- a/docs/source/en/jan.md
+++ b/docs/source/en/jan.md
@@ -25,7 +25,7 @@ You are now ready to chat!
To conclude this example, let's look into a more advanced use-case. If you have a beefy machine to serve models with, but prefer using Jan on a different device, you need to add port forwarding. If you have `ssh` access from your Jan machine into your server, this can be accomplished by typing the following to your Jan machine's terminal
-```
+```bash
ssh -N -f -L 8000:localhost:8000 your_server_account@your_server_IP -p port_to_ssh_into_your_server
```
diff --git a/docs/source/en/kv_cache.md b/docs/source/en/kv_cache.md
index f0a781cba4fc..f318c73d28a9 100644
--- a/docs/source/en/kv_cache.md
+++ b/docs/source/en/kv_cache.md
@@ -67,7 +67,7 @@ out = model.generate(**inputs, do_sample=False, max_new_tokens=20, past_key_valu
## Fixed-size cache
-The default [`DynamicCache`] prevents you from taking advantage of most just-in-time (JIT) optimizations because the cache size isn't fixed. JIT optimizations enable you to maximize latency at the expense of memory usage. All of the following cache types are compatible with JIT optimizations like [torch.compile](./llm_optims#static-kv-cache-and-torchcompile) to accelerate generation.
+The default [`DynamicCache`] prevents you from taking advantage of most just-in-time (JIT) optimizations because the cache size isn't fixed. JIT optimizations enable you to maximize latency at the expense of memory usage. All of the following cache types are compatible with JIT optimizations like [torch.compile](./llm_optims#static-kv-cache-and-torchcompile) to accelerate generation.
A fixed-size cache ([`StaticCache`]) pre-allocates a specific maximum cache size for the kv pairs. You can generate up to the maximum cache size without needing to modify it. However, having a fixed (usually large) size for the key/value states means that while generating, a lot of tokens will actually be masked as they should not take part in the attention. This trick makes it easy to `compile` the decoding stage, but it wastes computation on those masked tokens in the attention. As with all things, it's a trade-off: it works very well if you generate several sequences of more or less the same length, but may be sub-optimal if you have, for example, one very large sequence followed by only short sequences (the fixed cache size would be large, so a lot of it would be wasted on the short sequences). Make sure you understand the impact if you use it!
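
A short usage sketch of the fixed-size cache described above. This is illustrative only: it reuses the gated Llama-2 checkpoint referenced later in this document (Hub access is assumed), and `cache_implementation="static"` is the generate-time switch that selects [`StaticCache`].

```python
# Illustrative sketch only: pre-allocate a fixed-size (static) cache at generation time.
# The checkpoint is the gated Llama-2 model referenced in this document; access is assumed.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tokenizer("The trade-off of a fixed-size cache is", return_tensors="pt")
# "static" pre-allocates the cache for the full generation length, which is what makes
# the decoding step compilable, at the cost of attending over masked, unused slots.
out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="static")
print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
```
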
@@ -213,7 +213,7 @@ A cache can also work in iterative generation settings where there is back-and-f
For iterative generation with a cache, start by initializing an empty cache class and then you can feed in your new prompts. Keep track of dialogue history with a [chat template](./chat_templating).
-The following example demonstrates [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf). If you’re using a different chat-style model, [`~PreTrainedTokenizer.apply_chat_template`] may process messages differently. It might cut out important tokens depending on how the Jinja template is written.
+The following example demonstrates [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf). If you're using a different chat-style model, [`~PreTrainedTokenizer.apply_chat_template`] may process messages differently. It might cut out important tokens depending on how the Jinja template is written.
For example, some models use special `
+alt="drawing" width="600"/>
BLIP-2 architecture. Taken from the original paper.
diff --git a/docs/source/en/model_doc/blip.md b/docs/source/en/model_doc/blip.md
index 13a2a5731a5f..5e727050f6ee 100644
--- a/docs/source/en/model_doc/blip.md
+++ b/docs/source/en/model_doc/blip.md
@@ -25,7 +25,6 @@ rendered properly in your Markdown viewer.
[BLIP](https://huggingface.co/papers/2201.12086) (Bootstrapped Language-Image Pretraining) is a vision-language pretraining (VLP) framework designed for *both* understanding and generation tasks. Most existing pretrained models are only good at one or the other. It uses a captioner to generate captions and a filter to remove the noisy captions. This increases training data quality and more effectively uses the messy web data.
-
You can find all the original BLIP checkpoints under the [BLIP](https://huggingface.co/collections/Salesforce/blip-models-65242f40f1491fbf6a9e9472) collection.
> [!TIP]
@@ -129,7 +128,7 @@ Refer to this [notebook](https://github.com/huggingface/notebooks/blob/main/exam
## BlipTextLMHeadModel
[[autodoc]] BlipTextLMHeadModel
-- forward
+ - forward
## BlipVisionModel
diff --git a/docs/source/en/model_doc/bloom.md b/docs/source/en/model_doc/bloom.md
index 805379338e32..51e2970c25f6 100644
--- a/docs/source/en/model_doc/bloom.md
+++ b/docs/source/en/model_doc/bloom.md
@@ -43,17 +43,19 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
- [`BloomForCausalLM`] is supported by this [causal language modeling example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#gpt-2gpt-and-causal-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb).
See also:
+
- [Causal language modeling task guide](../tasks/language_modeling)
- [Text classification task guide](../tasks/sequence_classification)
- [Token classification task guide](../tasks/token_classification)
- [Question answering task guide](../tasks/question_answering)
-
⚡️ Inference
+
- A blog on [Optimization story: Bloom inference](https://huggingface.co/blog/bloom-inference-optimization).
- A blog on [Incredibly Fast BLOOM Inference with DeepSpeed and Accelerate](https://huggingface.co/blog/bloom-inference-pytorch-scripts).
⚙️ Training
+
- A blog on [The Technology Behind BLOOM Training](https://huggingface.co/blog/bloom-megatron-deepspeed).
## BloomConfig
diff --git a/docs/source/en/model_doc/blt.md b/docs/source/en/model_doc/blt.md
new file mode 100644
index 000000000000..254cf6c0f44a
--- /dev/null
+++ b/docs/source/en/model_doc/blt.md
@@ -0,0 +1,97 @@
+
+*This model was released on 2024-12-13 and added to Hugging Face Transformers on 2025-09-19.*
+
+
@@ -52,7 +50,6 @@ alt="drawing" width="600"/>
This model was contributed by [joaogante](https://huggingface.co/joaogante) and [RaushanTurganbay](https://huggingface.co/RaushanTurganbay).
The original code can be found [here](https://github.com/facebookresearch/chameleon).
-
## Usage tips
- We advise users to use `padding_side="left"` when computing batched generation as it leads to more accurate results. Simply make sure to set `processor.tokenizer.padding_side = "left"` before generating.
diff --git a/docs/source/en/model_doc/chinese_clip.md b/docs/source/en/model_doc/chinese_clip.md
index 7ed4d503c00f..96b094ccd91b 100644
--- a/docs/source/en/model_doc/chinese_clip.md
+++ b/docs/source/en/model_doc/chinese_clip.md
@@ -119,4 +119,4 @@ Currently, following scales of pretrained Chinese-CLIP models are available on
## ChineseCLIPVisionModel
[[autodoc]] ChineseCLIPVisionModel
- - forward
\ No newline at end of file
+ - forward
diff --git a/docs/source/en/model_doc/clipseg.md b/docs/source/en/model_doc/clipseg.md
index e27d49ffe484..099fd4fb1bac 100644
--- a/docs/source/en/model_doc/clipseg.md
+++ b/docs/source/en/model_doc/clipseg.md
@@ -47,7 +47,7 @@ can be formulated. Finally, we find our system to adapt well
to generalized queries involving affordances or properties*
+alt="drawing" width="600"/>
CLIPSeg overview. Taken from the original paper.
@@ -106,4 +106,4 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
## CLIPSegForImageSegmentation
[[autodoc]] CLIPSegForImageSegmentation
- - forward
\ No newline at end of file
+ - forward
diff --git a/docs/source/en/model_doc/clvp.md b/docs/source/en/model_doc/clvp.md
index 926438a3c1f5..eead4a546435 100644
--- a/docs/source/en/model_doc/clvp.md
+++ b/docs/source/en/model_doc/clvp.md
@@ -29,29 +29,25 @@ The abstract from the paper is the following:
*In recent years, the field of image generation has been revolutionized by the application of autoregressive transformers and DDPMs. These approaches model the process of image generation as a step-wise probabilistic processes and leverage large amounts of compute and data to learn the image distribution. This methodology of improving performance need not be confined to images. This paper describes a way to apply advances in the image generative domain to speech synthesis. The result is TorToise - an expressive, multi-voice text-to-speech system.*
-
This model was contributed by [Susnato Dhar](https://huggingface.co/susnato).
The original code can be found [here](https://github.com/neonbjb/tortoise-tts).
-
## Usage tips
1. CLVP is an integral part of the Tortoise TTS model.
2. CLVP can be used to compare different generated speech candidates with the provided text, and the best speech tokens are forwarded to the diffusion model.
3. The use of the [`ClvpModelForConditionalGeneration.generate()`] method is strongly recommended for tortoise usage.
-4. Note that the CLVP model expects the audio to be sampled at 22.05 kHz contrary to other audio models which expects 16 kHz.
-
+4. Note that the CLVP model expects the audio to be sampled at 22.05 kHz, contrary to other audio models which expect 16 kHz.
## Brief Explanation:
- The [`ClvpTokenizer`] tokenizes the text input, and the [`ClvpFeatureExtractor`] extracts the log mel-spectrogram from the desired audio.
- [`ClvpConditioningEncoder`] takes those text tokens and audio representations and converts them into embeddings conditioned on the text and audio.
- The [`ClvpForCausalLM`] uses those embeddings to generate multiple speech candidates.
-- Each speech candidate is passed through the speech encoder ([`ClvpEncoder`]) which converts them into a vector representation, and the text encoder ([`ClvpEncoder`]) converts the text tokens into the same latent space.
-- At the end, we compare each speech vector with the text vector to see which speech vector is most similar to the text vector.
+- Each speech candidate is passed through the speech encoder ([`ClvpEncoder`]) which converts them into a vector representation, and the text encoder ([`ClvpEncoder`]) converts the text tokens into the same latent space.
+- At the end, we compare each speech vector with the text vector to see which speech vector is most similar to the text vector.
- [`ClvpModelForConditionalGeneration.generate()`] compresses all of the logic described above into a single method.
-
Example :
```python
@@ -74,7 +70,6 @@ Example :
>>> generated_output = model.generate(**processor_output)
```
-
## ClvpConfig
[[autodoc]] ClvpConfig
@@ -128,4 +123,3 @@ Example :
## ClvpDecoder
[[autodoc]] ClvpDecoder
-
diff --git a/docs/source/en/model_doc/code_llama.md b/docs/source/en/model_doc/code_llama.md
index 60e9cb4c3cf2..a46e1f05b32a 100644
--- a/docs/source/en/model_doc/code_llama.md
+++ b/docs/source/en/model_doc/code_llama.md
@@ -143,6 +143,7 @@ visualizer("""def func(a, b):
- Infilling is only available in the 7B and 13B base models, and not in the Python, Instruct, 34B, or 70B models.
- Use the `
-
## Usage Tips
### Generate text
@@ -84,7 +83,6 @@ generate_text = tokenizer.decode(output_ids, skip_special_tokens=True)
This model was contributed by [Anton Vlasjuk](https://huggingface.co/AntonV).
The original code can be found [here](https://github.com/PaddlePaddle/ERNIE).
-
## Ernie4_5Config
[[autodoc]] Ernie4_5Config
diff --git a/docs/source/en/model_doc/ernie4_5_moe.md b/docs/source/en/model_doc/ernie4_5_moe.md
index 20c4dcfd5435..fb6b8d791bec 100644
--- a/docs/source/en/model_doc/ernie4_5_moe.md
+++ b/docs/source/en/model_doc/ernie4_5_moe.md
@@ -40,7 +40,6 @@ Other models from the family can be found at [Ernie 4.5](./ernie4_5).
-
## Usage Tips
### Generate text
@@ -167,7 +166,6 @@ generate_text = tokenizer.decode(output_ids, skip_special_tokens=True)
This model was contributed by [Anton Vlasjuk](https://huggingface.co/AntonV).
The original code can be found [here](https://github.com/PaddlePaddle/ERNIE).
-
## Ernie4_5_MoeConfig
[[autodoc]] Ernie4_5_MoeConfig
diff --git a/docs/source/en/model_doc/ernie_m.md b/docs/source/en/model_doc/ernie_m.md
index 508fe2f596b2..e044614e7644 100644
--- a/docs/source/en/model_doc/ernie_m.md
+++ b/docs/source/en/model_doc/ernie_m.md
@@ -40,7 +40,6 @@ The abstract from the paper is the following:
*Recent studies have demonstrated that pre-trained cross-lingual models achieve impressive performance in downstream cross-lingual tasks. This improvement benefits from learning a large amount of monolingual and parallel corpora. Although it is generally acknowledged that parallel corpora are critical for improving the model performance, existing methods are often constrained by the size of parallel corpora, especially for lowresource languages. In this paper, we propose ERNIE-M, a new training method that encourages the model to align the representation of multiple languages with monolingual corpora, to overcome the constraint that the parallel corpus size places on the model performance. Our key insight is to integrate back-translation into the pre-training process. We generate pseudo-parallel sentence pairs on a monolingual corpus to enable the learning of semantic alignments between different languages, thereby enhancing the semantic modeling of cross-lingual models. Experimental results show that ERNIE-M outperforms existing cross-lingual models and delivers new state-of-the-art results in various cross-lingual downstream tasks.*
This model was contributed by [Susnato Dhar](https://huggingface.co/susnato). The original code can be found [here](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/paddlenlp/transformers/ernie_m).
-
## Usage tips
- Ernie-M is a BERT-like model so it is a stacked Transformer Encoder.
@@ -59,7 +58,6 @@ This model was contributed by [Susnato Dhar](https://huggingface.co/susnato). Th
[[autodoc]] ErnieMConfig
-
## ErnieMTokenizer
[[autodoc]] ErnieMTokenizer
@@ -68,7 +66,6 @@ This model was contributed by [Susnato Dhar](https://huggingface.co/susnato). Th
- create_token_type_ids_from_sequences
- save_vocabulary
-
## ErnieMModel
[[autodoc]] ErnieMModel
@@ -79,19 +76,16 @@ This model was contributed by [Susnato Dhar](https://huggingface.co/susnato). Th
[[autodoc]] ErnieMForSequenceClassification
- forward
-
## ErnieMForMultipleChoice
[[autodoc]] ErnieMForMultipleChoice
- forward
-
## ErnieMForTokenClassification
[[autodoc]] ErnieMForTokenClassification
- forward
-
## ErnieMForQuestionAnswering
[[autodoc]] ErnieMForQuestionAnswering
diff --git a/docs/source/en/model_doc/esm.md b/docs/source/en/model_doc/esm.md
index e83e2d5aa1da..a6190a71f020 100644
--- a/docs/source/en/model_doc/esm.md
+++ b/docs/source/en/model_doc/esm.md
@@ -44,12 +44,10 @@ sequence alignment (MSA) step at inference time, which means that ESMFold checkp
they do not require a database of known protein sequences and structures with associated external query tools
to make predictions, and are much faster as a result.
-
The abstract from
"Biological structure and function emerge from scaling unsupervised learning to 250
million protein sequences" is
-
*In the field of artificial intelligence, a combination of scale in data and model capacity enabled by unsupervised
learning has led to major advances in representation learning and statistical generation. In the life sciences, the
anticipated growth of sequencing promises unprecedented data on natural sequence diversity. Protein language modeling
@@ -63,7 +61,6 @@ can be identified by linear projections. Representation learning produces featur
applications, enabling state-of-the-art supervised prediction of mutational effect and secondary structure and
improving state-of-the-art features for long-range contact prediction.*
-
The abstract from
"Language models of protein sequences at the scale of evolution enable accurate structure prediction" is
diff --git a/docs/source/en/model_doc/evolla.md b/docs/source/en/model_doc/evolla.md
index a39103a06d12..ea8605050599 100644
--- a/docs/source/en/model_doc/evolla.md
+++ b/docs/source/en/model_doc/evolla.md
@@ -25,7 +25,7 @@ Evolla is an advanced 80-billion-parameter protein-language generative model des
The abstract from the paper is the following:
-*Proteins, nature’s intricate molecular machines, are the products of billions of years of evolution and play fundamental roles in sustaining life. Yet, deciphering their molecular language - that is, understanding how protein sequences and structures encode and determine biological functions - remains a corner-stone challenge in modern biology. Here, we introduce Evolla, an 80 billion frontier protein-language generative model designed to decode the molecular language of proteins. By integrating information from protein sequences, structures, and user queries, Evolla generates precise and contextually nuanced insights into protein function. A key innovation of Evolla lies in its training on an unprecedented AI-generated dataset: 546 million protein question-answer pairs and 150 billion word tokens, designed to reflect the immense complexity and functional diversity of proteins. Post-pretraining, Evolla integrates Direct Preference Optimization (DPO) to refine the model based on preference signals and Retrieval-Augmented Generation (RAG) for external knowledge incorporation, improving response quality and relevance. To evaluate its performance, we propose a novel framework, Instructional Response Space (IRS), demonstrating that Evolla delivers expert-level insights, advancing research in proteomics and functional genomics while shedding light on the molecular logic encoded in proteins. The online demo is available at http://www.chat-protein.com/.*
+*Proteins, nature's intricate molecular machines, are the products of billions of years of evolution and play fundamental roles in sustaining life. Yet, deciphering their molecular language - that is, understanding how protein sequences and structures encode and determine biological functions - remains a corner-stone challenge in modern biology. Here, we introduce Evolla, an 80 billion frontier protein-language generative model designed to decode the molecular language of proteins. By integrating information from protein sequences, structures, and user queries, Evolla generates precise and contextually nuanced insights into protein function. A key innovation of Evolla lies in its training on an unprecedented AI-generated dataset: 546 million protein question-answer pairs and 150 billion word tokens, designed to reflect the immense complexity and functional diversity of proteins. Post-pretraining, Evolla integrates Direct Preference Optimization (DPO) to refine the model based on preference signals and Retrieval-Augmented Generation (RAG) for external knowledge incorporation, improving response quality and relevance. To evaluate its performance, we propose a novel framework, Instructional Response Space (IRS), demonstrating that Evolla delivers expert-level insights, advancing research in proteomics and functional genomics while shedding light on the molecular logic encoded in proteins. The online demo is available at http://www.chat-protein.com/.*
Examples:
@@ -75,7 +75,6 @@ Tips:
- This model was contributed by [Xibin Bayes Zhou](https://huggingface.co/XibinBayesZhou).
- The original code can be found [here](https://github.com/westlake-repl/Evolla).
-
## EvollaConfig
[[autodoc]] EvollaConfig
diff --git a/docs/source/en/model_doc/exaone4.md b/docs/source/en/model_doc/exaone4.md
index 69d7ee0b2a81..9482f5be2c06 100644
--- a/docs/source/en/model_doc/exaone4.md
+++ b/docs/source/en/model_doc/exaone4.md
@@ -20,7 +20,7 @@ rendered properly in your Markdown viewer.
## Overview
**[EXAONE 4.0](https://github.com/LG-AI-EXAONE/EXAONE-4.0)** model is the language model, which integrates a **Non-reasoning mode** and **Reasoning mode** to achieve both the excellent usability of [EXAONE 3.5](https://github.com/LG-AI-EXAONE/EXAONE-3.5) and the advanced reasoning abilities of [EXAONE Deep](https://github.com/LG-AI-EXAONE/EXAONE-Deep). To pave the way for the agentic AI era, EXAONE 4.0 incorporates essential features such as agentic tool use, and its multilingual capabilities are extended
-to support Spanish in addition to English and Korean.
+to support Spanish in addition to English and Korean.
The EXAONE 4.0 model series consists of two sizes: a mid-size **32B** model optimized for high performance, and a small-size **1.2B** model designed for on-device applications.
@@ -33,7 +33,6 @@ For more details, please refer to our [technical report](https://huggingface.co/
All model weights including quantized versions are available at [Huggingface Collections](https://huggingface.co/collections/LGAI-EXAONE/exaone-40-686b2e0069800c835ed48375).
-
## Model Details
### Model Specifications
@@ -57,7 +56,6 @@ All model weights including quantized versions are available at [Huggingface Col
| Tied word embedding | False | True |
| Knowledge cut-off | Nov. 2024 | Nov. 2024 |
-
## Usage tips
### Non-reasoning mode
@@ -206,4 +204,4 @@ print(tokenizer.decode(output[0]))
## Exaone4ForQuestionAnswering
[[autodoc]] Exaone4ForQuestionAnswering
- - forward
\ No newline at end of file
+ - forward
diff --git a/docs/source/en/model_doc/falcon3.md b/docs/source/en/model_doc/falcon3.md
index 368a5457ab6d..3d79a4e225dd 100644
--- a/docs/source/en/model_doc/falcon3.md
+++ b/docs/source/en/model_doc/falcon3.md
@@ -30,5 +30,6 @@ Depth up-scaling for improved reasoning: Building on recent studies on the effec
Knowledge distillation for better tiny models: To provide compact and efficient alternatives, we developed Falcon3-1B-Base and Falcon3-3B-Base by leveraging pruning and knowledge distillation techniques, using less than 100GT of curated high-quality data, thereby redefining pre-training efficiency.
## Resources
+
- [Blog post](https://huggingface.co/blog/falcon3)
- [Models on Huggingface](https://huggingface.co/collections/tiiuae/falcon3-67605ae03578be86e4e87026)
diff --git a/docs/source/en/model_doc/falcon_h1.md b/docs/source/en/model_doc/falcon_h1.md
index 981c00bd626b..48a647cd3797 100644
--- a/docs/source/en/model_doc/falcon_h1.md
+++ b/docs/source/en/model_doc/falcon_h1.md
@@ -21,7 +21,6 @@ The [FalconH1](https://huggingface.co/blog/tiiuae/falcon-h1) model was developed
This model was contributed by [DhiyaEddine](https://huggingface.co/DhiyaEddine), [ybelkada](https://huggingface.co/ybelkada), [JingweiZuo](https://huggingface.co/JingweiZuo), [IlyasChahed](https://huggingface.co/IChahed), and [MaksimVelikanov](https://huggingface.co/yellowvm).
The original code can be found [here](https://github.com/tiiuae/Falcon-H1).
-
## FalconH1Config
| Model | Depth | Dim | Attn Heads | KV | Mamba Heads | d_head | d_state | Ctx Len |
@@ -33,8 +32,6 @@ The original code can be found [here](https://github.com/tiiuae/Falcon-H1).
| H1 7B | 44 | 3072 | 12 | 2 | 24 | 128 / 128 | 256 | 256K |
| H1 34B | 72 | 5120 | 20 | 4 | 32 | 128 / 128 | 256 | 256K |
-
-
[[autodoc]] FalconH1Config
-*This model was released on 2025-07-09 and added to Hugging Face Transformers on 2025-09-15.*
+*This model was released on 2025-07-09 and added to Hugging Face Transformers on 2025-09-18.*
-
## GPTBigCodeConfig
[[autodoc]] GPTBigCodeConfig
diff --git a/docs/source/en/model_doc/gpt_neo.md b/docs/source/en/model_doc/gpt_neo.md
index f3de04d0e550..b0d13cf780b3 100644
--- a/docs/source/en/model_doc/gpt_neo.md
+++ b/docs/source/en/model_doc/gpt_neo.md
@@ -22,12 +22,10 @@ rendered properly in your Markdown viewer.
-
## GPT-Neo
[GPT-Neo](https://zenodo.org/records/5297715) is an open-source alternative to GPT-2 and GPT-3 models, built with Mesh TensorFlow for TPUs. GPT-Neo uses local attention in every other layer for more efficiency. It is trained on the [Pile](https://huggingface.co/datasets/EleutherAI/pile), a diverse dataset consisting of 22 smaller high-quality datasets. The original github repository can be found [here](https://github.com/EleutherAI/gpt-neo/tree/v1.1)
-
You can find all the original GPT-Neo checkpoints under the [EleutherAI](https://huggingface.co/EleutherAI?search_models=gpt-neo) organization.
> [!TIP]
@@ -45,6 +43,7 @@ from transformers import pipeline
pipeline = pipeline(task="text-generation", model="EleutherAI/gpt-neo-1.3B", dtype=torch.float16, device=0)
pipeline("Hello, I'm a language model")
```
+
-
## Using Scaled Dot Product Attention (SDPA)
PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
@@ -162,7 +160,6 @@ following speedups during training and inference.
| 4 | 1024 | 11.765 | 11.303 | 4.09 | 2558.96 | 2546.04 | 0.508 |
| 4 | 2048 | 19.568 | 17.735 | 10.33 | 4175.5 | 4165.26 | 0.246 |
-
## Resources
- [Causal language modeling task guide](../tasks/language_modeling)
diff --git a/docs/source/en/model_doc/gpt_neox_japanese.md b/docs/source/en/model_doc/gpt_neox_japanese.md
index 7b22484b9a76..bf786f7561d4 100644
--- a/docs/source/en/model_doc/gpt_neox_japanese.md
+++ b/docs/source/en/model_doc/gpt_neox_japanese.md
@@ -27,8 +27,6 @@ rendered properly in your Markdown viewer.
GPT-NeoX-Japanese, a Japanese language model based on [GPT-NeoX](./gpt_neox).
Japanese uses three types of characters (hiragana, katakana, kanji) and has a huge vocabulary. This model uses [BPEEncoder V2](https://github.com/tanreinama/Japanese-BPEEncoder_V2), a sub-word tokenizer to handle the different characters.
-
-
The model also removes some bias parameters for better performance.
You can find all the original GPT-NeoX-Japanese checkpoints under the [ABEJA](https://huggingface.co/abeja/models?search=gpt-neo-x) organization.
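To exercise the tokenizer and model end to end, a rough sketch with a text-generation pipeline; the checkpoint name is an assumption based on the ABEJA collection linked above:

```python
# Minimal sketch; "abeja/gpt-neox-japanese-2.7b" is assumed to be one of the ABEJA checkpoints.
from transformers import pipeline

generator = pipeline("text-generation", model="abeja/gpt-neox-japanese-2.7b")
print(generator("人とAIが協調するためには、", max_new_tokens=20)[0]["generated_text"])
```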
diff --git a/docs/source/en/model_doc/gpt_oss.md b/docs/source/en/model_doc/gpt_oss.md
index 136ebeb29570..60741d8473fa 100644
--- a/docs/source/en/model_doc/gpt_oss.md
+++ b/docs/source/en/model_doc/gpt_oss.md
@@ -35,13 +35,14 @@ The abstract from the paper is the following:
*
-
> Click on the I-JEPA models in the right sidebar for more examples of how to apply I-JEPA to different image representation and classification tasks.
The example below demonstrates how to extract image features with [`Pipeline`] or the [`AutoModel`] class.
@@ -88,10 +86,10 @@ embed_2 = infer(image_2)
similarity = cosine_similarity(embed_1, embed_2)
print(similarity)
```
+
-
Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to 4-bits.
@@ -142,4 +140,3 @@ print(similarity)
[[autodoc]] IJepaForImageClassification
- forward
-
diff --git a/docs/source/en/model_doc/informer.md b/docs/source/en/model_doc/informer.md
index 7e79399cbc57..a9cea0f09cab 100644
--- a/docs/source/en/model_doc/informer.md
+++ b/docs/source/en/model_doc/informer.md
@@ -52,4 +52,4 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
## InformerForPrediction
[[autodoc]] InformerForPrediction
- - forward
\ No newline at end of file
+ - forward
diff --git a/docs/source/en/model_doc/instructblip.md b/docs/source/en/model_doc/instructblip.md
index b0669f1c065f..ac84a71d887e 100644
--- a/docs/source/en/model_doc/instructblip.md
+++ b/docs/source/en/model_doc/instructblip.md
@@ -59,7 +59,6 @@ The attributes can be obtained from model config, as `model.config.num_query_tok
[[autodoc]] InstructBlipProcessor
-
## InstructBlipVisionModel
[[autodoc]] InstructBlipVisionModel
@@ -78,4 +77,4 @@ The attributes can be obtained from model config, as `model.config.num_query_tok
[[autodoc]] InstructBlipForConditionalGeneration
- forward
- - generate
\ No newline at end of file
+ - generate
diff --git a/docs/source/en/model_doc/instructblipvideo.md b/docs/source/en/model_doc/instructblipvideo.md
index e34b454a1237..d4d868b7f90e 100644
--- a/docs/source/en/model_doc/instructblipvideo.md
+++ b/docs/source/en/model_doc/instructblipvideo.md
@@ -59,7 +59,6 @@ The attributes can be obtained from model config, as `model.config.num_query_tok
[[autodoc]] InstructBlipVideoProcessor
-
## InstructBlipVideoVideoProcessor
[[autodoc]] InstructBlipVideoVideoProcessor
diff --git a/docs/source/en/model_doc/internvl.md b/docs/source/en/model_doc/internvl.md
index bf760fdbdd71..7e9fea7f4f20 100644
--- a/docs/source/en/model_doc/internvl.md
+++ b/docs/source/en/model_doc/internvl.md
@@ -15,7 +15,6 @@ rendered properly in your Markdown viewer.
-->
*This model was released on 2025-04-14 and added to Hugging Face Transformers on 2025-04-18.*
-
Overview of InternVL3 models architecture, which is the same as InternVL2.5. Taken from the original checkpoint.
-
-
Comparison of InternVL3 performance on OpenCompass against other SOTA VLLMs. Taken from the original checkpoint.
-
-
This model was contributed by [yonigozlan](https://huggingface.co/yonigozlan).
The original code can be found [here](https://github.com/OpenGVLab/InternVL).
@@ -75,6 +69,7 @@ Here is how you can use the `image-text-to-text` pipeline to perform inference w
>>> outputs[0]["generated_text"]
'The image showcases a vibrant scene of nature, featuring several flowers and a bee. \n\n1. **Foreground Flowers**: \n - The primary focus is on a large, pink cosmos flower with a prominent yellow center. The petals are soft and slightly r'
```
+
### Inference on a single image
This example demonstrates how to perform inference on a single image with the InternVL models using chat templates.
@@ -112,7 +107,6 @@ This example demonstrates how to perform inference on a single image with the In
### Text-only generation
This example shows how to generate text using the InternVL model without providing any image input.
-
```python
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
>>> import torch
diff --git a/docs/source/en/model_doc/jamba.md b/docs/source/en/model_doc/jamba.md
index 0aa06b16e90f..f85d08c5f64d 100644
--- a/docs/source/en/model_doc/jamba.md
+++ b/docs/source/en/model_doc/jamba.md
@@ -75,6 +75,7 @@ input_ids = tokenizer("Plants create energy through a process known as", return_
output = model.generate(**input_ids, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
```
+
+
MSN architecture. Taken from the original paper.
-This model was contributed by [sayakpaul](https://huggingface.co/sayakpaul). The original code can be found [here](https://github.com/facebookresearch/msn).
+This model was contributed by [sayakpaul](https://huggingface.co/sayakpaul). The original code can be found [here](https://github.com/facebookresearch/msn).
## Usage tips
@@ -58,16 +58,16 @@ labels when fine-tuned.
### Using Scaled Dot Product Attention (SDPA)
-PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
-encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
-[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
+PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
+encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
+[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
page for more information.
-SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
+SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
-```
+```py
from transformers import ViTMSNForImageClassification
model = ViTMSNForImageClassification.from_pretrained("facebook/vit-msn-base", attn_implementation="sdpa", dtype=torch.float16)
...
diff --git a/docs/source/en/model_doc/vitdet.md b/docs/source/en/model_doc/vitdet.md
index 539ae5e376c8..a1250f1bb909 100644
--- a/docs/source/en/model_doc/vitdet.md
+++ b/docs/source/en/model_doc/vitdet.md
@@ -40,4 +40,4 @@ Tips:
## VitDetModel
[[autodoc]] VitDetModel
- - forward
\ No newline at end of file
+ - forward
diff --git a/docs/source/en/model_doc/vitmatte.md b/docs/source/en/model_doc/vitmatte.md
index 519a2dd74d66..0584df8e67a5 100644
--- a/docs/source/en/model_doc/vitmatte.md
+++ b/docs/source/en/model_doc/vitmatte.md
@@ -62,4 +62,4 @@ The model expects both the image and trimap (concatenated) as input. Use [`ViTMa
## VitMatteForImageMatting
[[autodoc]] VitMatteForImageMatting
- - forward
\ No newline at end of file
+ - forward
diff --git a/docs/source/en/model_doc/vits.md b/docs/source/en/model_doc/vits.md
index 2c1777b77f18..96dc93892470 100644
--- a/docs/source/en/model_doc/vits.md
+++ b/docs/source/en/model_doc/vits.md
@@ -149,11 +149,10 @@ Audio(waveform, rate=model.config.sampling_rate)
## VitsTokenizer
[[autodoc]] VitsTokenizer
-- __call__
-- save_vocabulary
+ - __call__
+ - save_vocabulary
## VitsModel
[[autodoc]] VitsModel
-- forward
-
+ - forward
diff --git a/docs/source/en/model_doc/vivit.md b/docs/source/en/model_doc/vivit.md
index 041f80f61ae6..fc127fa6f595 100644
--- a/docs/source/en/model_doc/vivit.md
+++ b/docs/source/en/model_doc/vivit.md
@@ -32,16 +32,16 @@ This model was contributed by [jegormeister](https://huggingface.co/jegormeister
### Using Scaled Dot Product Attention (SDPA)
-PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
-encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
-[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
+PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
+encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
+[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
page for more information.
-SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
+SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
-```
+```py
from transformers import VivitModel
model = VivitModel.from_pretrained("google/vivit-b-16x2-kinetics400", attn_implementation="sdpa", dtype=torch.float16)
...
@@ -56,8 +56,6 @@ On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32`
|---------------------:|-------------:|----------:|--------------:|----------------------:|---------------------:|-----------------:|
| 100 | 1 | True | 7.122 | 2575.28 | 5932.54 | 130.364 |
-
-
### Inference
| num_batches | batch_size | is cuda | is half | Speedup (%) | Mem eager (MB) | Mem BT (MB) | Mem saved (%) |
|---------------|--------------|-----------|-----------|---------------|------------------|---------------|-----------------|
@@ -65,7 +63,6 @@ On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32`
| 20 | 2 | True | False | 17.146 | 1234.75 | 447.175 | 176.122 |
| 20 | 4 | True | False | 18.093 | 2275.82 | 709.864 | 220.6 |
| 20 | 8 | True | False | 19.284 | 4358.19 | 1233.24 | 253.393 |
-
## VivitConfig
diff --git a/docs/source/en/model_doc/vjepa2.md b/docs/source/en/model_doc/vjepa2.md
index 93960f051893..049c7ff98f21 100644
--- a/docs/source/en/model_doc/vjepa2.md
+++ b/docs/source/en/model_doc/vjepa2.md
@@ -15,7 +15,6 @@ rendered properly in your Markdown viewer.
-->
*This model was released on 2025-06-11 and added to Hugging Face Transformers on 2025-06-11.*
-
YOLOS architecture. Taken from the original paper.
-
> [!TIP]
> This model was contributed by [nielsr](https://huggingface.co/nielsr).
> Click on the YOLOS models in the right sidebar for more examples of how to apply YOLOS to different object detection tasks.
@@ -98,8 +96,8 @@ for score, label, box in zip(filtered_scores, filtered_labels, pixel_boxes):
-
## Notes
+
- Use [`YolosImageProcessor`] for preparing images (and optional targets) for the model. Contrary to [DETR](./detr), YOLOS doesn't require a `pixel_mask`.
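For illustration, a minimal sketch of that difference; the checkpoint name and the sample image URL are assumptions, not this page's own example:

```py
# Minimal sketch; the checkpoint name and the sample image URL are assumptions.
import requests
from PIL import Image
from transformers import YolosImageProcessor

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

image_processor = YolosImageProcessor.from_pretrained("hustvl/yolos-tiny")
inputs = image_processor(images=image, return_tensors="pt")
print(inputs.keys())  # only pixel_values -- no pixel_mask, unlike DETR
```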
## Resources
diff --git a/docs/source/en/model_doc/yoso.md b/docs/source/en/model_doc/yoso.md
index f07e5aba0827..211b0dcf8091 100644
--- a/docs/source/en/model_doc/yoso.md
+++ b/docs/source/en/model_doc/yoso.md
@@ -26,20 +26,20 @@ rendered properly in your Markdown viewer.
The YOSO model was proposed in [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://huggingface.co/papers/2111.09714)
by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh. YOSO approximates standard softmax self-attention
via a Bernoulli sampling scheme based on Locality Sensitive Hashing (LSH). In principle, all the Bernoulli random variables can be sampled with
-a single hash.
+a single hash.
The abstract from the paper is the following:
-*Transformer-based models are widely used in natural language processing (NLP). Central to the transformer model is
-the self-attention mechanism, which captures the interactions of token pairs in the input sequences and depends quadratically
-on the sequence length. Training such models on longer sequences is expensive. In this paper, we show that a Bernoulli sampling
-attention mechanism based on Locality Sensitive Hashing (LSH), decreases the quadratic complexity of such models to linear.
-We bypass the quadratic cost by considering self-attention as a sum of individual tokens associated with Bernoulli random
-variables that can, in principle, be sampled at once by a single hash (although in practice, this number may be a small constant).
-This leads to an efficient sampling scheme to estimate self-attention which relies on specific modifications of
-LSH (to enable deployment on GPU architectures). We evaluate our algorithm on the GLUE benchmark with standard 512 sequence
-length where we see favorable performance relative to a standard pretrained Transformer. On the Long Range Arena (LRA) benchmark,
-for evaluating performance on long sequences, our method achieves results consistent with softmax self-attention but with sizable
+*Transformer-based models are widely used in natural language processing (NLP). Central to the transformer model is
+the self-attention mechanism, which captures the interactions of token pairs in the input sequences and depends quadratically
+on the sequence length. Training such models on longer sequences is expensive. In this paper, we show that a Bernoulli sampling
+attention mechanism based on Locality Sensitive Hashing (LSH), decreases the quadratic complexity of such models to linear.
+We bypass the quadratic cost by considering self-attention as a sum of individual tokens associated with Bernoulli random
+variables that can, in principle, be sampled at once by a single hash (although in practice, this number may be a small constant).
+This leads to an efficient sampling scheme to estimate self-attention which relies on specific modifications of
+LSH (to enable deployment on GPU architectures). We evaluate our algorithm on the GLUE benchmark with standard 512 sequence
+length where we see favorable performance relative to a standard pretrained Transformer. On the Long Range Arena (LRA) benchmark,
+for evaluating performance on long sequences, our method achieves results consistent with softmax self-attention but with sizable
speed-ups and memory savings and often outperforms other efficient self-attention methods. Our code is available at this https URL*
This model was contributed by [novice03](https://huggingface.co/novice03). The original code can be found [here](https://github.com/mlpen/YOSO).
@@ -50,12 +50,12 @@ This model was contributed by [novice03](https://huggingface.co/novice03). The o
in parallel on a GPU.
- The kernels provide a `fast_hash` function, which approximates the random projections of the queries and keys using the Fast Hadamard Transform. Using these
hash codes, the `lsh_cumulation` function approximates self-attention via LSH-based Bernoulli sampling.
-- To use the custom kernels, the user should set `config.use_expectation = False`. To ensure that the kernels are compiled successfully,
-the user must install the correct version of PyTorch and cudatoolkit. By default, `config.use_expectation = True`, which uses YOSO-E and
+- To use the custom kernels, the user should set `config.use_expectation = False`. To ensure that the kernels are compiled successfully,
+the user must install the correct version of PyTorch and cudatoolkit. By default, `config.use_expectation = True`, which uses YOSO-E and
does not require compiling CUDA kernels.
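As a rough sketch of opting into the kernels (the checkpoint name is an assumption; compilation still requires a matching PyTorch and cudatoolkit install):

```python
# Minimal sketch; "uw-madison/yoso-4096" is assumed to be a YOSO checkpoint on the Hub.
from transformers import YosoConfig, YosoForSequenceClassification

config = YosoConfig.from_pretrained("uw-madison/yoso-4096")
config.use_expectation = False  # switch from YOSO-E to the LSH-based Bernoulli sampling kernels

model = YosoForSequenceClassification.from_pretrained("uw-madison/yoso-4096", config=config)
```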
+alt="drawing" width="600"/>
YOSO Attention Algorithm. Taken from the original paper.
@@ -99,4 +99,4 @@ alt="drawing" width="600"/>
## YosoForQuestionAnswering
[[autodoc]] YosoForQuestionAnswering
- - forward
\ No newline at end of file
+ - forward
diff --git a/docs/source/en/model_doc/zamba.md b/docs/source/en/model_doc/zamba.md
index bb9740807703..847f0532e2a7 100644
--- a/docs/source/en/model_doc/zamba.md
+++ b/docs/source/en/model_doc/zamba.md
@@ -24,7 +24,6 @@ rendered properly in your Markdown viewer.
This model was contributed by [pglo](https://huggingface.co/pglo).
-
## Model details
Zamba-7B-v1 is a hybrid between state-space models (Specifically [Mamba](https://github.com/state-spaces/mamba)) and transformer, and was trained using next-token prediction. Zamba uses a shared transformer layer after every 6 mamba blocks. It uses the [Mistral v0.1 tokenizer](https://huggingface.co/mistralai/Mistral-7B-v0.1). We came to this architecture after a series of ablations at small scales. Zamba-7B-v1 was pre-trained on 1T tokens of text and code data.
@@ -33,23 +32,24 @@ Zamba-7B-v1 is a hybrid between state-space models (Specifically [Mamba](https:/
## Quick start
-
### Prerequisites
Zamba requires you use `transformers` version 4.46.0 or higher:
+
```bash
pip install transformers>=4.45.0
```
In order to run optimized Mamba implementations, you first need to install `mamba-ssm` and `causal-conv1d`:
+
```bash
pip install mamba-ssm causal-conv1d>=1.2.0
```
+
You also have to have the model on a CUDA device.
You can run the model without the optimized Mamba kernels, but it is **not** recommended as it will result in significantly higher latency. In order to do that, you'll need to specify `use_mamba_kernels=False` when loading the model.
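A minimal sketch of that fallback, using the Zamba-7B-v1 checkpoint referenced in the model card section below (expect noticeably slower generation):

```python
# Minimal sketch of the unoptimized fallback path; not recommended when latency matters.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba-7B-v1")
model = AutoModelForCausalLM.from_pretrained("Zyphra/Zamba-7B-v1", use_mamba_kernels=False)
```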
-
## Inference
```python
@@ -66,39 +66,33 @@ outputs = model.generate(**input_ids, max_new_tokens=100)
print(tokenizer.decode(outputs[0]))
```
-
## Model card
The model cards can be found at:
-* [Zamba-7B](https://huggingface.co/Zyphra/Zamba-7B-v1)
+* [Zamba-7B](https://huggingface.co/Zyphra/Zamba-7B-v1)
## Issues
For issues with model output, or community discussion, please use the Hugging Face community [forum](https://huggingface.co/Zyphra/Zamba-7B-v1/discussions)
-
## License
The model weights are open-sourced via an Apache 2.0 license.
-
## ZambaConfig
[[autodoc]] ZambaConfig
-
## ZambaModel
[[autodoc]] ZambaModel
- forward
-
## ZambaForCausalLM
[[autodoc]] ZambaForCausalLM
- forward
-
## ZambaForSequenceClassification
[[autodoc]] transformers.ZambaForSequenceClassification
diff --git a/docs/source/en/model_doc/zamba2.md b/docs/source/en/model_doc/zamba2.md
index 1d911a59c277..c9d3d3d1de75 100644
--- a/docs/source/en/model_doc/zamba2.md
+++ b/docs/source/en/model_doc/zamba2.md
@@ -26,19 +26,18 @@ rendered properly in your Markdown viewer.
This model was contributed by [pglo](https://huggingface.co/pglo).
-
## Model details
-[Zamba2-1.2B](https://www.zyphra.com/post/zamba2-mini), [Zamba2-2.7B](https://www.zyphra.com/post/zamba2-small) and [Zamba2-7B](https://www.zyphra.com/post/zamba2-7b) are hybrid models combining state-space models (Specifically [Mamba](https://github.com/state-spaces/mamba)) and transformer, and were trained using next-token prediction. Zamba2 uses shared transformer layers after every 6 mamba blocks. It uses the [Mistral v0.1 tokenizer](https://huggingface.co/mistralai/Mistral-7B-v0.1). We came to this architecture after a series of ablations at small scales. Zamba2-1.2B, Zamba2-2.7B and Zamba2-7B were pre-trained on 2T and 3T tokens, respectively.
+[Zamba2-1.2B](https://www.zyphra.com/post/zamba2-mini), [Zamba2-2.7B](https://www.zyphra.com/post/zamba2-small) and [Zamba2-7B](https://www.zyphra.com/post/zamba2-7b) are hybrid models combining state-space models (Specifically [Mamba2](https://github.com/state-spaces/mamba)) and transformer, and were trained using next-token prediction. Zamba2 uses shared transformer layers after every 6 mamba blocks. It uses the [Mistral v0.1 tokenizer](https://huggingface.co/mistralai/Mistral-7B-v0.1). We came to this architecture after a series of ablations at small scales. Zamba2-1.2B, Zamba2-2.7B and Zamba2-7B were pre-trained on 2T and 3T tokens, respectively.
@@ -286,10 +287,10 @@ In the example above we instruct the model to classify the image into a single c
## Image-guided text generation
-For more creative applications, you can use image-guided text generation to generate text based on an image. This can be
-useful to create descriptions of products, ads, descriptions of a scene, etc.
+For more creative applications, you can use image-guided text generation to generate text based on an image. This can be
+useful to create descriptions of products, ads, descriptions of a scene, etc.
-Let's prompt IDEFICS to write a story based on a simple image of a red door:
+Let's prompt IDEFICS to write a story based on a simple image of a red door:
@@ -333,14 +334,14 @@ Looks like IDEFICS noticed the pumpkin on the doorstep and went with a spooky Ha
-
-
-
In the original implementation, the ZoeDepth model performs inference on both the original and flipped images and averages the results. The `post_process_depth_estimation` function can handle this for us if we pass the flipped outputs to the optional `outputs_flipped` argument:
>>> with torch.no_grad():
+>>> with torch.no_grad():
... outputs = model(pixel_values)
... outputs_flipped = model(pixel_values=torch.flip(inputs.pixel_values, dims=[3]))
>>> post_processed_output = image_processor.post_process_depth_estimation(
diff --git a/docs/source/en/tasks/multiple_choice.md b/docs/source/en/tasks/multiple_choice.md
index 3f4c9d4637fb..d35f108ecce5 100644
--- a/docs/source/en/tasks/multiple_choice.md
+++ b/docs/source/en/tasks/multiple_choice.md
@@ -113,6 +113,7 @@ To apply the preprocessing function over the entire dataset, use 🤗 Datasets [
```
To create a batch of examples, it's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length. [`DataCollatorForMultipleChoice`] flattens all the model inputs, applies padding, and then unflattens the results.
+
```py
>>> from transformers import DataCollatorForMultipleChoice
>>> collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
@@ -197,7 +198,6 @@ Once training is completed, share your model to the Hub with the [`~transformers
>>> trainer.push_to_hub()
```
-
For a more in-depth example of how to finetune a model for multiple choice, take a look at the corresponding
diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md
index 394e77104b74..ef2a86190bbc 100644
--- a/docs/source/en/tasks/object_detection.md
+++ b/docs/source/en/tasks/object_detection.md
@@ -121,6 +121,7 @@ To get familiar with the data, explore what the examples look like.
```
The examples in the dataset have the following fields:
+
- `image_id`: the example image id
- `image`: a `PIL.Image.Image` object containing the image
- `width`: width of the image
@@ -171,11 +172,11 @@ To get an even better understanding of the data, visualize an example in the dat
>>> image
```
+
-
To visualize the bounding boxes with associated labels, you can get the labels from the dataset's metadata, specifically
the `category` field.
You'll also want to create dictionaries that map a label id to a label class (`id2label`) and the other way around (`label2id`).
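For instance, a minimal sketch assuming the CPPE-5 dataset loaded earlier in this guide is available as `cppe5`:

```py
>>> # Minimal sketch; assumes `cppe5` is the dataset object loaded earlier in this guide
>>> categories = cppe5["train"].features["objects"].feature["category"].names
>>> id2label = {index: name for index, name in enumerate(categories)}
>>> label2id = {name: index for index, name in id2label.items()}
```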
@@ -216,6 +217,7 @@ Instantiate the image processor from the same checkpoint as the model you want t
```
Before passing the images to the `image_processor`, apply two preprocessing transformations to the dataset:
+
- Augmenting images
- Reformatting annotations to meet DETR expectations
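Roughly, those two steps can look like the sketch below; the specific albumentations transforms and the helper name are illustrative assumptions, not the guide's exact code:

```py
>>> import albumentations as A

>>> # Step 1: augmentations that also update the bounding boxes
>>> transform = A.Compose(
...     [A.HorizontalFlip(p=0.5), A.RandomBrightnessContrast(p=0.5)],
...     bbox_params=A.BboxParams(format="coco", label_fields=["category"]),
... )

>>> # Step 2: wrap each image's objects into the COCO-style dict DETR-like models expect
>>> def format_annotations(image_id, categories, areas, bboxes):
...     return {
...         "image_id": image_id,
...         "annotations": [
...             {"image_id": image_id, "category_id": cat, "iscrowd": 0, "area": area, "bbox": list(box)}
...             for cat, area, box in zip(categories, areas, bboxes)
...         ],
...     }
```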
@@ -505,6 +507,7 @@ The images in this dataset are still quite large, even after resizing. This mean
require at least one GPU.
Training involves the following steps:
+
1. Load the model with [`AutoModelForObjectDetection`] using the same checkpoint as in the preprocessing.
2. Define your training hyperparameters in [`TrainingArguments`].
3. Pass the training arguments to [`Trainer`] along with the model, dataset, image processor, and data collator.
@@ -527,9 +530,10 @@ and `id2label` maps that you created earlier from the dataset's metadata. Additi
In [`TrainingArguments`], use `output_dir` to specify where to save your model, then configure hyperparameters as you see fit. With `num_train_epochs=30`, training takes about 35 minutes on a Google Colab T4 GPU; increase the number of epochs to get better results.
Important notes:
- - Do not remove unused columns because this will drop the image column. Without the image column, you
+
+- Do not remove unused columns because this will drop the image column. Without the image column, you
can't create `pixel_values`. For this reason, set `remove_unused_columns` to `False`.
- - Set `eval_do_concat_batches=False` to get proper evaluation results. Images have different number of target boxes, if batches are concatenated we will not be able to determine which boxes belongs to particular image.
+- Set `eval_do_concat_batches=False` to get proper evaluation results. Images have different numbers of target boxes; if batches are concatenated, we cannot determine which boxes belong to which image. Both settings appear in the sketch below.
If you wish to share your model by pushing to the Hub, set `push_to_hub` to `True` (you must be signed in to Hugging
Face to upload your model).
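Put together, the arguments called out above might look like this sketch (the output directory and hyperparameter values are placeholders):

```py
>>> from transformers import TrainingArguments

>>> training_args = TrainingArguments(
...     output_dir="detr-finetuned-cppe5",  # placeholder name
...     num_train_epochs=30,
...     remove_unused_columns=False,  # keep the image column so pixel_values can be created
...     eval_do_concat_batches=False,  # keep per-image target boxes separate during evaluation
...     push_to_hub=True,  # optional; requires being signed in to the Hugging Face Hub
... )
```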
@@ -576,6 +580,7 @@ Finally, bring everything together, and call [`~transformers.Trainer.train`]:
>>> trainer.train()
```
+
@@ -1487,6 +1492,7 @@ Now that you have finetuned a model, evaluated it, and uploaded it to the Huggin
```
Load model and image processor from the Hugging Face Hub (skip to use already trained in this session):
+
```py
>>> from transformers import infer_device
diff --git a/docs/source/en/tasks/prompting.md b/docs/source/en/tasks/prompting.md
index eb8e61d67aaf..2678792c5f3d 100644
--- a/docs/source/en/tasks/prompting.md
+++ b/docs/source/en/tasks/prompting.md
@@ -80,7 +80,7 @@ This section covers a few prompting techniques.
### Few-shot prompting
-Few-shot prompting improves accuracy and performance by including specific examples of what a model should generate given an input. The explicit examples give the model a better understanding of the task and the output format you’re looking for. Try experimenting with different numbers of examples (2, 4, 8, etc.) to see how it affects performance. The example below provides the model with 1 example (1-shot) of the output format (a date in MM/DD/YYYY format) it should return.
+Few-shot prompting improves accuracy and performance by including specific examples of what a model should generate given an input. The explicit examples give the model a better understanding of the task and the output format you're looking for. Try experimenting with different numbers of examples (2, 4, 8, etc.) to see how it affects performance. The example below provides the model with 1 example (1-shot) of the output format (a date in MM/DD/YYYY format) it should return.
```python
from transformers import pipeline
@@ -127,7 +127,6 @@ for output in outputs:
print(f"Result: {output['generated_text']}")
```
-
While the basic few-shot prompting approach embedded examples within a single text string, the chat template format offers the following benefits.
- The model may have a potentially improved understanding because it can better recognize the pattern and the expected roles of user input and assistant output.
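A rough sketch of the same 1-shot date-extraction idea in chat format; the checkpoint is a placeholder for any chat-tuned model you have access to:

```python
# Minimal sketch; the model name is a placeholder, not this guide's own example.
import torch
from transformers import pipeline

pipe = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.1", dtype=torch.bfloat16)

messages = [
    {"role": "user", "content": "Text: The first human went into space on 12 April 1961. Date (MM/DD/YYYY)?"},
    {"role": "assistant", "content": "04/12/1961"},
    {"role": "user", "content": "Text: The first-ever Olympic Games were held in 776 BC. Date (MM/DD/YYYY)?"},
]
outputs = pipe(messages, max_new_tokens=12, return_full_text=False)
print(outputs[0]["generated_text"])
```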
diff --git a/docs/source/en/tasks/semantic_segmentation.md b/docs/source/en/tasks/semantic_segmentation.md
index 5d3c8e70aa1f..de88a0af6866 100644
--- a/docs/source/en/tasks/semantic_segmentation.md
+++ b/docs/source/en/tasks/semantic_segmentation.md
@@ -23,6 +23,7 @@ rendered properly in your Markdown viewer.
Image segmentation models separate areas corresponding to different areas of interest in an image. These models work by assigning a label to each pixel. There are several types of segmentation: semantic segmentation, instance segmentation, and panoptic segmentation.
In this guide, we will:
+
1. [Take a look at different types of segmentation](#types-of-segmentation).
2. [Have an end-to-end fine-tuning example for semantic segmentation](#fine-tuning-a-model-for-segmentation).
@@ -69,6 +70,7 @@ results
```
The segmentation pipeline output includes a mask for every predicted class.
+
```bash
[{'score': None,
'label': 'road',
@@ -107,6 +109,7 @@ Taking a look at the mask for the car class, we can see every car is classified
```python
results[-1]["mask"]
```
+
@@ -135,11 +138,13 @@ As you can see below, there are multiple cars classified, and there's no classif
'label': 'person',
'mask': }]
```
+
Checking out one of the car masks below.
```python
results[2]["mask"]
```
+
@@ -151,6 +156,7 @@ panoptic_segmentation = pipeline("image-segmentation", "facebook/mask2former-swi
results = panoptic_segmentation(image)
results
```
+
As you can see below, we have more classes. We will later illustrate that every pixel is classified into one of these classes.
```bash
@@ -206,7 +212,6 @@ To see all architectures and checkpoints compatible with this task, we recommend
-
### Load SceneParse150 dataset
Start by loading a smaller subset of the SceneParse150 dataset from the 🤗 Datasets library. This'll give you a chance to experiment and make sure everything works before spending more time training on the full dataset.
@@ -473,7 +478,6 @@ Reload the dataset and load an image for inference.
-
We will now see how to infer without a pipeline. Process the image with an image processor and place the `pixel_values` on a GPU:
```py
@@ -503,7 +507,6 @@ Next, rescale the logits to the original image size:
>>> pred_seg = upsampled_logits.argmax(dim=1)[0]
```
-
To visualize the results, load the [dataset color palette](https://github.com/tensorflow/models/blob/3f1ca33afe3c1631b733ea7e40c294273b9e406d/research/deeplab/utils/get_dataset_colormap.py#L51) as `ade_palette()` that maps each class to their RGB values.
```py
diff --git a/docs/source/en/tasks/summarization.md b/docs/source/en/tasks/summarization.md
index c57097421fbc..b2f2beebc806 100644
--- a/docs/source/en/tasks/summarization.md
+++ b/docs/source/en/tasks/summarization.md
@@ -213,7 +213,6 @@ Once training is completed, share your model to the Hub with the [`~transformers
>>> trainer.push_to_hub()
```
-
For a more in-depth example of how to finetune a model for summarization, take a look at the corresponding
diff --git a/docs/source/en/tasks/token_classification.md b/docs/source/en/tasks/token_classification.md
index 49b0fcf216b8..5096298affd1 100644
--- a/docs/source/en/tasks/token_classification.md
+++ b/docs/source/en/tasks/token_classification.md
@@ -242,7 +242,6 @@ Before you start training your model, create a map of the expected ids to their
... }
```
-
If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)!
@@ -298,7 +297,6 @@ Once training is completed, share your model to the Hub with the [`~transformers
>>> trainer.push_to_hub()
```
-
For a more in-depth example of how to finetune a model for token classification, take a look at the corresponding
diff --git a/docs/source/en/tasks/video_classification.md b/docs/source/en/tasks/video_classification.md
index b387a8320dfc..bae638bd84ed 100644
--- a/docs/source/en/tasks/video_classification.md
+++ b/docs/source/en/tasks/video_classification.md
@@ -363,7 +363,6 @@ Leverage [`Trainer`](https://huggingface.co/docs/transformers/main_classes/train
Most of the training arguments are self-explanatory, but one that is quite important here is `remove_unused_columns=False`. This one will drop any features not used by the model's call function. By default it's `True` because usually it's ideal to drop unused feature columns, making it easier to unpack inputs into the model's call function. But, in this case, you need the unused features ('video' in particular) in order to create `pixel_values` (which is a mandatory key our model expects in its inputs).
-
```py
>>> from transformers import TrainingArguments, Trainer
@@ -477,7 +476,6 @@ The simplest way to try out your fine-tuned model for inference is to use it in
You can also manually replicate the results of the `pipeline` if you'd like.
-
```py
>>> def run_inference(model, video):
... # (num_frames, num_channels, height, width)
diff --git a/docs/source/en/tasks/video_text_to_text.md b/docs/source/en/tasks/video_text_to_text.md
index 0e0191af5884..58ca97e9a56c 100644
--- a/docs/source/en/tasks/video_text_to_text.md
+++ b/docs/source/en/tasks/video_text_to_text.md
@@ -18,13 +18,14 @@ rendered properly in your Markdown viewer.
[[open-in-colab]]
-Video-text-to-text models, also known as video language models or vision language models with video input, are language models that take a video input. These models can tackle various tasks, from video question answering to video captioning.
+Video-text-to-text models, also known as video language models or vision language models with video input, are language models that take a video input. These models can tackle various tasks, from video question answering to video captioning.
-These models have nearly the same architecture as [image-text-to-text](../image_text_to_text) models except for some changes to accept video data, since video data is essentially image frames with temporal dependencies. Some image-text-to-text models take in multiple images, but this alone is inadequate for a model to accept videos. Moreover, video-text-to-text models are often trained with all vision modalities. Each example might have videos, multiple videos, images and multiple images. Some of these models can also take interleaved inputs. For example, you can refer to a specific video inside a string of text by adding a video token in text like "What is happening in this video? `
+
+ English |
+ 简体中文 |
+ 繁體中文 |
+ 한국어 |
+ Español |
+ 日本語 |
+ हिन्दी |
+ Русский |
+ Português |
+ తెలుగు |
+ Français |
+ Deutsch |
+ Italiano |
+ Tiếng Việt |
+ العربية |
+ اردو |
+ বাংলা |
+
+ইনফারেন্স ও ট্রেনিংয়ের জন্য আধুনিকতম (State-of-the-art) প্রি-ট্রেইন্ড মডেলসমূহ
+
+ English |
+ 简体中文 |
+ 繁體中文 |
+ 한국어 |
+ Español |
+ 日本語 |
+ हिन्दी |
+ Русский |
+ Português |
+ తెలుగు |
+ Français |
+ Deutsch |
+ Italiano |
+ Tiếng Việt |
+ العربية |
+ اردو |
+ বাংলা |
+
+Modelli preaddestrati all'avanguardia per l'inferenza e l'addestramento
+
+diff --git a/i18n/README_te.md b/i18n/README_te.md index aee579b52abd..225bd74bb025 100644 --- a/i18n/README_te.md +++ b/i18n/README_te.md @@ -49,9 +49,11 @@ limitations under the License. తెలుగు | Français | Deutsch | + Italiano | Tiếng Việt | العربية | اردو | + বাংলা |
diff --git a/i18n/README_ur.md b/i18n/README_ur.md index bba5988e7717..215191e4cbb2 100644 --- a/i18n/README_ur.md +++ b/i18n/README_ur.md @@ -47,8 +47,10 @@ limitations under the License. తెలుగు | Français | Deutsch | + Italiano | Tiếng Việt | العربية | + বাংলা | اردو | diff --git a/i18n/README_vi.md b/i18n/README_vi.md index f78e3b6d4e9b..3e0146c1ddb0 100644 --- a/i18n/README_vi.md +++ b/i18n/README_vi.md @@ -47,9 +47,11 @@ limitations under the License. తెలుగు | Français | Deutsch | + Italiano | Tiếng việt | العربية | اردو | + বাংলা | diff --git a/i18n/README_zh-hans.md b/i18n/README_zh-hans.md index 8220e403b8b2..4c5859592c89 100644 --- a/i18n/README_zh-hans.md +++ b/i18n/README_zh-hans.md @@ -72,9 +72,11 @@ checkpoint: 检查点 తెలుగు | Français | Deutsch | + Italiano | Tiếng Việt | العربية | اردو | + বাংলা | diff --git a/i18n/README_zh-hant.md b/i18n/README_zh-hant.md index da6ed40910ea..5842e57255c3 100644 --- a/i18n/README_zh-hant.md +++ b/i18n/README_zh-hant.md @@ -84,9 +84,11 @@ user: 使用者 తెలుగు | Français | Deutsch | + Italiano | Tiếng Việt | العربية | اردو | + বাংলা | diff --git a/notebooks/README.md b/notebooks/README.md index 4d31797104f8..aed435878804 100644 --- a/notebooks/README.md +++ b/notebooks/README.md @@ -22,7 +22,6 @@ Also, we would like to list here interesting content created by the community. If you wrote some notebook(s) leveraging 🤗 Transformers and would like to be listed here, please open a Pull Request so it can be included under the Community notebooks. - ## Hugging Face's notebooks 🤗 ### Documentation notebooks @@ -38,7 +37,6 @@ You can open any page of the documentation as a notebook in Colab (there is a bu | [Summary of the tokenizers](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb) | The differences between the tokenizers algorithm |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb)| | [Multilingual models](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb) | How to use the multilingual models of the library |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)| - ### PyTorch Examples #### Natural Language Processing[[pytorch-nlp]] @@ -88,7 +86,6 @@ You can open any page of the documentation as a notebook in Colab (there is a bu | [How to fine-tune a Nucleotide Transformer model](https://github.com/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling.ipynb) | See how to tokenize DNA and fine-tune a large pre-trained DNA "language" model | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling.ipynb) | [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling.ipynb) | | [Fine-tune a Nucleotide Transformer model with LoRA](https://github.com/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling_with_peft.ipynb) | Train even larger DNA models in a memory-efficient way | 
[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling_with_peft.ipynb) | [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling_with_peft.ipynb) | - #### Other modalities[[pytorch-other]] | Notebook | Description | | | @@ -101,7 +98,6 @@ You can open any page of the documentation as a notebook in Colab (there is a bu |:----------|:-------------|:-------------|------:| | [How to export model to ONNX](https://github.com/huggingface/notebooks/blob/main/examples/onnx-export.ipynb)| Highlight how to export and run inference workloads through ONNX | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/onnx-export.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/onnx-export.ipynb)| - ### Optimum notebooks 🤗 [Optimum](https://github.com/huggingface/optimum) is an extension of 🤗 Transformers, providing a set of performance optimization tools enabling maximum efficiency to train and run models on targeted hardwares. diff --git a/pyproject.toml b/pyproject.toml index 5d3a9436eb3f..80983fd49703 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ exclude_lines = [ ] [tool.ruff] -target-version = "py39" +target-version = "py310" line-length = 119 [tool.ruff.lint] @@ -27,7 +27,10 @@ line-length = 119 # UP031: Use format specifiers instead of percent format # UP004: Class `XXX` inherits from `object` # UP028: Checks for for loops that can be replaced with yield from expressions -ignore = ["C901", "E501", "E741", "F402", "F823", "SIM1", "SIM300", "SIM212", "SIM905", "UP009", "UP015", "UP031", "UP028", "UP004"] +# UP045: Use `X | None` for type annotations +# UP007: Use `X | Y` for type annotations +# UP035: temporarily disabled to minimize upgrade changes +ignore = ["C901", "E501", "E741", "F402", "F823", "SIM1", "SIM300", "SIM212", "SIM905", "UP009", "UP015", "UP031", "UP028", "UP004", "UP045", "UP007", "UP035"] # RUF013: Checks for the use of implicit Optional # in type annotations when the default parameter value is None. select = ["C", "E", "F", "I", "W", "RUF013", "PERF102", "PLC1802", "PLC0208", "SIM", "UP"] diff --git a/setup.py b/setup.py index 9f3bb1750597..86891a483ffb 100644 --- a/setup.py +++ b/setup.py @@ -160,7 +160,7 @@ "rhoknp>=1.1.0,<1.3.1", "rjieba", "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1", - "ruff==0.11.2", + "ruff==0.13.1", # `sacrebleu` not used in `transformers`. However, it is needed in several tests, when a test calls # `evaluate.load("sacrebleu")`. This metric is used in the examples that we use to test the `Trainer` with, in the # `Trainer` tests (see references to `run_translation.py`). 
@@ -461,7 +461,7 @@ def run(self): setup( name="transformers", - version="4.57.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.57.1", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)", author_email="transformers@huggingface.co", description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow", diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 2cf1d5970b54..9bc547ddcd38 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -18,7 +18,7 @@ # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names # in the namespace without actually importing anything (and especially none of the backends). -__version__ = "4.57.0.dev0" +__version__ = "4.57.1" from pathlib import Path from typing import TYPE_CHECKING @@ -928,7 +928,6 @@ from .utils import is_torch_npu_available as is_torch_npu_available from .utils import is_torch_xla_available as is_torch_xla_available from .utils import is_torch_xpu_available as is_torch_xpu_available - from .utils import logging as logging # bitsandbytes config from .utils.quantization_config import AqlmConfig as AqlmConfig diff --git a/src/transformers/activations.py b/src/transformers/activations.py index 8bfd517add9f..7642e8aa238a 100644 --- a/src/transformers/activations.py +++ b/src/transformers/activations.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import functools import math from collections import OrderedDict @@ -26,7 +27,8 @@ logger = logging.get_logger(__name__) -class PytorchGELUTanh(nn.Module): +@use_kernel_forward_from_hub("GeluTanh") +class GELUTanh(nn.Module): """ A fast C implementation of the tanh approximation of the GeLU activation function. See https://huggingface.co/papers/1606.08415. @@ -35,8 +37,18 @@ class PytorchGELUTanh(nn.Module): match due to rounding errors. """ + def __init__(self, use_gelu_tanh_python: bool = False): + super().__init__() + if use_gelu_tanh_python: + self.act = self._gelu_tanh_python + else: + self.act = functools.partial(nn.functional.gelu, approximate="tanh") + + def _gelu_tanh_python(self, input: Tensor) -> Tensor: + return input * 0.5 * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0)))) + def forward(self, input: Tensor) -> Tensor: - return nn.functional.gelu(input, approximate="tanh") + return self.act(input) @use_kernel_forward_from_hub("NewGELU") @@ -50,6 +62,7 @@ def forward(self, input: Tensor) -> Tensor: return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0)))) +@use_kernel_forward_from_hub("GeLU") class GELUActivation(nn.Module): """ Original Implementation of the GELU activation function in Google BERT repo when initially created. 
For @@ -72,6 +85,20 @@ def forward(self, input: Tensor) -> Tensor: return self.act(input) +@use_kernel_forward_from_hub("SiLU") +class SiLUActivation(nn.Module): + """ + See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear + Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function + Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated + Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with + later. + """ + + def forward(self, input: Tensor) -> Tensor: + return nn.functional.silu(input) + + @use_kernel_forward_from_hub("FastGELU") class FastGELUActivation(nn.Module): """ @@ -290,7 +317,8 @@ def forward(self, input: Tensor) -> Tensor: "gelu_fast": FastGELUActivation, "gelu_new": NewGELUActivation, "gelu_python": (GELUActivation, {"use_gelu_python": True}), - "gelu_pytorch_tanh": PytorchGELUTanh, + "gelu_pytorch_tanh": GELUTanh, + "gelu_python_tanh": (GELUTanh, {"use_gelu_tanh_python": True}), "gelu_accurate": AccurateGELUActivation, "laplace": LaplaceActivation, "leaky_relu": nn.LeakyReLU, @@ -301,7 +329,7 @@ def forward(self, input: Tensor) -> Tensor: "relu2": ReLUSquaredActivation, "relu6": nn.ReLU6, "sigmoid": nn.Sigmoid, - "silu": nn.SiLU, + "silu": SiLUActivation, "swish": nn.SiLU, "tanh": nn.Tanh, "prelu": nn.PReLU, diff --git a/src/transformers/audio_utils.py b/src/transformers/audio_utils.py index e848f558738c..5de56618014e 100644 --- a/src/transformers/audio_utils.py +++ b/src/transformers/audio_utils.py @@ -23,8 +23,11 @@ import warnings from collections.abc import Sequence from io import BytesIO -from typing import Any, Optional, Union +from typing import TYPE_CHECKING, Any, Optional, Union + +if TYPE_CHECKING: + import torch import numpy as np import requests from packaging import version @@ -51,7 +54,7 @@ if is_torchcodec_available(): TORCHCODEC_VERSION = version.parse(importlib.metadata.version("torchcodec")) -AudioInput = Union[np.ndarray, "torch.Tensor", Sequence[np.ndarray], Sequence["torch.Tensor"]] # noqa: F821 +AudioInput = Union[np.ndarray, "torch.Tensor", Sequence[np.ndarray], Sequence["torch.Tensor"]] def load_audio(audio: Union[str, np.ndarray], sampling_rate=16000, timeout=None) -> np.ndarray: @@ -78,9 +81,7 @@ def load_audio(audio: Union[str, np.ndarray], sampling_rate=16000, timeout=None) audio = load_audio_torchcodec(audio, sampling_rate=sampling_rate) else: audio = load_audio_librosa(audio, sampling_rate=sampling_rate, timeout=timeout) - elif isinstance(audio, np.ndarray): - audio = audio - else: + elif not isinstance(audio, np.ndarray): raise TypeError( "Incorrect format used for `audio`. Should be an url linking to an audio, a local path, or numpy array." ) @@ -318,9 +319,7 @@ def mel_to_hertz(mels: Union[float, np.ndarray], mel_scale: str = "htk") -> Unio return freq -def hertz_to_octave( - freq: Union[float, np.ndarray], tuning: Optional[float] = 0.0, bins_per_octave: Optional[int] = 12 -): +def hertz_to_octave(freq: Union[float, np.ndarray], tuning: float = 0.0, bins_per_octave: int = 12): """ Convert frequency from hertz to fractional octave numbers. Adapted from *librosa*. 
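A short usage sketch for the activations.py changes above (not part of the diff). It assumes this branch is installed and that the registry edited above is exposed through `transformers.activations.ACT2FN`, as in current releases:

import torch
from transformers.activations import ACT2FN

x = torch.randn(4, 8)
gelu_fast = ACT2FN["gelu_pytorch_tanh"]   # GELUTanh -> nn.functional.gelu(..., approximate="tanh")
gelu_py = ACT2FN["gelu_python_tanh"]      # new key: GELUTanh(use_gelu_tanh_python=True)
silu = ACT2FN["silu"]                     # SiLUActivation, now decorated for hub kernels

# The two tanh-approximate GELUs should differ only by float rounding.
print((gelu_fast(x) - gelu_py(x)).abs().max())
print(silu(x).shape)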
@@ -370,7 +369,7 @@ def chroma_filter_bank( tuning: float = 0.0, power: Optional[float] = 2.0, weighting_parameters: Optional[tuple[float, float]] = (5.0, 2.0), - start_at_c_chroma: Optional[bool] = True, + start_at_c_chroma: bool = True, ): """ Creates a chroma filter bank, i.e a linear transformation to project spectrogram bins onto chroma bins. @@ -391,7 +390,7 @@ def chroma_filter_bank( weighting_parameters (`tuple[float, float]`, *optional*, defaults to `(5., 2.)`): If specified, apply a Gaussian weighting parameterized by the first element of the tuple being the center and the second element being the Gaussian half-width. - start_at_c_chroma (`float`, *optional*, defaults to `True`): + start_at_c_chroma (`bool`, *optional*, defaults to `True`): If True, the filter bank will start at the 'C' pitch class. Otherwise, it will start at 'A'. Returns: `np.ndarray` of shape `(num_frequency_bins, num_chroma)` @@ -586,7 +585,7 @@ def window_function( window = np.hamming(length) elif name in ["hann", "hann_window"]: window = np.hanning(length) - elif name in ["povey"]: + elif name == "povey": window = np.power(np.hanning(length), 0.85) else: raise ValueError(f"Unknown window function '{name}'") @@ -627,7 +626,7 @@ def spectrogram( reference: float = 1.0, min_value: float = 1e-10, db_range: Optional[float] = None, - remove_dc_offset: Optional[bool] = None, + remove_dc_offset: bool = False, dtype: np.dtype = np.float32, ) -> np.ndarray: """ @@ -838,7 +837,7 @@ def spectrogram_batch( reference: float = 1.0, min_value: float = 1e-10, db_range: Optional[float] = None, - remove_dc_offset: Optional[bool] = None, + remove_dc_offset: bool = False, dtype: np.dtype = np.float32, ) -> list[np.ndarray]: """ diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py index e6f2645a766e..99beb0b610a1 100644 --- a/src/transformers/cache_utils.py +++ b/src/transformers/cache_utils.py @@ -395,7 +395,12 @@ def update( if not self.is_initialized: self.lazy_initialization(key_states) - cache_position = cache_kwargs.get("cache_position") + # Some old models give None for `cache_position` or even omit passing `cache_kwargs` when used as cross-attention, + # in which case we should copy the whole Layer (key_states.shape[-2] == self.max_cache_len) + cache_position = cache_kwargs.get("cache_position") if cache_kwargs is not None else None + cache_position = ( + cache_position if cache_position is not None else torch.arange(key_states.shape[-2], device=self.device) + ) cumulative_length = self.cumulative_length is_full = cumulative_length >= self.max_cache_len @@ -790,7 +795,7 @@ def early_initialization( for layer in self.layers: layer.lazy_initialization(fake_keys_tensor) - def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + def get_seq_length(self, layer_idx: int = 0) -> int: """Returns the sequence length of the cache for the given layer.""" if layer_idx >= len(self.layers): return 0 @@ -955,17 +960,19 @@ def __init__( layers = [] # If a config is passed, use it to infer the layer types and initialize accordingly if config is not None: - config = config.get_text_config(decoder=True) - sliding_window = getattr(config, "sliding_window", None) or getattr(config, "attention_chunk_size", None) - layer_types = getattr(config, "layer_types", None) + decoder_config = config.get_text_config(decoder=True) + sliding_window = getattr(decoder_config, "sliding_window", None) or getattr( + decoder_config, "attention_chunk_size", None + ) + layer_types = getattr(decoder_config, "layer_types", None) if 
layer_types is None: layer_types = [ "sliding_attention" if sliding_window is not None else "full_attention" - for _ in range(config.num_hidden_layers) + for _ in range(decoder_config.num_hidden_layers) ] # Some models have shared layers thus no cache is needed for them (e.g. Gemma3n) - if hasattr(config, "num_kv_shared_layers"): - layer_types = layer_types[: -config.num_kv_shared_layers] + if hasattr(decoder_config, "num_kv_shared_layers"): + layer_types = layer_types[: -decoder_config.num_kv_shared_layers] for layer_type in layer_types: # From a cache point of view, both sliding and chunked are the same in how they should behave and how many @@ -1286,7 +1293,7 @@ def from_legacy_cache( cache.is_updated[layer_idx] = True return cache - def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + def get_seq_length(self, layer_idx: int = 0) -> int: """Returns the sequence length of the cached states. A layer index can be optionally passed.""" return self.self_attention_cache.get_seq_length(layer_idx) diff --git a/src/transformers/commands/add_new_model_like.py b/src/transformers/commands/add_new_model_like.py index ffff54df93ba..fce524d4a6c0 100644 --- a/src/transformers/commands/add_new_model_like.py +++ b/src/transformers/commands/add_new_model_like.py @@ -755,7 +755,7 @@ def register_subcommand(parser: ArgumentParser): ) add_new_model_like_parser.set_defaults(func=add_new_model_like_command_factory) - def __init__(self, path_to_repo=None, *args): + def __init__(self, path_to_repo=None, **kwargs): ( self.old_model_infos, self.new_lowercase_name, diff --git a/src/transformers/commands/chat.py b/src/transformers/commands/chat.py index 70ee41c0c514..6ddf90164ba7 100644 --- a/src/transformers/commands/chat.py +++ b/src/transformers/commands/chat.py @@ -40,6 +40,12 @@ from transformers.utils import is_rich_available, is_torch_available +try: + import readline # noqa importing this enables GNU readline capabilities +except ImportError: + # some platforms may not support readline: https://docs.python.org/3/library/readline.html + pass + if platform.system() != "Windows": import pwd @@ -53,9 +59,7 @@ from transformers import ( AutoModelForCausalLM, - AutoTokenizer, BitsAndBytesConfig, - GenerationConfig, ) ALLOWED_KEY_CHARS = set(string.ascii_letters + string.whitespace) @@ -437,8 +441,7 @@ def parse_generate_flags(self, generate_flags: list[str]) -> dict: # 2. b. 
strings should be quoted def is_number(s: str) -> bool: # handle negative numbers - if s.startswith("-"): - s = s[1:] + s = s.removeprefix("-") return s.replace(".", "", 1).isdigit() generate_flags_as_dict = {k: f'"{v}"' if not is_number(v) else v for k, v in generate_flags_as_dict.items()} @@ -528,7 +531,7 @@ def parse_eos_tokens( # ----------------------------------------------------------------------------------------------------------------- # Model loading and performance automation methods @staticmethod - def get_quantization_config(model_args: ChatArguments) -> Optional["BitsAndBytesConfig"]: + def get_quantization_config(model_args: ChatArguments) -> Optional[BitsAndBytesConfig]: if model_args.load_in_4bit: quantization_config = BitsAndBytesConfig( load_in_4bit=True, @@ -684,7 +687,6 @@ async def _inner_run(self): model = self.args.model_name_or_path + "@" + self.args.model_revision host = "http://localhost" if self.args.host == "localhost" else self.args.host - client = AsyncInferenceClient(f"{host}:{self.args.port}") args = self.args if args.examples_path is None: @@ -707,48 +709,47 @@ async def _inner_run(self): # Starts the session with a minimal help message at the top, so that a user doesn't get stuck interface.print_help(minimal=True) - while True: - try: - user_input = interface.input() - - # User commands - if user_input.startswith("!"): - # `!exit` is special, it breaks the loop - if user_input == "!exit": - break - else: - chat, valid_command, generation_config, model_kwargs = self.handle_non_exit_user_commands( - user_input=user_input, - args=args, - interface=interface, - examples=examples, - generation_config=generation_config, - model_kwargs=model_kwargs, - chat=chat, - ) - # `!example` sends a user message to the model - if not valid_command or not user_input.startswith("!example"): - continue - else: - chat.append({"role": "user", "content": user_input}) - - stream = client.chat_completion( - chat, - stream=True, - extra_body={ - "generation_config": generation_config.to_json_string(), - "model": model, - }, - ) - model_output = await interface.stream_output(stream) + async with AsyncInferenceClient(f"{host}:{self.args.port}") as client: + while True: + try: + user_input = interface.input() + + # User commands + if user_input.startswith("!"): + # `!exit` is special, it breaks the loop + if user_input == "!exit": + break + else: + chat, valid_command, generation_config, model_kwargs = self.handle_non_exit_user_commands( + user_input=user_input, + args=args, + interface=interface, + examples=examples, + generation_config=generation_config, + model_kwargs=model_kwargs, + chat=chat, + ) + # `!example` sends a user message to the model + if not valid_command or not user_input.startswith("!example"): + continue + else: + chat.append({"role": "user", "content": user_input}) + + stream = client.chat_completion( + chat, + stream=True, + extra_body={ + "generation_config": generation_config.to_json_string(), + "model": model, + }, + ) - chat.append({"role": "assistant", "content": model_output}) + model_output = await interface.stream_output(stream) - except KeyboardInterrupt: - break - finally: - await client.close() + chat.append({"role": "assistant", "content": model_output}) + except KeyboardInterrupt: + break if __name__ == "__main__": diff --git a/src/transformers/commands/env.py b/src/transformers/commands/env.py index 983a858cd952..e15a699e80f6 100644 --- a/src/transformers/commands/env.py +++ b/src/transformers/commands/env.py @@ -14,7 +14,6 @@ import contextlib 
-import importlib.util import io import os import platform @@ -27,7 +26,6 @@ from ..utils import ( is_accelerate_available, is_flax_available, - is_safetensors_available, is_tf_available, is_torch_available, is_torch_hpu_available, @@ -61,18 +59,13 @@ def __init__(self, accelerate_config_file, *args) -> None: self._accelerate_config_file = accelerate_config_file def run(self): - safetensors_version = "not installed" - if is_safetensors_available(): - import safetensors + import safetensors - safetensors_version = safetensors.__version__ - elif importlib.util.find_spec("safetensors") is not None: - import safetensors - - safetensors_version = f"{safetensors.__version__} but is ignored because of PyTorch version too old." + safetensors_version = safetensors.__version__ accelerate_version = "not installed" accelerate_config = accelerate_config_str = "not found" + if is_accelerate_available(): import accelerate from accelerate.commands.config import default_config_file, load_config_from_file diff --git a/src/transformers/commands/serving.py b/src/transformers/commands/serving.py index 33a48aed7e64..970d59c96e74 100644 --- a/src/transformers/commands/serving.py +++ b/src/transformers/commands/serving.py @@ -31,7 +31,7 @@ from dataclasses import dataclass, field from io import BytesIO from threading import Thread -from typing import Optional, Union +from typing import Optional, TypedDict, Union from huggingface_hub import model_info from huggingface_hub.constants import HF_HUB_OFFLINE @@ -141,7 +141,7 @@ class TransformersTranscriptionCreateParams(TranscriptionCreateParamsBase, total file: bytes # Overwritten -- pydantic isn't happy with `typing.IO[bytes]`, present in the original type generation_config: str - stream: Optional[bool] = False + stream: bool = False # Contrarily to OpenAI's output types, input types are `TypedDict`, which don't have built-in validation. response_validator = TypeAdapter(TransformersResponseCreateParamsStreaming) @@ -528,7 +528,7 @@ def __init__(self, args: ServeArguments): def _validate_request( self, request: dict, - schema: "_TypedDictMeta", # noqa: F821 + schema: TypedDict, validator: "TypeAdapter", unused_fields: set, ): @@ -538,7 +538,7 @@ def _validate_request( Args: request (`dict`): The request to validate. - schema (`_TypedDictMeta`): + schema (`TypedDict`): The schema of the request to validate. It is a `TypedDict` definition. validator (`TypeAdapter`): The validator to use to validate the request. Built from `schema`. 
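A side note on the validation pattern in the serving changes above: `TypedDict` request schemas carry no runtime checks of their own, so a pydantic `TypeAdapter` built from the schema performs the validation. A minimal sketch with an illustrative schema, not the actual serving types:

from pydantic import TypeAdapter, ValidationError
from typing_extensions import TypedDict  # pydantic prefers this import on Python < 3.12

class ExampleCreateParams(TypedDict, total=False):  # stand-in for the real request schemas
    stream: bool
    generation_config: str

validator = TypeAdapter(ExampleCreateParams)
validator.validate_python({"stream": False, "generation_config": "{}"})  # accepted
try:
    validator.validate_python({"stream": "definitely"})  # not coercible to bool -> rejected
except ValidationError as err:
    print(err.errors()[0]["type"])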
@@ -600,7 +600,7 @@ def validate_transcription_request(self, request: dict): def build_chat_completion_chunk( self, - request_id: Optional[str] = "", + request_id: str = "", content: Optional[int] = None, model: Optional[str] = None, role: Optional[str] = None, @@ -1026,7 +1026,9 @@ def generate_chat_completion(self, req: dict) -> Generator[str, None, None]: last_kv_cache = None if self.is_continuation(req) and not must_discard_cache: - last_kv_cache = self.last_kv_cache + seq_len = self.last_kv_cache.get_seq_length() + if inputs["input_ids"].shape[-1] > seq_len: + last_kv_cache = self.last_kv_cache generation_kwargs = { **inputs, @@ -1064,8 +1066,7 @@ def generate_with_cache(**kwargs): for result in streamer: # Temporary hack for GPTOS 3: don't emit the final "<|return|>" if "gptoss" in model.config.architectures[0].lower(): - if result.endswith("<|return|>"): - result = result[: -len("<|return|>")] + result = result.removesuffix("<|return|>") results += result # (related to temporary hack 2) @@ -1213,7 +1214,9 @@ def generate_response(self, req: dict) -> Generator[str, None, None]: last_kv_cache = None if self.is_continuation(req) and not must_discard_cache: - last_kv_cache = self.last_kv_cache + seq_len = self.last_kv_cache.get_seq_length() + if inputs["input_ids"].shape[-1] > seq_len: + last_kv_cache = self.last_kv_cache generation_kwargs = { "inputs": inputs, @@ -1321,8 +1324,7 @@ def generate_with_cache(**kwargs): for result in streamer: # Temporary hack for GPTOS 3: don't emit the final "<|return|>" if "gptoss" in model.config.architectures[0].lower(): - if result.endswith("<|return|>"): - result = result[: -len("<|return|>")] + result = result.removesuffix("<|return|>") results += result # (related to temporary hack 2) diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index a9e7c9bff5bc..aa32734ffb38 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -1454,7 +1454,7 @@ def pre_tokenizer(self, replacement, add_prefix_space): class HeliumConverter(SpmConverter): handle_byte_fallback = True - def __init__(self, vocab_file=None, *args): + def __init__(self, vocab_file=None, **kwargs): requires_backends(self, "protobuf") Converter.__init__(self, vocab_file) @@ -1540,6 +1540,54 @@ def post_processor(self): ) +class ParakeetConverter(SpmConverter): + handle_byte_fallback = True + + def __init__(self, vocab_file=None, *args): + self.vocab_file = vocab_file + + requires_backends(self, "protobuf") + + Converter.__init__(self, vocab_file) + + model_pb2 = import_protobuf() + m = model_pb2.ModelProto() + with open(vocab_file, "rb") as f: + m.ParseFromString(f.read()) + self.proto = m + + def tokenizer(self, proto): + vocab_scores = self.vocab(proto) + + _, merges = self.SpmExtractor(self.vocab_file).extract(vocab_scores) + bpe_vocab = {word: i for i, (word, score) in enumerate(vocab_scores)} + tokenizer = Tokenizer( + BPE( + bpe_vocab, + merges, + unk_token=proto.trainer_spec.unk_piece, + fuse_unk=True, + byte_fallback=self.handle_byte_fallback, + dropout=None, + ) + ) + + # Add user defined symbols and control tokens from sentencepiece model + spm_added_tokens = [ + (id, p.piece, p.type == 3 or p.piece in self.special_tokens) + for id, p in enumerate(proto.pieces) + if p.type in [3, 4] + ] + tokenizer.add_tokens( + [ + AddedToken(token, normalized=False, special=special) + for id, token, special in sorted(spm_added_tokens, key=lambda x: x[0]) + ] + ) + + return tokenizer + + # Copied 
from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode def bytes_to_unicode(): """ @@ -1576,10 +1624,8 @@ def __init__( pattern=r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""", add_prefix_space=False, additional_special_tokens=None, - *args, **kwargs, ): - super().__init__(*args) self.vocab_file = vocab_file self.pattern = pattern self.add_prefix_space = add_prefix_space diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index 10ee10e01950..3fa9cb72de9f 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -18,26 +18,25 @@ from collections.abc import Mapping from dataclasses import dataclass from random import randint -from typing import Any, Callable, NewType, Optional, Union +from typing import Any, Callable, Optional, Union import numpy as np -from ..models.bert import BertTokenizer, BertTokenizerFast from ..tokenization_utils_base import PreTrainedTokenizerBase from ..utils import PaddingStrategy -InputDataClass = NewType("InputDataClass", Any) +InputDataClass = Any """ A DataCollator is a function that takes a list of samples from a Dataset and collate them into a batch, as a dictionary of PyTorch/TensorFlow tensors or NumPy arrays. """ -DataCollator = NewType("DataCollator", Callable[[list[InputDataClass]], dict[str, Any]]) +DataCollator = Callable[[list[InputDataClass]], dict[str, Any]] class DataCollatorMixin: - def __call__(self, features, return_tensors=None): + def __call__(self, features, return_tensors: Optional[str] = None): if return_tensors is None: return_tensors = self.return_tensors if return_tensors == "tf": @@ -773,6 +772,8 @@ class DataCollatorForLanguageModeling(DataCollatorMixin): Whether or not to use masked language modeling. If set to `False`, the labels are the same as the inputs with the padding tokens ignored (by setting them to -100). Otherwise, the labels are -100 for non-masked tokens and the value to predict for the masked token. + whole_word_mask (`bool`, *optional*, defaults to `False`): + Whether or not to mask whole words instead of individual tokens. mlm_probability (`float`, *optional*, defaults to 0.15): The probability with which to (randomly) mask tokens in the input, when `mlm` is set to `True`. mask_replace_prob (`float`, *optional*, defaults to 0.8): @@ -824,6 +825,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin): tokenizer: PreTrainedTokenizerBase mlm: bool = True + whole_word_mask: bool = False mlm_probability: Optional[float] = 0.15 mask_replace_prob: float = 0.8 random_replace_prob: float = 0.1 @@ -842,6 +844,11 @@ def __post_init__(self): if self.mlm_probability is None or self.mlm_probability < 0 or self.mlm_probability > 1: raise ValueError("mlm_probability should be between 0 and 1.") self.mlm_probability = float(self.mlm_probability) + elif self.whole_word_mask: + raise ValueError( + "Whole word masking can only be used with mlm=True." + "If you want to use whole word masking, please set mlm=True." 
+ ) if self.mask_replace_prob + self.random_replace_prob > 1: raise ValueError("The sum of mask_replace_prob and random_replace_prob should not exceed 1") if self.mask_replace_prob < 0 or self.mask_replace_prob > 1: @@ -856,6 +863,20 @@ def __post_init__(self): import tensorflow as tf self.tf_mask_tokens = tf.function(self.tf_mask_tokens, jit_compile=True) + if self.whole_word_mask: + if not self.tokenizer.is_fast: + warnings.warn( + "Whole word masking depends on offset mapping which is only natively available with fast tokenizers.", + UserWarning, + ) + + if self.mask_replace_prob < 1: + warnings.warn( + "Random token replacement is not supported with whole word masking. " + "Setting mask_replace_prob to 1.", + ) + self.mask_replace_prob = 1 + self.random_replace_prob = 0 self.generator = None @@ -869,8 +890,6 @@ def get_generator(self, seed): return tf.random.Generator.from_seed(seed) else: - import numpy as np - return np.random.default_rng(seed) def create_rng(self): @@ -1021,9 +1040,10 @@ def torch_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> d # If special token mask has been preprocessed, pop it from the dict. special_tokens_mask = batch.pop("special_tokens_mask", None) + offset_mapping = batch.pop("offset_mapping", None) if self.mlm: batch["input_ids"], batch["labels"] = self.torch_mask_tokens( - batch["input_ids"], special_tokens_mask=special_tokens_mask + batch["input_ids"], special_tokens_mask=special_tokens_mask, offset_mapping=offset_mapping ) else: labels = batch["input_ids"].clone() @@ -1032,9 +1052,11 @@ batch["labels"] = labels return batch - def torch_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> tuple[Any, Any]: + def torch_mask_tokens( + self, inputs: Any, special_tokens_mask: Optional[Any] = None, offset_mapping: Optional[Any] = None + ) -> tuple[Any, Any]: """ - Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. + Prepare masked tokens inputs/labels for masked language modeling. """ import torch @@ -1045,12 +1067,24 @@ def torch_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = No special_tokens_mask = [ self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() ] - special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool) + + if self.whole_word_mask: + word_ids, no_mask_mask = self._calc_word_ids_and_prob_mask( + to_numpy(offset_mapping), to_numpy(special_tokens_mask) + ) + no_mask_mask = torch.tensor(no_mask_mask, dtype=torch.bool) else: - special_tokens_mask = special_tokens_mask.bool() + no_mask_mask = ( + special_tokens_mask.bool() + if isinstance(special_tokens_mask, torch.Tensor) + else torch.tensor(special_tokens_mask, dtype=torch.bool) + ) - probability_matrix.masked_fill_(special_tokens_mask, value=0.0) + probability_matrix.masked_fill_(no_mask_mask, value=0.0) masked_indices = torch.bernoulli(probability_matrix, generator=self.generator).bool() + if self.whole_word_mask: + masked_indices = torch.BoolTensor(self._whole_word_mask(word_ids, masked_indices)) + labels[~masked_indices] = -100 # We only compute loss on masked tokens # mask_replace_prob% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) @@ -1100,9 +1134,10 @@ def numpy_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> d # If special token mask has been preprocessed, pop it from the dict.
special_tokens_mask = batch.pop("special_tokens_mask", None) + offset_mapping = batch.pop("offset_mapping", None) if self.mlm: batch["input_ids"], batch["labels"] = self.numpy_mask_tokens( - batch["input_ids"], special_tokens_mask=special_tokens_mask + batch["input_ids"], special_tokens_mask=special_tokens_mask, offset_mapping=offset_mapping ) else: labels = np.copy(batch["input_ids"]) @@ -1111,9 +1146,14 @@ def numpy_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> d batch["labels"] = labels return batch - def numpy_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> tuple[Any, Any]: + def numpy_mask_tokens( + self, + inputs: Any, + special_tokens_mask: Optional[Any] = None, + offset_mapping: Optional[Any] = None, + ) -> tuple[Any, Any]: """ - Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. + Prepare masked tokens inputs/labels for masked language modeling. """ labels = np.copy(inputs) # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`) @@ -1122,16 +1162,28 @@ def numpy_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = No special_tokens_mask = [ self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() ] - special_tokens_mask = np.array(special_tokens_mask, dtype=bool) + + if self.whole_word_mask: + word_ids, no_mask_mask = self._calc_word_ids_and_prob_mask( + to_numpy(offset_mapping), to_numpy(special_tokens_mask) + ) else: - special_tokens_mask = special_tokens_mask.astype(bool) + no_mask_mask = ( + special_tokens_mask.astype(bool) + if isinstance(special_tokens_mask, np.ndarray) + else np.array(special_tokens_mask, dtype=bool) + ) - probability_matrix[special_tokens_mask] = 0 + probability_matrix[no_mask_mask] = 0 # Numpy doesn't have bernoulli, so we use a binomial with 1 trial if self.generator: masked_indices = self.generator.binomial(1, probability_matrix, size=probability_matrix.shape).astype(bool) else: masked_indices = np.random.binomial(1, probability_matrix, size=probability_matrix.shape).astype(bool) + + if self.whole_word_mask: + masked_indices = self._whole_word_mask(word_ids, masked_indices) + labels[~masked_indices] = -100 # We only compute loss on masked tokens # mask_replace_prob% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) @@ -1176,6 +1228,51 @@ def numpy_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = No # The rest of the time (10% of the time) we keep the masked input tokens unchanged return inputs, labels + @staticmethod + def _calc_word_ids_and_prob_mask( + offsets: np.ndarray[np.ndarray[tuple[int, int]]], special_tokens_mask: np.ndarray[np.ndarray[int]] + ) -> tuple[np.ndarray[np.ndarray[int]], np.ndarray[np.ndarray[int]]]: + """ + Map tokens to word ids and create mask of tokens to not mask. + Tokens that are part of the same word will have the same word id and we will only + set a mask probability for the first token of each word. 
+ """ + + token_starts = offsets[:, :, 0] + token_ends = offsets[:, :, 1] + + prev_token_ends = np.roll(token_ends, 1, axis=1) + prev_token_ends[:, 0] = -1 # First token has no previous token + + prev_token_special = np.roll(special_tokens_mask, 1, axis=1) + prev_token_special[:, 0] = 0 + + # Not special token AND (gap from previous or previous token was special) + special_tokens_mask = special_tokens_mask.astype(bool) + is_new_word = (~special_tokens_mask) & ((token_starts != prev_token_ends) | (prev_token_special == 1)) + + word_ids = np.cumsum(is_new_word, axis=1) + word_ids[special_tokens_mask] = -1 + + prob_mask = ~is_new_word + + return word_ids, prob_mask + + @staticmethod + def _whole_word_mask(word_ids: np.ndarray[np.ndarray[int]], mask: Any) -> Any: + """ + Mask whole words based on word ids and mask. + """ + mask = to_numpy(mask) + + valid_ids = word_ids != -1 + + # Create 3D mask where [batch, token_i, token_j] is True if token_i and token_j are the same word + same_word = (word_ids[:, :, None] == word_ids[:, None, :]) & valid_ids[:, :, None] & valid_ids[:, None, :] + + # For each token, set True if any token in the same word is masked + return np.any(same_word & mask[:, None, :], axis=2) + @dataclass class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling): @@ -1322,6 +1419,8 @@ def _whole_word_mask(self, input_tokens: list[str], max_predictions=512): """ Get 0/1 labels for masked tokens with whole word mask proxy """ + from transformers import BertTokenizer, BertTokenizerFast + if not isinstance(self.tokenizer, (BertTokenizer, BertTokenizerFast)): warnings.warn( "DataCollatorForWholeWordMask is only suitable for BertTokenizer-like tokenizers. " @@ -1539,8 +1638,18 @@ def numpy_mask_tokens(self, inputs: Any, mask_labels: Any) -> tuple[Any, Any]: # The rest of the time ((1-mask_replace_prob-random_replace_prob)% of the time) we keep the masked input tokens unchanged return inputs, labels + def __init__(self, *args, **kwargs): + warnings.warn( + "DataCollatorForWholeWordMask is deprecated and will be removed in a future version, you can now use " + "DataCollatorForLanguageModeling with whole_word_mask=True instead.", + FutureWarning, + ) + super().__init__(*args, **kwargs) + self.mlm = True # Force masked language modeling + self.whole_word_mask = True # Force whole word masking + -def tolist(x): +def tolist(x) -> list[Any]: if isinstance(x, list): return x elif hasattr(x, "numpy"): # Checks for TF tensors without needing the import @@ -1548,6 +1657,15 @@ def tolist(x): return x.tolist() +def to_numpy(x) -> np.ndarray[Any]: + if isinstance(x, np.ndarray): + return x + elif hasattr(x, "detach"): + return x.detach().cpu().numpy() + else: + return np.array(x) + + @dataclass class DataCollatorForSOP(DataCollatorForLanguageModeling): """ diff --git a/src/transformers/data/datasets/squad.py b/src/transformers/data/datasets/squad.py index fdee571e249b..d4f76a51f422 100644 --- a/src/transformers/data/datasets/squad.py +++ b/src/transformers/data/datasets/squad.py @@ -122,9 +122,9 @@ def __init__( tokenizer: PreTrainedTokenizer, limit_length: Optional[int] = None, mode: Union[str, Split] = Split.train, - is_language_sensitive: Optional[bool] = False, + is_language_sensitive: bool = False, cache_dir: Optional[str] = None, - dataset_format: Optional[str] = "pt", + dataset_format: str = "pt", ): self.args = args self.is_language_sensitive = is_language_sensitive diff --git a/src/transformers/data/metrics/squad_metrics.py b/src/transformers/data/metrics/squad_metrics.py index 
f83c23bdeecf..0ffc025b65a0 100644 --- a/src/transformers/data/metrics/squad_metrics.py +++ b/src/transformers/data/metrics/squad_metrics.py @@ -148,7 +148,7 @@ def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans): best_score = cur_score best_thresh = 0.0 qid_list = sorted(na_probs, key=lambda k: na_probs[k]) - for i, qid in enumerate(qid_list): + for qid in qid_list: if qid not in scores: continue if qid_to_has_ans[qid]: diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index ab6e747d14db..42bbcbaabfad 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -68,7 +68,7 @@ "rhoknp": "rhoknp>=1.1.0,<1.3.1", "rjieba": "rjieba", "rouge-score": "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1", - "ruff": "ruff==0.11.2", + "ruff": "ruff==0.13.1", "sacrebleu": "sacrebleu>=1.4.12,<2.0.0", "sacremoses": "sacremoses", "safetensors": "safetensors>=0.4.3", diff --git a/src/transformers/dynamic_module_utils.py b/src/transformers/dynamic_module_utils.py index 5b541c076f63..6d4e2bf48921 100644 --- a/src/transformers/dynamic_module_utils.py +++ b/src/transformers/dynamic_module_utils.py @@ -285,8 +285,7 @@ def get_class_in_module( `typing.Type`: The class looked for. """ name = os.path.normpath(module_path) - if name.endswith(".py"): - name = name[:-3] + name = name.removesuffix(".py") name = name.replace(os.path.sep, ".") module_file: Path = Path(HF_MODULES_CACHE) / module_path with _HF_REMOTE_CODE_LOCK: @@ -396,7 +395,7 @@ def get_cached_module_file( if is_local: submodule = _sanitize_module_name(os.path.basename(pretrained_model_name_or_path)) else: - submodule = _sanitize_module_name(pretrained_model_name_or_path.replace("/", os.path.sep)) + submodule = os.path.sep.join(map(_sanitize_module_name, pretrained_model_name_or_path.split("/"))) cached_module = try_to_load_from_cache( pretrained_model_name_or_path, module_file, cache_dir=cache_dir, revision=_commit_hash, repo_type=repo_type ) diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py index a9ff39b0cc19..e007e72d4761 100644 --- a/src/transformers/feature_extraction_utils.py +++ b/src/transformers/feature_extraction_utils.py @@ -48,13 +48,12 @@ if TYPE_CHECKING: - if is_torch_available(): - import torch # noqa + from .feature_extraction_sequence_utils import SequenceFeatureExtractor logger = logging.get_logger(__name__) -PreTrainedFeatureExtractor = Union["SequenceFeatureExtractor"] # noqa: F821 +PreTrainedFeatureExtractor = Union["SequenceFeatureExtractor"] # type hinting: specifying the type of feature extractor class that inherits from FeatureExtractionMixin SpecificFeatureExtractorType = TypeVar("SpecificFeatureExtractorType", bound="FeatureExtractionMixin") @@ -127,7 +126,7 @@ def _get_is_as_tensor_fns(self, tensor_type: Optional[Union[str, TensorType]] = elif tensor_type == TensorType.PYTORCH: if not is_torch_available(): raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") - import torch # noqa + import torch def as_tensor(value): if isinstance(value, (list, tuple)) and len(value) > 0: @@ -216,7 +215,7 @@ def to(self, *args, **kwargs) -> "BatchFeature": [`BatchFeature`]: The same instance after modification. 
""" requires_backends(self, ["torch"]) - import torch # noqa + import torch device = kwargs.get("device") non_blocking = kwargs.get("non_blocking", False) @@ -563,7 +562,9 @@ def get_feature_extractor_dict( return feature_extractor_dict, kwargs @classmethod - def from_dict(cls, feature_extractor_dict: dict[str, Any], **kwargs) -> PreTrainedFeatureExtractor: + def from_dict( + cls, feature_extractor_dict: dict[str, Any], **kwargs + ) -> Union["FeatureExtractionMixin", tuple["FeatureExtractionMixin", dict[str, Any]]]: """ Instantiates a type of [`~feature_extraction_utils.FeatureExtractionMixin`] from a Python dictionary of parameters. @@ -613,7 +614,7 @@ def to_dict(self) -> dict[str, Any]: return output @classmethod - def from_json_file(cls, json_file: Union[str, os.PathLike]) -> PreTrainedFeatureExtractor: + def from_json_file(cls, json_file: Union[str, os.PathLike]) -> "FeatureExtractionMixin": """ Instantiates a feature extractor of type [`~feature_extraction_utils.FeatureExtractionMixin`] from the path to a JSON file of parameters. diff --git a/src/transformers/generation/beam_search.py b/src/transformers/generation/beam_search.py index ba2820cb437a..8510a02c803a 100644 --- a/src/transformers/generation/beam_search.py +++ b/src/transformers/generation/beam_search.py @@ -165,10 +165,10 @@ def __init__( batch_size: int, num_beams: int, device: torch.device, - length_penalty: Optional[float] = 1.0, - do_early_stopping: Optional[Union[bool, str]] = False, - num_beam_hyps_to_keep: Optional[int] = 1, - num_beam_groups: Optional[int] = 1, + length_penalty: float = 1.0, + do_early_stopping: Union[bool, str] = False, + num_beam_hyps_to_keep: int = 1, + num_beam_groups: int = 1, max_length: Optional[int] = None, ): logger.warning_once( @@ -214,7 +214,7 @@ def __init__( @property def is_done(self) -> bool: - return self._done.all() + return self._done.all().item() def process( self, @@ -225,8 +225,8 @@ def process( pad_token_id: Optional[Union[int, torch.Tensor]] = None, eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None, beam_indices: Optional[torch.LongTensor] = None, - group_index: Optional[int] = 0, - decoder_prompt_len: Optional[int] = 0, + group_index: int = 0, + decoder_prompt_len: int = 0, ) -> dict[str, torch.Tensor]: # add up to the length which the next_scores is calculated on (including decoder prompt) cur_len = input_ids.shape[-1] + 1 @@ -331,7 +331,7 @@ def finalize( pad_token_id: Optional[Union[int, torch.Tensor]] = None, eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None, beam_indices: Optional[torch.LongTensor] = None, - decoder_prompt_len: Optional[int] = 0, + decoder_prompt_len: int = 0, ) -> tuple[torch.LongTensor]: batch_size = len(self._beam_hyps) // self.num_beam_groups @@ -460,9 +460,9 @@ def __init__( num_beams: int, constraints: list[Constraint], device: torch.device, - length_penalty: Optional[float] = 1.0, - do_early_stopping: Optional[Union[bool, str]] = False, - num_beam_hyps_to_keep: Optional[int] = 1, + length_penalty: float = 1.0, + do_early_stopping: Union[bool, str] = False, + num_beam_hyps_to_keep: int = 1, max_length: Optional[int] = None, ): logger.warning_once( @@ -495,7 +495,7 @@ def __init__( @property def is_done(self) -> bool: - return self._done.all() + return self._done.all().item() def make_constraint_states(self, n): return [ConstraintListState([constraint.copy() for constraint in self.constraints]) for _ in range(n)] @@ -515,7 +515,7 @@ def process( pad_token_id: Optional[Union[int, torch.Tensor]] = None, 
eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None, beam_indices: Optional[torch.LongTensor] = None, - decoder_prompt_len: Optional[int] = 0, + decoder_prompt_len: int = 0, ) -> tuple[torch.Tensor]: r""" Args: @@ -804,7 +804,7 @@ def finalize( pad_token_id: Optional[Union[int, torch.Tensor]] = None, eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None, beam_indices: Optional[torch.LongTensor] = None, - decoder_prompt_len: Optional[int] = 0, + decoder_prompt_len: int = 0, ) -> tuple[torch.LongTensor]: batch_size = len(self._beam_hyps) @@ -912,7 +912,9 @@ def finalize( class BeamHypotheses: - def __init__(self, num_beams: int, length_penalty: float, early_stopping: bool, max_length: Optional[int] = None): + def __init__( + self, num_beams: int, length_penalty: float, early_stopping: Union[bool, str], max_length: Optional[int] = None + ): """ Initialize n-best list of hypotheses. """ @@ -963,7 +965,7 @@ def add( else: self.worst_score = min(score, self.worst_score) - def is_done(self, best_sum_logprobs: float, cur_len: int, decoder_prompt_len: Optional[int] = 0) -> bool: + def is_done(self, best_sum_logprobs: float, cur_len: int, decoder_prompt_len: int = 0) -> bool: """ If there are enough hypotheses and that none of the hypotheses being generated can become better than the worst one in the heap, then we are done with this sentence. diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index a455e69d03ff..cd42288aebfa 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -524,7 +524,7 @@ def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, self.assistant_kwargs.pop("attention_mask", None) assistant_output = self.assistant_model.generate(**generation_args, **self.assistant_kwargs) - new_target_ids = self._process_assistant_outputs(input_ids, assistant_output.sequences, assistant_input_ids) + new_target_ids = self._process_assistant_outputs(input_ids, assistant_output.sequences) # Update state self.prev_target_ids_len = input_ids.shape[1] @@ -583,7 +583,7 @@ def _prepare_assistant_input_ids(self, input_ids: torch.LongTensor) -> tuple[tor return assistant_input_ids, remove_from_pkv def _process_assistant_outputs( - self, input_ids: torch.LongTensor, assistant_sequences: torch.LongTensor, assistant_input_ids: torch.LongTensor + self, input_ids: torch.LongTensor, assistant_sequences: torch.LongTensor ) -> torch.LongTensor: """Processes assistant outputs to obtain target input IDs.""" num_prev_assistant = self.prev_assistant_ids.shape[1] diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index 05caed152c6e..98a0d14ade1a 100644 --- a/src/transformers/generation/configuration_utils.py +++ b/src/transformers/generation/configuration_utils.py @@ -1282,11 +1282,11 @@ class WatermarkingConfig(BaseWatermarkingConfig): def __init__( self, - greenlist_ratio: Optional[float] = 0.25, - bias: Optional[float] = 2.0, - hashing_key: Optional[int] = 15485863, - seeding_scheme: Optional[str] = "lefthash", - context_width: Optional[int] = 1, + greenlist_ratio: float = 0.25, + bias: float = 2.0, + hashing_key: int = 15485863, + seeding_scheme: str = "lefthash", + context_width: int = 1, ): self.greenlist_ratio = greenlist_ratio self.bias = bias diff --git a/src/transformers/generation/continuous_batching/cache.py 
b/src/transformers/generation/continuous_batching/cache.py index 05de093f661f..8d6e057be84a 100644 --- a/src/transformers/generation/continuous_batching/cache.py +++ b/src/transformers/generation/continuous_batching/cache.py @@ -79,7 +79,7 @@ class PagedAttentionCache: layer group, and the shape of the cache tensor is `[num_blocks * block_size, num_heads, head_size]`. Grouping layers into groups is useful because when we allocate one block to a group N, the block allocated is the - same for all layers in group N, equivalently it is allocated accross all cache tensors. This allows us to + same for all layers in group N, equivalently it is allocated across all cache tensors. This allows us to efficiently allocate and free blocks, and to efficiently read and write key and value states. For instance, imagine we have 8 blocks of cache and a model with two layer groups: a full-attention group with 3 @@ -349,7 +349,7 @@ class PagedAttentionMemoryHandler: The memory footprint consists of three main components: - Cache memory: the space needed to store the cache tensors: 2 * layer_group_size * [num_pages, page_size] * cache_dtype - - Activation memory: the space temporarly taken by the largest activation during the model forward pass: + - Activation memory: the space temporarily taken by the largest activation during the model forward pass: peak_activation_per_token * max_tokens_per_batch * activation_dtype_size - Static tensors: the space taken by the input/output buffers and metadata tensors for batch processing, sum of: - inputs_ids + outputs_ids + position_ids + logits_indices: 4 * max_tokens_per_batch * int32_size diff --git a/src/transformers/generation/continuous_batching/continuous_api.py b/src/transformers/generation/continuous_batching/continuous_api.py index b00c0a4825c3..0d1801fa163e 100644 --- a/src/transformers/generation/continuous_batching/continuous_api.py +++ b/src/transformers/generation/continuous_batching/continuous_api.py @@ -42,7 +42,56 @@ def build_attention_mask( ) -> None: """Builds an attention mask inplace using the cumulative seqlens of the query and key. If given a sliding window, it will also apply a sliding window mask on top. The attention mask is not boolean, it uses zeroes and -inf (or its - equivalent) so it's more of an attention score bias tensor.""" + equivalent) so it's more of an attention score bias tensor. + The attention mask is a block-diagonal matrix, with each block an attention mask for a single query-key pair. + Each of those block is built from a causal mask and, if there is a sliding window, a sliding window mask. 
+ + An example is represented below, with seqlen_k = 8, seqlen_q = 4 and sliding_window = 6: + + CAUSAL MASK: + + █ █ █ █ █ ░ ░ ░ + █ █ █ █ █ █ ░ ░ + █ █ █ █ █ █ █ ░ + █ █ █ █ █ █ █ █ + + SLIDING WINDOW MASK: + ┌──────────────────────── seqlen_k - seqlen_q - sliding_window = 8 - 4 - 6 = -2 offset to the right + <─┴─> + ░ █ | █ █ █ █ █ █ █ █ + ░ ░ | █ █ █ █ █ █ █ █ + ░ ░ | ░ █ █ █ █ █ █ █ + ░ ░ | ░ ░ █ █ █ █ █ █ + + ATTENTION MASK (sum of causal and sliding window masks): + + █ █ █ █ █ ░ ░ ░ + █ █ █ █ █ █ ░ ░ + ░ █ █ █ █ █ █ ░ + ░ ░ █ █ █ █ █ █ + + Another example with seqlen_k = 5, seqlen_q = 3 and sliding_window = 2: + + CAUSAL MASK: + + █ █ █ ░ ░ + █ █ █ █ ░ + █ █ █ █ █ + + SLIDING WINDOW MASK: + ┌──────────────────────── seqlen_k - seqlen_q - sliding_window = 5 - 3 - 2 = 0 offset to the right + <┴> + | ░ █ █ █ █ + | ░ ░ █ █ █ + | ░ ░ ░ █ █ + + ATTENTION MASK (sum of causal and sliding window masks): + + ░ █ █ ░ ░ + ░ ░ █ █ ░ + ░ ░ ░ █ █ + + """ min_value = torch.finfo(attention_mask.dtype).min for i in range(len(cumulative_seqlens_q) - 1): seqlen_q = cumulative_seqlens_q[i + 1] - cumulative_seqlens_q[i] @@ -63,8 +112,8 @@ def build_attention_mask( masked = torch.triu(minus_inf, diagonal=causal_diagonal) # Apply sliding window mask if needed if sliding_window > 1: - sliding_diagonal = seqlen_k - seqlen_q + sliding_window - masked = torch.tril(masked, diagonal=sliding_diagonal) + sliding_diagonal = seqlen_k - seqlen_q - sliding_window + masked += torch.tril(minus_inf, diagonal=sliding_diagonal) # Replace in attention mask attention_mask[..., query_range, key_range] = masked diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index f63d2246c6a9..7d81501a783d 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -369,7 +369,6 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to if scores.dim() == 3: if self.logits_indices is not None and self.cu_seq_lens_q is not None: - batch_size, seq_len, vocab_size = scores.shape last_positions = self.logits_indices last_scores = scores[0, last_positions, :] @@ -2289,7 +2288,7 @@ def __init__( model, unconditional_ids: Optional[torch.LongTensor] = None, unconditional_attention_mask: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = True, + use_cache: bool = True, ): self.guidance_scale = guidance_scale self.model = model diff --git a/src/transformers/generation/stopping_criteria.py b/src/transformers/generation/stopping_criteria.py index 2b9e57aacd8d..5a013a49723d 100644 --- a/src/transformers/generation/stopping_criteria.py +++ b/src/transformers/generation/stopping_criteria.py @@ -76,9 +76,9 @@ def __init__(self, max_length: int, max_position_embeddings: Optional[int] = Non def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.BoolTensor: cur_len = input_ids.shape[1] is_done = cur_len >= self.max_length - if self.max_position_embeddings is not None and not is_done and cur_len >= self.max_position_embeddings: + if self.max_position_embeddings is not None and not is_done and cur_len > self.max_position_embeddings: logger.warning_once( - "This is a friendly reminder - the current text generation call will exceed the model's predefined " + "This is a friendly reminder - the current text generation call has exceeded the model's predefined " f"maximum length ({self.max_position_embeddings}). 
Depending on the model, you may observe " "exceptions, performance degradation, or nothing at all." ) @@ -249,7 +249,7 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase, stop_strings: Union[str, token_list, token_indices, tokenizer ) - self.maximum_token_len = max([len(stop_string) for stop_string in self.stop_strings]) + self.maximum_token_len = max(len(stop_string) for stop_string in self.stop_strings) self.num_stop_strings = len(self.stop_strings) self.target_lens = torch.tensor([len(stop_string) for stop_string in stop_strings], dtype=torch.int32) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 2e312bcb3c79..f9d58dfdf4f6 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -22,7 +22,6 @@ import torch import torch.distributed as dist -from huggingface_hub import file_exists from packaging import version from torch import nn @@ -414,23 +413,20 @@ def load_custom_generate( Returns: A callable that can be used to generate text. """ - # Does `pretrained_model_name_or_path` have a `custom_generate` subdirectory? If not -> OSError - is_local_code = os.path.exists(pretrained_model_name_or_path) - has_custom_generate_folder = True - if is_local_code: - if not os.path.exists(os.path.join(pretrained_model_name_or_path, "custom_generate/generate.py")): - has_custom_generate_folder = False - else: - if not file_exists(pretrained_model_name_or_path, "custom_generate/generate.py"): - has_custom_generate_folder = False - - if not has_custom_generate_folder: + # Fetches the generate.py file from the model repo. If it doesn't exist, a file in `.no_exist` cache directory + # is created (preventing future hub requests), and an OSError is raised. + try: + module = get_cached_module_file( + pretrained_model_name_or_path, module_file="custom_generate/generate.py", **kwargs + ) + except OSError: raise OSError( f"`{pretrained_model_name_or_path}` does not contain a `custom_generate` subdirectory with a " "`generate.py` file, can't load the custom generate function." ) # Handle opt-in `trust_remote_code` and related exceptions + is_local_code = os.path.exists(pretrained_model_name_or_path) error_message = ( f"The repository `{pretrained_model_name_or_path}` contains custom generation code that will override " "the default `generate` method." 
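For context on the refactored custom-generate loading above, a usage-level sketch (not from the diff): `generate()` can pull a decoding loop from a Hub repository that ships `custom_generate/generate.py`, and with this change a missing file surfaces as the `OSError` raised above instead of a separate existence check. The repo id below is a placeholder, and the `custom_generate`/`trust_remote_code` arguments are assumed to behave as documented for recent transformers releases.

from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tok("Hello", return_tensors="pt")

out = model.generate(
    **inputs,
    max_new_tokens=20,
    custom_generate="your-org/your-custom-generate-repo",  # placeholder Hub repo id
    trust_remote_code=True,  # required opt-in, checked by the surrounding code
)
print(tok.decode(out[0], skip_special_tokens=True))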
@@ -447,9 +443,6 @@ def load_custom_generate( check_python_requirements( pretrained_model_name_or_path, requirements_file="custom_generate/requirements.txt", **kwargs ) - module = get_cached_module_file( - pretrained_model_name_or_path, module_file="custom_generate/generate.py", **kwargs - ) custom_generate_function = get_class_in_module("generate", module) return custom_generate_function @@ -912,7 +905,7 @@ def _prepare_decoder_input_ids_for_generation( self.config.model_type == "vision-encoder-decoder" and "donut" in self.config.encoder.model_type.lower() ): pass - elif self.config.model_type in ["whisper"]: + elif self.config.model_type == "whisper": pass # user input but doesn't start with decoder_start_token_id -> prepend decoder_start_token_id (and adjust # decoder_attention_mask if provided) @@ -1018,7 +1011,7 @@ def _get_candidate_generator( input_ids: torch.LongTensor, inputs_tensor: torch.Tensor, logits_processor: LogitsProcessorList, - model_kwargs: dict, + model_kwargs: dict[str, Any], assistant_model: Optional["PreTrainedModel"] = None, target_tokenizer: Optional["PreTrainedTokenizerBase"] = None, assistant_tokenizer: Optional["PreTrainedTokenizerBase"] = None, @@ -1709,7 +1702,10 @@ def _prepare_generated_length( return generation_config def _prepare_generation_config( - self, generation_config: Optional[GenerationConfig], use_model_defaults: Optional[bool] = None, **kwargs: dict + self, + generation_config: Optional[GenerationConfig], + use_model_defaults: Optional[bool] = None, + **kwargs: Any, ) -> tuple[GenerationConfig, dict]: """ Prepares the base generation config, then applies any generation configuration options from kwargs. This @@ -1903,6 +1899,7 @@ def _supports_default_dynamic_cache(cls) -> bool: "minimax", "xlnet", "lfm2", + "lfm2-vl", ] ) @@ -2136,7 +2133,7 @@ def _tensor_or_none(token, device=None): generation_config._pad_token_tensor = pad_token_tensor generation_config._decoder_start_token_tensor = decoder_start_token_tensor - def _valid_auto_compile_criteria(self, model_kwargs: dict, generation_config: GenerationConfig) -> bool: + def _valid_auto_compile_criteria(self, model_kwargs: dict[str, Any], generation_config: GenerationConfig) -> bool: """ Determines whether to trigger auto-compilation of the model's forward pass at generation time. 
""" @@ -3453,7 +3450,7 @@ def _assisted_decoding( generation_config: GenerationConfig, synced_gpus: bool = False, streamer: Optional["BaseStreamer"] = None, - inputs_tensor: torch.FloatTensor = None, + inputs_tensor: Optional[torch.FloatTensor] = None, assistant_model: Optional["PreTrainedModel"] = None, assistant_tokenizer: Optional["PreTrainedTokenizerBase"] = None, tokenizer: Optional["PreTrainedTokenizerBase"] = None, diff --git a/src/transformers/generation/watermarking.py b/src/transformers/generation/watermarking.py index e62742ef7514..df8a6ef7d483 100644 --- a/src/transformers/generation/watermarking.py +++ b/src/transformers/generation/watermarking.py @@ -24,14 +24,9 @@ from torch.nn import BCELoss from ..modeling_utils import PreTrainedModel -from ..utils import ModelOutput, is_torch_available, logging +from ..utils import ModelOutput, logging from .configuration_utils import PretrainedConfig, WatermarkingConfig - - -if is_torch_available(): - import torch - - from .logits_process import SynthIDTextWatermarkLogitsProcessor, WatermarkLogitsProcessor +from .logits_process import SynthIDTextWatermarkLogitsProcessor, WatermarkLogitsProcessor logger = logging.get_logger(__name__) @@ -43,31 +38,31 @@ class WatermarkDetectorOutput: Outputs of a watermark detector. Args: - num_tokens_scored (np.array of shape (batch_size)): + num_tokens_scored (np.ndarray of shape (batch_size)): Array containing the number of tokens scored for each element in the batch. - num_green_tokens (np.array of shape (batch_size)): + num_green_tokens (np.ndarray of shape (batch_size)): Array containing the number of green tokens for each element in the batch. - green_fraction (np.array of shape (batch_size)): + green_fraction (np.ndarray of shape (batch_size)): Array containing the fraction of green tokens for each element in the batch. - z_score (np.array of shape (batch_size)): + z_score (np.ndarray of shape (batch_size)): Array containing the z-score for each element in the batch. Z-score here shows how many standard deviations away is the green token count in the input text from the expected green token count for machine-generated text. - p_value (np.array of shape (batch_size)): + p_value (np.ndarray of shape (batch_size)): Array containing the p-value for each batch obtained from z-scores. - prediction (np.array of shape (batch_size)), *optional*: + prediction (np.ndarray of shape (batch_size)), *optional*: Array containing boolean predictions whether a text is machine-generated for each element in the batch. - confidence (np.array of shape (batch_size)), *optional*: + confidence (np.ndarray of shape (batch_size)), *optional*: Array containing confidence scores of a text being machine-generated for each element in the batch. 
""" - num_tokens_scored: Optional[np.array] = None - num_green_tokens: Optional[np.array] = None - green_fraction: Optional[np.array] = None - z_score: Optional[np.array] = None - p_value: Optional[np.array] = None - prediction: Optional[np.array] = None - confidence: Optional[np.array] = None + num_tokens_scored: Optional[np.ndarray] = None + num_green_tokens: Optional[np.ndarray] = None + green_fraction: Optional[np.ndarray] = None + z_score: Optional[np.ndarray] = None + p_value: Optional[np.ndarray] = None + prediction: Optional[np.ndarray] = None + confidence: Optional[np.ndarray] = None class WatermarkDetector: @@ -179,7 +174,7 @@ def _score_ngrams_in_passage(self, input_ids: torch.LongTensor): ) return num_tokens_scored_batch, green_token_count_batch - def _compute_z_score(self, green_token_count: np.ndarray, total_num_tokens: np.ndarray) -> np.array: + def _compute_z_score(self, green_token_count: np.ndarray, total_num_tokens: np.ndarray) -> np.ndarray: expected_count = self.greenlist_ratio numer = green_token_count - expected_count * total_num_tokens denom = np.sqrt(total_num_tokens * expected_count * (1 - expected_count)) @@ -195,7 +190,7 @@ def __call__( input_ids: torch.LongTensor, z_threshold: float = 3.0, return_dict: bool = False, - ) -> Union[WatermarkDetectorOutput, np.array]: + ) -> Union[WatermarkDetectorOutput, np.ndarray]: """ Args: input_ids (`torch.LongTensor`): @@ -207,8 +202,8 @@ def __call__( Whether to return `~generation.WatermarkDetectorOutput` or not. If not it will return boolean predictions, ma Return: - [`~generation.WatermarkDetectorOutput`] or `np.array`: A [`~generation.WatermarkDetectorOutput`] - if `return_dict=True` otherwise a `np.array`. + [`~generation.WatermarkDetectorOutput`] or `np.ndarray`: A [`~generation.WatermarkDetectorOutput`] + if `return_dict=True` otherwise a `np.ndarray`. """ diff --git a/src/transformers/hf_argparser.py b/src/transformers/hf_argparser.py index be7f05344faf..503130ea651a 100644 --- a/src/transformers/hf_argparser.py +++ b/src/transformers/hf_argparser.py @@ -262,19 +262,6 @@ def _add_dataclass_arguments(self, dtype: DataClassType): "removing line of `from __future__ import annotations` which opts in Postponed " "Evaluation of Annotations (PEP 563)" ) - except TypeError as ex: - # Remove this block when we drop Python 3.9 support - if sys.version_info[:2] < (3, 10) and "unsupported operand type(s) for |" in str(ex): - python_version = ".".join(map(str, sys.version_info[:3])) - raise RuntimeError( - f"Type resolution failed for {dtype} on Python {python_version}. Try removing " - "line of `from __future__ import annotations` which opts in union types as " - "`X | Y` (PEP 604) via Postponed Evaluation of Annotations (PEP 563). To " - "support Python versions that lower than 3.10, you need to use " - "`typing.Union[X, Y]` instead of `X | Y` and `typing.Optional[X]` instead of " - "`X | None`." 
- ) from ex - raise for field in dataclasses.fields(dtype): if not field.init: diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py index 983fd4e16953..4dfa7f08b0db 100644 --- a/src/transformers/image_processing_utils_fast.py +++ b/src/transformers/image_processing_utils_fast.py @@ -46,7 +46,6 @@ auto_docstring, is_torch_available, is_torchvision_available, - is_torchvision_v2_available, is_vision_available, logging, ) @@ -60,14 +59,13 @@ import torch if is_torchvision_available(): + from torchvision.transforms.v2 import functional as F + from .image_utils import pil_torch_interpolation_mapping + else: pil_torch_interpolation_mapping = None -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -elif is_torchvision_available(): - from torchvision.transforms import functional as F logger = logging.get_logger(__name__) @@ -85,7 +83,7 @@ def validate_fast_preprocess_arguments( size: Optional[SizeDict] = None, interpolation: Optional["F.InterpolationMode"] = None, return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + data_format: ChannelDimension = ChannelDimension.FIRST, ): """ Checks validity of typically used arguments in an `ImageProcessorFast` `preprocess` method. @@ -131,7 +129,7 @@ def max_across_indices(values: Iterable[Any]) -> list[Any]: return [max(values_i) for values_i in zip(*values)] -def get_max_height_width(images: list["torch.Tensor"]) -> tuple[int]: +def get_max_height_width(images: list["torch.Tensor"]) -> tuple[int, ...]: """ Get the maximum height and width across all images in a batch. """ @@ -142,8 +140,8 @@ def get_max_height_width(images: list["torch.Tensor"]) -> tuple[int]: def divide_to_patches( - image: Union[np.array, "torch.Tensor"], patch_size: int -) -> list[Union[np.array, "torch.Tensor"]]: + image: Union[np.ndarray, "torch.Tensor"], patch_size: int +) -> list[Union[np.ndarray, "torch.Tensor"]]: """ Divides an image into patches of a specified size. @@ -248,7 +246,7 @@ def pad( pad_size: SizeDict = None, fill_value: Optional[int] = 0, padding_mode: Optional[str] = "constant", - return_mask: Optional[bool] = False, + return_mask: bool = False, disable_grouping: Optional[bool] = False, **kwargs, ) -> "torch.Tensor": @@ -375,9 +373,13 @@ def compile_friendly_resize( A wrapper around `F.resize` so that it is compatible with torch.compile when the image is a uint8 tensor. 
""" if image.dtype == torch.uint8: - image = image.float() / 255 + # 256 is used on purpose instead of 255 to avoid numerical differences + # see https://github.com/huggingface/transformers/pull/38540#discussion_r2127165652 + image = image.float() / 256 image = F.resize(image, new_size, interpolation=interpolation, antialias=antialias) - image = image * 255 + image = image * 256 + # torch.where is used on purpose instead of torch.clamp to avoid bug in torch.compile + # see https://github.com/huggingface/transformers/pull/38540#discussion_r2126888471 image = torch.where(image > 255, 255, image) image = torch.where(image < 0, 0, image) image = image.round().to(torch.uint8) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index f0aeae8985b7..c0158b7111b7 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -255,7 +255,7 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, in # Logic adapted from torchvision resizing logic: https://github.com/pytorch/vision/blob/511924c1ced4ce0461197e5caa64ce5b9e558aab/torchvision/transforms/functional.py#L366 def get_resize_output_image_size( input_image: np.ndarray, - size: Union[int, tuple[int, int], list[int], tuple[int]], + size: Union[int, tuple[int, int], list[int], tuple[int, ...]], default_to_square: bool = True, max_size: Optional[int] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, @@ -323,7 +323,7 @@ def get_resize_output_image_size( def resize( image: np.ndarray, size: tuple[int, int], - resample: "PILImageResampling" = None, + resample: Optional["PILImageResampling"] = None, reducing_gap: Optional[int] = None, data_format: Optional[ChannelDimension] = None, return_numpy: bool = True, diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 2079c21f3b0c..c5f4d4a3fa4c 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -30,7 +30,6 @@ is_torch_available, is_torch_tensor, is_torchvision_available, - is_torchvision_v2_available, is_vision_available, logging, requires_backends, @@ -56,9 +55,7 @@ from torchvision.transforms import InterpolationMode pil_torch_interpolation_mapping = { - PILImageResampling.NEAREST: InterpolationMode.NEAREST_EXACT - if is_torchvision_v2_available() - else InterpolationMode.NEAREST, + PILImageResampling.NEAREST: InterpolationMode.NEAREST_EXACT, PILImageResampling.BOX: InterpolationMode.BOX, PILImageResampling.BILINEAR: InterpolationMode.BILINEAR, PILImageResampling.HAMMING: InterpolationMode.HAMMING, @@ -78,7 +75,7 @@ ImageInput = Union[ "PIL.Image.Image", np.ndarray, "torch.Tensor", list["PIL.Image.Image"], list[np.ndarray], list["torch.Tensor"] -] # noqa +] class ChannelDimension(ExplicitEnum): @@ -486,9 +483,7 @@ def load_image(image: Union[str, "PIL.Image.Image"], timeout: Optional[float] = raise ValueError( f"Incorrect image source. Must be a valid URL starting with `http://` or `https://`, a valid path to an image file, or a base64 encoded string. Got {image}. Failed with {e}" ) - elif isinstance(image, PIL.Image.Image): - image = image - else: + elif not isinstance(image, PIL.Image.Image): raise TypeError( "Incorrect format used for image. Should be an url linking to an image, a base64 string, a local path, or a PIL image." 
) @@ -579,7 +574,7 @@ class ImageFeatureExtractionMixin: def _ensure_format_supported(self, image): if not isinstance(image, (PIL.Image.Image, np.ndarray)) and not is_torch_tensor(image): raise ValueError( - f"Got type {type(image)} which is not supported, only `PIL.Image.Image`, `np.array` and " + f"Got type {type(image)} which is not supported, only `PIL.Image.Image`, `np.ndarray` and " "`torch.Tensor` are." ) diff --git a/src/transformers/integrations/deepspeed.py b/src/transformers/integrations/deepspeed.py index 47d7a7ffcb5f..c5f9ecc03b53 100644 --- a/src/transformers/integrations/deepspeed.py +++ b/src/transformers/integrations/deepspeed.py @@ -130,58 +130,11 @@ def fill_match(self, ds_key_long, hf_val, hf_key=None, must_match=True): fill_only = partialmethod(fill_match, must_match=False) - def override_training_args_from_deepspeed(self, args): - """ - Override TrainingArguments based on DeepSpeed config values to ensure compatibility. - - This method ensures that the DeepSpeed config takes precedence over TrainingArguments - defaults when there are conflicts, particularly for mixed precision settings. - - Args: - args: TrainingArguments object to potentially modify - """ - # Check precision settings in DeepSpeed config and override TrainingArguments accordingly - # Only override defaults, not explicit user settings - - # Check if user explicitly set precision options (we assume defaults are False) - user_set_fp16 = args.fp16 is True - user_set_bf16 = args.bf16 is True - - if self.is_true("fp16.enabled"): - # DeepSpeed config explicitly enables fp16 - if not user_set_fp16 and not user_set_bf16: - # User didn't explicitly set either, so apply DeepSpeed config - args.fp16 = True - args.bf16 = False - elif user_set_bf16 and not user_set_fp16: - # User explicitly chose bf16, but DeepSpeed config wants fp16 - # This is a potential conflict - let user choice win but log a warning - pass # Keep user's bf16=True, fp16=False - elif self.is_true("bf16.enabled"): - # DeepSpeed config explicitly enables bf16 - if not user_set_fp16 and not user_set_bf16: - # User didn't explicitly set either, so apply DeepSpeed config - args.bf16 = True - args.fp16 = False - elif user_set_fp16 and not user_set_bf16: - # User explicitly chose fp16, but DeepSpeed config wants bf16 - # This is a potential conflict - let user choice win but log a warning - pass # Keep user's fp16=True, bf16=False - elif self.is_false("fp16.enabled") and self.is_false("bf16.enabled"): - # Both are explicitly disabled in DeepSpeed config - if not user_set_fp16 and not user_set_bf16: - # User didn't explicitly set either, so apply DeepSpeed config (fp32) - args.fp16 = False - args.bf16 = False - def trainer_config_process(self, args, auto_find_batch_size=False): """ Adjust the config with `TrainingArguments` values. This stage is run during `TrainingArguments` object creation. 
""" - # First, override TrainingArguments based on DeepSpeed config to ensure compatibility - self.override_training_args_from_deepspeed(args) - # DeepSpeed does: # train_batch_size = world_size * train_micro_batch_size_per_gpu * gradient_accumulation_steps train_batch_size = args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps @@ -268,17 +221,20 @@ def trainer_config_finalize(self, args, model, num_training_steps): hidden_size_auto_keys = [x for x in hidden_size_based_keys if self.is_auto(x)] if len(hidden_size_auto_keys) > 0: - if hasattr(model.config, "hidden_size"): - hidden_size = model.config.hidden_size - elif hasattr(model.config, "hidden_sizes"): - # if there are many hidden sizes pick the largest one - hidden_size = max(model.config.hidden_sizes) - elif hasattr(model.config, "text_config") and hasattr(model.config.text_config, "hidden_size"): - hidden_size = model.config.text_config.hidden_size - elif hasattr(model.config, "text_config") and hasattr(model.config.text_config, "hidden_sizes"): - # if there are many hidden sizes pick the largest one - hidden_size = max(model.config.text_config.hidden_sizes) - else: + hidden_size = None + if hasattr(model, "config"): + if hasattr(model.config, "hidden_size"): + hidden_size = model.config.hidden_size + elif hasattr(model.config, "hidden_sizes"): + # if there are many hidden sizes pick the largest one + hidden_size = max(model.config.hidden_sizes) + elif hasattr(model.config, "text_config") and hasattr(model.config.text_config, "hidden_size"): + hidden_size = model.config.text_config.hidden_size + elif hasattr(model.config, "text_config") and hasattr(model.config.text_config, "hidden_sizes"): + # if there are many hidden sizes pick the largest one + hidden_size = max(model.config.text_config.hidden_sizes) + + if hidden_size is None: raise ValueError( "The model's config file has neither `hidden_size` nor `hidden_sizes` entry, " "therefore it's not possible to automatically fill out the following `auto` entries " @@ -416,7 +372,7 @@ def deepspeed_optim_sched(trainer, hf_deepspeed_config, args, num_training_steps optimizer = None if "optimizer" in config: - if args.adafactor: + if args.optim == "adafactor": raise ValueError( "--adafactor was passed, but also found `optimizer` configured in the DeepSpeed config. " "Only one optimizer can be configured." diff --git a/src/transformers/integrations/flash_paged.py b/src/transformers/integrations/flash_paged.py index 329fab4c9323..1d1db72a7605 100644 --- a/src/transformers/integrations/flash_paged.py +++ b/src/transformers/integrations/flash_paged.py @@ -6,11 +6,21 @@ from ..utils import is_flash_attn_2_available +# For some reason, if we dont assign the function to a variable here, it will be garbage collected try: if is_flash_attn_2_available(): from flash_attn import flash_attn_varlen_func # noqa: F401 -except Exception: - pass + + FLASH_ATTN_VARLEN_FUNC = flash_attn_varlen_func + else: + raise RuntimeError( + "Flash Attention 2 is not installed. 
Please refer to https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install it" + ) +except Exception as e: + msg = repr(e) + + def FLASH_ATTN_VARLEN_FUNC(*args, **kwargs): + raise Exception(f"flash_attn_varlen_func is not available: {msg}") def paged_attention_forward( @@ -58,14 +68,13 @@ def paged_attention_forward( # Retrieve the cumulative sequence lengths for the current layer if isinstance(cu_seq_lens_k, dict): - cu_seq_lens_k = cu_seq_lens_k[layer_type].clone() + cu_seq_lens_k = cu_seq_lens_k[layer_type] max_seqlen_k = max_seqlen_k[layer_type] - else: - cu_seq_lens_k = cu_seq_lens_k.clone() - max_seqlen_k = max_seqlen_k if implementation is not None and hasattr(implementation, "flash_attn_varlen_func"): flash_attn_varlen_func = implementation.flash_attn_varlen_func + else: + flash_attn_varlen_func = FLASH_ATTN_VARLEN_FUNC custom_kwargs = {"s_aux": kwargs.get("s_aux")} if "s_aux" in kwargs else {} diff --git a/src/transformers/integrations/flex_attention.py b/src/transformers/integrations/flex_attention.py index 85ddc433e67a..2701936685dd 100644 --- a/src/transformers/integrations/flex_attention.py +++ b/src/transformers/integrations/flex_attention.py @@ -36,7 +36,7 @@ if is_torch_flex_attn_available(): - from torch.nn.attention.flex_attention import _DEFAULT_SPARSE_BLOCK_SIZE as flex_default_block_size # noqa: N811 + from torch.nn.attention.flex_attention import _DEFAULT_SPARSE_BLOCK_SIZE as flex_default_block_size from torch.nn.attention.flex_attention import BlockMask, create_block_mask, flex_attention @@ -272,12 +272,9 @@ def score_mod(score, batch_idx, head_idx, q_idx, kv_idx): score = score + score_mask[batch_idx][0][q_idx][kv_idx] if head_mask is not None: score = score + head_mask[batch_idx][head_idx][0][0] - if s_aux is not None: - logits_max = torch.max(score, dim=-1, keepdim=True).values - sinks = torch.exp(s_aux - logits_max) - unnormalized_scores = torch.exp(score - logits_max) - normalizer = unnormalized_scores.sum(dim=-1, keepdim=True) + sinks - score = unnormalized_scores / normalizer + # Note: attention sinks cannot be correctly implemented in score_mod + # because it requires operating on the full attention matrix before softmax. + # ==> this is done after flex attention return score enable_gqa = True @@ -293,6 +290,11 @@ def score_mod(score, batch_idx, head_idx, q_idx, kv_idx): # On CPU we must skip returning LSE due to a runtime issue; elsewhere, follow PyTorch API and return it return_lse = query.device.type != "cpu" + if not return_lse and s_aux is not None: + raise ValueError( + "Attention sinks cannot be run on CPU with flex attention. Please switch to a different device, e.g. 
CUDA" + ) + flex_attention_output = compile_friendly_flex_attention( query, key, @@ -311,6 +313,21 @@ def score_mod(score, batch_idx, head_idx, q_idx, kv_idx): if return_lse: attention_output, lse = flex_attention_output # type: ignore[misc] lse = lse.to(value.dtype) + + if s_aux is not None: + # Apply attention sinks by renormalizing using LSE + batch_size, num_heads, seq_len_q, _ = attention_output.shape # batch, num_heads, seq_len, head_dim + sinks = s_aux.view(1, -1, 1, 1).expand(batch_size, num_heads, seq_len_q, 1) + + # We need to compute the normalization that includes the sinks + # since log(sum(exp(scores))) = lse, exp(log(sum(exp(scores)))) = exp(lse) + # NB: log(sum(exp(scores)) + exp(sink)) = log(exp(lse) + exp(sink)) + lse_expanded = lse.unsqueeze(-1) # [batch, num_heads, seq_len, 1] + combined_lse = torch.logsumexp(torch.cat([lse_expanded, sinks], dim=-1), dim=-1, keepdim=True) + + # Use new_norm / old_norm = exp(combined_lse - lse) to compute renorm and apply + renorm_factor = torch.exp(lse_expanded - combined_lse) + attention_output = attention_output * renorm_factor else: attention_output = flex_attention_output # type: ignore[assignment] lse = None diff --git a/src/transformers/integrations/fp_quant.py b/src/transformers/integrations/fp_quant.py index 89ebac7004ee..0ac441e36f93 100644 --- a/src/transformers/integrations/fp_quant.py +++ b/src/transformers/integrations/fp_quant.py @@ -28,6 +28,8 @@ def adapt_fp_quant_config(config: FPQuantConfig): if config.forward_dtype == "mxfp4": forward_dtype = FPQuantDtype.MXFP4 + elif config.forward_dtype == "nvfp4": + forward_dtype = FPQuantDtype.NVFP4 else: raise ValueError(f"Unsupported forward dtype: {config.forward_dtype}") @@ -43,5 +45,6 @@ def adapt_fp_quant_config(config: FPQuantConfig): store_master_weights=config.store_master_weights, hadamard_group_size=config.hadamard_group_size, pseudoquantization=config.pseudoquantization, + transform_init=config.transform_init, modules_to_not_convert=config.modules_to_not_convert, ) diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py index 703fd0156365..d5600050188f 100644 --- a/src/transformers/integrations/ggml.py +++ b/src/transformers/integrations/ggml.py @@ -90,6 +90,19 @@ "expert_count": "num_experts", "expert_used_count": "num_experts_per_tok", }, + "lfm2": { + "context_length": "max_position_embeddings", + "block_count": "num_hidden_layers", + "feed_forward_length": "intermediate_size", + "embedding_length": "hidden_size", + "rope.dimension_count": None, + "rope.freq_base": "rope_theta", + "attention.head_count": "num_attention_heads", + "attention.head_count_kv": "num_key_value_heads", + "attention.layer_norm_rms_epsilon": "rms_norm_eps", + "vocab_size": "vocab_size", + "shortconv.l_cache": "conv_L_cache", + }, "qwen3": { "context_length": "max_position_embeddings", "block_count": "num_hidden_layers", @@ -316,11 +329,11 @@ def _gguf_parse_value(_value, data_type): _value = int(_value[0]) elif data_type in [6, 12]: _value = float(_value[0]) - elif data_type in [7]: + elif data_type == 7: _value = bool(_value[0]) - elif data_type in [8]: + elif data_type == 8: _value = array("B", list(_value)).tobytes().decode() - elif data_type in [9]: + elif data_type == 9: _value = _gguf_parse_value(_value, array_data_type) return _value diff --git a/src/transformers/integrations/hub_kernels.py b/src/transformers/integrations/hub_kernels.py index 5be21e2f9a51..6bf8dbcc0219 100644 --- a/src/transformers/integrations/hub_kernels.py +++ 
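A tiny numerical check (made-up shapes, single query row) of the sink renormalization added to the flex-attention path above: scaling the ordinary attention output by exp(lse - logsumexp([lse, sink])) matches a softmax that includes the sink as an extra logit whose value vector contributes nothing.

import torch

torch.manual_seed(0)
scores = torch.randn(5)       # attention logits for one query row
values = torch.randn(5, 4)    # matching value vectors
sink = torch.tensor(0.7)      # per-head sink logit (s_aux)

plain_out = torch.softmax(scores, dim=-1) @ values
lse = torch.logsumexp(scores, dim=-1)

# Reference: treat the sink as an extra key whose value vector is all zeros.
ref_weights = torch.softmax(torch.cat([scores, sink[None]]), dim=-1)
ref_out = ref_weights[:-1] @ values

# Renormalization used above: new_norm / old_norm = exp(lse - combined_lse).
renorm = torch.exp(lse - torch.logsumexp(torch.stack([lse, sink]), dim=-1))
assert torch.allclose(plain_out * renorm, ref_out, atol=1e-6)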
b/src/transformers/integrations/hub_kernels.py @@ -111,6 +111,27 @@ ) } }, + "SiLU": { + "cuda": { + Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository( + repo_id="kernels-community/activation", layer_name="Silu", version=">=0.1.0" + ) + } + }, + "GeLU": { + "cuda": { + Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository( + repo_id="kernels-community/activation", layer_name="Gelu", version=">=0.1.0" + ) + } + }, + "GeluTanh": { + "cuda": { + Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository( + repo_id="kernels-community/activation", layer_name="GeluTanh", version=">=0.1.0" + ) + } + }, } register_kernel_mapping(_KERNEL_MAPPING) @@ -152,7 +173,10 @@ def load_and_register_kernel(attn_implementation: str) -> None: if not is_kernel(attn_implementation): return if not _kernels_available: - raise ImportError("`kernels` is not installed. Please install it with `pip install kernels`.") + raise ImportError( + "`kernels` is either not installed or uses an incompatible version. " + "Please install the latest version with `pip install -U kernels`." + ) # Need to be imported here as otherwise we have a circular import in `modeling_utils` from ..masking_utils import ALL_MASK_ATTENTION_FUNCTIONS @@ -188,7 +212,7 @@ def load_and_register_kernel(attn_implementation: str) -> None: if attention_wrapper is None: attention_wrapper = flash_attention_forward kernel_function = partial(attention_wrapper, implementation=kernel) - lazy_import_flash_attention(kernel) + lazy_import_flash_attention(kernel, force_import=True) elif kernel_name is not None: kernel_function = getattr(kernel, kernel_name) # Register the kernel as a valid attention diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index 5ef1123b8fce..b81d47831b6b 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -547,8 +547,6 @@ def run_hp_search_sigopt(trainer, n_trials: int, direction: str, **kwargs) -> Be def run_hp_search_wandb(trainer, n_trials: int, direction: str, **kwargs) -> BestRun: - from ..integrations import is_wandb_available - if not is_wandb_available(): raise ImportError("This function needs wandb installed: `pip install wandb`") import wandb @@ -686,7 +684,7 @@ def __init__(self, tb_writer=None): ) if has_tensorboard: try: - from torch.utils.tensorboard import SummaryWriter # noqa: F401 + from torch.utils.tensorboard import SummaryWriter self._SummaryWriter = SummaryWriter except ImportError: @@ -1092,19 +1090,28 @@ def setup(self, args, state, model, **kwargs): """ Setup the optional Trackio integration. - To customize the setup you can also override the following environment variables: - - Environment: - - **TRACKIO_PROJECT** (`str`, *optional*, defaults to `"huggingface"`): - The name of the project (can be an existing project to continue tracking or a new project to start tracking - from scratch). - - **TRACKIO_SPACE_ID** (`str`, *optional*, defaults to `None`): - If set, the project will be logged to a Hugging Face Space instead of a local directory. Should be a - complete Space name like `"username/reponame"` or `"orgname/reponame"`, or just `"reponame" in which case - the Space will be created in the currently-logged-in Hugging Face user's namespace. If the Space does not - exist, it will be created. If the Space already exists, the project will be logged to it. 
+ To customize the setup you can also set the arguments `project`, `trackio_space_id` and `hub_private_repo` in + [`TrainingArguments`]. Please refer to the docstring of for more details. """ if state.is_world_process_zero: + if os.getenv("TRACKIO_PROJECT"): + logger.warning( + "The `TRACKIO_PROJECT` environment variable is deprecated and will be removed in a future " + "version. Use TrainingArguments.project instead." + ) + project = os.getenv("TRACKIO_PROJECT") + else: + project = args.project + + if os.getenv("TRACKIO_SPACE_ID"): + logger.warning( + "The `TRACKIO_SPACE_ID` environment variable is deprecated and will be removed in a future " + "version. Use TrainingArguments.trackio_space_id instead." + ) + space_id = os.getenv("TRACKIO_SPACE_ID") + else: + space_id = args.trackio_space_id + combined_dict = {**args.to_dict()} if hasattr(model, "config") and model.config is not None: @@ -1115,10 +1122,11 @@ def setup(self, args, state, model, **kwargs): combined_dict = {**{"peft_config": peft_config}, **combined_dict} self._trackio.init( - project=os.getenv("TRACKIO_PROJECT", "huggingface"), + project=project, name=args.run_name, - space_id=os.getenv("TRACKIO_SPACE_ID", None), + space_id=space_id, resume="allow", + private=args.hub_private_repo, ) # Add config parameters (run may have been created manually) diff --git a/src/transformers/integrations/mistral.py b/src/transformers/integrations/mistral.py index 78172329277e..cdf237645fc1 100644 --- a/src/transformers/integrations/mistral.py +++ b/src/transformers/integrations/mistral.py @@ -16,10 +16,8 @@ def __init__( pattern=r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""", add_prefix_space=False, additional_special_tokens=None, - *args, **kwargs, ): - super().__init__(*args) self.vocab = vocab self.pattern = pattern self.add_prefix_space = add_prefix_space diff --git a/src/transformers/integrations/mxfp4.py b/src/transformers/integrations/mxfp4.py index c40b202c54e8..6a6ce1db17e7 100644 --- a/src/transformers/integrations/mxfp4.py +++ b/src/transformers/integrations/mxfp4.py @@ -23,6 +23,7 @@ from accelerate import init_empty_weights import re +from contextlib import contextmanager logger = logging.get_logger(__name__) @@ -47,6 +48,28 @@ ] +@contextmanager +def on_device(dev): + if is_torch_available(): + import torch + + if isinstance(dev, torch.Tensor): + dev = dev.device + elif isinstance(dev, str): + dev = torch.device(dev) + dev_type = getattr(dev, "type", None) + if dev_type == "cuda": + with torch.cuda.device(dev): + yield + return + if dev_type == "xpu" and hasattr(torch, "xpu"): + with torch.xpu.device(dev): + yield + return + # other: CPU + yield + + # Copied from GPT_OSS repo and vllm def quantize_to_mxfp4(w, triton_kernels_hub): downcast_to_mxfp_torch = triton_kernels_hub.numerics_details.mxfp.downcast_to_mxfp_torch @@ -173,7 +196,7 @@ def forward(self, hidden_states: torch.Tensor, routing_data, gather_idx, scatter ) swiglu_fn = triton_kernels_hub.swiglu.swiglu_fn - with torch.cuda.device(hidden_states.device): + with on_device(hidden_states.device): act = FusedActivation(FnSpecs("swiglu", swiglu_fn, ("alpha", "limit")), (self.alpha, self.limit), 2) intermediate_cache1 = matmul_ogs( @@ -214,7 +237,7 @@ def routing_torch_dist( triton_kernels_hub.routing.compute_expt_data_torch, ) - with torch.cuda.device(logits.device): + with on_device(logits.device): world_size = torch.distributed.get_world_size() rank = int(os.environ.get("LOCAL_RANK", "0")) 
replace_value = -1 @@ -281,7 +304,7 @@ def mlp_forward(self, hidden_states): hidden_states = hidden_states.reshape(-1, self.router.hidden_dim) router_logits = nn.functional.linear(hidden_states, self.router.weight, self.router.bias) - with torch.cuda.device(router_logits.device): + with on_device(router_logits.device): routing_data, gather_idx, scatter_idx = routing(router_logits, self.router.top_k) routed_out = self.experts(hidden_states, routing_data, gather_idx, scatter_idx) @@ -320,7 +343,6 @@ def dequantize(module, param_name, param_value, target_device, dq_param_name, ** to_contiguous, rank, device_mesh, - set_param=False, ) blocks_attr = f"{proj}_blocks" scales_attr = f"{proj}_scales" @@ -376,7 +398,7 @@ def load_and_swizzle_mxfp4(module, param_name, param_value, target_device, trito target_device = "cuda" blocks = blocks.to(target_device).contiguous() scales = scales.to(target_device).contiguous() - with torch.cuda.device(target_device): + with on_device(target_device): triton_weight_tensor, weight_scale = swizzle_mxfp4( blocks.transpose(-2, -1), scales.transpose(-2, -1), triton_kernels_hub ) diff --git a/src/transformers/integrations/peft.py b/src/transformers/integrations/peft.py index 87dd6cffc2fa..22261eecad0b 100644 --- a/src/transformers/integrations/peft.py +++ b/src/transformers/integrations/peft.py @@ -15,7 +15,6 @@ import importlib import inspect import re -import warnings from typing import Any, Optional, Union from packaging import version @@ -70,14 +69,9 @@ class PeftAdapterMixin: more details about adapters and injecting them on a transformer-based model, check out the documentation of PEFT library: https://huggingface.co/docs/peft/index - Currently supported PEFT methods are all non-prefix tuning methods. Below is the list of supported PEFT methods - that anyone can load, train and run with this mixin class: - - Low Rank Adapters (LoRA): https://huggingface.co/docs/peft/conceptual_guides/lora - - IA3: https://huggingface.co/docs/peft/conceptual_guides/ia3 - - AdaLora: https://huggingface.co/papers/2303.10512 - - Other PEFT models such as prompt tuning, prompt learning are out of scope as these adapters are not "injectable" - into a torch module. For using these methods, please refer to the usage guide of PEFT library. + Currently supported PEFT methods are all non-prompt learning methods (LoRA, IA³, etc.). Other PEFT models such as + prompt tuning, prompt learning are out of scope as these adapters are not "injectable" into a torch module. For + using these methods, please refer to the usage guide of PEFT library. With this mixin, if the correct PEFT version is installed, it is possible to: @@ -96,7 +90,7 @@ def load_adapter( adapter_name: Optional[str] = None, revision: Optional[str] = None, token: Optional[str] = None, - device_map: Optional[str] = "auto", + device_map: str = "auto", max_memory: Optional[str] = None, offload_folder: Optional[str] = None, offload_index: Optional[int] = None, @@ -110,24 +104,21 @@ def load_adapter( Load adapter weights from file or remote Hub folder. If you are not familiar with adapters and PEFT methods, we invite you to read more about them on PEFT official documentation: https://huggingface.co/docs/peft - Requires peft as a backend to load the adapter weights. + Requires PEFT to be installed as a backend to load the adapter weights. Args: peft_model_id (`str`, *optional*): The identifier of the model to look for on the Hub, or a local path to the saved adapter config file and adapter weights. 
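As a short usage illustration for `load_adapter` (requires PEFT to be installed; the model and adapter repository ids below are placeholders, not real checkpoints):

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("some-org/base-model")      # placeholder id
model.load_adapter("some-org/base-model-lora", adapter_name="my_lora")   # placeholder adapter repo
model.set_adapter("my_lora")  # activate the adapter that was just loaded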
adapter_name (`str`, *optional*): - The adapter name to use. If not set, will use the default adapter. + The adapter name to use. If not set, will use the name "default". revision (`str`, *optional*, defaults to `"main"`): The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any identifier allowed by git. -projection_layer", - r"(?P\b(vision_model|model\.vision_model)\b.*\.)norm(?=\.|\s|$)": r"\glayer_norm", - r"(?P\b(vision_model|model\.vision_model)\b.*\.)norm1(?=\.|\s|$)": r"\glayer_norm1", - r"(?P\b(vision_model|model\.vision_model)\b.*\.)norm2(?=\.|\s|$)": r"\glayer_norm2", - r"\bvision_model\.vision_tower\.attn_pool\.[^\s$]*": None, - # VQ Model - r"gen_vision_model": "model.vqmodel", - r"(?P\b(gen_vision_model|model\.vqmodel)\b.*\.)decoder\.conv_blocks(?=\.|\s|$)": r"\gdecoder.up", - r"(?P\b(gen_vision_model|model\.vqmodel)\b.*\.)encoder\.conv_blocks(?=\.|\s|$)": r"\gencoder.down", - r"(?P\b(gen_vision_model|model\.vqmodel)\b.*\.)res(?=\.|\s|$)": r"\gblock", - r"(?P\b(gen_vision_model|model\.vqmodel)\b.*\.)mid\.0(?=\.|\s|$)": r"\gmid.block_1", - r"(?P\b(gen_vision_model|model\.vqmodel)\b.*\.)mid\.1(?=\.|\s|$)": r"\gmid.attn_1", - r"(?P\b(gen_vision_model|model\.vqmodel)\b.*\.)mid\.2(?=\.|\s|$)": r"\gmid.block_2", - # Aligner Modules - r"(gen_aligner)\.layers\.0": r"model.generation_aligner.fc1", - r"(gen_aligner)\.layers\.2": r"model.generation_aligner.hidden_layers.0", - r"(?']%}" - "{%set i=0%}" - "{%for message in messages%}" - "{%if message['role']|lower=='user'%}" - "<|User|>: " - "{%elif message['role']|lower=='assistant'%}" - "<|Assistant|>:{%if not (loop.last and not add_generation_prompt and message['content'][0]['type']=='text' and message['content'][0]['text']=='')%} {%endif%}" - "{%else%}" - "{{message['role'].capitalize()}}: " - "{%endif%}" - "{%for content in message['content']%}" - "{%if content['type']=='image'%}" - "{%if not loop.first%}{{'\n'}}{%endif%}" - "" - "{%if not loop.last%}{{'\n'}}{%endif%}" - "{%elif content['type']=='text'%}" - "{%set text=content['text']%}" - "{%if loop.first%}{%set text=text.lstrip()%}{%endif%}" - "{%if loop.last%}{%set text=text.rstrip()%}{%endif%}" - "{%if not loop.first and message['content'][loop.index0-1]['type']=='text'%}" - "{{' '+text}}" - "{%else%}" - "{{text}}" - "{%endif%}" - "{%endif%}" - "{%endfor%}" - "{%if not loop.last or add_generation_prompt%}" - "{%if message['role']|lower=='user'%}" - "{{seps[0]}}" - "{%else%}" - "{{seps[1]}}" - "{%endif%}" - "{%endif%}" - "{%endfor%}" - "{%if add_generation_prompt%}<|Assistant|>:{%endif%}" -) - - -def convert_old_keys_to_new_keys(state_dict): - keys_as_text = "\n".join(state_dict.keys()) - new_keys_as_text = keys_as_text - for old, repl in MAPPINGS.items(): - if repl is None: - new_keys_as_text = re.sub(old, "", new_keys_as_text) - else: - new_keys_as_text = re.sub(old, repl, new_keys_as_text) - output_dict = dict(zip(keys_as_text.split("\n"), new_keys_as_text.split("\n"))) - return output_dict - - -def split_tensor(tensor, key): - """Splits a merged tensor (qkv or kv) into separate tensors and creates keys for each part.""" - - if "qkv" in key: - prefix_to_replace = "qkv" - num_splits = 3 - new_keys = ["q_proj", "k_proj", "v_proj"] - elif "kv" in key: - prefix_to_replace = "kv" - num_splits = 2 - new_keys = ["k_proj", "v_proj"] - else: - raise ValueError(f"Unrecognized tensor type in key: {key}") - - split_size = tensor.shape[0] // 
num_splits - tensors = torch.split(tensor, split_size, dim=0) - return {key.replace(prefix_to_replace, new_keys[i]): tensors[i] for i in range(num_splits)} - - -def convert_state_dict_to_hf(state_dict): - """Convert state dict keys to HF format.""" - conversion_dict = convert_old_keys_to_new_keys(state_dict) - converted_state_dict = {} - - for old_key, new_key in conversion_dict.items(): - if new_key: - if "qkv" in new_key or "kv" in new_key: # Detect merged attention keys and split them. - qkv_split_dict = split_tensor(state_dict[old_key], new_key) - converted_state_dict.update(qkv_split_dict) - else: - converted_state_dict[new_key] = state_dict[old_key] - - # Embeddings will not have initial dimension - pos_embed_key = "model.vision_model.embeddings.position_embedding.weight" - converted_state_dict[pos_embed_key] = converted_state_dict[pos_embed_key].squeeze(0) - - return converted_state_dict - - -def ensure_model_downloaded( - repo_id: Optional[str] = None, revision: Optional[str] = None, local_dir: Optional[str] = None -) -> str: - """ - Ensures model files are downloaded locally, downloads them if not. - Returns path to local files. - - Args: - repo_id: The Hugging Face model repo ID (required if local_dir not provided) - revision: Optional git revision to use - local_dir: Optional local directory path where model files should be stored/found - """ - if local_dir is not None: - if os.path.exists(local_dir): - print(f"Using provided local directory: {local_dir}") - else: - # Create the local directory if it doesn't exist - os.makedirs(local_dir, exist_ok=True) - print(f"Created local directory: {local_dir}") - - if repo_id is None: - raise ValueError("Either repo_id or local_dir must be provided") - - print(f"Ensuring {repo_id} (revision: {revision or 'latest'}) is downloaded...") - - try: - # First try to find files locally - download_dir = snapshot_download(repo_id, revision=revision, local_files_only=True, local_dir=local_dir) - print(f"Found model files locally at {download_dir}") - return download_dir - except Exception: - # If files not found locally, download them - print(f"Downloading model files for {repo_id}...") - download_dir = snapshot_download(repo_id, revision=revision, local_files_only=False, local_dir=local_dir) - print(f"Downloaded model files to {download_dir}") - return download_dir - - -def load_model_state_dict(input_path: str) -> dict: - """ - Load model state dict, handling both single and sharded files. 
- """ - index_path = os.path.join(input_path, "pytorch_model.bin.index.json") - single_file_path = os.path.join(input_path, "pytorch_model.bin") - - # Check if we have a sharded model - if os.path.exists(index_path): - print("Loading sharded model...") - state_dict = {} - with open(index_path, "r") as f: - index = json.load(f) - - # Get unique shard files and load each one only once - unique_shard_files = sorted(set(index["weight_map"].values())) - for shard_file in unique_shard_files: - print(f"Loading shard {shard_file}...") - shard_path = os.path.join(input_path, shard_file) - shard_dict = torch.load(shard_path, map_location="cpu") - state_dict.update(shard_dict) - - return state_dict - - # Single file model - elif os.path.exists(single_file_path): - print("Loading single file model...") - return torch.load(single_file_path, map_location="cpu") - - else: - raise ValueError(f"No model files found in {input_path}") - - -def convert_model( - repo_id=None, - local_dir=None, - text_model_id=None, - output_dir=None, - output_hub_path=None, - safe_serialization=True, - revision=None, -): - """Convert and save the model weights, processor, and configuration.""" - if output_dir is None and output_hub_path is None: - raise ValueError("At least one of output_dir or output_hub_path must be specified") - - if repo_id is None and local_dir is None: - raise ValueError("Either repo_id or local_dir must be specified") - - # Create output directory if specified - if output_dir: - os.makedirs(output_dir, exist_ok=True) - print(f"Created/verified output directory: {output_dir}") - - torch.set_default_dtype(torch.float16) - - # Download or locate model files - input_path = ensure_model_downloaded(repo_id=repo_id, revision=revision, local_dir=local_dir) - - # Load configuration files - required_files = ["config.json", "preprocessor_config.json", "special_tokens_map.json", "tokenizer_config.json"] - - missing_files = [f for f in required_files if not os.path.exists(os.path.join(input_path, f))] - if missing_files: - raise ValueError( - f"The following required configuration files are missing from {input_path}: {', '.join(missing_files)}. " - "Please ensure you have downloaded all necessary model files." 
- ) - - with open(os.path.join(input_path, "config.json"), "r") as f: - config_data = json.load(f) - with open(os.path.join(input_path, "preprocessor_config.json"), "r") as f: - preprocessor_config = json.load(f) - with open(os.path.join(input_path, "special_tokens_map.json"), "r") as f: - special_tokens_map = json.load(f) - with open(os.path.join(input_path, "tokenizer_config.json"), "r") as f: - tokenizer_config = json.load(f) - - # Create tokenizer directly from tokenizer.json if it exists - tokenizer_json_path = os.path.join(input_path, "tokenizer.json") - special_image_tokens = { - "image_token": " ", - "boi_token": " ", - "eoi_token": " ", - } - - if os.path.exists(tokenizer_json_path) and not text_model_id: - tokenizer = AutoTokenizer.from_pretrained( - input_path, # This will load tokenizer.json directly - model_max_length=tokenizer_config["model_max_length"], - extra_special_tokens=special_image_tokens, - ) - else: - # Fallback to creating from text_model_id with special tokens - tokenizer = AutoTokenizer.from_pretrained( - text_model_id, - bos_token=special_tokens_map["bos_token"], - eos_token=special_tokens_map["eos_token"], - pad_token=special_tokens_map["pad_token"], - additional_special_tokens=special_tokens_map["additional_special_tokens"], - model_max_length=tokenizer_config["model_max_length"], - extra_special_tokens=special_image_tokens, - ) - - # Create image processor from config - image_processor_kwargs = {} - for key in ["do_normalize", "image_mean", "image_std", "min_size", "rescale_factor"]: - if key in preprocessor_config: - image_processor_kwargs[key] = preprocessor_config[key] - - if "image_size" in preprocessor_config: - image_processor_kwargs["size"] = { - "height": preprocessor_config["image_size"], - "width": preprocessor_config["image_size"], - } - - image_processor = JanusImageProcessor(**image_processor_kwargs) - - # Create processor with chat template - processor = JanusProcessor( - image_processor=image_processor, - tokenizer=tokenizer, - chat_template=CHAT_TEMPLATE, - use_default_system_prompt=True, - ) - - if output_dir: - print(f"Saving processor to {output_dir}...") - processor.save_pretrained(output_dir) - if output_hub_path: - print(f"Pushing processor to hub at {output_hub_path}...") - processor.push_to_hub(output_hub_path) - - # Create model configurations - text_config_kwargs = {} - for key in [ - "vocab_size", - "hidden_size", - "intermediate_size", - "num_hidden_layers", - "num_attention_heads", - "num_key_value_heads", - "hidden_act", - "max_position_embeddings", - "dtype", - ]: - if key in config_data["language_config"]: - text_config_kwargs[key] = config_data["language_config"][key] - - # Add token IDs from tokenizer - text_config_kwargs.update( - { - "pad_token_id": tokenizer.pad_token_id, - "bos_token_id": tokenizer.bos_token_id, - "eos_token_id": tokenizer.eos_token_id, - } - ) - - text_config = LlamaConfig(**text_config_kwargs) - - # Create vision config - vision_config_kwargs = {} - if "image_size" in config_data["vision_config"]["params"]: - vision_config_kwargs["image_size"] = config_data["vision_config"]["params"]["image_size"] - - # Add aligner params if present - if "aligner_config" in config_data and "params" in config_data["aligner_config"]: - if "n_embed" in config_data["aligner_config"]["params"]: - vision_config_kwargs["projection_dim"] = config_data["aligner_config"]["params"]["n_embed"] - if "depth" in config_data["aligner_config"]["params"]: - vision_config_kwargs["depth"] = config_data["aligner_config"]["params"]["depth"] 
- - vision_config = JanusVisionConfig(**vision_config_kwargs) - - vq_config = JanusVQVAEConfig( - embed_dim=config_data["gen_vision_config"]["params"]["n_embed"], - num_embeddings=config_data["gen_vision_config"]["params"]["image_token_size"], - projection_dim=config_data["gen_aligner_config"]["params"]["n_embed"], - depth=config_data["gen_aligner_config"]["params"]["depth"], - image_token_embed_dim=config_data["gen_head_config"]["params"]["image_token_embed"], - ) - - # Create the main config - config = JanusConfig( - text_config=text_config, - vision_config=vision_config, - vq_config=vq_config, - image_token_id=tokenizer.vocab.get(" "), - ) - - # Save the config - if output_dir: - config.save_pretrained(output_dir) - if output_hub_path: - config.push_to_hub(output_hub_path) - - # Initialize model with empty weights - print("Creating empty model...") - with init_empty_weights(): - model = JanusForConditionalGeneration(config) - - model.generation_config._from_model_config = False - model.generation_config.temperature = 1 - model.generation_config.guidance_scale = 5 - model.generation_config.pad_token_id = tokenizer.vocab.get("<\uff5c\u2581pad\u2581\uff5c>") - if not hasattr(model.generation_config, "generation_kwargs"): - model.generation_config.generation_kwargs = {} - model.generation_config.generation_kwargs["boi_token_id"] = tokenizer.vocab.get(" ") - - # Load and convert state dict - print("Loading state dict...") - state_dict = load_model_state_dict(input_path) - state_dict = convert_state_dict_to_hf(state_dict) - - # Load converted state dict - print("Loading converted weights into model...") - model.load_state_dict(state_dict, strict=True, assign=True) - - # Tie weights before any device mapping - print("Tying weights...") - model.tie_weights() - - # Save the model - if output_dir: - print(f"Saving model to {output_dir}...") - model.save_pretrained(output_dir, safe_serialization=safe_serialization) - if output_hub_path: - print(f"Pushing model to hub at {output_hub_path}...") - model.push_to_hub(output_hub_path, safe_serialization=safe_serialization) - - del state_dict, model - gc.collect() - - # Validate the saved model if saved locally - if output_dir: - print("Reloading the local model to check if it's saved correctly...") - # TODO: warning about weights not being tied is raised here regardless of model.tie_weights() above - JanusForConditionalGeneration.from_pretrained(output_dir, device_map="auto") - print("Local model reloaded successfully.") - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--repo_id", - help="HuggingFace Hub repo ID for the model", - default=None, - ) - parser.add_argument( - "--local_dir", - help="Local directory containing the model files", - default=None, - ) - parser.add_argument( - "--revision", - help="Specific revision to download from the Hub", - default=None, - ) - parser.add_argument( - "--output_dir", - help="Location to write HF model locally", - default=None, - ) - parser.add_argument( - "--output_hub_path", - help="Repository ID to push model to hub (e.g. 'username/model-name')", - default=None, - ) - parser.add_argument( - "--text_model_id", - help="Hub ID of the text model to get tokenizer from. 
Optional if tokenizer.json exists in the model directory.", - required=False, - ) - parser.add_argument( - "--safe_serialization", - action="store_true", - help="Whether to save using safetensors", - ) - args = parser.parse_args() - - if args.output_dir is None and args.output_hub_path is None: - raise ValueError("At least one of --output_dir or --output_hub_path must be specified") - - if args.repo_id is None and args.local_dir is None: - raise ValueError("Either --repo_id or --local_dir must be specified") - - convert_model( - repo_id=args.repo_id, - local_dir=args.local_dir, - text_model_id=args.text_model_id, - output_dir=args.output_dir, - output_hub_path=args.output_hub_path, - safe_serialization=args.safe_serialization, - revision=args.revision, - ) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/janus/image_processing_janus.py b/src/transformers/models/janus/image_processing_janus.py index 16659bd85354..21d36c651b39 100644 --- a/src/transformers/models/janus/image_processing_janus.py +++ b/src/transformers/models/janus/image_processing_janus.py @@ -355,7 +355,7 @@ def pad_to_square( background_color: Union[int, tuple[int, int, int]] = 0, data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> np.array: + ) -> np.ndarray: """ Pads an image to a square based on the longest edge. @@ -480,7 +480,7 @@ def unnormalize( image_mean: Union[float, Iterable[float]], image_std: Union[float, Iterable[float]], input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> np.array: + ) -> np.ndarray: """ Unnormalizes `image` using the mean and standard deviation specified by `mean` and `std`. image = (image * image_std) + image_mean diff --git a/src/transformers/models/janus/image_processing_janus_fast.py b/src/transformers/models/janus/image_processing_janus_fast.py index 9ed2732fb3d0..6cbca591626e 100644 --- a/src/transformers/models/janus/image_processing_janus_fast.py +++ b/src/transformers/models/janus/image_processing_janus_fast.py @@ -17,6 +17,7 @@ from typing import Optional, Union import torch +from torchvision.transforms.v2 import functional as F from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( @@ -36,16 +37,9 @@ from ...utils import ( TensorType, auto_docstring, - is_torchvision_v2_available, ) -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - - class JanusFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): r""" min_size (`int`, *optional*, defaults to 14): diff --git a/src/transformers/models/janus/modular_janus.py b/src/transformers/models/janus/modular_janus.py index dcd5c1e1e730..e5c000fdd6f0 100644 --- a/src/transformers/models/janus/modular_janus.py +++ b/src/transformers/models/janus/modular_janus.py @@ -1359,7 +1359,7 @@ def pad_to_square( background_color: Union[int, tuple[int, int, int]] = 0, data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> np.array: + ) -> np.ndarray: """ Pads an image to a square based on the longest edge. 
@@ -1705,7 +1705,7 @@ def unnormalize( image_mean: Union[float, Iterable[float]], image_std: Union[float, Iterable[float]], input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> np.array: + ) -> np.ndarray: """ Unnormalizes `image` using the mean and standard deviation specified by `mean` and `std`. image = (image * image_std) + image_mean diff --git a/src/transformers/models/kosmos2/convert_kosmos2_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/kosmos2/convert_kosmos2_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 04c7712aa846..000000000000 --- a/src/transformers/models/kosmos2/convert_kosmos2_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,77 +0,0 @@ -import argparse - -from fairseq.checkpoint_utils import load_checkpoint_to_cpu - -from transformers import Kosmos2Config, Kosmos2ForConditionalGeneration - - -KEYS_TO_MODIFY_MAPPING = { - "gpt_model.decoder.output_projection": "text_model.lm_head", - "gpt_model.decoder": "text_model.model", - "img_connector": "image_to_text_projection", - "img_model.visual.class_embedding": "vision_model.model.embeddings.class_embedding", - "img_model.visual.positional_embedding": "vision_model.model.embeddings.position_embedding.weight", - "img_model.visual.conv1": "vision_model.model.embeddings.patch_embedding", - "img_model.visual": "vision_model.model", - "ln_pre": "pre_layrnorm", - "ln_post": "post_layernorm", - "transformer.resblocks": "encoder.layers", - "ts_attn": "self_attn", - "ln_1": "layer_norm1", - "ln_2": "layer_norm2", - "c_fc": "fc1", - "c_proj": "fc2", -} - - -KEYS_TO_IGNORE = [ - # this buffer in the original code is only used to send weights to the desired device - "gpt_model.decoder.embed_positions._float_tensor", - # this weight is never used in the forward in the original KOSMOS-2) - "gpt_model.decoder.self_attn_sope.scale", -] - - -def rename_key(key): - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - return key - - -def convert_kosmos2_checkpoint_to_pytorch(checkpoint_path, pytorch_dump_folder_path): - state = load_checkpoint_to_cpu(checkpoint_path) - state_dict = state["model"] - state_dict_keys = list(state_dict.keys()) - - config = Kosmos2Config() - # This is necessary to match the results given by the original demo - config.text_config.no_repeat_ngram_size = 3 - model = Kosmos2ForConditionalGeneration(config) - - # convert (by renaming keys) - converted_state_dict = {} - for key in state_dict_keys: - if key in KEYS_TO_IGNORE: - continue - renamed_key = rename_key(key) - converted_state_dict[renamed_key] = state_dict[key] - - # check weight loading - model.load_state_dict(converted_state_dict, strict=True) - # save the result - model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--kosmos2_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 
- ) - args = parser.parse_args() - convert_kosmos2_checkpoint_to_pytorch(args.kosmos2_checkpoint_path, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/kosmos2_5/convert_kosmos2_5.py b/src/transformers/models/kosmos2_5/convert_kosmos2_5.py deleted file mode 100644 index d490c95a6a68..000000000000 --- a/src/transformers/models/kosmos2_5/convert_kosmos2_5.py +++ /dev/null @@ -1,101 +0,0 @@ -# coding=utf-8 -# Copyright 2024 Microsoft Research and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse - -from fairseq.checkpoint_utils import load_checkpoint_to_cpu - -from transformers import Kosmos2_5Config, Kosmos2_5ForConditionalGeneration - - -KEYS_TO_MODIFY_MAPPING = { - "gpt_model.decoder.output_projection": "text_model.lm_head", - "gpt_model.decoder": "text_model.model", - "img_connector": "image_to_text_projection", - "img_model.embeddings": "vision_model.embeddings", - "img_model.encoder": "vision_model.encoder", - "img_model.layernorm": "vision_model.layernorm", - "img_model": "vision_model", - "ln_pre": "pre_layrnorm", - "ln_post": "post_layernorm", - "transformer.resblocks": "encoder.layers", - "ts_attn": "self_attn", - "ln_1": "layer_norm1", - "ln_2": "layer_norm2", - "c_fc": "fc1", - "c_proj": "fc2", -} - - -KEYS_TO_IGNORE = [ - # this buffer in the original code is only used to send weights to the desired device - "gpt_model.decoder.embed_positions._float_tensor", - # this weight is never used in the forward in the original KOSMOS-2.5) - "gpt_model.decoder.self_attn_sope.scale", -] - - -def rename_key(key): - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - return key - - -def convert_kosmos2_5_checkpoint_to_pytorch(checkpoint_path, pytorch_dump_folder_path): - state = load_checkpoint_to_cpu(checkpoint_path) - state_dict = state["model"] - state_dict_keys = list(state_dict.keys()) - - config = Kosmos2_5Config() - # This is necessary to match the results given by the original demo - config.text_config.no_repeat_ngram_size = 3 - model = Kosmos2_5ForConditionalGeneration(config) - - # convert (by renaming keys) - converted_state_dict = {} - for key in state_dict_keys: - if key in KEYS_TO_IGNORE: - continue - renamed_key = rename_key(key) - converted_state_dict[renamed_key] = state_dict[key] - - # set - # check weight loading - # check whether the state in converted_state_dict is the same as the state in the model - model.load_state_dict(converted_state_dict, strict=True) - # save the result - model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--kosmos2_5_checkpoint_path", - default="ckpt.pt", - type=str, - required=False, - help="Path the official PyTorch dump.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default="ckpt", - type=str, - required=False, - help="Path to the output PyTorch model.", 
- ) - args = parser.parse_args() - convert_kosmos2_5_checkpoint_to_pytorch(args.kosmos2_5_checkpoint_path, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py b/src/transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py index c6d8b1b1edf5..8f6b0be8bfc4 100644 --- a/src/transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +++ b/src/transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py @@ -34,7 +34,7 @@ # Similar to transformers.models.pix2struct.image_processing_pix2struct.torch_extract_patches but dealing with a batch of images directly. def torch_extract_patches(image_tensor, patch_height, patch_width): """ - Utiliy function to extract patches from a given tensor representing a batch of images. Returns a tensor of shape + Utility function to extract patches from a given tensor representing a batch of images. Returns a tensor of shape (batch_size, `rows`, `columns`, `num_channels` x `patch_height` x `patch_width`). Args: @@ -45,7 +45,6 @@ def torch_extract_patches(image_tensor, patch_height, patch_width): patch_width (int): The width of the patches to extract. """ - image_tensor = image_tensor patches = torch.nn.functional.unfold(image_tensor, (patch_height, patch_width), stride=(patch_height, patch_width)) patches = patches.reshape(image_tensor.size(0), image_tensor.size(1), patch_height, patch_width, -1) patches = patches.permute(0, 4, 2, 3, 1).reshape( diff --git a/src/transformers/models/kosmos2_5/modeling_kosmos2_5.py b/src/transformers/models/kosmos2_5/modeling_kosmos2_5.py index 8f9fbd706b32..b31c5797ad3c 100644 --- a/src/transformers/models/kosmos2_5/modeling_kosmos2_5.py +++ b/src/transformers/models/kosmos2_5/modeling_kosmos2_5.py @@ -290,9 +290,7 @@ class Kosmos2_5ModelOutput(ModelOutput): vision_model_output: BaseModelOutputWithPooling = None def to_tuple(self) -> tuple[Any]: - return tuple( - (self[k] if k not in ["vision_model_output"] else getattr(self, k).to_tuple()) for k in self.keys() - ) + return tuple((self[k] if k != "vision_model_output" else getattr(self, k).to_tuple()) for k in self.keys()) @dataclass @@ -350,9 +348,7 @@ class Kosmos2_5ForConditionalGenerationModelOutput(ModelOutput): vision_model_output: BaseModelOutputWithPooling = None def to_tuple(self) -> tuple[Any]: - return tuple( - (self[k] if k not in ["vision_model_output"] else getattr(self, k).to_tuple()) for k in self.keys() - ) + return tuple((self[k] if k != "vision_model_output" else getattr(self, k).to_tuple()) for k in self.keys()) # Copied from transformers.models.pix2struct.modeling_pix2struct.Pix2StructLayerNorm with Pix2Struct->Kosmos2_5 diff --git a/src/transformers/models/kyutai_speech_to_text/convert_kyutai_speech_to_text_to_hf.py b/src/transformers/models/kyutai_speech_to_text/convert_kyutai_speech_to_text_to_hf.py deleted file mode 100644 index 6bd2cbe6c9d4..000000000000 --- a/src/transformers/models/kyutai_speech_to_text/convert_kyutai_speech_to_text_to_hf.py +++ /dev/null @@ -1,382 +0,0 @@ -# coding=utf-8 -# Copyright 2025 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import gc -import os -import re - -import safetensors.torch -import sentencepiece -import torch - -from transformers import ( - KyutaiSpeechToTextConfig, - KyutaiSpeechToTextFeatureExtractor, - KyutaiSpeechToTextForConditionalGeneration, - KyutaiSpeechToTextProcessor, - PreTrainedTokenizerFast, -) -from transformers.convert_slow_tokenizer import MoshiConverter -from transformers.utils.hub import cached_file - - -# fmt: off -MOSHI_ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - r"out_norm": r"norm", - r"gating\.linear_in": r"mlp.fc1", - r"gating\.linear_out": r"mlp.fc2", - r"self_attn\.out_proj": r"self_attn.o_proj.linear", - r"norm1": r"input_layernorm", - r"norm2": r"post_attention_layernorm", - r"layer_scale_1": r"self_attn_layer_scale", - r"layer_scale_2": r"mlp_layer_scale", - r"alpha": r"weight", -} -# fmt: on - - -# fmt: off -MIMI_ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - r"conv\.conv\.conv": "conv", - r"convtr\.convtr\.convtr": "conv", - r"conv\.conv": "conv", - r"convtr\.convtr": "conv", - r"quantizer\.rvq_first\.vq": "quantizer.semantic_residual_vector_quantizer", - r"quantizer\.rvq_first": "quantizer.semantic_residual_vector_quantizer", - r"quantizer\.rvq_rest\.vq": "quantizer.acoustic_residual_vector_quantizer", - r"quantizer\.rvq_rest": "quantizer.acoustic_residual_vector_quantizer", - r"_codebook": "codebook", - r"_initialized": "initialized", - r"embedding_sum": "embed_sum", - r"encoder\.model": "encoder.layers", - r"decoder\.model": "decoder.layers", - r"encoder_transformer\.transformer": "encoder_transformer", - r"decoder_transformer\.transformer": "decoder_transformer", - r"linear1": "mlp.fc1", - r"linear2": "mlp.fc2", - r"self_attn\.out_proj": "self_attn.o_proj", - r"norm1": "input_layernorm", - r"norm2": "post_attention_layernorm", - r"layer_scale_1": "self_attn_layer_scale", - r"layer_scale_2": "mlp_layer_scale", -} -# fmt: on - - -def permute_for_rope(input_tensor, n_heads, dim1, dim2): - """ - When you go from the complex ROPE formulation to sin and cos one, you need - to permute the query and key weights (to avoid doing it on the fly) - """ - return input_tensor.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) - - -def convert_key(key, mapping): - for pattern, replacement in mapping.items(): - key = re.sub(pattern, replacement, key) - return key - - -def convert_kyutai_speech_to_text_state_dict(state_dict, config, unwanted_prefix="transformer."): - hidden_size = config.hidden_size - head_dim = config.head_dim - num_heads = int(config.hidden_size // config.head_dim) - num_key_value_heads = config.num_key_value_heads - key_value_head_dim = config.num_key_value_heads * head_dim - - # concat embeddings - embed_tokens_weight = [] - for i in range(32): - embed_tokens_weight.append(state_dict.pop(f"emb.{i}.weight")) - - embed_tokens_weight = torch.cat(embed_tokens_weight, dim=0) - embed_tokens_weight = torch.cat([state_dict.pop("text_emb.weight"), embed_tokens_weight]) - embed_tokens_weight = torch.cat([embed_tokens_weight, torch.zeros(1, config.hidden_size)], dim=0) - state_dict["embed_tokens.embed_tokens.weight"] = 
embed_tokens_weight - - for key, value in list(state_dict.items()): - if unwanted_prefix is not None and unwanted_prefix in key: - new_key = key[len(unwanted_prefix) :] - else: - new_key = key - - new_key = convert_key(new_key, MOSHI_ORIGINAL_TO_CONVERTED_KEY_MAPPING) - - # Post-process the current_parameter. - if "alpha" in key: - state_dict[key] = state_dict[key].squeeze() - - if "in_proj_weight" in new_key: - # split qkv into query key and value - mixed_qkv = state_dict.pop(key) - qkv_dim = mixed_qkv.size(0) // 3 - - query_layer = mixed_qkv[:qkv_dim] - key_layer = mixed_qkv[qkv_dim : qkv_dim * 2] - value_layer = mixed_qkv[qkv_dim * 2 :] - state_dict[new_key.replace("in_proj_weight", "q_proj.linear.weight")] = permute_for_rope( - query_layer, num_heads, hidden_size, hidden_size - ) - state_dict[new_key.replace("in_proj_weight", "k_proj.linear.weight")] = permute_for_rope( - key_layer, num_key_value_heads, key_value_head_dim, hidden_size - ) - - state_dict[new_key.replace("in_proj_weight", "v_proj.linear.weight")] = value_layer - else: - state_dict[new_key] = state_dict.pop(key) - - return state_dict - - -def convert_mimi_state_dict(state_dict, config, unwanted_prefix=None): - hidden_size = config.hidden_size - head_dim = config.head_dim - num_heads = int(config.hidden_size // config.head_dim) - num_key_value_heads = config.num_key_value_heads - key_value_head_dim = config.num_key_value_heads * head_dim - - for key, value in list(state_dict.items()): - if unwanted_prefix is not None and unwanted_prefix in key: - new_key = key[len(unwanted_prefix) :] - else: - new_key = key - - new_key = convert_key(new_key, MIMI_ORIGINAL_TO_CONVERTED_KEY_MAPPING) - - if "in_proj_weight" in new_key: - # split qkv into query key and value - mixed_qkv = state_dict.pop(key) - qkv_dim = mixed_qkv.size(0) // 3 - - query_layer = mixed_qkv[:qkv_dim] - key_layer = mixed_qkv[qkv_dim : qkv_dim * 2] - value_layer = mixed_qkv[qkv_dim * 2 :] - - state_dict[new_key.replace("in_proj_weight", "q_proj.weight")] = permute_for_rope( - query_layer, num_heads, hidden_size, hidden_size - ) - state_dict[new_key.replace("in_proj_weight", "k_proj.weight")] = permute_for_rope( - key_layer, num_key_value_heads, key_value_head_dim, hidden_size - ) - state_dict[new_key.replace("in_proj_weight", "v_proj.weight")] = value_layer - else: - state_dict[new_key] = state_dict.pop(key) - - return state_dict - - -def write_model( - input_path_or_repo, - model_name, - codec_model_path_or_repo, - codec_model_name, - output_dir, - safe_serialization=True, - unwanted_prefix="transformer.", -): - print("Converting the model.") - os.makedirs(output_dir, exist_ok=True) - - config = KyutaiSpeechToTextConfig( - vocab_size=8001, - max_position_embeddings=375, - num_hidden_layers=16, - num_attention_heads=16, - num_key_value_heads=16, - head_dim=128, - ) - config.use_cache = True - config.codec_config.sliding_window = 250 - - model_path = cached_file( - input_path_or_repo, - model_name, - ) - - codec_path = cached_file( - codec_model_path_or_repo, - codec_model_name, - ) - - print(f"Fetching all parameters from the checkpoint at {model_path}...") - state_dict = safetensors.torch.load_file(model_path) - - print(f"Fetching all parameters from the checkpoint at {codec_path}...") - codec_state_dict = safetensors.torch.load_file(codec_path) - - print("Converting model...") - # ----------------------- - # convert parameter names - # ----------------------- - state_dict = convert_kyutai_speech_to_text_state_dict(state_dict, config, 
unwanted_prefix=unwanted_prefix) - codec_state_dict = convert_mimi_state_dict(codec_state_dict, config.codec_config, unwanted_prefix=None) - - # ------------------------- - # load the weights and save - # ------------------------- - print("Loading the checkpoint in a Moshi ASR model.") - with torch.device("meta"): - model = KyutaiSpeechToTextForConditionalGeneration(config) - - linear_weight = state_dict.pop("text_linear.weight") - model.model.load_state_dict(state_dict, strict=True, assign=True) - - linear_weight = torch.cat([linear_weight, torch.zeros(1, config.hidden_size)]) - model.lm_head.load_state_dict({"weight": linear_weight}, strict=True, assign=True) - - model.codec_model.load_state_dict(codec_state_dict, strict=True, assign=True) - - print("Checkpoint loaded successfully.") - del model.config._name_or_path - del model.config.codec_config._name_or_path - - # default generation config - model.generation_config._from_model_config = False - model.generation_config.audio_window_size = 1 - model.generation_config.cache_implementation = "sliding_window" - - model.codec_model.generation_config._from_model_config = False - model.codec_model.generation_config.cache_implementation = "sliding_window" - model.codec_model.generation_config.use_cache = True - - print("Saving the model.") - model.save_pretrained(output_dir, safe_serialization=safe_serialization) - del state_dict, model - - # Safety check: reload the converted model - gc.collect() - print("Reloading the model to check if it's saved correctly.") - KyutaiSpeechToTextForConditionalGeneration.from_pretrained(output_dir, dtype=torch.bfloat16, device_map="auto") - print("Model reloaded successfully.") - - -def write_processor( - input_path_or_repo, - tokenizer_model_name, - codec_model_path_or_repo, - output_dir, - audio_delay_seconds, - audio_silence_prefix_seconds, -): - tokenizer_path = cached_file( - input_path_or_repo, - tokenizer_model_name, - ) - - tokenizer = MoshiConverter(tokenizer_path).converted() - original_tokenizer = sentencepiece.SentencePieceProcessor(tokenizer_path) - - tokenizer = PreTrainedTokenizerFast( - tokenizer_object=tokenizer, - chat_template=None, - unk_token=" ", - model_input_names=["input_ids", "attention_mask"], - clean_up_tokenization_spaces=False, - bos_token_id=original_tokenizer.bos_id(), - eos_token_id=original_tokenizer.eos_id(), - pad_token_id=original_tokenizer.pad_id(), - ) - - feature_extractor = KyutaiSpeechToTextFeatureExtractor( - audio_delay_seconds=audio_delay_seconds, - audio_silence_prefix_seconds=audio_silence_prefix_seconds, - ) - - processor = KyutaiSpeechToTextProcessor(feature_extractor, tokenizer) - processor.save_pretrained(output_dir) - print(f"Processor saved successfully to {output_dir}") - - -def main(): - parser = argparse.ArgumentParser(description="Convert Moshi ASR weights to HuggingFace format") - parser.add_argument( - "--input_path_or_repo", - type=str, - required=True, - help="Path or repo containing Moshi ASR weights", - ) - parser.add_argument( - "--model_name", - type=str, - required=True, - help="Name of the model in input_path_or_repo", - ) - parser.add_argument( - "--tokenizer_model_name", - type=str, - required=True, - help="Name of the tokenizer model in input_path_or_repo", - ) - parser.add_argument( - "--codec_model_path_or_repo", - type=str, - required=True, - help="Path or repo containing the Mimi weights", - ) - parser.add_argument( - "--mimi_name", - type=str, - required=True, - help="Name of the Mimi model in codec_model_path_or_repo", - ) - 
parser.add_argument( - "--preprocessor_model_path_or_repo", - type=str, - required=True, - help="Path or repo containing the preprocessor config", - ) - parser.add_argument( - "--output_dir", - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--safe_serialization", action="store_true", default=True, help="Whether or not to save using `safetensors`." - ) - parser.add_argument( - "--audio_delay_seconds", - type=float, - required=True, - help="Audio delay in seconds to add to the right of the input", - ) - parser.add_argument( - "--audio_silence_prefix_seconds", - type=float, - required=True, - help="Audio silence prefix in seconds to add to the left of the input", - ) - args = parser.parse_args() - - write_model( - args.input_path_or_repo, - args.model_name, - args.codec_model_path_or_repo, - args.mimi_name, - args.output_dir, - safe_serialization=args.safe_serialization, - ) - - write_processor( - args.input_path_or_repo, - args.tokenizer_model_name, - args.preprocessor_model_path_or_repo, - args.output_dir, - args.audio_delay_seconds, - args.audio_silence_prefix_seconds, - ) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py index bde1736f9da8..d076ccb1de78 100644 --- a/src/transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py @@ -204,7 +204,7 @@ def __call__( if padding: padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask") - # now let's padd left and right + # now let's pad left and right pad_left = int(self.audio_silence_prefix_seconds * self.sampling_rate) pad_right = int((self.audio_delay_seconds + 1.0) * self.sampling_rate) padded_inputs["input_values"] = np.pad( diff --git a/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py index 9eba7e163670..77c636570d58 100644 --- a/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py @@ -1078,7 +1078,7 @@ def __init__(self, config): self.codec_model = AutoModel.from_config(config.codec_config) # we are in an edge case where for the codec_model self.can_generate is False, setting self.codec_model.generation_config to None - # yet the codec_model needs a generation config to initalize it's cache for streaming inference + # yet the codec_model needs a generation config to initialize it's cache for streaming inference # we therefore initialize a generation config for the codec model self.codec_model.generation_config = GenerationConfig.from_model_config(config.codec_config) diff --git a/src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py index 8541a911e947..d3707d659e1e 100644 --- a/src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py @@ -183,7 +183,7 @@ def __call__( if padding: padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask") - # now let's padd left and right + # now let's pad left and right pad_left = int(self.audio_silence_prefix_seconds * 
self.sampling_rate) pad_right = int((self.audio_delay_seconds + 1.0) * self.sampling_rate) padded_inputs["input_values"] = np.pad( @@ -259,7 +259,7 @@ def __init__(self, config): self.codec_model = AutoModel.from_config(config.codec_config) # we are in an edge case where for the codec_model self.can_generate is False, setting self.codec_model.generation_config to None - # yet the codec_model needs a generation config to initalize it's cache for streaming inference + # yet the codec_model needs a generation config to initialize it's cache for streaming inference # we therefore initialize a generation config for the codec model self.codec_model.generation_config = GenerationConfig.from_model_config(config.codec_config) diff --git a/src/transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py b/src/transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py index 723687d58219..354bbe21c4db 100644 --- a/src/transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +++ b/src/transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py @@ -17,6 +17,7 @@ from typing import Optional, Union import torch +from torchvision.transforms.v2 import functional as F from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs from ...image_transforms import ChannelDimension, group_images_by_shape, reorder_images @@ -25,18 +26,12 @@ from ...utils import ( TensorType, auto_docstring, - is_torchvision_v2_available, logging, requires_backends, ) from .image_processing_layoutlmv2 import apply_tesseract -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - logger = logging.get_logger(__name__) diff --git a/src/transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py b/src/transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py index 2ab8f8dd48cc..caefa9b89660 100644 --- a/src/transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +++ b/src/transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py @@ -17,6 +17,7 @@ from typing import Optional, Union import torch +from torchvision.transforms.v2 import functional as F from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs from ...image_transforms import ChannelDimension, group_images_by_shape, reorder_images @@ -25,18 +26,12 @@ from ...utils import ( TensorType, auto_docstring, - is_torchvision_v2_available, logging, requires_backends, ) from .image_processing_layoutlmv3 import apply_tesseract -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - logger = logging.get_logger(__name__) diff --git a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py index b69fc57b1743..270437e97f44 100644 --- a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py @@ -524,7 +524,7 @@ def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): if ( (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()) - and sum([text.startswith(no_split_token) for no_split_token in self.added_tokens_encoder]) == 0 + and sum(text.startswith(no_split_token) for no_split_token in self.added_tokens_encoder) == 0 ): text = " " + text 
return (text, kwargs) diff --git a/src/transformers/models/levit/convert_levit_timm_to_pytorch.py b/src/transformers/models/levit/convert_levit_timm_to_pytorch.py deleted file mode 100644 index 0d5731bf7bef..000000000000 --- a/src/transformers/models/levit/convert_levit_timm_to_pytorch.py +++ /dev/null @@ -1,181 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert LeViT checkpoints from timm.""" - -import argparse -import json -from collections import OrderedDict -from functools import partial -from pathlib import Path -from typing import Optional - -import timm -import torch -from huggingface_hub import hf_hub_download - -from transformers import LevitConfig, LevitForImageClassificationWithTeacher, LevitImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger() - - -def convert_weight_and_push( - hidden_sizes: int, name: str, config: LevitConfig, save_directory: Path, push_to_hub: bool = True -): - print(f"Converting {name}...") - - with torch.no_grad(): - if hidden_sizes == 128: - if name[-1] == "S": - from_model = timm.create_model("levit_128s", pretrained=True) - else: - from_model = timm.create_model("levit_128", pretrained=True) - if hidden_sizes == 192: - from_model = timm.create_model("levit_192", pretrained=True) - if hidden_sizes == 256: - from_model = timm.create_model("levit_256", pretrained=True) - if hidden_sizes == 384: - from_model = timm.create_model("levit_384", pretrained=True) - - from_model.eval() - our_model = LevitForImageClassificationWithTeacher(config).eval() - huggingface_weights = OrderedDict() - - weights = from_model.state_dict() - og_keys = list(from_model.state_dict().keys()) - new_keys = list(our_model.state_dict().keys()) - print(len(og_keys), len(new_keys)) - for i in range(len(og_keys)): - huggingface_weights[new_keys[i]] = weights[og_keys[i]] - our_model.load_state_dict(huggingface_weights) - - x = torch.randn((2, 3, 224, 224)) - out1 = from_model(x) - out2 = our_model(x).logits - - assert torch.allclose(out1, out2), "The model logits don't match the original one." 
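For reference, the timm-to-HF conversion being removed here relies on a position-based remapping of the state dict followed by a numerical check of the outputs. A minimal, generic sketch of that pattern (the helper name and the two module arguments are hypothetical stand-ins, not the LeViT-specific code):

from collections import OrderedDict

import torch


def convert_by_position(from_model: torch.nn.Module, our_model: torch.nn.Module) -> torch.nn.Module:
    """Copy weights in declaration order, then check the two models agree numerically.

    Assumes both models are in eval mode, take a single image tensor and return raw logits.
    """
    src = from_model.state_dict()
    dst_keys = list(our_model.state_dict().keys())
    assert len(src) == len(dst_keys), "both models must expose the same number of tensors"

    # zip source tensors onto target keys purely by position
    remapped = OrderedDict(zip(dst_keys, src.values()))
    our_model.load_state_dict(remapped)

    # sanity-check the conversion on a dummy batch
    x = torch.randn(2, 3, 224, 224)
    with torch.no_grad():
        assert torch.allclose(from_model(x), our_model(x), atol=1e-5), "outputs diverged after conversion"
    return our_model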
- - checkpoint_name = name - print(checkpoint_name) - - if push_to_hub: - our_model.save_pretrained(save_directory / checkpoint_name) - image_processor = LevitImageProcessor() - image_processor.save_pretrained(save_directory / checkpoint_name) - - print(f"Pushed {checkpoint_name}") - - -def convert_weights_and_push(save_directory: Path, model_name: Optional[str] = None, push_to_hub: bool = True): - filename = "imagenet-1k-id2label.json" - num_labels = 1000 - expected_shape = (1, num_labels) - - repo_id = "huggingface/label-files" - num_labels = num_labels - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - - id2label = id2label - label2id = {v: k for k, v in id2label.items()} - - ImageNetPreTrainedConfig = partial(LevitConfig, num_labels=num_labels, id2label=id2label, label2id=label2id) - - names_to_hidden_sizes = { - "levit-128S": 128, - "levit-128": 128, - "levit-192": 192, - "levit-256": 256, - "levit-384": 384, - } - - names_to_config = { - "levit-128S": ImageNetPreTrainedConfig( - hidden_sizes=[128, 256, 384], - num_attention_heads=[4, 6, 8], - depths=[2, 3, 4], - key_dim=[16, 16, 16], - drop_path_rate=0, - ), - "levit-128": ImageNetPreTrainedConfig( - hidden_sizes=[128, 256, 384], - num_attention_heads=[4, 8, 12], - depths=[4, 4, 4], - key_dim=[16, 16, 16], - drop_path_rate=0, - ), - "levit-192": ImageNetPreTrainedConfig( - hidden_sizes=[192, 288, 384], - num_attention_heads=[3, 5, 6], - depths=[4, 4, 4], - key_dim=[32, 32, 32], - drop_path_rate=0, - ), - "levit-256": ImageNetPreTrainedConfig( - hidden_sizes=[256, 384, 512], - num_attention_heads=[4, 6, 8], - depths=[4, 4, 4], - key_dim=[32, 32, 32], - drop_path_rate=0, - ), - "levit-384": ImageNetPreTrainedConfig( - hidden_sizes=[384, 512, 768], - num_attention_heads=[6, 9, 12], - depths=[4, 4, 4], - key_dim=[32, 32, 32], - drop_path_rate=0.1, - ), - } - - if model_name: - convert_weight_and_push( - names_to_hidden_sizes[model_name], model_name, names_to_config[model_name], save_directory, push_to_hub - ) - else: - for model_name, config in names_to_config.items(): - convert_weight_and_push(names_to_hidden_sizes[model_name], model_name, config, save_directory, push_to_hub) - return config, expected_shape - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default=None, - type=str, - help="The name of the model you wish to convert, it must be one of the supported Levit* architecture,", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default="levit-dump-folder/", - type=Path, - required=False, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub") - parser.add_argument( - "--no-push_to_hub", - dest="push_to_hub", - action="store_false", - help="Do not push model and image processor to the hub", - ) - - args = parser.parse_args() - pytorch_dump_folder_path: Path = args.pytorch_dump_folder_path - pytorch_dump_folder_path.mkdir(exist_ok=True, parents=True) - convert_weights_and_push(pytorch_dump_folder_path, args.model_name, args.push_to_hub) diff --git a/src/transformers/models/levit/image_processing_levit_fast.py b/src/transformers/models/levit/image_processing_levit_fast.py index e452894d6e2e..ae30194288fa 100644 --- a/src/transformers/models/levit/image_processing_levit_fast.py +++ 
b/src/transformers/models/levit/image_processing_levit_fast.py @@ -17,6 +17,7 @@ from typing import Optional import torch +from torchvision.transforms.v2 import functional as F from ...image_processing_utils_fast import BaseImageProcessorFast, SizeDict from ...image_transforms import ( @@ -24,13 +25,7 @@ get_resize_output_image_size, ) from ...image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling -from ...utils import auto_docstring, is_torchvision_v2_available - - -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F +from ...utils import auto_docstring @auto_docstring diff --git a/src/transformers/models/lfm2_vl/__init__.py b/src/transformers/models/lfm2_vl/__init__.py new file mode 100755 index 000000000000..7d0357ffbaa6 --- /dev/null +++ b/src/transformers/models/lfm2_vl/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_lfm2_vl import * + from .image_processing_lfm2_vl_fast import * + from .modeling_lfm2_vl import * + from .processing_lfm2_vl import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/lfm2_vl/configuration_lfm2_vl.py b/src/transformers/models/lfm2_vl/configuration_lfm2_vl.py new file mode 100755 index 000000000000..1378fbe6dc8c --- /dev/null +++ b/src/transformers/models/lfm2_vl/configuration_lfm2_vl.py @@ -0,0 +1,91 @@ +# coding=utf-8 +# Copyright 2025 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch LFM2-VL model.""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ..auto import CONFIG_MAPPING, AutoConfig + + +logger = logging.get_logger(__name__) + + +class Lfm2VlConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Lfm2VlForConditionalGeneration`]. It is used to instantiate an + Lfm2Vl model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Lfm2-VL-1.6B. + + e.g. 
[LiquidAI/LFM2-VL-1.6B](https://huggingface.co/LiquidAI/LFM2-VL-1.6B)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vision_config (`AutoConfig | dict`, *optional*, defaults to `Siglip2VisionConfig`):
+            The config object or dictionary of the vision backbone.
+        text_config (`AutoConfig | dict`, *optional*, defaults to `Lfm2Config`):
+            The config object or dictionary of the text backbone.
+        image_token_id (`int`, *optional*, defaults to 396):
+            The image token index to encode the image prompt.
+        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            The activation function used by the multimodal projector.
+        projector_hidden_size (`int`, *optional*, defaults to 2560):
+            The hidden size of the multimodal projector.
+        projector_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the multimodal projector.
+        downsample_factor (`int`, *optional*, defaults to 2):
+            The downsampling factor of the vision backbone.
+    """
+
+    model_type = "lfm2-vl"
+    sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
+
+    def __init__(
+        self,
+        vision_config=None,
+        text_config=None,
+        image_token_id=396,
+        projector_hidden_act="gelu",
+        projector_hidden_size=2560,
+        projector_bias=True,
+        downsample_factor=2,
+        **kwargs,
+    ):
+        self.image_token_id = image_token_id
+        self.projector_hidden_act = projector_hidden_act
+        self.projector_hidden_size = projector_hidden_size
+        self.projector_bias = projector_bias
+        self.downsample_factor = downsample_factor
+
+        if isinstance(vision_config, dict):
+            vision_config["model_type"] = vision_config.get("model_type", "siglip2_vision_model")
+            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
+        elif vision_config is None:
+            vision_config = CONFIG_MAPPING["siglip2_vision_model"]()
+
+        if isinstance(text_config, dict):
+            text_config["model_type"] = text_config.get("model_type", "lfm2")
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            text_config = CONFIG_MAPPING["lfm2"]()
+
+        self.vision_config = vision_config
+        self.text_config = text_config
+
+        super().__init__(**kwargs)
+
+
+__all__ = ["Lfm2VlConfig"]
diff --git a/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py b/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py
new file mode 100755
index 000000000000..4081c86e108a
--- /dev/null
+++ b/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py
@@ -0,0 +1,541 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
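Before moving into the new image processor, a quick usage sketch of the `Lfm2VlConfig` defined above; it assumes the class is exported from `transformers` as the new `lfm2_vl/__init__.py` suggests, and the printed values follow the defaults in this diff:

from transformers import Lfm2VlConfig  # assumes this PR's top-level export

config = Lfm2VlConfig()  # omitted sub-configs fall back to a Siglip2 vision config and an Lfm2 text config
print(config.vision_config.model_type)  # "siglip2_vision_model"
print(config.text_config.model_type)    # "lfm2"

# Sub-configs may also be passed as plain dicts; a missing "model_type" is filled in with the defaults above.
config = Lfm2VlConfig(vision_config={"model_type": "siglip2_vision_model"}, downsample_factor=2)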
+import math +from functools import lru_cache +from typing import Optional, Union + +import torch +from torchvision.transforms.v2 import functional as F + +from ...image_processing_utils import BatchFeature +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs, + group_images_by_shape, + reorder_images, +) +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ImageInput, + PILImageResampling, + SizeDict, +) +from ...processing_utils import ( + Unpack, +) +from ...utils import ( + TensorType, + auto_docstring, + logging, +) + + +logger = logging.get_logger(__name__) + + +def round_by_factor(number: float, factor: int) -> int: + """Returns the closest integer to 'number' that is divisible by 'factor'.""" + return round(number / factor) * factor + + +def find_closest_aspect_ratio( + aspect_ratio: float, + target_ratios: list[tuple[int, int]], + width: int, + height: int, + image_size: int, +) -> tuple[int, int]: + """Find the closest aspect ratio from target_ratios to match the input aspect ratio. + + Args: + aspect_ratio: The aspect ratio to match (width/height). + target_ratios: List of possible aspect ratios as tuples of (width, height) integers. + width: Original image width in pixels. + height: Original image height in pixels. + image_size: Base size for calculating target area. + + Returns: + tuple[int, int]: The best matching ratio as (width, height) integers. + """ + best_ratio_diff = float("inf") + best_ratio = (1, 1) + area = width * height + + for ratio in target_ratios: + target_aspect_ratio = ratio[0] / ratio[1] + ratio_diff = abs(aspect_ratio - target_aspect_ratio) + + # update best ratio if we found a closer match + if ratio_diff < best_ratio_diff: + best_ratio_diff = ratio_diff + best_ratio = ratio + # if equally close, prefer the ratio that better matches the original image area + elif ratio_diff == best_ratio_diff: + target_area = image_size * image_size * ratio[0] * ratio[1] + if area > 0.5 * target_area: + best_ratio = ratio + + return best_ratio + + +# copied from Siglip2ImageProcessor +@lru_cache(maxsize=256) +def get_image_size_for_max_num_patches( + image_height: int, image_width: int, patch_size: int, max_num_patches: int, eps: float = 1e-5 +) -> tuple[int, int]: + """ + Determine image size based on max number of patches, ensure dimensions are divisible by patch size and image is at least 1 patch. + + Args: + image_height (`int`): + Original image height. + image_width (`int`): + Original image width. + patch_size (`int`): + Patch size for processing. + max_num_patches (`int`): + Maximum number of patches. + eps (`float`): + Small threshold for binary search. 
+ + Returns: + Tuple: (target_height, target_width) + """ + + def get_scaled_image_size(scale: float, size: int, patch_size: int) -> int: + scaled_size = size * scale + scaled_size = math.ceil(scaled_size / patch_size) * patch_size # make divisible by patch_size + scaled_size = max(patch_size, scaled_size) # ensure at least 1 patch + return int(scaled_size) + + # Binary search for optimal scale + scale_min, scale_max = eps / 10, 100.0 + while (scale_max - scale_min) >= eps: + scale = (scale_min + scale_max) / 2 + target_height = get_scaled_image_size(scale, image_height, patch_size) + target_width = get_scaled_image_size(scale, image_width, patch_size) + num_patches = (target_height / patch_size) * (target_width / patch_size) + + if num_patches <= max_num_patches: + scale_min = scale + else: + scale_max = scale + + scale = scale_min + target_height = get_scaled_image_size(scale, image_height, patch_size) + target_width = get_scaled_image_size(scale, image_width, patch_size) + return target_height, target_width + + +def convert_image_to_patches(images: "torch.Tensor", patch_size: int) -> "torch.Tensor": + """ + Convert 3D array image of shape (image_height, image_width, num_channels) into 2D array of patches of shape + (num_patches_height * num_patches_width, patch_size * patch_size * num_channels). + """ + batch_size, num_channels, image_height, image_width = images.shape + num_patches_height = image_height // patch_size + num_patches_width = image_width // patch_size + patched_image = images.reshape( + batch_size, num_channels, num_patches_height, patch_size, num_patches_width, patch_size + ) + patched_image = patched_image.permute(0, 2, 4, 3, 5, 1) + patched_image = patched_image.reshape(batch_size, num_patches_height * num_patches_width, -1) + return patched_image + + +def pad_along_first_dim( + images: "torch.Tensor", target_length: int, pad_value: int = 0 +) -> tuple["torch.Tensor", "torch.Tensor"]: + """ + Pad the array along the first dimension. + """ + current_length = images.shape[1] + padding_length = target_length - current_length + pixel_mask = torch.ones((target_length,), dtype=torch.int32) + if padding_length > 0: + paddings = (0, 0, 0, padding_length, 0, 0) + images = torch.nn.functional.pad(images, paddings, mode="constant", value=pad_value) + pixel_mask[-padding_length:] = 0 + return images, pixel_mask + + +class Lfm2VlFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + """ + downsample_factor (`int`, *optional*, defaults to `2`): + The downsampling factor for images used when resizing the image. 
+ """ + + downsample_factor: Optional[int] + do_image_splitting: Optional[bool] + min_tiles: Optional[int] + max_tiles: Optional[int] + use_thumbnail: Optional[bool] + min_image_tokens: Optional[int] + max_image_tokens: Optional[int] + encoder_patch_size: Optional[int] + tile_size: Optional[int] + max_pixels_tolerance: Optional[float] + do_pad: Optional[bool] + return_row_col_info: Optional[bool] + + +@auto_docstring +class Lfm2VlImageProcessorFast(BaseImageProcessorFast): + downsample_factor = 2 + do_image_splitting = True + min_tiles = 2 + max_tiles = 10 + use_thumbnail = True + min_image_tokens = 64 + max_image_tokens = 256 + encoder_patch_size = 16 + tile_size = 512 + max_pixels_tolerance = 2.0 + do_resize = True + size = {"height": 512, "width": 512} + resample = PILImageResampling.BILINEAR + do_rescale = True + rescale_factor = 1 / 255 + do_normalize = True + do_pad = True + return_row_col_info = False + image_mean = IMAGENET_STANDARD_STD + image_std = IMAGENET_STANDARD_MEAN + valid_kwargs = Lfm2VlFastImageProcessorKwargs + model_input_names = ["pixel_values", "pixel_attention_mask", "spatial_shapes"] + + def __init__(self, **kwargs: Unpack[Lfm2VlFastImageProcessorKwargs]): + super().__init__(**kwargs) + + max_thumbnail_image_patches = self.max_image_tokens * self.downsample_factor**2 + tile_size_patches = (self.tile_size // self.encoder_patch_size) ** 2 if self.do_image_splitting else 0 + self.max_num_patches = max( + max_thumbnail_image_patches, + tile_size_patches, + ) + + @lru_cache(maxsize=256) + def _target_ratios(self, min_tiles: int, max_tiles: int) -> list[tuple[int, int]]: + ratios = [ + (w, h) + for n in range(min_tiles, max_tiles + 1) + for w in range(1, n + 1) + for h in range(1, n + 1) + if min_tiles <= w * h <= max_tiles + ] + return sorted(set(ratios), key=lambda x: x[0] * x[1]) + + def _get_grid_layout( + self, + height: int, + width: int, + min_tiles: int, + max_tiles: int, + tile_size: int, + ) -> tuple[int, int]: + aspect_ratio = width / height + target_ratios = self._target_ratios(min_tiles, max_tiles) + + # find best matching grid configuration + grid_width, grid_height = find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, tile_size) + + target_width = tile_size * grid_width + target_height = tile_size * grid_height + total_patches = grid_width * grid_height + + return grid_width, grid_height, target_width, target_height, total_patches + + def crop_image_to_patches( + self, + image: "torch.Tensor", + min_tiles: int, + max_tiles: int, + tile_size: int, + use_thumbnail: bool, + thumbnail_size: tuple[int], + interpolation: "F.InterpolationMode" = None, + antialias: bool = True, + **kwargs, + ) -> "torch.Tensor": + """ + Processes a high resolution image into patches. + This method splits a high resolution image into a grid of smaller patches while trying to maintain + the original aspect ratio. It finds the optimal grid configuration within the specified tile constraints. 
+ """ + batch_size, num_channels, height, width = image.shape + grid_width, grid_height, target_width, target_height, total_patches = self._get_grid_layout( + height, width, min_tiles=min_tiles, max_tiles=max_tiles, tile_size=tile_size + ) + resized_image = F.resize( + image, (target_height, target_width), interpolation=interpolation, antialias=antialias + ) + + # split the image into patches + processed_images = ( + resized_image.unfold(2, size=tile_size, step=tile_size) + .unfold(3, size=tile_size, step=tile_size) + .contiguous() + .view(batch_size, num_channels, -1, tile_size, tile_size) + .permute(2, 0, 1, 3, 4) + .reshape(batch_size, -1, num_channels, tile_size, tile_size) + ) + + # Re-order processed images to a nested image structure, so it can be reordered back correctly + # Note that the images can't be stacked because the thumbnail image is of bigger size than patches + # Each image in sublist will be of shape (1, C, H, W) + processed_images = list(processed_images) + + if use_thumbnail and grid_width * grid_height != 1: + total_patches += 1 + thumbnail_image = F.resize(image, thumbnail_size, interpolation=interpolation, antialias=antialias) + for i in range(batch_size): + processed_images[i] = list(processed_images[i]) + list(thumbnail_image[i][None, ...]) + + return processed_images, grid_width, grid_height + + # Adapted from Qwen-VL with minor differences + def smart_resize( + self, + height: int, + width: int, + downsample_factor: int, + min_image_tokens: int, + max_image_tokens: int, + encoder_patch_size: int, + ) -> tuple[int, int]: + """ + Rescales the image so that the following conditions are met: + 1. Both dimensions (height and width) are divisible by 'encoder_patch_size' * 'downsample_factor'. + This ensures no padding is needed in the downsampling step. + 2. The total number of pixels is within the range ['smart_resize_min_pixels', 'smart_resize_max_pixels']. + 3. The aspect ratio of the image is maintained as closely as possible. 
+ """ + total_factor = encoder_patch_size * downsample_factor + smart_resize_min_pixels = min_image_tokens * encoder_patch_size**2 * downsample_factor**2 + smart_resize_max_pixels = max_image_tokens * encoder_patch_size**2 * downsample_factor**2 + + h_bar = max(total_factor, round_by_factor(height, total_factor)) + w_bar = max(total_factor, round_by_factor(width, total_factor)) + + if h_bar * w_bar > smart_resize_max_pixels: + beta = math.sqrt((height * width) / smart_resize_max_pixels) + math.floor(height / beta / total_factor) * total_factor + h_bar = max(total_factor, math.floor(height / beta / total_factor) * total_factor) + w_bar = max(total_factor, math.floor(width / beta / total_factor) * total_factor) + elif h_bar * w_bar < smart_resize_min_pixels: + beta = math.sqrt(smart_resize_min_pixels / (height * width)) + h_bar = math.ceil(height * beta / total_factor) * total_factor + w_bar = math.ceil(width * beta / total_factor) * total_factor + + return w_bar, h_bar + + def _is_image_too_large( + self, + height: int, + width: int, + max_image_tokens: int, + encoder_patch_size: int, + downsample_factor: int, + max_pixels_tolerance: float, + ) -> bool: + """Check if the image is too large to be processed as one tile.""" + total_factor = encoder_patch_size * downsample_factor + + h_bar = max(encoder_patch_size, round_by_factor(height, total_factor)) + w_bar = max(encoder_patch_size, round_by_factor(width, total_factor)) + return h_bar * w_bar > max_image_tokens * encoder_patch_size**2 * downsample_factor**2 * max_pixels_tolerance + + def resize_and_split( + self, + images: "torch.Tensor", + downsample_factor: int, + min_tiles: int, + max_tiles: int, + use_thumbnail: bool, + min_image_tokens: int, + max_image_tokens: int, + encoder_patch_size: int, + tile_size: int, + max_pixels_tolerance: float, + interpolation: "F.InterpolationMode", + ) -> "torch.Tensor": + batch_size, _, height, width = images.shape + do_image_splitting = not min_tiles == max_tiles == 1 + is_image_large = self._is_image_too_large( + height=height, + width=width, + max_image_tokens=max_image_tokens, + encoder_patch_size=encoder_patch_size, + downsample_factor=downsample_factor, + max_pixels_tolerance=max_pixels_tolerance, + ) + + new_width, new_height = self.smart_resize( + height=height, + width=width, + downsample_factor=downsample_factor, + min_image_tokens=min_image_tokens, + max_image_tokens=max_image_tokens, + encoder_patch_size=encoder_patch_size, + ) + + # Big image will be cropped into patches and small images are just resized + if is_image_large and do_image_splitting: + images, num_rows, num_cols = self.crop_image_to_patches( + images, + min_tiles=min_tiles, + max_tiles=max_tiles, + tile_size=tile_size, + thumbnail_size=(new_height, new_width), + use_thumbnail=use_thumbnail, + interpolation=interpolation, + ) + else: + num_rows = num_cols = 1 + images = F.resize(images, (new_height, new_width), interpolation=interpolation) + # Make a list and treat it as single crop per image so it can be re-grouped back correctly + images = [[image] for image in images] + + num_rows = [num_rows] * batch_size + num_cols = [num_cols] * batch_size + image_sizes = [[new_height, new_width]] * batch_size + return images, num_rows, num_cols, image_sizes + + def _preprocess( + self, + images: ImageInput, + size: SizeDict, + interpolation: "F.InterpolationMode", + do_resize: bool, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Union[float, list[float]], + image_std: Union[float, list[float]], + 
downsample_factor: int, + do_image_splitting: bool, + min_tiles: int, + max_tiles: int, + use_thumbnail: bool, + min_image_tokens: int, + max_image_tokens: int, + encoder_patch_size: int, + tile_size: int, + max_pixels_tolerance: float, + return_tensors: Union[str, TensorType], + disable_grouping: bool, + do_pad: bool, + return_row_col_info: bool, + **kwargs, + ) -> BatchFeature: + if not do_image_splitting: + min_tiles = 1 + max_tiles = 1 + logger.debug( + "Image splitting is disabled, setting min_tiles and max_tiles to 1. Set do_image_splitting=True to enable splitting." + ) + + if do_image_splitting and min_tiles > max_tiles: + raise ValueError("min_tiles must be less than or equal to max_tiles") + + max_thumbnail_image_patches = max_image_tokens * downsample_factor**2 + tile_size_patches = (tile_size // encoder_patch_size) ** 2 if do_image_splitting else 0 + max_num_patches = max( + max_thumbnail_image_patches, + tile_size_patches, + ) + + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + resized_images_grouped = {} + resized_image_sizes = {} + rows_grouped, cols_grouped = {}, {} + for shape, stacked_images in grouped_images.items(): + num_rows = [1] * stacked_images.shape[0] + num_cols = [1] * stacked_images.shape[0] + height, width = stacked_images.shape[-2:] + image_sizes = [[height, width]] * stacked_images.shape[0] + do_resize = True + + if do_resize: + stacked_images, num_rows, num_cols, image_sizes = self.resize_and_split( + stacked_images, + downsample_factor=downsample_factor, + min_tiles=min_tiles, + max_tiles=max_tiles, + use_thumbnail=use_thumbnail, + min_image_tokens=min_image_tokens, + max_image_tokens=max_image_tokens, + encoder_patch_size=encoder_patch_size, + tile_size=tile_size, + max_pixels_tolerance=max_pixels_tolerance, + interpolation=interpolation, + ) + + rows_grouped[shape] = num_rows + cols_grouped[shape] = num_cols + resized_image_sizes[shape] = image_sizes + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_images_index) + batch_rows = reorder_images(rows_grouped, grouped_images_index) + batch_cols = reorder_images(cols_grouped, grouped_images_index) + resized_image_sizes = reorder_images(resized_image_sizes, grouped_images_index) + + grouped_images, grouped_images_index = group_images_by_shape( + resized_images, disable_grouping=disable_grouping, is_nested=True + ) + + processed_images_grouped = {} + processed_masks, processed_spatial_shapes = {}, {} + for shape, stacked_images in grouped_images.items(): + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + batch_size, *_, height, width = stacked_images.shape + num_patches_height = height // encoder_patch_size + num_patches_width = width // encoder_patch_size + + stacked_images = convert_image_to_patches(stacked_images, encoder_patch_size) + processed_spatial_shapes[shape] = [[num_patches_height, num_patches_width]] * batch_size + + if do_pad: + stacked_images, pixel_mask = pad_along_first_dim(stacked_images, max_num_patches) + processed_masks[shape] = [pixel_mask] * batch_size + + processed_images_grouped[shape] = stacked_images + + processed_images = reorder_images(processed_images_grouped, grouped_images_index, is_nested=True) + data = {"pixel_values": torch.cat([torch.stack(images) for images in processed_images])} + + if do_pad: + processed_masks = 
reorder_images(processed_masks, grouped_images_index, is_nested=True) + processed_spatial_shapes = reorder_images(processed_spatial_shapes, grouped_images_index, is_nested=True) + processed_masks = torch.cat([torch.stack(masks) for masks in processed_masks]) + processed_spatial_shapes = torch.cat( + [torch.tensor(spatial_shape) for spatial_shape in processed_spatial_shapes] + ) + data.update({"pixel_attention_mask": processed_masks, "spatial_shapes": processed_spatial_shapes}) + + if return_row_col_info: + data["image_rows"] = batch_rows + data["image_cols"] = batch_cols + data["image_sizes"] = resized_image_sizes + + encoding = BatchFeature(data=data, tensor_type=return_tensors) + return encoding + + +__all__ = ["Lfm2VlImageProcessorFast"] diff --git a/src/transformers/models/lfm2_vl/modeling_lfm2_vl.py b/src/transformers/models/lfm2_vl/modeling_lfm2_vl.py new file mode 100755 index 000000000000..deee35394ee1 --- /dev/null +++ b/src/transformers/models/lfm2_vl/modeling_lfm2_vl.py @@ -0,0 +1,497 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/lfm2_vl/modular_lfm2_vl.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_lfm2_vl.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
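The fast image processor above produces exactly the inputs the model expects (`pixel_values`, `pixel_attention_mask`, `spatial_shapes`). A minimal usage sketch, assuming the class is exported as `Lfm2VlImageProcessorFast` and using a synthetic test image in place of real data:

import numpy as np
from PIL import Image

from transformers import Lfm2VlImageProcessorFast  # assumes this PR's export

processor = Lfm2VlImageProcessorFast()
image = Image.fromarray(np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8))  # synthetic RGB image

batch = processor(images=image, return_tensors="pt")
# Patch sequences padded to max_num_patches, plus the mask and per-crop patch grid the vision tower consumes.
print(batch["pixel_values"].shape)          # (num_crops, max_num_patches, encoder_patch_size**2 * 3)
print(batch["pixel_attention_mask"].shape)  # (num_crops, max_num_patches)
print(batch["spatial_shapes"])              # one (num_patches_height, num_patches_width) row per crop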
+ +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache +from ...generation import GenerationMixin +from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput +from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple +from ..auto import AutoModel +from .configuration_lfm2_vl import Lfm2VlConfig + + +class Lfm2VlMultiModalProjector(nn.Module): + def __init__(self, config: Lfm2VlConfig): + super().__init__() + in_channels = config.vision_config.hidden_size * (config.downsample_factor**2) + self.factor = config.downsample_factor + self.layer_norm = nn.LayerNorm(in_channels) + self.linear_1 = nn.Linear( + in_channels, + config.projector_hidden_size, + bias=config.projector_bias, + ) + self.act = ACT2FN[config.projector_hidden_act] + self.linear_2 = nn.Linear( + config.projector_hidden_size, + config.text_config.hidden_size, + bias=config.projector_bias, + ) + + def forward(self, image_features: torch.Tensor): + image_features = self.pixel_unshuffle(image_features) + image_features = self.layer_norm(image_features) + hidden_states = self.linear_1(image_features) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + return hidden_states + + def pixel_unshuffle(self, hidden_states: torch.Tensor): + batch_size, width, height, channels = hidden_states.size() + hidden_states = hidden_states.reshape(batch_size, width, height // self.factor, channels * self.factor) + hidden_states = hidden_states.permute(0, 2, 1, 3) + hidden_states = hidden_states.reshape( + batch_size, height // self.factor, width // self.factor, channels * self.factor**2 + ) + hidden_states = hidden_states.permute(0, 2, 1, 3) + return hidden_states + + +@auto_docstring +class Lfm2VlPreTrainedModel(PreTrainedModel): + config: Lfm2VlConfig + base_model_prefix = "" + supports_gradient_checkpointing = True + _skip_keys_device_placement = "past_key_values" + + _supports_flash_attn = True + _supports_sdpa = True + _can_compile_fullgraph = False + _supports_flex_attn = True + _supports_attention_backend = True + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for Lfm2Vl causal language model (or autoregressive) outputs. + """ +) +class Lfm2VlCausalLMOutputWithPast(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. 
+ """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for Lfm2Vl outputs, with hidden states and attentions. + """ +) +class Lfm2VlModelOutputWithPast(BaseModelOutputWithPast): + r""" + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + image_hidden_states: Optional[torch.FloatTensor] = None + + +@auto_docstring( + custom_intro=""" + The Lfm2Vl model which consists of a vision backbone and a language model, without a language modeling head. + """ +) +class Lfm2VlModel(Lfm2VlPreTrainedModel): + _checkpoint_conversion_mapping = {} + + def __init__(self, config: Lfm2VlConfig): + super().__init__(config) + self.vision_tower = AutoModel.from_config(config.vision_config) + + self.multi_modal_projector = Lfm2VlMultiModalProjector(config) + self.language_model = AutoModel.from_config(config.text_config) + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def set_decoder(self, decoder): + self.language_model = decoder + + def get_decoder(self): + return self.language_model + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + spatial_shapes: torch.Tensor, + pixel_attention_mask: torch.Tensor, + **kwargs, + ) -> list[torch.Tensor]: + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): + The tensors corresponding to the input images. + spatial_shapes (`torch.Tensor` of shape `(batch_size, 2)`): + The spatial shapes of the input images. + pixel_attention_mask (`torch.Tensor` of shape `(batch_size, height, width)`): + The pixel attention mask of the input images. + Returns: + image_features (`list[torch.Tensor]`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). 
+ """ + image_outputs = self.vision_tower( + pixel_values=pixel_values, + spatial_shapes=spatial_shapes, + pixel_attention_mask=pixel_attention_mask, + ).last_hidden_state + + img_feature_lengths = pixel_attention_mask.sum(dim=1) + image_features = [] + + for img_idx in range(image_outputs.size(0)): + feature = image_outputs[img_idx] + # unpad the image representation + feature = feature[: img_feature_lengths[img_idx], :].unsqueeze(0) + + # reshape to original height and width + feature_org_h, feature_org_w = spatial_shapes[img_idx] + feature = feature.reshape(1, feature_org_h, feature_org_w, -1) + + # project the image representation + img_embedding = self.multi_modal_projector(feature) + + # flatten here to handle variable length in naflex + img_embedding = img_embedding.reshape(-1, img_embedding.size(-1)) + image_features.append(img_embedding) + + return image_features + + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) + else: + special_image_mask = input_ids == self.config.image_token_id + + n_image_tokens = special_image_mask.sum() + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + n_image_features = image_features.shape[0] + if inputs_embeds[special_image_mask].numel() != image_features.numel(): + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + return special_image_mask + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + spatial_shapes: Optional[torch.Tensor] = None, + pixel_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, Lfm2VlModelOutputWithPast]: + r""" + spatial_shapes (`torch.Tensor` of shape `(batch_size, 2)`, *optional*): + The spatial shapes of the input images. + pixel_attention_mask (`torch.Tensor` of shape `(batch_size, height, width)`, *optional*): + The pixel attention mask of the input images. 
+ """ + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if pixel_values is not None: + image_features = self.get_image_features( + pixel_values=pixel_values, + spatial_shapes=spatial_shapes, + pixel_attention_mask=pixel_attention_mask, + ) + image_features = torch.cat(image_features, dim=0).to(inputs_embeds.device, inputs_embeds.dtype) + special_image_mask = self.get_placeholder_mask( + input_ids=input_ids, + inputs_embeds=inputs_embeds, + image_features=image_features, + ) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + return Lfm2VlModelOutputWithPast( + last_hidden_state=outputs.last_hidden_state, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + ) + + +@auto_docstring( + custom_intro=""" + The LFM2_VL model which consists of a vision backbone and a language model. + """ +) +class Lfm2VlForConditionalGeneration(Lfm2VlPreTrainedModel, GenerationMixin): + _checkpoint_conversion_mapping = {} + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: Lfm2VlConfig): + super().__init__(config) + self.model = Lfm2VlModel(config) + self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) + self.post_init() + + def get_input_embeddings(self): + return self.model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.model.set_input_embeddings(value) + + def get_output_embeddings(self) -> nn.Module: + return self.lm_head + + def set_decoder(self, decoder): + self.model.set_decoder(decoder) + + def get_decoder(self): + return self.model.get_decoder() + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + spatial_shapes: torch.Tensor, + pixel_attention_mask: torch.Tensor, + **kwargs, + ): + return self.model.get_image_features( + pixel_values=pixel_values, + spatial_shapes=spatial_shapes, + pixel_attention_mask=pixel_attention_mask, + **kwargs, + ) + + # Make modules available through conditional class for BC + @property + def language_model(self): + return self.model.language_model + + @property + def vision_tower(self): + return self.model.vision_tower + + @property + def multi_modal_projector(self): + return self.model.multi_modal_projector + + @can_return_tuple + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + spatial_shapes: Optional[torch.Tensor] = None, + pixel_attention_mask: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, Lfm2VlCausalLMOutputWithPast]: + r""" + pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, 
height, width)`, *optional*): + The input image tensors. + spatial_shapes (`torch.Tensor` of shape `(batch_size, 2)`, *optional*): + The spatial shapes of the input images. + pixel_attention_mask (`torch.Tensor` of shape `(batch_size, height, width)`, *optional*): + The pixel attention mask of the input images. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, AutoModelForImageTextToText + >>> from transformers.image_utils import load_image + + >>> model = AutoModelForImageTextToText.from_pretrained( + ... "LiquidAI/LFM2-VL-1.6B", + ... ) + >>> processor = AutoProcessor.from_pretrained( + ... "LiquidAI/LFM2-VL-1.6B", + ... ) + + >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" + >>> image = load_image(url) + + >>> conversation = [ + ... { + ... "role": "user", + ... "content": [ + ... {"type": "image", "image": image}, + ... {"type": "text", "text": "What is in this image?"}, + ... ], + ... }, + ... ] + + >>> inputs = processor.apply_chat_template( + ... conversation, + ... add_generation_prompt=True, + ... tokenize=True, + ... return_dict=True, + ... return_tensors="pt" + ... ) + + >>> # Generate + >>> outputs = model.generate(**inputs, max_new_tokens=45) + >>> processor.batch_decode(outputs, skip_special_tokens=True)[0] + 'This image depicts a vibrant street scene in what appears to be a Chinatown or similar cultural area. The focal point is a large red stop sign with white lettering, mounted on a pole.' 
+ ```""" + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + spatial_shapes=spatial_shapes, + pixel_attention_mask=pixel_attention_mask, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + vocab_size=self.config.text_config.vocab_size, + **kwargs, + ) + + return Lfm2VlCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + attention_mask=None, + cache_position=None, + logits_to_keep=None, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + if cache_position[0] == 0: + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + model_inputs["pixel_values"] = pixel_values + + return model_inputs + + +__all__ = ["Lfm2VlForConditionalGeneration", "Lfm2VlPreTrainedModel", "Lfm2VlModel"] diff --git a/src/transformers/models/lfm2_vl/modular_lfm2_vl.py b/src/transformers/models/lfm2_vl/modular_lfm2_vl.py new file mode 100644 index 000000000000..68367464c3cf --- /dev/null +++ b/src/transformers/models/lfm2_vl/modular_lfm2_vl.py @@ -0,0 +1,352 @@ +# coding=utf-8 +# Copyright 2025 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
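A note on the placeholder-replacement step in `Lfm2VlModel.forward` above: `get_placeholder_mask` expands the boolean image-token mask to the embedding width, and `masked_scatter` then writes the projected image features into those positions. The standalone sketch below is not part of this diff; the token id and tensor sizes are arbitrary example values chosen only to show the mechanism.

```python
# Illustrative sketch only: how image placeholders in the text embedding are replaced
# by projected image features, mirroring get_placeholder_mask + masked_scatter above.
# image_token_id = 5 and the tiny dimensions are made-up example values.
import torch

image_token_id = 5
input_ids = torch.tensor([[1, 5, 5, 5, 2]])        # three image placeholder tokens
inputs_embeds = torch.zeros(1, 5, 4)               # (batch, seq_len, hidden_size)
image_features = torch.arange(12.0).reshape(3, 4)  # one row per placeholder token

# Expand the token-level mask to the embedding width, as get_placeholder_mask does
special_image_mask = (input_ids == image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
# Sanity check equivalent to the ValueError raised in get_placeholder_mask
assert inputs_embeds[special_image_mask].numel() == image_features.numel()

inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
print(inputs_embeds[0, 1:4])  # positions 1-3 now carry the image features
```

Because `masked_scatter` fills masked positions in order, the number of placeholder positions must equal `image_features.numel()`, which is exactly what the check in `get_placeholder_mask` enforces.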
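Similarly, the `pixel_unshuffle` used by `Lfm2VlMultiModalProjector` (see its definition in the modular file below) trades spatial resolution for channel depth before the linear projection. A minimal shape trace, assuming an arbitrary 4x4 grid of 8-dimensional patch embeddings and `downsample_factor=2`:

```python
# Illustrative only: shape effect of Lfm2VlMultiModalProjector.pixel_unshuffle
# for an assumed downsample_factor of 2 on a 4x4 grid of 8-dim patch embeddings.
import torch

factor = 2
hidden_states = torch.randn(1, 4, 4, 8)  # (batch, width, height, channels)

batch_size, width, height, channels = hidden_states.size()
hidden_states = hidden_states.reshape(batch_size, width, height // factor, channels * factor)
hidden_states = hidden_states.permute(0, 2, 1, 3)
hidden_states = hidden_states.reshape(
    batch_size, height // factor, width // factor, channels * factor**2
)
hidden_states = hidden_states.permute(0, 2, 1, 3)

print(hidden_states.shape)  # torch.Size([1, 2, 2, 32]): 4x fewer positions, 4x more channels
```

Each `factor x factor` neighbourhood of patch embeddings is folded into a single position with `factor**2` times as many channels, which is why the projector's input width is `vision_config.hidden_size * downsample_factor**2`.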
+"""PyTorch Lfm2-VL model.""" + +from typing import Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging +from ..llava.modeling_llava import ( + LlavaCausalLMOutputWithPast, + LlavaForConditionalGeneration, + LlavaModel, + LlavaModelOutputWithPast, + LlavaPreTrainedModel, +) +from .configuration_lfm2_vl import Lfm2VlConfig + + +logger = logging.get_logger(__name__) + + +class Lfm2VlMultiModalProjector(nn.Module): + def __init__(self, config: Lfm2VlConfig): + super().__init__() + in_channels = config.vision_config.hidden_size * (config.downsample_factor**2) + self.factor = config.downsample_factor + self.layer_norm = nn.LayerNorm(in_channels) + self.linear_1 = nn.Linear( + in_channels, + config.projector_hidden_size, + bias=config.projector_bias, + ) + self.act = ACT2FN[config.projector_hidden_act] + self.linear_2 = nn.Linear( + config.projector_hidden_size, + config.text_config.hidden_size, + bias=config.projector_bias, + ) + + def forward(self, image_features: torch.Tensor): + image_features = self.pixel_unshuffle(image_features) + image_features = self.layer_norm(image_features) + hidden_states = self.linear_1(image_features) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + return hidden_states + + def pixel_unshuffle(self, hidden_states: torch.Tensor): + batch_size, width, height, channels = hidden_states.size() + hidden_states = hidden_states.reshape(batch_size, width, height // self.factor, channels * self.factor) + hidden_states = hidden_states.permute(0, 2, 1, 3) + hidden_states = hidden_states.reshape( + batch_size, height // self.factor, width // self.factor, channels * self.factor**2 + ) + hidden_states = hidden_states.permute(0, 2, 1, 3) + return hidden_states + + +class Lfm2VlPreTrainedModel(LlavaPreTrainedModel): + _can_compile_fullgraph = False + + +class Lfm2VlCausalLMOutputWithPast(LlavaCausalLMOutputWithPast): + pass + + +class Lfm2VlModelOutputWithPast(LlavaModelOutputWithPast): + pass + + +class Lfm2VlModel(LlavaModel): + _checkpoint_conversion_mapping = {} + + def __init__(self, config: Lfm2VlConfig): + super().__init__(config) + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + spatial_shapes: torch.Tensor, + pixel_attention_mask: torch.Tensor, + **kwargs, + ) -> list[torch.Tensor]: + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): + The tensors corresponding to the input images. + spatial_shapes (`torch.Tensor` of shape `(batch_size, 2)`): + The spatial shapes of the input images. + pixel_attention_mask (`torch.Tensor` of shape `(batch_size, height, width)`): + The pixel attention mask of the input images. + Returns: + image_features (`list[torch.Tensor]`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). 
+ """ + image_outputs = self.vision_tower( + pixel_values=pixel_values, + spatial_shapes=spatial_shapes, + pixel_attention_mask=pixel_attention_mask, + ).last_hidden_state + + img_feature_lengths = pixel_attention_mask.sum(dim=1) + image_features = [] + + for img_idx in range(image_outputs.size(0)): + feature = image_outputs[img_idx] + # unpad the image representation + feature = feature[: img_feature_lengths[img_idx], :].unsqueeze(0) + + # reshape to original height and width + feature_org_h, feature_org_w = spatial_shapes[img_idx] + feature = feature.reshape(1, feature_org_h, feature_org_w, -1) + + # project the image representation + img_embedding = self.multi_modal_projector(feature) + + # flatten here to handle variable length in naflex + img_embedding = img_embedding.reshape(-1, img_embedding.size(-1)) + image_features.append(img_embedding) + + return image_features + + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) + else: + special_image_mask = input_ids == self.config.image_token_id + + n_image_tokens = special_image_mask.sum() + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + n_image_features = image_features.shape[0] + if inputs_embeds[special_image_mask].numel() != image_features.numel(): + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + return special_image_mask + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + spatial_shapes: Optional[torch.Tensor] = None, + pixel_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, Lfm2VlModelOutputWithPast]: + r""" + spatial_shapes (`torch.Tensor` of shape `(batch_size, 2)`, *optional*): + The spatial shapes of the input images. + pixel_attention_mask (`torch.Tensor` of shape `(batch_size, height, width)`, *optional*): + The pixel attention mask of the input images. 
+ """ + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if pixel_values is not None: + image_features = self.get_image_features( + pixel_values=pixel_values, + spatial_shapes=spatial_shapes, + pixel_attention_mask=pixel_attention_mask, + ) + image_features = torch.cat(image_features, dim=0).to(inputs_embeds.device, inputs_embeds.dtype) + special_image_mask = self.get_placeholder_mask( + input_ids=input_ids, + inputs_embeds=inputs_embeds, + image_features=image_features, + ) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + return Lfm2VlModelOutputWithPast( + last_hidden_state=outputs.last_hidden_state, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + ) + + +class Lfm2VlForConditionalGeneration(LlavaForConditionalGeneration): + _checkpoint_conversion_mapping = {} + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + spatial_shapes: torch.Tensor, + pixel_attention_mask: torch.Tensor, + **kwargs, + ): + return self.model.get_image_features( + pixel_values=pixel_values, + spatial_shapes=spatial_shapes, + pixel_attention_mask=pixel_attention_mask, + **kwargs, + ) + + @can_return_tuple + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + spatial_shapes: Optional[torch.Tensor] = None, + pixel_attention_mask: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, Lfm2VlCausalLMOutputWithPast]: + r""" + pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`, *optional*): + The input image tensors. + spatial_shapes (`torch.Tensor` of shape `(batch_size, 2)`, *optional*): + The spatial shapes of the input images. + pixel_attention_mask (`torch.Tensor` of shape `(batch_size, height, width)`, *optional*): + The pixel attention mask of the input images. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, AutoModelForImageTextToText + >>> from transformers.image_utils import load_image + + >>> model = AutoModelForImageTextToText.from_pretrained( + ... "LiquidAI/LFM2-VL-1.6B", + ... ) + >>> processor = AutoProcessor.from_pretrained( + ... 
"LiquidAI/LFM2-VL-1.6B", + ... ) + + >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" + >>> image = load_image(url) + + >>> conversation = [ + ... { + ... "role": "user", + ... "content": [ + ... {"type": "image", "image": image}, + ... {"type": "text", "text": "What is in this image?"}, + ... ], + ... }, + ... ] + + >>> inputs = processor.apply_chat_template( + ... conversation, + ... add_generation_prompt=True, + ... tokenize=True, + ... return_dict=True, + ... return_tensors="pt" + ... ) + + >>> # Generate + >>> outputs = model.generate(**inputs, max_new_tokens=45) + >>> processor.batch_decode(outputs, skip_special_tokens=True)[0] + 'This image depicts a vibrant street scene in what appears to be a Chinatown or similar cultural area. The focal point is a large red stop sign with white lettering, mounted on a pole.' + ```""" + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + spatial_shapes=spatial_shapes, + pixel_attention_mask=pixel_attention_mask, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + vocab_size=self.config.text_config.vocab_size, + **kwargs, + ) + + return Lfm2VlCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + ) + + +__all__ = ["Lfm2VlForConditionalGeneration", "Lfm2VlPreTrainedModel", "Lfm2VlModel"] diff --git a/src/transformers/models/lfm2_vl/processing_lfm2_vl.py b/src/transformers/models/lfm2_vl/processing_lfm2_vl.py new file mode 100755 index 000000000000..12f289c266a1 --- /dev/null +++ b/src/transformers/models/lfm2_vl/processing_lfm2_vl.py @@ -0,0 +1,269 @@ +# coding=utf-8 +# Copyright 2025 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import math +from typing import Optional, Union + +from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput, make_nested_list_of_images +from ...processing_utils import ( + ImagesKwargs, + ProcessingKwargs, + ProcessorMixin, + Unpack, +) +from ...tokenization_utils_base import BatchEncoding, TextInput +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class Lfm2VlImagesKwargs(ImagesKwargs, total=False): + downsample_factor: Optional[int] + do_image_splitting: Optional[bool] + min_tiles: Optional[int] + max_tiles: Optional[int] + use_thumbnail: Optional[bool] + min_image_tokens: Optional[int] + max_image_tokens: Optional[int] + encoder_patch_size: Optional[int] + tile_size: Optional[int] + max_pixels_tolerance: Optional[float] + patch_size: Optional[int] + do_pad: Optional[bool] + return_row_col_info: Optional[bool] + + +class Lfm2VlProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: Lfm2VlImagesKwargs + + _defaults = { + "images_kwargs": { + "return_row_col_info": True, + }, + "text_kwargs": { + "use_image_special_tokens": True, + "add_special_tokens": False, + "padding": False, + "is_split_into_words": False, + }, + } + + +class Lfm2VlProcessor(ProcessorMixin): + r""" + Constructs a Lfm2Vl processor which wraps a Lfm2Tokenizer tokenizer and Lfm2VlImageProcessor into a single processor. + + [`Lfm2VlProcessor`] offers all the functionalities of [`Lfm2ImageProcessor`] and [`Lfm2Tokenizer`]. + + Args: + image_processor (`Lfm2VlImageProcessor`): + An instance of [`Lfm2VlImageProcessor`]. The image processor is a required input. + tokenizer (`PreTrainedTokenizerBase`): + An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input. + chat_template (`str`, *optional*): + A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string. + use_image_special_tokens (`bool`, *optional*, defaults to `True`): + Whether to use image special tokens or not when processing. + """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = "Lfm2VlImageProcessorFast" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor, + tokenizer, + chat_template: Optional[str] = None, + use_image_special_tokens: Optional[bool] = True, + **kwargs, + ): + self.image_token = tokenizer.image_token + self.image_token_id = tokenizer.image_token_id + self.use_image_special_tokens = use_image_special_tokens + self.image_start_token = tokenizer.image_start_token + self.image_end_token = tokenizer.image_end_token + self.image_thumbnail_token = tokenizer.image_thumbnail + super().__init__(image_processor, tokenizer, chat_template=chat_template, **kwargs) + + def __call__( + self, + images: Optional[Union[ImageInput, list[ImageInput], list[list[ImageInput]]]] = None, + text: Optional[Union[TextInput, list[TextInput]]] = None, + **kwargs: Unpack[Lfm2VlProcessorKwargs], + ) -> BatchEncoding: + """ + Processes the input prompts and returns a BatchFeature. + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`, *optional*): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. If is of type `list[ImageInput]`, it's assumed that this is for a single prompt i.e. of batch size 1. + text (`TextInput`, *optional*): + The sequence or batch of sequences to be encoded. 
+                Wherever an image token is encountered, it is expanded to a proper sequence of image tokens.
+            return_tensors (`Optional[str, TensorType]`, *optional*):
+                If set, will return tensors of a particular framework. See [`PreTrainedTokenizerFast.__call__`] for more
+                information.
+        """
+        if text is None and images is None:
+            raise ValueError("You must provide one of `text` or `images`.")
+
+        if images is not None and text is None:
+            raise ValueError(
+                "You must provide `text` when `images` is provided. Minimal text consists of a single image token."
+            )
+
+        output_kwargs = self._merge_kwargs(
+            Lfm2VlProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        if isinstance(text, str):
+            text = [text]
+        elif not isinstance(text, list) or not isinstance(text[0], str):
+            raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+
+        n_images_in_text = [sample.count(self.image_token) for sample in text]
+        if sum(n_images_in_text) > 0 and images is None:
+            raise ValueError(f"We detected {sum(n_images_in_text)} image tokens in the text but no images were passed")
+
+        inputs = {}
+        use_image_special_tokens = output_kwargs["text_kwargs"].pop("use_image_special_tokens")
+
+        if images is not None:
+            images = self.image_processor.fetch_images(images)
+            batched_images = make_nested_list_of_images(images)
+            vision_inputs = self.image_processor(batched_images, **output_kwargs["images_kwargs"])
+
+            n_images_in_images = [len(sublist) for sublist in batched_images]
+            if n_images_in_images != n_images_in_text:
+                raise ValueError(
+                    f"The number of images in the text {n_images_in_text} and images {n_images_in_images} should be the same."
+                )
+
+            text = self.expand_text_with_placeholders(
+                text,
+                batched_images,
+                image_rows=vision_inputs.pop("image_rows"),
+                image_cols=vision_inputs.pop("image_cols"),
+                image_sizes=vision_inputs.pop("image_sizes"),
+                use_image_special_tokens=use_image_special_tokens,
+                **output_kwargs["images_kwargs"],
+            )
+            inputs.update(vision_inputs)
+
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+
+        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+        inputs.update(text_inputs)
+
+        return BatchFeature(inputs, tensor_type=return_tensors)
+
+    def expand_text_with_placeholders(
+        self,
+        text: list[str],
+        images: list[list[ImageInput]],
+        image_rows: list[list[int]],
+        image_cols: list[list[int]],
+        image_sizes: list[list[int]],
+        use_image_special_tokens: bool,
+        **images_kwargs,
+    ):
+        prompt_strings = []
+
+        image_data = iter(zip(*[image_rows, image_cols, image_sizes]))
+        for sample_text, sample_images in zip(text, images):
+            split_sample = sample_text.split(self.image_token)
+            sample_text_with_image_tokens = ""
+            for i, image in enumerate(sample_images):
+                sample_text_with_image_tokens += split_sample[i]
+                if use_image_special_tokens:
+                    sample_text_with_image_tokens += self.image_start_token
+
+                rows, cols, image_size = next(image_data)
+                num_thumbnail_tokens, num_tokens_per_tile = self._get_image_num_tokens(image_size, **images_kwargs)
+
+                if rows > 1 or cols > 1:
+                    for row in range(rows):
+                        for col in range(cols):
+                            if use_image_special_tokens:
+                                sample_text_with_image_tokens += f"<|img_row_{row + 1}_col_{col + 1}|>"
+                            sample_text_with_image_tokens += self.image_token * num_tokens_per_tile
+
+                    if num_thumbnail_tokens > 0:
+                        if use_image_special_tokens:
+                            sample_text_with_image_tokens += self.image_thumbnail_token
+                        sample_text_with_image_tokens += self.image_token * 
num_thumbnail_tokens + else: + sample_text_with_image_tokens += self.image_token * num_thumbnail_tokens + + if use_image_special_tokens: + sample_text_with_image_tokens += self.image_end_token + + sample_text_with_image_tokens += split_sample[i + 1] + prompt_strings.append(sample_text_with_image_tokens) + + return prompt_strings + + def _get_image_num_tokens(self, image_size: list[int], **images_kwargs) -> tuple[int, int]: + tile_size = images_kwargs.get("tile_size", self.image_processor.tile_size) + downsample_factor = images_kwargs.get("downsample_factor", self.image_processor.downsample_factor) + encoder_patch_size = images_kwargs.get("encoder_patch_size", self.image_processor.encoder_patch_size) + use_thumbnail = images_kwargs.get("use_thumbnail", self.image_processor.use_thumbnail) + + thumbnail_tokens = 0 + if use_thumbnail: + image_height, image_width = image_size + num_patches_height = image_height // encoder_patch_size + num_patches_width = image_width // encoder_patch_size + dwn_num_patches_height = math.ceil(num_patches_height / downsample_factor) + dwn_num_patches_width = math.ceil(num_patches_width / downsample_factor) + thumbnail_tokens = dwn_num_patches_height * dwn_num_patches_width + + num_patches_tile = tile_size // encoder_patch_size + dwn_num_patches_tile = math.ceil(num_patches_tile / downsample_factor) + tile_tokens = dwn_num_patches_tile * dwn_num_patches_tile + + return thumbnail_tokens, tile_tokens + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LFM2Tokeniser's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + batched_decode_output = self.tokenizer.batch_decode(*args, **kwargs) + return batched_decode_output + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LFM2Tokeniser's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + decode_output = self.tokenizer.decode(*args, **kwargs) + return decode_output + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + + # LFM2-VL has no dedicated tokenizer class and uses the Base class with default model input names + tokenizer_input_names = [name for name in tokenizer_input_names if name != "token_type_ids"] + return list(tokenizer_input_names + image_processor_input_names) + + +__all__ = ["Lfm2VlProcessor"] diff --git a/src/transformers/models/lightglue/convert_lightglue_to_hf.py b/src/transformers/models/lightglue/convert_lightglue_to_hf.py deleted file mode 100644 index feb7c790113d..000000000000 --- a/src/transformers/models/lightglue/convert_lightglue_to_hf.py +++ /dev/null @@ -1,280 +0,0 @@ -# Copyright 2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
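For reference, the number of `image_token` placeholders written per tile by `expand_text_with_placeholders` comes from the integer arithmetic in `_get_image_num_tokens` above. A worked example with assumed values (in practice `tile_size`, `encoder_patch_size` and `downsample_factor` are read from the image processor configuration):

```python
# Worked example with assumed hyperparameters; the real values come from the
# Lfm2VlImageProcessorFast configuration.
import math

tile_size = 512            # assumed tile side length in pixels
encoder_patch_size = 16    # assumed vision encoder patch size
downsample_factor = 2      # assumed projector downsample factor

num_patches_tile = tile_size // encoder_patch_size                      # 32 patches per side
dwn_num_patches_tile = math.ceil(num_patches_tile / downsample_factor)  # 16 after pixel unshuffle
tile_tokens = dwn_num_patches_tile * dwn_num_patches_tile               # placeholder tokens per tile

print(tile_tokens)  # 256
```

Thumbnail tokens follow the same pattern, computed from the thumbnail's own height and width when `use_thumbnail` is enabled.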
-import argparse -import gc -import os -import re - -import torch -from datasets import load_dataset - -from transformers import ( - AutoModelForKeypointDetection, - LightGlueForKeypointMatching, - LightGlueImageProcessor, -) -from transformers.models.lightglue.configuration_lightglue import LightGlueConfig - - -DEFAULT_CHECKPOINT_URL = "https://github.com/cvg/LightGlue/releases/download/v0.1_arxiv/superpoint_lightglue.pth" - - -def prepare_imgs(): - dataset = load_dataset("hf-internal-testing/image-matching-test-dataset", split="train") - image0 = dataset[0]["image"] - image1 = dataset[1]["image"] - image2 = dataset[2]["image"] - # [image1, image1] on purpose to test the model early stopping - return [[image2, image0], [image1, image1]] - - -def verify_model_outputs(model, device): - images = prepare_imgs() - preprocessor = LightGlueImageProcessor() - inputs = preprocessor(images=images, return_tensors="pt").to(device) - model.to(device) - with torch.no_grad(): - outputs = model(**inputs, output_hidden_states=True, output_attentions=True) - - predicted_matches_values = outputs.matches[0, 0, 20:30] - predicted_matching_scores_values = outputs.matching_scores[0, 0, 20:30] - - predicted_number_of_matches = torch.sum(outputs.matches[0][0] != -1).item() - - expected_max_number_keypoints = 866 - expected_matches_shape = torch.Size((len(images), 2, expected_max_number_keypoints)) - expected_matching_scores_shape = torch.Size((len(images), 2, expected_max_number_keypoints)) - - expected_matches_values = torch.tensor([-1, -1, 5, -1, -1, 19, -1, 10, -1, 11], dtype=torch.int64).to(device) - expected_matching_scores_values = torch.tensor([0, 0, 0.2997, 0, 0, 0.6762, 0, 0.8826, 0, 0.5583]).to(device) - - expected_number_of_matches = 140 - - assert outputs.matches.shape == expected_matches_shape - assert outputs.matching_scores.shape == expected_matching_scores_shape - - assert torch.allclose(predicted_matches_values, expected_matches_values, atol=1e-2) - assert torch.allclose(predicted_matching_scores_values, expected_matching_scores_values, atol=1e-2) - - assert predicted_number_of_matches == expected_number_of_matches - - -ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - r"posenc.Wr": r"positional_encoder.projector", - r"self_attn.(\d+).Wqkv": r"transformer_layers.\1.self_attention.Wqkv", - r"self_attn.(\d+).out_proj": r"transformer_layers.\1.self_attention.o_proj", - r"self_attn.(\d+).ffn.0": r"transformer_layers.\1.self_mlp.fc1", - r"self_attn.(\d+).ffn.1": r"transformer_layers.\1.self_mlp.layer_norm", - r"self_attn.(\d+).ffn.3": r"transformer_layers.\1.self_mlp.fc2", - r"cross_attn.(\d+).to_qk": r"transformer_layers.\1.cross_attention.to_qk", - r"cross_attn.(\d+).to_v": r"transformer_layers.\1.cross_attention.v_proj", - r"cross_attn.(\d+).to_out": r"transformer_layers.\1.cross_attention.o_proj", - r"cross_attn.(\d+).ffn.0": r"transformer_layers.\1.cross_mlp.fc1", - r"cross_attn.(\d+).ffn.1": r"transformer_layers.\1.cross_mlp.layer_norm", - r"cross_attn.(\d+).ffn.3": r"transformer_layers.\1.cross_mlp.fc2", - r"log_assignment.(\d+).matchability": r"match_assignment_layers.\1.matchability", - r"log_assignment.(\d+).final_proj": r"match_assignment_layers.\1.final_projection", - r"token_confidence.(\d+).token.0": r"token_confidence.\1.token", -} - - -def convert_old_keys_to_new_keys(state_dict_keys: list[str]): - """ - This function should be applied only once, on the concatenated keys to efficiently rename using - the key mappings. 
- """ - output_dict = {} - if state_dict_keys is not None: - old_text = "\n".join(state_dict_keys) - new_text = old_text - for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): - if replacement is None: - new_text = re.sub(pattern, "", new_text) # an empty line - continue - new_text = re.sub(pattern, replacement, new_text) - output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) - return output_dict - - -def add_keypoint_detector_state_dict(lightglue_state_dict): - keypoint_detector = AutoModelForKeypointDetection.from_pretrained("magic-leap-community/superpoint") - keypoint_detector_state_dict = keypoint_detector.state_dict() - for k, v in keypoint_detector_state_dict.items(): - lightglue_state_dict[f"keypoint_detector.{k}"] = v - return lightglue_state_dict - - -def split_weights(state_dict): - for i in range(9): - # Remove unused r values - log_assignment_r_key = f"log_assignment.{i}.r" - if state_dict.get(log_assignment_r_key, None) is not None: - state_dict.pop(log_assignment_r_key) - - Wqkv_weight = state_dict.pop(f"transformer_layers.{i}.self_attention.Wqkv.weight") - Wqkv_bias = state_dict.pop(f"transformer_layers.{i}.self_attention.Wqkv.bias") - Wqkv_weight = Wqkv_weight.reshape(256, 3, 256) - Wqkv_bias = Wqkv_bias.reshape(256, 3) - query_weight, key_weight, value_weight = Wqkv_weight[:, 0], Wqkv_weight[:, 1], Wqkv_weight[:, 2] - query_bias, key_bias, value_bias = Wqkv_bias[:, 0], Wqkv_bias[:, 1], Wqkv_bias[:, 2] - state_dict[f"transformer_layers.{i}.self_attention.q_proj.weight"] = query_weight - state_dict[f"transformer_layers.{i}.self_attention.k_proj.weight"] = key_weight - state_dict[f"transformer_layers.{i}.self_attention.v_proj.weight"] = value_weight - state_dict[f"transformer_layers.{i}.self_attention.q_proj.bias"] = query_bias - state_dict[f"transformer_layers.{i}.self_attention.k_proj.bias"] = key_bias - state_dict[f"transformer_layers.{i}.self_attention.v_proj.bias"] = value_bias - - to_qk_weight = state_dict.pop(f"transformer_layers.{i}.cross_attention.to_qk.weight") - to_qk_bias = state_dict.pop(f"transformer_layers.{i}.cross_attention.to_qk.bias") - state_dict[f"transformer_layers.{i}.cross_attention.q_proj.weight"] = to_qk_weight - state_dict[f"transformer_layers.{i}.cross_attention.q_proj.bias"] = to_qk_bias - state_dict[f"transformer_layers.{i}.cross_attention.k_proj.weight"] = to_qk_weight - state_dict[f"transformer_layers.{i}.cross_attention.k_proj.bias"] = to_qk_bias - - return state_dict - - -@torch.no_grad() -def write_model( - model_path, - checkpoint_url, - organization, - safe_serialization=True, - push_to_hub=False, -): - os.makedirs(model_path, exist_ok=True) - - # ------------------------------------------------------------ - # LightGlue config - # ------------------------------------------------------------ - - config = LightGlueConfig( - descriptor_dim=256, - num_hidden_layers=9, - num_attention_heads=4, - ) - config.architectures = ["LightGlueForKeypointMatching"] - config.save_pretrained(model_path) - print("Model config saved successfully...") - - # ------------------------------------------------------------ - # Convert weights - # ------------------------------------------------------------ - - print(f"Fetching all parameters from the checkpoint at {checkpoint_url}...") - original_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url) - - print("Converting model...") - all_keys = list(original_state_dict.keys()) - new_keys = convert_old_keys_to_new_keys(all_keys) - - state_dict = {} - for key in all_keys: - 
new_key = new_keys[key] - state_dict[new_key] = original_state_dict.pop(key).contiguous().clone() - - del original_state_dict - gc.collect() - state_dict = split_weights(state_dict) - state_dict = add_keypoint_detector_state_dict(state_dict) - - print("Loading the checkpoint in a LightGlue model...") - device = "cuda" - with torch.device(device): - model = LightGlueForKeypointMatching(config) - model.load_state_dict(state_dict) - print("Checkpoint loaded successfully...") - del model.config._name_or_path - - print("Saving the model...") - model.save_pretrained(model_path, safe_serialization=safe_serialization) - del state_dict, model - - # Safety check: reload the converted model - gc.collect() - print("Reloading the model to check if it's saved correctly.") - model = LightGlueForKeypointMatching.from_pretrained(model_path) - print("Model reloaded successfully.") - - model_name = "lightglue" - if "superpoint" in checkpoint_url: - model_name += "_superpoint" - if checkpoint_url == DEFAULT_CHECKPOINT_URL: - print("Checking the model outputs...") - verify_model_outputs(model, device) - print("Model outputs verified successfully.") - - if push_to_hub: - print("Pushing model to the hub...") - model.push_to_hub( - repo_id=f"{organization}/{model_name}", - commit_message="Add model", - ) - config.push_to_hub(repo_id=f"{organization}/{model_name}", commit_message="Add config") - - write_image_processor(model_path, model_name, organization, push_to_hub=push_to_hub) - - -def write_image_processor(save_dir, model_name, organization, push_to_hub=False): - if "superpoint" in model_name: - image_processor = LightGlueImageProcessor(do_grayscale=True) - else: - image_processor = LightGlueImageProcessor() - image_processor.save_pretrained(save_dir) - - if push_to_hub: - print("Pushing image processor to the hub...") - image_processor.push_to_hub( - repo_id=f"{organization}/{model_name}", - commit_message="Add image processor", - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--checkpoint_url", - default=DEFAULT_CHECKPOINT_URL, - type=str, - help="URL of the original LightGlue checkpoint you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=True, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument("--save_model", action="store_true", help="Save model to local") - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Push model and image preprocessor to the hub", - ) - parser.add_argument( - "--organization", - default="ETH-CVG", - type=str, - help="Hub organization in which you want the model to be uploaded.", - ) - - args = parser.parse_args() - write_model( - args.pytorch_dump_folder_path, - args.checkpoint_url, - args.organization, - safe_serialization=True, - push_to_hub=args.push_to_hub, - ) diff --git a/src/transformers/models/lightglue/modeling_lightglue.py b/src/transformers/models/lightglue/modeling_lightglue.py index fd460e54d393..8e9faa3e4e04 100644 --- a/src/transformers/models/lightglue/modeling_lightglue.py +++ b/src/transformers/models/lightglue/modeling_lightglue.py @@ -628,6 +628,10 @@ def _concat_early_stopped_outputs( matching_scores, ): early_stops_indices = torch.stack(early_stops_indices) + # Rearrange tensors to have the same order as the input batch + ids = torch.arange(early_stops_indices.shape[0]) + order_indices = early_stops_indices[ids] + early_stops_indices = early_stops_indices[order_indices] 
matches, final_pruned_keypoints_indices = ( pad_sequence(tensor, batch_first=True, padding_value=-1) for tensor in [matches, final_pruned_keypoints_indices] diff --git a/src/transformers/models/lightglue/modular_lightglue.py b/src/transformers/models/lightglue/modular_lightglue.py index 64c36f21fef9..29441344c9cd 100644 --- a/src/transformers/models/lightglue/modular_lightglue.py +++ b/src/transformers/models/lightglue/modular_lightglue.py @@ -786,6 +786,10 @@ def _concat_early_stopped_outputs( matching_scores, ): early_stops_indices = torch.stack(early_stops_indices) + # Rearrange tensors to have the same order as the input batch + ids = torch.arange(early_stops_indices.shape[0]) + order_indices = early_stops_indices[ids] + early_stops_indices = early_stops_indices[order_indices] matches, final_pruned_keypoints_indices = ( pad_sequence(tensor, batch_first=True, padding_value=-1) for tensor in [matches, final_pruned_keypoints_indices] diff --git a/src/transformers/models/llama/convert_llama_weights_to_hf.py b/src/transformers/models/llama/convert_llama_weights_to_hf.py deleted file mode 100644 index 5267bfe9ba49..000000000000 --- a/src/transformers/models/llama/convert_llama_weights_to_hf.py +++ /dev/null @@ -1,605 +0,0 @@ -# Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import gc -import json -import os -import tempfile -import warnings - -import torch -from tokenizers import AddedToken, processors - -from transformers import GenerationConfig, LlamaConfig, LlamaForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast -from transformers.convert_slow_tokenizer import TikTokenConverter - - -try: - from transformers import LlamaTokenizerFast -except ImportError as e: - warnings.warn(e) - warnings.warn( - "The converted tokenizer will be the `slow` tokenizer. To use the fast, update your `tokenizers` library and re-run the tokenizer conversion" - ) - LlamaTokenizerFast = None - -""" -Sample usage: - -``` -python src/transformers/models/llama/convert_llama_weights_to_hf.py \ - --input_dir /path/to/downloaded/llama/weights --model_size 1B --llama_version 3.2 --output_dir /output/path -``` - -Thereafter, models can be loaded via: - -```py -from transformers import LlamaForCausalLM, LlamaTokenizer - -model = LlamaForCausalLM.from_pretrained("/output/path") -tokenizer = LlamaTokenizer.from_pretrained("/output/path") -``` - -Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions -come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). 
- -If you want your tokenizer to add a bos automatically you should update the tokenizer._tokenizers.post_processor: - -```py -from tokenizers import processors -bos = "<|begin_of_text|>" -tokenizer._tokenizers.post_processor = processors.Sequence( - [ - processors.ByteLevel(trim_offsets=False), - processors.TemplateProcessing( - single=f"{bos}:0 $A:0", - pair=f"{bos}:0 $A:0 {bos}:1 $B:1", - special_tokens=[ - (bos, tokenizer.encode(bos)), - ], - ), - ] -) -``` -""" - -NUM_SHARDS = { - "1B": 1, - "3B": 1, - "7B": 1, - "8B": 1, - "8Bf": 1, - "7Bf": 1, - "13B": 2, - "13Bf": 2, - "34B": 4, - "30B": 4, - "65B": 8, - "70B": 8, - "70Bf": 8, - "405B": 8, - "405B-MP16": 16, -} - -CONTEXT_LENGTH_FOR_VERSION = {"Guard-3": 131072, "3.2": 131072, "3.1": 131072, "3": 8192, "2": 4096, "1": 2048} - -BOS_ADDED_TOKEN = AddedToken( - "<|begin_of_text|>", single_word=False, lstrip=False, rstrip=False, normalized=False, special=True -) -EOS_ADDED_TOKEN = AddedToken( - "<|end_of_text|>", single_word=False, lstrip=False, rstrip=False, normalized=False, special=True -) -EOT_ADDED_TOKEN = AddedToken( - "<|eot_id|>", single_word=False, lstrip=False, rstrip=False, normalized=False, special=True -) - -DEFAULT_LLAMA_SPECIAL_TOKENS = { - "3": [ - "<|begin_of_text|>", - "<|end_of_text|>", - "<|reserved_special_token_0|>", - "<|reserved_special_token_1|>", - "<|reserved_special_token_2|>", - "<|reserved_special_token_3|>", - "<|start_header_id|>", - "<|end_header_id|>", - "<|reserved_special_token_4|>", - "<|eot_id|>", # end of turn - ] - + [f"<|reserved_special_token_{i}|>" for i in range(5, 256 - 5)], - "3.1": [ - "<|begin_of_text|>", - "<|end_of_text|>", - "<|reserved_special_token_0|>", - "<|reserved_special_token_1|>", - "<|finetune_right_pad_id|>", - "<|reserved_special_token_2|>", - "<|start_header_id|>", - "<|end_header_id|>", - "<|eom_id|>", # end of message - "<|eot_id|>", # end of turn - "<|python_tag|>", - ] - + [f"<|reserved_special_token_{i}|>" for i in range(3, 256 - 8)], - "3.2": [ - "<|begin_of_text|>", - "<|end_of_text|>", - "<|reserved_special_token_0|>", - "<|reserved_special_token_1|>", - "<|finetune_right_pad_id|>", - "<|reserved_special_token_2|>", - "<|start_header_id|>", - "<|end_header_id|>", - "<|eom_id|>", # end of message - "<|eot_id|>", # end of turn - "<|python_tag|>", - ] - + [f"<|reserved_special_token_{i}|>" for i in range(3, 256 - 8)], - "Guard-3": [ - "<|begin_of_text|>", - "<|end_of_text|>", - "<|reserved_special_token_0|>", - "<|reserved_special_token_1|>", - "<|finetune_right_pad_id|>", - "<|reserved_special_token_2|>", - "<|start_header_id|>", - "<|end_header_id|>", - "<|eom_id|>", # end of message - "<|eot_id|>", # end of turn - "<|python_tag|>", - ] - + [f"<|reserved_special_token_{i}|>" for i in range(3, 256 - 8)], -} - - -def is_llama_3(version): - return version in ["3", "3.1", "3.2", "Guard-3"] - - -def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256): - return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of) - - -def read_json(path): - with open(path, "r") as f: - return json.load(f) - - -def write_json(text, path): - with open(path, "w") as f: - json.dump(text, f) - - -def write_model( - model_path, - input_base_path, - model_size=None, - safe_serialization=True, - llama_version="1", - vocab_size=None, - num_shards=None, - instruct=False, - push_to_hub=False, -): - print("Converting the model.") - params = read_json(os.path.join(input_base_path, "params.json")) - num_shards = NUM_SHARDS[model_size] if num_shards 
is None else num_shards - params = params.get("model", params) - n_layers = params["n_layers"] - n_heads = params["n_heads"] - n_heads_per_shard = n_heads // num_shards - dim = params["dim"] - dims_per_head = dim // n_heads - base = params.get("rope_theta", 10000.0) - inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) - if base > 10000.0 and not is_llama_3(llama_version): - max_position_embeddings = 16384 - else: - max_position_embeddings = CONTEXT_LENGTH_FOR_VERSION[llama_version] - - if params.get("n_kv_heads", None) is not None: - num_key_value_heads = params["n_kv_heads"] # for GQA / MQA - num_key_value_heads_per_shard = num_key_value_heads // num_shards - key_value_dim = dims_per_head * num_key_value_heads - else: # compatibility with other checkpoints - num_key_value_heads = n_heads - num_key_value_heads_per_shard = n_heads_per_shard - key_value_dim = dim - - # permute for sliced rotary - def permute(w, n_heads, dim1=dim, dim2=dim): - return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) - - with tempfile.TemporaryDirectory() as tmp_model_path: - print(f"Fetching all parameters from the checkpoint at {input_base_path}.") - # Load weights - if num_shards == 1: - # Not sharded - # (The sharded implementation would also work, but this is simpler.) - loaded = torch.load( - os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu", weights_only=True - ) - else: - # Sharded - checkpoint_list = sorted([file for file in os.listdir(input_base_path) if file.endswith(".pth")]) - print("Loading in order:", checkpoint_list) - loaded = [ - torch.load(os.path.join(input_base_path, file), map_location="cpu", weights_only=True) - for file in checkpoint_list - ] - param_count = 0 - index_dict = {"weight_map": {}} - for layer_i in range(n_layers): - filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin" - if num_shards == 1: - # Unsharded - state_dict = { - f"model.layers.{layer_i}.self_attn.q_proj.weight": permute( - loaded[f"layers.{layer_i}.attention.wq.weight"], n_heads=n_heads - ), - f"model.layers.{layer_i}.self_attn.k_proj.weight": permute( - loaded[f"layers.{layer_i}.attention.wk.weight"], - n_heads=num_key_value_heads, - dim1=key_value_dim, - ), - f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"], - f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"], - f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"], - f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"], - f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"], - f"model.layers.{layer_i}.input_layernorm.weight": loaded[ - f"layers.{layer_i}.attention_norm.weight" - ], - f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[ - f"layers.{layer_i}.ffn_norm.weight" - ], - } - else: - # Sharded - # Note that attention.w{q,k,v,o}, feed_fordward.w[1,2,3], attention_norm.weight and ffn_norm.weight share - # the same storage object, saving attention_norm and ffn_norm will save other weights too, which is - # redundant as other weights will be stitched from multiple shards. To avoid that, they are cloned. 
- - state_dict = { - f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][ - f"layers.{layer_i}.attention_norm.weight" - ].clone(), - f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][ - f"layers.{layer_i}.ffn_norm.weight" - ].clone(), - } - state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute( - torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wq.weight"].view( - n_heads_per_shard, dims_per_head, dim - ) - for i in range(len(loaded)) - ], - dim=0, - ).reshape(dim, dim), - n_heads=n_heads, - ) - state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute( - torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wk.weight"].view( - num_key_value_heads_per_shard, dims_per_head, dim - ) - for i in range(len(loaded)) - ], - dim=0, - ).reshape(key_value_dim, dim), - num_key_value_heads, - key_value_dim, - dim, - ) - state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wv.weight"].view( - num_key_value_heads_per_shard, dims_per_head, dim - ) - for i in range(len(loaded)) - ], - dim=0, - ).reshape(key_value_dim, dim) - - state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(len(loaded))], dim=1 - ) - state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(len(loaded))], dim=0 - ) - state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(len(loaded))], dim=1 - ) - state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(len(loaded))], dim=0 - ) - - state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq - for k, v in state_dict.items(): - index_dict["weight_map"][k] = filename - param_count += v.numel() - torch.save(state_dict, os.path.join(tmp_model_path, filename)) - - filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin" - if num_shards == 1: - # Unsharded - state_dict = { - "model.embed_tokens.weight": loaded["tok_embeddings.weight"], - "model.norm.weight": loaded["norm.weight"], - "lm_head.weight": loaded["output.weight"], - } - else: - concat_dim = 0 if is_llama_3(llama_version) else 1 - state_dict = { - "model.norm.weight": loaded[0]["norm.weight"], - "model.embed_tokens.weight": torch.cat( - [loaded[i]["tok_embeddings.weight"] for i in range(len(loaded))], dim=concat_dim - ), - "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(len(loaded))], dim=0), - } - - for k, v in state_dict.items(): - index_dict["weight_map"][k] = filename - param_count += v.numel() - torch.save(state_dict, os.path.join(tmp_model_path, filename)) - - # Write configs - index_dict["metadata"] = {"total_size": param_count * 2} - write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json")) - ffn_dim_multiplier = params.get("ffn_dim_multiplier", 1) - multiple_of = params.get("multiple_of", 256) - - if is_llama_3(llama_version): - bos_token_id = 128000 - - if instruct: - eos_token_id = [128001, 128008, 128009] - else: - eos_token_id = 128001 - else: - bos_token_id = 1 - eos_token_id = 2 - - if llama_version in ["3.1", "3.2", "Guard-3"]: - rope_scaling = { - "factor": 32.0 if llama_version == "3.2" else 8.0, - "low_freq_factor": 1.0, - "high_freq_factor": 4.0, - 
"original_max_position_embeddings": 8192, - "rope_type": "llama3", - } - else: - rope_scaling = None - - config = LlamaConfig( - hidden_size=dim, - intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of), - num_attention_heads=params["n_heads"], - num_hidden_layers=params["n_layers"], - rms_norm_eps=params["norm_eps"], - num_key_value_heads=num_key_value_heads, - vocab_size=vocab_size, - rope_theta=base, - rope_scaling=rope_scaling, - max_position_embeddings=max_position_embeddings, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=llama_version in ["3.2"], - ) - - config.save_pretrained(tmp_model_path) - - generation_config = GenerationConfig( - do_sample=True, - temperature=0.6, - top_p=0.9, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - ) - generation_config.save_pretrained(tmp_model_path) - - # Make space so we can load the model properly now. - del state_dict - del loaded - gc.collect() - - print("Loading the checkpoint in a Llama model.") - model = LlamaForCausalLM.from_pretrained(tmp_model_path, dtype=torch.bfloat16) - - # Avoid saving this as part of the config. - del model.config._name_or_path - model.config.dtype = torch.float16 - - print("Saving in the Transformers format.") - if push_to_hub: - print("Pushing to the hub.") - model.push_to_hub(model_path, safe_serialization=safe_serialization, private=True, use_temp_dir=True) - else: - print("Saving to disk.") - model.save_pretrained(model_path, safe_serialization=safe_serialization) - - -class Llama3Converter(TikTokenConverter): - def __init__(self, vocab_file, special_tokens=None, instruct=False, llama_version="3.2", **kwargs): - super().__init__(vocab_file, additional_special_tokens=special_tokens, **kwargs) - tokenizer = self.converted() - - # References for chat templates in instruct models - templates_for_version = { - "2": ("meta-llama/Llama-2-7b-chat-hf", "f5db02db724555f92da89c216ac04704f23d4590"), - "3": ("meta-llama/Meta-Llama-3-8B-Instruct", "5f0b02c75b57c5855da9ae460ce51323ea669d8a"), - "3.1": ("meta-llama/Llama-3.1-8B-Instruct", "0e9e39f249a16976918f6564b8830bc894c89659"), - "3.2": ("meta-llama/Llama-3.2-1B-Instruct", "e9f8effbab1cbdc515c11ee6e098e3d5a9f51e14"), - "Guard-3": ("meta-llama/Llama-Guard-3-1B", "acf7aafa60f0410f8f42b1fa35e077d705892029"), - } - - # Add chat_template only if instruct is True. - # Prevents a null chat_template, which triggers - # a parsing warning in the Hub. 
- additional_kwargs = {} - if instruct or llama_version in ["Guard-3"]: - model_id, revision = templates_for_version.get(llama_version, (None, None)) - if model_id is not None: - from transformers import AutoTokenizer - - t = AutoTokenizer.from_pretrained(model_id, revision=revision) - additional_kwargs["chat_template"] = t.chat_template - - self.converted_tokenizer = PreTrainedTokenizerFast( - tokenizer_object=tokenizer, - bos_token="<|begin_of_text|>", - eos_token="<|end_of_text|>" if not instruct else "<|eot_id|>", - model_input_names=["input_ids", "attention_mask"], - model_max_length=CONTEXT_LENGTH_FOR_VERSION[llama_version], - clean_up_tokenization_spaces=True, - **additional_kwargs, - ) - self.update_post_processor(self.converted_tokenizer) - # finer special_tokens_map.json - self.converted_tokenizer._bos_token = BOS_ADDED_TOKEN - self.converted_tokenizer._eos_token = EOT_ADDED_TOKEN if instruct else EOS_ADDED_TOKEN - - # We can't do this while building the tokenizer because we have no easy access to the bos token id - def update_post_processor(self, tokenizer): - tokenizer._tokenizer.post_processor = processors.Sequence( - [ - processors.ByteLevel(trim_offsets=False), - processors.TemplateProcessing( - single="<|begin_of_text|> $A", - pair="<|begin_of_text|>:0 $A:0 <|begin_of_text|>:1 $B:1", - special_tokens=[ - ("<|begin_of_text|>", tokenizer.convert_tokens_to_ids("<|begin_of_text|>")), - ], - ), - ] - ) - - -def write_tokenizer( - tokenizer_path, input_tokenizer_path, llama_version="2", special_tokens=None, instruct=False, push_to_hub=False -): - print("Converting the tokenizer.") - tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast - if is_llama_3(llama_version): - tokenizer = Llama3Converter( - input_tokenizer_path, - special_tokens, - instruct, - llama_version, - ).converted_tokenizer - else: - try: - tokenizer = tokenizer_class(input_tokenizer_path) - except Exception: - raise ValueError( - "Failed to instantiate tokenizer. Please, make sure you have sentencepiece and protobuf installed." - ) - - if push_to_hub: - print(f"Pushing a {tokenizer_class.__name__} to the Hub repo - {tokenizer_path}.") - tokenizer.push_to_hub(tokenizer_path, private=True, use_temp_dir=True) - else: - print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.") - tokenizer.save_pretrained(tokenizer_path) - return tokenizer - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_dir", - help="Location of Llama weights, which contains tokenizer.model and model folders", - ) - parser.add_argument( - "--model_size", - default=None, - help="'f' Deprecated in favor of `num_shards`: models correspond to the finetuned versions, and are specific to the Llama2 official release. For more details on Llama2, check out the original repo: https://huggingface.co/meta-llama", - ) - parser.add_argument( - "--output_dir", - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--push_to_hub", - help="Whether or not to push the model to the hub at `output_dir` instead of saving it locally.", - action="store_true", - default=False, - ) - parser.add_argument( - "--safe_serialization", action="store_true", default=True, help="Whether or not to save using `safetensors`." - ) - # Different Llama versions used different default values for max_position_embeddings, hence the need to be able to specify which version is being used. 
- parser.add_argument( - "--llama_version", - choices=["1", "2", "3", "3.1", "3.2", "Guard-3"], - default="1", - type=str, - help="Version of the Llama model to convert. Currently supports Llama1 and Llama2. Controls the context size", - ) - parser.add_argument( - "--num_shards", - default=None, - type=int, - help="The number of individual shards used for the model. Does not have to be the same as the number of consolidated_xx.pth", - ) - parser.add_argument( - "--special_tokens", - default=None, - type=list[str], - help="The list of special tokens that should be added to the model.", - ) - parser.add_argument( - "--instruct", - action="store_true", - default=False, - help="Whether the model is an instruct model or not. Will affect special tokens and chat template.", - ) - args = parser.parse_args() - if args.model_size is None and args.num_shards is None: - raise ValueError("You have to set at least `num_shards` if you are not giving the `model_size`") - if args.special_tokens is None: - # no special tokens by default - args.special_tokens = DEFAULT_LLAMA_SPECIAL_TOKENS.get(str(args.llama_version), []) - - spm_path = os.path.join(args.input_dir, "tokenizer.model") - vocab_size = len( - write_tokenizer( - args.output_dir, - spm_path, - llama_version=args.llama_version, - special_tokens=args.special_tokens, - instruct=args.instruct, - push_to_hub=args.push_to_hub, - ) - ) - - if args.model_size != "tokenizer_only": - write_model( - model_path=args.output_dir, - input_base_path=args.input_dir, - model_size=args.model_size, - safe_serialization=args.safe_serialization, - llama_version=args.llama_version, - vocab_size=vocab_size, - num_shards=args.num_shards, - instruct=args.instruct, - push_to_hub=args.push_to_hub, - ) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/llama4/convert_llama4_weights_to_hf.py b/src/transformers/models/llama4/convert_llama4_weights_to_hf.py deleted file mode 100644 index 5af63ebc7350..000000000000 --- a/src/transformers/models/llama4/convert_llama4_weights_to_hf.py +++ /dev/null @@ -1,743 +0,0 @@ -import argparse -import gc -import io -import json -import os -import re -from typing import Optional - -import torch -from tokenizers import AddedToken, processors -from tqdm import tqdm - -from transformers import ( - GenerationConfig, - Llama4Config, - Llama4ForConditionalGeneration, - Llama4ImageProcessorFast, - Llama4Processor, - Llama4TextConfig, - Llama4VisionConfig, - PreTrainedTokenizerFast, -) -from transformers.integrations.tiktoken import TikTokenConverter - - -_OFFLINE_QUANT_COMPATIBLE = os.environ.get("OFFLINE_QUANT_COMPATIBLE", "0") == "1" - -torch.serialization.add_safe_globals([io.BytesIO]) -# fmt: off -# `None` means we drop the key - - -weight_postfix = ".weight" if _OFFLINE_QUANT_COMPATIBLE else "" -ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - # CausalLM keys - r"output.weight": r"language_model.lm_head.weight", - r"\nnorm.weight": r"\nlanguage_model.model.norm.weight", - # Model keys - r"tok_embeddings.weight": r"language_model.model.embed_tokens.weight", - r"freq_cis": None, - r"rope.freqs": None, - r"layers.(\d+).attention_norm.weight": r"language_model.model.layers.\1.input_layernorm.weight", - r"layers.(\d+).attention.wqkv.layer_norm_weight": r"language_model.model.layers.\1.input_layernorm.weight", - r"layers.(\d+).feed_forward.norm.weight": r"language_model.model.layers.\1.post_attention_layernorm.weight", - r"layers.(\d+).attention.wo.weight": r"language_model.model.layers.\1.self_attn.o_proj.weight", - 
r"layers.(\d+).attention.wqkv.weight": r"language_model.model.layers.\1.self_attn.qkv_proj.weight", - - # MoE keys: no simple MLPmodel. - r"layers.(\d+).feed_forward.experts.moe_w_in_eD_F": r"language_model.model.layers.\1.feed_forward.experts.gate_proj" + weight_postfix, # will be fused with up - r"layers.(\d+).feed_forward.experts.moe_w_out_eF_D": r"language_model.model.layers.\1.feed_forward.experts.down_proj" + weight_postfix, # expert win - r"layers.(\d+).feed_forward.experts.moe_w_swiglu_eD_F": r"language_model.model.layers.\1.feed_forward.experts.up_proj" + weight_postfix, # fused with up - r"layers.(\d+).feed_forward.router_DE": r"language_model.model.layers.\1.feed_forward.router.weight", # used for top - r"layers.(\d+).feed_forward.w_in_shared_FD": r"language_model.model.layers.\1.feed_forward.shared_expert.gate_proj", # might need to be fused for efficiency? - r"layers.(\d+).feed_forward.w_out_shared_DF": r"language_model.model.layers.\1.feed_forward.shared_expert.down_proj", # might need to be fused for efficiency? - r"layers.(\d+).feed_forward.w_swiglu_FD": r"language_model.model.layers.\1.feed_forward.shared_expert.up_proj", # might need to be fused for efficiency? - r"layers.(\d+).feed_forward.global_gate_stats_3E": None, - # Unused keys in load hooks (explicitly removed) - r'layers.(\d+).attention.wqkv._extra_state': None, - r'layers.(\d+).attention.wo._extra_state': None, - # Key apparently unused in base models - r'layers.(\d+).feed_forward.expert_activation_DE': None, - - # MLP layer variant - r"layers.(\d+).feed_forward.w1.weight": r"language_model.model.layers.\1.feed_forward.gate_proj.weight", # might need to be fused for efficiency? - r"layers.(\d+).feed_forward.w3.weight": r"language_model.model.layers.\1.feed_forward.up_proj.weight", # might need to be fused for efficiency? 
- # r"layers.(\d+).feed_forward.mlp.fc1_weight": r"language_model.model.layers.\1.feed_forward.gate_up_proj.weight", - r"layers.(\d+).feed_forward.mlp.fc2_weight": r"language_model.model.layers.\1.feed_forward.down_proj.weight", - r"layers.(\d+).feed_forward.w2.weight": r"language_model.model.layers.\1.feed_forward.down_proj.weight", - r"layers.(\d+).feed_forward.mlp.layer_norm.weight": r"language_model.model.layers.\1.post_attention_layernorm.weight", - - # Vision encoder mapping - r"vision_embeddings.vision_encoder.conv1._linear": r"vision_model.patch_embedding.linear", - r'vision_embeddings.vision_adapter.mlp.c_fc': r"vision_model.vision_adapter.mlp.fc1", - r'vision_embeddings.vision_adapter.mlp.c_proj': r"vision_model.vision_adapter.mlp.fc2", - r"vision_embeddings.vision_encoder.transformer.resblocks.(\d+).attn.wq.(weight|bias)": r"vision_model.model.layers.\1.self_attn.q_proj.\2", - r"vision_embeddings.vision_encoder.transformer.resblocks.(\d+).attn.wk.(weight|bias)": r"vision_model.model.layers.\1.self_attn.k_proj.\2", - r"vision_embeddings.vision_encoder.transformer.resblocks.(\d+).attn.wv.(weight|bias)": r"vision_model.model.layers.\1.self_attn.v_proj.\2", - r"vision_embeddings.vision_encoder.transformer.resblocks.(\d+).attn.wo.(weight|bias)": r"vision_model.model.layers.\1.self_attn.o_proj.\2", - r"vision_embeddings.vision_encoder.transformer.resblocks.(\d+).mlp.c_fc": r"vision_model.model.layers.\1.mlp.fc1", - r"vision_embeddings.vision_encoder.transformer.resblocks.(\d+).mlp.c_proj": r"vision_model.model.layers.\1.mlp.fc2", - r"vision_embeddings.vision_encoder.transformer.resblocks.(\d+).ln_1.(weight|bias)": r"vision_model.model.layers.\1.input_layernorm.\2", - r"vision_embeddings.vision_encoder.transformer.resblocks.(\d+).ln_2.(weight|bias)": r"vision_model.model.layers.\1.post_attention_layernorm.\2", - # r'vision_embeddings.vision_encoder.ln_(1|2).(weight|bias)': r'vision_model.transformer.vision_encoder.layernorm_\1.\2', - r'vision_embeddings.vision_encoder.ln_post': r'vision_model.layernorm_post', - r'vision_embeddings.vision_encoder.ln_pre': r'vision_model.layernorm_pre', - r'vision_embeddings.vision_encoder.class_embedding': r'vision_model.class_embedding', - r"vision_embeddings.vision_encoder.positional_embedding_vlm": r"vision_model.positional_embedding_vlm", - r"vision_embeddings.vision_encoder.(?=\w)": r"vision_model.model.", - r"vision_projection.weight": r"multi_modal_projector.linear_1.weight", -} -# fmt: on - - -def convert_old_keys_to_new_keys(state_dict_keys: Optional[dict] = None): - """ - This function should be applied only once, on the concatenated keys to efficiently rename using - the key mappings. 
- """ - output_dict = {} - if state_dict_keys is not None: - old_text = "\n".join(state_dict_keys) - new_text = old_text - for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): - if replacement is None: - new_text = re.sub(pattern, "", new_text) # an empty line - continue - new_text = re.sub(pattern, replacement, new_text) - output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) - return output_dict - - -def permute_for_rope(input_tensor, n_heads, dim1, dim2): - """ - When you go from the complex ROPE formulation to sin and cos one, you need - to permute the query and key weights (to avoid doing it on the fly) - """ - input_tensor = input_tensor.view(n_heads, dim1 // n_heads // 2, 2, dim2) - input_tensor = input_tensor.transpose(1, 2).reshape(dim1, dim2) - return input_tensor - - -def is_param_same_across_shards(key): - """ - Return `False` if the parameter is different across checkpoint shards - and needs to be concatenated. - """ - patterns = [ - r"language_model.layers.(\d+).(.*)layernorm.weight", - r"language_model.norm.weight", - r"router.weight", - r"feed_forward.global_gate_stats", - # not all vision weights are sharded, some are repeated - r"vision_model.class_embedding", - r"vision_model.positional_embedding_vlm", - r"vision_embeddings.vision_encoder.positional_embedding_vlm", - r"vision_model.model.layers.(\d+).self_attn.o_proj.bias", - r"vision_model.model.layers.(\d+).input_layernorm", - r"vision_model.model.layers.(\d+).post_attention_layernorm", - r"vision_model.layernorm_pre", - r"vision_model.layernorm_post", - r"vision_model.model.layers.(\d+).mlp.fc2.bias", - r"norm.weight", - ] # fmt: skip - return any(re.search(pattern, key) for pattern in patterns) - - -def get_concat_dim(key): - """ - Return the dimension to concatenate the weights on. 
- """ - concat_dim_1 = [ - # language dim 1 sharded weights - "feed_forward.router.weight", - "self_attn.o_proj", - "experts.gate_proj", - "experts.up_proj", - "expert.down_proj", - # "feed_forward.up_proj", - # "feed_forward.gate_proj", - "feed_forward.down_proj", - "global_gate_stats", - # vision dim1 sharded stuff - "mlp.fc2.weight", # covers all rowparallels across vis - ] # fmt: off - if any(re.search(pattern, key) for pattern in concat_dim_1): - return 1 - return 0 - - -def compute_intermediate_size(hidden_dim, ffn_exp=4, multiple_of=1024, ffn_dim_multiplier=1.2): - hidden_dim = ffn_exp * int(2 * hidden_dim / 3) - hidden_dim = int(ffn_dim_multiplier * hidden_dim) - hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - return hidden_dim - - -# Ignore extra info - h/t Aritra -def safe_load(filename): - # Can use weights_only because io.BytesIO was registered, but we still need to skip those objects - shard = torch.load(filename, weights_only=True, map_location="cpu", mmap=True) - shard = {k: v for k, v in shard.items() if not isinstance(v, io.BytesIO)} - return shard - - -# Unpack mlp projections - possibly to be removed when they are fused -def preprocess_keys(state_dict): - new_state_dict = {} - for key, value in state_dict.items(): - if "mlp.fc1_weight" in key: - prefix = key.split("mlp.fc1_weight")[0] - w1, w3 = value.chunk(2, dim=0) - new_state_dict[prefix + "w1.weight"] = w1 - new_state_dict[prefix + "w3.weight"] = w3 - else: - new_state_dict[key] = value - return new_state_dict - - -def max_context_length(model_path, instruct=False): - """256K for base, 1M for 128E instruct, 10M for 16E instruct.""" - if not instruct: - return 256 * 1024 - - with open(os.path.join(model_path, "params.json"), "r") as f: - params = json.load(f) - params = params.get("model", params) - if params.get("moe_args") is None: - return 8192 - num_experts = params["moe_args"]["num_experts"] - return 10485760 if num_experts == 16 else 1048576 - - -def write_model( - model_path, - input_base_path, - num_shards, - convert_checkpoints, - safe_serialization=True, - instruct=False, -): - os.makedirs(model_path, exist_ok=True) - - with open(os.path.join(input_base_path, "params.json"), "r") as f: - params = json.load(f) - - params = params.get("model", params) - dtype = "bfloat16" - - # ------------------------------------------------------------ - # Text model params and config - # ------------------------------------------------------------ - - # params from config - vocab_size = 202048 # params["vocab_size"] # seems like the lm head is 25256 so padded instead of 202048 - num_layers = params["n_layers"] - dim = params["dim"] - num_heads = params["n_heads"] - rms_norm_eps = params["norm_eps"] - rope_theta = params["rope_theta"] - no_rope_layer_interval = params["nope_layer_interval"] - attention_chunk_size = params["attention_chunk_size"] - - config_kwargs = {} - if params["use_scaled_rope"]: - # some constants from original code - rope_scaling = { - "rope_type": "llama3", - "factor": params.get("rope_scaling_factor", 8.0), - "low_freq_factor": 1.0, - "high_freq_factor": params.get("rope_high_freq_factor", 4.0), - "original_max_position_embeddings": 8192, - } - config_kwargs.update({"rope_scaling": rope_scaling}) - - if attention_chunk_size is None: - config_kwargs.update({"cache_implementation": "static"}) - - # compute additional params for weight conversion - num_heads_per_shard = num_heads // num_shards - dim_per_head = dim // num_heads - intermediate_size_mlp = 
compute_intermediate_size( - dim, - ffn_exp=params["ffn_exp"], - multiple_of=params["multiple_of"], - ffn_dim_multiplier=params["ffn_dim_multiplier"], - ) - - num_key_value_heads = params["n_kv_heads"] # for GQA / MQA - - if params.get("moe_args", False): - num_experts = params["moe_args"]["num_experts"] - interleave_moe_layer_step = params["moe_args"].get("interleave_moe_layer_step", 1) - else: - # Dense model (possibly Llama Guard) - disable all moe layers - num_experts = 0 - interleave_moe_layer_step = 0 - config_kwargs.update({"moe_layers": []}) - - # Ensure all layers are rope if `nope_layer_interval` is None - no_rope_layer_interval = params["nope_layer_interval"] - no_rope_layer_interval = num_heads * 2 if no_rope_layer_interval is None else no_rope_layer_interval - - bos_token_id = 200000 - eos_token_id = [200001, 200007, 200008] if instruct else 200001 - pad_token_id = 200018 - - text_config = Llama4TextConfig( - num_attention_heads=num_heads, - vocab_size=vocab_size, - hidden_size=dim, - rms_norm_eps=rms_norm_eps, - rope_theta=rope_theta, - num_hidden_layers=num_layers, - intermediate_size=8192, - intermediate_size_mlp=intermediate_size_mlp, - max_position_embeddings=max_context_length(input_base_path, instruct), - num_local_experts=num_experts, - interleave_moe_layer_step=interleave_moe_layer_step, - use_qk_norm=params["use_qk_norm"], - no_rope_layer_interval=no_rope_layer_interval, - attention_chunk_size=attention_chunk_size, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - tie_word_embeddings=False, # Constant set to False - dtype=dtype, - for_llm_compressor=_OFFLINE_QUANT_COMPATIBLE, - **config_kwargs, - ) - # default vision config from params - - vision_params = params["vision_args"] - vision_dim = vision_params["dim"] - vision_num_layers = vision_params["n_layers"] - image_size = vision_params["image_size"]["height"] # siglip config is outdated - vision_num_heads = vision_params["n_heads"] - - vision_output_dim = vision_params["output_dim"] - - vision_config = Llama4VisionConfig( - hidden_act="gelu", - num_hidden_layers=vision_num_layers, - image_size=image_size, - num_attention_heads=vision_num_heads, - hidden_size=vision_dim, - vision_output_dim=vision_output_dim, - ) - - config = Llama4Config(text_config=text_config, vision_config=vision_config) - config.save_pretrained(model_path) - - print("Model config saved successfully...") - - # ------------------------------------------------------------ - # Convert weights - # ------------------------------------------------------------ - - if convert_checkpoints: - print(f"Fetching all parameters from the checkpoint at {input_base_path}...") - if num_shards == 1: - if os.path.exists(os.path.join(input_base_path, "consolidated.00.pth")): - path = os.path.join(input_base_path, "consolidated.00.pth") - else: - path = os.path.join(input_base_path, "consolidated.pth") - loaded = [safe_load(path)] - else: - loaded = [ - safe_load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth")) - for i in tqdm(range(num_shards), desc="Loading shards", unit="shard") - ] - loaded = [preprocess_keys(d) for d in loaded] - - all_keys_raw = list(loaded[0].keys()) - repeated_keys = [] - sharded_keys = [] - for _key in all_keys_raw: - try: - if num_shards == 1 or (loaded[0][_key] == loaded[1][_key]).all(): - repeated_keys.append(_key) - else: - sharded_keys.append(_key) - except Exception as e: - print(f"Encountered exception {e} for {_key}") - print("Initializing an empty model") - with torch.device("meta"): 
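        # The `meta` device builds the module tree without allocating real storage; the converted
        # tensors are attached afterwards by `load_state_dict(..., assign=True)`.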
- model = Llama4ForConditionalGeneration(config) - - print("Converting model...") - all_keys = list(loaded[0].keys()) - new_keys = convert_old_keys_to_new_keys(all_keys) - state_dict = {} - replicated_params = [] # To keep track of replicated weights. - for key in tqdm(all_keys, desc="Renaming and processing all keys", unit="key"): - new_key = new_keys[key] - print(key, new_key) - if num_shards > 1 and not is_param_same_across_shards(new_key): - current_parameter = [chunk.pop(key) for chunk in loaded if not isinstance(chunk[key], io.BytesIO)] - else: - print(f"{key} (now {new_key}) is the same across all shards.") - replicated_params.append((key, new_key)) - current_parameter = [loaded[0].pop(key)] if not isinstance(loaded[0][key], io.BytesIO) else [] - - if "running_gate_stats_3E" in key: - new_keys.pop(new_key) - continue - - concat_dim = get_concat_dim(new_key) - - # Post-process the current_parameter. - if "qkv_proj" in new_key: - queries = [] - keys = [] - values = [] - for param in current_parameter: - query, key_, value = param.split( - [ - num_heads * dim_per_head // num_shards, - num_key_value_heads * dim_per_head // num_shards, - num_key_value_heads * dim_per_head // num_shards, - ] - ) - queries.append(query.reshape(num_heads_per_shard, -1, dim)) - keys.append(key_.reshape(num_key_value_heads // num_shards, -1, dim)) - values.append(value.reshape(num_key_value_heads // num_shards, -1, dim)) - - queries = torch.cat(queries, dim=0).reshape(dim, dim) - keys = torch.cat(keys, dim=0).reshape(num_key_value_heads * dim_per_head, dim) - values = torch.cat(values, dim=0).reshape(num_key_value_heads * dim_per_head, dim) - # queries = permute_for_rope(queries, num_heads, dim, dim) - # keys = permute_for_rope(keys, num_key_value_heads, num_key_value_heads*dim_per_head, dim) - - q = new_key.replace("qkv", "q") - tqdm.write(f"Processing: {key.ljust(50)} ->\t {q}, {queries.shape}") - state_dict[q] = queries - - k = new_key.replace("qkv", "k") - tqdm.write(f"Processing: {key.ljust(50)} ->\t {k}, {keys.shape}") - state_dict[k] = keys - - v = new_key.replace("qkv", "v") - tqdm.write(f"Processing: {key.ljust(50)} ->\t {v}, {values.shape}") - state_dict[v] = values - elif _OFFLINE_QUANT_COMPATIBLE and "feed_forward.experts." 
in new_key: - # for experts, we need to split expert for offline quantization purpose and don't need to fuse - expert_lists = [] - for k in current_parameter: - expert_lists.append( - list(k.reshape(num_experts, -1, k.shape[-1]).unbind(0)) - ) # [#expert * IN, OUT] -> #experts * [IN, OUT] - for i in range(num_experts): - expert = torch.cat([expert_list[i] for expert_list in expert_lists], dim=concat_dim) - expert_key = new_key.replace("experts.", f"experts.{i}.") - state_dict[expert_key] = expert.transpose(0, 1).contiguous() # [OUT, IN] - tqdm.write(f"Processing: {key.ljust(50)} ->\t {expert_key}, {state_dict[expert_key].shape}") - elif re.search(r"(gate|up)_proj", new_key): - path = new_key.split(".") - gate_key = re.sub(r"(gate|up)_proj", lambda m: "gate_proj", new_key) - up_key = re.sub(r"(gate|up)_proj", lambda m: "up_proj", new_key) - if gate_key == new_key: - state_dict[new_key] = torch.cat(current_parameter, dim=concat_dim) - elif new_key == up_key: - if "experts" not in new_key: - state_dict[new_key] = torch.cat(current_parameter, dim=concat_dim) - else: - gate_proj = state_dict.pop(gate_key) - gate_proj = [ - gate_proj.reshape(num_experts, -1, 8, 1024)[:, :, k, :].reshape(num_experts, -1, 1024) - for k in range(8) - ] - gate_proj = torch.cat(gate_proj, dim=-1) - - up_proj = [ - k.reshape(num_experts, -1, 8, 1024).reshape(num_experts, -1, 1024) - for k in current_parameter - ] - up_proj = torch.cat(up_proj, dim=-1) - - gate_up_proj = torch.cat((gate_proj, up_proj), dim=-1) - new_key = new_key.replace("up_proj", "gate_up_proj") - state_dict[new_key] = gate_up_proj.contiguous() - - tqdm.write(f"Processing: {key.ljust(50)} ->\t {new_key}, {state_dict[new_key].shape}") - elif "down_proj" in new_key: - current_parameter = torch.cat(current_parameter, dim=concat_dim) - if "experts" in new_key: - p = [] - for i in range(8): - p += [current_parameter.reshape(8, -1, 5120)[i, :, :].view(num_experts, -1, 5120)] - current_parameter = torch.cat(p, dim=1) - state_dict[new_key] = current_parameter.contiguous() - tqdm.write(f"Processing: {key.ljust(50)} ->\t {new_key}, {state_dict[new_key].shape}") - elif "router" in new_key: - current_parameter = torch.cat(current_parameter, dim=concat_dim) - state_dict[new_key] = current_parameter.transpose(0, 1) - elif "lm_head" in new_key: - current_parameter = torch.cat(current_parameter, dim=concat_dim).clone() - # TODO we need to do better than mean, works for now - # if (vocab_size - current_parameter.shape[0]) > 0: - # mean_embedding = torch.mean(current_parameter, dim=0)[:, None].repeat(vocab_size-current_parameter.shape[0],1) - # print(mean_embedding.shape) - # current_parameter = torch.cat((current_parameter, mean_embedding), dim=0) - state_dict[new_key] = current_parameter - tqdm.write( - f"Processing: {key.ljust(50)} ->\t {new_key}, {state_dict[new_key].shape}, concat dim = {concat_dim}" - ) - elif new_key == "vision_model.patch_embedding.linear.weight": - current_parameter = torch.cat(current_parameter, dim=concat_dim).clone() - # We don't reshape the patch embedding as we're using unfolded convolution as well - state_dict[new_key] = current_parameter # .reshape(-1, 3, vision_patch_size, vision_patch_size) - # generic concat for weights/select one for biases - elif isinstance(current_parameter, list) and len(current_parameter) > 0: - if not is_param_same_across_shards(new_key): - current_parameter = torch.cat(current_parameter, dim=concat_dim) - state_dict[new_key] = current_parameter - tqdm.write( - f"Processing: {key.ljust(50)} ->\t {new_key}, 
{state_dict[new_key].shape}, concat dim = {concat_dim}" - ) - elif is_param_same_across_shards(new_key): - state_dict[new_key] = current_parameter[0] - tqdm.write( - f"Processing: {key.ljust(50)} ->\t {new_key}, {state_dict[new_key].shape}, concat dim = {concat_dim}" - ) - - elif new_key == "": - # skip empty keys - continue - else: - # just load the parameter - state_dict[new_key] = current_parameter - tqdm.write( - f"Processing: {key.ljust(50)} ->\t {new_key}, {state_dict[new_key].shape}, concat dim = {concat_dim}" - ) - del loaded - gc.collect() - - print("Loading the checkpoint in a Llama4 model.") - state_dict.pop("") - model.load_state_dict(state_dict, strict=True, assign=True) - print("Model reloaded successfully.") - print("Saving the model.") - model.save_pretrained(model_path, safe_serialization=safe_serialization) - del state_dict, model - - # Safety check: reload the converted model - gc.collect() - print("Reloading the model to check if it's saved correctly.") - with torch.no_grad(): - # TODO test if we can do `tp_plan="auto"`` - model = Llama4ForConditionalGeneration.from_pretrained( - model_path, dtype=torch.bfloat16, device_map="auto", attn_implementation="eager" - ) - - model.generation_config.top_p = 0.9 - model.generation_config.temperature = 0.6 - print("Model reloaded successfully.") - - from transformers import AutoTokenizer - - tokenizer = AutoTokenizer.from_pretrained(model_path) - inputs = tokenizer(["Roses are red,"], return_tensors="pt").to(model.device) - out = model.generate(**inputs, max_new_tokens=4) - print(tokenizer.batch_decode(out)) - # generation config - if instruct: - print("Saving generation config...") - generation_config = GenerationConfig( - do_sample=True, - temperature=0.6, - top_p=0.9, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - ) - generation_config.save_pretrained(model_path) - - -BOS_ADDED_TOKEN = AddedToken( - "<|begin_of_text|>", single_word=False, lstrip=False, rstrip=False, normalized=False, special=True -) -EOS_ADDED_TOKEN = AddedToken( - "<|end_of_text|>", single_word=False, lstrip=False, rstrip=False, normalized=False, special=True -) -EOT_ADDED_TOKEN = AddedToken("<|eot|>", single_word=False, lstrip=False, rstrip=False, normalized=False, special=True) - - -def get_reserved_special_tokens(name, count, start_index=0): - return [f"<|{name}_reserved_special_token_{i}|>" for i in range(start_index, start_index + count)] - - -# 200005, ..., 200079 -LLAMA4_TEXT_POST_TRAIN_SPECIAL_TOKENS = [ - "<|header_start|>", - "<|header_end|>", - "<|eom|>", - "<|eot|>", - "<|step|>", - "<|text_post_train_reserved_special_token_0|>", - "<|text_post_train_reserved_special_token_1|>", - "<|text_post_train_reserved_special_token_2|>", - "<|text_post_train_reserved_special_token_3|>", - "<|text_post_train_reserved_special_token_4|>", - "<|text_post_train_reserved_special_token_5|>", - "<|python_start|>", - "<|python_end|>", - "<|finetune_right_pad|>", -] + get_reserved_special_tokens( - "text_post_train", 61, 8 -) # <|text_post_train_reserved_special_token_8|>, ..., <|text_post_train_reserved_special_token_68|> - -# 200080, ..., 201133 -LLAMA4_VISION_SPECIAL_TOKENS = [ - "<|image_start|>", - "<|image_end|>", - "<|vision_reserved_special_token_0|>", - "<|vision_reserved_special_token_1|>", - "<|tile_x_separator|>", - "<|tile_y_separator|>", - "<|vision_reserved_special_token_2|>", - "<|vision_reserved_special_token_3|>", - "<|vision_reserved_special_token_4|>", - "<|vision_reserved_special_token_5|>", - "<|image|>", 
- "<|vision_reserved_special_token_6|>", - "<|patch|>", -] + get_reserved_special_tokens( - "vision", 1041, 7 -) # <|vision_reserved_special_token_7|>, ..., <|vision_reserved_special_token_1047|> - -LLAMA4_SPECIAL_TOKENS = LLAMA4_TEXT_POST_TRAIN_SPECIAL_TOKENS + LLAMA4_VISION_SPECIAL_TOKENS - -BASIC_SPECIAL_TOKENS = [ - "<|begin_of_text|>", - "<|end_of_text|>", - "<|fim_prefix|>", - "<|fim_middle|>", - "<|fim_suffix|>", -] - - -class Llama4Converter(TikTokenConverter): - def __init__( - self, - vocab_file, - special_tokens: list[str], - pattern: str, - model_max_length: int = 0, - chat_template: Optional[str] = None, - **kwargs, - ): - super().__init__(vocab_file, pattern=pattern) - self.additional_special_tokens = special_tokens - tokenizer = self.converted() - if chat_template is not None: - kwargs["chat_template"] = chat_template - - self.converted_tokenizer = PreTrainedTokenizerFast( - tokenizer_object=tokenizer, - model_input_names=["input_ids", "attention_mask"], - model_max_length=model_max_length, - **kwargs, - ) - - instruct = chat_template is not None - self.update_post_processor(self.converted_tokenizer) - # finer special_tokens_map.json - self.converted_tokenizer._bos_token = BOS_ADDED_TOKEN - self.converted_tokenizer._eos_token = EOT_ADDED_TOKEN if instruct else EOS_ADDED_TOKEN - - # We can't do this while building the tokenizer because we have no easy access to the bos token id - def update_post_processor(self, tokenizer): - tokenizer._tokenizer.post_processor = processors.Sequence( - [ - processors.ByteLevel(trim_offsets=False), - processors.TemplateProcessing( - single="<|begin_of_text|> $A", - pair="<|begin_of_text|>:0 $A:0 <|begin_of_text|>:1 $B:1", - special_tokens=[ - ("<|begin_of_text|>", tokenizer.convert_tokens_to_ids("<|begin_of_text|>")), - ], - ), - ] - ) - - -O200K_PATTERN = r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+""" # noqa: E501 - - -def write_tokenizer(args): - tokenizer_path = os.path.join(args.input_dir, "tokenizer.model") - chat_template = "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %} \n {%- if messages[0]['content'] is string %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- else %}\n {#- FIXME: The processor requires an array, always. 
#}\n {%- set system_message = messages[0]['content'][0]['text']|trim %}\n {%- endif %}\n {%- set messages = messages[1:] %}\n {%- set user_supplied_system_message = true %}\n{%- else %}\n {%- set system_message = \"\" %}\n {%- set user_supplied_system_message = false %}\n{%- endif %}\n\n{#- System message if the user supplied one #}\n{%- if user_supplied_system_message %}\n {{- \"<|header_start|>system<|header_end|>\n\n\" }}\n {%- if tools is not none %}\n {{- \"Environment: ipython\n\" }}\n {%- endif %}\n {%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n {%- endif %}\n {{- system_message }}\n {{- \"<|eot|>\" }}\n{%- endif %}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|header_start|>user<|header_end|>\n\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\n\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }}\n {%- if message['content'] is string %}\n {{- message['content'] }}\n {%- else %}\n {%- for content in message['content'] %}\n {%- if content['type'] == 'image' %}\n {{- '<|image|>' }}\n {%- elif content['type'] == 'text' %}\n {{- content['text'] }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- \"<|eot|>\" }}\n {%- elif 'tool_calls' in message and message.tool_calls|length > 0 %}\n {{- '<|header_start|>assistant<|header_end|>\n\n' -}}\n {{- '<|python_start|>' }}\n {%- if message['content'] is string %}\n {{- message['content'] }}\n {%- else %}\n {%- for content in message['content'] %}\n {%- if content['type'] == 'image' %}\n {{- '<|image|>' }}\n {%- elif content['type'] == 'text' %}\n {{- content['text'] }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '<|python_end|>' }}\n {%- for tool_call in message.tool_calls %}\n {{- '{\"name\": \"' + tool_call.function.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.function.arguments | tojson }}\n {{- \"}\" }}\n {%- endfor %}\n {{- \"<|eot|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|header_start|>ipython<|header_end|>\n\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|header_start|>assistant<|header_end|>\n\n' }}\n{%- endif %}\n" - - special_tokens = BASIC_SPECIAL_TOKENS + LLAMA4_SPECIAL_TOKENS - converter = Llama4Converter( - vocab_file=tokenizer_path, - pattern=O200K_PATTERN, - special_tokens=special_tokens, - chat_template=chat_template if args.instruct else None, - bos_token="<|begin_of_text|>", - eos_token="<|end_of_text|>" if not args.instruct else "<|eot|>", - pad_token="<|finetune_right_pad_id|>", - model_max_length=max_context_length(args.input_dir, args.instruct), - ) - tokenizer = converter.converted_tokenizer - - image_processor = Llama4ImageProcessorFast() - processor = Llama4Processor( - image_processor=image_processor, - tokenizer=tokenizer, - chat_template=tokenizer.chat_template, - ) - processor.save_pretrained(args.output_dir) - del processor - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_dir", - type=str, - help="Location of the local folder copied from the Hub.", - ) - parser.add_argument( - "--output_dir", - type=str, - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--safe_serialization", default=True, type=bool, help="Whether or not to save using `safetensors`." - ) - parser.add_argument( - "--special_tokens", - default=None, - type=list[str], - help="The list of special tokens that should be added to the model.", - ) - parser.add_argument( - "--num_shards", - default=8, - type=int, - help="The number of individual shards used for the model. 
Does not have to be the same as the number of consolidated_xx.pth", - ) - parser.add_argument( - "--instruct", - action="store_true", - help="Whether the model is an instruct model", - ) - parser.add_argument( - "--convert_checkpoints", - action="store_true", - help="Whether to convert the original weights (or skip if previously converted)", - ) - - args = parser.parse_args() - write_tokenizer(args) - - write_model( - model_path=args.output_dir, - input_base_path=args.input_dir, - safe_serialization=args.safe_serialization, - num_shards=args.num_shards, - instruct=args.instruct, - convert_checkpoints=args.convert_checkpoints, - ) diff --git a/src/transformers/models/llama4/image_processing_llama4_fast.py b/src/transformers/models/llama4/image_processing_llama4_fast.py index 946fdde0a643..6506d5749d94 100644 --- a/src/transformers/models/llama4/image_processing_llama4_fast.py +++ b/src/transformers/models/llama4/image_processing_llama4_fast.py @@ -20,6 +20,7 @@ from typing import Optional, Union import torch +from torchvision.transforms.v2 import functional as F from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( @@ -33,16 +34,9 @@ from ...utils import ( TensorType, auto_docstring, - is_torchvision_v2_available, ) -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - - def get_factors(dividend: int) -> set[int]: """ Calculate all factors of a given number, i.e. a divisor that leaves diff --git a/src/transformers/models/llava/convert_llava_weights_to_hf.py b/src/transformers/models/llava/convert_llava_weights_to_hf.py deleted file mode 100644 index 936e113b0b9b..000000000000 --- a/src/transformers/models/llava/convert_llava_weights_to_hf.py +++ /dev/null @@ -1,202 +0,0 @@ -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
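# The deleted script below follows the same recipe as the other converters in this diff:
# fetch the original state dict, rename keys with a substring mapping, materialize the weights
# into a meta-initialized HF model via `assign=True`, then save or push. A minimal, self-contained
# sketch of that recipe (the mapping entries and function names here are illustrative only; the
# real table is KEYS_TO_MODIFY_MAPPING further down):

import torch

from transformers import LlavaConfig, LlavaForConditionalGeneration

EXAMPLE_KEY_MAPPING = {  # placeholder entries for illustration
    "model.mm_projector": "multi_modal_projector",
    "lm_head": "language_model.lm_head",
}


def rename_keys(state_dict: dict) -> dict:
    renamed = {}
    for key, value in state_dict.items():
        for old, new in EXAMPLE_KEY_MAPPING.items():
            if old in key:
                key = key.replace(old, new)
        renamed[key] = value
    return renamed


def convert(original_state_dict: dict, config: LlavaConfig, output_dir: str) -> None:
    with torch.device("meta"):  # build the skeleton without allocating real storage
        model = LlavaForConditionalGeneration(config)
    model.load_state_dict(rename_keys(original_state_dict), strict=True, assign=True)
    model.save_pretrained(output_dir)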
-import argparse -import glob - -import torch -from huggingface_hub import file_exists, hf_hub_download, snapshot_download -from safetensors import safe_open - -from transformers import ( - AddedToken, - AutoConfig, - AutoImageProcessor, - AutoTokenizer, - LlavaConfig, - LlavaForConditionalGeneration, - LlavaProcessor, - SiglipVisionConfig, -) - - -EPILOG_TXT = """Example: - python transformers/src/transformers/models/llava/convert_llava_weights_to_hf.py --text_model_id lmsys/vicuna-7b-v1.5 --vision_model_id openai/clip-vit-large-patch14-336 --output_hub_path org/llava-v1.5-7b-conv --old_state_dict_id liuhaotian/llava-v1.5-7b - -Example for creating the old state dict file with Python: - - import torch - from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM - - # load model - kwargs = {"device_map": "auto", "dtype": torch.float16} - model = LlavaLlamaForCausalLM.from_pretrained("liuhaotian/llava-v1.5-7b", **kwargs) - - # load vision tower - model.get_vision_tower().load_model() - - # Save state dict - torch.save(model.state_dict(), "tmp/hf_models/llava-v1.5-7b/model_state_dict.bin") -""" - -KEYS_TO_MODIFY_MAPPING = { - "model.vision_tower.": "", - ".vision_resampler": "", # all lmms-lab models do avg pooling, so no vision_resampler - "model.mm_projector": "multi_modal_projector", - "model": "model.model", - "vision_model.model": "vision_model", - "lm_head": "language_model.lm_head", - "model.model": "language_model.model", - "multi_modal_projector.0": "multi_modal_projector.linear_1", - "multi_modal_projector.2": "multi_modal_projector.linear_2", -} - - -def load_original_state_dict(model_id): - directory_path = snapshot_download(repo_id=model_id, allow_patterns=["*.safetensors"]) - - original_state_dict = {} - for path in glob.glob(f"{directory_path}/*"): - if path.endswith(".safetensors"): - with safe_open(path, framework="pt", device="cpu") as f: - for key in f.keys(): - original_state_dict[key] = f.get_tensor(key) - - # tied weights so lm.head is not saved. 
Let's clone to load state dict - if "lm_head.weight" not in original_state_dict: - original_state_dict["lm_head.weight"] = original_state_dict["model.embed_tokens.weight"].clone() - - if "model.image_newline" in original_state_dict: - # not used in the original implementation because "merge_type=flat" - del original_state_dict["model.image_newline"] - return original_state_dict - - -# used only for llava-interlave -# for ex: Qwen/Qwen1.5-0.5B-Chat google/siglip-so400m-patch14-384 lmms-lab/llava-next-interleave-qwen-0.5b -def convert_state_dict_to_hf(state_dict): - new_state_dict = {} - for key, value in state_dict.items(): - if key.endswith(".inv_freq"): - continue - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - new_state_dict[key] = value - return new_state_dict - - -def convert_llava_llama_to_hf(text_model_id, vision_model_id, output_hub_path, old_state_dict_id): - torch.set_default_dtype(torch.float16) - text_config = AutoConfig.from_pretrained(text_model_id) - - tokenizer = AutoTokenizer.from_pretrained(text_model_id) - tokenizer.add_tokens(AddedToken(" ", special=True, normalized=False), special_tokens=True) - if "Qwen" not in text_model_id: # qwen already has a pad token - tokenizer.add_special_tokens({"pad_token": " "}) - - image_processor = AutoImageProcessor.from_pretrained(vision_model_id) - processor = LlavaProcessor(tokenizer=tokenizer, image_processor=image_processor) - - if "siglip" in vision_model_id: - vision_config = SiglipVisionConfig( - hidden_size=1152, - image_size=384, - intermediate_size=4304, - num_attention_heads=16, - num_hidden_layers=26, - patch_size=14, - vision_use_head=False, - ).to_dict() - else: - vision_config = None - - config = LlavaConfig( - text_config=text_config, - vision_config=vision_config, - ) - - # llms-lab interleave models do not use any selection strategy except for last hidden state - if "Qwen" in text_model_id: - config.image_token_id = 151646 - if "siglip" in vision_model_id: - config.vision_feature_select_strategy = "full" - config.vision_feature_layer = -1 - else: - config.pad_token_id = 32001 - config.image_token_id = 32000 - - with torch.device("meta"): - model = LlavaForConditionalGeneration(config) - - # Some llava variants like microsoft/llava-med-v1.5-mistral-7b use safetensors to store weights - if file_exists(old_state_dict_id, "model_state_dict.bin"): - state_dict_path = hf_hub_download(old_state_dict_id, "model_state_dict.bin") - state_dict = torch.load(state_dict_path, map_location="cpu", weights_only=True) - else: - state_dict = load_original_state_dict(old_state_dict_id) - - state_dict = convert_state_dict_to_hf(state_dict) - model.load_state_dict(state_dict, strict=True, assign=True) - - pre_expansion_embeddings = model.language_model.model.embed_tokens.weight.data - mu = torch.mean(pre_expansion_embeddings, dim=0).float() - n = pre_expansion_embeddings.size()[0] - sigma = ((pre_expansion_embeddings - mu).T @ (pre_expansion_embeddings - mu)) / n - dist = torch.distributions.multivariate_normal.MultivariateNormal(mu, covariance_matrix=1e-5 * sigma) - - # We add an image token so we resize the model and pad to 64 for performance reasons - pad_shape = 64 - vocab_size = config.text_config.vocab_size - model.resize_token_embeddings(config.text_config.vocab_size + 2, pad_shape) - model.language_model.model.embed_tokens.weight.data[vocab_size:] = torch.stack( - tuple(dist.sample() for _ in 
range(model.language_model.model.embed_tokens.weight.data[vocab_size:].shape[0])), - dim=0, - ) - model.language_model.lm_head.weight.data[vocab_size:] = torch.stack( - tuple(dist.sample() for _ in range(model.language_model.lm_head.weight.data[vocab_size:].shape[0])), - dim=0, - ) - - model.push_to_hub(output_hub_path) - processor.push_to_hub(output_hub_path) - - -def main(): - parser = argparse.ArgumentParser( - epilog=EPILOG_TXT, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - parser.add_argument( - "--text_model_id", - help="Hub location of the text model", - ) - parser.add_argument( - "--vision_model_id", - help="Hub location of the vision model", - ) - parser.add_argument( - "--output_hub_path", - help="Location on the hub of the converted model", - ) - parser.add_argument( - "--old_state_dict_id", - help="Location on the hub of the raw state dict of the original model. The filename needs to be `model_state_dict.bin`", - ) - args = parser.parse_args() - convert_llava_llama_to_hf(args.text_model_id, args.vision_model_id, args.output_hub_path, args.old_state_dict_id) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/llava/image_processing_llava.py b/src/transformers/models/llava/image_processing_llava.py index d3aa81303bb8..5420d6fe2918 100644 --- a/src/transformers/models/llava/image_processing_llava.py +++ b/src/transformers/models/llava/image_processing_llava.py @@ -154,7 +154,7 @@ def pad_to_square( background_color: Union[int, tuple[int, int, int]] = 0, data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> np.array: + ) -> np.ndarray: """ Pads an image to a square based on the longest edge. diff --git a/src/transformers/models/llava/image_processing_llava_fast.py b/src/transformers/models/llava/image_processing_llava_fast.py index 41bb94f5b7e0..596070040549 100644 --- a/src/transformers/models/llava/image_processing_llava_fast.py +++ b/src/transformers/models/llava/image_processing_llava_fast.py @@ -17,6 +17,7 @@ from typing import Optional, Union import torch +from torchvision.transforms.v2 import functional as F from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( @@ -38,16 +39,9 @@ from ...utils import ( TensorType, auto_docstring, - is_torchvision_v2_available, ) -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - - class LlavaFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): ... diff --git a/src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py b/src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py deleted file mode 100644 index 41fc22678365..000000000000 --- a/src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py +++ /dev/null @@ -1,394 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
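# Both LLaVA converters in this diff initialize the embedding rows added by
# `resize_token_embeddings` from a multivariate normal fitted to the existing embedding matrix.
# A condensed, self-contained sketch of that pattern (the function name is ours, not the script's):

import torch


def sample_rows_like_existing(embed_weight: torch.Tensor, num_new_rows: int) -> torch.Tensor:
    """Draw `num_new_rows` vectors from a Gaussian fitted to the rows of `embed_weight`."""
    w = embed_weight.float()
    mu = w.mean(dim=0)
    sigma = (w - mu).T @ (w - mu) / w.size(0)  # empirical covariance of the embedding rows
    dist = torch.distributions.multivariate_normal.MultivariateNormal(mu, covariance_matrix=1e-5 * sigma)
    return torch.stack([dist.sample() for _ in range(num_new_rows)], dim=0)


# e.g. after `model.resize_token_embeddings(vocab_size + 2, pad_to_multiple_of=64)`:
#   embed = model.get_input_embeddings().weight.data
#   embed[vocab_size:] = sample_rows_like_existing(embed[:vocab_size], embed.shape[0] - vocab_size)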
- -"""Convert LLaVa-NeXT (LLaVa-1.6) checkpoints from the original repository. - -URL: https://github.com/haotian-liu/LLaVA/tree/main. - - -The command used to obtain original logits is the following: -python llava/eval/run_llava.py --model-path "liuhaotian/llava-v1.6-mistral-7b" --image-file "images/llava_v1_5_radar.jpg" --query "What is shown in this image?" --max_new_tokens 100 --temperature 0 - -Note: logits are tested with torch==2.1.2. -""" - -import argparse -import gc -import glob -import json -from pathlib import Path - -import requests -import torch -from accelerate import init_empty_weights -from huggingface_hub import hf_hub_download, snapshot_download -from PIL import Image -from safetensors import safe_open - -from transformers import ( - AddedToken, - AutoConfig, - AutoTokenizer, - LlavaNextConfig, - LlavaNextForConditionalGeneration, - LlavaNextImageProcessor, - LlavaNextProcessor, -) - - -KEYS_TO_MODIFY_MAPPING = { - "model.vision_tower.": "", - "model.mm_projector": "multi_modal_projector", - "model": "model.model", - "vision_model.model": "vision_model", - "lm_head": "language_model.lm_head", - "model.model": "language_model.model", - "multi_modal_projector.0": "multi_modal_projector.linear_1", - "multi_modal_projector.2": "multi_modal_projector.linear_2", - "language_model.model.image_newline": "image_newline", -} - - -def load_original_state_dict(model_id): - directory_path = snapshot_download(repo_id=model_id, allow_patterns=["*.safetensors"]) - - original_state_dict = {} - for path in glob.glob(f"{directory_path}/*"): - if path.endswith(".safetensors"): - with safe_open(path, framework="pt", device="cpu") as f: - for key in f.keys(): - original_state_dict[key] = f.get_tensor(key) - - return original_state_dict - - -def convert_state_dict_to_hf(state_dict): - new_state_dict = {} - for key, value in state_dict.items(): - if key.endswith(".inv_freq"): - continue - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - new_state_dict[key] = value.to(torch.float16) - return new_state_dict - - -def load_image(): - url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true" - image = Image.open(requests.get(url, stream=True).raw) - return image - - -def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): - # load original config - filepath = hf_hub_download(repo_id=model_id, filename="config.json", repo_type="model") - # read json - with open(filepath) as f: - data = json.load(f) - print(data) - - if model_id == "liuhaotian/llava-v1.6-mistral-7b": - text_model_id = "mistralai/Mistral-7B-Instruct-v0.2" - image_token_id = 32000 - elif model_id == "liuhaotian/llava-v1.6-vicuna-7b": - text_model_id = "lmsys/vicuna-7b-v1.5" - image_token_id = 32000 - elif model_id == "liuhaotian/llava-v1.6-vicuna-13b": - text_model_id = "lmsys/vicuna-13b-v1.5" - image_token_id = 32000 - elif model_id == "liuhaotian/llava-v1.6-34b": - text_model_id = "NousResearch/Nous-Hermes-2-Yi-34B" - image_token_id = 64000 - elif model_id == "lmms-lab/llama3-llava-next-8b": - text_model_id = "meta-llama/Meta-Llama-3-8B-Instruct" - image_token_id = 128256 - elif model_id == "lmms-lab/llava-next-72b": - text_model_id = "Qwen/Qwen1.5-72B-Chat" - image_token_id = 151646 - elif model_id == "lmms-lab/llava-next-110b": - text_model_id = "Qwen/Qwen1.5-110B-Chat" - image_token_id = 151646 - - vision_model_id = data["mm_vision_tower"] - - 
torch.set_default_dtype(torch.float16) - text_config = AutoConfig.from_pretrained(text_model_id) - - use_fast = model_id != "liuhaotian/llava-v1.6-34b" - tokenizer = AutoTokenizer.from_pretrained(text_model_id, use_fast=use_fast) - tokenizer.add_tokens(AddedToken(" ", special=True, normalized=False), special_tokens=True) - - if model_id in ("liuhaotian/llava-v1.6-mistral-7b", "lmms-lab/llama3-llava-next-8b"): - # Mistral-7B doesn't have a padding token set yet - tokenizer.add_special_tokens({"pad_token": " "}) - - image_processor = LlavaNextImageProcessor.from_pretrained(vision_model_id) - processor = LlavaNextProcessor(tokenizer=tokenizer, image_processor=image_processor) - - config = LlavaNextConfig( - text_config=text_config.to_dict(), - image_grid_pinpoints=image_processor.image_grid_pinpoints, - use_image_newline_parameter=True, - image_token_id=image_token_id, - ) - - with init_empty_weights(): - model = LlavaNextForConditionalGeneration(config) - - # load original state dict - state_dict = load_original_state_dict(model_id) - state_dict = convert_state_dict_to_hf(state_dict) - model.load_state_dict(state_dict, assign=True) - model.eval() - - pre_expansion_embeddings = model.language_model.model.embed_tokens.weight.data - mu = torch.mean(pre_expansion_embeddings, dim=0).float() - n = pre_expansion_embeddings.size()[0] - sigma = ((pre_expansion_embeddings - mu).T @ (pre_expansion_embeddings - mu)) / n - dist = torch.distributions.multivariate_normal.MultivariateNormal(mu, covariance_matrix=1e-5 * sigma) - - # We add an image token so we resize the model - # Pad to 64 for performance reasons - # Qwen-based models have extra unused space in the vocab size already, so no need to resize - if model_id not in ["lmms-lab/llava-next-72b", "lmms-lab/llava-next-110b"]: - pad_shape = 64 - vocab_size = config.text_config.vocab_size - if model_id == "liuhaotian/llava-v1.6-34b": - # this one has 3 additional tokens, namely <|startoftext|>, <|endoftext|> and - num_tokens = vocab_size + 3 - else: - # this one has 2 additional tokens, namely and - num_tokens = vocab_size + 2 - model.resize_token_embeddings(num_tokens, pad_to_multiple_of=pad_shape) - model.language_model.model.embed_tokens.weight.data[vocab_size:] = torch.stack( - tuple( - dist.sample() for _ in range(model.language_model.model.embed_tokens.weight.data[vocab_size:].shape[0]) - ), - dim=0, - ) - model.language_model.lm_head.weight.data[vocab_size:] = torch.stack( - tuple(dist.sample() for _ in range(model.language_model.lm_head.weight.data[vocab_size:].shape[0])), - dim=0, - ) - - print(f"Saving model and processor for {model_id} to {pytorch_dump_folder_path}") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - # Make space so we can load the model properly now. - del state_dict - gc.collect() - - # Load everything back for inference tests in float32 because prev script was written as that - # Though it's mostly loaded in fp16 as original weights are in fp16 - model = LlavaNextForConditionalGeneration.from_pretrained(pytorch_dump_folder_path, device_map="auto") - processor = LlavaNextProcessor.from_pretrained(pytorch_dump_folder_path) - device = model.device - - # prepare inputs - image = load_image() - if model_id == "liuhaotian/llava-v1.6-mistral-7b": - prompt = "[INST] \nWhat is shown in this image? 
[/INST]" - elif model_id in ["liuhaotian/llava-v1.6-vicuna-7b", "liuhaotian/llava-v1.6-vicuna-13b"]: - prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: \nWhat is shown in this image? ASSISTANT:" - elif model_id == "liuhaotian/llava-v1.6-34b": - prompt = "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n \nWhat is shown in this image?<|im_end|><|im_start|>assistant\n" - elif model_id == "lmms-lab/llama3-llava-next-8b": - prompt = "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.<|eot_id|><|start_header_id|><|start_header_id|>user<|end_header_id|>\n\n \nWhat is shown in this image?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" - elif model_id in ["lmms-lab/llava-next-72b", "lmms-lab/llava-next-110b"]: - prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n \nWhat is shown in this image?<|im_end|>\n<|im_start|>assistant\n" - - inputs = processor(images=image, text=prompt, return_tensors="pt") - - # verify inputs - filepath = hf_hub_download(repo_id="nielsr/test-image", filename="llava_1_6_pixel_values.pt", repo_type="dataset") - original_pixel_values = torch.load(filepath, map_location="cpu", weights_only=True) - assert torch.allclose(original_pixel_values, inputs.pixel_values.half()) - - if model_id == "liuhaotian/llava-v1.6-mistral-7b": - filepath = hf_hub_download(repo_id="nielsr/test-image", filename="llava_1_6_input_ids.pt", repo_type="dataset") - original_input_ids = torch.load(filepath, map_location="cpu", weights_only=True) - # replace -200 by image_token_id (since we use token ID = 32000 for the image token) - original_input_ids[original_input_ids == -200] = image_token_id - assert original_input_ids[0].tolist() == inputs.input_ids[0].tolist() - - elif model_id == "liuhaotian/llava-v1.6-34b": - filepath = hf_hub_download( - repo_id="nielsr/test-image", filename="llava_1_6_34b_input_ids.pt", repo_type="dataset" - ) - original_input_ids = torch.load(filepath, map_location="cpu", weights_only=True) - # replace -200 by image_token_id - original_input_ids[original_input_ids == -200] = image_token_id - - assert original_input_ids[0].tolist() == inputs.input_ids[0].tolist() - - image_sizes = torch.tensor([[899, 1024]]) - assert image_sizes[0].tolist() == inputs.image_sizes[0].tolist() - - # verify single forward pass - print("Single forward pass") - with torch.inference_mode(): - inputs = inputs.to(device) - outputs = model(**inputs) - print("Shape of logits:", outputs.logits.shape) - print("First values of logits:", outputs.logits[0, :3, :3]) - - if model_id == "liuhaotian/llava-v1.6-mistral-7b": - expected_slice = torch.tensor( - [[-4.8555, -4.6992, -0.1996], [-10.5703, -10.7344, -2.7246], [-7.0391, -7.3672, -0.2634]], - dtype=torch.float32, - device=device, - ) - elif model_id == "liuhaotian/llava-v1.6-vicuna-7b": - expected_slice = torch.tensor( - [[1.4883, 0.9976, -0.6992], [-9.7031, -5.7031, -1.5557], [-5.1328, -5.5586, 8.8281]], - dtype=torch.float32, - device=device, - ) - elif model_id == "liuhaotian/llava-v1.6-vicuna-13b": - expected_slice = torch.tensor( - [[-0.9614, 7.3125, 0.2106], [-7.2695, -8.5469, 3.6211], [-6.3750, -8.1875, 5.4688]], - dtype=torch.float32, - device=device, - ) - elif model_id == 
"liuhaotian/llava-v1.6-34b": - expected_slice = torch.tensor( - [[-9.0859, -9.1406, 5.9453], [-5.9570, -5.9766, 2.2754], [-5.7305, -5.7539, 4.0000]], - dtype=torch.float32, - device=device, - ) - elif model_id == "lmms-lab/llama3-llava-next-8b": - expected_slice = torch.tensor( - [[-3.9648, 1.1396, 3.3145], [-5.3594, -1.5654, -1.9619], [-12.3750, -10.6797, -9.3125]], - dtype=torch.float32, - device=device, - ) - elif model_id == "lmms-lab/llava-next-72b": - # Not yet checked against reference - expected_slice = torch.tensor( - [[3.7148, 3.9277, 3.4395], [-0.4341, 1.1387, 6.5117], [3.2324, 3.4688, 4.1133]], - dtype=torch.float32, - device=device, - ) - elif model_id == "lmms-lab/llava-next-110b": - # Not yet checked against reference - expected_slice = torch.tensor( - [[-2.5449, -1.6738, -2.0371], [1.0811, 3.4961, 5.0312], [1.7803, 2.5137, 2.4277]], - dtype=torch.float32, - device=device, - ) - else: - raise ValueError(f"Model {model_id} not supported") - - assert torch.allclose(outputs.logits[0, :3, :3], expected_slice, atol=1e-4) - print("Logits are ok!") - - # verify generation - output_ids = model.generate( - **inputs, - max_new_tokens=100, - use_cache=True, - ) - - generated_text = processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip() - - print("Generated text:", repr(generated_text)) - - if model_id == "liuhaotian/llava-v1.6-mistral-7b": - expected_text = '[INST] \nWhat is shown in this image? [/INST] The image appears to be a radar chart, which is a type of multi-dimensional plot that displays data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point.\n\nIn this particular radar chart, there are several axes labeled with different metrics or benchmarks, such as "MMM-Vet," "MMM-Bench," "LLaVA-Bench," "SLED-Bench," "' - elif model_id == "liuhaotian/llava-v1.6-vicuna-7b": - expected_text = """A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human\'s questions. USER: \nWhat is shown in this image? ASSISTANT: The image appears to be a graphical representation of a benchmarking study comparing the performance of various models or systems. It\'s a scatter plot with a circular layout, where each point represents a different model or system, and the axes represent different metrics or dimensions of comparison.\n\nThe metrics are likely related to machine learning or artificial intelligence performance, as indicated by the terms like "BLIP-2," "Instruct BLIP," "POE," "QWA," "V""" - elif model_id == "liuhaotian/llava-v1.6-vicuna-13b": - expected_text = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: \nWhat is shown in this image? ASSISTANT: The image appears to be a radar chart, also known as a spider chart or star chart, which is a graphical method of displaying multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point.\n\nIn this particular radar chart, there are several variables represented:\n\n- MM-Vet\n- LLa-Va-Bench\n- SEED-Bench\n- MM" - elif model_id == "liuhaotian/llava-v1.6-34b": - expected_text = "<|im_start|> system\nAnswer the questions. <|im_start|> user\n\nWhat is shown in this image? 
<|im_start|> assistant\nThe image appears to be a radar chart, also known as a spider chart, which is a graphical method of displaying multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point.\n\nIn this particular chart, there are several datasets represented by different colors and labeled with various acronyms such as MM-Vet, LLaVA-Bench, SEED-Bench, MM-Bench-CN, MM-" - elif model_id == "lmms-lab/llama3-llava-next-8b": - expected_text = 'system\n\nYou are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.user\n\n\nWhat is shown in this image?assistant\n\n\nThe image shows a radar chart, also known as a spider chart or a web chart, which is a type of graph used to display multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point. Each axis represents a different variable, and the values are plotted along each axis and connected to form a polygon.\n\nIn this particular radar chart, there are several axes labeled with different variables, such as "MM-Vet," "LL' - elif model_id == "lmms-lab/llava-next-72b": - expected_text = "system\nYou are a helpful assistant.\nuser\n\nWhat is shown in this image?\nassistant\nThe image displays a radar chart, also known as a spider chart or a star chart, which is a graphical method of displaying multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point. Each axis represents a different variable, and the value of each variable is represented by the distance from the center of the chart to the point where the axis intersects with the line representing that variable's value.\n\nIn this particular chart, there are several axes" - elif model_id == "lmms-lab/llava-next-110b": - expected_text = "system\nYou are a helpful assistant.\nuser\n\nWhat is shown in this image?\nassistant\nThe image shows a radar chart comparing the performance of different models on various visual question answering (VQA) benchmarks. Each colored line represents a different model, and the distance from the center of the chart indicates the score or performance level of the model on a particular benchmark. The benchmarks are labeled around the edges of the chart, and include VQA v2, GQA, VizWiz, TextVQA, MMBench-CN, MME, and others. 
The chart allows for a" - else: - raise ValueError(f"Model {model_id} not supported") - - assert generated_text == expected_text - print("Generated text is ok!") - - # verify batched generation - print("Batched generation...") - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - cats_image = Image.open(requests.get(url, stream=True).raw) - - inputs = processor( - images=[image, cats_image], - text=[prompt, prompt], - padding=True, - return_tensors="pt", - ).to(device) - - for k, v in inputs.items(): - print(k, v.shape) - - print("Image sizes:", inputs.image_sizes) - - # make sure image_sizes are the same - # as otherwise batched generation doesn't work - inputs.image_sizes[1] = inputs.image_sizes[0] - - print("Batched generation...") - output_ids = model.generate( - **inputs, - max_new_tokens=20, - use_cache=True, - ) - - outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True) - print(outputs) - - if push_to_hub: - checkpoint_name = model_id.split("/")[-1] - print(f"Pushing to repo llava-hf/{checkpoint_name}-hf") - model.push_to_hub(f"llava-hf/{checkpoint_name}-hf") - processor.push_to_hub(f"llava-hf/{checkpoint_name}-hf") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_id", - help="Hub location of the model to convert", - default="liuhaotian/llava-v1.6-mistral-7b", - choices=[ - "liuhaotian/llava-v1.6-mistral-7b", - "liuhaotian/llava-v1.6-vicuna-7b", - "liuhaotian/llava-v1.6-vicuna-13b", - "liuhaotian/llava-v1.6-34b", - "lmms-lab/llama3-llava-next-8b", - "lmms-lab/llava-next-72b", - "lmms-lab/llava-next-110b", - ], - required=False, - ) - parser.add_argument( - "--pytorch_dump_folder_path", type=str, required=True, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." - ) - args = parser.parse_args() - - convert_llava_to_hf(args.model_id, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/llava_next/image_processing_llava_next.py b/src/transformers/models/llava_next/image_processing_llava_next.py index 3887c9c7ad4b..350ce9db7dc6 100644 --- a/src/transformers/models/llava_next/image_processing_llava_next.py +++ b/src/transformers/models/llava_next/image_processing_llava_next.py @@ -58,12 +58,12 @@ from PIL import Image -def divide_to_patches(image: np.ndarray, patch_size: int, input_data_format) -> list[np.array]: +def divide_to_patches(image: np.ndarray, patch_size: int, input_data_format) -> list[np.ndarray]: """ Divides an image into patches of a specified size. Args: - image (`np.array`): + image (`np.ndarray`): The input image. patch_size (`int`): The size of each patch. @@ -71,7 +71,7 @@ def divide_to_patches(image: np.ndarray, patch_size: int, input_data_format) -> The channel dimension format of the input image. Returns: - list: A list of np.array representing the patches. + list: A list of np.ndarray representing the patches. """ patches = [] height, width = get_image_size(image, channel_dim=input_data_format) @@ -86,7 +86,7 @@ def divide_to_patches(image: np.ndarray, patch_size: int, input_data_format) -> return patches -def expand_to_square(image: np.ndarray, background_color, input_data_format) -> np.array: +def expand_to_square(image: np.ndarray, background_color, input_data_format) -> np.ndarray: """ Expands an image to a square by adding a background color. 
""" @@ -400,12 +400,12 @@ def _preprocess( def _resize_for_patching( self, image: np.ndarray, target_resolution: tuple, resample, input_data_format: ChannelDimension - ) -> np.array: + ) -> np.ndarray: """ Resizes an image to a target resolution while maintaining aspect ratio. Args: - image (np.array): + image (np.ndarray): The input image. target_resolution (tuple): The target resolution (height, width) of the image. @@ -415,7 +415,7 @@ def _resize_for_patching( The channel dimension format of the input image. Returns: - np.array: The resized and padded image. + np.ndarray: The resized and padded image. """ new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format) @@ -433,7 +433,7 @@ def _get_padding_size(self, original_resolution: tuple, target_resolution: tuple def _pad_for_patching( self, image: np.ndarray, target_resolution: tuple, input_data_format: ChannelDimension - ) -> np.array: + ) -> np.ndarray: """ Pad an image to a target resolution while maintaining aspect ratio. """ @@ -453,12 +453,12 @@ def get_image_patches( resample: PILImageResampling, data_format: ChannelDimension, input_data_format: ChannelDimension, - ) -> list[np.array]: + ) -> list[np.ndarray]: """ Process an image with variable resolutions by dividing it into patches. Args: - image (np.array): + image (np.ndarray): The input image to be processed. grid_pinpoints (List): A string representation of a list of possible resolutions. @@ -474,7 +474,7 @@ def get_image_patches( The channel dimension format of the input image. Returns: - list[np.array]: A list of NumPy arrays containing the processed image patches. + list[np.ndarray]: A list of NumPy arrays containing the processed image patches. """ if not isinstance(grid_pinpoints, list): raise TypeError("grid_pinpoints must be a list of possible resolutions.") diff --git a/src/transformers/models/llava_next/image_processing_llava_next_fast.py b/src/transformers/models/llava_next/image_processing_llava_next_fast.py index b502d98d6ac3..df20e2b90e83 100644 --- a/src/transformers/models/llava_next/image_processing_llava_next_fast.py +++ b/src/transformers/models/llava_next/image_processing_llava_next_fast.py @@ -17,6 +17,7 @@ from typing import Optional, Union import torch +from torchvision.transforms.v2 import functional as F from ...image_processing_utils import BatchFeature, get_patch_output_size, select_best_resolution from ...image_processing_utils_fast import ( @@ -39,16 +40,9 @@ from ...utils import ( TensorType, auto_docstring, - is_torchvision_v2_available, ) -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - - class LlavaNextFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): """ image_grid_pinpoints (`list[list[int]]`, *optional*): diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 8cca63f4a66c..a75b4b798107 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -409,8 +409,6 @@ def get_image_features( if vision_feature_select_strategy == "default": selected_image_feature = selected_image_feature[:, 1:] - elif vision_feature_select_strategy == "full": - selected_image_feature = selected_image_feature image_features = self.multi_modal_projector(selected_image_feature) image_features = torch.split(image_features, image_num_patches, dim=0) diff --git 
a/src/transformers/models/llava_next_video/convert_llava_next_video_weights_to_hf.py b/src/transformers/models/llava_next_video/convert_llava_next_video_weights_to_hf.py deleted file mode 100644 index 265e543cb557..000000000000 --- a/src/transformers/models/llava_next_video/convert_llava_next_video_weights_to_hf.py +++ /dev/null @@ -1,274 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Convert LLaVa-NeXT-Video checkpoints from the original repository. - -URL: https://github.com/LLaVA-VL/LLaVA-NeXT/tree/inference -""" - -import argparse -import glob -import json -from pathlib import Path - -import torch -from accelerate import init_empty_weights -from huggingface_hub import hf_hub_download, snapshot_download -from safetensors import safe_open - -from transformers import ( - AddedToken, - AutoConfig, - AutoTokenizer, - LlavaNextImageProcessor, - LlavaNextVideoConfig, - LlavaNextVideoForConditionalGeneration, - LlavaNextVideoImageProcessor, - LlavaNextVideoProcessor, -) - - -KEYS_TO_MODIFY_MAPPING = { - "model.vision_tower.": "", - ".vision_resampler": "", # all lmms-lab models do avg pooling, so no vision_resampler - "model.mm_projector": "multi_modal_projector", - "model": "model.model", - "vision_model.model": "vision_model", - "lm_head": "language_model.lm_head", - "model.model": "language_model.model", - "multi_modal_projector.0": "multi_modal_projector.linear_1", - "multi_modal_projector.2": "multi_modal_projector.linear_2", - "language_model.model.image_newline": "image_newline", -} - -# {{SYSTEM_PROMPT}} USER: \n{{PROMPT}} ASSISTANT:" assistant end with " " -chat_vicuna = ( - "{% for message in messages %}" - "{% if message['role'] == 'system' %}" - "{{ message['content'][0]['text'] }}" - "{% else %}" - "{{ message['role'].upper() + ': '}}" - "{% endif %}" - "{# Render all images first #}" - "{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}" - "{{ ' \n' }}" - "{% endfor %}" - "{# Render all text next #}" - "{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}" - "{{ content['text'] + ' '}}" - "{% endfor %}" - "{% endfor %}" - "{% if add_generation_prompt %}" - "{{ 'ASSISTANT:' }}" - "{% endif %}" -) - -# "[INST] \nWhat is shown in this image? 
[/INST]" assistant end with " " -chat_mistral = ( - "{% for message in messages %}" - "{% if message['role'] == 'user' %}" - "{{ '[INST] ' }}" - "{# Render all images first #}" - "{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}" - "{{ ' \n' }}" - "{% endfor %}" - "{# Render all text next #}" - "{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}" - "{{ content['text'] }}" - "{% endfor %}" - "{{' [/INST]' }}" - "{% elif message['role'] == 'assistant' %}" - r"{{ ' ' + message['content'][0]['text'] + '<\s> '}}" - "{% else %}" - "{{ raise_exception('Only user and assistant roles are supported!') }}" - "{% endif %}" - "{% endfor %}" -) - -# "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n \nWhat is shown in this image?<|im_end|><|im_start|>assistant\n" -chat_yi = ( - "{% for message in messages %}" - "{{'<|im_start|>' + message['role'] + '\n'}}" - "{# Render all images first #}" - "{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}" - "{{ ' \n' }}" - "{% endfor %}" - "{# Render all text next #}" - "{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}" - "{{ content['text'] }}" - "{% endfor %}" - "{{'<|im_end|>' + '\n'}}" - "{% endfor %}" - "{% if add_generation_prompt %}" - "{{ '<|im_start|>assistant\n' }}" - "{% endif %}" -) - -model2template = { - "lmms-lab/LLaVA-NeXT-Video-7B-32K": chat_mistral, - "lmms-lab/LLaVA-NeXT-Video-7B": chat_vicuna, - "lmms-lab/LLaVA-NeXT-Video-7B-DPO": chat_vicuna, - "lmms-lab/LLaVA-NeXT-Video-34B": chat_yi, - "lmms-lab/LLaVA-NeXT-Video-34B-DPO": chat_yi, -} - - -def load_original_state_dict(model_id): - directory_path = snapshot_download(repo_id=model_id, allow_patterns=["*.safetensors"]) - - original_state_dict = {} - for path in glob.glob(f"{directory_path}/*"): - if path.endswith(".safetensors"): - with safe_open(path, framework="pt", device="cpu") as f: - for key in f.keys(): - original_state_dict[key] = f.get_tensor(key) - - return original_state_dict - - -def convert_state_dict_to_hf(state_dict): - new_state_dict = {} - for key, value in state_dict.items(): - if key.endswith(".inv_freq"): - continue - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - new_state_dict[key] = value.to(torch.bfloat16) - return new_state_dict - - -def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): - # load original config - filepath = hf_hub_download(repo_id=model_id, filename="config.json", repo_type="model") - with open(filepath) as f: - data = json.load(f) - print(data) - - if model_id == "lmms-lab/LLaVA-NeXT-Video-7B-32K": - text_model_id = "mistralai/Mistral-7B-Instruct-v0.2" - video_token_id = 32000 - image_token_id = 32001 - overwrite_text_config = {} - elif model_id in ["lmms-lab/LLaVA-NeXT-Video-7B", "lmms-lab/LLaVA-NeXT-Video-7B-DPO"]: - text_model_id = "lmsys/vicuna-7b-v1.5" - video_token_id = 32000 - image_token_id = 32001 - overwrite_text_config = {"factor": 2.0, "type": "linear"} - elif model_id in ["lmms-lab/LLaVA-NeXT-Video-34B", "lmms-lab/LLaVA-NeXT-Video-34B-DPO"]: - text_model_id = "NousResearch/Nous-Hermes-2-Yi-34B" - video_token_id = 64000 - image_token_id = 64001 - overwrite_text_config = {} - else: - raise ValueError("Incorrect checkpoint referenced. 
Text model-id not identified!") - - vision_model_id = data["mm_vision_tower"] - - torch.set_default_dtype(torch.bfloat16) - text_config = AutoConfig.from_pretrained(text_model_id) - text_config = text_config.to_dict() - text_config.update(overwrite_text_config) - - tokenizer = AutoTokenizer.from_pretrained(text_model_id, use_fast=True, padding_side="left") - tokenizer.add_tokens(AddedToken(" ", special=True, normalized=False), special_tokens=True) - tokenizer.add_tokens(AddedToken(" ", special=True, normalized=False), special_tokens=True) - - image_processor = LlavaNextImageProcessor.from_pretrained(vision_model_id) - video_processor = LlavaNextVideoImageProcessor.from_pretrained(vision_model_id) - processor = LlavaNextVideoProcessor( - tokenizer=tokenizer, - video_processor=video_processor, - image_processor=image_processor, - chat_template=model2template[model_id], - ) - - config = LlavaNextVideoConfig( - text_config=text_config, - image_grid_pinpoints=image_processor.image_grid_pinpoints, - use_image_newline_parameter=True, - video_token_id=video_token_id, - image_token_id=image_token_id, - ) - - with init_empty_weights(): - model = LlavaNextVideoForConditionalGeneration(config) - - # load original state dict - state_dict = load_original_state_dict(model_id) - state_dict = convert_state_dict_to_hf(state_dict) - model.load_state_dict(state_dict, assign=True, strict=True) - - # See https://nlp.stanford.edu/~johnhew/vocab-expansion.html for why we get mean/stdev this way to expand embeddings - pre_expansion_embeddings = model.language_model.model.embed_tokens.weight.data - mu = torch.mean(pre_expansion_embeddings, dim=0).float() - n = pre_expansion_embeddings.size()[0] - sigma = ((pre_expansion_embeddings - mu).T @ (pre_expansion_embeddings - mu)) / n - dist = torch.distributions.multivariate_normal.MultivariateNormal(mu, covariance_matrix=1e-5 * sigma) - - # We add an image token so we resize the model - # Pad to 64 for performance reasons - pad_shape = 64 - vocab_size = config.text_config.vocab_size - - # this one has 2 additional tokens, namely , and - num_tokens = vocab_size + 3 - model.resize_token_embeddings(num_tokens, pad_to_multiple_of=pad_shape) - model.language_model.model.embed_tokens.weight.data[vocab_size:] = torch.stack( - tuple(dist.sample() for _ in range(model.language_model.model.embed_tokens.weight.data[vocab_size:].shape[0])), - dim=0, - ) - model.language_model.lm_head.weight.data[vocab_size:] = torch.stack( - tuple(dist.sample() for _ in range(model.language_model.lm_head.weight.data[vocab_size:].shape[0])), - dim=0, - ) - - if pytorch_dump_folder_path is not None: - print(f"Saving model and processor for {model_id} to {pytorch_dump_folder_path}") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - repo_id = model_id.split("/")[-1] - print(f"Pushing model to hub repo: {repo_id}") - model.push_to_hub(f"llava-hf/{repo_id}-hf") - processor.push_to_hub(f"llava-hf/{repo_id}-hf") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_id", - help="Hub location of the model to convert", - default="lmms-lab/LLaVA-NeXT-Video-7B", - choices=[ - "lmms-lab/LLaVA-NeXT-Video-7B", - "lmms-lab/LLaVA-NeXT-Video-7B-DPO", - "lmms-lab/LLaVA-NeXT-Video-7B-32K", - "lmms-lab/LLaVA-NeXT-Video-34B", - "lmms-lab/LLaVA-NeXT-Video-34B-DPO", - ], - required=False, - ) - parser.add_argument( - "--pytorch_dump_folder_path", 
default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." - ) - args = parser.parse_args() - - convert_llava_to_hf(args.model_id, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 3ef172962c2c..9e3b15cea548 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -461,8 +461,6 @@ def get_image_features( if vision_feature_select_strategy == "default": selected_image_feature = selected_image_feature[:, 1:] - elif vision_feature_select_strategy == "full": - selected_image_feature = selected_image_feature image_features = self.multi_modal_projector(selected_image_feature) image_features = torch.split(image_features, image_num_patches, dim=0) @@ -659,8 +657,6 @@ def get_video_features( if vision_feature_select_strategy == "default": selected_video_features = selected_video_features[:, 1:] - elif vision_feature_select_strategy == "full": - selected_video_features = selected_video_features # Same as image features except that video has pooling layer video_features = self.vision_resampler(selected_video_features) diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index 73745f435b7d..7eda08ffa0bd 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -327,8 +327,6 @@ def get_image_features( if vision_feature_select_strategy == "default": selected_image_feature = selected_image_feature[:, 1:] - elif vision_feature_select_strategy == "full": - selected_image_feature = selected_image_feature image_features = self.multi_modal_projector(selected_image_feature) image_features = torch.split(image_features, image_num_patches, dim=0) @@ -386,8 +384,6 @@ def get_video_features( if vision_feature_select_strategy == "default": selected_video_features = selected_video_features[:, 1:] - elif vision_feature_select_strategy == "full": - selected_video_features = selected_video_features # Same as image features except that video has pooling layer video_features = self.vision_resampler(selected_video_features) diff --git a/src/transformers/models/llava_onevision/convert_llava_onevision_weights_to_hf.py b/src/transformers/models/llava_onevision/convert_llava_onevision_weights_to_hf.py deleted file mode 100644 index dea84924d9b7..000000000000 --- a/src/transformers/models/llava_onevision/convert_llava_onevision_weights_to_hf.py +++ /dev/null @@ -1,386 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
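The conversion scripts deleted in this patch (the llava_next one above, the llava_next_video one here, and the llava_onevision one whose deletion starts just above) all grow the token embeddings for newly added special tokens by sampling rows from a multivariate normal fitted to the pre-expansion embeddings, following https://nlp.stanford.edu/~johnhew/vocab-expansion.html (cited in the llava_next_video script). The modeling/modular hunks above also drop the `elif vision_feature_select_strategy == "full"` branches, which only reassigned a variable to itself, so behavior is unchanged. Below is a minimal, hedged sketch of the embedding-expansion trick, not part of the patch itself: `model` stands for any Hugging Face causal LM and `num_new_tokens` is a hypothetical count.

import torch

def expand_embeddings_for_new_tokens(model, num_new_tokens, pad_to_multiple_of=64):
    # Fit a multivariate normal to the pre-expansion input embeddings.
    embeddings = model.get_input_embeddings().weight.data
    mu = embeddings.float().mean(dim=0)
    centered = embeddings.float() - mu
    sigma = centered.T @ centered / embeddings.shape[0]
    dist = torch.distributions.multivariate_normal.MultivariateNormal(mu, covariance_matrix=1e-5 * sigma)

    # Resize (padding the vocab size for performance) and fill every new row with a sample.
    old_vocab_size = embeddings.shape[0]
    model.resize_token_embeddings(old_vocab_size + num_new_tokens, pad_to_multiple_of=pad_to_multiple_of)
    input_embeddings = model.get_input_embeddings().weight.data
    output_embeddings = model.get_output_embeddings().weight.data
    num_new_rows = input_embeddings.shape[0] - old_vocab_size
    input_embeddings[old_vocab_size:] = torch.stack([dist.sample() for _ in range(num_new_rows)], dim=0)
    output_embeddings[old_vocab_size:] = torch.stack([dist.sample() for _ in range(num_new_rows)], dim=0)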
- -"""Convert LLaVa-Onevision checkpoints from the original repository. - -URL: https://github.com/LLaVA-VL/LLaVA-NeXT/tree/main - -""" - -import argparse -import gc -import glob -import json -from pathlib import Path - -import requests -import torch -from accelerate import init_empty_weights -from huggingface_hub import hf_hub_download, snapshot_download -from PIL import Image -from safetensors import safe_open - -from transformers import ( - AddedToken, - AutoConfig, - AutoTokenizer, - LlavaOnevisionConfig, - LlavaOnevisionForConditionalGeneration, - LlavaOnevisionImageProcessor, - LlavaOnevisionProcessor, - LlavaOnevisionVideoProcessor, - SiglipVisionConfig, -) - - -KEYS_TO_MODIFY_MAPPING = { - "model.vision_tower.": "", - "model.mm_projector": "multi_modal_projector", - "model": "model.model", - "vision_model.model": "vision_model", - "lm_head": "language_model.lm_head", - "model.model": "language_model.model", - "multi_modal_projector.0": "multi_modal_projector.linear_1", - "multi_modal_projector.2": "multi_modal_projector.linear_2", - "language_model.model.image_newline": "image_newline", -} - -chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n'}}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ ' \n' }}{% endfor %}{# Render all video then #}{% for content in message['content'] | selectattr('type', 'equalto', 'video') %}{{ ' \n' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] }}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] }}{% endgeneration %}{% endfor %}{% endif %}{{'<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" - - -def load_original_state_dict(model_id): - directory_path = snapshot_download(repo_id=model_id, allow_patterns=["*.safetensors"]) - - original_state_dict = {} - for path in glob.glob(f"{directory_path}/*"): - if path.endswith(".safetensors"): - with safe_open(path, framework="pt", device="cpu") as f: - for key in f.keys(): - original_state_dict[key] = f.get_tensor(key) - - # tied weights so lm.head is not saved. 
Let's clone to load state dict - if "lm_head.weight" not in original_state_dict: - original_state_dict["lm_head.weight"] = original_state_dict["model.embed_tokens.weight"].clone() - - return original_state_dict - - -def convert_state_dict_to_hf(state_dict): - new_state_dict = {} - for key, value in state_dict.items(): - if key.endswith(".inv_freq"): - continue - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - new_state_dict[key] = value.to(torch.float16) - return new_state_dict - - -def load_image(): - url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true" - image = Image.open(requests.get(url, stream=True).raw) - return image - - -def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): - # load original config - filepath = hf_hub_download(repo_id=model_id, filename="config.json", repo_type="model") - # read json - with open(filepath) as f: - data = json.load(f) - print(data) - - if model_id in ["lmms-lab/llava-onevision-qwen2-0.5b-ov", "lmms-lab/llava-onevision-qwen2-0.5b-si"]: - text_model_id = "Qwen/Qwen2-0.5B-Instruct" - elif model_id in [ - "lmms-lab/llava-onevision-qwen2-7b-ov", - "lmms-lab/llava-onevision-qwen2-7b-si", - "lmms-lab/llava-onevision-qwen2-7b-ov-chat", - ]: - text_model_id = "Qwen/Qwen2-7B-Instruct" - elif model_id in [ - "lmms-lab/llava-onevision-qwen2-72b-ov", - "lmms-lab/llava-onevision-qwen2-72b-si", - "lmms-lab/llava-onevision-qwen2-72b-ov-chat", - ]: - text_model_id = "Qwen/Qwen2-72B-Instruct" - - vision_model_id = data["mm_vision_tower"] - torch.set_default_dtype(torch.float16) - text_config = AutoConfig.from_pretrained(text_model_id) - - tokenizer = AutoTokenizer.from_pretrained(text_model_id, use_fast=True) - tokenizer.add_tokens(AddedToken(" ", special=True, normalized=False), special_tokens=True) - tokenizer.add_tokens(AddedToken(" ", special=True, normalized=False), special_tokens=True) - - image_processor = LlavaOnevisionImageProcessor.from_pretrained(vision_model_id) - video_processor = LlavaOnevisionVideoProcessor.from_pretrained(vision_model_id) - processor = LlavaOnevisionProcessor( - tokenizer=tokenizer, - video_processor=video_processor, - image_processor=image_processor, - num_image_tokens=729, - vision_feature_select_strategy="full", - chat_template=chat_template, - ) - - vision_config = SiglipVisionConfig( - hidden_size=1152, - image_size=384, - intermediate_size=4304, - num_attention_heads=16, - num_hidden_layers=26, # drop the last layer - patch_size=14, - vision_use_head=False, # no head - ).to_dict() - - config = LlavaOnevisionConfig( - text_config=text_config.to_dict(), - vision_config=vision_config, - use_image_newline_parameter=True, - ) - - with init_empty_weights(): - model = LlavaOnevisionForConditionalGeneration(config) - - # load original state dict - state_dict = load_original_state_dict(model_id) - state_dict = convert_state_dict_to_hf(state_dict) - model.load_state_dict(state_dict, assign=True) - model.eval() - - pre_expansion_embeddings = model.language_model.model.embed_tokens.weight.data - mu = torch.mean(pre_expansion_embeddings, dim=0).float() - n = pre_expansion_embeddings.size()[0] - sigma = ((pre_expansion_embeddings - mu).T @ (pre_expansion_embeddings - mu)) / n - dist = torch.distributions.multivariate_normal.MultivariateNormal(mu, covariance_matrix=1e-5 * sigma) - - # We add an image token so we resize the model - # Pad to 64 for 
performance reasons - # Qwen-based models have extra unused space in the vocab size already, so no need to resize - pad_shape = 64 - vocab_size = config.text_config.vocab_size - num_tokens = vocab_size + 2 - model.resize_token_embeddings(num_tokens, pad_to_multiple_of=pad_shape) - model.language_model.model.embed_tokens.weight.data[vocab_size:] = torch.stack( - tuple(dist.sample() for _ in range(model.language_model.model.embed_tokens.weight.data[vocab_size:].shape[0])), - dim=0, - ) - model.language_model.lm_head.weight.data[vocab_size:] = torch.stack( - tuple(dist.sample() for _ in range(model.language_model.lm_head.weight.data[vocab_size:].shape[0])), - dim=0, - ) - - print(f"Saving model and processor for {model_id} to {pytorch_dump_folder_path}") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - # Make space so we can load the model properly now. - del state_dict - gc.collect() - - # Load everything back for inference tests in float32 because prev script was written as that - # Though it's mostly loaded in fp16 as original weights are in fp16 - model = LlavaOnevisionForConditionalGeneration.from_pretrained( - pytorch_dump_folder_path, dtype="float16", device_map="auto" - ) - processor = LlavaOnevisionProcessor.from_pretrained(pytorch_dump_folder_path) - device = model.device - - # prepare inputs - image = load_image() - prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n \nWhat is shown in this image?<|im_end|>\n<|im_start|>assistant\n" - inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch.float16) - - # verify inputs - filepath = hf_hub_download( - repo_id="RaushanTurganbay/test-image", filename="llava_onevision_pixel_values.pt", repo_type="dataset" - ) - original_pixel_values = torch.load(filepath, map_location="cpu", weights_only=True) - assert torch.allclose(original_pixel_values, inputs.pixel_values.half()) - - image_sizes = torch.tensor([[899, 1024]]) - assert image_sizes[0].tolist() == inputs.image_sizes[0].tolist() - - # verify single forward pass - print("Single forward pass") - with torch.inference_mode(): - inputs = inputs.to(device) - outputs = model(**inputs) - print("Shape of logits:", outputs.logits.shape) - print("First values of logits:", outputs.logits[0, :3, :3]) - - if model_id == "lmms-lab/llava-onevision-qwen2-0.5b-si": - # Not yet checked against reference - expected_slice = torch.tensor( - [[-12.1953, -14.6797, -12.7891], [0.5840, -0.8467, 1.3799], [3.6055, 4.5430, 9.9062]], - dtype=torch.float32, - device=device, - ) - elif model_id == "lmms-lab/llava-onevision-qwen2-0.5b-ov": - # Not yet checked against reference - expected_slice = torch.tensor( - [[-12.0234, -14.3828, -12.7500], [2.3594, 1.0000, 3.9336], [3.6582, 4.7148, 9.1172]], - dtype=torch.float32, - device=device, - ) - elif model_id == "lmms-lab/llava-onevision-qwen2-7b-si": - # Not yet checked against reference - expected_slice = torch.tensor( - [[1.7656, 3.3418, 1.4033], [0.0757, 0.7427, 3.5098], [6.7109, 5.6797, 9.3828]], - dtype=torch.float32, - device=device, - ) - elif model_id == "lmms-lab/llava-onevision-qwen2-7b-ov": - # Not yet checked against reference - expected_slice = torch.tensor( - [[1.8496, 3.4219, 1.3135], [3.0996, 3.0117, 3.1484], [4.2422, 4.7109, 9.9688]], - dtype=torch.float32, - device=device, - ) - elif model_id == "lmms-lab/llava-onevision-qwen2-72b-si": - # Not yet checked against reference - expected_slice 
= torch.tensor( - [[4.1875, 4.4883, 2.7910], [1.2949, 5.1328, 3.1582], [0.9390, 6.4531, 8.4375]], - dtype=torch.float32, - device=device, - ) - elif model_id == "lmms-lab/llava-onevision-qwen2-72b-ov": - # Not yet checked against reference - expected_slice = torch.tensor( - [[4.2930, 4.7305, 2.7363], [1.7529, 5.0742, 3.9590], [1.3936, 6.3438, 9.3984]], - dtype=torch.float32, - device=device, - ) - elif model_id == "lmms-lab/llava-onevision-qwen2-7b-ov-chat": - # Not yet checked against reference - expected_slice = torch.tensor( - [[1.8662, 3.4316, 1.3174], [2.7109, 2.5488, 3.0117], [4.4648, 4.9648, 10.3359]], - dtype=torch.float32, - device=device, - ) - elif model_id == "lmms-lab/llava-onevision-qwen2-72b-ov-chat": - # Not yet checked against reference - expected_slice = torch.tensor( - [[4.3086, 4.7344, 2.6953], [1.7090, 5.1719, 4.0234], [1.3057, 6.3438, 9.5469]], - dtype=torch.float32, - device=device, - ) - else: - raise ValueError(f"Model {model_id} not supported") - - assert torch.allclose(outputs.logits[0, :3, :3], expected_slice, atol=1e-4) - print("Logits are ok!") - - # verify generation - output_ids = model.generate( - **inputs, - max_new_tokens=100, - use_cache=True, - ) - - generated_text = processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip() - - print("Generated text:", repr(generated_text)) - - if model_id == "lmms-lab/llava-onevision-qwen2-0.5b-si": - expected_text = "system\nYou are a helpful assistant.\nuser\n\nWhat is shown in this image?\nassistant\nThe image is a radar chart that shows the performance of different algorithms or models in a specific domain, such as image classification or natural language processing. The chart is color-coded to represent different algorithms, with each color corresponding to a specific algorithm. The algorithms are labeled as BLIP-2, InstructBLIP, Owen-VL-Chat, and LLaVA-1.5. The chart also includes a legend at the bottom that explains the color coding and the algorithms represented." - elif model_id == "lmms-lab/llava-onevision-qwen2-0.5b-ov": - expected_text = "system\nYou are a helpful assistant.\nuser\n\nWhat is shown in this image?\nassistant\nThe image is a radar chart that compares the performance of different models in a specific task, likely related to natural language processing or machine learning. The chart is divided into different categories, each represented by a different color and labeled with the name of the model or technique used. The models are evaluated based on their performance metrics, such as BLEU-2, InstructBLIP, Qwen-VL-Chat, and LLaVA-1.5. The radar chart helps to visualize the relative" - elif model_id == "lmms-lab/llava-onevision-qwen2-7b-si": - expected_text = "system\nYou are a helpful assistant.\nuser\n\nWhat is shown in this image?\nassistant\nThis image is a radar chart that compares the performance of different models on various metrics. The models being compared are BLIP-2, InstructBLIP, and Qwen-VL-Chat. The metrics being compared are VQA, QA, GQA, VQA-av2, and VQA-av2. The chart shows that BLIP-2 performs the best on all metrics, followed by InstructBLIP and Qwen-VL-Chat." - elif model_id == "lmms-lab/llava-onevision-qwen2-7b-ov": - expected_text = "system\nYou are a helpful assistant.\nuser\n\nWhat is shown in this image?\nassistant\nThe image shows a radar chart, also known as a spider chart or a star chart, which is used to compare multiple quantitative variables. 
Each axis represents a different variable, and the chart is filled with data points that represent the performance or values of different entities across these variables.\n\nIn this particular radar chart, the variables are represented on the axes, and the performance of different models or systems is shown by the lines connecting the data points. The models or systems are labeled along the bottom of the chart," - elif model_id == "lmms-lab/llava-onevision-qwen2-72b-si": - expected_text = "system\nYou are a helpful assistant.\nuser\n\nWhat is shown in this image?\nassistant\nThe image shows a radar chart, which is a graphical method of displaying multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point. The chart is used to compare the performance of different models or systems across various benchmarks or metrics.\n\nIn this specific radar chart, there are multiple axes, each representing a different benchmark or metric, such as VQA2, GQA, TextVQA, and others. The chart includes several colored lines" - elif model_id == "lmms-lab/llava-onevision-qwen2-72b-ov": - expected_text = "system\nYou are a helpful assistant.\nuser\n\nWhat is shown in this image?\nassistant\nThe image is a radar chart comparing the performance of different models on various multimodal benchmarks. The models compared are BLIP-2, InstructBLIP, POPE, QWen-VL-Chat, and LLava-1.5. The benchmarks include VQAv2, GQA, TextVQA, SQA-IMG, VizWiz, MM-IMDb, MM-VQA, MM-IMDb-CN, MM-IMDb-EN, MM-" - elif model_id == "lmms-lab/llava-onevision-qwen2-7b-ov-chat": - expected_text = "system\nYou are a helpful assistant.\nuser\n\nWhat is shown in this image?\nassistant\nThe image shows a radar chart, also known as a spider chart or a star chart, which is used to display multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point. Each axis represents a different variable, and the values are plotted along these axes.\n\nIn this particular radar chart, there are multiple lines representing different models or systems, each distinguished by a different color and labeled with a name such as BLIP-2, In" - elif model_id == "lmms-lab/llava-onevision-qwen2-72b-ov-chat": - expected_text = "system\nYou are a helpful assistant.\nuser\n\nWhat is shown in this image?\nassistant\nThe image is a radar chart comparing the performance of different models on various multimodal benchmarks. The models compared are BLIP-2, InstructBLIP, POPE, QWen-VL-Chat, and LLava-1.5. 
The benchmarks include VQAv2, GQA, TextVQA, SQA-IMG, VizWiz, MM-IMDb, MM-VQA, MM-IMDb-CN, MM-IMDb-EN, MM-" - else: - raise ValueError(f"Model {model_id} not supported") - - assert generated_text == expected_text - print("Generated text is ok!") - - # verify batched generation - print("Batched generation...") - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - cats_image = Image.open(requests.get(url, stream=True).raw) - - inputs = processor( - images=[image, cats_image], - text=[prompt, prompt], - padding=True, - return_tensors="pt", - ).to(device, torch.float16) - - for k, v in inputs.items(): - print(k, v.shape) - - print("Image sizes:", inputs.image_sizes) - - # make sure image_sizes are the same - # as otherwise batched generation doesn't work - inputs.image_sizes[1] = inputs.image_sizes[0] - - print("Batched generation...") - output_ids = model.generate( - **inputs, - max_new_tokens=20, - use_cache=True, - ) - - outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True) - print(outputs) - - if push_to_hub: - checkpoint_name = model_id.split("/")[-1] - print(f"Pushing to repo llava-hf/{checkpoint_name}-hf") - model.push_to_hub(f"llava-hf/{checkpoint_name}-hf") - processor.push_to_hub(f"llava-hf/{checkpoint_name}-hf") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_id", - help="Hub location of the model to convert", - default="lmms-lab/llava-onevision-qwen2-0.5b-ov", - choices=[ - "lmms-lab/llava-onevision-qwen2-0.5b-ov", - "lmms-lab/llava-onevision-qwen2-0.5b-si", - "lmms-lab/llava-onevision-qwen2-7b-si", - "lmms-lab/llava-onevision-qwen2-7b-ov", - "lmms-lab/llava-onevision-qwen2-72b-si", - "lmms-lab/llava-onevision-qwen2-72b-ov", - "lmms-lab/llava-onevision-qwen2-7b-ov-chat", - "lmms-lab/llava-onevision-qwen2-72b-ov-chat", - ], - required=False, - ) - parser.add_argument( - "--pytorch_dump_folder_path", type=str, required=True, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." - ) - args = parser.parse_args() - - convert_llava_to_hf(args.model_id, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/llava_onevision/image_processing_llava_onevision.py b/src/transformers/models/llava_onevision/image_processing_llava_onevision.py index 837eda460802..836a1984a522 100644 --- a/src/transformers/models/llava_onevision/image_processing_llava_onevision.py +++ b/src/transformers/models/llava_onevision/image_processing_llava_onevision.py @@ -58,12 +58,12 @@ # Copied from transformers.models.llava_next.image_processing_llava_next.divide_to_patches -def divide_to_patches(image: np.ndarray, patch_size: int, input_data_format) -> list[np.array]: +def divide_to_patches(image: np.ndarray, patch_size: int, input_data_format) -> list[np.ndarray]: """ Divides an image into patches of a specified size. Args: - image (`np.array`): + image (`np.ndarray`): The input image. patch_size (`int`): The size of each patch. @@ -71,7 +71,7 @@ def divide_to_patches(image: np.ndarray, patch_size: int, input_data_format) -> The channel dimension format of the input image. Returns: - list: A list of np.array representing the patches. + list: A list of np.ndarray representing the patches. 
""" patches = [] height, width = get_image_size(image, channel_dim=input_data_format) @@ -87,7 +87,7 @@ def divide_to_patches(image: np.ndarray, patch_size: int, input_data_format) -> # Copied from transformers.models.llava_next.image_processing_llava_next.expand_to_square -def expand_to_square(image: np.ndarray, background_color, input_data_format) -> np.array: +def expand_to_square(image: np.ndarray, background_color, input_data_format) -> np.ndarray: """ Expands an image to a square by adding a background color. """ @@ -292,12 +292,12 @@ def pad( # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor._resize_for_patching def _resize_for_patching( self, image: np.ndarray, target_resolution: tuple, resample, input_data_format: ChannelDimension - ) -> np.array: + ) -> np.ndarray: """ Resizes an image to a target resolution while maintaining aspect ratio. Args: - image (np.array): + image (np.ndarray): The input image. target_resolution (tuple): The target resolution (height, width) of the image. @@ -307,7 +307,7 @@ def _resize_for_patching( The channel dimension format of the input image. Returns: - np.array: The resized and padded image. + np.ndarray: The resized and padded image. """ new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format) @@ -327,7 +327,7 @@ def _get_padding_size(self, original_resolution: tuple, target_resolution: tuple # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor._pad_for_patching def _pad_for_patching( self, image: np.ndarray, target_resolution: tuple, input_data_format: ChannelDimension - ) -> np.array: + ) -> np.ndarray: """ Pad an image to a target resolution while maintaining aspect ratio. """ @@ -348,12 +348,12 @@ def get_image_patches( resample: PILImageResampling, data_format: ChannelDimension, input_data_format: ChannelDimension, - ) -> list[np.array]: + ) -> list[np.ndarray]: """ Process an image with variable resolutions by dividing it into patches. Args: - image (np.array): + image (np.ndarray): The input image to be processed. grid_pinpoints (List): A string representation of a list of possible resolutions. @@ -369,7 +369,7 @@ def get_image_patches( The channel dimension format of the input image. Returns: - list[np.array]: A list of NumPy arrays containing the processed image patches. + list[np.ndarray]: A list of NumPy arrays containing the processed image patches. """ if not isinstance(grid_pinpoints, list): raise TypeError("grid_pinpoints must be a list of possible resolutions.") @@ -450,7 +450,7 @@ def pad_to_square( background_color: Union[int, tuple[int, int, int]] = 0, data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> np.array: + ) -> np.ndarray: """ Pads an image to a square based on the longest edge. 
diff --git a/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py b/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py index 4392d64e9ebf..11872cb67bf3 100644 --- a/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +++ b/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py @@ -22,6 +22,7 @@ from typing import Optional, Union import torch +from torchvision.transforms.v2 import functional as F from ...image_processing_utils import BatchFeature, get_patch_output_size, select_best_resolution from ...image_processing_utils_fast import ( @@ -41,13 +42,7 @@ get_image_size, ) from ...processing_utils import Unpack -from ...utils import TensorType, auto_docstring, is_torchvision_v2_available - - -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F +from ...utils import TensorType, auto_docstring class LlavaOnevisionFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index eae6e3046f94..727655374574 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -432,8 +432,6 @@ def get_image_features( if vision_feature_select_strategy == "default": selected_image_feature = selected_image_feature[:, 1:] - elif vision_feature_select_strategy == "full": - selected_image_feature = selected_image_feature image_features = self.multi_modal_projector(selected_image_feature) image_features = torch.split(image_features, image_num_patches, dim=0) @@ -633,8 +631,6 @@ def get_video_features( if vision_feature_select_strategy == "default": selected_video_feature = selected_video_feature[:, 1:] - elif vision_feature_select_strategy == "full": - selected_video_feature = selected_video_feature video_features = self.multi_modal_projector(selected_video_feature) video_features = self.apply_pooling(video_features) diff --git a/src/transformers/models/llava_onevision/modular_llava_onevision.py b/src/transformers/models/llava_onevision/modular_llava_onevision.py index 21688e7763bf..b4f64dee8e04 100644 --- a/src/transformers/models/llava_onevision/modular_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modular_llava_onevision.py @@ -18,6 +18,7 @@ import torch from torch import nn +from torchvision.transforms.v2 import functional as F from transformers.models.llava_next.image_processing_llava_next_fast import LlavaNextImageProcessorFast from transformers.models.llava_next_video.modeling_llava_next_video import ( @@ -50,16 +51,10 @@ TensorType, auto_docstring, can_return_tuple, - is_torchvision_v2_available, logging, ) -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - logger = logging.get_logger(__name__) @@ -409,8 +404,6 @@ def get_image_features( if vision_feature_select_strategy == "default": selected_image_feature = selected_image_feature[:, 1:] - elif vision_feature_select_strategy == "full": - selected_image_feature = selected_image_feature image_features = self.multi_modal_projector(selected_image_feature) image_features = torch.split(image_features, image_num_patches, dim=0) @@ -459,8 +452,6 @@ def get_video_features( if 
vision_feature_select_strategy == "default": selected_video_feature = selected_video_feature[:, 1:] - elif vision_feature_select_strategy == "full": - selected_video_feature = selected_video_feature video_features = self.multi_modal_projector(selected_video_feature) video_features = self.apply_pooling(video_features) diff --git a/src/transformers/models/longcat_flash/modeling_longcat_flash.py b/src/transformers/models/longcat_flash/modeling_longcat_flash.py index 87e812852b37..4681cfb60e53 100644 --- a/src/transformers/models/longcat_flash/modeling_longcat_flash.py +++ b/src/transformers/models/longcat_flash/modeling_longcat_flash.py @@ -534,7 +534,7 @@ def __init__(self, config): self.rotary_emb = LongcatFlashRotaryEmbedding(config=config) self.gradient_checkpointing = False # Each layer above has 2 sublayers, config hack to have a correct cache (to avoid a checkpoint change) - self.head_dim = config.head_dim # For CI happiness (we didn't convert so head_dim is not directly used) # noqa + self.head_dim = config.head_dim # For CI happiness (we didn't convert so head_dim is not directly used) self.config.num_hidden_layers = 2 * config.num_layers diff --git a/src/transformers/models/longcat_flash/modular_longcat_flash.py b/src/transformers/models/longcat_flash/modular_longcat_flash.py index f58ca870aefc..60c93239d2c4 100644 --- a/src/transformers/models/longcat_flash/modular_longcat_flash.py +++ b/src/transformers/models/longcat_flash/modular_longcat_flash.py @@ -300,7 +300,7 @@ def __init__(self, config): [LongcatFlashDecoderLayer(config, layer_idx) for layer_idx in range(config.num_layers)] ) # Each layer above has 2 sublayers, config hack to have a correct cache (to avoid a checkpoint change) - self.head_dim = config.head_dim # For CI happiness (we didn't convert so head_dim is not directly used) # noqa + self.head_dim = config.head_dim # For CI happiness (we didn't convert so head_dim is not directly used) self.config.num_hidden_layers = 2 * config.num_layers self.norm = LongcatFlashRMSNorm(config.hidden_size, eps=config.rms_norm_eps) diff --git a/src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py b/src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py deleted file mode 100644 index cbd7600e9639..000000000000 --- a/src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py +++ /dev/null @@ -1,85 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Convert RoBERTa checkpoint.""" - -import argparse - -import pytorch_lightning as pl -import torch -from torch import nn - -from transformers import LongformerForQuestionAnswering, LongformerModel - - -class LightningModel(pl.LightningModule): - def __init__(self, model): - super().__init__() - self.model = model - self.num_labels = 2 - self.qa_outputs = nn.Linear(self.model.config.hidden_size, self.num_labels) - - # implement only because lightning requires to do so - def forward(self): - pass - - -def convert_longformer_qa_checkpoint_to_pytorch( - longformer_model: str, longformer_question_answering_ckpt_path: str, pytorch_dump_folder_path: str -): - # load longformer model from model identifier - longformer = LongformerModel.from_pretrained(longformer_model) - lightning_model = LightningModel(longformer) - - ckpt = torch.load(longformer_question_answering_ckpt_path, map_location=torch.device("cpu"), weights_only=True) - lightning_model.load_state_dict(ckpt["state_dict"]) - - # init longformer question answering model - longformer_for_qa = LongformerForQuestionAnswering.from_pretrained(longformer_model) - - # transfer weights - longformer_for_qa.longformer.load_state_dict(lightning_model.model.state_dict()) - longformer_for_qa.qa_outputs.load_state_dict(lightning_model.qa_outputs.state_dict()) - longformer_for_qa.eval() - - # save model - longformer_for_qa.save_pretrained(pytorch_dump_folder_path) - - print(f"Conversion successful. Model saved under {pytorch_dump_folder_path}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--longformer_model", - default=None, - type=str, - required=True, - help="model identifier of longformer. Should be either `longformer-base-4096` or `longformer-large-4096`.", - ) - parser.add_argument( - "--longformer_question_answering_ckpt_path", - default=None, - type=str, - required=True, - help="Path the official PyTorch Lightning Checkpoint.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_longformer_qa_checkpoint_to_pytorch( - args.longformer_model, args.longformer_question_answering_ckpt_path, args.pytorch_dump_folder_path - ) diff --git a/src/transformers/models/longt5/convert_longt5x_checkpoint_to_flax.py b/src/transformers/models/longt5/convert_longt5x_checkpoint_to_flax.py deleted file mode 100644 index d99797107363..000000000000 --- a/src/transformers/models/longt5/convert_longt5x_checkpoint_to_flax.py +++ /dev/null @@ -1,215 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Convert T5/LongT5X checkpoints from the original repository to JAX/FLAX model. This script is an extension of -'src/transformers/models/t5/convert_t5x_checkpoint_to_flax. 
-""" - -import argparse - -from t5x import checkpoints - -from transformers import AutoConfig, FlaxAutoModelForSeq2SeqLM - - -def convert_t5x_checkpoint_to_flax(t5x_checkpoint_path, config_name, flax_dump_folder_path): - config = AutoConfig.from_pretrained(config_name) - flax_model = FlaxAutoModelForSeq2SeqLM.from_config(config=config) - t5x_model = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path) - - split_mlp_wi = "wi_0" in t5x_model["target"]["encoder"]["layers_0"]["mlp"] - - if config.model_type == "t5": - encoder_attn_name = "SelfAttention" - if config.model_type == "longt5" and config.encoder_attention_type == "local": - encoder_attn_name = "LocalSelfAttention" - elif config.model_type == "longt5" and config.encoder_attention_type == "transient-global": - encoder_attn_name = "TransientGlobalSelfAttention" - else: - raise ValueError( - "Given config is expected to have `model_type='t5'`, or `model_type='longt5` with `encoder_attention_type`" - " attribute with a value from ['local', 'transient-global]." - ) - - # Encoder - for layer_index in range(config.num_layers): - layer_name = f"layers_{str(layer_index)}" - - # Self-Attention - t5x_attention_key = t5x_model["target"]["encoder"][layer_name]["attention"]["key"]["kernel"] - t5x_attention_out = t5x_model["target"]["encoder"][layer_name]["attention"]["out"]["kernel"] - t5x_attention_query = t5x_model["target"]["encoder"][layer_name]["attention"]["query"]["kernel"] - t5x_attention_value = t5x_model["target"]["encoder"][layer_name]["attention"]["value"]["kernel"] - - # Global input layer norm - if config.model_type == "longt5" and config.encoder_attention_type == "transient-global": - t5x_global_layer_norm = t5x_model["target"]["encoder"][layer_name]["attention"]["T5LayerNorm_0"]["scale"] - - # Layer Normalization - t5x_attention_layer_norm = t5x_model["target"]["encoder"][layer_name]["pre_attention_layer_norm"]["scale"] - - if split_mlp_wi: - t5x_mlp_wi_0 = t5x_model["target"]["encoder"][layer_name]["mlp"]["wi_0"]["kernel"] - t5x_mlp_wi_1 = t5x_model["target"]["encoder"][layer_name]["mlp"]["wi_1"]["kernel"] - else: - t5x_mlp_wi = t5x_model["target"]["encoder"][layer_name]["mlp"]["wi"]["kernel"] - - t5x_mlp_wo = t5x_model["target"]["encoder"][layer_name]["mlp"]["wo"]["kernel"] - - # Layer Normalization - t5x_mlp_layer_norm = t5x_model["target"]["encoder"][layer_name]["pre_mlp_layer_norm"]["scale"] - - # Assigning - flax_model_encoder_layer_block = flax_model.params["encoder"]["block"][str(layer_index)]["layer"] - flax_model_encoder_layer_block["0"][encoder_attn_name]["k"]["kernel"] = t5x_attention_key - flax_model_encoder_layer_block["0"][encoder_attn_name]["o"]["kernel"] = t5x_attention_out - flax_model_encoder_layer_block["0"][encoder_attn_name]["q"]["kernel"] = t5x_attention_query - flax_model_encoder_layer_block["0"][encoder_attn_name]["v"]["kernel"] = t5x_attention_value - - flax_model_encoder_layer_block["0"]["layer_norm"]["weight"] = t5x_attention_layer_norm - - # Global input layer norm - if config.model_type == "longt5" and config.encoder_attention_type == "transient-global": - flax_model_encoder_layer_block["0"][encoder_attn_name]["global_input_layer_norm"]["weight"] = ( - t5x_global_layer_norm - ) - - if split_mlp_wi: - flax_model_encoder_layer_block["1"]["DenseReluDense"]["wi_0"]["kernel"] = t5x_mlp_wi_0 - flax_model_encoder_layer_block["1"]["DenseReluDense"]["wi_1"]["kernel"] = t5x_mlp_wi_1 - else: - flax_model_encoder_layer_block["1"]["DenseReluDense"]["wi"]["kernel"] = t5x_mlp_wi - - 
flax_model_encoder_layer_block["1"]["DenseReluDense"]["wo"]["kernel"] = t5x_mlp_wo - flax_model_encoder_layer_block["1"]["layer_norm"]["weight"] = t5x_mlp_layer_norm - - flax_model.params["encoder"]["block"][str(layer_index)]["layer"] = flax_model_encoder_layer_block - - # Only for layer 0: - t5x_encoder_rel_embedding = t5x_model["target"]["encoder"]["relpos_bias"]["rel_embedding"].T - flax_model.params["encoder"]["block"]["0"]["layer"]["0"][encoder_attn_name]["relative_attention_bias"][ - "embedding" - ] = t5x_encoder_rel_embedding - - # Side/global relative position_bias + layer norm - if config.model_type == "longt5" and config.encoder_attention_type == "transient-global": - t5x_encoder_global_rel_embedding = t5x_model["target"]["encoder"]["side_relpos_bias"]["rel_embedding"].T - flax_model.params["encoder"]["block"]["0"]["layer"]["0"][encoder_attn_name]["global_relative_attention_bias"][ - "embedding" - ] = t5x_encoder_global_rel_embedding - - # Assigning - t5x_encoder_norm = t5x_model["target"]["encoder"]["encoder_norm"]["scale"] - flax_model.params["encoder"]["final_layer_norm"]["weight"] = t5x_encoder_norm - - # Decoder - for layer_index in range(config.num_layers): - layer_name = f"layers_{str(layer_index)}" - - # Self-Attention - t5x_attention_key = t5x_model["target"]["decoder"][layer_name]["self_attention"]["key"]["kernel"] - t5x_attention_out = t5x_model["target"]["decoder"][layer_name]["self_attention"]["out"]["kernel"] - t5x_attention_query = t5x_model["target"]["decoder"][layer_name]["self_attention"]["query"]["kernel"] - t5x_attention_value = t5x_model["target"]["decoder"][layer_name]["self_attention"]["value"]["kernel"] - - # Layer Normalization - t5x_pre_attention_layer_norm = t5x_model["target"]["decoder"][layer_name]["pre_self_attention_layer_norm"][ - "scale" - ] - - # Encoder-Decoder-Attention - t5x_enc_dec_attention_module = t5x_model["target"]["decoder"][layer_name]["encoder_decoder_attention"] - t5x_enc_dec_attention_key = t5x_enc_dec_attention_module["key"]["kernel"] - t5x_enc_dec_attention_out = t5x_enc_dec_attention_module["out"]["kernel"] - t5x_enc_dec_attention_query = t5x_enc_dec_attention_module["query"]["kernel"] - t5x_enc_dec_attention_value = t5x_enc_dec_attention_module["value"]["kernel"] - - # Layer Normalization - t5x_cross_layer_norm = t5x_model["target"]["decoder"][layer_name]["pre_cross_attention_layer_norm"]["scale"] - - # MLP - if split_mlp_wi: - t5x_mlp_wi_0 = t5x_model["target"]["decoder"][layer_name]["mlp"]["wi_0"]["kernel"] - t5x_mlp_wi_1 = t5x_model["target"]["decoder"][layer_name]["mlp"]["wi_1"]["kernel"] - else: - t5x_mlp_wi = t5x_model["target"]["decoder"][layer_name]["mlp"]["wi"]["kernel"] - - t5x_mlp_wo = t5x_model["target"]["decoder"][layer_name]["mlp"]["wo"]["kernel"] - - # Layer Normalization - tx5_mlp_layer_norm = t5x_model["target"]["decoder"][layer_name]["pre_mlp_layer_norm"]["scale"] - - # Assigning - flax_model_decoder_layer_block = flax_model.params["decoder"]["block"][str(layer_index)]["layer"] - flax_model_decoder_layer_block["0"]["SelfAttention"]["k"]["kernel"] = t5x_attention_key - flax_model_decoder_layer_block["0"]["SelfAttention"]["o"]["kernel"] = t5x_attention_out - flax_model_decoder_layer_block["0"]["SelfAttention"]["q"]["kernel"] = t5x_attention_query - flax_model_decoder_layer_block["0"]["SelfAttention"]["v"]["kernel"] = t5x_attention_value - - flax_model_decoder_layer_block["0"]["layer_norm"]["weight"] = t5x_pre_attention_layer_norm - - flax_model_decoder_layer_block["1"]["EncDecAttention"]["k"]["kernel"] = 
t5x_enc_dec_attention_key - flax_model_decoder_layer_block["1"]["EncDecAttention"]["o"]["kernel"] = t5x_enc_dec_attention_out - flax_model_decoder_layer_block["1"]["EncDecAttention"]["q"]["kernel"] = t5x_enc_dec_attention_query - flax_model_decoder_layer_block["1"]["EncDecAttention"]["v"]["kernel"] = t5x_enc_dec_attention_value - - flax_model_decoder_layer_block["1"]["layer_norm"]["weight"] = t5x_cross_layer_norm - - if split_mlp_wi: - flax_model_decoder_layer_block["2"]["DenseReluDense"]["wi_0"]["kernel"] = t5x_mlp_wi_0 - flax_model_decoder_layer_block["2"]["DenseReluDense"]["wi_1"]["kernel"] = t5x_mlp_wi_1 - else: - flax_model_decoder_layer_block["2"]["DenseReluDense"]["wi"]["kernel"] = t5x_mlp_wi - - flax_model_decoder_layer_block["2"]["DenseReluDense"]["wo"]["kernel"] = t5x_mlp_wo - - flax_model_decoder_layer_block["2"]["layer_norm"]["weight"] = tx5_mlp_layer_norm - - flax_model.params["decoder"]["block"][str(layer_index)]["layer"] = flax_model_decoder_layer_block - - # Decoder Normalization - tx5_decoder_norm = t5x_model["target"]["decoder"]["decoder_norm"]["scale"] - flax_model.params["decoder"]["final_layer_norm"]["weight"] = tx5_decoder_norm - - # Only for layer 0: - t5x_decoder_rel_embedding = t5x_model["target"]["decoder"]["relpos_bias"]["rel_embedding"].T - flax_model.params["decoder"]["block"]["0"]["layer"]["0"]["SelfAttention"]["relative_attention_bias"][ - "embedding" - ] = t5x_decoder_rel_embedding - - # Token Embeddings - tx5_token_embeddings = t5x_model["target"]["token_embedder"]["embedding"] - flax_model.params["shared"]["embedding"] = tx5_token_embeddings - - # LM Head (only in v1.1 and LongT5 checkpoints) - if "logits_dense" in t5x_model["target"]["decoder"]: - flax_model.params["lm_head"]["kernel"] = t5x_model["target"]["decoder"]["logits_dense"]["kernel"] - - flax_model.save_pretrained(flax_dump_folder_path) - print("T5X Model was successfully converted!") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--t5x_checkpoint_path", default=None, type=str, required=True, help="Path the T5X checkpoint." - ) - parser.add_argument("--config_name", default=None, type=str, required=True, help="Config name of LongT5/T5 model.") - parser.add_argument( - "--flax_dump_folder_path", default=None, type=str, required=True, help="Path to the output FLAX model." 
-    )
-    args = parser.parse_args()
-    convert_t5x_checkpoint_to_flax(args.t5x_checkpoint_path, args.config_name, args.flax_dump_folder_path)
diff --git a/src/transformers/models/longt5/modeling_longt5.py b/src/transformers/models/longt5/modeling_longt5.py
index 4e84a1550349..ea6ab0cfff35 100644
--- a/src/transformers/models/longt5/modeling_longt5.py
+++ b/src/transformers/models/longt5/modeling_longt5.py
@@ -250,7 +250,7 @@ def forward(self, hidden_states):
 try:
     from apex.normalization import FusedRMSNorm
 
-    LongT5LayerNorm = FusedRMSNorm  # noqa
+    LongT5LayerNorm = FusedRMSNorm
 
     logger.info("Discovered apex.normalization.FusedRMSNorm - will use it instead of LongT5LayerNorm")
 except ImportError:
@@ -1270,6 +1270,35 @@ def dummy_inputs(self):
         }
         return dummy_inputs
 
+    def _try_load_missing_tied_module(self, key):
+        module = self
+        key = key.removesuffix(".weight")
+        for sub_key in key.split("."):
+            if not hasattr(module, sub_key):
+                return
+            module = getattr(module, sub_key)
+
+        self._tie_or_clone_weights(module, self.shared)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requested_loading_info = kwargs.get("output_loading_info", False)
+        kwargs["output_loading_info"] = True
+        model, loading_info = super().from_pretrained(*args, **kwargs)
+        missing_keys = loading_info.get("missing_keys", [])
+
+        if hasattr(model, "shared") and hasattr(model, "_tied_weights_keys"):
+            for missing_key in missing_keys:
+                logger.warning(
+                    f"Recovering a missing tied weight {missing_key} from a legacy LongT5 checkpoint. "
+                    f"Consider saving {missing_key} in your checkpoint or updating the config (tie_word_embeddings=true)."
+                )
+                model._try_load_missing_tied_module(missing_key)
+
+        if requested_loading_info:
+            return model, loading_info
+        return model
+
     def _init_weights(self, module):
         """Initialize the weights"""
         factor = self.config.initializer_factor  # Used for testing weights initialization
diff --git a/src/transformers/models/luke/convert_luke_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/luke/convert_luke_original_pytorch_checkpoint_to_pytorch.py
deleted file mode 100644
index 5e0e461862a8..000000000000
--- a/src/transformers/models/luke/convert_luke_original_pytorch_checkpoint_to_pytorch.py
+++ /dev/null
@@ -1,170 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Convert LUKE checkpoint.""" - -import argparse -import json -import os - -import torch - -from transformers import LukeConfig, LukeModel, LukeTokenizer, RobertaTokenizer -from transformers.tokenization_utils_base import AddedToken - - -@torch.no_grad() -def convert_luke_checkpoint(checkpoint_path, metadata_path, entity_vocab_path, pytorch_dump_folder_path, model_size): - # Load configuration defined in the metadata file - with open(metadata_path) as metadata_file: - metadata = json.load(metadata_file) - config = LukeConfig(use_entity_aware_attention=True, **metadata["model_config"]) - - # Load in the weights from the checkpoint_path - state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True) - - # Load the entity vocab file - entity_vocab = load_entity_vocab(entity_vocab_path) - - tokenizer = RobertaTokenizer.from_pretrained(metadata["model_config"]["bert_model_name"]) - - # Add special tokens to the token vocabulary for downstream tasks - entity_token_1 = AddedToken(" ", lstrip=False, rstrip=False) - entity_token_2 = AddedToken(" ", lstrip=False, rstrip=False) - tokenizer.add_special_tokens({"additional_special_tokens": [entity_token_1, entity_token_2]}) - config.vocab_size += 2 - - print(f"Saving tokenizer to {pytorch_dump_folder_path}") - tokenizer.save_pretrained(pytorch_dump_folder_path) - with open(os.path.join(pytorch_dump_folder_path, LukeTokenizer.vocab_files_names["entity_vocab_file"]), "w") as f: - json.dump(entity_vocab, f) - - tokenizer = LukeTokenizer.from_pretrained(pytorch_dump_folder_path) - - # Initialize the embeddings of the special tokens - word_emb = state_dict["embeddings.word_embeddings.weight"] - ent_emb = word_emb[tokenizer.convert_tokens_to_ids(["@"])[0]].unsqueeze(0) - ent2_emb = word_emb[tokenizer.convert_tokens_to_ids(["#"])[0]].unsqueeze(0) - state_dict["embeddings.word_embeddings.weight"] = torch.cat([word_emb, ent_emb, ent2_emb]) - - # Initialize the query layers of the entity-aware self-attention mechanism - for layer_index in range(config.num_hidden_layers): - for matrix_name in ["query.weight", "query.bias"]: - prefix = f"encoder.layer.{layer_index}.attention.self." - state_dict[prefix + "w2e_" + matrix_name] = state_dict[prefix + matrix_name] - state_dict[prefix + "e2w_" + matrix_name] = state_dict[prefix + matrix_name] - state_dict[prefix + "e2e_" + matrix_name] = state_dict[prefix + matrix_name] - - # Initialize the embedding of the [MASK2] entity using that of the [MASK] entity for downstream tasks - entity_emb = state_dict["entity_embeddings.entity_embeddings.weight"] - entity_emb[entity_vocab["[MASK2]"]] = entity_emb[entity_vocab["[MASK]"]] - - model = LukeModel(config=config).eval() - - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - if not (len(missing_keys) == 1 and missing_keys[0] == "embeddings.position_ids"): - raise ValueError(f"Missing keys {', '.join(missing_keys)}. 
Expected only missing embeddings.position_ids") - if not (all(key.startswith("entity_predictions") or key.startswith("lm_head") for key in unexpected_keys)): - raise ValueError( - "Unexpected keys" - f" {', '.join([key for key in unexpected_keys if not (key.startswith('entity_predictions') or key.startswith('lm_head'))])}" - ) - - # Check outputs - tokenizer = LukeTokenizer.from_pretrained(pytorch_dump_folder_path, task="entity_classification") - - text = ( - "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck as a fortuitous netcord helped the" - " new world number one avoid a humiliating second- round exit at Wimbledon ." - ) - span = (39, 42) - encoding = tokenizer(text, entity_spans=[span], add_prefix_space=True, return_tensors="pt") - - outputs = model(**encoding) - - # Verify word hidden states - if model_size == "large": - expected_shape = torch.Size((1, 42, 1024)) - expected_slice = torch.tensor( - [[0.0133, 0.0865, 0.0095], [0.3093, -0.2576, -0.7418], [-0.1720, -0.2117, -0.2869]] - ) - else: # base - expected_shape = torch.Size((1, 42, 768)) - expected_slice = torch.tensor([[0.0037, 0.1368, -0.0091], [0.1099, 0.3329, -0.1095], [0.0765, 0.5335, 0.1179]]) - - if not (outputs.last_hidden_state.shape == expected_shape): - raise ValueError( - f"Outputs.last_hidden_state.shape is {outputs.last_hidden_state.shape}, Expected shape is {expected_shape}" - ) - if not torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4): - raise ValueError - - # Verify entity hidden states - if model_size == "large": - expected_shape = torch.Size((1, 1, 1024)) - expected_slice = torch.tensor([[0.0466, -0.0106, -0.0179]]) - else: # base - expected_shape = torch.Size((1, 1, 768)) - expected_slice = torch.tensor([[0.1457, 0.1044, 0.0174]]) - - if not (outputs.entity_last_hidden_state.shape != expected_shape): - raise ValueError( - f"Outputs.entity_last_hidden_state.shape is {outputs.entity_last_hidden_state.shape}, Expected shape is" - f" {expected_shape}" - ) - if not torch.allclose(outputs.entity_last_hidden_state[0, :3, :3], expected_slice, atol=1e-4): - raise ValueError - - # Finally, save our PyTorch model and tokenizer - print(f"Saving PyTorch model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - - -def load_entity_vocab(entity_vocab_path): - entity_vocab = {} - with open(entity_vocab_path, "r", encoding="utf-8") as f: - for index, line in enumerate(f): - title, _ = line.rstrip().split("\t") - entity_vocab[title] = index - - return entity_vocab - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument("--checkpoint_path", type=str, help="Path to a pytorch_model.bin file.") - parser.add_argument( - "--metadata_path", default=None, type=str, help="Path to a metadata.json file, defining the configuration." - ) - parser.add_argument( - "--entity_vocab_path", - default=None, - type=str, - help="Path to an entity_vocab.tsv file, containing the entity vocabulary.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to where to dump the output PyTorch model." - ) - parser.add_argument( - "--model_size", default="base", type=str, choices=["base", "large"], help="Size of the model to be converted." 
- ) - args = parser.parse_args() - convert_luke_checkpoint( - args.checkpoint_path, - args.metadata_path, - args.entity_vocab_path, - args.pytorch_dump_folder_path, - args.model_size, - ) diff --git a/src/transformers/models/lxmert/convert_lxmert_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/lxmert/convert_lxmert_original_tf_checkpoint_to_pytorch.py deleted file mode 100755 index 1dd77bc36f80..000000000000 --- a/src/transformers/models/lxmert/convert_lxmert_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,59 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert LXMERT checkpoint.""" - -import argparse - -import torch - -from transformers import LxmertConfig, LxmertForPreTraining, load_tf_weights_in_lxmert -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): - # Initialise PyTorch model - config = LxmertConfig.from_json_file(config_file) - print(f"Building PyTorch model from configuration: {config}") - model = LxmertForPreTraining(config) - - # Load weights from tf checkpoint - load_tf_weights_in_lxmert(model, config, tf_checkpoint_path) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - torch.save(model.state_dict(), pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--config_file", - default=None, - type=str, - required=True, - help="The config json file corresponding to the pre-trained model. \nThis specifies the model architecture.", - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/m2m_100/convert_m2m100_original_checkpoint_to_pytorch.py b/src/transformers/models/m2m_100/convert_m2m100_original_checkpoint_to_pytorch.py deleted file mode 100644 index 02e7ef23a085..000000000000 --- a/src/transformers/models/m2m_100/convert_m2m100_original_checkpoint_to_pytorch.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse - -import torch -from torch import nn - -from transformers import M2M100Config, M2M100ForConditionalGeneration - - -def remove_ignore_keys_(state_dict): - ignore_keys = [ - "encoder.version", - "decoder.version", - "model.encoder.version", - "model.decoder.version", - "decoder.output_projection.weight", - "_float_tensor", - "encoder.embed_positions._float_tensor", - "decoder.embed_positions._float_tensor", - ] - for k in ignore_keys: - state_dict.pop(k, None) - - -def make_linear_from_emb(emb): - vocab_size, emb_size = emb.weight.shape - lin_layer = nn.Linear(vocab_size, emb_size, bias=False) - lin_layer.weight.data = emb.weight.data - return lin_layer - - -def convert_fairseq_m2m100_checkpoint_from_disk(checkpoint_path): - m2m_100 = torch.load(checkpoint_path, map_location="cpu", weights_only=True) - args = m2m_100["args"] or m2m_100["cfg"]["model"] - state_dict = m2m_100["model"] - remove_ignore_keys_(state_dict) - vocab_size = state_dict["encoder.embed_tokens.weight"].shape[0] - - config = M2M100Config( - vocab_size=vocab_size, - max_position_embeddings=1024, - encoder_layers=args.encoder_layers, - decoder_layers=args.decoder_layers, - encoder_attention_heads=args.encoder_attention_heads, - decoder_attention_heads=args.decoder_attention_heads, - encoder_ffn_dim=args.encoder_ffn_embed_dim, - decoder_ffn_dim=args.decoder_ffn_embed_dim, - d_model=args.encoder_embed_dim, - encoder_layerdrop=args.encoder_layerdrop, - decoder_layerdrop=args.decoder_layerdrop, - dropout=args.dropout, - attention_dropout=args.attention_dropout, - activation_dropout=args.activation_dropout, - activation_function="relu", - ) - - state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"] - model = M2M100ForConditionalGeneration(config) - model.model.load_state_dict(state_dict, strict=False) - model.lm_head = make_linear_from_emb(model.model.shared) - - return model - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument("fairseq_path", type=str, help="path to a model.pt on local filesystem.") - parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - args = parser.parse_args() - model = convert_fairseq_m2m100_checkpoint_from_disk(args.fairseq_pathß) - model.save_pretrained(args.pytorch_dump_folder_path) diff --git a/src/transformers/models/mamba/convert_mamba_ssm_checkpoint_to_pytorch.py b/src/transformers/models/mamba/convert_mamba_ssm_checkpoint_to_pytorch.py deleted file mode 100644 index eaedafa13fe1..000000000000 --- a/src/transformers/models/mamba/convert_mamba_ssm_checkpoint_to_pytorch.py +++ /dev/null @@ -1,152 +0,0 @@ -# coding=utf-8 -# Copyright 2024 state-spaces/mamba org and HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""This script can be used to convert checkpoints provided in the `mamba_ssm` library into the format provided in HuggingFace `transformers`. It depends on the `mamba_ssm` package to be installed.""" - -import argparse -import json -import math - -import torch - -from transformers import AutoTokenizer, MambaConfig, MambaForCausalLM -from transformers.utils import logging -from transformers.utils.import_utils import is_mamba_ssm_available - - -if is_mamba_ssm_available(): - from mamba_ssm.models.config_mamba import MambaConfig as MambaConfigSSM - from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel - - def convert_ssm_config_to_hf_config(config_ssm: MambaConfigSSM) -> MambaConfig: - """Convert a MambaConfig from mamba_ssm to a MambaConfig from transformers.""" - hf_config = MambaConfig() - # Set config hidden size, num hidden layers, and vocab size directly from the original config - hf_config.hidden_size = config_ssm.d_model - hf_config.intermediate_size = config_ssm.d_model * 2 - hf_config.time_step_rank = math.ceil(config_ssm.d_model / 16) - - hf_config.num_hidden_layers = config_ssm.n_layer - vocab_size = config_ssm.vocab_size - pad_vocab_size_multiple = config_ssm.pad_vocab_size_multiple - if (vocab_size % pad_vocab_size_multiple) != 0: - vocab_size += pad_vocab_size_multiple - (vocab_size % pad_vocab_size_multiple) - hf_config.vocab_size = vocab_size - return hf_config - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def convert_mamba_ssm_checkpoint_to_huggingface_model( - original_state_dict: dict, original_ssm_config_dict: dict -) -> tuple[MambaForCausalLM, AutoTokenizer]: - if not is_mamba_ssm_available(): - raise ImportError( - "Calling convert_mamba_ssm_checkpoint_to_huggingface_model requires the mamba_ssm library to be installed. Please install it with `pip install mamba_ssm`." - ) - original_ssm_config = MambaConfigSSM(**original_ssm_config_dict) - - # Convert mamba_ssm config to huggingface MambaConfig - hf_config = convert_ssm_config_to_hf_config(original_ssm_config) - - # No weights need to be renamed between the two models. - converted_state_dict = original_state_dict - - # Load reshaped state dict into a huggingface model. 
- hf_model = MambaForCausalLM(hf_config) - tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b") - hf_model.load_state_dict(converted_state_dict) - return (hf_model, tokenizer) - - -def validate_converted_model( - original_state_dict: dict, original_ssm_config_dict: dict, hf_model: MambaForCausalLM, tokenizer: AutoTokenizer -) -> None: - """Validate the converted model returns the same output as the original model.""" - torch_device = "cuda" - - original_config = MambaConfigSSM(**original_ssm_config_dict) - original_model = MambaLMHeadModel(original_config).to(torch_device) - original_model.load_state_dict(original_state_dict) - - hf_model = hf_model.to(torch_device) - input_ids = tokenizer("Hey how are you doing?", return_tensors="pt")["input_ids"].to(torch_device) - # Assert model logits are close - with torch.no_grad(): - original_model_logits = original_model(input_ids).logits - hf_model_logits = hf_model(input_ids).logits - if not torch.allclose(original_model_logits, hf_model_logits, atol=1e-3): - raise ValueError("The converted model did not return the same logits as the original model.") - - logger.info("Model conversion validated successfully.") - - -def convert_mamba_checkpoint_file_to_huggingface_model_file( - mamba_checkpoint_path: str, config_json_file: str, output_dir: str -) -> None: - if not is_mamba_ssm_available(): - raise ImportError( - "Calling convert_mamba_checkpoint_file_to_huggingface_model_file requires the mamba_ssm library to be installed. Please install it with `pip install mamba_ssm`." - ) - if not torch.cuda.is_available(): - raise ValueError( - "This script is to be run with a CUDA device, as the original mamba_ssm model does not support cpu." - ) - logger.info(f"Loading model from {mamba_checkpoint_path} based on config from {config_json_file}") - # Load weights and config from paths - original_state_dict = torch.load(mamba_checkpoint_path, map_location="cpu", weights_only=True) - with open(config_json_file, "r", encoding="utf-8") as json_file: - original_ssm_config_dict = json.load(json_file) - - # Convert the model - hf_model, tokenizer = convert_mamba_ssm_checkpoint_to_huggingface_model( - original_state_dict, original_ssm_config_dict - ) - - # Validate the conversion - validate_converted_model(original_state_dict, original_ssm_config_dict, hf_model, tokenizer) - - logger.info(f"Model converted successfully. Saving model to {output_dir}") - - # Save new model to pytorch_dump_path - hf_model.save_pretrained(output_dir) - tokenizer.save_pretrained(output_dir) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-i", - "--mamba_checkpoint_file", - type=str, - required=True, - help="Path to a `pytorch_model.bin` mamba_ssm checkpoint file to be converted.", - ) - parser.add_argument( - "-c", - "--config_json_file", - type=str, - required=True, - help="Path to a `config.json` file corresponding to a MambaConfig of the original mamba_ssm model.", - ) - parser.add_argument( - "-o", "--output_dir", type=str, required=True, help="Path to directory to save the converted output model to." 
- ) - args = parser.parse_args() - - convert_mamba_checkpoint_file_to_huggingface_model_file( - args.mamba_checkpoint_file, args.config_json_file, args.output_dir - ) diff --git a/src/transformers/models/mamba2/convert_mamba2_ssm_checkpoint_to_pytorch.py b/src/transformers/models/mamba2/convert_mamba2_ssm_checkpoint_to_pytorch.py deleted file mode 100644 index 482dd539b82d..000000000000 --- a/src/transformers/models/mamba2/convert_mamba2_ssm_checkpoint_to_pytorch.py +++ /dev/null @@ -1,193 +0,0 @@ -# coding=utf-8 -# Copyright 2024 state-spaces/mamba2 org and HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""This script can be used to convert checkpoints provided in the `mamba2_ssm` library into the format provided in HuggingFace `transformers`. It depends on the `mamba2_ssm` package to be installed.""" - -import argparse -import json -from functools import partial -from os import path -from typing import Optional - -import torch -from safetensors import safe_open -from safetensors.torch import save_model - -from transformers import GPTNeoXTokenizerFast, LlamaTokenizerFast, Mamba2Config, Mamba2ForCausalLM - - -def load_state_dict_from_safetensors(mamba2_checkpoint_path: str, ckpt_name: str) -> dict[str, torch.Tensor]: - # Load weights and config from paths - original_state_dict = {} - with safe_open(path.join(mamba2_checkpoint_path, ckpt_name), framework="pt") as f: - for k in f.keys(): - newk = k.removeprefix("model.") - original_state_dict[newk] = f.get_tensor(k).clone() - return original_state_dict - - -def load_state_dict_from_torch(mamba2_checkpoint_path: str, ckpt_name: str) -> dict[str, torch.Tensor]: - return torch.load(path.join(mamba2_checkpoint_path, ckpt_name), map_location="cpu", weights_only=True) - - -def convert_ssm_config_to_hf_config(config_ssm: dict, mamba2_model_dict: dict) -> Mamba2Config: - """Convert a Mamba2Config from mamba_ssm to a Mamba2Config from here.""" - hf_config = Mamba2Config() - - # Switch to a different dict depending on model type - config_dict = mamba2_model_dict - - # Set important values from config and recalculate other resulting entries - hf_config.hidden_size = config_ssm[config_dict["hidden_size"]] - hf_config.num_heads = (hf_config.hidden_size * hf_config.expand) // hf_config.head_dim - hf_config.num_hidden_layers = config_ssm[config_dict["num_hidden_layers"]] - hf_config.n_groups = config_ssm.get(config_dict["n_groups"], 1) - hf_config.tie_word_embeddings = config_ssm["tie_embeddings"] - hf_config.bos_token_id = config_dict["bos_token_id"] - hf_config.pad_token_id = config_dict["pad_token_id"] - hf_config.eos_token_id = config_dict["eos_token_id"] - - # Padded vocab size, mostly of 16 but 32 is also very common in different models - vocab_size = config_ssm["vocab_size"] - pad_vocab_size_multiple = config_ssm["pad_vocab_size_multiple"] - if (vocab_size % pad_vocab_size_multiple) != 0: - vocab_size += pad_vocab_size_multiple - (vocab_size % pad_vocab_size_multiple) - hf_config.vocab_size = vocab_size - - return hf_config - - 
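For readers skimming the removed mamba/mamba2 converters above, the vocab-size handling reduces to a small round-up: pad the vocabulary to the next multiple of pad_vocab_size_multiple. A minimal standalone sketch of that arithmetic, with illustrative values (50277 padded to a multiple of 8) that are typical of mamba configs but not taken from this diff:

# Sketch of the pad_vocab_size_multiple round-up used by the deleted converters (illustrative, not part of the patch).
def pad_vocab_size(vocab_size: int, multiple: int) -> int:
    # Round vocab_size up to the next multiple, leaving already-aligned sizes unchanged.
    if vocab_size % multiple != 0:
        vocab_size += multiple - (vocab_size % multiple)
    return vocab_size


print(pad_vocab_size(50277, 8))  # -> 50280
print(pad_vocab_size(50280, 8))  # -> 50280 (already aligned)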
-def load_and_save_tokenizer( - mamba2_model_type: str, - output_dir: str, - tokenizer_model_path: Optional[str] = None, -) -> None: - tokenizer = None - - # Load tokenizer - if tokenizer_model_path is not None and mamba2_model_type == "codestral": - tokenizer_class = LlamaTokenizerFast - tokenizer = tokenizer_class(tokenizer_model_path, legacy=False, from_slow=True) - elif mamba2_model_type == "mamba_ssm": - tokenizer = GPTNeoXTokenizerFast.from_pretrained("state-spaces/mamba-130m-hf", padding_side="left") - - # Save tokenizer - if tokenizer is not None: - tokenizer.save_pretrained(output_dir) - - -_MAMBA2_MODELS_DICT = { - "codestral": { - "hidden_size": "dim", - "num_hidden_layers": "n_layers", - "n_groups": "n_groups", - "bos_token_id": 0, - "pad_token_id": 1, - "eos_token_id": 2, - "config_name": "params.json", - "load_state_dict": partial(load_state_dict_from_safetensors, ckpt_name="consolidated.safetensors"), - "load_and_save_tokenizer": partial(load_and_save_tokenizer, "codestral"), - }, - "mamba_ssm": { - "hidden_size": "d_model", - "num_hidden_layers": "n_layer", - "n_groups": "ngroups", - "bos_token_id": 0, - "pad_token_id": 0, - "eos_token_id": 0, - "config_name": "config.json", - "load_state_dict": partial(load_state_dict_from_torch, ckpt_name="pytorch_model.bin"), - "load_and_save_tokenizer": partial(load_and_save_tokenizer, "mamba_ssm"), - }, -} - - -def convert_mamba2_checkpoint_file_to_huggingface_model_file( - mamba2_checkpoint_path: str, - mamba2_model_type: str, - precision: str, - output_dir: str, - tokenizer_model_path: Optional[str] = None, -) -> None: - mamba2_model_dict = _MAMBA2_MODELS_DICT[mamba2_model_type] - - # Load and save config based on name - config_path = path.join(mamba2_checkpoint_path, mamba2_model_dict["config_name"]) - with open(config_path, "r", encoding="utf-8") as json_file: - config = json.load(json_file) - hf_config = convert_ssm_config_to_hf_config(config_ssm=config, mamba2_model_dict=mamba2_model_dict) - hf_config.save_pretrained(output_dir) - - # Load state dict of the original model and transfer to hf model - original_state_dict = mamba2_model_dict["load_state_dict"](mamba2_checkpoint_path=mamba2_checkpoint_path) - hf_model = Mamba2ForCausalLM(hf_config) - hf_model.load_state_dict(original_state_dict) - - # Save new model to pytorch_dump_path - dtype = torch.float32 if precision == "fp32" else (torch.bfloat16 if precision == "bf16" else torch.float16) - save_model(hf_model.to(dtype), path.join(output_dir, "model.safetensors"), metadata={"format": "pt"}) - - # Load and save tokenizer - mamba2_model_dict["load_and_save_tokenizer"](output_dir=output_dir, tokenizer_model_path=tokenizer_model_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-i", - "--mamba2_checkpoint_directory", - type=str, - required=True, - help="Path to a directory containing the `pytorch_model.bin` or `.safetensors` mamba2_ssm checkpoint file to be converted.", - ) - parser.add_argument( - "-m", - "--mamba2_model_type", - type=str, - default="mamba_ssm", - const="mamba_ssm", - required=True, - choices=("codestral", "mamba_ssm"), - help="The model type the conversion will be performed on. Can choose from either `codestral` or `mamba_ssm`.", - ) - parser.add_argument( - "-p", - "--precision", - type=str, - default="fp16", - const="fp16", - required=True, - choices=("fp32", "fp16", "bf16"), - help="The precision the model will be saved in. 
Select from fp32, fp16 or bf16.", - ) - parser.add_argument( - "-o", "--output_dir", type=str, required=True, help="Path to directory to save the converted output model to." - ) - parser.add_argument( - "-t", - "--tokenizer_model_path", - type=str, - default=None, - required=False, - help="Path to a `codestral` tokenizer file.", - ) - args = parser.parse_args() - - convert_mamba2_checkpoint_file_to_huggingface_model_file( - args.mamba2_checkpoint_directory, - args.mamba2_model_type, - args.precision, - args.output_dir, - args.tokenizer_model_path, - ) diff --git a/src/transformers/models/mamba2/modeling_mamba2.py b/src/transformers/models/mamba2/modeling_mamba2.py index a423c5b42fbd..bb24e2422d32 100644 --- a/src/transformers/models/mamba2/modeling_mamba2.py +++ b/src/transformers/models/mamba2/modeling_mamba2.py @@ -286,7 +286,7 @@ def __init__(self, config: Mamba2Config, layer_idx: int): if not is_fast_path_available: logger.warning_once( - "The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`" + "The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`" " is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and" " https://github.com/Dao-AILab/causal-conv1d" ) diff --git a/src/transformers/models/marian/convert_marian_tatoeba_to_pytorch.py b/src/transformers/models/marian/convert_marian_tatoeba_to_pytorch.py deleted file mode 100644 index abd1c4768d16..000000000000 --- a/src/transformers/models/marian/convert_marian_tatoeba_to_pytorch.py +++ /dev/null @@ -1,1326 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import datetime -import json -import os -import re -from pathlib import Path - -import yaml -from tqdm import tqdm - -from transformers.models.marian.convert_marian_to_pytorch import ( - FRONT_MATTER_TEMPLATE, - convert, - convert_opus_name_to_hf_name, - download_and_unzip, - get_system_metadata, -) - - -DEFAULT_REPO = "Tatoeba-Challenge" -DEFAULT_MODEL_DIR = os.path.join(DEFAULT_REPO, "models") -ISO_URL = "https://cdn-datasets.huggingface.co/language_codes/iso-639-3.csv" -ISO_PATH = "lang_code_data/iso-639-3.csv" -LANG_CODE_PATH = "lang_code_data/language-codes-3b2.csv" -TATOEBA_MODELS_URL = "https://object.pouta.csc.fi/Tatoeba-MT-models" - - -class TatoebaConverter: - """ - Convert Tatoeba-Challenge models to huggingface format. - - Steps: - - 1. Convert numpy state dict to hf format (same code as OPUS-MT-Train conversion). - 2. Rename opus model to huggingface format. This means replace each alpha3 code with an alpha2 code if a unique - one exists. e.g. aav-eng -> aav-en, heb-eng -> he-en - 3. Select the best model for a particular pair, parse the yml for it and write a model card. By default the - best model is the one listed first in released-model-results, but it's also possible to specify the most - recent one. 
- """ - - def __init__(self, save_dir="marian_converted"): - assert Path(DEFAULT_REPO).exists(), "need git clone git@github.com:Helsinki-NLP/Tatoeba-Challenge.git" - self.download_lang_info() - self.model_results = json.load(open("Tatoeba-Challenge/models/released-model-results.json")) - self.alpha3_to_alpha2 = {} - for line in open(ISO_PATH): - parts = line.split("\t") - if len(parts[0]) == 3 and len(parts[3]) == 2: - self.alpha3_to_alpha2[parts[0]] = parts[3] - for line in LANG_CODE_PATH: - parts = line.split(",") - if len(parts[0]) == 3 and len(parts[1]) == 2: - self.alpha3_to_alpha2[parts[0]] = parts[1] - self.model_card_dir = Path(save_dir) - self.tag2name = {} - for key, value in GROUP_MEMBERS.items(): - self.tag2name[key] = value[0] - - def convert_models(self, tatoeba_ids, dry_run=False): - models_to_convert = [self.parse_metadata(x) for x in tatoeba_ids] - save_dir = Path("marian_ckpt") - dest_dir = Path(self.model_card_dir) - dest_dir.mkdir(exist_ok=True) - for model in tqdm(models_to_convert): # k, prepro, download, test_set_url in tqdm(model_list): - if "SentencePiece" not in model["pre-processing"]: - print(f"Skipping {model['release']} because it doesn't appear to use SentencePiece") - continue - if not os.path.exists(save_dir / model["_name"]): - download_and_unzip(f"{TATOEBA_MODELS_URL}/{model['release']}", save_dir / model["_name"]) - # from convert_marian_to_pytorch - opus_language_groups_to_hf = convert_opus_name_to_hf_name - pair_name = opus_language_groups_to_hf(model["_name"]) - convert(save_dir / model["_name"], dest_dir / f"opus-mt-{pair_name}") - self.write_model_card(model, dry_run=dry_run) - - def expand_group_to_two_letter_codes(self, grp_name): - return [self.alpha3_to_alpha2.get(x, x) for x in GROUP_MEMBERS[grp_name][1]] - - def is_group(self, code, name): - return "languages" in name or len(GROUP_MEMBERS.get(code, [])) > 1 - - def get_tags(self, code, name): - if len(code) == 2: - assert "languages" not in name, f"{code}: {name}" - return [code] - elif self.is_group(code, name): - group = self.expand_group_to_two_letter_codes(code) - group.append(code) - return group - else: # zho-> zh - print(f"Three letter monolingual code: {code}") - return [code] - - def resolve_lang_code(self, src, tgt) -> tuple[str, str]: - src_tags = self.get_tags(src, self.tag2name[src]) - tgt_tags = self.get_tags(tgt, self.tag2name[tgt]) - return src_tags, tgt_tags - - @staticmethod - def model_type_info_from_model_name(name): - info = {"_has_backtranslated_data": False} - if "1m" in name: - info["_data_per_pair"] = str(1e6) - if "2m" in name: - info["_data_per_pair"] = str(2e6) - if "4m" in name: - info["_data_per_pair"] = str(4e6) - if "+bt" in name: - info["_has_backtranslated_data"] = True - if "tuned4" in name: - info["_tuned"] = re.search(r"tuned4[^-]+", name).group() - return info - - def write_model_card(self, model_dict, dry_run=False) -> str: - """ - Construct card from data parsed from YAML and the model's name. 
upload command: aws s3 sync model_card_dir - s3://models.huggingface.co/bert/Helsinki-NLP/ --dryrun - """ - model_dir_url = f"{TATOEBA_MODELS_URL}/{model_dict['release']}" - long_pair = model_dict["_name"].split("-") - assert len(long_pair) == 2, f"got a translation pair {model_dict['_name']} that doesn't appear to be a pair" - short_src = self.alpha3_to_alpha2.get(long_pair[0], long_pair[0]) - short_tgt = self.alpha3_to_alpha2.get(long_pair[1], long_pair[1]) - model_dict["_hf_model_id"] = f"opus-mt-{short_src}-{short_tgt}" - - a3_src, a3_tgt = model_dict["_name"].split("-") - # opus_src_tags, opus_tgt_tags = a3_src.split("+"), a3_tgt.split("+") - - # This messy part tries to deal with language tags in multilingual models, possibly - # not all having three-letter codes - resolved_src_tags, resolved_tgt_tags = self.resolve_lang_code(a3_src, a3_tgt) - a2_src_tags, a2_tgt_tags = [], [] - for tag in resolved_src_tags: - if tag not in self.alpha3_to_alpha2: - a2_src_tags.append(tag) - for tag in resolved_tgt_tags: - if tag not in self.alpha3_to_alpha2: - a2_tgt_tags.append(tag) - - lang_tags = dedup(a2_src_tags + a2_tgt_tags) - src_multilingual, tgt_multilingual = (len(a2_src_tags) > 1), (len(a2_tgt_tags) > 1) - s, t = ",".join(a2_src_tags), ",".join(a2_tgt_tags) - - metadata = { - "hf_name": model_dict["_name"], - "source_languages": s, - "target_languages": t, - "opus_readme_url": f"{model_dir_url}/README.md", - "original_repo": "Tatoeba-Challenge", - "tags": ["translation"], - "languages": lang_tags, - } - lang_tags = l2front_matter(lang_tags) - - metadata["src_constituents"] = list(GROUP_MEMBERS[a3_src][1]) - metadata["tgt_constituents"] = list(GROUP_MEMBERS[a3_tgt][1]) - metadata["src_multilingual"] = src_multilingual - metadata["tgt_multilingual"] = tgt_multilingual - - backtranslated_data = "" - if model_dict["_has_backtranslated_data"]: - backtranslated_data = " with backtranslations" - - multilingual_data = "" - if "_data_per_pair" in model_dict: - multilingual_data = f"* data per pair in multilingual model: {model_dict['_data_per_pair']}\n" - - tuned = "" - if "_tuned" in model_dict: - tuned = f"* multilingual model tuned for: {model_dict['_tuned']}\n" - - model_base_filename = model_dict["release"].split("/")[-1] - download = f"* download original weights: [{model_base_filename}]({model_dir_url}/{model_dict['release']})\n" - - langtoken = "" - if tgt_multilingual: - langtoken = ( - "* a sentence-initial language token is required in the form of >>id<<" - "(id = valid, usually three-letter target language ID)\n" - ) - - metadata.update(get_system_metadata(DEFAULT_REPO)) - - scorestable = "" - for k, v in model_dict.items(): - if "scores" in k: - this_score_table = f"* {k}\n|Test set|score|\n|---|---|\n" - pairs = sorted(v.items(), key=lambda x: x[1], reverse=True) - for pair in pairs: - this_score_table += f"|{pair[0]}|{pair[1]}|\n" - scorestable += this_score_table - - datainfo = "" - if "training-data" in model_dict: - datainfo += "* Training data: \n" - for k, v in model_dict["training-data"].items(): - datainfo += f" * {str(k)}: {str(v)}\n" - if "validation-data" in model_dict: - datainfo += "* Validation data: \n" - for k, v in model_dict["validation-data"].items(): - datainfo += f" * {str(k)}: {str(v)}\n" - if "test-data" in model_dict: - datainfo += "* Test data: \n" - for k, v in model_dict["test-data"].items(): - datainfo += f" * {str(k)}: {str(v)}\n" - - testsetfilename = model_dict["release"].replace(".zip", ".test.txt") - testscoresfilename = 
model_dict["release"].replace(".zip", ".eval.txt") - testset = f"* test set translations file: [test.txt]({model_dir_url}/{testsetfilename})\n" - testscores = f"* test set scores file: [eval.txt]({model_dir_url}/{testscoresfilename})\n" - - # combine with Tatoeba markdown - readme_url = f"{TATOEBA_MODELS_URL}/{model_dict['_name']}/README.md" - extra_markdown = f""" -### {model_dict["_name"]} - -* source language name: {self.tag2name[a3_src]} -* target language name: {self.tag2name[a3_tgt]} -* OPUS readme: [README.md]({readme_url}) -""" - - content = ( - f""" -* model: {model_dict["modeltype"]} -* source language code{src_multilingual * "s"}: {", ".join(a2_src_tags)} -* target language code{tgt_multilingual * "s"}: {", ".join(a2_tgt_tags)} -* dataset: opus {backtranslated_data} -* release date: {model_dict["release-date"]} -* pre-processing: {model_dict["pre-processing"]} -""" - + multilingual_data - + tuned - + download - + langtoken - + datainfo - + testset - + testscores - + scorestable - ) - - content = FRONT_MATTER_TEMPLATE.format(lang_tags) + extra_markdown + content - - items = "\n".join([f"* {k}: {v}" for k, v in metadata.items()]) - sec3 = "\n### System Info: \n" + items - content += sec3 - if dry_run: - print("CONTENT:") - print(content) - print("METADATA:") - print(metadata) - return - sub_dir = self.model_card_dir / model_dict["_hf_model_id"] - sub_dir.mkdir(exist_ok=True) - dest = sub_dir / "README.md" - dest.open("w").write(content) - for k, v in metadata.items(): - if isinstance(v, datetime.date): - metadata[k] = datetime.datetime.strftime(v, "%Y-%m-%d") - with open(sub_dir / "metadata.json", "w", encoding="utf-8") as writeobj: - json.dump(metadata, writeobj) - - def download_lang_info(self): - global LANG_CODE_PATH - Path(LANG_CODE_PATH).parent.mkdir(exist_ok=True) - import wget - from huggingface_hub import hf_hub_download - - if not os.path.exists(ISO_PATH): - wget.download(ISO_URL, ISO_PATH) - if not os.path.exists(LANG_CODE_PATH): - LANG_CODE_PATH = hf_hub_download( - repo_id="huggingface/language_codes_marianMT", filename="language-codes-3b2.csv", repo_type="dataset" - ) - - def parse_metadata(self, model_name, repo_path=DEFAULT_MODEL_DIR, method="best"): - p = Path(repo_path) / model_name - - def url_to_name(url): - return url.split("/")[-1].split(".")[0] - - if model_name not in self.model_results: - # This is not a language pair, so model results are ambiguous, go by newest - method = "newest" - - if method == "best": - # Sort by how early they appear in released-models-results - results = [url_to_name(model["download"]) for model in self.model_results[model_name]] - ymls = [f for f in os.listdir(p) if f.endswith(".yml") and f[:-4] in results] - ymls.sort(key=lambda x: results.index(x[:-4])) - metadata = yaml.safe_load(open(p / ymls[0])) - metadata.update(self.model_type_info_from_model_name(ymls[0][:-4])) - elif method == "newest": - ymls = [f for f in os.listdir(p) if f.endswith(".yml")] - # Sort by date - ymls.sort( - key=lambda x: datetime.datetime.strptime(re.search(r"\d\d\d\d-\d\d?-\d\d?", x).group(), "%Y-%m-%d") - ) - metadata = yaml.safe_load(open(p / ymls[-1])) - metadata.update(self.model_type_info_from_model_name(ymls[-1][:-4])) - else: - raise NotImplementedError(f"Don't know argument method='{method}' to parse_metadata()") - metadata["_name"] = model_name - return metadata - - -GROUP_MEMBERS = { - # three letter code -> (group/language name, {constituents...} - # if this language is on the target side the constituents can be used as target language 
codes. - # if the language is on the source side they are supported natively without special codes. - "aav": ("Austro-Asiatic languages", {"hoc", "hoc_Latn", "kha", "khm", "khm_Latn", "mnw", "vie", "vie_Hani"}), - "afa": ( - "Afro-Asiatic languages", - { - "acm", - "afb", - "amh", - "apc", - "ara", - "arq", - "ary", - "arz", - "hau_Latn", - "heb", - "kab", - "mlt", - "rif_Latn", - "shy_Latn", - "som", - "thv", - "tir", - }, - ), - "afr": ("Afrikaans", {"afr"}), - "alv": ( - "Atlantic-Congo languages", - { - "ewe", - "fuc", - "fuv", - "ibo", - "kin", - "lin", - "lug", - "nya", - "run", - "sag", - "sna", - "swh", - "toi_Latn", - "tso", - "umb", - "wol", - "xho", - "yor", - "zul", - }, - ), - "ara": ("Arabic", {"afb", "apc", "apc_Latn", "ara", "ara_Latn", "arq", "arq_Latn", "arz"}), - "art": ( - "Artificial languages", - { - "afh_Latn", - "avk_Latn", - "dws_Latn", - "epo", - "ido", - "ido_Latn", - "ile_Latn", - "ina_Latn", - "jbo", - "jbo_Cyrl", - "jbo_Latn", - "ldn_Latn", - "lfn_Cyrl", - "lfn_Latn", - "nov_Latn", - "qya", - "qya_Latn", - "sjn_Latn", - "tlh_Latn", - "tzl", - "tzl_Latn", - "vol_Latn", - }, - ), - "aze": ("Azerbaijani", {"aze_Latn"}), - "bat": ("Baltic languages", {"lit", "lav", "prg_Latn", "ltg", "sgs"}), - "bel": ("Belarusian", {"bel", "bel_Latn"}), - "ben": ("Bengali", {"ben"}), - "bnt": ( - "Bantu languages", - {"kin", "lin", "lug", "nya", "run", "sna", "swh", "toi_Latn", "tso", "umb", "xho", "zul"}, - ), - "bul": ("Bulgarian", {"bul", "bul_Latn"}), - "cat": ("Catalan", {"cat"}), - "cau": ("Caucasian languages", {"abk", "kat", "che", "ady"}), - "ccs": ("South Caucasian languages", {"kat"}), - "ceb": ("Cebuano", {"ceb"}), - "cel": ("Celtic languages", {"gla", "gle", "bre", "cor", "glv", "cym"}), - "ces": ("Czech", {"ces"}), - "cpf": ("Creoles and pidgins, French‑based", {"gcf_Latn", "hat", "mfe"}), - "cpp": ( - "Creoles and pidgins, Portuguese-based", - {"zsm_Latn", "ind", "pap", "min", "tmw_Latn", "max_Latn", "zlm_Latn"}, - ), - "cus": ("Cushitic languages", {"som"}), - "dan": ("Danish", {"dan"}), - "deu": ("German", {"deu"}), - "dra": ("Dravidian languages", {"tam", "kan", "mal", "tel"}), - "ell": ("Modern Greek (1453-)", {"ell"}), - "eng": ("English", {"eng"}), - "epo": ("Esperanto", {"epo"}), - "est": ("Estonian", {"est"}), - "euq": ("Basque (family)", {"eus"}), - "eus": ("Basque", {"eus"}), - "fin": ("Finnish", {"fin"}), - "fiu": ( - "Finno-Ugrian languages", - { - "est", - "fin", - "fkv_Latn", - "hun", - "izh", - "kpv", - "krl", - "liv_Latn", - "mdf", - "mhr", - "myv", - "sma", - "sme", - "udm", - "vep", - "vro", - }, - ), - "fra": ("French", {"fra"}), - "gem": ( - "Germanic languages", - { - "afr", - "ang_Latn", - "dan", - "deu", - "eng", - "enm_Latn", - "fao", - "frr", - "fry", - "gos", - "got_Goth", - "gsw", - "isl", - "ksh", - "ltz", - "nds", - "nld", - "nno", - "nob", - "nob_Hebr", - "non_Latn", - "pdc", - "sco", - "stq", - "swe", - "swg", - "yid", - }, - ), - "gle": ("Irish", {"gle"}), - "glg": ("Galician", {"glg"}), - "gmq": ("North Germanic languages", {"dan", "nob", "nob_Hebr", "swe", "isl", "nno", "non_Latn", "fao"}), - "gmw": ( - "West Germanic languages", - { - "afr", - "ang_Latn", - "deu", - "eng", - "enm_Latn", - "frr", - "fry", - "gos", - "gsw", - "ksh", - "ltz", - "nds", - "nld", - "pdc", - "sco", - "stq", - "swg", - "yid", - }, - ), - "grk": ("Greek languages", {"grc_Grek", "ell"}), - "hbs": ("Serbo-Croatian", {"hrv", "srp_Cyrl", "bos_Latn", "srp_Latn"}), - "heb": ("Hebrew", {"heb"}), - "hin": ("Hindi", {"hin"}), - "hun": ("Hungarian", {"hun"}), - 
"hye": ("Armenian", {"hye", "hye_Latn"}), - "iir": ( - "Indo-Iranian languages", - { - "asm", - "awa", - "ben", - "bho", - "gom", - "guj", - "hif_Latn", - "hin", - "jdt_Cyrl", - "kur_Arab", - "kur_Latn", - "mai", - "mar", - "npi", - "ori", - "oss", - "pan_Guru", - "pes", - "pes_Latn", - "pes_Thaa", - "pnb", - "pus", - "rom", - "san_Deva", - "sin", - "snd_Arab", - "tgk_Cyrl", - "tly_Latn", - "urd", - "zza", - }, - ), - "ilo": ("Iloko", {"ilo"}), - "inc": ( - "Indic languages", - { - "asm", - "awa", - "ben", - "bho", - "gom", - "guj", - "hif_Latn", - "hin", - "mai", - "mar", - "npi", - "ori", - "pan_Guru", - "pnb", - "rom", - "san_Deva", - "sin", - "snd_Arab", - "urd", - }, - ), - "ine": ( - "Indo-European languages", - { - "afr", - "afr_Arab", - "aln", - "ang_Latn", - "arg", - "asm", - "ast", - "awa", - "bel", - "bel_Latn", - "ben", - "bho", - "bjn", - "bos_Latn", - "bre", - "bul", - "bul_Latn", - "cat", - "ces", - "cor", - "cos", - "csb_Latn", - "cym", - "dan", - "deu", - "dsb", - "egl", - "ell", - "eng", - "enm_Latn", - "ext", - "fao", - "fra", - "frm_Latn", - "frr", - "fry", - "gcf_Latn", - "gla", - "gle", - "glg", - "glv", - "gom", - "gos", - "got_Goth", - "grc_Grek", - "gsw", - "guj", - "hat", - "hif_Latn", - "hin", - "hrv", - "hsb", - "hye", - "hye_Latn", - "ind", - "isl", - "ita", - "jdt_Cyrl", - "ksh", - "kur_Arab", - "kur_Latn", - "lad", - "lad_Latn", - "lat_Grek", - "lat_Latn", - "lav", - "lij", - "lit", - "lld_Latn", - "lmo", - "ltg", - "ltz", - "mai", - "mar", - "max_Latn", - "mfe", - "min", - "mkd", - "mwl", - "nds", - "nld", - "nno", - "nob", - "nob_Hebr", - "non_Latn", - "npi", - "oci", - "ori", - "orv_Cyrl", - "oss", - "pan_Guru", - "pap", - "pcd", - "pdc", - "pes", - "pes_Latn", - "pes_Thaa", - "pms", - "pnb", - "pol", - "por", - "prg_Latn", - "pus", - "roh", - "rom", - "ron", - "rue", - "rus", - "rus_Latn", - "san_Deva", - "scn", - "sco", - "sgs", - "sin", - "slv", - "snd_Arab", - "spa", - "sqi", - "srd", - "srp_Cyrl", - "srp_Latn", - "stq", - "swe", - "swg", - "tgk_Cyrl", - "tly_Latn", - "tmw_Latn", - "ukr", - "urd", - "vec", - "wln", - "yid", - "zlm_Latn", - "zsm_Latn", - "zza", - }, - ), - "isl": ("Icelandic", {"isl"}), - "ita": ("Italian", {"ita"}), - "itc": ( - "Italic languages", - { - "arg", - "ast", - "bjn", - "cat", - "cos", - "egl", - "ext", - "fra", - "frm_Latn", - "gcf_Latn", - "glg", - "hat", - "ind", - "ita", - "lad", - "lad_Latn", - "lat_Grek", - "lat_Latn", - "lij", - "lld_Latn", - "lmo", - "max_Latn", - "mfe", - "min", - "mwl", - "oci", - "pap", - "pcd", - "pms", - "por", - "roh", - "ron", - "scn", - "spa", - "srd", - "tmw_Latn", - "vec", - "wln", - "zlm_Latn", - "zsm_Latn", - }, - ), - "jpn": ("Japanese", {"jpn", "jpn_Bopo", "jpn_Hang", "jpn_Hani", "jpn_Hira", "jpn_Kana", "jpn_Latn", "jpn_Yiii"}), - "jpx": ("Japanese (family)", {"jpn"}), - "kat": ("Georgian", {"kat"}), - "kor": ("Korean", {"kor_Hani", "kor_Hang", "kor_Latn", "kor"}), - "lav": ("Latvian", {"lav"}), - "lit": ("Lithuanian", {"lit"}), - "mkd": ("Macedonian", {"mkd"}), - "mkh": ("Mon-Khmer languages", {"vie_Hani", "mnw", "vie", "kha", "khm_Latn", "khm"}), - "msa": ("Malay (macrolanguage)", {"zsm_Latn", "ind", "max_Latn", "zlm_Latn", "min"}), - "mul": ( - "Multiple languages", - { - "abk", - "acm", - "ady", - "afb", - "afh_Latn", - "afr", - "akl_Latn", - "aln", - "amh", - "ang_Latn", - "apc", - "ara", - "arg", - "arq", - "ary", - "arz", - "asm", - "ast", - "avk_Latn", - "awa", - "aze_Latn", - "bak", - "bam_Latn", - "bel", - "bel_Latn", - "ben", - "bho", - "bod", - "bos_Latn", - "bre", - "brx", 
- "brx_Latn", - "bul", - "bul_Latn", - "cat", - "ceb", - "ces", - "cha", - "che", - "chr", - "chv", - "cjy_Hans", - "cjy_Hant", - "cmn", - "cmn_Hans", - "cmn_Hant", - "cor", - "cos", - "crh", - "crh_Latn", - "csb_Latn", - "cym", - "dan", - "deu", - "dsb", - "dtp", - "dws_Latn", - "egl", - "ell", - "enm_Latn", - "epo", - "est", - "eus", - "ewe", - "ext", - "fao", - "fij", - "fin", - "fkv_Latn", - "fra", - "frm_Latn", - "frr", - "fry", - "fuc", - "fuv", - "gan", - "gcf_Latn", - "gil", - "gla", - "gle", - "glg", - "glv", - "gom", - "gos", - "got_Goth", - "grc_Grek", - "grn", - "gsw", - "guj", - "hat", - "hau_Latn", - "haw", - "heb", - "hif_Latn", - "hil", - "hin", - "hnj_Latn", - "hoc", - "hoc_Latn", - "hrv", - "hsb", - "hun", - "hye", - "iba", - "ibo", - "ido", - "ido_Latn", - "ike_Latn", - "ile_Latn", - "ilo", - "ina_Latn", - "ind", - "isl", - "ita", - "izh", - "jav", - "jav_Java", - "jbo", - "jbo_Cyrl", - "jbo_Latn", - "jdt_Cyrl", - "jpn", - "kab", - "kal", - "kan", - "kat", - "kaz_Cyrl", - "kaz_Latn", - "kek_Latn", - "kha", - "khm", - "khm_Latn", - "kin", - "kir_Cyrl", - "kjh", - "kpv", - "krl", - "ksh", - "kum", - "kur_Arab", - "kur_Latn", - "lad", - "lad_Latn", - "lao", - "lat_Latn", - "lav", - "ldn_Latn", - "lfn_Cyrl", - "lfn_Latn", - "lij", - "lin", - "lit", - "liv_Latn", - "lkt", - "lld_Latn", - "lmo", - "ltg", - "ltz", - "lug", - "lzh", - "lzh_Hans", - "mad", - "mah", - "mai", - "mal", - "mar", - "max_Latn", - "mdf", - "mfe", - "mhr", - "mic", - "min", - "mkd", - "mlg", - "mlt", - "mnw", - "moh", - "mon", - "mri", - "mwl", - "mww", - "mya", - "myv", - "nan", - "nau", - "nav", - "nds", - "niu", - "nld", - "nno", - "nob", - "nob_Hebr", - "nog", - "non_Latn", - "nov_Latn", - "npi", - "nya", - "oci", - "ori", - "orv_Cyrl", - "oss", - "ota_Arab", - "ota_Latn", - "pag", - "pan_Guru", - "pap", - "pau", - "pdc", - "pes", - "pes_Latn", - "pes_Thaa", - "pms", - "pnb", - "pol", - "por", - "ppl_Latn", - "prg_Latn", - "pus", - "quc", - "qya", - "qya_Latn", - "rap", - "rif_Latn", - "roh", - "rom", - "ron", - "rue", - "run", - "rus", - "sag", - "sah", - "san_Deva", - "scn", - "sco", - "sgs", - "shs_Latn", - "shy_Latn", - "sin", - "sjn_Latn", - "slv", - "sma", - "sme", - "smo", - "sna", - "snd_Arab", - "som", - "spa", - "sqi", - "srp_Cyrl", - "srp_Latn", - "stq", - "sun", - "swe", - "swg", - "swh", - "tah", - "tam", - "tat", - "tat_Arab", - "tat_Latn", - "tel", - "tet", - "tgk_Cyrl", - "tha", - "tir", - "tlh_Latn", - "tly_Latn", - "tmw_Latn", - "toi_Latn", - "ton", - "tpw_Latn", - "tso", - "tuk", - "tuk_Latn", - "tur", - "tvl", - "tyv", - "tzl", - "tzl_Latn", - "udm", - "uig_Arab", - "uig_Cyrl", - "ukr", - "umb", - "urd", - "uzb_Cyrl", - "uzb_Latn", - "vec", - "vie", - "vie_Hani", - "vol_Latn", - "vro", - "war", - "wln", - "wol", - "wuu", - "xal", - "xho", - "yid", - "yor", - "yue", - "yue_Hans", - "yue_Hant", - "zho", - "zho_Hans", - "zho_Hant", - "zlm_Latn", - "zsm_Latn", - "zul", - "zza", - }, - ), - "nic": ( - "Niger-Kordofanian languages", - { - "bam_Latn", - "ewe", - "fuc", - "fuv", - "ibo", - "kin", - "lin", - "lug", - "nya", - "run", - "sag", - "sna", - "swh", - "toi_Latn", - "tso", - "umb", - "wol", - "xho", - "yor", - "zul", - }, - ), - "nld": ("Dutch", {"nld"}), - "nor": ("Norwegian", {"nob", "nno"}), - "phi": ("Philippine languages", {"ilo", "akl_Latn", "war", "hil", "pag", "ceb"}), - "pol": ("Polish", {"pol"}), - "por": ("Portuguese", {"por"}), - "pqe": ( - "Eastern Malayo-Polynesian languages", - {"fij", "gil", "haw", "mah", "mri", "nau", "niu", "rap", "smo", "tah", "ton", "tvl"}, - 
), - "roa": ( - "Romance languages", - { - "arg", - "ast", - "cat", - "cos", - "egl", - "ext", - "fra", - "frm_Latn", - "gcf_Latn", - "glg", - "hat", - "ind", - "ita", - "lad", - "lad_Latn", - "lij", - "lld_Latn", - "lmo", - "max_Latn", - "mfe", - "min", - "mwl", - "oci", - "pap", - "pms", - "por", - "roh", - "ron", - "scn", - "spa", - "tmw_Latn", - "vec", - "wln", - "zlm_Latn", - "zsm_Latn", - }, - ), - "ron": ("Romanian", {"ron"}), - "run": ("Rundi", {"run"}), - "rus": ("Russian", {"rus"}), - "sal": ("Salishan languages", {"shs_Latn"}), - "sem": ("Semitic languages", {"acm", "afb", "amh", "apc", "ara", "arq", "ary", "arz", "heb", "mlt", "tir"}), - "sla": ( - "Slavic languages", - { - "bel", - "bel_Latn", - "bos_Latn", - "bul", - "bul_Latn", - "ces", - "csb_Latn", - "dsb", - "hrv", - "hsb", - "mkd", - "orv_Cyrl", - "pol", - "rue", - "rus", - "slv", - "srp_Cyrl", - "srp_Latn", - "ukr", - }, - ), - "slv": ("Slovenian", {"slv"}), - "spa": ("Spanish", {"spa"}), - "swe": ("Swedish", {"swe"}), - "taw": ("Tai", {"lao", "tha"}), - "tgl": ("Tagalog", {"tgl_Latn"}), - "tha": ("Thai", {"tha"}), - "trk": ( - "Turkic languages", - { - "aze_Latn", - "bak", - "chv", - "crh", - "crh_Latn", - "kaz_Cyrl", - "kaz_Latn", - "kir_Cyrl", - "kjh", - "kum", - "ota_Arab", - "ota_Latn", - "sah", - "tat", - "tat_Arab", - "tat_Latn", - "tuk", - "tuk_Latn", - "tur", - "tyv", - "uig_Arab", - "uig_Cyrl", - "uzb_Cyrl", - "uzb_Latn", - }, - ), - "tur": ("Turkish", {"tur"}), - "ukr": ("Ukrainian", {"ukr"}), - "urd": ("Urdu", {"urd"}), - "urj": ( - "Uralic languages", - { - "est", - "fin", - "fkv_Latn", - "hun", - "izh", - "kpv", - "krl", - "liv_Latn", - "mdf", - "mhr", - "myv", - "sma", - "sme", - "udm", - "vep", - "vro", - }, - ), - "vie": ("Vietnamese", {"vie", "vie_Hani"}), - "war": ("Waray (Philippines)", {"war"}), - "zho": ( - "Chinese", - { - "cjy_Hans", - "cjy_Hant", - "cmn", - "cmn_Bopo", - "cmn_Hang", - "cmn_Hani", - "cmn_Hans", - "cmn_Hant", - "cmn_Hira", - "cmn_Kana", - "cmn_Latn", - "cmn_Yiii", - "gan", - "hak_Hani", - "lzh", - "lzh_Bopo", - "lzh_Hang", - "lzh_Hani", - "lzh_Hans", - "lzh_Hira", - "lzh_Kana", - "lzh_Yiii", - "nan", - "nan_Hani", - "wuu", - "wuu_Bopo", - "wuu_Hani", - "wuu_Latn", - "yue", - "yue_Bopo", - "yue_Hang", - "yue_Hani", - "yue_Hans", - "yue_Hant", - "yue_Hira", - "yue_Kana", - "zho", - "zho_Hans", - "zho_Hant", - }, - ), - "zle": ("East Slavic languages", {"bel", "orv_Cyrl", "bel_Latn", "rus", "ukr", "rue"}), - "zls": ("South Slavic languages", {"bos_Latn", "bul", "bul_Latn", "hrv", "mkd", "slv", "srp_Cyrl", "srp_Latn"}), - "zlw": ("West Slavic languages", {"csb_Latn", "dsb", "hsb", "pol", "ces"}), -} - - -def l2front_matter(langs): - return "".join(f"- {l}\n" for l in langs) - - -def dedup(lst): - """Preservers order""" - new_lst = [] - for item in lst: - if not item or item in new_lst: - continue - else: - new_lst.append(item) - return new_lst - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-m", "--models", action="append", help=" Set flag", required=True, nargs="+", dest="models" - ) - parser.add_argument("-save_dir", "--save_dir", default="marian_converted", help="where to save converted models") - args = parser.parse_args() - resolver = TatoebaConverter(save_dir=args.save_dir) - resolver.convert_models(args.models[0]) diff --git a/src/transformers/models/marian/convert_marian_to_pytorch.py b/src/transformers/models/marian/convert_marian_to_pytorch.py deleted file mode 100644 index 6c432ebcdf6f..000000000000 --- 
a/src/transformers/models/marian/convert_marian_to_pytorch.py +++ /dev/null @@ -1,717 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json -import os -import socket -import time -import warnings -from pathlib import Path -from typing import Union -from zipfile import ZipFile - -import numpy as np -import torch -from huggingface_hub.hf_api import list_models -from torch import nn -from tqdm import tqdm - -from transformers import MarianConfig, MarianMTModel, MarianTokenizer - - -def remove_suffix(text: str, suffix: str): - if text.endswith(suffix): - return text[: -len(suffix)] - return text # or whatever - - -def remove_prefix(text: str, prefix: str): - if text.startswith(prefix): - return text[len(prefix) :] - return text # or whatever - - -def convert_encoder_layer(opus_dict, layer_prefix: str, converter: dict): - sd = {} - for k in opus_dict: - if not k.startswith(layer_prefix): - continue - stripped = remove_prefix(k, layer_prefix) - v = opus_dict[k].T # besides embeddings, everything must be transposed. - sd[converter[stripped]] = torch.tensor(v).squeeze() - return sd - - -def load_layers_(layer_lst: nn.ModuleList, opus_state: dict, converter, is_decoder=False): - for i, layer in enumerate(layer_lst): - layer_tag = f"decoder_l{i + 1}_" if is_decoder else f"encoder_l{i + 1}_" - sd = convert_encoder_layer(opus_state, layer_tag, converter) - layer.load_state_dict(sd, strict=False) - - -def find_pretrained_model(src_lang: str, tgt_lang: str) -> list[str]: - """Find models that can accept src_lang as input and return tgt_lang as output.""" - prefix = "Helsinki-NLP/opus-mt-" - model_list = list_models() - model_ids = [x.id for x in model_list if x.id.startswith("Helsinki-NLP")] - src_and_targ = [ - remove_prefix(m, prefix).lower().split("-") for m in model_ids if "+" not in m - ] # + can't be loaded. 
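The deleted convert_encoder_layer / load_layers_ pair above strips a per-layer prefix from each OPUS key, transposes everything except the embeddings (Marian stores weights transposed relative to PyTorch), and renames the result through a converter table such as BART_CONVERTER further down. A minimal, self-contained sketch of that remapping on a made-up state dict (the tensor shapes and converter entries here are illustrative only, not from a real OPUS checkpoint):

import numpy as np
import torch

def remap_layer(opus_dict, layer_prefix, converter):
    # mirrors the deleted convert_encoder_layer: strip prefix, transpose, rename
    sd = {}
    for k, v in opus_dict.items():
        if not k.startswith(layer_prefix):
            continue
        stripped = k[len(layer_prefix):]
        sd[converter[stripped]] = torch.tensor(v.T).squeeze()
    return sd

# toy single-layer "checkpoint": a 2x4 weight stored transposed, plus its bias
toy_opus = {"encoder_l1_self_Wq": np.ones((2, 4)), "encoder_l1_self_bq": np.zeros((1, 4))}
toy_converter = {"self_Wq": "self_attn.q_proj.weight", "self_bq": "self_attn.q_proj.bias"}
print(remap_layer(toy_opus, "encoder_l1_", toy_converter)["self_attn.q_proj.weight"].shape)  # torch.Size([4, 2])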
- matching = [f"{prefix}{a}-{b}" for (a, b) in src_and_targ if src_lang in a and tgt_lang in b] - return matching - - -def add_emb_entries(wemb, final_bias, n_special_tokens=1): - vsize, d_model = wemb.shape - embs_to_add = np.zeros((n_special_tokens, d_model)) - new_embs = np.concatenate([wemb, embs_to_add]) - bias_to_add = np.zeros((n_special_tokens, 1)) - new_bias = np.concatenate((final_bias, bias_to_add), axis=1) - return new_embs, new_bias - - -def _cast_yaml_str(v): - bool_dct = {"true": True, "false": False} - if not isinstance(v, str): - return v - elif v in bool_dct: - return bool_dct[v] - try: - return int(v) - except (TypeError, ValueError): - return v - - -def cast_marian_config(raw_cfg: dict[str, str]) -> dict: - return {k: _cast_yaml_str(v) for k, v in raw_cfg.items()} - - -CONFIG_KEY = "special:model.yml" - - -def load_config_from_state_dict(opus_dict): - import yaml - - cfg_str = "".join([chr(x) for x in opus_dict[CONFIG_KEY]]) - yaml_cfg = yaml.load(cfg_str[:-1], Loader=yaml.BaseLoader) - return cast_marian_config(yaml_cfg) - - -def find_model_file(dest_dir): # this one better - model_files = list(Path(dest_dir).glob("*.npz")) - if len(model_files) != 1: - raise ValueError(f"Found more than one model file: {model_files}") - model_file = model_files[0] - return model_file - - -# Group Names Logic: change long opus model names to something shorter, like opus-mt-en-ROMANCE -ROM_GROUP = ( - "fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO+es_EC+es_ES+es_GT" - "+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR+pt_PT+gl+lad+an+mwl+it+it_IT+co" - "+nap+scn+vec+sc+ro+la" -) -GROUPS = [ - ("cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh", "ZH"), - (ROM_GROUP, "ROMANCE"), - ("de+nl+fy+af+da+fo+is+no+nb+nn+sv", "NORTH_EU"), - ("da+fo+is+no+nb+nn+sv", "SCANDINAVIA"), - ("se+sma+smj+smn+sms", "SAMI"), - ("nb_NO+nb+nn_NO+nn+nog+no_nb+no", "NORWAY"), - ("ga+cy+br+gd+kw+gv", "CELTIC"), # https://en.wikipedia.org/wiki/Insular_Celtic_languages -] -GROUP_TO_OPUS_NAME = { - "opus-mt-ZH-de": "cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh-de", - "opus-mt-ZH-fi": "cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh-fi", - "opus-mt-ZH-sv": "cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh-sv", - "opus-mt-SCANDINAVIA-SCANDINAVIA": "da+fo+is+no+nb+nn+sv-da+fo+is+no+nb+nn+sv", - "opus-mt-NORTH_EU-NORTH_EU": "de+nl+fy+af+da+fo+is+no+nb+nn+sv-de+nl+fy+af+da+fo+is+no+nb+nn+sv", - "opus-mt-de-ZH": "de-cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh", - "opus-mt-en_el_es_fi-en_el_es_fi": "en+el+es+fi-en+el+es+fi", - "opus-mt-en-ROMANCE": ( - "en-fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO" - "+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR" - "+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la" - ), - "opus-mt-en-CELTIC": "en-ga+cy+br+gd+kw+gv", - "opus-mt-es-NORWAY": "es-nb_NO+nb+nn_NO+nn+nog+no_nb+no", - "opus-mt-fi_nb_no_nn_ru_sv_en-SAMI": "fi+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms", - "opus-mt-fi-ZH": "fi-cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh", - "opus-mt-fi-NORWAY": "fi-nb_NO+nb+nn_NO+nn+nog+no_nb+no", - "opus-mt-ROMANCE-en": ( - "fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO" - "+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR" - 
"+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la-en" - ), - "opus-mt-CELTIC-en": "ga+cy+br+gd+kw+gv-en", - "opus-mt-sv-ZH": "sv-cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh", - "opus-mt-sv-NORWAY": "sv-nb_NO+nb+nn_NO+nn+nog+no_nb+no", -} -OPUS_GITHUB_URL = "https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/models/" -ORG_NAME = "Helsinki-NLP/" - - -def convert_opus_name_to_hf_name(x): - """For OPUS-MT-Train/ DEPRECATED""" - for substr, grp_name in GROUPS: - x = x.replace(substr, grp_name) - return x.replace("+", "_") - - -def convert_hf_name_to_opus_name(hf_model_name): - """ - Relies on the assumption that there are no language codes like pt_br in models that are not in GROUP_TO_OPUS_NAME. - """ - hf_model_name = remove_prefix(hf_model_name, ORG_NAME) - if hf_model_name in GROUP_TO_OPUS_NAME: - opus_w_prefix = GROUP_TO_OPUS_NAME[hf_model_name] - else: - opus_w_prefix = hf_model_name.replace("_", "+") - return remove_prefix(opus_w_prefix, "opus-mt-") - - -def get_system_metadata(repo_root): - import git - - return { - "helsinki_git_sha": git.Repo(path=repo_root, search_parent_directories=True).head.object.hexsha, - "transformers_git_sha": git.Repo(path=".", search_parent_directories=True).head.object.hexsha, - "port_machine": socket.gethostname(), - "port_time": time.strftime("%Y-%m-%d-%H:%M"), - } - - -# docstyle-ignore -FRONT_MATTER_TEMPLATE = """--- -language: -{} -tags: -- translation - -license: apache-2.0 ---- -""" -DEFAULT_REPO = "Tatoeba-Challenge" -DEFAULT_MODEL_DIR = os.path.join(DEFAULT_REPO, "models") - - -def write_model_card( - hf_model_name: str, - repo_root=DEFAULT_REPO, - save_dir=Path("marian_converted"), - dry_run=False, - extra_metadata={}, -) -> str: - """ - Copy the most recent model's readme section from opus, and add metadata. upload command: aws s3 sync model_card_dir - s3://models.huggingface.co/bert/Helsinki-NLP/ --dryrun - """ - import pandas as pd - - hf_model_name = remove_prefix(hf_model_name, ORG_NAME) - opus_name: str = convert_hf_name_to_opus_name(hf_model_name) - if repo_root not in ("OPUS-MT-train", "Tatoeba-Challenge"): - raise ValueError(f"Repos root is {repo_root}. Expected either OPUS-MT-train or Tatoeba-Challenge") - opus_readme_path = Path(repo_root).joinpath("models", opus_name, "README.md") - if not (opus_readme_path.exists()): - raise ValueError(f"Readme file {opus_readme_path} not found") - - opus_src, opus_tgt = [x.split("+") for x in opus_name.split("-")] - - readme_url = f"https://github.com/Helsinki-NLP/{repo_root}/tree/master/models/{opus_name}/README.md" - - s, t = ",".join(opus_src), ",".join(opus_tgt) - metadata = { - "hf_name": hf_model_name, - "source_languages": s, - "target_languages": t, - "opus_readme_url": readme_url, - "original_repo": repo_root, - "tags": ["translation"], - } - metadata.update(extra_metadata) - metadata.update(get_system_metadata(repo_root)) - - # combine with opus markdown - - extra_markdown = ( - f"### {hf_model_name}\n\n* source group: {metadata['src_name']} \n* target group: " - f"{metadata['tgt_name']} \n* OPUS readme: [{opus_name}]({readme_url})\n" - ) - - content = opus_readme_path.open().read() - content = content.split("\n# ")[-1] # Get the lowest level 1 header in the README -- the most recent model. 
- splat = content.split("*")[2:] - print(splat[3]) - content = "*".join(splat) - content = ( - FRONT_MATTER_TEMPLATE.format(metadata["src_alpha2"]) - + extra_markdown - + "\n* " - + content.replace("download", "download original weights") - ) - - items = "\n\n".join([f"- {k}: {v}" for k, v in metadata.items()]) - sec3 = "\n### System Info: \n" + items - content += sec3 - if dry_run: - return content, metadata - sub_dir = save_dir / f"opus-mt-{hf_model_name}" - sub_dir.mkdir(exist_ok=True) - dest = sub_dir / "README.md" - dest.open("w").write(content) - pd.Series(metadata).to_json(sub_dir / "metadata.json") - - # if dry_run: - return content, metadata - - -def make_registry(repo_path="Opus-MT-train/models"): - if not (Path(repo_path) / "fr-en" / "README.md").exists(): - raise ValueError( - f"repo_path:{repo_path} does not exist: " - "You must run: git clone git@github.com:Helsinki-NLP/Opus-MT-train.git before calling." - ) - results = {} - for p in Path(repo_path).iterdir(): - n_dash = p.name.count("-") - if n_dash == 0: - continue - else: - lns = list(open(p / "README.md").readlines()) - results[p.name] = _parse_readme(lns) - return [(k, v["pre-processing"], v["download"], v["download"][:-4] + ".test.txt") for k, v in results.items()] - - -def convert_all_sentencepiece_models(model_list=None, repo_path=None, dest_dir=Path("marian_converted")): - """Requires 300GB""" - save_dir = Path("marian_ckpt") - dest_dir = Path(dest_dir) - dest_dir.mkdir(exist_ok=True) - save_paths = [] - if model_list is None: - model_list: list = make_registry(repo_path=repo_path) - for k, prepro, download, test_set_url in tqdm(model_list): - if "SentencePiece" not in prepro: # dont convert BPE models. - continue - if not os.path.exists(save_dir / k): - download_and_unzip(download, save_dir / k) - pair_name = convert_opus_name_to_hf_name(k) - convert(save_dir / k, dest_dir / f"opus-mt-{pair_name}") - - save_paths.append(dest_dir / f"opus-mt-{pair_name}") - return save_paths - - -def lmap(f, x) -> list: - return list(map(f, x)) - - -def fetch_test_set(test_set_url): - import wget - - fname = wget.download(test_set_url, "opus_test.txt") - lns = Path(fname).open().readlines() - src = lmap(str.strip, lns[::4]) - gold = lmap(str.strip, lns[1::4]) - mar_model = lmap(str.strip, lns[2::4]) - if not (len(gold) == len(mar_model) == len(src)): - raise ValueError(f"Gold, marian and source lengths {len(gold)}, {len(mar_model)}, {len(src)} mismatched") - os.remove(fname) - return src, mar_model, gold - - -def convert_whole_dir(path=Path("marian_ckpt/")): - for subdir in tqdm(list(path.ls())): - dest_dir = f"marian_converted/{subdir.name}" - if (dest_dir / "pytorch_model.bin").exists(): - continue - convert(source_dir, dest_dir) - - -def _parse_readme(lns): - """Get link and metadata from opus model card equivalent.""" - subres = {} - for ln in [x.strip() for x in lns]: - if not ln.startswith("*"): - continue - ln = ln[1:].strip() - - for k in ["download", "dataset", "models", "model", "pre-processing"]: - if ln.startswith(k): - break - else: - continue - if k in ["dataset", "model", "pre-processing"]: - splat = ln.split(":") - _, v = splat - subres[k] = v - elif k == "download": - v = ln.split("(")[-1][:-1] - subres[k] = v - return subres - - -def save_tokenizer_config(dest_dir: Path, separate_vocabs=False): - dname = dest_dir.name.split("-") - dct = {"target_lang": dname[-1], "source_lang": "-".join(dname[:-1]), "separate_vocabs": separate_vocabs} - save_json(dct, dest_dir / "tokenizer_config.json") - - -def add_to_vocab_(vocab: 
dict[str, int], special_tokens: list[str]): - start = max(vocab.values()) + 1 - added = 0 - for tok in special_tokens: - if tok in vocab: - continue - vocab[tok] = start + added - added += 1 - return added - - -def find_vocab_file(model_dir): - return list(model_dir.glob("*vocab.yml"))[0] - - -def find_src_vocab_file(model_dir): - return list(model_dir.glob("*src.vocab.yml"))[0] - - -def find_tgt_vocab_file(model_dir): - return list(model_dir.glob("*trg.vocab.yml"))[0] - - -def add_special_tokens_to_vocab(model_dir: Path, separate_vocab=False) -> None: - if separate_vocab: - vocab = load_yaml(find_src_vocab_file(model_dir)) - vocab = {k: int(v) for k, v in vocab.items()} - num_added = add_to_vocab_(vocab, [" "]) - save_json(vocab, model_dir / "vocab.json") - - vocab = load_yaml(find_tgt_vocab_file(model_dir)) - vocab = {k: int(v) for k, v in vocab.items()} - num_added = add_to_vocab_(vocab, [" "]) - save_json(vocab, model_dir / "target_vocab.json") - save_tokenizer_config(model_dir, separate_vocabs=separate_vocab) - else: - vocab = load_yaml(find_vocab_file(model_dir)) - vocab = {k: int(v) for k, v in vocab.items()} - num_added = add_to_vocab_(vocab, [" "]) - print(f"added {num_added} tokens to vocab") - save_json(vocab, model_dir / "vocab.json") - save_tokenizer_config(model_dir) - - -def check_equal(marian_cfg, k1, k2): - v1, v2 = marian_cfg[k1], marian_cfg[k2] - if v1 != v2: - raise ValueError(f"hparams {k1},{k2} differ: {v1} != {v2}") - - -def check_marian_cfg_assumptions(marian_cfg): - assumed_settings = { - "layer-normalization": False, - "right-left": False, - "transformer-ffn-depth": 2, - "transformer-aan-depth": 2, - "transformer-no-projection": False, - "transformer-postprocess-emb": "d", - "transformer-postprocess": "dan", # Dropout, add, normalize - "transformer-preprocess": "", - "type": "transformer", - "ulr-dim-emb": 0, - "dec-cell-base-depth": 2, - "dec-cell-high-depth": 1, - "transformer-aan-nogate": False, - } - for k, v in assumed_settings.items(): - actual = marian_cfg[k] - if actual != v: - raise ValueError(f"Unexpected config value for {k} expected {v} got {actual}") - - -BIAS_KEY = "decoder_ff_logit_out_b" -BART_CONVERTER = { # for each encoder and decoder layer - "self_Wq": "self_attn.q_proj.weight", - "self_Wk": "self_attn.k_proj.weight", - "self_Wv": "self_attn.v_proj.weight", - "self_Wo": "self_attn.out_proj.weight", - "self_bq": "self_attn.q_proj.bias", - "self_bk": "self_attn.k_proj.bias", - "self_bv": "self_attn.v_proj.bias", - "self_bo": "self_attn.out_proj.bias", - "self_Wo_ln_scale": "self_attn_layer_norm.weight", - "self_Wo_ln_bias": "self_attn_layer_norm.bias", - "ffn_W1": "fc1.weight", - "ffn_b1": "fc1.bias", - "ffn_W2": "fc2.weight", - "ffn_b2": "fc2.bias", - "ffn_ffn_ln_scale": "final_layer_norm.weight", - "ffn_ffn_ln_bias": "final_layer_norm.bias", - # Decoder Cross Attention - "context_Wk": "encoder_attn.k_proj.weight", - "context_Wo": "encoder_attn.out_proj.weight", - "context_Wq": "encoder_attn.q_proj.weight", - "context_Wv": "encoder_attn.v_proj.weight", - "context_bk": "encoder_attn.k_proj.bias", - "context_bo": "encoder_attn.out_proj.bias", - "context_bq": "encoder_attn.q_proj.bias", - "context_bv": "encoder_attn.v_proj.bias", - "context_Wo_ln_scale": "encoder_attn_layer_norm.weight", - "context_Wo_ln_bias": "encoder_attn_layer_norm.bias", -} - - -class OpusState: - def __init__(self, source_dir, eos_token_id=0): - npz_path = find_model_file(source_dir) - self.state_dict = np.load(npz_path) - cfg = load_config_from_state_dict(self.state_dict) - 
if cfg["dim-vocabs"][0] != cfg["dim-vocabs"][1]: - raise ValueError - if "Wpos" in self.state_dict: - raise ValueError("Wpos key in state dictionary") - self.state_dict = dict(self.state_dict) - if cfg["tied-embeddings-all"]: - cfg["tied-embeddings-src"] = True - cfg["tied-embeddings"] = True - self.share_encoder_decoder_embeddings = cfg["tied-embeddings-src"] - - # create the tokenizer here because we need to know the eos_token_id - self.source_dir = source_dir - self.tokenizer = self.load_tokenizer() - # retrieve EOS token and set correctly - tokenizer_has_eos_token_id = ( - hasattr(self.tokenizer, "eos_token_id") and self.tokenizer.eos_token_id is not None - ) - eos_token_id = self.tokenizer.eos_token_id if tokenizer_has_eos_token_id else 0 - - if cfg["tied-embeddings-src"]: - self.wemb, self.final_bias = add_emb_entries(self.state_dict["Wemb"], self.state_dict[BIAS_KEY], 1) - self.pad_token_id = self.wemb.shape[0] - 1 - cfg["vocab_size"] = self.pad_token_id + 1 - else: - self.wemb, _ = add_emb_entries(self.state_dict["encoder_Wemb"], self.state_dict[BIAS_KEY], 1) - self.dec_wemb, self.final_bias = add_emb_entries( - self.state_dict["decoder_Wemb"], self.state_dict[BIAS_KEY], 1 - ) - # still assuming that vocab size is same for encoder and decoder - self.pad_token_id = self.wemb.shape[0] - 1 - cfg["vocab_size"] = self.pad_token_id + 1 - cfg["decoder_vocab_size"] = self.pad_token_id + 1 - - if cfg["vocab_size"] != self.tokenizer.vocab_size: - raise ValueError( - f"Original vocab size {cfg['vocab_size']} and new vocab size {len(self.tokenizer.encoder)} mismatched." - ) - - # self.state_dict['Wemb'].sha - self.state_keys = list(self.state_dict.keys()) - if "Wtype" in self.state_dict: - raise ValueError("Wtype key in state dictionary") - self._check_layer_entries() - self.cfg = cfg - hidden_size, intermediate_shape = self.state_dict["encoder_l1_ffn_W1"].shape - if hidden_size != cfg["dim-emb"]: - raise ValueError(f"Hidden size {hidden_size} and configured size {cfg['dim_emb']} mismatched") - - # Process decoder.yml - decoder_yml = cast_marian_config(load_yaml(source_dir / "decoder.yml")) - check_marian_cfg_assumptions(cfg) - self.hf_config = MarianConfig( - vocab_size=cfg["vocab_size"], - decoder_vocab_size=cfg.get("decoder_vocab_size", cfg["vocab_size"]), - share_encoder_decoder_embeddings=cfg["tied-embeddings-src"], - decoder_layers=cfg["dec-depth"], - encoder_layers=cfg["enc-depth"], - decoder_attention_heads=cfg["transformer-heads"], - encoder_attention_heads=cfg["transformer-heads"], - decoder_ffn_dim=cfg["transformer-dim-ffn"], - encoder_ffn_dim=cfg["transformer-dim-ffn"], - d_model=cfg["dim-emb"], - activation_function=cfg["transformer-ffn-activation"], - pad_token_id=self.pad_token_id, - eos_token_id=eos_token_id, - forced_eos_token_id=eos_token_id, - bos_token_id=0, - max_position_embeddings=cfg["dim-emb"], - scale_embedding=True, - normalize_embedding="n" in cfg["transformer-preprocess"], - static_position_embeddings=not cfg["transformer-train-position-embeddings"], - tie_word_embeddings=cfg["tied-embeddings"], - dropout=0.1, # see opus-mt-train repo/transformer-dropout param. 
- # default: add_final_layer_norm=False, - num_beams=decoder_yml["beam-size"], - decoder_start_token_id=self.pad_token_id, - bad_words_ids=[[self.pad_token_id]], - max_length=512, - ) - - def _check_layer_entries(self): - self.encoder_l1 = self.sub_keys("encoder_l1") - self.decoder_l1 = self.sub_keys("decoder_l1") - self.decoder_l2 = self.sub_keys("decoder_l2") - if len(self.encoder_l1) != 16: - warnings.warn(f"Expected 16 keys for each encoder layer, got {len(self.encoder_l1)}") - if len(self.decoder_l1) != 26: - warnings.warn(f"Expected 26 keys for each decoder layer, got {len(self.decoder_l1)}") - if len(self.decoder_l2) != 26: - warnings.warn(f"Expected 26 keys for each decoder layer, got {len(self.decoder_l1)}") - - @property - def extra_keys(self): - extra = [] - for k in self.state_keys: - if ( - k.startswith("encoder_l") - or k.startswith("decoder_l") - or k in [CONFIG_KEY, "Wemb", "encoder_Wemb", "decoder_Wemb", "Wpos", "decoder_ff_logit_out_b"] - ): - continue - else: - extra.append(k) - return extra - - def sub_keys(self, layer_prefix): - return [remove_prefix(k, layer_prefix) for k in self.state_dict if k.startswith(layer_prefix)] - - def load_tokenizer(self): - # save tokenizer - add_special_tokens_to_vocab(self.source_dir, not self.share_encoder_decoder_embeddings) - return MarianTokenizer.from_pretrained(str(self.source_dir)) - - def load_marian_model(self) -> MarianMTModel: - state_dict, cfg = self.state_dict, self.hf_config - - if not cfg.static_position_embeddings: - raise ValueError("config.static_position_embeddings should be True") - model = MarianMTModel(cfg) - - if "hidden_size" in cfg.to_dict(): - raise ValueError("hidden_size is in config") - load_layers_( - model.model.encoder.layers, - state_dict, - BART_CONVERTER, - ) - load_layers_(model.model.decoder.layers, state_dict, BART_CONVERTER, is_decoder=True) - - # handle tensors not associated with layers - if self.cfg["tied-embeddings-src"]: - wemb_tensor = nn.Parameter(torch.FloatTensor(self.wemb)) - bias_tensor = nn.Parameter(torch.FloatTensor(self.final_bias)) - model.model.shared.weight = wemb_tensor - model.model.encoder.embed_tokens = model.model.decoder.embed_tokens = model.model.shared - else: - wemb_tensor = nn.Parameter(torch.FloatTensor(self.wemb)) - model.model.encoder.embed_tokens.weight = wemb_tensor - - decoder_wemb_tensor = nn.Parameter(torch.FloatTensor(self.dec_wemb)) - bias_tensor = nn.Parameter(torch.FloatTensor(self.final_bias)) - model.model.decoder.embed_tokens.weight = decoder_wemb_tensor - - # handle tied embeddings, otherwise "from_pretrained" loads them incorrectly - if self.cfg["tied-embeddings"]: - model.lm_head.weight.data = model.model.decoder.embed_tokens.weight.data.clone() - - model.final_logits_bias = bias_tensor - - if "Wpos" in state_dict: - print("Unexpected: got Wpos") - wpos_tensor = torch.tensor(state_dict["Wpos"]) - model.model.encoder.embed_positions.weight = wpos_tensor - model.model.decoder.embed_positions.weight = wpos_tensor - - if cfg.normalize_embedding: - if "encoder_emb_ln_scale_pre" not in state_dict: - raise ValueError("encoder_emb_ln_scale_pre is not in state dictionary") - raise NotImplementedError("Need to convert layernorm_embedding") - - if self.extra_keys: - raise ValueError(f"Failed to convert {self.extra_keys}") - - if model.get_input_embeddings().padding_idx != self.pad_token_id: - raise ValueError( - f"Padding tokens {model.get_input_embeddings().padding_idx} and {self.pad_token_id} mismatched" - ) - return model - - -def download_and_unzip(url, 
dest_dir): - try: - import wget - except ImportError: - raise ImportError("you must pip install wget") - - filename = wget.download(url) - unzip(filename, dest_dir) - os.remove(filename) - - -def convert(source_dir: Path, dest_dir): - dest_dir = Path(dest_dir) - dest_dir.mkdir(exist_ok=True) - - opus_state = OpusState(source_dir) - - # save tokenizer - opus_state.tokenizer.save_pretrained(dest_dir) - - # save_json(opus_state.cfg, dest_dir / "marian_original_config.json") - # ^^ Uncomment to save human readable marian config for debugging - - model = opus_state.load_marian_model() - model = model.half() - model.save_pretrained(dest_dir) - model.from_pretrained(dest_dir) # sanity check - - -def load_yaml(path): - import yaml - - with open(path, encoding="utf-8") as f: - return yaml.load(f, Loader=yaml.BaseLoader) - - -def save_json(content: Union[dict, list], path: str) -> None: - with open(path, "w") as f: - json.dump(content, f) - - -def unzip(zip_path: str, dest_dir: str) -> None: - with ZipFile(zip_path, "r") as zipObj: - zipObj.extractall(dest_dir) - - -if __name__ == "__main__": - """ - Tatoeba conversion instructions in scripts/tatoeba/README.md - """ - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--src", - type=str, - help="path to marian model sub dir. yaml.load will be used to load the configuration file, please be wary of which file you're loading.", - default="en-de", - ) - parser.add_argument("--dest", type=str, default=None, help="Path to the output PyTorch model.") - args = parser.parse_args() - - source_dir = Path(args.src) - if not source_dir.exists(): - raise ValueError(f"Source directory {source_dir} not found") - dest_dir = f"converted-{source_dir.name}" if args.dest is None else args.dest - convert(source_dir, dest_dir) diff --git a/src/transformers/models/mask2former/convert_mask2former_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/mask2former/convert_mask2former_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 33cba259eed4..000000000000 --- a/src/transformers/models/mask2former/convert_mask2former_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,1020 +0,0 @@ -# coding=utf-8 -# Copyright 2022 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
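A pattern that recurs throughout the Mask2Former converter below is splitting a fused qkv projection from the original Swin backbone into separate query/key/value weights by slicing along the first dimension. A minimal standalone sketch of that slicing on random tensors (the hidden size of 96 is a toy value, not tied to any particular checkpoint):

import torch

hidden = 96
qkv_weight = torch.randn(3 * hidden, hidden)  # fused [q; k; v] projection
qkv_bias = torch.randn(3 * hidden)

offset = qkv_weight.shape[0] // 3
split = {
    "query.weight": qkv_weight[:offset, :],
    "query.bias": qkv_bias[:offset],
    "key.weight": qkv_weight[offset : 2 * offset, :],
    "key.bias": qkv_bias[offset : 2 * offset],
    "value.weight": qkv_weight[-offset:, :],
    "value.bias": qkv_bias[-offset:],
}
assert all(t.shape[0] == hidden for t in split.values())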
-import json -import sys -from argparse import ArgumentParser -from collections.abc import Iterator -from dataclasses import dataclass -from pathlib import Path -from pprint import pformat -from typing import Any - -import requests -import torch -import torchvision.transforms as T -from detectron2.checkpoint import DetectionCheckpointer -from detectron2.config import get_cfg -from detectron2.projects.deeplab import add_deeplab_config -from huggingface_hub import hf_hub_download -from PIL import Image -from torch import Tensor, nn - -from transformers import ( - Mask2FormerConfig, - Mask2FormerForUniversalSegmentation, - Mask2FormerImageProcessor, - Mask2FormerModel, - SwinConfig, -) -from transformers.models.mask2former.modeling_mask2former import ( - Mask2FormerForUniversalSegmentationOutput, - Mask2FormerModelOutput, -) -from transformers.utils import logging - - -StateDict = dict[str, Tensor] - -logging.set_verbosity_info() -logger = logging.get_logger() - -torch.manual_seed(0) - - -class TrackedStateDict: - def __init__(self, to_track: dict): - """This class "tracks" a python dictionary by keeping track of which item is accessed. - - Args: - to_track (Dict): The dictionary we wish to track - """ - self.to_track = to_track - self._seen: set[str] = set() - - def __getitem__(self, key: str) -> Any: - return self.to_track[key] - - def __setitem__(self, key: str, item: Any): - self._seen.add(key) - self.to_track[key] = item - - def diff(self) -> list[str]: - """This method returns a set difference between the keys in the tracked state dict and the one we have access so far. - This is an effective method to check if we have update all the keys - - Returns: - list[str]: List of keys not yet updated - """ - return set(self.to_track.keys()) - self._seen - - def copy(self) -> dict: - # proxy the call to the internal dictionary - return self.to_track.copy() - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - img_data = requests.get(url, stream=True).raw - im = Image.open(img_data) - return im - - -@dataclass -class Args: - """Fake command line arguments needed by mask2former/detectron implementation""" - - config_file: str - - -def setup_cfg(args: Args): - # load config from file and command-line arguments - cfg = get_cfg() - add_deeplab_config(cfg) - add_maskformer2_config(cfg) - cfg.merge_from_file(args.config_file) - cfg.freeze() - return cfg - - -class OriginalMask2FormerConfigToOursConverter: - def __call__(self, original_config: object) -> Mask2FormerConfig: - model = original_config.MODEL - - repo_id = "huggingface/label-files" - if model.SEM_SEG_HEAD.NUM_CLASSES == 847: - filename = "mask2former-ade20k-full-id2label.json" - elif model.SEM_SEG_HEAD.NUM_CLASSES == 150: - filename = "ade20k-id2label.json" - elif model.SEM_SEG_HEAD.NUM_CLASSES == 80: - filename = "coco-detection-mmdet-id2label.json" - elif model.SEM_SEG_HEAD.NUM_CLASSES == 171: - filename = "mask2former-coco-stuff-id2label.json" - elif model.SEM_SEG_HEAD.NUM_CLASSES == 133: - filename = "coco-panoptic-id2label.json" - elif model.SEM_SEG_HEAD.NUM_CLASSES == 19: - filename = "cityscapes-id2label.json" - elif model.SEM_SEG_HEAD.NUM_CLASSES == 8: - filename = "cityscapes-instance-id2label.json" - elif model.SEM_SEG_HEAD.NUM_CLASSES == 65: - filename = "mapillary-vistas-id2label.json" - - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - 
label2id = {label: idx for idx, label in id2label.items()} - - if model.SWIN.EMBED_DIM == 96: - backbone_config = SwinConfig.from_pretrained( - "microsoft/swin-tiny-patch4-window7-224", out_features=["stage1", "stage2", "stage3", "stage4"] - ) - elif model.SWIN.EMBED_DIM == 128: - backbone_config = SwinConfig( - embed_dim=128, - window_size=12, - depths=(2, 2, 18, 2), - num_heads=(4, 8, 16, 32), - out_features=["stage1", "stage2", "stage3", "stage4"], - ) - - elif model.SWIN.EMBED_DIM == 192: - backbone_config = SwinConfig.from_pretrained( - "microsoft/swin-large-patch4-window12-384", out_features=["stage1", "stage2", "stage3", "stage4"] - ) - else: - raise ValueError(f"embed dim {model.SWIN.EMBED_DIM} not supported for Swin!") - - backbone_config.drop_path_rate = model.SWIN.DROP_PATH_RATE - backbone_config.attention_probs_dropout_prob = model.SWIN.ATTN_DROP_RATE - backbone_config.depths = model.SWIN.DEPTHS - - config: Mask2FormerConfig = Mask2FormerConfig( - ignore_value=model.SEM_SEG_HEAD.IGNORE_VALUE, - num_labels=model.SEM_SEG_HEAD.NUM_CLASSES, - num_queries=model.MASK_FORMER.NUM_OBJECT_QUERIES, - no_object_weight=model.MASK_FORMER.NO_OBJECT_WEIGHT, - class_weight=model.MASK_FORMER.CLASS_WEIGHT, - mask_weight=model.MASK_FORMER.MASK_WEIGHT, - dice_weight=model.MASK_FORMER.DICE_WEIGHT, - train_num_points=model.MASK_FORMER.TRAIN_NUM_POINTS, - oversample_ratio=model.MASK_FORMER.OVERSAMPLE_RATIO, - importance_sample_ratio=model.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO, - init_std=0.02, - init_xavier_std=1.0, - use_auxiliary_loss=model.MASK_FORMER.DEEP_SUPERVISION, - feature_strides=[4, 8, 16, 32], - backbone_config=backbone_config, - id2label=id2label, - label2id=label2id, - feature_size=model.SEM_SEG_HEAD.CONVS_DIM, - mask_feature_size=model.SEM_SEG_HEAD.MASK_DIM, - hidden_dim=model.MASK_FORMER.HIDDEN_DIM, - encoder_layers=model.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS, - encoder_feedforward_dim=1024, - decoder_layers=model.MASK_FORMER.DEC_LAYERS, - num_attention_heads=model.MASK_FORMER.NHEADS, - dropout=model.MASK_FORMER.DROPOUT, - dim_feedforward=model.MASK_FORMER.DIM_FEEDFORWARD, - pre_norm=model.MASK_FORMER.PRE_NORM, - enforce_input_proj=model.MASK_FORMER.ENFORCE_INPUT_PROJ, - common_stride=model.SEM_SEG_HEAD.COMMON_STRIDE, - ) - return config - - -class OriginalMask2FormerConfigToImageProcessorConverter: - def __call__(self, original_config: object) -> Mask2FormerImageProcessor: - model = original_config.MODEL - model_input = original_config.INPUT - - return Mask2FormerImageProcessor( - image_mean=(torch.tensor(model.PIXEL_MEAN) / 255).tolist(), - image_std=(torch.tensor(model.PIXEL_STD) / 255).tolist(), - size=model_input.MIN_SIZE_TEST, - max_size=model_input.MAX_SIZE_TEST, - num_labels=model.SEM_SEG_HEAD.NUM_CLASSES, - ignore_index=model.SEM_SEG_HEAD.IGNORE_VALUE, - size_divisibility=32, - ) - - -class OriginalMask2FormerCheckpointToOursConverter: - def __init__(self, original_model: nn.Module, config: Mask2FormerConfig): - self.original_model = original_model - self.config = config - - def pop_all(self, renamed_keys: list[tuple[str, str]], dst_state_dict: StateDict, src_state_dict: StateDict): - for src_key, dst_key in renamed_keys: - dst_state_dict[dst_key] = src_state_dict.pop(src_key) - - def replace_maskformer_swin_backbone( - self, dst_state_dict: StateDict, src_state_dict: StateDict, config: Mask2FormerConfig - ): - dst_prefix: str = "pixel_level_module.encoder" - src_prefix: str = "backbone" - - renamed_keys = [ - ( - f"{src_prefix}.patch_embed.proj.weight", - 
f"{dst_prefix}.model.embeddings.patch_embeddings.projection.weight", - ), - (f"{src_prefix}.patch_embed.proj.bias", f"{dst_prefix}.model.embeddings.patch_embeddings.projection.bias"), - (f"{src_prefix}.patch_embed.norm.weight", f"{dst_prefix}.model.embeddings.norm.weight"), - (f"{src_prefix}.patch_embed.norm.bias", f"{dst_prefix}.model.embeddings.norm.bias"), - ] - num_layers = len(config.backbone_config.depths) - for layer_idx in range(num_layers): - for block_idx in range(config.backbone_config.depths[layer_idx]): - renamed_keys.extend( - [ # src, dst - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.norm1.weight", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.layernorm_before.weight", - ), - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.norm1.bias", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.layernorm_before.bias", - ), - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.attn.relative_position_bias_table", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.self.relative_position_bias_table", - ), - ] - ) - # now we need to handle the attentions - # read in weights + bias of input projection layer of cross-attention - - src_att_weight = src_state_dict[f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.attn.qkv.weight"] - src_att_bias = src_state_dict[f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.attn.qkv.bias"] - - size = src_att_weight.shape[0] - offset = size // 3 - dst_state_dict[ - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.self.query.weight" - ] = src_att_weight[:offset, :] - dst_state_dict[ - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.self.query.bias" - ] = src_att_bias[:offset] - - dst_state_dict[ - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.self.key.weight" - ] = src_att_weight[offset : offset * 2, :] - dst_state_dict[ - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.self.key.bias" - ] = src_att_bias[offset : offset * 2] - - dst_state_dict[ - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.self.value.weight" - ] = src_att_weight[-offset:, :] - dst_state_dict[ - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.self.value.bias" - ] = src_att_bias[-offset:] - - # let's pop them - src_state_dict.pop(f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.attn.qkv.weight") - src_state_dict.pop(f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.attn.qkv.bias") - # proj - renamed_keys.extend( - [ - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.attn.proj.weight", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.output.dense.weight", - ), - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.attn.proj.bias", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.output.dense.bias", - ), - ] - ) - - # second norm - renamed_keys.extend( - [ - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.norm2.weight", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.layernorm_after.weight", - ), - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.norm2.bias", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.layernorm_after.bias", - ), - ] - ) - - # mlp - renamed_keys.extend( - [ - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.mlp.fc1.weight", - 
f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.intermediate.dense.weight", - ), - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.mlp.fc1.bias", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.intermediate.dense.bias", - ), - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.mlp.fc2.weight", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.output.dense.weight", - ), - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.mlp.fc2.bias", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.output.dense.bias", - ), - ] - ) - - renamed_keys.extend( - [ - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.attn.relative_position_index", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.self.relative_position_index", - ) - ] - ) - - if layer_idx < num_layers - 1: - # patch merging - renamed_keys.extend( - [ - ( - f"{src_prefix}.layers.{layer_idx}.downsample.reduction.weight", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.downsample.reduction.weight", - ), - ( - f"{src_prefix}.layers.{layer_idx}.downsample.norm.weight", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.downsample.norm.weight", - ), - ( - f"{src_prefix}.layers.{layer_idx}.downsample.norm.bias", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.downsample.norm.bias", - ), - ] - ) - - # hidden states norms - renamed_keys.extend( - [ - ( - f"{src_prefix}.norm{layer_idx}.weight", - f"{dst_prefix}.hidden_states_norms.{layer_idx}.weight", - ), - ( - f"{src_prefix}.norm{layer_idx}.bias", - f"{dst_prefix}.hidden_states_norms.{layer_idx}.bias", - ), - ] - ) - self.pop_all(renamed_keys, dst_state_dict, src_state_dict) - - def replace_swin_backbone(self, dst_state_dict: StateDict, src_state_dict: StateDict, config: Mask2FormerConfig): - dst_prefix: str = "pixel_level_module.encoder" - src_prefix: str = "backbone" - - renamed_keys = [ - ( - f"{src_prefix}.patch_embed.proj.weight", - f"{dst_prefix}.embeddings.patch_embeddings.projection.weight", - ), - (f"{src_prefix}.patch_embed.proj.bias", f"{dst_prefix}.embeddings.patch_embeddings.projection.bias"), - (f"{src_prefix}.patch_embed.norm.weight", f"{dst_prefix}.embeddings.norm.weight"), - (f"{src_prefix}.patch_embed.norm.bias", f"{dst_prefix}.embeddings.norm.bias"), - ] - - for layer_idx in range(len(config.backbone_config.depths)): - for block_idx in range(config.backbone_config.depths[layer_idx]): - renamed_keys.extend( - [ # src, dst - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.norm1.weight", - f"{dst_prefix}.encoder.layers.{layer_idx}.blocks.{block_idx}.layernorm_before.weight", - ), - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.norm1.bias", - f"{dst_prefix}.encoder.layers.{layer_idx}.blocks.{block_idx}.layernorm_before.bias", - ), - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.attn.relative_position_bias_table", - f"{dst_prefix}.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.self.relative_position_bias_table", - ), - ] - ) - # now we need to handle the attentions - # read in weights + bias of input projection layer of cross-attention - - src_att_weight = src_state_dict[f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.attn.qkv.weight"] - src_att_bias = src_state_dict[f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.attn.qkv.bias"] - - size = src_att_weight.shape[0] - offset = size // 3 - dst_state_dict[ - f"{dst_prefix}.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.self.query.weight" - ] = 
src_att_weight[:offset, :] - dst_state_dict[ - f"{dst_prefix}.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.self.query.bias" - ] = src_att_bias[:offset] - - dst_state_dict[ - f"{dst_prefix}.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.self.key.weight" - ] = src_att_weight[offset : offset * 2, :] - dst_state_dict[ - f"{dst_prefix}.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.self.key.bias" - ] = src_att_bias[offset : offset * 2] - - dst_state_dict[ - f"{dst_prefix}.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.self.value.weight" - ] = src_att_weight[-offset:, :] - dst_state_dict[ - f"{dst_prefix}.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.self.value.bias" - ] = src_att_bias[-offset:] - - # let's pop them - src_state_dict.pop(f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.attn.qkv.weight") - src_state_dict.pop(f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.attn.qkv.bias") - # proj - renamed_keys.extend( - [ - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.attn.proj.weight", - f"{dst_prefix}.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.output.dense.weight", - ), - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.attn.proj.bias", - f"{dst_prefix}.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.output.dense.bias", - ), - ] - ) - - # second norm - renamed_keys.extend( - [ - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.norm2.weight", - f"{dst_prefix}.encoder.layers.{layer_idx}.blocks.{block_idx}.layernorm_after.weight", - ), - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.norm2.bias", - f"{dst_prefix}.encoder.layers.{layer_idx}.blocks.{block_idx}.layernorm_after.bias", - ), - ] - ) - - # mlp - renamed_keys.extend( - [ - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.mlp.fc1.weight", - f"{dst_prefix}.encoder.layers.{layer_idx}.blocks.{block_idx}.intermediate.dense.weight", - ), - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.mlp.fc1.bias", - f"{dst_prefix}.encoder.layers.{layer_idx}.blocks.{block_idx}.intermediate.dense.bias", - ), - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.mlp.fc2.weight", - f"{dst_prefix}.encoder.layers.{layer_idx}.blocks.{block_idx}.output.dense.weight", - ), - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.mlp.fc2.bias", - f"{dst_prefix}.encoder.layers.{layer_idx}.blocks.{block_idx}.output.dense.bias", - ), - ] - ) - - renamed_keys.extend( - [ - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.attn.relative_position_index", - f"{dst_prefix}.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.self.relative_position_index", - ) - ] - ) - - if layer_idx < 3: - # patch merging - renamed_keys.extend( - [ - ( - f"{src_prefix}.layers.{layer_idx}.downsample.reduction.weight", - f"{dst_prefix}.encoder.layers.{layer_idx}.downsample.reduction.weight", - ), - ( - f"{src_prefix}.layers.{layer_idx}.downsample.norm.weight", - f"{dst_prefix}.encoder.layers.{layer_idx}.downsample.norm.weight", - ), - ( - f"{src_prefix}.layers.{layer_idx}.downsample.norm.bias", - f"{dst_prefix}.encoder.layers.{layer_idx}.downsample.norm.bias", - ), - ] - ) - - # hidden states norms - renamed_keys.extend( - [ - ( - f"{src_prefix}.norm{layer_idx}.weight", - f"{dst_prefix}.hidden_states_norms.stage{layer_idx + 1}.weight", - ), - ( - f"{src_prefix}.norm{layer_idx}.bias", - f"{dst_prefix}.hidden_states_norms.stage{layer_idx + 1}.bias", - ), - ] - ) - self.pop_all(renamed_keys, dst_state_dict, src_state_dict) - - # Backbone + Pixel 
Decoder - def replace_pixel_module(self, dst_state_dict: StateDict, src_state_dict: StateDict): - dst_prefix: str = "pixel_level_module.decoder" - src_prefix: str = "sem_seg_head.pixel_decoder" - - self.replace_swin_backbone(dst_state_dict, src_state_dict, self.config) - - def rename_keys_for_weight_bias(src_prefix: str, dst_prefix: str): - return [ - (f"{src_prefix}.weight", f"{dst_prefix}.weight"), - (f"{src_prefix}.bias", f"{dst_prefix}.bias"), - ] - - def rename_keys_for_self_attn(src_prefix: str, dst_prefix: str): - self_attn_keys = [] - self_attn_keys.extend( - rename_keys_for_weight_bias(f"{src_prefix}.attention_weights", f"{dst_prefix}.attention_weights") - ) - self_attn_keys.extend( - rename_keys_for_weight_bias(f"{src_prefix}.output_proj", f"{dst_prefix}.output_proj") - ) - self_attn_keys.extend( - rename_keys_for_weight_bias(f"{src_prefix}.sampling_offsets", f"{dst_prefix}.sampling_offsets") - ) - self_attn_keys.extend(rename_keys_for_weight_bias(f"{src_prefix}.value_proj", f"{dst_prefix}.value_proj")) - - return self_attn_keys - - def rename_keys_for_encoder_layer(src_prefix: str, dst_prefix: str): - encoder_keys = [] - encoder_keys.extend(rename_keys_for_weight_bias(f"{src_prefix}.linear1", f"{dst_prefix}.fc1")) - encoder_keys.extend(rename_keys_for_weight_bias(f"{src_prefix}.linear2", f"{dst_prefix}.fc2")) - encoder_keys.extend( - rename_keys_for_weight_bias(f"{src_prefix}.norm1", f"{dst_prefix}.self_attn_layer_norm") - ) - encoder_keys.extend(rename_keys_for_weight_bias(f"{src_prefix}.norm2", f"{dst_prefix}.final_layer_norm")) - encoder_keys.extend(rename_keys_for_self_attn(f"{src_prefix}.self_attn", f"{dst_prefix}.self_attn")) - - return encoder_keys - - # convolution layer for final features - renamed_keys = [ - (f"{src_prefix}.adapter_1.weight", f"{dst_prefix}.adapter_1.0.weight"), - (f"{src_prefix}.adapter_1.norm.weight", f"{dst_prefix}.adapter_1.1.weight"), - (f"{src_prefix}.adapter_1.norm.bias", f"{dst_prefix}.adapter_1.1.bias"), - ] - - renamed_keys.extend( - [ - (f"{src_prefix}.layer_1.weight", f"{dst_prefix}.layer_1.0.weight"), - (f"{src_prefix}.layer_1.norm.weight", f"{dst_prefix}.layer_1.1.weight"), - (f"{src_prefix}.layer_1.norm.bias", f"{dst_prefix}.layer_1.1.bias"), - ] - ) - - # proj layers - for i in range(3): - for j in range(2): - renamed_keys.extend( - [ - (f"{src_prefix}.input_proj.{i}.{j}.weight", f"{dst_prefix}.input_projections.{i}.{j}.weight"), - (f"{src_prefix}.input_proj.{i}.{j}.bias", f"{dst_prefix}.input_projections.{i}.{j}.bias"), - ] - ) - - renamed_keys.extend([(f"{src_prefix}.transformer.level_embed", f"{dst_prefix}.level_embed")]) - - # layers - for layer_idx in range(self.config.encoder_layers): - renamed_keys.extend( - rename_keys_for_encoder_layer( - f"{src_prefix}.transformer.encoder.layers.{layer_idx}", f"{dst_prefix}.encoder.layers.{layer_idx}" - ) - ) - - # proj - renamed_keys.extend( - [ - (f"{src_prefix}.mask_features.weight", f"{dst_prefix}.mask_projection.weight"), - (f"{src_prefix}.mask_features.bias", f"{dst_prefix}.mask_projection.bias"), - ] - ) - self.pop_all(renamed_keys, dst_state_dict, src_state_dict) - - # Transformer Decoder - def rename_keys_in_masked_attention_decoder(self, dst_state_dict: StateDict, src_state_dict: StateDict): - dst_prefix: str = "transformer_module.decoder" - src_prefix: str = "sem_seg_head.predictor" - - rename_keys = [] - for i in range(self.config.decoder_layers - 1): - rename_keys.append( - ( - f"{src_prefix}.transformer_self_attention_layers.{i}.self_attn.out_proj.weight", - 
f"{dst_prefix}.layers.{i}.self_attn.out_proj.weight", - ) - ) - rename_keys.append( - ( - f"{src_prefix}.transformer_self_attention_layers.{i}.self_attn.out_proj.bias", - f"{dst_prefix}.layers.{i}.self_attn.out_proj.bias", - ) - ) - - rename_keys.append( - ( - f"{src_prefix}.transformer_self_attention_layers.{i}.norm.weight", - f"{dst_prefix}.layers.{i}.self_attn_layer_norm.weight", - ) - ) - rename_keys.append( - ( - f"{src_prefix}.transformer_self_attention_layers.{i}.norm.bias", - f"{dst_prefix}.layers.{i}.self_attn_layer_norm.bias", - ) - ) - - rename_keys.append( - ( - f"{src_prefix}.transformer_cross_attention_layers.{i}.multihead_attn.in_proj_weight", - f"{dst_prefix}.layers.{i}.cross_attn.in_proj_weight", - ) - ) - rename_keys.append( - ( - f"{src_prefix}.transformer_cross_attention_layers.{i}.multihead_attn.in_proj_bias", - f"{dst_prefix}.layers.{i}.cross_attn.in_proj_bias", - ) - ) - rename_keys.append( - ( - f"{src_prefix}.transformer_cross_attention_layers.{i}.multihead_attn.out_proj.weight", - f"{dst_prefix}.layers.{i}.cross_attn.out_proj.weight", - ) - ) - rename_keys.append( - ( - f"{src_prefix}.transformer_cross_attention_layers.{i}.multihead_attn.out_proj.bias", - f"{dst_prefix}.layers.{i}.cross_attn.out_proj.bias", - ) - ) - - rename_keys.append( - ( - f"{src_prefix}.transformer_cross_attention_layers.{i}.norm.weight", - f"{dst_prefix}.layers.{i}.cross_attn_layer_norm.weight", - ) - ) - rename_keys.append( - ( - f"{src_prefix}.transformer_cross_attention_layers.{i}.norm.bias", - f"{dst_prefix}.layers.{i}.cross_attn_layer_norm.bias", - ) - ) - - rename_keys.append( - (f"{src_prefix}.transformer_ffn_layers.{i}.linear1.weight", f"{dst_prefix}.layers.{i}.fc1.weight") - ) - rename_keys.append( - (f"{src_prefix}.transformer_ffn_layers.{i}.linear1.bias", f"{dst_prefix}.layers.{i}.fc1.bias") - ) - rename_keys.append( - (f"{src_prefix}.transformer_ffn_layers.{i}.linear2.weight", f"{dst_prefix}.layers.{i}.fc2.weight") - ) - rename_keys.append( - (f"{src_prefix}.transformer_ffn_layers.{i}.linear2.bias", f"{dst_prefix}.layers.{i}.fc2.bias") - ) - rename_keys.append( - ( - f"{src_prefix}.transformer_ffn_layers.{i}.norm.weight", - f"{dst_prefix}.layers.{i}.final_layer_norm.weight", - ) - ) - rename_keys.append( - ( - f"{src_prefix}.transformer_ffn_layers.{i}.norm.bias", - f"{dst_prefix}.layers.{i}.final_layer_norm.bias", - ) - ) - - return rename_keys - - def replace_masked_attention_decoder(self, dst_state_dict: StateDict, src_state_dict: StateDict): - dst_prefix: str = "transformer_module.decoder" - src_prefix: str = "sem_seg_head.predictor" - - renamed_keys = self.rename_keys_in_masked_attention_decoder(dst_state_dict, src_state_dict) - - # add more - renamed_keys.extend( - [ - (f"{src_prefix}.decoder_norm.weight", f"{dst_prefix}.layernorm.weight"), - (f"{src_prefix}.decoder_norm.bias", f"{dst_prefix}.layernorm.bias"), - ] - ) - - mlp_len = 3 - for i in range(mlp_len): - renamed_keys.extend( - [ - ( - f"{src_prefix}.mask_embed.layers.{i}.weight", - f"{dst_prefix}.mask_predictor.mask_embedder.{i}.0.weight", - ), - ( - f"{src_prefix}.mask_embed.layers.{i}.bias", - f"{dst_prefix}.mask_predictor.mask_embedder.{i}.0.bias", - ), - ] - ) - - self.pop_all(renamed_keys, dst_state_dict, src_state_dict) - - def replace_keys_qkv_transformer_decoder(self, dst_state_dict: StateDict, src_state_dict: StateDict): - dst_prefix: str = "transformer_module.decoder.layers" - src_prefix: str = "sem_seg_head.predictor" - for i in range(self.config.decoder_layers - 1): - # read in weights + bias of input 
projection layer of self-attention - in_proj_weight = src_state_dict.pop( - f"{src_prefix}.transformer_self_attention_layers.{i}.self_attn.in_proj_weight" - ) - in_proj_bias = src_state_dict.pop( - f"{src_prefix}.transformer_self_attention_layers.{i}.self_attn.in_proj_bias" - ) - # next, add query, keys and values (in that order) to the state dict - dst_state_dict[f"{dst_prefix}.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - dst_state_dict[f"{dst_prefix}.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - dst_state_dict[f"{dst_prefix}.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - dst_state_dict[f"{dst_prefix}.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - dst_state_dict[f"{dst_prefix}.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - dst_state_dict[f"{dst_prefix}.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - - def replace_transformer_module(self, dst_state_dict: StateDict, src_state_dict: StateDict): - dst_prefix: str = "transformer_module" - src_prefix: str = "sem_seg_head.predictor" - - self.replace_masked_attention_decoder(dst_state_dict, src_state_dict) - - renamed_keys = [ - (f"{src_prefix}.query_embed.weight", f"{dst_prefix}.queries_embedder.weight"), - (f"{src_prefix}.query_feat.weight", f"{dst_prefix}.queries_features.weight"), - (f"{src_prefix}.level_embed.weight", f"{dst_prefix}.level_embed.weight"), - ] - - self.pop_all(renamed_keys, dst_state_dict, src_state_dict) - self.replace_keys_qkv_transformer_decoder(dst_state_dict, src_state_dict) - - def replace_universal_segmentation_module(self, dst_state_dict: StateDict, src_state_dict: StateDict): - dst_prefix: str = "" - src_prefix: str = "sem_seg_head.predictor" - - renamed_keys = [ - (f"{src_prefix}.class_embed.weight", f"{dst_prefix}class_predictor.weight"), - (f"{src_prefix}.class_embed.bias", f"{dst_prefix}class_predictor.bias"), - ] - - logger.info(f"Replacing keys {pformat(renamed_keys)}") - self.pop_all(renamed_keys, dst_state_dict, src_state_dict) - - def convert(self, mask2former: Mask2FormerModel) -> Mask2FormerModel: - dst_state_dict = TrackedStateDict(mask2former.state_dict()) - src_state_dict = self.original_model.state_dict() - - self.replace_pixel_module(dst_state_dict, src_state_dict) - self.replace_transformer_module(dst_state_dict, src_state_dict) - - logger.info(f"Missed keys are {pformat(dst_state_dict.diff())}") - logger.info(f"Not copied keys are {pformat(src_state_dict.keys())}") - logger.info("🙌 Done") - - state_dict = {key: dst_state_dict[key] for key in dst_state_dict.to_track} - mask2former.load_state_dict(state_dict) - return mask2former - - def convert_universal_segmentation( - self, mask2former: Mask2FormerForUniversalSegmentation - ) -> Mask2FormerForUniversalSegmentation: - dst_state_dict = TrackedStateDict(mask2former.state_dict()) - src_state_dict = self.original_model.state_dict() - - self.replace_universal_segmentation_module(dst_state_dict, src_state_dict) - - state_dict = {key: dst_state_dict[key] for key in dst_state_dict.to_track} - mask2former.load_state_dict(state_dict) - - return mask2former - - @staticmethod - def using_dirs(checkpoints_dir: Path, config_dir: Path) -> Iterator[tuple[object, Path, Path]]: - checkpoints: list[Path] = checkpoints_dir.glob("**/*.pkl") - - for checkpoint in checkpoints: - logger.info(f"💪 Converting {checkpoint.stem}") - # find associated config file - - # dataset_name e.g 'coco' - dataset_name = checkpoint.parents[2].stem - if dataset_name == "ade": - dataset_name = dataset_name.replace("ade", "ade20k") - - # 
task type e.g 'instance-segmentation' - segmentation_task = checkpoint.parents[1].stem - - # config file corresponding to checkpoint - config_file_name = f"{checkpoint.parents[0].stem}.yaml" - - config: Path = config_dir / dataset_name / segmentation_task / "swin" / config_file_name - yield config, checkpoint - - -def test( - original_model, - our_model: Mask2FormerForUniversalSegmentation, - image_processor: Mask2FormerImageProcessor, - tolerance: float, -): - with torch.no_grad(): - original_model = original_model.eval() - our_model = our_model.eval() - - im = prepare_img() - x = image_processor(images=im, return_tensors="pt")["pixel_values"] - - original_model_backbone_features = original_model.backbone(x.clone()) - our_model_output: Mask2FormerModelOutput = our_model.model(x.clone(), output_hidden_states=True) - - # Test backbone - for original_model_feature, our_model_feature in zip( - original_model_backbone_features.values(), our_model_output.encoder_hidden_states - ): - assert torch.allclose(original_model_feature, our_model_feature, atol=tolerance), ( - "The backbone features are not the same." - ) - - # Test pixel decoder - mask_features, _, multi_scale_features = original_model.sem_seg_head.pixel_decoder.forward_features( - original_model_backbone_features - ) - - for original_model_feature, our_model_feature in zip( - multi_scale_features, our_model_output.pixel_decoder_hidden_states - ): - assert torch.allclose(original_model_feature, our_model_feature, atol=tolerance), ( - "The pixel decoder feature are not the same" - ) - - # Let's test the full model - tr_complete = T.Compose( - [T.Resize((384, 384)), T.ToTensor()], - ) - y = (tr_complete(im) * 255.0).to(torch.int).float() - - # modify original Mask2Former code to return mask and class logits - original_class_logits, original_mask_logits = original_model([{"image": y.clone().squeeze(0)}]) - - our_model_out: Mask2FormerForUniversalSegmentationOutput = our_model(x.clone()) - our_mask_logits = our_model_out.masks_queries_logits - our_class_logits = our_model_out.class_queries_logits - - assert original_mask_logits.shape == our_mask_logits.shape, "Output masks shapes are not matching." - assert original_class_logits.shape == our_class_logits.shape, "Output class logits shapes are not matching." - assert torch.allclose(original_class_logits, our_class_logits, atol=tolerance), ( - "The class logits are not the same." - ) - assert torch.allclose(original_mask_logits, our_mask_logits, atol=tolerance), ( - "The predicted masks are not the same." - ) - - logger.info("✅ Test passed!") - - -def get_model_name(checkpoint_file: Path): - # model_name_raw is something like maskformer2_swin_small_bs16_50ep - model_name_raw: str = checkpoint_file.parents[0].stem - - # `segmentation_task_type` must be one of the following: `instance-segmentation`, `panoptic-segmentation`, `semantic-segmentation` - segmentation_task_name: str = checkpoint_file.parents[1].stem - if segmentation_task_name not in ["instance-segmentation", "panoptic-segmentation", "semantic-segmentation"]: - raise ValueError( - f"{segmentation_task_name} must be wrong since acceptable values are: instance-segmentation," - " panoptic-segmentation, semantic-segmentation." 
- ) - - # dataset name must be one of the following: `coco`, `ade`, `cityscapes`, `mapillary-vistas` - dataset_name: str = checkpoint_file.parents[2].stem - if dataset_name not in ["coco", "ade", "cityscapes", "mapillary-vistas"]: - raise ValueError( - f"{dataset_name} must be wrong since we didn't find 'coco' or 'ade' or 'cityscapes' or 'mapillary-vistas'" - " in it " - ) - - backbone = "swin" - backbone_types = ["tiny", "small", "base_IN21k", "base", "large"] - backbone_type = list(filter(lambda x: x in model_name_raw, backbone_types))[0].replace("_", "-") - - model_name = f"mask2former-{backbone}-{backbone_type}-{dataset_name}-{segmentation_task_name.split('-')[0]}" - - return model_name - - -if __name__ == "__main__": - parser = ArgumentParser( - description="Command line to convert the original mask2formers (with swin backbone) to our implementations." - ) - - parser.add_argument( - "--checkpoints_dir", - type=Path, - help=( - "A directory containing the model's checkpoints. The directory has to have the following structure:" - " / / / .pkl" - ), - ) - parser.add_argument( - "--configs_dir", - type=Path, - help=( - "A directory containing the model's configs, see detectron2 doc. The directory has to have the following" - " structure: / / / .yaml" - ), - ) - parser.add_argument( - "--mask2former_dir", - required=True, - type=Path, - help=( - "A path to Mask2Former's original implementation directory. You can download from here:" - " https://github.com/facebookresearch/Mask2Former" - ), - ) - - args = parser.parse_args() - - checkpoints_dir: Path = args.checkpoints_dir - config_dir: Path = args.configs_dir - mask2former_dir: Path = args.mask2former_dir - # append the path to the parents to mask2former dir - sys.path.append(str(mask2former_dir.parent)) - # import original Mask2Former config and model from original source code repo - from Mask2Former.mask2former.config import add_maskformer2_config - from Mask2Former.mask2former.maskformer_model import MaskFormer as OriginalMask2Former - - for config_file, checkpoint_file in OriginalMask2FormerCheckpointToOursConverter.using_dirs( - checkpoints_dir, config_dir - ): - model_name = get_model_name(checkpoint_file) - image_processor = OriginalMask2FormerConfigToImageProcessorConverter()( - setup_cfg(Args(config_file=config_file)) - ) - image_processor.size = {"height": 384, "width": 384} - - original_config = setup_cfg(Args(config_file=config_file)) - mask2former_kwargs = OriginalMask2Former.from_config(original_config) - original_model = OriginalMask2Former(**mask2former_kwargs).eval() - - DetectionCheckpointer(original_model).load(str(checkpoint_file)) - - config: Mask2FormerConfig = OriginalMask2FormerConfigToOursConverter()(original_config) - mask2former = Mask2FormerModel(config=config).eval() - - converter = OriginalMask2FormerCheckpointToOursConverter(original_model, config) - mask2former = converter.convert(mask2former) - - mask2former_for_segmentation = Mask2FormerForUniversalSegmentation(config=config).eval() - mask2former_for_segmentation.model = mask2former - - mask2former_for_segmentation = converter.convert_universal_segmentation(mask2former_for_segmentation) - - tolerance = 3e-1 - high_tolerance_models = [ - "mask2former-swin-base-IN21k-coco-instance", - "mask2former-swin-base-coco-instance", - "mask2former-swin-small-cityscapes-semantic", - ] - - if model_name in high_tolerance_models: - tolerance = 3e-1 - - logger.info(f"🪄 Testing {model_name}...") - test(original_model, mask2former_for_segmentation, image_processor, 
tolerance) - logger.info(f"🪄 Pushing {model_name} to hub...") - - image_processor.push_to_hub(model_name) - mask2former_for_segmentation.push_to_hub(model_name) diff --git a/src/transformers/models/mask2former/image_processing_mask2former.py b/src/transformers/models/mask2former/image_processing_mask2former.py index a0c369722b54..06fe78e82e9e 100644 --- a/src/transformers/models/mask2former/image_processing_mask2former.py +++ b/src/transformers/models/mask2former/image_processing_mask2former.py @@ -303,7 +303,7 @@ def compute_segments( # TODO: (Amy) Move to image_transforms # Copied from transformers.models.maskformer.image_processing_maskformer.convert_segmentation_map_to_binary_masks def convert_segmentation_map_to_binary_masks( - segmentation_map: "np.ndarray", + segmentation_map: np.ndarray, instance_id_to_semantic_id: Optional[dict[int, int]] = None, ignore_index: Optional[int] = None, do_reduce_labels: bool = False, @@ -582,7 +582,7 @@ def rescale( # Copied from transformers.models.maskformer.image_processing_maskformer.MaskFormerImageProcessor.convert_segmentation_map_to_binary_masks def convert_segmentation_map_to_binary_masks( self, - segmentation_map: "np.ndarray", + segmentation_map: np.ndarray, instance_id_to_semantic_id: Optional[dict[int, int]] = None, ignore_index: Optional[int] = None, do_reduce_labels: bool = False, diff --git a/src/transformers/models/mask2former/image_processing_mask2former_fast.py b/src/transformers/models/mask2former/image_processing_mask2former_fast.py index a5d662288119..58dbb09d6319 100644 --- a/src/transformers/models/mask2former/image_processing_mask2former_fast.py +++ b/src/transformers/models/mask2former/image_processing_mask2former_fast.py @@ -23,6 +23,7 @@ import torch from torch import nn +from torchvision.transforms.v2 import functional as F from ...image_processing_utils import BatchFeature, get_size_dict from ...image_processing_utils_fast import ( @@ -42,7 +43,7 @@ PILImageResampling, ) from ...processing_utils import Unpack -from ...utils import TensorType, auto_docstring, is_torchvision_v2_available, logging +from ...utils import TensorType, auto_docstring, logging from .image_processing_mask2former import ( compute_segments, convert_segmentation_to_rle, @@ -51,11 +52,6 @@ ) -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - logger = logging.get_logger(__name__) @@ -348,9 +344,7 @@ def _preprocess( image=grouped_segmentation_maps[shape], size=size, size_divisor=size_divisor, - interpolation=F.InterpolationMode.NEAREST_EXACT - if is_torchvision_v2_available() - else F.InterpolationMode.NEAREST, + interpolation=F.InterpolationMode.NEAREST_EXACT, ) resized_images_grouped[shape] = stacked_images if segmentation_maps is not None: diff --git a/src/transformers/models/mask2former/modeling_mask2former.py b/src/transformers/models/mask2former/modeling_mask2former.py index e8c3d2344b8d..553700465f3c 100644 --- a/src/transformers/models/mask2former/modeling_mask2former.py +++ b/src/transformers/models/mask2former/modeling_mask2former.py @@ -783,7 +783,7 @@ def get_num_masks(self, class_labels: torch.Tensor, device: torch.device) -> tor """ Computes the average number of target masks across the batch, for normalization purposes. 
""" - num_masks = sum([len(classes) for classes in class_labels]) + num_masks = sum(len(classes) for classes in class_labels) num_masks = torch.as_tensor(num_masks, dtype=torch.float, device=device) world_size = 1 if is_accelerate_available(): diff --git a/src/transformers/models/maskformer/convert_maskformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/maskformer/convert_maskformer_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index fac17d022033..000000000000 --- a/src/transformers/models/maskformer/convert_maskformer_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,732 +0,0 @@ -# coding=utf-8 -# Copyright 2022 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import sys -from argparse import ArgumentParser -from collections.abc import Iterator -from dataclasses import dataclass -from pathlib import Path -from pprint import pformat -from typing import Any - -import requests -import torch -import torchvision.transforms as T -from detectron2.checkpoint import DetectionCheckpointer -from detectron2.config import get_cfg -from detectron2.data import MetadataCatalog -from detectron2.projects.deeplab import add_deeplab_config -from PIL import Image -from torch import Tensor, nn - -from transformers.models.maskformer.feature_extraction_maskformer import MaskFormerImageProcessor -from transformers.models.maskformer.modeling_maskformer import ( - MaskFormerConfig, - MaskFormerForInstanceSegmentation, - MaskFormerForInstanceSegmentationOutput, - MaskFormerModel, - MaskFormerModelOutput, -) -from transformers.utils import logging - - -StateDict = dict[str, Tensor] - -logging.set_verbosity_info() -logger = logging.get_logger() - -torch.manual_seed(0) - - -class TrackedStateDict: - def __init__(self, to_track: dict): - """This class "tracks" a python dictionary by keeping track of which item is accessed. - - Args: - to_track (Dict): The dictionary we wish to track - """ - self.to_track = to_track - self._seen: set[str] = set() - - def __getitem__(self, key: str) -> Any: - return self.to_track[key] - - def __setitem__(self, key: str, item: Any): - self._seen.add(key) - self.to_track[key] = item - - def diff(self) -> list[str]: - """This method returns a set difference between the keys in the tracked state dict and the one we have access so far. 
- This is an effective method to check if we have update all the keys - - Returns: - list[str]: List of keys not yet updated - """ - return set(self.to_track.keys()) - self._seen - - def copy(self) -> dict: - # proxy the call to the internal dictionary - return self.to_track.copy() - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - img_data = requests.get(url, stream=True).raw - im = Image.open(img_data) - return im - - -@dataclass -class Args: - """Fake command line arguments needed by maskformer/detectron implementation""" - - config_file: str - - -def setup_cfg(args: Args): - # load config from file and command-line arguments - cfg = get_cfg() - add_deeplab_config(cfg) - add_mask_former_config(cfg) - cfg.merge_from_file(args.config_file) - cfg.freeze() - return cfg - - -class OriginalMaskFormerConfigToOursConverter: - def __call__(self, original_config: object) -> MaskFormerConfig: - model = original_config.MODEL - mask_former = model.MASK_FORMER - swin = model.SWIN - - dataset_catalog = MetadataCatalog.get(original_config.DATASETS.TEST[0]) - id2label = dict(enumerate(dataset_catalog.stuff_classes)) - label2id = {label: idx for idx, label in id2label.items()} - - config: MaskFormerConfig = MaskFormerConfig( - fpn_feature_size=model.SEM_SEG_HEAD.CONVS_DIM, - mask_feature_size=model.SEM_SEG_HEAD.MASK_DIM, - num_labels=model.SEM_SEG_HEAD.NUM_CLASSES, - no_object_weight=mask_former.NO_OBJECT_WEIGHT, - num_queries=mask_former.NUM_OBJECT_QUERIES, - backbone_config={ - "pretrain_img_size": swin.PRETRAIN_IMG_SIZE, - "image_size": swin.PRETRAIN_IMG_SIZE, - "in_channels": 3, - "patch_size": swin.PATCH_SIZE, - "embed_dim": swin.EMBED_DIM, - "depths": swin.DEPTHS, - "num_heads": swin.NUM_HEADS, - "window_size": swin.WINDOW_SIZE, - "drop_path_rate": swin.DROP_PATH_RATE, - "model_type": "swin", - }, - dice_weight=mask_former.DICE_WEIGHT, - ce_weight=1.0, - mask_weight=mask_former.MASK_WEIGHT, - decoder_config={ - "model_type": "detr", - "max_position_embeddings": 1024, - "encoder_layers": 6, - "encoder_ffn_dim": 2048, - "encoder_attention_heads": 8, - "decoder_layers": mask_former.DEC_LAYERS, - "decoder_ffn_dim": mask_former.DIM_FEEDFORWARD, - "decoder_attention_heads": mask_former.NHEADS, - "encoder_layerdrop": 0.0, - "decoder_layerdrop": 0.0, - "d_model": mask_former.HIDDEN_DIM, - "dropout": mask_former.DROPOUT, - "attention_dropout": 0.0, - "activation_dropout": 0.0, - "init_std": 0.02, - "init_xavier_std": 1.0, - "scale_embedding": False, - "auxiliary_loss": False, - "dilation": False, - # default pretrained config values - }, - id2label=id2label, - label2id=label2id, - ) - - return config - - -class OriginalMaskFormerConfigToImageProcessorConverter: - def __call__(self, original_config: object) -> MaskFormerImageProcessor: - model = original_config.MODEL - model_input = original_config.INPUT - dataset_catalog = MetadataCatalog.get(original_config.DATASETS.TEST[0]) - - return MaskFormerImageProcessor( - image_mean=(torch.tensor(model.PIXEL_MEAN) / 255).tolist(), - image_std=(torch.tensor(model.PIXEL_STD) / 255).tolist(), - size=model_input.MIN_SIZE_TEST, - max_size=model_input.MAX_SIZE_TEST, - num_labels=model.SEM_SEG_HEAD.NUM_CLASSES, - ignore_index=dataset_catalog.ignore_label, - size_divisibility=32, # 32 is required by swin - ) - - -class OriginalMaskFormerCheckpointToOursConverter: - def __init__(self, original_model: nn.Module, config: MaskFormerConfig): - self.original_model = original_model - 
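A minimal usage sketch of the TrackedStateDict helper defined above (the tensor names and shapes here are hypothetical, for illustration only): every write through __setitem__ is recorded, so diff() reports which destination keys the conversion has not filled in yet.

    import torch

    dst = TrackedStateDict({"encoder.weight": torch.zeros(2, 2), "encoder.bias": torch.zeros(2)})
    dst["encoder.weight"] = torch.ones(2, 2)  # recorded as "seen"
    print(dst.diff())  # {'encoder.bias'} -> keys the conversion still has to copy over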
self.config = config - - def pop_all(self, renamed_keys: list[tuple[str, str]], dst_state_dict: StateDict, src_state_dict: StateDict): - for src_key, dst_key in renamed_keys: - dst_state_dict[dst_key] = src_state_dict.pop(src_key) - - def replace_backbone(self, dst_state_dict: StateDict, src_state_dict: StateDict, config: MaskFormerConfig): - dst_prefix: str = "pixel_level_module.encoder" - src_prefix: str = "backbone" - - renamed_keys = [ - ( - f"{src_prefix}.patch_embed.proj.weight", - f"{dst_prefix}.model.embeddings.patch_embeddings.projection.weight", - ), - (f"{src_prefix}.patch_embed.proj.bias", f"{dst_prefix}.model.embeddings.patch_embeddings.projection.bias"), - (f"{src_prefix}.patch_embed.norm.weight", f"{dst_prefix}.model.embeddings.norm.weight"), - (f"{src_prefix}.patch_embed.norm.bias", f"{dst_prefix}.model.embeddings.norm.bias"), - ] - num_layers = len(config.backbone_config.depths) - for layer_idx in range(num_layers): - for block_idx in range(config.backbone_config.depths[layer_idx]): - renamed_keys.extend( - [ # src, dst - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.norm1.weight", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.layernorm_before.weight", - ), - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.norm1.bias", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.layernorm_before.bias", - ), - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.attn.relative_position_bias_table", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.self.relative_position_bias_table", - ), - ] - ) - # now we need to handle the attentions - # read in weights + bias of input projection layer of cross-attention - - src_att_weight = src_state_dict[f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.attn.qkv.weight"] - src_att_bias = src_state_dict[f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.attn.qkv.bias"] - - size = src_att_weight.shape[0] - offset = size // 3 - dst_state_dict[ - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.self.query.weight" - ] = src_att_weight[:offset, :] - dst_state_dict[ - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.self.query.bias" - ] = src_att_bias[:offset] - - dst_state_dict[ - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.self.key.weight" - ] = src_att_weight[offset : offset * 2, :] - dst_state_dict[ - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.self.key.bias" - ] = src_att_bias[offset : offset * 2] - - dst_state_dict[ - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.self.value.weight" - ] = src_att_weight[-offset:, :] - dst_state_dict[ - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.self.value.bias" - ] = src_att_bias[-offset:] - - # let's pop them - src_state_dict.pop(f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.attn.qkv.weight") - src_state_dict.pop(f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.attn.qkv.bias") - # proj - renamed_keys.extend( - [ - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.attn.proj.weight", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.output.dense.weight", - ), - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.attn.proj.bias", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.output.dense.bias", - ), - ] - ) - - # second norm - renamed_keys.extend( - [ - 
( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.norm2.weight", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.layernorm_after.weight", - ), - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.norm2.bias", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.layernorm_after.bias", - ), - ] - ) - - # mlp - renamed_keys.extend( - [ - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.mlp.fc1.weight", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.intermediate.dense.weight", - ), - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.mlp.fc1.bias", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.intermediate.dense.bias", - ), - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.mlp.fc2.weight", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.output.dense.weight", - ), - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.mlp.fc2.bias", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.output.dense.bias", - ), - ] - ) - - renamed_keys.extend( - [ - ( - f"{src_prefix}.layers.{layer_idx}.blocks.{block_idx}.attn.relative_position_index", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.blocks.{block_idx}.attention.self.relative_position_index", - ) - ] - ) - - if layer_idx < num_layers - 1: - # patch merging - renamed_keys.extend( - [ - ( - f"{src_prefix}.layers.{layer_idx}.downsample.reduction.weight", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.downsample.reduction.weight", - ), - ( - f"{src_prefix}.layers.{layer_idx}.downsample.norm.weight", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.downsample.norm.weight", - ), - ( - f"{src_prefix}.layers.{layer_idx}.downsample.norm.bias", - f"{dst_prefix}.model.encoder.layers.{layer_idx}.downsample.norm.bias", - ), - ] - ) - - # hidden states norms - renamed_keys.extend( - [ - ( - f"{src_prefix}.norm{layer_idx}.weight", - f"{dst_prefix}.hidden_states_norms.{layer_idx}.weight", - ), - ( - f"{src_prefix}.norm{layer_idx}.bias", - f"{dst_prefix}.hidden_states_norms.{layer_idx}.bias", - ), - ] - ) - self.pop_all(renamed_keys, dst_state_dict, src_state_dict) - - def replace_pixel_module(self, dst_state_dict: StateDict, src_state_dict: StateDict): - dst_prefix: str = "pixel_level_module.decoder" - src_prefix: str = "sem_seg_head.pixel_decoder" - - self.replace_backbone(dst_state_dict, src_state_dict, self.config) - - def rename_keys_for_conv(detectron_conv: str, mine_conv: str): - return [ - (f"{detectron_conv}.weight", f"{mine_conv}.0.weight"), - # 2 cuz the have act in the middle -> rename it - (f"{detectron_conv}.norm.weight", f"{mine_conv}.1.weight"), - (f"{detectron_conv}.norm.bias", f"{mine_conv}.1.bias"), - ] - - renamed_keys = [ - (f"{src_prefix}.mask_features.weight", f"{dst_prefix}.mask_projection.weight"), - (f"{src_prefix}.mask_features.bias", f"{dst_prefix}.mask_projection.bias"), - # the layers in the original one are in reverse order, stem is the last one! 
- ] - - renamed_keys.extend(rename_keys_for_conv(f"{src_prefix}.layer_4", f"{dst_prefix}.fpn.stem")) - - # add all the fpn layers (here we need some config parameters to know the size in advance) - for src_i, dst_i in zip(range(3, 0, -1), range(0, 3)): - renamed_keys.extend( - rename_keys_for_conv(f"{src_prefix}.adapter_{src_i}", f"{dst_prefix}.fpn.layers.{dst_i}.proj") - ) - renamed_keys.extend( - rename_keys_for_conv(f"{src_prefix}.layer_{src_i}", f"{dst_prefix}.fpn.layers.{dst_i}.block") - ) - - self.pop_all(renamed_keys, dst_state_dict, src_state_dict) - - def rename_keys_in_detr_decoder(self, dst_state_dict: StateDict, src_state_dict: StateDict): - dst_prefix: str = "transformer_module.decoder" - src_prefix: str = "sem_seg_head.predictor.transformer.decoder" - # not sure why we are not popping direcetly here! - # here we list all keys to be renamed (original name on the left, our name on the right) - rename_keys = [] - for i in range(self.config.decoder_config.decoder_layers): - # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms - rename_keys.append( - ( - f"{src_prefix}.layers.{i}.self_attn.out_proj.weight", - f"{dst_prefix}.layers.{i}.self_attn.out_proj.weight", - ) - ) - rename_keys.append( - ( - f"{src_prefix}.layers.{i}.self_attn.out_proj.bias", - f"{dst_prefix}.layers.{i}.self_attn.out_proj.bias", - ) - ) - rename_keys.append( - ( - f"{src_prefix}.layers.{i}.multihead_attn.out_proj.weight", - f"{dst_prefix}.layers.{i}.encoder_attn.out_proj.weight", - ) - ) - rename_keys.append( - ( - f"{src_prefix}.layers.{i}.multihead_attn.out_proj.bias", - f"{dst_prefix}.layers.{i}.encoder_attn.out_proj.bias", - ) - ) - rename_keys.append((f"{src_prefix}.layers.{i}.linear1.weight", f"{dst_prefix}.layers.{i}.fc1.weight")) - rename_keys.append((f"{src_prefix}.layers.{i}.linear1.bias", f"{dst_prefix}.layers.{i}.fc1.bias")) - rename_keys.append((f"{src_prefix}.layers.{i}.linear2.weight", f"{dst_prefix}.layers.{i}.fc2.weight")) - rename_keys.append((f"{src_prefix}.layers.{i}.linear2.bias", f"{dst_prefix}.layers.{i}.fc2.bias")) - rename_keys.append( - (f"{src_prefix}.layers.{i}.norm1.weight", f"{dst_prefix}.layers.{i}.self_attn_layer_norm.weight") - ) - rename_keys.append( - (f"{src_prefix}.layers.{i}.norm1.bias", f"{dst_prefix}.layers.{i}.self_attn_layer_norm.bias") - ) - rename_keys.append( - (f"{src_prefix}.layers.{i}.norm2.weight", f"{dst_prefix}.layers.{i}.encoder_attn_layer_norm.weight") - ) - rename_keys.append( - (f"{src_prefix}.layers.{i}.norm2.bias", f"{dst_prefix}.layers.{i}.encoder_attn_layer_norm.bias") - ) - rename_keys.append( - (f"{src_prefix}.layers.{i}.norm3.weight", f"{dst_prefix}.layers.{i}.final_layer_norm.weight") - ) - rename_keys.append( - (f"{src_prefix}.layers.{i}.norm3.bias", f"{dst_prefix}.layers.{i}.final_layer_norm.bias") - ) - - return rename_keys - - def replace_q_k_v_in_detr_decoder(self, dst_state_dict: StateDict, src_state_dict: StateDict): - dst_prefix: str = "transformer_module.decoder" - src_prefix: str = "sem_seg_head.predictor.transformer.decoder" - for i in range(self.config.decoder_config.decoder_layers): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = src_state_dict.pop(f"{src_prefix}.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = src_state_dict.pop(f"{src_prefix}.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - dst_state_dict[f"{dst_prefix}.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - 
dst_state_dict[f"{dst_prefix}.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - dst_state_dict[f"{dst_prefix}.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - dst_state_dict[f"{dst_prefix}.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - dst_state_dict[f"{dst_prefix}.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - dst_state_dict[f"{dst_prefix}.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - # read in weights + bias of input projection layer of cross-attention - in_proj_weight_cross_attn = src_state_dict.pop(f"{src_prefix}.layers.{i}.multihead_attn.in_proj_weight") - in_proj_bias_cross_attn = src_state_dict.pop(f"{src_prefix}.layers.{i}.multihead_attn.in_proj_bias") - # next, add query, keys and values (in that order) of cross-attention to the state dict - dst_state_dict[f"{dst_prefix}.layers.{i}.encoder_attn.q_proj.weight"] = in_proj_weight_cross_attn[:256, :] - dst_state_dict[f"{dst_prefix}.layers.{i}.encoder_attn.q_proj.bias"] = in_proj_bias_cross_attn[:256] - dst_state_dict[f"{dst_prefix}.layers.{i}.encoder_attn.k_proj.weight"] = in_proj_weight_cross_attn[ - 256:512, : - ] - dst_state_dict[f"{dst_prefix}.layers.{i}.encoder_attn.k_proj.bias"] = in_proj_bias_cross_attn[256:512] - dst_state_dict[f"{dst_prefix}.layers.{i}.encoder_attn.v_proj.weight"] = in_proj_weight_cross_attn[-256:, :] - dst_state_dict[f"{dst_prefix}.layers.{i}.encoder_attn.v_proj.bias"] = in_proj_bias_cross_attn[-256:] - - def replace_detr_decoder(self, dst_state_dict: StateDict, src_state_dict: StateDict): - dst_prefix: str = "transformer_module.decoder" - src_prefix: str = "sem_seg_head.predictor.transformer.decoder" - renamed_keys = self.rename_keys_in_detr_decoder(dst_state_dict, src_state_dict) - # add more - renamed_keys.extend( - [ - (f"{src_prefix}.norm.weight", f"{dst_prefix}.layernorm.weight"), - (f"{src_prefix}.norm.bias", f"{dst_prefix}.layernorm.bias"), - ] - ) - - self.pop_all(renamed_keys, dst_state_dict, src_state_dict) - - self.replace_q_k_v_in_detr_decoder(dst_state_dict, src_state_dict) - - def replace_transformer_module(self, dst_state_dict: StateDict, src_state_dict: StateDict): - dst_prefix: str = "transformer_module" - src_prefix: str = "sem_seg_head.predictor" - - self.replace_detr_decoder(dst_state_dict, src_state_dict) - - renamed_keys = [ - (f"{src_prefix}.query_embed.weight", f"{dst_prefix}.queries_embedder.weight"), - (f"{src_prefix}.input_proj.weight", f"{dst_prefix}.input_projection.weight"), - (f"{src_prefix}.input_proj.bias", f"{dst_prefix}.input_projection.bias"), - ] - - self.pop_all(renamed_keys, dst_state_dict, src_state_dict) - - def replace_instance_segmentation_module(self, dst_state_dict: StateDict, src_state_dict: StateDict): - # NOTE in our case we don't have a prefix, thus we removed the "." from the keys later on! 
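The q/k/v handling above follows one pattern used throughout these converters: the original checkpoints store a single fused in_proj_weight/in_proj_bias stacked as [query; key; value] along the first dimension, and the conversion slices it into separate projections. A standalone sketch of that slicing, assuming the 256 hidden size hard-coded in the slices above:

    import torch

    hidden_size = 256  # mirrors the hard-coded slice width used above
    in_proj_weight = torch.randn(3 * hidden_size, hidden_size)
    in_proj_bias = torch.randn(3 * hidden_size)
    q_w, k_w, v_w = in_proj_weight.split(hidden_size, dim=0)
    q_b, k_b, v_b = in_proj_bias.split(hidden_size, dim=0)
    # q_w / k_w / v_w correspond to in_proj_weight[:256], [256:512] and [-256:] above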
- dst_prefix: str = "" - src_prefix: str = "sem_seg_head.predictor" - - renamed_keys = [ - (f"{src_prefix}.class_embed.weight", f"{dst_prefix}class_predictor.weight"), - (f"{src_prefix}.class_embed.bias", f"{dst_prefix}class_predictor.bias"), - ] - - mlp_len = 3 - for i in range(mlp_len): - renamed_keys.extend( - [ - (f"{src_prefix}.mask_embed.layers.{i}.weight", f"{dst_prefix}mask_embedder.{i}.0.weight"), - (f"{src_prefix}.mask_embed.layers.{i}.bias", f"{dst_prefix}mask_embedder.{i}.0.bias"), - ] - ) - logger.info(f"Replacing keys {pformat(renamed_keys)}") - self.pop_all(renamed_keys, dst_state_dict, src_state_dict) - - def convert(self, mask_former: MaskFormerModel) -> MaskFormerModel: - dst_state_dict = TrackedStateDict(mask_former.state_dict()) - src_state_dict = self.original_model.state_dict() - - self.replace_pixel_module(dst_state_dict, src_state_dict) - self.replace_transformer_module(dst_state_dict, src_state_dict) - - logger.info(f"Missed keys are {pformat(dst_state_dict.diff())}") - logger.info(f"Not copied keys are {pformat(src_state_dict.keys())}") - logger.info("🙌 Done") - - mask_former.load_state_dict(dst_state_dict) - - return mask_former - - def convert_instance_segmentation( - self, mask_former: MaskFormerForInstanceSegmentation - ) -> MaskFormerForInstanceSegmentation: - dst_state_dict = TrackedStateDict(mask_former.state_dict()) - src_state_dict = self.original_model.state_dict() - - self.replace_instance_segmentation_module(dst_state_dict, src_state_dict) - - mask_former.load_state_dict(dst_state_dict) - - return mask_former - - @staticmethod - def using_dirs(checkpoints_dir: Path, config_dir: Path) -> Iterator[tuple[object, Path, Path]]: - checkpoints: list[Path] = checkpoints_dir.glob("**/*.pkl") - - for checkpoint in checkpoints: - logger.info(f"💪 Converting {checkpoint.stem}") - # find associated config file - config: Path = config_dir / checkpoint.parents[0].stem / "swin" / f"{checkpoint.stem}.yaml" - - yield config, checkpoint - - -def test(original_model, our_model: MaskFormerForInstanceSegmentation, image_processor: MaskFormerImageProcessor): - with torch.no_grad(): - original_model = original_model.eval() - our_model = our_model.eval() - - im = prepare_img() - - tr = T.Compose( - [ - T.Resize((384, 384)), - T.ToTensor(), - T.Normalize( - mean=torch.tensor([123.675, 116.280, 103.530]) / 255.0, - std=torch.tensor([58.395, 57.120, 57.375]) / 255.0, - ), - ], - ) - - x = tr(im).unsqueeze(0) - - original_model_backbone_features = original_model.backbone(x.clone()) - - our_model_output: MaskFormerModelOutput = our_model.model(x.clone(), output_hidden_states=True) - - for original_model_feature, our_model_feature in zip( - original_model_backbone_features.values(), our_model_output.encoder_hidden_states - ): - assert torch.allclose(original_model_feature, our_model_feature, atol=1e-3), ( - "The backbone features are not the same." 
- ) - - original_model_pixel_out = original_model.sem_seg_head.pixel_decoder.forward_features( - original_model_backbone_features - ) - - assert torch.allclose( - original_model_pixel_out[0], our_model_output.pixel_decoder_last_hidden_state, atol=1e-4 - ), "The pixel decoder feature are not the same" - - # let's test the full model - original_model_out = original_model([{"image": x.squeeze(0)}]) - - original_segmentation = original_model_out[0]["sem_seg"] - - our_model_out: MaskFormerForInstanceSegmentationOutput = our_model(x) - - our_segmentation = image_processor.post_process_segmentation(our_model_out, target_size=(384, 384)) - - assert torch.allclose(original_segmentation, our_segmentation, atol=1e-3), ( - "The segmentation image is not the same." - ) - - logger.info("✅ Test passed!") - - -def get_name(checkpoint_file: Path): - model_name_raw: str = checkpoint_file.stem - # model_name_raw is something like maskformer_panoptic_swin_base_IN21k_384_bs64_554k - parent_name: str = checkpoint_file.parents[0].stem - backbone = "swin" - dataset = "" - if "coco" in parent_name: - dataset = "coco" - elif "ade" in parent_name: - dataset = "ade" - else: - raise ValueError(f"{parent_name} must be wrong since we didn't find 'coco' or 'ade' in it ") - - backbone_types = ["tiny", "small", "base", "large"] - - backbone_type = list(filter(lambda x: x in model_name_raw, backbone_types))[0] - - model_name = f"maskformer-{backbone}-{backbone_type}-{dataset}" - - return model_name - - -if __name__ == "__main__": - parser = ArgumentParser( - description="Command line to convert the original maskformers (with swin backbone) to our implementations." - ) - - parser.add_argument( - "--checkpoints_dir", - type=Path, - help=( - "A directory containing the model's checkpoints. The directory has to have the following structure:" - " / / .pkl\n" - "Given the files are in the pickle format, please be wary of passing it files you trust." - ), - ) - parser.add_argument( - "--configs_dir", - type=Path, - help=( - "A directory containing the model's configs, see detectron2 doc. The directory has to have the following" - " structure: / / .yaml" - ), - ) - parser.add_argument( - "--pytorch_dump_folder_path", - required=True, - type=Path, - help="Path to the folder to output PyTorch models.", - ) - parser.add_argument( - "--maskformer_dir", - required=True, - type=Path, - help=( - "A path to MaskFormer's original implementation directory. 
You can download from here:" - " https://github.com/facebookresearch/MaskFormer" - ), - ) - - args = parser.parse_args() - - checkpoints_dir: Path = args.checkpoints_dir - config_dir: Path = args.configs_dir - save_directory: Path = args.pytorch_dump_folder_path - maskformer_dir: Path = args.maskformer_dir - # append the path to the parents to maskformer dir - sys.path.append(str(maskformer_dir.parent)) - # and import what's needed - from MaskFormer.mask_former import add_mask_former_config - from MaskFormer.mask_former.mask_former_model import MaskFormer as OriginalMaskFormer - - if not save_directory.exists(): - save_directory.mkdir(parents=True) - - for config_file, checkpoint_file in OriginalMaskFormerCheckpointToOursConverter.using_dirs( - checkpoints_dir, config_dir - ): - image_processor = OriginalMaskFormerConfigToImageProcessorConverter()(setup_cfg(Args(config_file=config_file))) - - original_config = setup_cfg(Args(config_file=config_file)) - mask_former_kwargs = OriginalMaskFormer.from_config(original_config) - - original_model = OriginalMaskFormer(**mask_former_kwargs).eval() - - DetectionCheckpointer(original_model).load(str(checkpoint_file)) - - config: MaskFormerConfig = OriginalMaskFormerConfigToOursConverter()(original_config) - - mask_former = MaskFormerModel(config=config).eval() - - converter = OriginalMaskFormerCheckpointToOursConverter(original_model, config) - - maskformer = converter.convert(mask_former) - - mask_former_for_instance_segmentation = MaskFormerForInstanceSegmentation(config=config).eval() - - mask_former_for_instance_segmentation.model = mask_former - mask_former_for_instance_segmentation = converter.convert_instance_segmentation( - mask_former_for_instance_segmentation - ) - - test(original_model, mask_former_for_instance_segmentation, image_processor) - - model_name = get_name(checkpoint_file) - logger.info(f"🪄 Saving {model_name}") - - image_processor.save_pretrained(save_directory / model_name) - mask_former_for_instance_segmentation.save_pretrained(save_directory / model_name) - - image_processor.push_to_hub( - repo_path_or_name=save_directory / model_name, - commit_message="Add model", - use_temp_dir=True, - ) - mask_former_for_instance_segmentation.push_to_hub( - repo_path_or_name=save_directory / model_name, - commit_message="Add model", - use_temp_dir=True, - ) diff --git a/src/transformers/models/maskformer/convert_maskformer_resnet_to_pytorch.py b/src/transformers/models/maskformer/convert_maskformer_resnet_to_pytorch.py deleted file mode 100644 index 43fbd234fb2a..000000000000 --- a/src/transformers/models/maskformer/convert_maskformer_resnet_to_pytorch.py +++ /dev/null @@ -1,390 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert MaskFormer checkpoints with ResNet backbone from the original repository. 
URL: -https://github.com/facebookresearch/MaskFormer""" - -import argparse -import json -import pickle -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, MaskFormerImageProcessor, ResNetConfig -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_maskformer_config(model_name: str): - if "resnet101c" in model_name: - # TODO add support for ResNet-C backbone, which uses a "deeplab" stem - raise NotImplementedError("To do") - elif "resnet101" in model_name: - backbone_config = ResNetConfig.from_pretrained( - "microsoft/resnet-101", out_features=["stage1", "stage2", "stage3", "stage4"] - ) - else: - backbone_config = ResNetConfig.from_pretrained( - "microsoft/resnet-50", out_features=["stage1", "stage2", "stage3", "stage4"] - ) - config = MaskFormerConfig(backbone_config=backbone_config) - - repo_id = "huggingface/label-files" - if "ade20k-full" in model_name: - config.num_labels = 847 - filename = "maskformer-ade20k-full-id2label.json" - elif "ade" in model_name: - config.num_labels = 150 - filename = "ade20k-id2label.json" - elif "coco-stuff" in model_name: - config.num_labels = 171 - filename = "maskformer-coco-stuff-id2label.json" - elif "coco" in model_name: - # TODO - config.num_labels = 133 - filename = "coco-panoptic-id2label.json" - elif "cityscapes" in model_name: - config.num_labels = 19 - filename = "cityscapes-id2label.json" - elif "vistas" in model_name: - config.num_labels = 65 - filename = "mapillary-vistas-id2label.json" - - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - return config - - -def create_rename_keys(config): - rename_keys = [] - # stem - # fmt: off - rename_keys.append(("backbone.stem.conv1.weight", "model.pixel_level_module.encoder.embedder.embedder.convolution.weight")) - rename_keys.append(("backbone.stem.conv1.norm.weight", "model.pixel_level_module.encoder.embedder.embedder.normalization.weight")) - rename_keys.append(("backbone.stem.conv1.norm.bias", "model.pixel_level_module.encoder.embedder.embedder.normalization.bias")) - rename_keys.append(("backbone.stem.conv1.norm.running_mean", "model.pixel_level_module.encoder.embedder.embedder.normalization.running_mean")) - rename_keys.append(("backbone.stem.conv1.norm.running_var", "model.pixel_level_module.encoder.embedder.embedder.normalization.running_var")) - # fmt: on - # stages - for stage_idx in range(len(config.backbone_config.depths)): - for layer_idx in range(config.backbone_config.depths[stage_idx]): - # shortcut - if layer_idx == 0: - rename_keys.append( - ( - f"backbone.res{stage_idx + 2}.{layer_idx}.shortcut.weight", - f"model.pixel_level_module.encoder.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.convolution.weight", - ) - ) - rename_keys.append( - ( - f"backbone.res{stage_idx + 2}.{layer_idx}.shortcut.norm.weight", - f"model.pixel_level_module.encoder.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.weight", - ) - ) - rename_keys.append( - ( - f"backbone.res{stage_idx + 2}.{layer_idx}.shortcut.norm.bias", - f"model.pixel_level_module.encoder.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.bias", - ) - ) - rename_keys.append( - ( - 
f"backbone.res{stage_idx + 2}.{layer_idx}.shortcut.norm.running_mean", - f"model.pixel_level_module.encoder.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_mean", - ) - ) - rename_keys.append( - ( - f"backbone.res{stage_idx + 2}.{layer_idx}.shortcut.norm.running_var", - f"model.pixel_level_module.encoder.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_var", - ) - ) - # 3 convs - for i in range(3): - rename_keys.append( - ( - f"backbone.res{stage_idx + 2}.{layer_idx}.conv{i + 1}.weight", - f"model.pixel_level_module.encoder.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.convolution.weight", - ) - ) - rename_keys.append( - ( - f"backbone.res{stage_idx + 2}.{layer_idx}.conv{i + 1}.norm.weight", - f"model.pixel_level_module.encoder.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.weight", - ) - ) - rename_keys.append( - ( - f"backbone.res{stage_idx + 2}.{layer_idx}.conv{i + 1}.norm.bias", - f"model.pixel_level_module.encoder.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.bias", - ) - ) - rename_keys.append( - ( - f"backbone.res{stage_idx + 2}.{layer_idx}.conv{i + 1}.norm.running_mean", - f"model.pixel_level_module.encoder.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_mean", - ) - ) - rename_keys.append( - ( - f"backbone.res{stage_idx + 2}.{layer_idx}.conv{i + 1}.norm.running_var", - f"model.pixel_level_module.encoder.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_var", - ) - ) - - # FPN - # fmt: off - rename_keys.append(("sem_seg_head.layer_4.weight", "model.pixel_level_module.decoder.fpn.stem.0.weight")) - rename_keys.append(("sem_seg_head.layer_4.norm.weight", "model.pixel_level_module.decoder.fpn.stem.1.weight")) - rename_keys.append(("sem_seg_head.layer_4.norm.bias", "model.pixel_level_module.decoder.fpn.stem.1.bias")) - for source_index, target_index in zip(range(3, 0, -1), range(0, 3)): - rename_keys.append((f"sem_seg_head.adapter_{source_index}.weight", f"model.pixel_level_module.decoder.fpn.layers.{target_index}.proj.0.weight")) - rename_keys.append((f"sem_seg_head.adapter_{source_index}.norm.weight", f"model.pixel_level_module.decoder.fpn.layers.{target_index}.proj.1.weight")) - rename_keys.append((f"sem_seg_head.adapter_{source_index}.norm.bias", f"model.pixel_level_module.decoder.fpn.layers.{target_index}.proj.1.bias")) - rename_keys.append((f"sem_seg_head.layer_{source_index}.weight", f"model.pixel_level_module.decoder.fpn.layers.{target_index}.block.0.weight")) - rename_keys.append((f"sem_seg_head.layer_{source_index}.norm.weight", f"model.pixel_level_module.decoder.fpn.layers.{target_index}.block.1.weight")) - rename_keys.append((f"sem_seg_head.layer_{source_index}.norm.bias", f"model.pixel_level_module.decoder.fpn.layers.{target_index}.block.1.bias")) - rename_keys.append(("sem_seg_head.mask_features.weight", "model.pixel_level_module.decoder.mask_projection.weight")) - rename_keys.append(("sem_seg_head.mask_features.bias", "model.pixel_level_module.decoder.mask_projection.bias")) - # fmt: on - - # Transformer decoder - # fmt: off - for idx in range(config.decoder_config.decoder_layers): - # self-attention out projection - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.self_attn.out_proj.weight", f"model.transformer_module.decoder.layers.{idx}.self_attn.out_proj.weight")) - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.self_attn.out_proj.bias", 
f"model.transformer_module.decoder.layers.{idx}.self_attn.out_proj.bias")) - # cross-attention out projection - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.multihead_attn.out_proj.weight", f"model.transformer_module.decoder.layers.{idx}.encoder_attn.out_proj.weight")) - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.multihead_attn.out_proj.bias", f"model.transformer_module.decoder.layers.{idx}.encoder_attn.out_proj.bias")) - # MLP 1 - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.linear1.weight", f"model.transformer_module.decoder.layers.{idx}.fc1.weight")) - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.linear1.bias", f"model.transformer_module.decoder.layers.{idx}.fc1.bias")) - # MLP 2 - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.linear2.weight", f"model.transformer_module.decoder.layers.{idx}.fc2.weight")) - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.linear2.bias", f"model.transformer_module.decoder.layers.{idx}.fc2.bias")) - # layernorm 1 (self-attention layernorm) - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.norm1.weight", f"model.transformer_module.decoder.layers.{idx}.self_attn_layer_norm.weight")) - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.norm1.bias", f"model.transformer_module.decoder.layers.{idx}.self_attn_layer_norm.bias")) - # layernorm 2 (cross-attention layernorm) - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.norm2.weight", f"model.transformer_module.decoder.layers.{idx}.encoder_attn_layer_norm.weight")) - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.norm2.bias", f"model.transformer_module.decoder.layers.{idx}.encoder_attn_layer_norm.bias")) - # layernorm 3 (final layernorm) - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.norm3.weight", f"model.transformer_module.decoder.layers.{idx}.final_layer_norm.weight")) - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.norm3.bias", f"model.transformer_module.decoder.layers.{idx}.final_layer_norm.bias")) - - rename_keys.append(("sem_seg_head.predictor.transformer.decoder.norm.weight", "model.transformer_module.decoder.layernorm.weight")) - rename_keys.append(("sem_seg_head.predictor.transformer.decoder.norm.bias", "model.transformer_module.decoder.layernorm.bias")) - # fmt: on - - # heads on top - # fmt: off - rename_keys.append(("sem_seg_head.predictor.query_embed.weight", "model.transformer_module.queries_embedder.weight")) - - rename_keys.append(("sem_seg_head.predictor.input_proj.weight", "model.transformer_module.input_projection.weight")) - rename_keys.append(("sem_seg_head.predictor.input_proj.bias", "model.transformer_module.input_projection.bias")) - - rename_keys.append(("sem_seg_head.predictor.class_embed.weight", "class_predictor.weight")) - rename_keys.append(("sem_seg_head.predictor.class_embed.bias", "class_predictor.bias")) - - for i in range(3): - rename_keys.append((f"sem_seg_head.predictor.mask_embed.layers.{i}.weight", f"mask_embedder.{i}.0.weight")) - rename_keys.append((f"sem_seg_head.predictor.mask_embed.layers.{i}.bias", f"mask_embedder.{i}.0.bias")) - # fmt: on - - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# we split up the matrix of each encoder layer into 
queries, keys and values -def read_in_decoder_q_k_v(state_dict, config): - # fmt: off - hidden_size = config.decoder_config.hidden_size - for idx in range(config.decoder_config.decoder_layers): - # read in weights + bias of self-attention input projection layer (in the original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.transformer_module.decoder.layers.{idx}.self_attn.q_proj.weight"] = in_proj_weight[: hidden_size, :] - state_dict[f"model.transformer_module.decoder.layers.{idx}.self_attn.q_proj.bias"] = in_proj_bias[:config.hidden_size] - state_dict[f"model.transformer_module.decoder.layers.{idx}.self_attn.k_proj.weight"] = in_proj_weight[hidden_size : hidden_size * 2, :] - state_dict[f"model.transformer_module.decoder.layers.{idx}.self_attn.k_proj.bias"] = in_proj_bias[hidden_size : hidden_size * 2] - state_dict[f"model.transformer_module.decoder.layers.{idx}.self_attn.v_proj.weight"] = in_proj_weight[-hidden_size :, :] - state_dict[f"model.transformer_module.decoder.layers.{idx}.self_attn.v_proj.bias"] = in_proj_bias[-hidden_size :] - # read in weights + bias of cross-attention input projection layer (in the original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.multihead_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.multihead_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.transformer_module.decoder.layers.{idx}.encoder_attn.q_proj.weight"] = in_proj_weight[: hidden_size, :] - state_dict[f"model.transformer_module.decoder.layers.{idx}.encoder_attn.q_proj.bias"] = in_proj_bias[:config.hidden_size] - state_dict[f"model.transformer_module.decoder.layers.{idx}.encoder_attn.k_proj.weight"] = in_proj_weight[hidden_size : hidden_size * 2, :] - state_dict[f"model.transformer_module.decoder.layers.{idx}.encoder_attn.k_proj.bias"] = in_proj_bias[hidden_size : hidden_size * 2] - state_dict[f"model.transformer_module.decoder.layers.{idx}.encoder_attn.v_proj.weight"] = in_proj_weight[-hidden_size :, :] - state_dict[f"model.transformer_module.decoder.layers.{idx}.encoder_attn.v_proj.bias"] = in_proj_bias[-hidden_size :] - # fmt: on - - -# We will verify our results on an image of cute cats -def prepare_img() -> torch.Tensor: - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_maskformer_checkpoint( - model_name: str, checkpoint_path: str, pytorch_dump_folder_path: str, push_to_hub: bool = False -): - """ - Copy/paste/tweak model's weights to our MaskFormer structure. 
- """ - config = get_maskformer_config(model_name) - - # load original state_dict - with open(checkpoint_path, "rb") as f: - data = pickle.load(f) - state_dict = data["model"] - - # rename keys - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_decoder_q_k_v(state_dict, config) - - # update to torch tensors - for key, value in state_dict.items(): - state_dict[key] = torch.from_numpy(value) - - # load 🤗 model - model = MaskFormerForInstanceSegmentation(config) - model.eval() - - model.load_state_dict(state_dict) - - # verify results - image = prepare_img() - if "vistas" in model_name: - ignore_index = 65 - elif "cityscapes" in model_name: - ignore_index = 65535 - else: - ignore_index = 255 - do_reduce_labels = "ade" in model_name - image_processor = MaskFormerImageProcessor(ignore_index=ignore_index, do_reduce_labels=do_reduce_labels) - - inputs = image_processor(image, return_tensors="pt") - - outputs = model(**inputs) - - if model_name == "maskformer-resnet50-ade": - expected_logits = torch.tensor( - [[6.7710, -0.1452, -3.5687], [1.9165, -1.0010, -1.8614], [3.6209, -0.2950, -1.3813]] - ) - elif model_name == "maskformer-resnet101-ade": - expected_logits = torch.tensor( - [[4.0381, -1.1483, -1.9688], [2.7083, -1.9147, -2.2555], [3.4367, -1.3711, -2.1609]] - ) - elif model_name == "maskformer-resnet50-coco-stuff": - expected_logits = torch.tensor( - [[3.2309, -3.0481, -2.8695], [5.4986, -5.4242, -2.4211], [6.2100, -5.2279, -2.7786]] - ) - elif model_name == "maskformer-resnet101-coco-stuff": - expected_logits = torch.tensor( - [[4.7188, -3.2585, -2.8857], [6.6871, -2.9181, -1.2487], [7.2449, -2.2764, -2.1874]] - ) - elif model_name == "maskformer-resnet101-cityscapes": - expected_logits = torch.tensor( - [[-1.8861, -1.5465, 0.6749], [-2.3677, -1.6707, -0.0867], [-2.2314, -1.9530, -0.9132]] - ) - elif model_name == "maskformer-resnet50-vistas": - expected_logits = torch.tensor( - [[-6.3917, -1.5216, -1.1392], [-5.5335, -4.5318, -1.8339], [-4.3576, -4.0301, 0.2162]] - ) - elif model_name == "maskformer-resnet50-ade20k-full": - expected_logits = torch.tensor( - [[3.6146, -1.9367, -3.2534], [4.0099, 0.2027, -2.7576], [3.3913, -2.3644, -3.9519]] - ) - elif model_name == "maskformer-resnet101-ade20k-full": - expected_logits = torch.tensor( - [[3.2211, -1.6550, -2.7605], [2.8559, -2.4512, -2.9574], [2.6331, -2.6775, -2.1844]] - ) - - assert torch.allclose(outputs.class_queries_logits[0, :3, :3], expected_logits, atol=1e-4) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - print(f"Saving model and image processor of {model_name} to {pytorch_dump_folder_path}") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - image_processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print(f"Pushing model and image processor of {model_name} to the hub...") - model.push_to_hub(f"facebook/{model_name}") - image_processor.push_to_hub(f"facebook/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="maskformer-resnet50-ade", - type=str, - required=True, - choices=[ - "maskformer-resnet50-ade", - "maskformer-resnet101-ade", - "maskformer-resnet50-coco-stuff", - "maskformer-resnet101-coco-stuff", - "maskformer-resnet101-cityscapes", - "maskformer-resnet50-vistas", - "maskformer-resnet50-ade20k-full", - "maskformer-resnet101-ade20k-full", - ], - help=("Name 
of the MaskFormer model you'd like to convert",), - ) - parser.add_argument( - "--checkpoint_path", - type=str, - required=True, - help="Path to the original pickle file (.pkl) of the original checkpoint.\n" - "Given the files are in the pickle format, please be wary of passing it files you trust.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." - ) - - args = parser.parse_args() - convert_maskformer_checkpoint( - args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub - ) diff --git a/src/transformers/models/maskformer/convert_maskformer_swin_to_pytorch.py b/src/transformers/models/maskformer/convert_maskformer_swin_to_pytorch.py deleted file mode 100644 index 4b6e32e5cc13..000000000000 --- a/src/transformers/models/maskformer/convert_maskformer_swin_to_pytorch.py +++ /dev/null @@ -1,333 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert MaskFormer checkpoints with Swin backbone from the original repository. URL: -https://github.com/facebookresearch/MaskFormer""" - -import argparse -import json -import pickle -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, MaskFormerImageProcessor, SwinConfig -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_maskformer_config(model_name: str): - backbone_config = SwinConfig.from_pretrained( - "microsoft/swin-tiny-patch4-window7-224", out_features=["stage1", "stage2", "stage3", "stage4"] - ) - config = MaskFormerConfig(backbone_config=backbone_config) - - repo_id = "huggingface/label-files" - if "ade20k-full" in model_name: - # this should be ok - config.num_labels = 847 - filename = "maskformer-ade20k-full-id2label.json" - elif "ade" in model_name: - # this should be ok - config.num_labels = 150 - filename = "ade20k-id2label.json" - elif "coco-stuff" in model_name: - # this should be ok - config.num_labels = 171 - filename = "maskformer-coco-stuff-id2label.json" - elif "coco" in model_name: - # TODO - config.num_labels = 133 - filename = "coco-panoptic-id2label.json" - elif "cityscapes" in model_name: - # this should be ok - config.num_labels = 19 - filename = "cityscapes-id2label.json" - elif "vistas" in model_name: - # this should be ok - config.num_labels = 65 - filename = "mapillary-vistas-id2label.json" - - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - - return config - - -def create_rename_keys(config): - rename_keys = [] - # stem - # fmt: off - rename_keys.append(("backbone.patch_embed.proj.weight", 
"model.pixel_level_module.encoder.model.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("backbone.patch_embed.proj.bias", "model.pixel_level_module.encoder.model.embeddings.patch_embeddings.projection.bias")) - rename_keys.append(("backbone.patch_embed.norm.weight", "model.pixel_level_module.encoder.model.embeddings.norm.weight")) - rename_keys.append(("backbone.patch_embed.norm.bias", "model.pixel_level_module.encoder.model.embeddings.norm.bias")) - # stages - for i in range(len(config.backbone_config.depths)): - for j in range(config.backbone_config.depths[i]): - rename_keys.append((f"backbone.layers.{i}.blocks.{j}.norm1.weight", f"model.pixel_level_module.encoder.model.encoder.layers.{i}.blocks.{j}.layernorm_before.weight")) - rename_keys.append((f"backbone.layers.{i}.blocks.{j}.norm1.bias", f"model.pixel_level_module.encoder.model.encoder.layers.{i}.blocks.{j}.layernorm_before.bias")) - rename_keys.append((f"backbone.layers.{i}.blocks.{j}.attn.relative_position_bias_table", f"model.pixel_level_module.encoder.model.encoder.layers.{i}.blocks.{j}.attention.self.relative_position_bias_table")) - rename_keys.append((f"backbone.layers.{i}.blocks.{j}.attn.relative_position_index", f"model.pixel_level_module.encoder.model.encoder.layers.{i}.blocks.{j}.attention.self.relative_position_index")) - rename_keys.append((f"backbone.layers.{i}.blocks.{j}.attn.proj.weight", f"model.pixel_level_module.encoder.model.encoder.layers.{i}.blocks.{j}.attention.output.dense.weight")) - rename_keys.append((f"backbone.layers.{i}.blocks.{j}.attn.proj.bias", f"model.pixel_level_module.encoder.model.encoder.layers.{i}.blocks.{j}.attention.output.dense.bias")) - rename_keys.append((f"backbone.layers.{i}.blocks.{j}.norm2.weight", f"model.pixel_level_module.encoder.model.encoder.layers.{i}.blocks.{j}.layernorm_after.weight")) - rename_keys.append((f"backbone.layers.{i}.blocks.{j}.norm2.bias", f"model.pixel_level_module.encoder.model.encoder.layers.{i}.blocks.{j}.layernorm_after.bias")) - rename_keys.append((f"backbone.layers.{i}.blocks.{j}.mlp.fc1.weight", f"model.pixel_level_module.encoder.model.encoder.layers.{i}.blocks.{j}.intermediate.dense.weight")) - rename_keys.append((f"backbone.layers.{i}.blocks.{j}.mlp.fc1.bias", f"model.pixel_level_module.encoder.model.encoder.layers.{i}.blocks.{j}.intermediate.dense.bias")) - rename_keys.append((f"backbone.layers.{i}.blocks.{j}.mlp.fc2.weight", f"model.pixel_level_module.encoder.model.encoder.layers.{i}.blocks.{j}.output.dense.weight")) - rename_keys.append((f"backbone.layers.{i}.blocks.{j}.mlp.fc2.bias", f"model.pixel_level_module.encoder.model.encoder.layers.{i}.blocks.{j}.output.dense.bias")) - - if i < 3: - rename_keys.append((f"backbone.layers.{i}.downsample.reduction.weight", f"model.pixel_level_module.encoder.model.encoder.layers.{i}.downsample.reduction.weight")) - rename_keys.append((f"backbone.layers.{i}.downsample.norm.weight", f"model.pixel_level_module.encoder.model.encoder.layers.{i}.downsample.norm.weight")) - rename_keys.append((f"backbone.layers.{i}.downsample.norm.bias", f"model.pixel_level_module.encoder.model.encoder.layers.{i}.downsample.norm.bias")) - rename_keys.append((f"backbone.norm{i}.weight", f"model.pixel_level_module.encoder.hidden_states_norms.{i}.weight")) - rename_keys.append((f"backbone.norm{i}.bias", f"model.pixel_level_module.encoder.hidden_states_norms.{i}.bias")) - - # FPN - rename_keys.append(("sem_seg_head.layer_4.weight", "model.pixel_level_module.decoder.fpn.stem.0.weight")) - 
rename_keys.append(("sem_seg_head.layer_4.norm.weight", "model.pixel_level_module.decoder.fpn.stem.1.weight")) - rename_keys.append(("sem_seg_head.layer_4.norm.bias", "model.pixel_level_module.decoder.fpn.stem.1.bias")) - for source_index, target_index in zip(range(3, 0, -1), range(0, 3)): - rename_keys.append((f"sem_seg_head.adapter_{source_index}.weight", f"model.pixel_level_module.decoder.fpn.layers.{target_index}.proj.0.weight")) - rename_keys.append((f"sem_seg_head.adapter_{source_index}.norm.weight", f"model.pixel_level_module.decoder.fpn.layers.{target_index}.proj.1.weight")) - rename_keys.append((f"sem_seg_head.adapter_{source_index}.norm.bias", f"model.pixel_level_module.decoder.fpn.layers.{target_index}.proj.1.bias")) - rename_keys.append((f"sem_seg_head.layer_{source_index}.weight", f"model.pixel_level_module.decoder.fpn.layers.{target_index}.block.0.weight")) - rename_keys.append((f"sem_seg_head.layer_{source_index}.norm.weight", f"model.pixel_level_module.decoder.fpn.layers.{target_index}.block.1.weight")) - rename_keys.append((f"sem_seg_head.layer_{source_index}.norm.bias", f"model.pixel_level_module.decoder.fpn.layers.{target_index}.block.1.bias")) - rename_keys.append(("sem_seg_head.mask_features.weight", "model.pixel_level_module.decoder.mask_projection.weight")) - rename_keys.append(("sem_seg_head.mask_features.bias", "model.pixel_level_module.decoder.mask_projection.bias")) - - # Transformer decoder - for idx in range(config.decoder_config.decoder_layers): - # self-attention out projection - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.self_attn.out_proj.weight", f"model.transformer_module.decoder.layers.{idx}.self_attn.out_proj.weight")) - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.self_attn.out_proj.bias", f"model.transformer_module.decoder.layers.{idx}.self_attn.out_proj.bias")) - # cross-attention out projection - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.multihead_attn.out_proj.weight", f"model.transformer_module.decoder.layers.{idx}.encoder_attn.out_proj.weight")) - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.multihead_attn.out_proj.bias", f"model.transformer_module.decoder.layers.{idx}.encoder_attn.out_proj.bias")) - # MLP 1 - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.linear1.weight", f"model.transformer_module.decoder.layers.{idx}.fc1.weight")) - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.linear1.bias", f"model.transformer_module.decoder.layers.{idx}.fc1.bias")) - # MLP 2 - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.linear2.weight", f"model.transformer_module.decoder.layers.{idx}.fc2.weight")) - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.linear2.bias", f"model.transformer_module.decoder.layers.{idx}.fc2.bias")) - # layernorm 1 (self-attention layernorm) - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.norm1.weight", f"model.transformer_module.decoder.layers.{idx}.self_attn_layer_norm.weight")) - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.norm1.bias", f"model.transformer_module.decoder.layers.{idx}.self_attn_layer_norm.bias")) - # layernorm 2 (cross-attention layernorm) - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.norm2.weight", 
f"model.transformer_module.decoder.layers.{idx}.encoder_attn_layer_norm.weight")) - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.norm2.bias", f"model.transformer_module.decoder.layers.{idx}.encoder_attn_layer_norm.bias")) - # layernorm 3 (final layernorm) - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.norm3.weight", f"model.transformer_module.decoder.layers.{idx}.final_layer_norm.weight")) - rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.norm3.bias", f"model.transformer_module.decoder.layers.{idx}.final_layer_norm.bias")) - - rename_keys.append(("sem_seg_head.predictor.transformer.decoder.norm.weight", "model.transformer_module.decoder.layernorm.weight")) - rename_keys.append(("sem_seg_head.predictor.transformer.decoder.norm.bias", "model.transformer_module.decoder.layernorm.bias")) - - # heads on top - rename_keys.append(("sem_seg_head.predictor.query_embed.weight", "model.transformer_module.queries_embedder.weight")) - - rename_keys.append(("sem_seg_head.predictor.input_proj.weight", "model.transformer_module.input_projection.weight")) - rename_keys.append(("sem_seg_head.predictor.input_proj.bias", "model.transformer_module.input_projection.bias")) - - rename_keys.append(("sem_seg_head.predictor.class_embed.weight", "class_predictor.weight")) - rename_keys.append(("sem_seg_head.predictor.class_embed.bias", "class_predictor.bias")) - - for i in range(3): - rename_keys.append((f"sem_seg_head.predictor.mask_embed.layers.{i}.weight", f"mask_embedder.{i}.0.weight")) - rename_keys.append((f"sem_seg_head.predictor.mask_embed.layers.{i}.bias", f"mask_embedder.{i}.0.bias")) - # fmt: on - - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_swin_q_k_v(state_dict, backbone_config): - num_features = [int(backbone_config.embed_dim * 2**i) for i in range(len(backbone_config.depths))] - for i in range(len(backbone_config.depths)): - dim = num_features[i] - for j in range(backbone_config.depths[i]): - # fmt: off - # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"backbone.layers.{i}.blocks.{j}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"backbone.layers.{i}.blocks.{j}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.pixel_level_module.encoder.model.encoder.layers.{i}.blocks.{j}.attention.self.query.weight"] = in_proj_weight[:dim, :] - state_dict[f"model.pixel_level_module.encoder.model.encoder.layers.{i}.blocks.{j}.attention.self.query.bias"] = in_proj_bias[: dim] - state_dict[f"model.pixel_level_module.encoder.model.encoder.layers.{i}.blocks.{j}.attention.self.key.weight"] = in_proj_weight[ - dim : dim * 2, : - ] - state_dict[f"model.pixel_level_module.encoder.model.encoder.layers.{i}.blocks.{j}.attention.self.key.bias"] = in_proj_bias[ - dim : dim * 2 - ] - state_dict[f"model.pixel_level_module.encoder.model.encoder.layers.{i}.blocks.{j}.attention.self.value.weight"] = in_proj_weight[ - -dim :, : - ] - state_dict[f"model.pixel_level_module.encoder.model.encoder.layers.{i}.blocks.{j}.attention.self.value.bias"] = in_proj_bias[-dim :] - # fmt: on - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_decoder_q_k_v(state_dict, config): - # fmt: off - hidden_size = 
config.decoder_config.hidden_size - for idx in range(config.decoder_config.decoder_layers): - # read in weights + bias of self-attention input projection layer (in the original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.transformer_module.decoder.layers.{idx}.self_attn.q_proj.weight"] = in_proj_weight[: hidden_size, :] - state_dict[f"model.transformer_module.decoder.layers.{idx}.self_attn.q_proj.bias"] = in_proj_bias[:config.hidden_size] - state_dict[f"model.transformer_module.decoder.layers.{idx}.self_attn.k_proj.weight"] = in_proj_weight[hidden_size : hidden_size * 2, :] - state_dict[f"model.transformer_module.decoder.layers.{idx}.self_attn.k_proj.bias"] = in_proj_bias[hidden_size : hidden_size * 2] - state_dict[f"model.transformer_module.decoder.layers.{idx}.self_attn.v_proj.weight"] = in_proj_weight[-hidden_size :, :] - state_dict[f"model.transformer_module.decoder.layers.{idx}.self_attn.v_proj.bias"] = in_proj_bias[-hidden_size :] - # read in weights + bias of cross-attention input projection layer (in the original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.multihead_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.multihead_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.transformer_module.decoder.layers.{idx}.encoder_attn.q_proj.weight"] = in_proj_weight[: hidden_size, :] - state_dict[f"model.transformer_module.decoder.layers.{idx}.encoder_attn.q_proj.bias"] = in_proj_bias[:config.hidden_size] - state_dict[f"model.transformer_module.decoder.layers.{idx}.encoder_attn.k_proj.weight"] = in_proj_weight[hidden_size : hidden_size * 2, :] - state_dict[f"model.transformer_module.decoder.layers.{idx}.encoder_attn.k_proj.bias"] = in_proj_bias[hidden_size : hidden_size * 2] - state_dict[f"model.transformer_module.decoder.layers.{idx}.encoder_attn.v_proj.weight"] = in_proj_weight[-hidden_size :, :] - state_dict[f"model.transformer_module.decoder.layers.{idx}.encoder_attn.v_proj.bias"] = in_proj_bias[-hidden_size :] - # fmt: on - - -# We will verify our results on an image of cute cats -def prepare_img() -> torch.Tensor: - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_maskformer_checkpoint( - model_name: str, checkpoint_path: str, pytorch_dump_folder_path: str, push_to_hub: bool = False -): - """ - Copy/paste/tweak model's weights to our MaskFormer structure. 
- """ - config = get_maskformer_config(model_name) - - # load original state_dict - with open(checkpoint_path, "rb") as f: - data = pickle.load(f) - state_dict = data["model"] - - # for name, param in state_dict.items(): - # print(name, param.shape) - - # rename keys - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_swin_q_k_v(state_dict, config.backbone_config) - read_in_decoder_q_k_v(state_dict, config) - - # update to torch tensors - for key, value in state_dict.items(): - state_dict[key] = torch.from_numpy(value) - - # load 🤗 model - model = MaskFormerForInstanceSegmentation(config) - model.eval() - - for name, param in model.named_parameters(): - print(name, param.shape) - - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - assert missing_keys == [ - "model.pixel_level_module.encoder.model.layernorm.weight", - "model.pixel_level_module.encoder.model.layernorm.bias", - ] - assert len(unexpected_keys) == 0, f"Unexpected keys: {unexpected_keys}" - - # verify results - image = prepare_img() - if "vistas" in model_name: - ignore_index = 65 - elif "cityscapes" in model_name: - ignore_index = 65535 - else: - ignore_index = 255 - do_reduce_labels = "ade" in model_name - image_processor = MaskFormerImageProcessor(ignore_index=ignore_index, do_reduce_labels=do_reduce_labels) - - inputs = image_processor(image, return_tensors="pt") - - outputs = model(**inputs) - - print("Logits:", outputs.class_queries_logits[0, :3, :3]) - - if model_name == "maskformer-swin-tiny-ade": - expected_logits = torch.tensor( - [[3.6353, -4.4770, -2.6065], [0.5081, -4.2394, -3.5343], [2.1909, -5.0353, -1.9323]] - ) - assert torch.allclose(outputs.class_queries_logits[0, :3, :3], expected_logits, atol=1e-4) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - print(f"Saving model and image processor to {pytorch_dump_folder_path}") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - image_processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model and image processor to the hub...") - model.push_to_hub(f"nielsr/{model_name}") - image_processor.push_to_hub(f"nielsr/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="maskformer-swin-tiny-ade", - type=str, - help=("Name of the MaskFormer model you'd like to convert",), - ) - parser.add_argument( - "--checkpoint_path", - default="/Users/nielsrogge/Documents/MaskFormer_checkpoints/MaskFormer-Swin-tiny-ADE20k/model.pkl", - type=str, - help="Path to the original state dict (.pth file).\n" - "Given the files are in the pickle format, please be wary of passing it files you trust.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." 
- ) - - args = parser.parse_args() - convert_maskformer_checkpoint( - args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub - ) diff --git a/src/transformers/models/maskformer/image_processing_maskformer.py b/src/transformers/models/maskformer/image_processing_maskformer.py index 9ce33846170e..c2f9aee70167 100644 --- a/src/transformers/models/maskformer/image_processing_maskformer.py +++ b/src/transformers/models/maskformer/image_processing_maskformer.py @@ -308,7 +308,7 @@ def compute_segments( # TODO: (Amy) Move to image_transforms def convert_segmentation_map_to_binary_masks( - segmentation_map: "np.ndarray", + segmentation_map: np.ndarray, instance_id_to_semantic_id: Optional[dict[int, int]] = None, ignore_index: Optional[int] = None, do_reduce_labels: bool = False, @@ -585,7 +585,7 @@ def rescale( def convert_segmentation_map_to_binary_masks( self, - segmentation_map: "np.ndarray", + segmentation_map: np.ndarray, instance_id_to_semantic_id: Optional[dict[int, int]] = None, ignore_index: Optional[int] = None, do_reduce_labels: bool = False, diff --git a/src/transformers/models/maskformer/image_processing_maskformer_fast.py b/src/transformers/models/maskformer/image_processing_maskformer_fast.py index ab6411f1bb3f..9e15486cfa35 100644 --- a/src/transformers/models/maskformer/image_processing_maskformer_fast.py +++ b/src/transformers/models/maskformer/image_processing_maskformer_fast.py @@ -20,6 +20,7 @@ import torch from torch import nn +from torchvision.transforms.v2 import functional as F from ...image_processing_utils import BatchFeature, get_size_dict from ...image_processing_utils_fast import ( @@ -42,7 +43,6 @@ from ...utils import ( TensorType, auto_docstring, - is_torchvision_v2_available, logging, ) from .image_processing_maskformer import ( @@ -53,11 +53,6 @@ ) -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - logger = logging.get_logger(__name__) @@ -354,9 +349,7 @@ def _preprocess( image=grouped_segmentation_maps[shape], size=size, size_divisor=size_divisor, - interpolation=F.InterpolationMode.NEAREST_EXACT - if is_torchvision_v2_available() - else F.InterpolationMode.NEAREST, + interpolation=F.InterpolationMode.NEAREST_EXACT, ) resized_images_grouped[shape] = stacked_images if segmentation_maps is not None: diff --git a/src/transformers/models/maskformer/modeling_maskformer.py b/src/transformers/models/maskformer/modeling_maskformer.py index 9e1c0072425b..772f0a9fad0a 100644 --- a/src/transformers/models/maskformer/modeling_maskformer.py +++ b/src/transformers/models/maskformer/modeling_maskformer.py @@ -1088,7 +1088,7 @@ def get_num_masks(self, class_labels: torch.Tensor, device: torch.device) -> tor """ Computes the average number of target masks across the batch, for normalization purposes. """ - num_masks = sum([len(classes) for classes in class_labels]) + num_masks = sum(len(classes) for classes in class_labels) num_masks = torch.as_tensor(num_masks, dtype=torch.float, device=device) world_size = 1 if is_accelerate_available(): diff --git a/src/transformers/models/mbart/convert_mbart_original_checkpoint_to_pytorch.py b/src/transformers/models/mbart/convert_mbart_original_checkpoint_to_pytorch.py deleted file mode 100644 index 909b7b41284c..000000000000 --- a/src/transformers/models/mbart/convert_mbart_original_checkpoint_to_pytorch.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. 
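
The MaskFormer conversion scripts above produce standard Transformers checkpoints, so a converted model can be consumed through the public API without any of the conversion machinery. A minimal usage sketch, assuming one of the converted repos on the Hub (the repo id below is the Swin-tiny ADE checkpoint referenced by the script's default arguments):

```python
import requests
import torch
from PIL import Image

from transformers import MaskFormerForInstanceSegmentation, MaskFormerImageProcessor

# Example repo id; any of the converted MaskFormer checkpoints should work the same way
repo_id = "facebook/maskformer-swin-tiny-ade"
image_processor = MaskFormerImageProcessor.from_pretrained(repo_id)
model = MaskFormerForInstanceSegmentation.from_pretrained(repo_id)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = image_processor(image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Combine the per-query class and mask logits into a single semantic segmentation map
semantic_map = image_processor.post_process_semantic_segmentation(
    outputs, target_sizes=[image.size[::-1]]
)[0]
print(semantic_map.shape)  # (height, width), one predicted class id per pixel
```
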
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse - -import torch -from torch import nn - -from transformers import MBartConfig, MBartForConditionalGeneration - - -def remove_ignore_keys_(state_dict): - ignore_keys = [ - "encoder.version", - "decoder.version", - "model.encoder.version", - "model.decoder.version", - "_float_tensor", - "decoder.output_projection.weight", - ] - for k in ignore_keys: - state_dict.pop(k, None) - - -def make_linear_from_emb(emb): - vocab_size, emb_size = emb.weight.shape - lin_layer = nn.Linear(vocab_size, emb_size, bias=False) - lin_layer.weight.data = emb.weight.data - return lin_layer - - -def convert_fairseq_mbart_checkpoint_from_disk( - checkpoint_path, hf_config_path="facebook/mbart-large-en-ro", finetuned=False, mbart_50=False -): - state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["model"] - remove_ignore_keys_(state_dict) - vocab_size = state_dict["encoder.embed_tokens.weight"].shape[0] - - mbart_config = MBartConfig.from_pretrained(hf_config_path, vocab_size=vocab_size) - if mbart_50 and finetuned: - mbart_config.activation_function = "relu" - - state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"] - model = MBartForConditionalGeneration(mbart_config) - model.model.load_state_dict(state_dict) - - if finetuned: - model.lm_head = make_linear_from_emb(model.model.shared) - - return model - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "fairseq_path", type=str, help="bart.large, bart.large.cnn or a path to a model.pt on local filesystem." - ) - parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument( - "--hf_config", - default="facebook/mbart-large-cc25", - type=str, - help="Which huggingface architecture to use: mbart-large", - ) - parser.add_argument("--mbart_50", action="store_true", help="whether the model is mMART-50 checkpoint") - parser.add_argument("--finetuned", action="store_true", help="whether the model is a fine-tuned checkpoint") - args = parser.parse_args() - model = convert_fairseq_mbart_checkpoint_from_disk( - args.fairseq_path, hf_config_path=args.hf_config, finetuned=args.finetuned, mbart_50=args.mbart_50 - ) - model.save_pretrained(args.pytorch_dump_folder_path) diff --git a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py deleted file mode 100644 index 69ebed5aa8e0..000000000000 --- a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py +++ /dev/null @@ -1,334 +0,0 @@ -#################################################################################################### - -# Copyright (c) 2021-, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#################################################################################################### - -# -# Note: If when running this conversion script you're getting an exception: -# ModuleNotFoundError: No module named 'megatron.model.enums' -# you need to tell python where to find the clone of Megatron-LM, e.g.: -# -# cd /tmp -# git clone https://github.com/NVIDIA/Megatron-LM -# PYTHONPATH=/tmp/Megatron-LM python src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py ... -# -# if you already have it cloned elsewhere, simply adjust the path to the existing path -# -# If the training was done using a Megatron-LM fork, e.g., -# https://github.com/microsoft/Megatron-DeepSpeed/ then chances are that you need to have that one -# in your path, i.e., /path/to/Megatron-DeepSpeed/ -# - -import argparse -import os -import re -import zipfile - -import torch - -from transformers import MegatronBertConfig - - -#################################################################################################### - - -def recursive_print(name, val, spaces=0): - # Format the message. - if name is None: - msg = None - else: - fmt = "." * max(0, spaces - 2) + "# {:" + str(50 - spaces) + "s}" - msg = fmt.format(name) - - # Print and recurse (if needed). - if isinstance(val, dict): - if msg is not None: - print(msg) - for k in val: - recursive_print(k, val[k], spaces + 2) - elif isinstance(val, torch.Tensor): - print(msg, ":", val.size()) - else: - print(msg, ":", val) - - -def fix_query_key_value_ordering(param, checkpoint_version, num_splits, num_heads, hidden_size): - # Permutes layout of param tensor to [num_splits * num_heads * hidden_size, :] - # for compatibility with later versions of NVIDIA Megatron-LM. - # The inverse operation is performed inside Megatron-LM to read checkpoints: - # https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/checkpointing.py#L209 - # If param is the weight tensor of the self-attention block, the returned tensor - # will have to be transposed one more time to be read by HuggingFace BERT. - input_shape = param.size() - if checkpoint_version == 1.0: - # version 1.0 stores [num_heads * hidden_size * num_splits, :] - saved_shape = (num_heads, hidden_size, num_splits) + input_shape[1:] - param = param.view(*saved_shape) - param = param.transpose(0, 2) - param = param.transpose(1, 2).contiguous() - elif checkpoint_version >= 2.0: - # other versions store [num_heads * num_splits * hidden_size, :] - saved_shape = (num_heads, num_splits, hidden_size) + input_shape[1:] - param = param.view(*saved_shape) - param = param.transpose(0, 1).contiguous() - param = param.view(*input_shape) - return param - - -#################################################################################################### - - -def convert_megatron_checkpoint(args, input_state_dict, config): - # The converted output model. 
- output_state_dict = {} - - # old versions did not store training args - ds_args = input_state_dict.get("args", None) - if ds_args is not None: - # do not make the user write a config file when the exact dimensions/sizes are already in the checkpoint - # from pprint import pprint - # pprint(vars(ds_args)) - - config.tokenizer_type = ds_args.tokenizer_type - config.vocab_size = ds_args.padded_vocab_size - config.max_position_embeddings = ds_args.max_position_embeddings - config.hidden_size = ds_args.hidden_size - config.num_hidden_layers = ds_args.num_layers - config.num_attention_heads = ds_args.num_attention_heads - config.intermediate_size = ds_args.ffn_hidden_size if "ffn_hidden_size" in ds_args else 4 * ds_args.hidden_size - # pprint(config) - - # The number of heads. - heads = config.num_attention_heads - # The hidden_size per head. - hidden_size_per_head = config.hidden_size // heads - # Megatron-LM checkpoint version - if "checkpoint_version" in input_state_dict: - checkpoint_version = input_state_dict["checkpoint_version"] - else: - checkpoint_version = 0.0 - - # The model. - model = input_state_dict["model"] - # The language model. - lm = model["language_model"] - # The embeddings. - embeddings = lm["embedding"] - - # The word embeddings. - word_embeddings = embeddings["word_embeddings"]["weight"] - # Truncate the embedding table to vocab_size rows. - word_embeddings = word_embeddings[: config.vocab_size, :] - # Store the word embeddings. - output_state_dict["bert.embeddings.word_embeddings.weight"] = word_embeddings - - # The position embeddings. - pos_embeddings = embeddings["position_embeddings"]["weight"] - assert pos_embeddings.size(0) == config.max_position_embeddings and pos_embeddings.size(1) == config.hidden_size - # Store the position embeddings. - output_state_dict["bert.embeddings.position_embeddings.weight"] = pos_embeddings - - # The token-type embeddings. - tokentype_embeddings = embeddings["tokentype_embeddings"]["weight"] - # Store the position embeddings. - output_state_dict["bert.embeddings.token_type_embeddings.weight"] = tokentype_embeddings - - # The transformer. - transformer = lm["transformer"] if "transformer" in lm else lm["encoder"] - - # The regex to extract layer names. - layer_re = re.compile(r"layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)") - - # The simple map of names for "automated" rules. - megatron_to_transformers = { - "attention.dense": ".attention.output.dense.", - "self_attention.dense": ".attention.output.dense.", - "mlp.dense_h_to_4h": ".intermediate.dense.", - "mlp.dense_4h_to_h": ".output.dense.", - } - - # Keep track of the attention/query/value tensor. - attention_qkv_weight = None - - # Extract the layers. - for key, val in transformer.items(): - # Match the name. - m = layer_re.match(key) - - # Stop if that's not a layer - if m is None: - break - - # The index of the layer. - layer_idx = int(m.group(1)) - # The name of the operation. - op_name = m.group(2) - # Is it a weight or a bias? - weight_or_bias = m.group(3) - - # The name of the layer. - layer_name = f"bert.encoder.layer.{layer_idx}" - - # For layernorm(s), simply store the layer norm. - if op_name.endswith("layernorm"): - ln_name = "attention.ln" if op_name.startswith("input") else "ln" - output_state_dict[layer_name + "." + ln_name + "." + weight_or_bias] = val - - # Transpose the QKV matrix. - elif ( - op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value" - ) and weight_or_bias == "weight": - # Make sure the QKV pointer is nil. 
- assert attention_qkv_weight is None, "" - - out_val = fix_query_key_value_ordering(val, checkpoint_version, 3, heads, hidden_size_per_head) - # Store the tensor as we need the bias as well to interleave QKV and biases. - attention_qkv_weight = out_val - - # Transpose the bias. - elif ( - op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value" - ) and weight_or_bias == "bias": - # Make sure we read the weight tensor. - assert attention_qkv_weight is not None, "" - - # Split the QKV matrix into Q, K and V. Megatron stores Q,K,V interleaved. - q = attention_qkv_weight[0 * config.hidden_size : 1 * config.hidden_size, :] - k = attention_qkv_weight[1 * config.hidden_size : 2 * config.hidden_size, :] - v = attention_qkv_weight[2 * config.hidden_size : 3 * config.hidden_size, :] - - out_val = fix_query_key_value_ordering(val, checkpoint_version, 3, heads, hidden_size_per_head) - # Split the bias. - q_bias = out_val[0 * config.hidden_size : 1 * config.hidden_size] - k_bias = out_val[1 * config.hidden_size : 2 * config.hidden_size] - v_bias = out_val[2 * config.hidden_size : 3 * config.hidden_size] - - # Store. - output_state_dict[f"{layer_name}.attention.self.query.weight"] = q - output_state_dict[f"{layer_name}.attention.self.query.bias"] = q_bias - output_state_dict[f"{layer_name}.attention.self.key.weight"] = k - output_state_dict[f"{layer_name}.attention.self.key.bias"] = k_bias - output_state_dict[f"{layer_name}.attention.self.value.weight"] = v - output_state_dict[f"{layer_name}.attention.self.value.bias"] = v_bias - - # Clear the stored tensor. - attention_qkv_weight = None - - # Copy weights and biases as is. - elif weight_or_bias in ["weight", "bias"]: - out_name = megatron_to_transformers[op_name] - output_state_dict[layer_name + out_name + weight_or_bias] = val - - # The final layernorm. - output_state_dict["bert.encoder.ln.weight"] = transformer["final_layernorm.weight"] - output_state_dict["bert.encoder.ln.bias"] = transformer["final_layernorm.bias"] - - # The pooler. - pooler = lm["pooler"] - - # Store the matrix and the bias. - output_state_dict["bert.pooler.dense.weight"] = pooler["dense.weight"] - output_state_dict["bert.pooler.dense.bias"] = pooler["dense.bias"] - - # The LM head from Megatron (for RACE). - lm_head = model["lm_head"] - - # The transform matrix. - output_state_dict["cls.predictions.transform.dense.weight"] = lm_head["dense.weight"] - output_state_dict["cls.predictions.transform.dense.bias"] = lm_head["dense.bias"] - - # The transform LN. - output_state_dict["cls.predictions.transform.LayerNorm.weight"] = lm_head["layernorm.weight"] - output_state_dict["cls.predictions.transform.LayerNorm.bias"] = lm_head["layernorm.bias"] - - # For the decoder, we replicate the weights. - output_state_dict["cls.predictions.decoder.weight"] = word_embeddings - output_state_dict["cls.predictions.bias"] = lm_head["bias"] - - # The classifier from Megatron (for MLNI). - binary_head = model["binary_head"] - - # Store the classifier. - output_state_dict["cls.seq_relationship.weight"] = binary_head["weight"] - output_state_dict["cls.seq_relationship.bias"] = binary_head["bias"] - - # It should be done! - return output_state_dict - - -#################################################################################################### - - -def main(): - # Create the argument parser. 
- parser = argparse.ArgumentParser() - parser.add_argument("--print-checkpoint-structure", action="store_true") - parser.add_argument("path_to_checkpoint", type=str, help="Path to the ZIP file containing the checkpoint") - parser.add_argument( - "--config_file", - default="", - type=str, - help="An optional config json file describing the pre-trained model.", - ) - args = parser.parse_args() - - # Extract the basename. - basename = os.path.dirname(args.path_to_checkpoint) - - # Load the model. - # the .zip is very optional, let's keep it for backward compatibility - print(f'Extracting PyTorch state dictionary from "{args.path_to_checkpoint}"') - if args.path_to_checkpoint.endswith(".zip"): - with zipfile.ZipFile(args.path_to_checkpoint, "r") as checkpoint: - with checkpoint.open("release/mp_rank_00/model_optim_rng.pt") as pytorch_dict: - input_state_dict = torch.load(pytorch_dict, map_location="cpu", weights_only=True) - else: - input_state_dict = torch.load(args.path_to_checkpoint, map_location="cpu", weights_only=True) - - if args.config_file == "": - # Default config of megatron-bert 345m - config = MegatronBertConfig() - - # different megatron-bert-*-345m models have different vocab sizes, so override the default - # config (which is for megatron-bert-cased-345m) with the actual vocab dimension - config.vocab_size = input_state_dict["model"]["lm_head"]["bias"].numel() - else: - config = MegatronBertConfig.from_json_file(args.config_file) - - # Convert. - print("Converting") - output_state_dict = convert_megatron_checkpoint(args, input_state_dict, config) - - # Print the structure of converted state dict. - if args.print_checkpoint_structure: - recursive_print(None, output_state_dict) - - # Store the config to file. - print("Saving config") - config.save_pretrained(basename) - - # Store the state_dict to file. - output_checkpoint_file = os.path.join(basename, "pytorch_model.bin") - print(f'Saving checkpoint to "{output_checkpoint_file}"') - torch.save(output_state_dict, output_checkpoint_file) - - -#################################################################################################### - -if __name__ == "__main__": - main() - -#################################################################################################### diff --git a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py deleted file mode 100644 index d1953f50baed..000000000000 --- a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py +++ /dev/null @@ -1,430 +0,0 @@ -#################################################################################################### - -# Copyright (c) 2021-, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
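
The Megatron-BERT converter above writes a `config.json` and a `pytorch_model.bin` next to the original checkpoint, so the result loads like any other Transformers model. A minimal sketch, assuming a hypothetical output folder and the uncased 345M variant (the converter does not export a tokenizer, so a standard uncased BERT vocabulary is paired with it here as an assumption):

```python
from transformers import BertTokenizer, MegatronBertForPreTraining

# Hypothetical folder produced by running the conversion script on a 345M checkpoint
folder = "./megatron_bert_uncased_345m"

# The converted state dict uses the bert.* / cls.* naming expected by MegatronBertForPreTraining
model = MegatronBertForPreTraining.from_pretrained(folder)

# Assumption: the uncased checkpoint was trained with the standard BERT uncased WordPiece vocab
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

inputs = tokenizer("Megatron-LM checkpoints convert to the Transformers format.", return_tensors="pt")
outputs = model(**inputs)
print(outputs.prediction_logits.shape)        # (batch, sequence_length, vocab_size)
print(outputs.seq_relationship_logits.shape)  # (batch, 2)
```
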
- -#################################################################################################### - -# -# Note: If when running this conversion script you're getting an exception: -# ModuleNotFoundError: No module named 'megatron.model.enums' -# you need to tell python where to find the clone of Megatron-LM, e.g.: -# -# cd /tmp -# git clone https://github.com/NVIDIA/Megatron-LM -# PYTHONPATH=/tmp/Megatron-LM python src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py ... -# -# if you already have it cloned elsewhere, simply adjust the path to the existing path -# -# If the training was done using a Megatron-LM fork, e.g., -# https://github.com/microsoft/Megatron-DeepSpeed/ then chances are that you need to have that one -# in your path, i.e., /path/to/Megatron-DeepSpeed/ -# - -import argparse -import os -import re -import zipfile - -import torch - -from transformers import AutoTokenizer, GPT2Config - - -#################################################################################################### - - -def recursive_print(name, val, spaces=0): - # Format the message. - if name is None: - msg = None - else: - fmt = "." * max(0, spaces - 2) + "# {:" + str(50 - spaces) + "s}" - msg = fmt.format(name) - - # Print and recurse (if needed). - if isinstance(val, dict): - if msg is not None: - print(msg) - for k in val: - recursive_print(k, val[k], spaces + 2) - elif isinstance(val, torch.Tensor): - print(msg, ":", val.size()) - else: - print(msg, ":", val) - - -def fix_query_key_value_ordering(param, checkpoint_version, num_splits, num_heads, hidden_size): - # Permutes layout of param tensor to [num_splits * num_heads * hidden_size, :] - # for compatibility with later versions of NVIDIA Megatron-LM. - # The inverse operation is performed inside Megatron-LM to read checkpoints: - # https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/checkpointing.py#L209 - # If param is the weight tensor of the self-attention block, the returned tensor - # will have to be transposed one more time to be read by HuggingFace GPT2. - input_shape = param.size() - if checkpoint_version == 1.0: - # version 1.0 stores [num_heads * hidden_size * num_splits, :] - saved_shape = (num_heads, hidden_size, num_splits) + input_shape[1:] - param = param.view(*saved_shape) - param = param.transpose(0, 2) - param = param.transpose(1, 2).contiguous() - elif checkpoint_version >= 2.0: - # other versions store [num_heads * num_splits * hidden_size, :] - saved_shape = (num_heads, num_splits, hidden_size) + input_shape[1:] - param = param.view(*saved_shape) - param = param.transpose(0, 1).contiguous() - param = param.view(*input_shape) - return param - - -#################################################################################################### - - -def convert_megatron_checkpoint(args, input_state_dict, config): - # The converted output model. - output_state_dict = {} - - # old versions did not store training args - ds_args = input_state_dict.get("args", None) - if ds_args is not None: - # do not make the user write a config file when the exact dimensions/sizes are already in the checkpoint - # from pprint import pprint - # pprint(vars(ds_args)) - - config.vocab_size = ds_args.padded_vocab_size - config.n_positions = ds_args.max_position_embeddings - config.n_embd = ds_args.hidden_size - config.n_layer = ds_args.num_layers - config.n_head = ds_args.num_attention_heads - config.n_inner = ds_args.ffn_hidden_size - # pprint(config) - - # The number of heads. 
- heads = config.n_head - # The hidden_size per head. - hidden_size_per_head = config.n_embd // config.n_head - # Megatron-LM checkpoint version - if "checkpoint_version" in input_state_dict: - checkpoint_version = input_state_dict["checkpoint_version"] - else: - checkpoint_version = 0.0 - - # The model. - model = input_state_dict["model"] - # The language model. - lm = model["language_model"] - # The embeddings. - embeddings = lm["embedding"] - - # The word embeddings. - word_embeddings = embeddings["word_embeddings"]["weight"] - # Truncate the embedding table to vocab_size rows. - word_embeddings = word_embeddings[: config.vocab_size, :] - output_state_dict["transformer.wte.weight"] = word_embeddings - - # The position embeddings. - pos_embeddings = embeddings["position_embeddings"]["weight"] - # Read the causal mask dimension (seqlen). [max_sequence_length, hidden_size] - n_positions = pos_embeddings.size(0) - if n_positions != config.n_positions: - raise ValueError( - f"pos_embeddings.max_sequence_length={n_positions} and config.n_positions={config.n_positions} don't match" - ) - # Store the position embeddings. - output_state_dict["transformer.wpe.weight"] = pos_embeddings - - # The transformer. - transformer = lm["transformer"] if "transformer" in lm else lm["encoder"] - - # The regex to extract layer names. - layer_re = re.compile(r"layers\.(\d+)\.([a-z0-9_.]+)\.([a-z0-9_]+)") - - # The simple map of names for "automated" rules. - megatron_to_transformers = { - "attention.dense": ".attn.c_proj.", - "self_attention.dense": ".attn.c_proj.", - "self_attention.proj": ".attn.c_proj.", # New format - "mlp.dense_h_to_4h": ".mlp.c_fc.", - "mlp.dense_4h_to_h": ".mlp.c_proj.", - "layernorm_mlp.fc1": ".mlp.c_fc.", # New format - "layernorm_mlp.fc2": ".mlp.c_proj.", # New format - } - - # Extract the layers. - for key, val in transformer.items(): - # Match the name. - m = layer_re.match(key) - - # Stop if that's not a layer - if m is None: - continue - - # The index of the layer. - layer_idx = int(m.group(1)) - # The name of the operation. - op_name = m.group(2) - # Is it a weight or a bias? - weight_or_bias = m.group(3) - # The name of the layer. - layer_name = f"transformer.h.{layer_idx}" - - # Handle _extra_state keys (skip them) - if weight_or_bias == "_extra_state": - continue - - # For layernorm(s), simply store the layer norm. - if op_name.endswith("layernorm") or weight_or_bias.startswith("layer_norm"): - if weight_or_bias.startswith("layer_norm"): - # New format: layers.X.self_attention.layernorm_qkv.layer_norm_weight - if op_name == "self_attention.layernorm_qkv": - ln_name = "ln_1" # Pre-attention layer norm - elif op_name == "layernorm_mlp": - ln_name = "ln_2" # Pre-MLP layer norm - else: - ln_name = "ln_1" if op_name.startswith("input") else "ln_2" - - param_name = "weight" if weight_or_bias == "layer_norm_weight" else "bias" - output_state_dict[layer_name + "." + ln_name + "." + param_name] = val - else: - # Old format - ln_name = "ln_1" if op_name.startswith("input") else "ln_2" - output_state_dict[layer_name + "." + ln_name + "." + weight_or_bias] = val - - # Handle QKV projections - new format: self_attention.layernorm_qkv.weight/bias - elif op_name == "self_attention.layernorm_qkv" and weight_or_bias in ["weight", "bias"]: - if weight_or_bias == "weight": - # Insert a tensor of 1x1xDxD bias. 
- causal_mask = torch.tril(torch.ones((n_positions, n_positions), dtype=torch.float16)).view( - 1, 1, n_positions, n_positions - ) - output_state_dict[layer_name + ".attn.bias"] = causal_mask - - # Insert a "dummy" tensor for masked_bias. - masked_bias = torch.tensor(-1e4, dtype=torch.float16) - output_state_dict[layer_name + ".attn.masked_bias"] = masked_bias - - out_val = fix_query_key_value_ordering(val, checkpoint_version, 3, heads, hidden_size_per_head) - # Megatron stores (3*D) x D but transformers-GPT2 expects D x 3*D. - out_val = out_val.transpose(0, 1).contiguous() - # Store. - output_state_dict[layer_name + ".attn.c_attn.weight"] = out_val - else: # bias - out_val = fix_query_key_value_ordering(val, checkpoint_version, 3, heads, hidden_size_per_head) - # Store. No change of shape. - output_state_dict[layer_name + ".attn.c_attn.bias"] = out_val - - # Transpose the QKV matrix - old format. - elif ( - op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value" - ) and weight_or_bias == "weight": - # Insert a tensor of 1x1xDxD bias. - causal_mask = torch.tril(torch.ones((n_positions, n_positions), dtype=torch.float16)).view( - 1, 1, n_positions, n_positions - ) - output_state_dict[layer_name + ".attn.bias"] = causal_mask - - # Insert a "dummy" tensor for masked_bias. - masked_bias = torch.tensor(-1e4, dtype=torch.float16) - output_state_dict[layer_name + ".attn.masked_bias"] = masked_bias - - out_val = fix_query_key_value_ordering(val, checkpoint_version, 3, heads, hidden_size_per_head) - # Megatron stores (3*D) x D but transformers-GPT2 expects D x 3*D. - out_val = out_val.transpose(0, 1).contiguous() - # Store. - output_state_dict[layer_name + ".attn.c_attn.weight"] = out_val - - # Transpose the bias - old format. - elif ( - op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value" - ) and weight_or_bias == "bias": - out_val = fix_query_key_value_ordering(val, checkpoint_version, 3, heads, hidden_size_per_head) - # Store. No change of shape. - output_state_dict[layer_name + ".attn.c_attn.bias"] = out_val - - # Transpose the weights. - elif weight_or_bias == "weight": - # DEBUG: Check if op_name exists in the mapping - if op_name not in megatron_to_transformers: - continue - out_name = megatron_to_transformers[op_name] - output_state_dict[layer_name + out_name + "weight"] = val.transpose(0, 1) - - # Copy the bias. - elif weight_or_bias == "bias": - # DEBUG: Check if op_name exists in the mapping - if op_name not in megatron_to_transformers: - continue - out_name = megatron_to_transformers[op_name] - output_state_dict[layer_name + out_name + "bias"] = val - - # Handle new format MLP weights/biases - elif weight_or_bias in ["fc1_weight", "fc2_weight", "fc1_bias", "fc2_bias"]: - if weight_or_bias == "fc1_weight": - output_state_dict[layer_name + ".mlp.c_fc.weight"] = val.transpose(0, 1) - elif weight_or_bias == "fc1_bias": - output_state_dict[layer_name + ".mlp.c_fc.bias"] = val - elif weight_or_bias == "fc2_weight": - output_state_dict[layer_name + ".mlp.c_proj.weight"] = val.transpose(0, 1) - elif weight_or_bias == "fc2_bias": - output_state_dict[layer_name + ".mlp.c_proj.bias"] = val - - else: - print( - f"DEBUG: Unhandled key: {key} (layer {layer_idx}, op_name: '{op_name}', weight_or_bias: '{weight_or_bias}')" - ) - - # DEBUG. - assert config.n_layer == layer_idx + 1 - - # The final layernorm - handle both old and new formats. 
- if "final_layernorm.weight" in transformer: - # Old format - output_state_dict["transformer.ln_f.weight"] = transformer["final_layernorm.weight"] - output_state_dict["transformer.ln_f.bias"] = transformer["final_layernorm.bias"] - elif "final_norm.weight" in transformer: - # New format - output_state_dict["transformer.ln_f.weight"] = transformer["final_norm.weight"] - output_state_dict["transformer.ln_f.bias"] = transformer["final_norm.bias"] - else: - print("WARNING: Could not find final layer norm weights!") - - # For LM head, transformers' wants the matrix to weight embeddings. - output_state_dict["lm_head.weight"] = word_embeddings - - # It should be done! - return output_state_dict - - -#################################################################################################### - - -def main(): - # Create the argument parser. - parser = argparse.ArgumentParser() - parser.add_argument("--print-checkpoint-structure", action="store_true") - parser.add_argument( - "path_to_checkpoint", - type=str, - help="Path to the checkpoint file (.zip archive or direct .pt file)", - ) - parser.add_argument( - "--config_file", - default="", - type=str, - help="An optional config json file describing the pre-trained model.", - ) - args = parser.parse_args() - - # Extract the basename. - basename = os.path.dirname(args.path_to_checkpoint) - - # Load the model. - # the .zip is very optional, let's keep it for backward compatibility - print(f"Extracting PyTorch state dictionary from {args.path_to_checkpoint}") - if args.path_to_checkpoint.endswith(".zip"): - with zipfile.ZipFile(args.path_to_checkpoint, "r") as checkpoint: - with checkpoint.open("release/mp_rank_00/model_optim_rng.pt") as pytorch_dict: - input_state_dict = torch.load(pytorch_dict, map_location="cpu", weights_only=True) - else: - input_state_dict = torch.load(args.path_to_checkpoint, map_location="cpu", weights_only=False) - - ds_args = input_state_dict.get("args", None) - - # Read the config, or default to the model released by NVIDIA. - if args.config_file == "": - if ds_args is not None: - if ds_args.bias_gelu_fusion: - activation_function = "gelu_fast" - elif ds_args.openai_gelu: - activation_function = "gelu_new" - else: - activation_function = "gelu" - else: - # in the very early days this used to be "gelu_new" - activation_function = "gelu_new" - - # Spell out all parameters in case the defaults change. - config = GPT2Config( - vocab_size=50257, - n_positions=1024, - n_embd=1024, - n_layer=24, - n_head=16, - n_inner=4096, - activation_function=activation_function, - resid_pdrop=0.1, - embd_pdrop=0.1, - attn_pdrop=0.1, - layer_norm_epsilon=1e-5, - initializer_range=0.02, - summary_type="cls_index", - summary_use_proj=True, - summary_activation=None, - summary_proj_to_labels=True, - summary_first_dropout=0.1, - scale_attn_weights=True, - use_cache=True, - bos_token_id=50256, - eos_token_id=50256, - ) - else: - config = GPT2Config.from_json_file(args.config_file) - - config.architectures = ["GPT2LMHeadModel"] - - # Convert. - print("Converting") - output_state_dict = convert_megatron_checkpoint(args, input_state_dict, config) - - # Print the structure of converted state dict. 
- if args.print_checkpoint_structure: - recursive_print(None, output_state_dict) - - # Add tokenizer class info to config - # see https://github.com/huggingface/transformers/issues/13906) - if ds_args is not None: - tokenizer_type = ds_args.tokenizer_type - if tokenizer_type == "GPT2BPETokenizer": - tokenizer_model_name = "openai-community/gpt2" - elif tokenizer_type == "PretrainedFromHF": - tokenizer_model_name = ds_args.tokenizer_name_or_path - else: - raise ValueError(f"Unrecognized tokenizer_type {tokenizer_type}") - else: - tokenizer_model_name = "openai-community/gpt2" - - tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_name) - tokenizer_class = type(tokenizer).__name__ - config.tokenizer_class = tokenizer_class - - # Store the config to file. - print("Saving config") - config.save_pretrained(basename) - - # Save tokenizer based on args - print(f"Adding {tokenizer_class} tokenizer files") - tokenizer.save_pretrained(basename) - - # Store the state_dict to file. - output_checkpoint_file = os.path.join(basename, "pytorch_model.bin") - print(f'Saving checkpoint to "{output_checkpoint_file}"') - torch.save(output_state_dict, output_checkpoint_file) - - -#################################################################################################### - -if __name__ == "__main__": - main() - -#################################################################################################### diff --git a/src/transformers/models/metaclip_2/configuration_metaclip_2.py b/src/transformers/models/metaclip_2/configuration_metaclip_2.py index a0cec0f3c5b3..4ad1bcde0daa 100644 --- a/src/transformers/models/metaclip_2/configuration_metaclip_2.py +++ b/src/transformers/models/metaclip_2/configuration_metaclip_2.py @@ -277,7 +277,7 @@ def __init__( # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different. for key, value in _text_config_dict.items(): - if key in text_config and value != text_config[key] and key not in ["transformers_version"]: + if key in text_config and value != text_config[key] and key != "transformers_version": # If specified in `text_config_dict` if key in text_config_dict: message = ( @@ -309,7 +309,7 @@ def __init__( # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different. for key, value in _vision_config_dict.items(): - if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]: + if key in vision_config and value != vision_config[key] and key != "transformers_version": # If specified in `vision_config_dict` if key in vision_config_dict: message = ( diff --git a/src/transformers/models/metaclip_2/convert_metaclip_2_to_hf.py b/src/transformers/models/metaclip_2/convert_metaclip_2_to_hf.py deleted file mode 100644 index 21a0a1462fff..000000000000 --- a/src/transformers/models/metaclip_2/convert_metaclip_2_to_hf.py +++ /dev/null @@ -1,426 +0,0 @@ -""" -This script allows you to convert MetaCLIP 2 (worldwide) checkpoints from the -original repository to the Hugging Face format. - -URL: https://github.com/facebookresearch/MetaCLIP - -To convert: -1. git clone the MetaCLIP repository -2. place it in the same directory as this script -3. move the conversion script to the MetaCLIP repository. 
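
Likewise, the Megatron GPT-2 converter above saves the config, the tokenizer files, and `pytorch_model.bin` into the checkpoint's parent directory, so the converted model can be used for generation right away. A minimal sketch, assuming a hypothetical output folder:

```python
from transformers import AutoTokenizer, GPT2LMHeadModel

# Hypothetical folder written by the GPT-2 conversion script
folder = "./megatron_gpt2_345m"

model = GPT2LMHeadModel.from_pretrained(folder)
tokenizer = AutoTokenizer.from_pretrained(folder)  # the converter saves the tokenizer alongside the config

inputs = tokenizer("Megatron-LM is", return_tensors="pt")
generated = model.generate(**inputs, max_new_tokens=20, do_sample=False)
print(tokenizer.decode(generated[0], skip_special_tokens=True))
```
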
- -Then run the script with: - -```bash -cd MetaCLIP -python convert_metaclip_2_to_hf.py --checkpoint_path /path/to/checkpoint --model_name ViT-H-14-quickgelu-worldwide -``` -""" - -import argparse -import os -from typing import Optional - -import torch -from PIL import Image - -# Import MetaCLIP modules -from src.mini_clip.factory import create_model_and_transforms -from transformers import ( - AutoTokenizer, - CLIPImageProcessor, - CLIPProcessor, - MetaClip2Config, - MetaClip2Model, -) - - -def load_metaclip2_checkpoint(checkpoint_path: str, model_name: str) -> torch.nn.Module: - """Load MetaCLIP 2 model from checkpoint.""" - print(f"Loading MetaCLIP 2 model: {model_name}") - - # For worldwide models, use WorldWideCLIP class - model_name_with_class = model_name - if "worldwide" in model_name.lower(): - model_name_with_class = f"{model_name}@WorldWideCLIP" - print("Using WorldWideCLIP class for worldwide model") - - # Create model using the factory - model, _, preprocess = create_model_and_transforms(model_name_with_class, pretrained=checkpoint_path, device="cpu") - model.eval() - return model, preprocess - - -def create_hf_config(tokenizer: AutoTokenizer, model_name: str) -> tuple[MetaClip2Config, int]: - """Create Hugging Face MetaClip2Config from MetaCLIP model. - - This is based on the configs found at https://github.com/facebookresearch/MetaCLIP/tree/main/src/mini_clip/model_configs. - """ - print("Creating Hugging Face config...") - - # Vision config - vision_configs = { - "ViT-H-14-quickgelu-worldwide": { - "image_size": 224, - "patch_size": 14, - "hidden_size": 1280, - "intermediate_size": 1280 * 4, - "num_attention_heads": 16, - "num_hidden_layers": 32, - "hidden_act": "quick_gelu", - "projection_dim": 1024, - }, - "ViT-H-14-378-worldwide": { - "image_size": 378, - "patch_size": 14, - "hidden_size": 1280, - "intermediate_size": 1280 * 4, - "num_attention_heads": 16, - "num_hidden_layers": 32, - "hidden_act": "gelu", - "projection_dim": 1024, - }, - "ViT-bigG-14-worldwide": { - "image_size": 224, - "patch_size": 14, - "hidden_size": 1664, - "intermediate_size": 8192, - "num_attention_heads": 16, - "num_hidden_layers": 48, - "hidden_act": "gelu", - "projection_dim": 1280, - }, - "ViT-bigG-14-378-worldwide": { - "image_size": 378, - "patch_size": 14, - "hidden_size": 1664, - "intermediate_size": 8192, - "num_attention_heads": 16, - "num_hidden_layers": 48, - "hidden_act": "gelu", - "projection_dim": 1280, - }, - } - - vision_config = vision_configs[model_name] - image_size = vision_config["image_size"] - - # Text config - text_configs = { - "ViT-H-14-quickgelu-worldwide": { - "hidden_size": 1024, - "intermediate_size": 1024 * 4, - "num_attention_heads": 16, - "num_hidden_layers": 24, - "max_position_embeddings": 77, - "vocab_size": 901629, - "eos_token_id": tokenizer.eos_token_id, - "hidden_act": "quick_gelu", - "projection_dim": 1024, - }, - "ViT-H-14-378-worldwide": { - "hidden_size": 1024, - "intermediate_size": 1024 * 4, - "num_attention_heads": 16, - "num_hidden_layers": 24, - "max_position_embeddings": 77, - "vocab_size": 901629, - "eos_token_id": tokenizer.eos_token_id, - "hidden_act": "gelu", - "projection_dim": 1024, - }, - "ViT-bigG-14-worldwide": { - "hidden_size": 1280, - "intermediate_size": 1280 * 4, - "num_attention_heads": 20, - "num_hidden_layers": 32, - "max_position_embeddings": 77, - "vocab_size": 901629, - "eos_token_id": tokenizer.eos_token_id, - "hidden_act": "gelu", - "projection_dim": 1280, - }, - "ViT-bigG-14-378-worldwide": { - "hidden_size": 1280, - 
"intermediate_size": 1280 * 4, - "num_attention_heads": 20, - "num_hidden_layers": 32, - "max_position_embeddings": 77, - "vocab_size": 901629, - "eos_token_id": tokenizer.eos_token_id, - "hidden_act": "gelu", - "projection_dim": 1280, - }, - } - - text_config = text_configs[model_name] - projection_dim = text_config["projection_dim"] - - # Create config - config = MetaClip2Config( - vision_config=vision_config, - text_config=text_config, - projection_dim=projection_dim, - ) - - return config, image_size - - -def convert_state_dict(metaclip_state_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: - """Convert MetaCLIP state dict to Hugging Face format.""" - print("Converting state dict...") - - hf_state_dict = {} - - for key, value in metaclip_state_dict.items(): - new_key = key - - # Handle specific mappings first before general prefix replacements - if key == "visual.proj": - new_key = "visual_projection.weight" - # Don't transpose! MetaCLIP: x @ proj, HF: Linear(x) = x @ weight.T - # So we want weight.T = proj, which means weight = proj.T - # But since we're storing proj as weight, we need proj.T - value = value.T # This gives us the correct orientation for Linear layer - elif key == "text_projection": - new_key = "text_projection.weight" - # Same logic as visual projection - value = value.T - elif key == "token_embedding.weight": - new_key = "text_model.embeddings.token_embedding.weight" - elif key == "positional_embedding": - new_key = "text_model.embeddings.position_embedding.weight" - elif key == "ln_final.weight": - new_key = "text_model.final_layer_norm.weight" - elif key == "ln_final.bias": - new_key = "text_model.final_layer_norm.bias" - # Vision encoder mappings - elif key.startswith("visual."): - new_key = key.replace("visual.", "vision_model.") - - # Handle specific vision model components - if "conv1" in new_key: - new_key = new_key.replace("conv1", "embeddings.patch_embedding") - elif "class_embedding" in new_key: - new_key = new_key.replace("class_embedding", "embeddings.class_embedding") - elif "positional_embedding" in new_key: - new_key = new_key.replace("positional_embedding", "embeddings.position_embedding.weight") - elif "ln_pre" in new_key: - new_key = new_key.replace("ln_pre", "pre_layrnorm") - elif "ln_post" in new_key: - new_key = new_key.replace("ln_post", "post_layernorm") - elif "transformer.resblocks" in new_key: - new_key = new_key.replace("transformer.resblocks", "encoder.layers") - # Handle attention and MLP mappings within transformer blocks - if "attn.in_proj" in new_key: - # Split the in_proj into q, k, v projections - if "weight" in new_key: - # We'll handle this later in a special case - continue - elif "bias" in new_key: - continue - elif "attn.out_proj" in new_key: - new_key = new_key.replace("attn.out_proj", "self_attn.out_proj") - elif "ln_1" in new_key: - new_key = new_key.replace("ln_1", "layer_norm1") - elif "ln_2" in new_key: - new_key = new_key.replace("ln_2", "layer_norm2") - elif "mlp.c_fc" in new_key: - new_key = new_key.replace("mlp.c_fc", "mlp.fc1") - elif "mlp.c_proj" in new_key: - new_key = new_key.replace("mlp.c_proj", "mlp.fc2") - - # Text encoder mappings - elif key.startswith("transformer."): - new_key = key.replace("transformer.", "text_model.encoder.") - - if "resblocks" in new_key: - new_key = new_key.replace("resblocks", "layers") - # Similar mappings as vision transformer - if "attn.in_proj" in new_key: - continue # Handle separately - elif "attn.out_proj" in new_key: - new_key = new_key.replace("attn.out_proj", 
"self_attn.out_proj") - elif "ln_1" in new_key: - new_key = new_key.replace("ln_1", "layer_norm1") - elif "ln_2" in new_key: - new_key = new_key.replace("ln_2", "layer_norm2") - elif "mlp.c_fc" in new_key: - new_key = new_key.replace("mlp.c_fc", "mlp.fc1") - elif "mlp.c_proj" in new_key: - new_key = new_key.replace("mlp.c_proj", "mlp.fc2") - - hf_state_dict[new_key] = value - - # Handle in_proj weights separately (split into q, k, v) - for key, value in metaclip_state_dict.items(): - if "attn.in_proj_weight" in key: - # Split the combined qkv weight into separate q, k, v weights - dim = value.shape[0] // 3 - q_weight = value[:dim] - k_weight = value[dim : 2 * dim] - v_weight = value[2 * dim :] - - base_key = key.replace("attn.in_proj_weight", "") - if key.startswith("visual."): - base_key = base_key.replace("visual.transformer.resblocks", "vision_model.encoder.layers") - else: - base_key = base_key.replace("transformer.resblocks", "text_model.encoder.layers") - - hf_state_dict[f"{base_key}self_attn.q_proj.weight"] = q_weight - hf_state_dict[f"{base_key}self_attn.k_proj.weight"] = k_weight - hf_state_dict[f"{base_key}self_attn.v_proj.weight"] = v_weight - - elif "attn.in_proj_bias" in key: - # Split the combined qkv bias into separate q, k, v biases - dim = value.shape[0] // 3 - q_bias = value[:dim] - k_bias = value[dim : 2 * dim] - v_bias = value[2 * dim :] - - base_key = key.replace("attn.in_proj_bias", "") - if key.startswith("visual."): - base_key = base_key.replace("visual.transformer.resblocks", "vision_model.encoder.layers") - else: - base_key = base_key.replace("transformer.resblocks", "text_model.encoder.layers") - - hf_state_dict[f"{base_key}self_attn.q_proj.bias"] = q_bias - hf_state_dict[f"{base_key}self_attn.k_proj.bias"] = k_bias - hf_state_dict[f"{base_key}self_attn.v_proj.bias"] = v_bias - - return hf_state_dict - - -def verify_conversion( - original_model, hf_model, preprocess, image_processor, tokenizer, test_image_path: Optional[str] = None -) -> bool: - """Verify that the conversion produces the same outputs.""" - print("Verifying conversion...") - - # Create test image - if test_image_path and os.path.exists(test_image_path): - image = Image.open(test_image_path) - else: - # Create a dummy image - image = Image.new("RGB", (224, 224), color="red") - - # Verify image processor - processed_image = preprocess(image).unsqueeze(0) - pixel_values = image_processor(image, return_tensors="pt").pixel_values - print("Shape of pixel_values:", pixel_values.shape) - print("Shape of processed_image:", processed_image.shape) - assert torch.allclose(pixel_values, processed_image) - - # Use tokenizer to get input_ids - texts = ["a cat", "a dog", "a bird"] - token_inputs = tokenizer(texts, return_tensors="pt", padding="max_length", truncation=True, max_length=77) - input_ids = token_inputs.input_ids - - print(f"Processed text shape: {input_ids.shape}") - print(f"Processed image shape: {processed_image.shape}") - - with torch.no_grad(): - # Original model outputs - orig_image_features = original_model.encode_image(processed_image) - orig_text_features = original_model.encode_text(input_ids) - - # Normalize and compute logits - orig_image_features = orig_image_features / orig_image_features.norm(dim=-1, keepdim=True) - orig_text_features = orig_text_features / orig_text_features.norm(dim=-1, keepdim=True) - orig_logits = original_model.logit_scale.exp() * orig_image_features @ orig_text_features.T - - print(f"Original text features: {orig_text_features[0][:5].tolist()}") - print(f"Original 
image features: {orig_image_features[0][:5].tolist()}") - - with torch.no_grad(): - hf_outputs = hf_model(input_ids=input_ids, pixel_values=pixel_values) - hf_logits = hf_outputs.logits_per_image - - # Debug: Check HF model features - print(f"HF text features: {hf_outputs.text_embeds[0][:5].tolist()}") - print(f"HF image features: {hf_outputs.image_embeds[0][:5].tolist()}") - print(f"HF model EOS token ID: {hf_model.config.text_config.eos_token_id}") - - # Compare outputs - print(f"Original logits: {orig_logits}") - print(f"HF logits: {hf_logits}") - print(f"Logit scale - Original: {original_model.logit_scale.exp():.6f}, HF: {hf_model.logit_scale.exp():.6f}") - - # Check if they're close - if orig_logits.shape == hf_logits.shape and torch.allclose(orig_logits, hf_logits, atol=1e-4): - print("✅ Conversion verified! Outputs match.") - return True - else: - print("❌ Conversion failed! Outputs don't match.") - if orig_logits.numel() > 0 and hf_logits.numel() > 0: - print(f"Max difference: {(orig_logits - hf_logits).abs().max()}") - return False - - -def push_to_hub(hf_model: MetaClip2Model, processor: CLIPProcessor, repo_name: str): - """Push the converted model to Hugging Face Hub.""" - print(f"Pushing to hub: {repo_name}") - - try: - hf_model.push_to_hub(repo_name) - processor.push_to_hub(repo_name) - print(f"✅ Successfully pushed to {repo_name}") - except Exception as e: - print(f"❌ Failed to push to hub: {e}") - - -def main(): - parser = argparse.ArgumentParser(description="Convert MetaCLIP 2 to Hugging Face format") - parser.add_argument("--checkpoint_path", required=True, help="Path to MetaCLIP 2 checkpoint") - parser.add_argument("--model_name", required=True, help="MetaCLIP model name (e.g., ViT-H-14-quickgelu-worldwide)") - parser.add_argument("--output_dir", default="./converted_models", help="Output directory for converted model") - parser.add_argument("--push_to_hub", action="store_true", help="Push to Hugging Face Hub") - parser.add_argument("--hub_repo_name", help="Hub repository name") - parser.add_argument("--test_image", help="Path to test image for verification") - - args = parser.parse_args() - - # Load original model - original_model, preprocess = load_metaclip2_checkpoint(args.checkpoint_path, args.model_name) - - # Create HF config - # Requires the tokenizer for the eos token id - tokenizer = AutoTokenizer.from_pretrained("facebook/xlm-v-base") - config, image_size = create_hf_config(tokenizer=tokenizer, model_name=args.model_name) - - # Create processor - image_processor = CLIPImageProcessor( - size={"height": image_size, "width": image_size}, crop_size={"height": image_size, "width": image_size} - ) - processor = CLIPProcessor(image_processor=image_processor, tokenizer=tokenizer) - - # Create HF model - hf_model = MetaClip2Model(config) - - # Convert state dict - converted_state_dict = convert_state_dict(original_model.state_dict()) - - for name, param in hf_model.named_parameters(): - print(name, param.shape) - - # Load converted weights - hf_model.load_state_dict(converted_state_dict) - - # Verify conversion - if not verify_conversion(original_model, hf_model, preprocess, image_processor, tokenizer, args.test_image): - print("Conversion verification failed. 
Please check the conversion logic.") - return - - # Save model locally - if args.output_dir: - os.makedirs(args.output_dir, exist_ok=True) - hf_model.save_pretrained(args.output_dir) - processor.save_pretrained(args.output_dir) - - # Push to hub if requested - if args.push_to_hub and args.hub_repo_name: - push_to_hub(hf_model, processor, args.hub_repo_name) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/mimi/convert_mimi_checkpoint_to_pytorch.py b/src/transformers/models/mimi/convert_mimi_checkpoint_to_pytorch.py deleted file mode 100644 index 75702aadd314..000000000000 --- a/src/transformers/models/mimi/convert_mimi_checkpoint_to_pytorch.py +++ /dev/null @@ -1,198 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Mimi checkpoints.""" - -import argparse - -import safetensors -import torch - -from transformers import ( - EncodecFeatureExtractor, - MimiConfig, - MimiModel, - logging, -) - - -logging.set_verbosity_info() -logger = logging.get_logger("transformers.models.mimi") - - -def assert_param_count(model_1, model_2): - count_1 = sum(p[1].numel() for p in model_1.named_parameters() if "final_proj" not in p[0]) - count_2 = sum(p[1].numel() for p in model_2.named_parameters() if "final_proj" not in p[0]) - assert count_1 == count_2, f"{model_1.__class__}: {count_1} != {model_2.__class__}: {count_2}" - - -def param_count(model): - return sum(p[1].numel() for p in model.named_parameters() if "final_proj" not in p[0]) - - -def _grab_best_device(use_gpu=True): - if torch.cuda.device_count() > 0 and use_gpu: - device = "cuda" - else: - device = "cpu" - return torch.device(device) - - -convert_list = [ - # GENERAL - ("conv.conv.conv", "conv"), - ("convtr.convtr.convtr", "conv"), - ("conv.conv", "conv"), - ("convtr.convtr", "conv"), - # QUANTIZER - ("quantizer.rvq_first.vq", "quantizer.semantic_residual_vector_quantizer"), - ("quantizer.rvq_first", "quantizer.semantic_residual_vector_quantizer"), - ("quantizer.rvq_rest.vq", "quantizer.acoustic_residual_vector_quantizer"), - ("quantizer.rvq_rest", "quantizer.acoustic_residual_vector_quantizer"), - ("_codebook", "codebook"), - ("_initialized", "initialized"), - ("embedding_sum", "embed_sum"), - # ENCODER PART - ("encoder.model", "encoder.layers"), - ("decoder.model", "decoder.layers"), - # TRANSFORMERS PART - ("encoder_transformer.transformer", "encoder_transformer"), - ("decoder_transformer.transformer", "decoder_transformer"), - ("linear1", "mlp.fc1"), - ("linear2", "mlp.fc2"), - ("self_attn.out_proj", "self_attn.o_proj"), - ("norm1", "input_layernorm"), - ("norm2", "post_attention_layernorm"), - ("layer_scale_1", "self_attn_layer_scale"), - ("layer_scale_2", "mlp_layer_scale"), -] - - -def _convert_model( - state_dict, - hf_model, - convert_list, - device, - config, - unwanted_prefix=None, -): - hidden_size = config.hidden_size - head_dim = config.head_dim - num_heads = int(config.hidden_size // config.head_dim) - 
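The remainder of this Mimi helper (continued just below) splits each fused `in_proj_weight` into separate Q/K/V projections before applying the per-head rotary permutation. A standalone sketch of that split, using toy sizes rather than a real Mimi checkpoint:

```python
import torch

# Toy illustration of the fused-QKV split performed below: a single
# in_proj_weight of shape [3 * hidden, hidden] is sliced into three
# per-projection matrices of shape [hidden, hidden].
hidden = 8
in_proj_weight = torch.randn(3 * hidden, hidden)

qkv_dim = in_proj_weight.size(0) // 3
q_weight = in_proj_weight[:qkv_dim]
k_weight = in_proj_weight[qkv_dim : 2 * qkv_dim]
v_weight = in_proj_weight[2 * qkv_dim :]

assert q_weight.shape == k_weight.shape == v_weight.shape == (hidden, hidden)
```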
num_key_value_heads = config.num_key_value_heads - key_value_head_dim = config.num_key_value_heads * head_dim - - # permute for sliced rotary - def permute(w, n_heads, dim1=hidden_size, dim2=hidden_size): - return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) - - for k, v in list(state_dict.items()): - new_k = k if unwanted_prefix is None else k[len(unwanted_prefix) :] - for old_layer_name, new_layer_name in convert_list: - if old_layer_name in new_k: - new_k = new_k.replace(old_layer_name, new_layer_name) - - if "in_proj_weight" in new_k: - # split qkv into query key and value - mixed_qkv = state_dict.pop(k) - qkv_dim = mixed_qkv.size(0) // 3 - - query_layer = mixed_qkv[:qkv_dim] - key_layer = mixed_qkv[qkv_dim : qkv_dim * 2] - value_layer = mixed_qkv[qkv_dim * 2 :] - - state_dict[new_k.replace("in_proj_weight", "q_proj.weight")] = permute(query_layer, num_heads) - state_dict[new_k.replace("in_proj_weight", "k_proj.weight")] = permute( - key_layer, num_key_value_heads, dim1=key_value_head_dim - ) - state_dict[new_k.replace("in_proj_weight", "v_proj.weight")] = value_layer - else: - state_dict[new_k] = state_dict.pop(k) - - extra_keys = set(state_dict.keys()) - set(hf_model.state_dict().keys()) - missing_keys = set(hf_model.state_dict().keys()) - set(state_dict.keys()) - if len(extra_keys) != 0: - raise ValueError(f"extra keys found: {extra_keys}") - if len(missing_keys) != 0: - raise ValueError(f"missing keys: {missing_keys}") - hf_model.load_state_dict(state_dict, strict=True) - n_params = param_count(hf_model) - - logger.info(f"model loaded: {round(n_params / 1e6, 1)}M params") - - hf_model.eval() - hf_model.to(device) - del state_dict - - return hf_model - - -@torch.no_grad() -def convert_checkpoint( - checkpoint_path, - pytorch_dump_folder_path, - config_path=None, - repo_id=None, -): - """ - Copy/paste/tweak model's weights to transformers design. - """ - device = _grab_best_device() - - if config_path is not None: - config = MimiConfig.from_pretrained(config_path) - else: - config = MimiConfig() - - model = MimiModel(config) - - feature_extractor = EncodecFeatureExtractor( - feature_size=config.audio_channels, - sampling_rate=config.sampling_rate, - ) - feature_extractor.save_pretrained(pytorch_dump_folder_path) - - original_checkpoint = safetensors.torch.load_file(checkpoint_path) - if "best_state" in original_checkpoint: - # we might have a training state saved, in which case discard the yaml results and just retain the weights - original_checkpoint = original_checkpoint["best_state"] - - model = _convert_model(original_checkpoint, model, convert_list, device, config) - - model.save_pretrained(pytorch_dump_folder_path) - - if repo_id: - print("Pushing to the hub...") - feature_extractor.push_to_hub(repo_id) - model.push_to_hub(repo_id) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - parser.add_argument( - "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub." 
- ) - - args = parser.parse_args() - convert_checkpoint( - args.checkpoint_path, - args.pytorch_dump_folder_path, - args.config_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/mistral/convert_mistral_weights_to_hf.py b/src/transformers/models/mistral/convert_mistral_weights_to_hf.py deleted file mode 100644 index a790fed81d1b..000000000000 --- a/src/transformers/models/mistral/convert_mistral_weights_to_hf.py +++ /dev/null @@ -1,284 +0,0 @@ -# Copyright 2023 Mistral AI and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import json -import os -import re - -import torch -from safetensors.torch import load_file - -from transformers import AutoTokenizer, LlamaTokenizerFast, MistralConfig, MistralForCausalLM -from transformers.integrations.mistral import convert_tekken_tokenizer - - -# fmt: off -STATE_DICT_MAPPING = { - # CausalLM keys - r"^output.weight": r"lm_head.weight", - - # Model keys - r"^norm.weight": r"model.norm.weight", - r"^tok_embeddings.weight": r"model.embed_tokens.weight", - - # Layers keys - r"^layers.(\d+).attention_norm.weight": r"model.layers.\1.input_layernorm.weight", - r"^layers.(\d+).ffn_norm.weight": r"model.layers.\1.post_attention_layernorm.weight", - - # Attention keys - r"^layers.(\d+).attention.w(q|k|v|o).weight": r"model.layers.\1.self_attn.\2_proj.weight", - - - # MLP keys - r"^layers.(\d+).feed_forward.w1.weight": r"model.layers.\1.mlp.gate_proj.weight", - r"^layers.(\d+).feed_forward.w2.weight": r"model.layers.\1.mlp.down_proj.weight", - r"^layers.(\d+).feed_forward.w3.weight": r"model.layers.\1.mlp.up_proj.weight", -} -# fmt: on - - -def map_old_key_to_new(old_key): - """Map of a key of the original state dict to the equivalent key in HF format""" - for pattern, replacement in STATE_DICT_MAPPING.items(): - new_key, n_replace = re.subn(pattern, replacement, old_key) - # Early exit of the loop - if n_replace > 0: - return new_key - - raise ValueError(f"Key: {old_key} could not be mapped (check the mapping).") - - -def read_json(path): - with open(path, "r") as f: - return json.load(f) - - -def permute_for_rope(tensor, n_heads, dim1, dim2): - """Permute the weights for the ROPE formulation.""" - tensor = tensor.view(n_heads, dim1 // n_heads // 2, 2, dim2) - tensor = tensor.transpose(1, 2) - tensor = tensor.reshape(dim1, dim2) - return tensor - - -def convert_state_dict(original_state_dict: dict, config: MistralConfig): - """Convert a state dict file, when a single `nn.Module` is never sharded in different files (usual case).""" - new_dict = {} - - num_attention_heads = config.num_attention_heads - hidden_size = config.hidden_size - head_dim = config.head_dim - num_key_value_heads = config.num_key_value_heads - key_value_dim = head_dim * num_key_value_heads - query_dim = head_dim * num_attention_heads - - for old_key, tensor in original_state_dict.items(): - new_key = map_old_key_to_new(old_key) - - if "q_proj" in new_key: - tensor = tensor.view(num_attention_heads, head_dim, 
hidden_size).reshape(query_dim, hidden_size) - tensor = permute_for_rope(tensor, num_attention_heads, query_dim, hidden_size) - elif "k_proj" in new_key: - tensor = tensor.view(num_key_value_heads, head_dim, hidden_size).reshape(key_value_dim, hidden_size) - tensor = permute_for_rope(tensor, num_key_value_heads, key_value_dim, hidden_size) - elif "v_proj" in new_key: - tensor = tensor.view(num_key_value_heads, head_dim, hidden_size).reshape(key_value_dim, hidden_size) - - new_dict[new_key] = tensor - return new_dict - - -def get_concat_dim(key): - """Return the dimension to concatenate the weights on.""" - concat_dim_1 = [ - r"model.embed_tokens.weight", - r"model.layers.(\d+).self_attn.o_proj.weight", - r"model.layers.(\d+).mlp.down_proj.weight", - ] - if any(re.search(pattern, key) for pattern in concat_dim_1): - return 1 - return 0 - - -def convert_state_dict_sharded(loaded_shards: list[dict], config: MistralConfig): - """Convert the state dict, when a single `nn.Module` is sharded across different files.""" - new_dict = {} - - num_shards = len(loaded_shards) - - n_heads = config.num_attention_heads - dim = config.hidden_size - dims_per_head = dim // n_heads - num_key_value_heads = config.num_key_value_heads - n_heads_per_shard = n_heads // num_shards - num_local_key_value_heads = num_key_value_heads // num_shards - key_value_dim = dim if n_heads == num_key_value_heads else dims_per_head * num_local_key_value_heads - - original_keys = loaded_shards[0].keys() - for old_key in original_keys: - new_key = map_old_key_to_new(old_key) - cat_dim = get_concat_dim(new_key) - - if "q_proj" in new_key: - tensor = torch.cat( - [shard.pop(old_key).view(n_heads_per_shard, dims_per_head, dim) for shard in loaded_shards], - dim=cat_dim, - ).reshape(dim, dim) - tensor = permute_for_rope(tensor, n_heads, dim, dim) - elif "k_proj" in new_key: - tensor = torch.cat( - [shard.pop(old_key).view(num_local_key_value_heads, dims_per_head, dim) for shard in loaded_shards], - dim=cat_dim, - ).reshape(key_value_dim, dim) - tensor = permute_for_rope(tensor, num_key_value_heads, key_value_dim, dim) - elif "v_proj" in new_key: - tensor = torch.cat( - [shard.pop(old_key).view(num_local_key_value_heads, dims_per_head, dim) for shard in loaded_shards], - dim=cat_dim, - ).reshape(key_value_dim, dim) - elif "input_layernorm" in new_key or "post_attention_layernorm" in new_key: - tensor = loaded_shards[0][old_key].clone() - elif "model.norm.weight" in new_key: - tensor = loaded_shards[0][old_key] - else: - tensor = torch.cat([shard.pop(old_key) for shard in loaded_shards], dim=cat_dim) - - new_dict[new_key] = tensor - - return new_dict - - -def convert_config(original_config: dict, max_position_embeddings: int = 32768): - key_mapping = { - "hidden_size": "dim", - "num_hidden_layers": "n_layers", - "intermediate_size": "hidden_dim", - "num_attention_heads": "n_heads", - "rms_norm_eps": "norm_eps", - } - similar_keys_to_keep = [ - "head_dim", - "vocab_size", - ] - - new_config_kwargs = {k: original_config[v] for k, v in key_mapping.items()} - new_config_kwargs.update({k: v for k, v in original_config.items() if k in similar_keys_to_keep}) - - # These are not always defined depending on `params.json` - new_config_kwargs["sliding_window"] = original_config.get("sliding_window") - new_config_kwargs["num_key_value_heads"] = original_config.get( - "n_kv_heads", new_config_kwargs["num_attention_heads"] - ) - new_config_kwargs["rope_theta"] = original_config.get("rope_theta", 10000.0) - new_config_kwargs["max_position_embeddings"] = 
original_config.get("max_seq_len", max_position_embeddings) - - # This may sometimes be a string in `params.json` - if new_config_kwargs["sliding_window"] is not None: - new_config_kwargs["sliding_window"] = int(new_config_kwargs["sliding_window"]) - - new_config = MistralConfig(**new_config_kwargs) - return new_config - - -def convert_and_write_model(input_dir: str, output_dir: str, max_position_embeddings: int, modules_are_split: bool): - """Convert the model and save it (this implicitly save the config as well).""" - params = read_json(os.path.join(input_dir, "params.json")) - config = convert_config(params, max_position_embeddings) - - full_state_dict = {} - # The model may be split between different files, but a single nn.Module is always fully present in a single file - if not modules_are_split: - shards = [file for file in os.listdir(input_dir) if file.endswith(".safetensors")] - for shard_file in shards: - original_state_dict = load_file(os.path.join(input_dir, shard_file)) - new_dict = convert_state_dict(original_state_dict, config) - full_state_dict.update(new_dict) - # A single nn.Module is split between different checkpoint files - else: - shards = [file for file in os.listdir(input_dir) if re.match(r"consolidated.\d+.pth", file)] - shards = sorted(shards, key=lambda x: int(x.split(".")[1])) - loaded_shards = [ - torch.load(os.path.join(input_dir, file), map_location="cpu", weights_only=True) for file in shards - ] - full_state_dict = convert_state_dict_sharded(loaded_shards, config) - - # Load weights into model and resave them - with torch.device("meta"): - model = MistralForCausalLM(config) - model.load_state_dict(full_state_dict, strict=True, assign=True) - model.save_pretrained(output_dir) - - -def convert_and_write_tokenizer(input_dir: str, output_dir: str, tokenizer_template_name: str = ""): - """Convert the tokenizer and save it.""" - # Tekken format - if "tekken.json" in os.listdir(input_dir): - tokenizer_file = os.path.join(input_dir, "tekken.json") - tokenizer = convert_tekken_tokenizer(tokenizer_file) - else: - # May have .v3 or .v7 at the end - tokenizer_file = [file for file in os.listdir(input_dir) if "tokenizer.model" in file][0] - tokenizer = LlamaTokenizerFast(os.path.join(input_dir, tokenizer_file)) - - # Load a chat template from another model - if tokenizer_template_name != "": - template_tok = AutoTokenizer.from_pretrained(tokenizer_template_name) - tokenizer.chat_template = template_tok.chat_template - - # Finally save it - tokenizer.save_pretrained(output_dir) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "input_dir", - help="Location of Mistral weights, which contains tokenizer.model and model folders", - ) - parser.add_argument( - "output_dir", - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--template_name", - type=str, - default="", - help="Another model name from which to copy the chat template.", - ) - parser.add_argument( - "--max_position_embeddings", - type=int, - default=32768, - help="`max_position_embeddings` field in the config. 
This needs to be manually passed (not present anywhere otherwise).", - ) - parser.add_argument( - "--modules_are_split", - action="store_true", - help="If passed, then the weights of a single `nn.Module` are assumed to be split between different files.", - ) - parser.add_argument( - "--tokenizer_only", - action="store_true", - help="If passed, will only convert the tokenizer.", - ) - - args = parser.parse_args() - - if not args.tokenizer_only: - convert_and_write_model(args.input_dir, args.output_dir, args.max_position_embeddings, args.modules_are_split) - convert_and_write_tokenizer(args.input_dir, args.output_dir, args.template_name) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/mistral3/convert_mistral3_weights_to_hf.py b/src/transformers/models/mistral3/convert_mistral3_weights_to_hf.py deleted file mode 100644 index c8f9b64ab1f6..000000000000 --- a/src/transformers/models/mistral3/convert_mistral3_weights_to_hf.py +++ /dev/null @@ -1,241 +0,0 @@ -# Copyright 2023 Mistral AI and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import json -import os -import re - -import torch -from safetensors.torch import load_file - -from transformers import ( - Mistral3Config, - Mistral3ForConditionalGeneration, - MistralConfig, - PixtralImageProcessorFast, - PixtralProcessor, - PixtralVisionConfig, -) -from transformers.integrations.mistral import convert_tekken_tokenizer - - -# fmt: off -STATE_DICT_MAPPING = { - # Text model keys - r"^output.weight": r"language_model.lm_head.weight", - r"^norm.weight": r"language_model.model.norm.weight", - r"^tok_embeddings.weight": r"language_model.model.embed_tokens.weight", - r"^layers.(\d+).attention_norm.weight": r"language_model.model.layers.\1.input_layernorm.weight", - r"^layers.(\d+).ffn_norm.weight": r"language_model.model.layers.\1.post_attention_layernorm.weight", - r"^layers.(\d+).attention.w(q|k|v|o).weight": r"language_model.model.layers.\1.self_attn.\2_proj.weight", - r"^layers.(\d+).feed_forward.w1.weight": r"language_model.model.layers.\1.mlp.gate_proj.weight", - r"^layers.(\d+).feed_forward.w2.weight": r"language_model.model.layers.\1.mlp.down_proj.weight", - r"^layers.(\d+).feed_forward.w3.weight": r"language_model.model.layers.\1.mlp.up_proj.weight", - - # Vision model keys - r"vision_encoder.transformer.layers.(\d+).attention_norm.weight": r"vision_tower.transformer.layers.\1.attention_norm.weight", - r"^vision_encoder.transformer.layers.(\d+).ffn_norm.weight": r"vision_tower.transformer.layers.\1.ffn_norm.weight", - r"^vision_encoder.transformer.layers.(\d+).attention.w(q|k|v|o).weight": r"vision_tower.transformer.layers.\1.attention.\2_proj.weight", - r"^vision_encoder.transformer.layers.(\d+).feed_forward.w1.weight": r"vision_tower.transformer.layers.\1.feed_forward.gate_proj.weight", - r"^vision_encoder.transformer.layers.(\d+).feed_forward.w2.weight": r"vision_tower.transformer.layers.\1.feed_forward.down_proj.weight", - 
r"^vision_encoder.transformer.layers.(\d+).feed_forward.w3.weight": r"vision_tower.transformer.layers.\1.feed_forward.up_proj.weight", - r"^vision_language_adapter.w_in": r"multi_modal_projector.linear_1", - r"^vision_language_adapter.w_out": r"multi_modal_projector.linear_2", - r"^vision_encoder.ln_pre.weight": r"vision_tower.ln_pre.weight", - r"^vision_encoder.patch_conv.weight": r"vision_tower.patch_conv.weight", - r"^patch_merger.merging_layer.weight": r"multi_modal_projector.patch_merger.merging_layer.weight", - r"^pre_mm_projector_norm.weight": r"multi_modal_projector.norm.weight", -} -# fmt: on - - -def map_old_key_to_new(old_key): - """Map of a key of the original state dict to the equivalent key in HF format""" - for pattern, replacement in STATE_DICT_MAPPING.items(): - new_key, n_replace = re.subn(pattern, replacement, old_key) - # Early exit of the loop - if n_replace > 0: - return new_key - - raise ValueError(f"Key: {old_key} could not be mapped (check the mapping).") - - -def read_json(path): - with open(path, "r") as f: - return json.load(f) - - -def permute_for_rope(tensor, n_heads, dim1, dim2): - """Permute the weights for the ROPE formulation.""" - tensor = tensor.view(n_heads, dim1 // n_heads // 2, 2, dim2) - tensor = tensor.transpose(1, 2) - tensor = tensor.reshape(dim1, dim2) - return tensor - - -def convert_state_dict(original_state_dict: dict, config: MistralConfig): - """Convert a state dict file, when a single `nn.Module` is never sharded in different files (usual case).""" - new_dict = {} - - for old_key, tensor in original_state_dict.items(): - new_key = map_old_key_to_new(old_key) - - if "vision" in old_key: - num_attention_heads = config.vision_config.num_attention_heads - num_key_value_heads = num_attention_heads - hidden_size = config.vision_config.hidden_size - head_dim = config.vision_config.head_dim - key_value_dim = head_dim * num_attention_heads - query_dim = head_dim * num_attention_heads - else: - num_attention_heads = config.text_config.num_attention_heads - hidden_size = config.text_config.hidden_size - head_dim = config.text_config.head_dim - num_key_value_heads = config.text_config.num_key_value_heads - key_value_dim = head_dim * num_key_value_heads - query_dim = head_dim * num_attention_heads - - if "q_proj" in new_key: - tensor = permute_for_rope(tensor, num_attention_heads, query_dim, hidden_size) - elif "k_proj" in new_key: - tensor = permute_for_rope(tensor, num_key_value_heads, key_value_dim, hidden_size) - - new_dict[new_key] = tensor - return new_dict - - -def convert_config(original_config: dict, max_position_embeddings: int = 131072): - original_vision_config = original_config.pop("vision_encoder") - original_text_config = original_config - - # Text config - text_key_mapping = { - "hidden_size": "dim", - "num_hidden_layers": "n_layers", - "intermediate_size": "hidden_dim", - "num_attention_heads": "n_heads", - "num_key_value_heads": "n_kv_heads", - "rms_norm_eps": "norm_eps", - } - similar_text_keys_to_keep = [ - "head_dim", - "vocab_size", - "rope_theta", - ] - new_text_config_kwargs = {k: original_text_config[v] for k, v in text_key_mapping.items()} - new_text_config_kwargs.update({k: v for k, v in original_text_config.items() if k in similar_text_keys_to_keep}) - # These are not always defined depending on `params.json` - new_text_config_kwargs["sliding_window"] = original_text_config.get("sliding_window", None) - new_text_config_kwargs["max_position_embeddings"] = original_text_config.get( - "max_seq_len", max_position_embeddings - ) - 
# This may sometimes be a string in `params.json` - if new_text_config_kwargs["sliding_window"] is not None: - new_text_config_kwargs["sliding_window"] = int(new_text_config_kwargs["sliding_window"]) - new_text_config = MistralConfig(**new_text_config_kwargs) - - # Vision config - new_vision_config = original_vision_config - adapter_bias = new_vision_config.pop("adapter_bias", False) - _ = new_vision_config.pop("mm_projector_id", None) - _ = new_vision_config.pop("add_pre_mm_projector_layer_norm", None) - spatial_merge_size = new_vision_config.pop("spatial_merge_size") - image_token_id = new_vision_config.pop("image_token_id", 10) - _ = new_vision_config.pop("image_break_token_id", 12) - _ = new_vision_config.pop("image_end_token_id", 13) - _ = new_vision_config.pop("max_image_size") - new_vision_config = PixtralVisionConfig(**new_vision_config) - - new_config = Mistral3Config( - vision_config=new_vision_config, - text_config=new_text_config, - multimodal_projector_bias=adapter_bias, - image_token_id=image_token_id, - spatial_merge_size=spatial_merge_size, - vision_feature_layer=-1, - ) - return new_config - - -def convert_and_write_model(input_dir: str, output_dir: str, max_position_embeddings: int): - """Convert the model and save it (this implicitly save the config as well).""" - params = read_json(os.path.join(input_dir, "params.json")) - config = convert_config(params, max_position_embeddings) - - full_state_dict = {} - # The model may be split between different files, but a single nn.Module is always fully present in a single file - shards = [file for file in os.listdir(input_dir) if file.endswith(".safetensors")] - for shard_file in shards: - original_state_dict = load_file(os.path.join(input_dir, shard_file)) - new_dict = convert_state_dict(original_state_dict, config) - full_state_dict.update(new_dict) - - # Load weights into model and resave them - with torch.device("meta"): - model = Mistral3ForConditionalGeneration(config) - model.load_state_dict(full_state_dict, strict=True, assign=True) - model.save_pretrained(output_dir) - - -def convert_and_write_processor(input_dir: str, output_dir: str): - """Convert the tokenizer and save it.""" - tokenizer_file = os.path.join(input_dir, "tekken.json") - tokenizer = convert_tekken_tokenizer(tokenizer_file) - tokenizer.add_special_tokens({"pad_token": "