diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index aff69510d636..6e98ee0f1493 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -16,10 +16,9 @@ import argparse import copy import os -import random from dataclasses import dataclass -from typing import Any, Dict, List, Optional -import glob +from typing import Any, Optional + import yaml @@ -30,6 +29,7 @@ "RUN_PIPELINE_TESTS": False, # will be adjust in `CircleCIJob.to_dict`. "RUN_FLAKY": True, + "DISABLE_SAFETENSORS_CONVERSION": True, } # Disable the use of {"s": None} as the output is way too long, causing the navigation on CircleCI impractical COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "vvv": None, "rsfE":None} @@ -82,15 +82,15 @@ def to_dict(self): @dataclass class CircleCIJob: name: str - additional_env: Dict[str, Any] = None - docker_image: List[Dict[str, str]] = None - install_steps: List[str] = None + additional_env: dict[str, Any] = None + docker_image: list[dict[str, str]] = None + install_steps: list[str] = None marker: Optional[str] = None parallelism: Optional[int] = 0 pytest_num_workers: int = 8 - pytest_options: Dict[str, Any] = None + pytest_options: dict[str, Any] = None resource_class: Optional[str] = "xlarge" - tests_to_run: Optional[List[str]] = None + tests_to_run: Optional[list[str]] = None num_test_files_per_worker: Optional[int] = 10 # This should be only used for doctest job! command_timeout: Optional[int] = None @@ -130,6 +130,12 @@ def __post_init__(self): def to_dict(self): env = COMMON_ENV_VARIABLES.copy() + if self.job_name != "tests_hub": + # fmt: off + # not critical + env.update({"HF_TOKEN": "".join(["h", "f", "_", "H", "o", "d", "V", "u", "M", "q", "b", "R", "m", "t", "b", "z", "F", "Q", "O", "Q", "A", "J", "G", "D", "l", "V", "Q", "r", "R", "N", "w", "D", "M", "V", "C", "s", "d"])}) + # fmt: on + # Do not run tests decorated by @is_flaky on pull requests env['RUN_FLAKY'] = os.environ.get("CIRCLE_PULL_REQUEST", "") == "" env.update(self.additional_env) @@ -149,7 +155,7 @@ def to_dict(self): # Examples special case: we need to download NLTK files in advance to avoid cuncurrency issues timeout_cmd = f"timeout {self.command_timeout} " if self.command_timeout else "" marker_cmd = f"-m '{self.marker}'" if self.marker is not None else "" - junit_flags = f" -p no:warning -o junit_family=xunit1 --junitxml=test-results/junit.xml" + junit_flags = " -p no:warning -o junit_family=xunit1 --junitxml=test-results/junit.xml" joined_flaky_patterns = "|".join(FLAKY_TEST_FAILURE_PATTERNS) repeat_on_failure_flags = f"--reruns 5 --reruns-delay 2 --only-rerun '({joined_flaky_patterns})'" parallel = f' << pipeline.parameters.{self.job_name}_parallelism >> ' @@ -180,6 +186,7 @@ def to_dict(self): # During the CircleCI docker images build time, we might already (or not) download the data. # If it's done already, the files are inside the directory `/test_data/`. {"run": {"name": "fetch hub objects before pytest", "command": "cp -r /test_data/* . 
2>/dev/null || true; python3 utils/fetch_hub_objects_for_ci.py"}}, + {"run": {"name": "download and unzip hub cache", "command": 'curl -L -o huggingface-cache.tar.gz https://huggingface.co/datasets/hf-internal-testing/hf_hub_cache/resolve/main/huggingface-cache.tar.gz && apt-get install pigz && tar --use-compress-program="pigz -d -p 8" -xf huggingface-cache.tar.gz && mv -n hub/* /root/.cache/huggingface/hub/ && ls -la /root/.cache/huggingface/hub/'}}, {"run": { "name": "Run tests", "command": f"({timeout_cmd} python3 -m pytest {marker_cmd} -n {self.pytest_num_workers} {junit_flags} {repeat_on_failure_flags} {' '.join(pytest_flags)} $(cat splitted_tests.txt) | tee tests_output.txt)"} @@ -200,9 +207,9 @@ def to_dict(self): fi""" }, }, - {"run": {"name": "Expand to show skipped tests", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --skip"}}, - {"run": {"name": "Failed tests: show reasons", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --fail"}}, - {"run": {"name": "Errors", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --errors"}}, + {"run": {"name": "Expand to show skipped tests", "when": "always", "command": "python3 .circleci/parse_test_outputs.py --file tests_output.txt --skip"}}, + {"run": {"name": "Failed tests: show reasons", "when": "always", "command": "python3 .circleci/parse_test_outputs.py --file tests_output.txt --fail"}}, + {"run": {"name": "Errors", "when": "always", "command": "python3 .circleci/parse_test_outputs.py --file tests_output.txt --errors"}}, {"store_test_results": {"path": "test-results"}}, {"store_artifacts": {"path": "test-results/junit.xml"}}, {"store_artifacts": {"path": "reports"}}, diff --git a/.circleci/parse_test_outputs.py b/.circleci/parse_test_outputs.py index a69da1a3eafb..c58447155859 100644 --- a/.circleci/parse_test_outputs.py +++ b/.circleci/parse_test_outputs.py @@ -1,5 +1,6 @@ -import re import argparse +import re + def parse_pytest_output(file_path): skipped_tests = {} diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index 78e96e9b3386..30ac3b4c9512 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -61,6 +61,7 @@ body: - Big Model Inference: @SunMarc - quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber - kernels: @MekkCyber @drbh + - peft: @BenjaminBossan @githubnemo Devices/Backends: diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index aa1e881122c1..de4ed57873ef 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -39,20 +39,23 @@ members/contributors who may be interested in your PR. 
Models: -- text models: @ArthurZucker -- vision models: @amyeroberts, @qubvel -- speech models: @eustlb +- text models: @ArthurZucker @Cyrilvallez +- vision models: @yonigozlan @molbap +- audio models: @eustlb @ebezzam @vasqu +- multimodal models: @zucchini-nlp - graph models: @clefourrier Library: -- flax: @gante and @Rocketknight1 - generate: @zucchini-nlp (visual-language models) or @gante (all others) +- continuous batching: @remi-or @ArthurZucker @McPatate - pipelines: @Rocketknight1 -- tensorflow: @gante and @Rocketknight1 -- tokenizers: @ArthurZucker -- trainer: @zach-huggingface, @SunMarc and @qgallouedec -- chat templates: @Rocketknight1 +- tokenizers: @ArthurZucker and @itazap +- trainer: @zach-huggingface @SunMarc +- attention: @vasqu @ArthurZucker @CyrilVallez +- model loading (from pretrained, etc): @CyrilVallez +- distributed: @3outeille @ArthurZucker @S1ro1 +- CIs: @ydshieh Integrations: @@ -60,20 +63,17 @@ Integrations: - ray/raytune: @richardliaw, @amogkam - Big Model Inference: @SunMarc - quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber +- kernels: @MekkCyber @drbh +- peft: @BenjaminBossan @githubnemo -Documentation: @stevhliu - -HF projects: +Devices/Backends: -- accelerate: [different repo](https://github.com/huggingface/accelerate) -- datasets: [different repo](https://github.com/huggingface/datasets) -- diffusers: [different repo](https://github.com/huggingface/diffusers) -- rust tokenizers: [different repo](https://github.com/huggingface/tokenizers) +- AMD ROCm: @ivarflakstad +- Intel XPU: @IlyasMoutawwakil +- Ascend NPU: @ivarflakstad -Maintained examples (not research project or legacy): +Documentation: @stevhliu -- Flax: @Rocketknight1 -- PyTorch: See Models above and tag the person corresponding to the modality of the example. -- TensorFlow: @Rocketknight1 +Research projects are not maintained and should be taken as is. --> diff --git a/.github/scripts/assign_reviewers.py b/.github/scripts/assign_reviewers.py index 02966204ea32..18567203596f 100644 --- a/.github/scripts/assign_reviewers.py +++ b/.github/scripts/assign_reviewers.py @@ -13,14 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os -import github import json -from github import Github +import os import re from collections import Counter from pathlib import Path +import github +from github import Github + + def pattern_to_regex(pattern): if pattern.startswith("/"): start_anchor = True diff --git a/.github/scripts/codeowners_for_review_action b/.github/scripts/codeowners_for_review_action index 7325b0f570cc..f6c4b65a1e22 100644 --- a/.github/scripts/codeowners_for_review_action +++ b/.github/scripts/codeowners_for_review_action @@ -7,8 +7,8 @@ docs/ @stevhliu /docker/ @ydshieh @ArthurZucker # More high-level globs catch cases when specific rules later don't apply -/src/transformers/models/*/processing* @molbap @yonigozlan @qubvel -/src/transformers/models/*/image_processing* @qubvel +/src/transformers/models/*/processing* @molbap @yonigozlan +/src/transformers/models/*/image_processing* @yonigozlan /src/transformers/models/*/image_processing_*_fast* @yonigozlan # Owners of subsections of the library @@ -186,65 +186,65 @@ trainer_utils.py @zach-huggingface @SunMarc /src/transformers/models/zamba/mod*_zamba* @ArthurZucker # Vision models -/src/transformers/models/beit/mod*_beit* @amyeroberts @qubvel -/src/transformers/models/bit/mod*_bit* @amyeroberts @qubvel -/src/transformers/models/conditional_detr/mod*_conditional_detr* @amyeroberts @qubvel -/src/transformers/models/convnext/mod*_convnext* @amyeroberts @qubvel -/src/transformers/models/convnextv2/mod*_convnextv2* @amyeroberts @qubvel -/src/transformers/models/cvt/mod*_cvt* @amyeroberts @qubvel -/src/transformers/models/deformable_detr/mod*_deformable_detr* @amyeroberts @qubvel -/src/transformers/models/deit/mod*_deit* @amyeroberts @qubvel -/src/transformers/models/depth_anything/mod*_depth_anything* @amyeroberts @qubvel -/src/transformers/models/depth_anything_v2/mod*_depth_anything_v2* @amyeroberts @qubvel -/src/transformers/models/deta/mod*_deta* @amyeroberts @qubvel -/src/transformers/models/detr/mod*_detr* @amyeroberts @qubvel -/src/transformers/models/dinat/mod*_dinat* @amyeroberts @qubvel -/src/transformers/models/dinov2/mod*_dinov2* @amyeroberts @qubvel -/src/transformers/models/dinov2_with_registers/mod*_dinov2_with_registers* @amyeroberts @qubvel -/src/transformers/models/dit/mod*_dit* @amyeroberts @qubvel -/src/transformers/models/dpt/mod*_dpt* @amyeroberts @qubvel -/src/transformers/models/efficientformer/mod*_efficientformer* @amyeroberts @qubvel -/src/transformers/models/efficientnet/mod*_efficientnet* @amyeroberts @qubvel -/src/transformers/models/focalnet/mod*_focalnet* @amyeroberts @qubvel -/src/transformers/models/glpn/mod*_glpn* @amyeroberts @qubvel -/src/transformers/models/hiera/mod*_hiera* @amyeroberts @qubvel -/src/transformers/models/ijepa/mod*_ijepa* @amyeroberts @qubvel -/src/transformers/models/imagegpt/mod*_imagegpt* @amyeroberts @qubvel -/src/transformers/models/levit/mod*_levit* @amyeroberts @qubvel -/src/transformers/models/mask2former/mod*_mask2former* @amyeroberts @qubvel -/src/transformers/models/maskformer/mod*_maskformer* @amyeroberts @qubvel -/src/transformers/models/mobilenet_v1/mod*_mobilenet_v1* @amyeroberts @qubvel -/src/transformers/models/mobilenet_v2/mod*_mobilenet_v2* @amyeroberts @qubvel -/src/transformers/models/mobilevit/mod*_mobilevit* @amyeroberts @qubvel -/src/transformers/models/mobilevitv2/mod*_mobilevitv2* @amyeroberts @qubvel -/src/transformers/models/nat/mod*_nat* @amyeroberts @qubvel -/src/transformers/models/poolformer/mod*_poolformer* @amyeroberts @qubvel -/src/transformers/models/pvt/mod*_pvt* 
@amyeroberts @qubvel -/src/transformers/models/pvt_v2/mod*_pvt_v2* @amyeroberts @qubvel -/src/transformers/models/regnet/mod*_regnet* @amyeroberts @qubvel -/src/transformers/models/resnet/mod*_resnet* @amyeroberts @qubvel -/src/transformers/models/rt_detr/mod*_rt_detr* @amyeroberts @qubvel -/src/transformers/models/segformer/mod*_segformer* @amyeroberts @qubvel -/src/transformers/models/seggpt/mod*_seggpt* @amyeroberts @qubvel -/src/transformers/models/superpoint/mod*_superpoint* @amyeroberts @qubvel -/src/transformers/models/swiftformer/mod*_swiftformer* @amyeroberts @qubvel -/src/transformers/models/swin/mod*_swin* @amyeroberts @qubvel -/src/transformers/models/swinv2/mod*_swinv2* @amyeroberts @qubvel -/src/transformers/models/swin2sr/mod*_swin2sr* @amyeroberts @qubvel -/src/transformers/models/table_transformer/mod*_table_transformer* @amyeroberts @qubvel -/src/transformers/models/textnet/mod*_textnet* @amyeroberts @qubvel -/src/transformers/models/timm_wrapper/mod*_timm_wrapper* @amyeroberts @qubvel -/src/transformers/models/upernet/mod*_upernet* @amyeroberts @qubvel -/src/transformers/models/van/mod*_van* @amyeroberts @qubvel -/src/transformers/models/vit/mod*_vit* @amyeroberts @qubvel -/src/transformers/models/vit_hybrid/mod*_vit_hybrid* @amyeroberts @qubvel -/src/transformers/models/vitdet/mod*_vitdet* @amyeroberts @qubvel -/src/transformers/models/vit_mae/mod*_vit_mae* @amyeroberts @qubvel -/src/transformers/models/vitmatte/mod*_vitmatte* @amyeroberts @qubvel -/src/transformers/models/vit_msn/mod*_vit_msn* @amyeroberts @qubvel -/src/transformers/models/vitpose/mod*_vitpose* @amyeroberts @qubvel -/src/transformers/models/yolos/mod*_yolos* @amyeroberts @qubvel -/src/transformers/models/zoedepth/mod*_zoedepth* @amyeroberts @qubvel +/src/transformers/models/beit/mod*_beit* @yonigozlan @molbap +/src/transformers/models/bit/mod*_bit* @yonigozlan @molbap +/src/transformers/models/conditional_detr/mod*_conditional_detr* @yonigozlan @molbap +/src/transformers/models/convnext/mod*_convnext* @yonigozlan @molbap +/src/transformers/models/convnextv2/mod*_convnextv2* @yonigozlan @molbap +/src/transformers/models/cvt/mod*_cvt* @yonigozlan @molbap +/src/transformers/models/deformable_detr/mod*_deformable_detr* @yonigozlan @molbap +/src/transformers/models/deit/mod*_deit* @yonigozlan @molbap +/src/transformers/models/depth_anything/mod*_depth_anything* @yonigozlan @molbap +/src/transformers/models/depth_anything_v2/mod*_depth_anything_v2* @yonigozlan @molbap +/src/transformers/models/deta/mod*_deta* @yonigozlan @molbap +/src/transformers/models/detr/mod*_detr* @yonigozlan @molbap +/src/transformers/models/dinat/mod*_dinat* @yonigozlan @molbap +/src/transformers/models/dinov2/mod*_dinov2* @yonigozlan @molbap +/src/transformers/models/dinov2_with_registers/mod*_dinov2_with_registers* @yonigozlan @molbap +/src/transformers/models/dit/mod*_dit* @yonigozlan @molbap +/src/transformers/models/dpt/mod*_dpt* @yonigozlan @molbap +/src/transformers/models/efficientformer/mod*_efficientformer* @yonigozlan @molbap +/src/transformers/models/efficientnet/mod*_efficientnet* @yonigozlan @molbap +/src/transformers/models/focalnet/mod*_focalnet* @yonigozlan @molbap +/src/transformers/models/glpn/mod*_glpn* @yonigozlan @molbap +/src/transformers/models/hiera/mod*_hiera* @yonigozlan @molbap +/src/transformers/models/ijepa/mod*_ijepa* @yonigozlan @molbap +/src/transformers/models/imagegpt/mod*_imagegpt* @yonigozlan @molbap +/src/transformers/models/levit/mod*_levit* @yonigozlan @molbap 
+/src/transformers/models/mask2former/mod*_mask2former* @yonigozlan @molbap +/src/transformers/models/maskformer/mod*_maskformer* @yonigozlan @molbap +/src/transformers/models/mobilenet_v1/mod*_mobilenet_v1* @yonigozlan @molbap +/src/transformers/models/mobilenet_v2/mod*_mobilenet_v2* @yonigozlan @molbap +/src/transformers/models/mobilevit/mod*_mobilevit* @yonigozlan @molbap +/src/transformers/models/mobilevitv2/mod*_mobilevitv2* @yonigozlan @molbap +/src/transformers/models/nat/mod*_nat* @yonigozlan @molbap +/src/transformers/models/poolformer/mod*_poolformer* @yonigozlan @molbap +/src/transformers/models/pvt/mod*_pvt* @yonigozlan @molbap +/src/transformers/models/pvt_v2/mod*_pvt_v2* @yonigozlan @molbap +/src/transformers/models/regnet/mod*_regnet* @yonigozlan @molbap +/src/transformers/models/resnet/mod*_resnet* @yonigozlan @molbap +/src/transformers/models/rt_detr/mod*_rt_detr* @yonigozlan @molbap +/src/transformers/models/segformer/mod*_segformer* @yonigozlan @molbap +/src/transformers/models/seggpt/mod*_seggpt* @yonigozlan @molbap +/src/transformers/models/superpoint/mod*_superpoint* @yonigozlan @molbap +/src/transformers/models/swiftformer/mod*_swiftformer* @yonigozlan @molbap +/src/transformers/models/swin/mod*_swin* @yonigozlan @molbap +/src/transformers/models/swinv2/mod*_swinv2* @yonigozlan @molbap +/src/transformers/models/swin2sr/mod*_swin2sr* @yonigozlan @molbap +/src/transformers/models/table_transformer/mod*_table_transformer* @yonigozlan @molbap +/src/transformers/models/textnet/mod*_textnet* @yonigozlan @molbap +/src/transformers/models/timm_wrapper/mod*_timm_wrapper* @yonigozlan @molbap +/src/transformers/models/upernet/mod*_upernet* @yonigozlan @molbap +/src/transformers/models/van/mod*_van* @yonigozlan @molbap +/src/transformers/models/vit/mod*_vit* @yonigozlan @molbap +/src/transformers/models/vit_hybrid/mod*_vit_hybrid* @yonigozlan @molbap +/src/transformers/models/vitdet/mod*_vitdet* @yonigozlan @molbap +/src/transformers/models/vit_mae/mod*_vit_mae* @yonigozlan @molbap +/src/transformers/models/vitmatte/mod*_vitmatte* @yonigozlan @molbap +/src/transformers/models/vit_msn/mod*_vit_msn* @yonigozlan @molbap +/src/transformers/models/vitpose/mod*_vitpose* @yonigozlan @molbap +/src/transformers/models/yolos/mod*_yolos* @yonigozlan @molbap +/src/transformers/models/zoedepth/mod*_zoedepth* @yonigozlan @molbap # Audio models /src/transformers/models/audio_spectrogram_transformer/mod*_audio_spectrogram_transformer* @eustlb @@ -304,7 +304,7 @@ trainer_utils.py @zach-huggingface @SunMarc /src/transformers/models/donut/mod*_donut* @zucchini-nlp /src/transformers/models/flava/mod*_flava* @zucchini-nlp /src/transformers/models/git/mod*_git* @zucchini-nlp -/src/transformers/models/grounding_dino/mod*_grounding_dino* @qubvel +/src/transformers/models/grounding_dino/mod*_grounding_dino* @yonigozlan /src/transformers/models/groupvit/mod*_groupvit* @zucchini-nlp /src/transformers/models/idefics/mod*_idefics* @zucchini-nlp /src/transformers/models/idefics2/mod*_idefics2* @zucchini-nlp @@ -326,10 +326,10 @@ trainer_utils.py @zach-huggingface @SunMarc /src/transformers/models/mgp_str/mod*_mgp_str* @zucchini-nlp /src/transformers/models/mllama/mod*_mllama* @zucchini-nlp /src/transformers/models/nougat/mod*_nougat* @NielsRogge -/src/transformers/models/omdet_turbo/mod*_omdet_turbo* @qubvel @yonigozlan +/src/transformers/models/omdet_turbo/mod*_omdet_turbo* @yonigozlan /src/transformers/models/oneformer/mod*_oneformer* @zucchini-nlp -/src/transformers/models/owlvit/mod*_owlvit* @qubvel 
-/src/transformers/models/owlv2/mod*_owlv2* @qubvel +/src/transformers/models/owlvit/mod*_owlvit* @yonigozlan +/src/transformers/models/owlv2/mod*_owlv2* @yonigozlan /src/transformers/models/paligemma/mod*_paligemma* @zucchini-nlp @molbap /src/transformers/models/perceiver/mod*_perceiver* @zucchini-nlp /src/transformers/models/pix2struct/mod*_pix2struct* @zucchini-nlp diff --git a/.github/workflows/benchmark_v2.yml b/.github/workflows/benchmark_v2.yml new file mode 100644 index 000000000000..fc9e07635185 --- /dev/null +++ b/.github/workflows/benchmark_v2.yml @@ -0,0 +1,85 @@ +name: Benchmark v2 Framework + +on: + workflow_call: + inputs: + runner: + description: 'GH Actions runner group to use' + required: true + type: string + container_image: + description: 'Docker image to use' + required: true + type: string + container_options: + description: 'Container options to use' + required: true + type: string + commit_sha: + description: 'Commit SHA to benchmark' + required: false + type: string + default: '' + run_id: + description: 'Custom run ID for organizing results (auto-generated if not provided)' + required: false + type: string + default: '' + benchmark_repo_id: + description: 'HuggingFace Dataset to upload results to (e.g., "org/benchmark-results")' + required: false + type: string + default: '' + +env: + HF_HOME: /mnt/cache + TRANSFORMERS_IS_CI: yes + # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. + # This token is created under the bot `hf-transformers-bot`. + HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} + +jobs: + benchmark-v2: + name: Benchmark v2 + runs-on: ${{ inputs.runner }} + if: | + (github.event_name == 'pull_request' && contains( github.event.pull_request.labels.*.name, 'run-benchmark')) || + (github.event_name == 'schedule') + container: + image: ${{ inputs.container_image }} + options: ${{ inputs.container_options }} + steps: + - name: Get repo + uses: actions/checkout@v4 + with: + ref: ${{ inputs.commit_sha || github.sha }} + + - name: Install benchmark dependencies + run: | + python3 -m pip install -r benchmark_v2/requirements.txt + + - name: Reinstall transformers in edit mode + run: | + python3 -m pip uninstall -y transformers + python3 -m pip install -e ".[torch]" + + - name: Show installed libraries and their versions + run: | + python3 -m pip list + python3 -c "import torch; print(f'PyTorch version: {torch.__version__}')" + python3 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')" + python3 -c "import torch; print(f'CUDA device count: {torch.cuda.device_count()}')" || true + nvidia-smi || true + + - name: Run benchmark v2 + working-directory: benchmark_v2 + run: | + echo "Running benchmarks" + python3 run_benchmarks.py \ + --commit-id '${{ inputs.commit_sha || github.sha }}' \ + --run-id '${{ inputs.run_id }}' \ + --push-to-hub '${{ inputs.benchmark_repo_id}}' \ + --token '${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}' \ + --log-level INFO + env: + HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/benchmark_v2_a10_caller.yml b/.github/workflows/benchmark_v2_a10_caller.yml new file mode 100644 index 000000000000..6573d398b000 --- /dev/null +++ b/.github/workflows/benchmark_v2_a10_caller.yml @@ -0,0 +1,21 @@ +name: Benchmark v2 Scheduled Runner - A10 Single-GPU + +on: + schedule: + # Run daily at 16:30 UTC + - cron: "30 16 * * *" + pull_request: + types: [ opened, labeled, reopened, synchronize ] + +jobs: + 
benchmark-v2-default: + name: Benchmark v2 - Default Models + uses: ./.github/workflows/benchmark_v2.yml + with: + runner: aws-g5-4xlarge-cache-use1-public-80 + container_image: huggingface/transformers-pytorch-gpu + container_options: --gpus all --privileged --ipc host --shm-size "16gb" + commit_sha: ${{ github.sha }} + run_id: ${{ github.run_id }} + benchmark_repo_id: hf-internal-testing/transformers-daily-benchmarks + secrets: inherit \ No newline at end of file diff --git a/.github/workflows/benchmark_v2_mi325_caller.yml b/.github/workflows/benchmark_v2_mi325_caller.yml new file mode 100644 index 000000000000..ed403148e596 --- /dev/null +++ b/.github/workflows/benchmark_v2_mi325_caller.yml @@ -0,0 +1,21 @@ +name: Benchmark v2 Scheduled Runner - MI325 Single-GPU + +on: + schedule: + # Run daily at 16:30 UTC + - cron: "30 16 * * *" + pull_request: + types: [ opened, labeled, reopened, synchronize ] + +jobs: + benchmark-v2-default: + name: Benchmark v2 - Default Models + uses: ./.github/workflows/benchmark_v2.yml + with: + runner: amd-mi325-ci-1gpu + container_image: huggingface/transformers-pytorch-amd-gpu + container_options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache + commit_sha: ${{ github.sha }} + run_id: ${{ github.run_id }} + benchmark_repo_id: hf-internal-testing/transformers-daily-benchmarks + secrets: inherit \ No newline at end of file diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index fe1f18f42b99..b53c6a4671f0 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -5,6 +5,7 @@ on: branches: - build_ci_docker_image* repository_dispatch: + workflow_dispatch: workflow_call: inputs: image_postfix: @@ -221,7 +222,7 @@ jobs: latest-pytorch-amd: name: "Latest PyTorch (AMD) [dev]" runs-on: - group: aws-general-8-plus + group: aws-highcpu-32-priv steps: - name: Set up Docker Buildx diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index c55638ded149..28982d04eb46 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -16,8 +16,20 @@ jobs: commit_sha: ${{ github.sha }} package: transformers notebook_folder: transformers_doc - languages: ar de en es fr hi it ko pt tr zh ja te + languages: en custom_container: huggingface/transformers-doc-builder secrets: token: ${{ secrets.HUGGINGFACE_PUSH }} hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} + + build_other_lang: + uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main + with: + commit_sha: ${{ github.sha }} + package: transformers + notebook_folder: transformers_doc + languages: ar de es fr hi it ja ko pt zh + custom_container: huggingface/transformers-doc-builder + secrets: + token: ${{ secrets.HUGGINGFACE_PUSH }} + hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} \ No newline at end of file diff --git a/.github/workflows/model_jobs.yml b/.github/workflows/model_jobs.yml index 5da145c2b006..83f818fcda3b 100644 --- a/.github/workflows/model_jobs.yml +++ b/.github/workflows/model_jobs.yml @@ -128,28 +128,47 @@ jobs: echo "machine_type=$machine_type" >> $GITHUB_ENV echo "machine_type=$machine_type" >> $GITHUB_OUTPUT + - name: Create report directory if it doesn't exist + shell: bash + run: | + mkdir -p /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports + echo 
"dummy" > /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/dummy.txt + ls -la /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports + - name: Run all tests on GPU working-directory: /transformers - run: python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} + run: | + script -q -c "PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports tests/${{ matrix.folders }}" test_outputs.txt + ls -la + # Extract the exit code from the output file + EXIT_CODE=$(tail -1 test_outputs.txt | grep -o 'COMMAND_EXIT_CODE="[0-9]*"' | cut -d'"' -f2) + exit ${EXIT_CODE:-1} - name: Failure short reports if: ${{ failure() }} + # This step is only to show information on Github Actions log. + # Always mark this step as successful, even if the report directory or the file `failures_short.txt` in it doesn't exist continue-on-error: true - run: cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/failures_short.txt + run: cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/failures_short.txt - - name: Run test - shell: bash + - name: Captured information + if: ${{ failure() }} + continue-on-error: true + run: | + cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/captured_info.txt + + - name: Copy test_outputs.txt + if: ${{ always() }} + continue-on-error: true run: | - mkdir -p /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports - echo "hello" > /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/hello.txt - echo "${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports" + cp /transformers/test_outputs.txt /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports - name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports" if: ${{ always() }} uses: actions/upload-artifact@v4 with: name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports + path: /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports collated_reports: name: Collated Reports diff --git a/.github/workflows/pr_build_doc_with_comment.yml b/.github/workflows/pr_build_doc_with_comment.yml index ec43c5b2cf96..59aa22eef1ec 100644 --- a/.github/workflows/pr_build_doc_with_comment.yml +++ b/.github/workflows/pr_build_doc_with_comment.yml @@ -14,7 +14,7 @@ permissions: {} jobs: get-pr-number: name: Get PR number - if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", 
"molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad", "stevhliu", "ebezzam"]'), github.actor) && (startsWith(github.event.comment.body, 'build-doc')) }} + if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "eustlb", "MekkCyber", "vasqu", "ivarflakstad", "stevhliu", "ebezzam", "itazap"]'), github.actor) && (startsWith(github.event.comment.body, 'build-doc')) }} uses: ./.github/workflows/get-pr-number.yml get-pr-info: diff --git a/.github/workflows/self-comment-ci.yml b/.github/workflows/self-comment-ci.yml index f1c93aab5a86..e485973dcb05 100644 --- a/.github/workflows/self-comment-ci.yml +++ b/.github/workflows/self-comment-ci.yml @@ -29,7 +29,7 @@ jobs: runs-on: ubuntu-22.04 name: Get PR number # For security: only allow team members to run - if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad", "stevhliu", "ebezzam", "remi-or"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }} + if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "eustlb", "MekkCyber", "vasqu", "ivarflakstad", "stevhliu", "ebezzam", "remi-or", "itazap"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }} outputs: PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }} steps: diff --git a/.github/workflows/self-scheduled-amd-mi325-caller.yml b/.github/workflows/self-scheduled-amd-mi325-caller.yml index 8c2bad414bcf..510b3f6e2c78 100644 --- a/.github/workflows/self-scheduled-amd-mi325-caller.yml +++ b/.github/workflows/self-scheduled-amd-mi325-caller.yml @@ -20,7 +20,7 @@ jobs: with: job: run_models_gpu slack_report_channel: "#amd-hf-ci" - runner_scale_set: amd-mi325-ci + runner_group: amd-mi325 docker: huggingface/transformers-pytorch-amd-gpu ci_event: Scheduled CI (AMD) - mi325 report_repo_id: optimum-amd/transformers_daily_ci @@ -33,7 +33,7 @@ jobs: with: job: run_pipelines_torch_gpu slack_report_channel: "#amd-hf-ci" - runner_scale_set: amd-mi325-ci + runner_group: amd-mi325 docker: huggingface/transformers-pytorch-amd-gpu ci_event: Scheduled CI (AMD) - mi325 report_repo_id: optimum-amd/transformers_daily_ci @@ -46,7 +46,7 @@ jobs: with: job: run_examples_gpu slack_report_channel: "#amd-hf-ci" - runner_scale_set: amd-mi325-ci + runner_group: amd-mi325 docker: huggingface/transformers-pytorch-amd-gpu ci_event: Scheduled CI (AMD) - mi325 report_repo_id: optimum-amd/transformers_daily_ci @@ -59,7 +59,7 @@ jobs: with: job: run_torch_cuda_extensions_gpu slack_report_channel: "#amd-hf-ci" - runner_scale_set: amd-mi325-ci + runner_group: amd-mi325 docker: huggingface/transformers-pytorch-deepspeed-amd-gpu ci_event: Scheduled CI (AMD) - mi325 report_repo_id: optimum-amd/transformers_daily_ci diff --git a/.github/workflows/self-scheduled-amd-mi355-caller.yml 
b/.github/workflows/self-scheduled-amd-mi355-caller.yml index d7061f433569..1b5dbe96ad97 100644 --- a/.github/workflows/self-scheduled-amd-mi355-caller.yml +++ b/.github/workflows/self-scheduled-amd-mi355-caller.yml @@ -3,7 +3,7 @@ name: Self-hosted runner scale set (AMD mi355 scheduled CI caller) # Note: For every job in this workflow, the name of the runner scale set is finalized in the runner yaml i.e. huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml # For example, 1gpu : amd-mi355-ci-1gpu # 2gpu : amd-mi355-ci-2gpu - + on: workflow_run: workflows: ["Self-hosted runner (AMD scheduled CI caller)"] @@ -20,7 +20,7 @@ jobs: with: job: run_models_gpu slack_report_channel: "#amd-hf-ci" - runner_scale_set: amd-mi355-ci + runner_group: hfc-amd-mi355 docker: huggingface/testing-rocm7.0-preview ci_event: Scheduled CI (AMD) - mi355 report_repo_id: hf-transformers-bot/transformers-ci-dummy @@ -32,7 +32,7 @@ jobs: with: job: run_pipelines_torch_gpu slack_report_channel: "#amd-hf-ci" - runner_scale_set: amd-mi355-ci + runner_group: hfc-amd-mi355 docker: huggingface/testing-rocm7.0-preview ci_event: Scheduled CI (AMD) - mi355 report_repo_id: hf-transformers-bot/transformers-ci-dummy @@ -44,7 +44,7 @@ jobs: with: job: run_examples_gpu slack_report_channel: "#amd-hf-ci" - runner_scale_set: amd-mi355-ci + runner_group: hfc-amd-mi355 docker: huggingface/testing-rocm7.0-preview ci_event: Scheduled CI (AMD) - mi355 report_repo_id: hf-transformers-bot/transformers-ci-dummy @@ -53,10 +53,10 @@ jobs: deepspeed-ci: name: DeepSpeed CI uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main - with: + with: job: run_torch_cuda_extensions_gpu slack_report_channel: "#amd-hf-ci" - runner_scale_set: amd-mi355-ci + runner_group: hfc-amd-mi355 docker: huggingface/testing-rocm7.0-preview ci_event: Scheduled CI (AMD) - mi355 report_repo_id: hf-transformers-bot/transformers-ci-dummy diff --git a/.gitignore b/.gitignore index cdf189505dc7..b59797c2188b 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,7 @@ tests/fixtures/cached_*_text.txt logs/ lightning_logs/ lang_code_data/ +reports/ # Distribution / packaging .Python diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7728546633b9..ea62fd545882 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -278,13 +278,14 @@ are working on it).
useful to avoid duplicated work, and to differentiate it from PRs ready to be merged.
☐ Make sure existing tests pass.
☐ If adding a new feature, also add tests for it.
- - If you are adding a new model, make sure you use + +- If you are adding a new model, make sure you use `ModelTester.all_model_classes = (MyModel, MyModelWithLMHead,...)` to trigger the common tests. - - If you are adding new `@slow` tests, make sure they pass using +- If you are adding new `@slow` tests, make sure they pass using `RUN_SLOW=1 python -m pytest tests/models/my_new_model/test_my_new_model.py`. - - If you are adding a new tokenizer, write tests and make sure +- If you are adding a new tokenizer, write tests and make sure `RUN_SLOW=1 python -m pytest tests/models/{your_model_name}/test_tokenization_{your_model_name}.py` passes. - - CircleCI does not run the slow tests, but GitHub Actions does every night!
+- CircleCI does not run the slow tests, but GitHub Actions does every night!
☐ All public methods must have informative docstrings (see [`modeling_bert.py`](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py) @@ -340,6 +341,7 @@ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/t ``` Like the slow tests, there are other environment variables available which are not enabled by default during testing: + - `RUN_CUSTOM_TOKENIZERS`: Enables tests for custom tokenizers. More environment variables and additional information can be found in the [testing_utils.py](https://github.com/huggingface/transformers/blob/main/src/transformers/testing_utils.py). diff --git a/ISSUES.md b/ISSUES.md index 9c96162647bc..c87bd9fc2c3f 100644 --- a/ISSUES.md +++ b/ISSUES.md @@ -38,7 +38,6 @@ In particular all "Please explain" questions or objectively very user-specific f * "How to train T5 on De->En translation?" - ## The GitHub Issues Everything which hints at a bug should be opened as an [issue](https://github.com/huggingface/transformers/issues). @@ -247,7 +246,6 @@ You are not required to read the following guidelines before opening an issue. H Try not use italics and bold text too much as these often make the text more difficult to read. - 12. If you are cross-referencing a specific comment in a given thread or another issue, always link to that specific comment, rather than using the issue link. If you do the latter it could be quite impossible to find which specific comment you're referring to. To get the link to the specific comment do not copy the url from the location bar of your browser, but instead, click the `...` icon in the upper right corner of the comment and then select "Copy Link". @@ -257,7 +255,6 @@ You are not required to read the following guidelines before opening an issue. H 1. https://github.com/huggingface/transformers/issues/9257 2. https://github.com/huggingface/transformers/issues/9257#issuecomment-749945162 - 13. If you are replying to a last comment, it's totally fine to make your reply with just your comment in it. The readers can follow the information flow here. But if you're replying to a comment that happened some comments back it's always a good practice to quote just the relevant lines you're replying it. The `>` is used for quoting, or you can always use the menu to do so. For example your editor box will look like: diff --git a/README.md b/README.md index 5d782bcea78e..f01a2bcc6e52 100644 --- a/README.md +++ b/README.md @@ -48,9 +48,11 @@ limitations under the License. తెలుగు | Français | Deutsch | + Italiano | Tiếng Việt | العربية | اردو | + বাংলা |

@@ -62,12 +64,11 @@ limitations under the License. +Transformers acts as the model-definition framework for state-of-the-art machine learning models in text, computer +vision, audio, video, and multimodal model, for both inference and training. -Transformers acts as the model-definition framework for state-of-the-art machine learning models in text, computer -vision, audio, video, and multimodal model, for both inference and training. - -It centralizes the model definition so that this definition is agreed upon across the ecosystem. `transformers` is the -pivot across frameworks: if a model definition is supported, it will be compatible with the majority of training +It centralizes the model definition so that this definition is agreed upon across the ecosystem. `transformers` is the +pivot across frameworks: if a model definition is supported, it will be compatible with the majority of training frameworks (Axolotl, Unsloth, DeepSpeed, FSDP, PyTorch-Lightning, ...), inference engines (vLLM, SGLang, TGI, ...), and adjacent modeling libraries (llama.cpp, mlx, ...) which leverage the model definition from `transformers`. @@ -110,10 +111,10 @@ git clone https://github.com/huggingface/transformers.git cd transformers # pip -pip install .[torch] +pip install '.[torch]' # uv -uv pip install .[torch] +uv pip install '.[torch]' ``` ## Quickstart @@ -193,7 +194,6 @@ pipeline("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.pn
Visual question answering -

diff --git a/awesome-transformers.md b/awesome-transformers.md index adc84f101eae..d0398e7bde6a 100644 --- a/awesome-transformers.md +++ b/awesome-transformers.md @@ -6,7 +6,7 @@ developers, researchers, students, professors, engineers, and anyone else to bui In this list, we showcase incredibly impactful and novel projects that have pushed the field forward. We celebrate 100 of these projects as we reach the milestone of 100k stars as a community; but we're very open to pull requests -adding other projects to the list. If you believe a project should be here and it's not, then please, open a PR +adding other projects to the list. If you believe a project should be here and it's not, then please, open a PR to add it. ## [gpt4all](https://github.com/nomic-ai/gpt4all) @@ -49,7 +49,7 @@ Keywords: LLMs, Large Language Models, Agents, Chains [LlamaIndex](https://github.com/run-llama/llama_index) is a project that provides a central interface to connect your LLM's with external data. It provides various kinds of indices and retrieval mechanisms to perform different LLM tasks and obtain knowledge-augmented results. -Keywords: LLMs, Large Language Models, Data Retrieval, Indices, Knowledge Augmentation +Keywords: LLMs, Large Language Models, Data Retrieval, Indices, Knowledge Augmentation ## [ParlAI](https://github.com/facebookresearch/ParlAI) @@ -257,7 +257,7 @@ Stable-Dreamfusion is a pytorch implementation of the text-to-3D model Dreamfusi Keywords: Text-to-3D, Stable Diffusion ## [txtai](https://github.com/neuml/txtai) - + [txtai](https://github.com/neuml/txtai) is an open-source platform for semantic search and workflows powered by language models. txtai builds embeddings databases, which are a union of vector indexes and relational databases enabling similarity search with SQL. Semantic workflows connect language models together into unified applications. Keywords: Semantic search, LLM @@ -309,8 +309,8 @@ Keywords: OCR, LaTeX, Math formula OpenCLIP is an open source implementation of OpenAI's CLIP. -The goal of this repository is to enable training models with contrastive image-text supervision, and to investigate their properties such as robustness to distribution shift. -The starting point is an implementation of CLIP that matches the accuracy of the original CLIP models when trained on the same dataset. +The goal of this repository is to enable training models with contrastive image-text supervision, and to investigate their properties such as robustness to distribution shift. +The starting point is an implementation of CLIP that matches the accuracy of the original CLIP models when trained on the same dataset. Specifically, a ResNet-50 model trained with this codebase on OpenAI's 15 million image subset of YFCC achieves 32.7% top-1 accuracy on ImageNet. @@ -596,7 +596,7 @@ Keywords: Data-Centric AI, Data Quality, Noisy Labels, Outlier Detection, Active ## [BentoML](https://github.com/bentoml/BentoML) -[BentoML](https://github.com/bentoml) is the unified framework for building, shipping, and scaling production-ready AI applications incorporating traditional ML, pre-trained AI models, Generative and Large Language Models. +[BentoML](https://github.com/bentoml) is the unified framework for building, shipping, and scaling production-ready AI applications incorporating traditional ML, pre-trained AI models, Generative and Large Language Models. 
All Hugging Face models and pipelines can be seamlessly integrated into BentoML applications, enabling the running of models on the most suitable hardware and independent scaling based on usage. Keywords: BentoML, Framework, Deployment, AI Applications @@ -606,4 +606,3 @@ Keywords: BentoML, Framework, Deployment, AI Applications [LLaMA Factory](https://github.com/hiyouga/LLaMA-Factory) offers a user-friendly fine-tuning framework that incorporates PEFT. The repository includes training(fine-tuning) and inference examples for LLaMA-2, BLOOM, Falcon, Baichuan, Qwen, and other LLMs. A ChatGLM version is also available in [ChatGLM-Efficient-Tuning](https://github.com/hiyouga/ChatGLM-Efficient-Tuning). Keywords: PEFT, fine-tuning, LLaMA-2, ChatGLM, Qwen - diff --git a/benchmark_v2/README.md b/benchmark_v2/README.md index 9a0102b387fc..bcbb9cc71ef3 100644 --- a/benchmark_v2/README.md +++ b/benchmark_v2/README.md @@ -21,6 +21,46 @@ python run_benchmarks.py \ --num-tokens-to-generate 200 ``` +### Uploading Results to HuggingFace Dataset + +You can automatically upload benchmark results to a HuggingFace Dataset for tracking and analysis: + +```bash +# Upload to a public dataset with auto-generated run ID +python run_benchmarks.py --upload-to-hub username/benchmark-results + +# Upload with a custom run ID for easy identification +python run_benchmarks.py --upload-to-hub username/benchmark-results --run-id experiment_v1 + +# Upload with custom HuggingFace token (if not set in environment) +python run_benchmarks.py --upload-to-hub username/benchmark-results --token hf_your_token_here +``` + +**Dataset Directory Structure:** +``` +dataset_name/ +├── 2025-01-15/ +│ ├── runs/ # Non-scheduled runs (manual, PR, etc.) +│ │ └── 123-1245151651/ # GitHub run number and ID +│ │ └── benchmark_results/ +│ │ ├── benchmark_summary_20250115_143022.json +│ │ └── model-name/ +│ │ └── model-name_benchmark_20250115_143022.json +│ └── benchmark_results_abc123de/ # Scheduled runs (daily CI) +│ ├── benchmark_summary_20250115_143022.json +│ └── model-name/ +│ └── model-name_benchmark_20250115_143022.json +└── 2025-01-16/ + └── ... +``` + +**Authentication for Uploads:** + +For uploading results, you need a HuggingFace token with write permissions to the target dataset. You can provide the token in several ways (in order of precedence): + +1. Command line: `--token hf_your_token_here` +3. 
Environment variable: `HF_TOKEN` + ### Running Specific Benchmarks ```bash diff --git a/benchmark_v2/benches/llama.py b/benchmark_v2/benches/llama.py index 23427a8549c7..2349e75f1347 100644 --- a/benchmark_v2/benches/llama.py +++ b/benchmark_v2/benches/llama.py @@ -20,7 +20,6 @@ from benchmark_framework import ModelBenchmark -os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" os.environ["TOKENIZERS_PARALLELISM"] = "1" torch.set_float32_matmul_precision("high") diff --git a/benchmark_v2/requirements.txt b/benchmark_v2/requirements.txt index a7a435958cf7..e4dcbb3eb7ef 100644 --- a/benchmark_v2/requirements.txt +++ b/benchmark_v2/requirements.txt @@ -3,4 +3,5 @@ psutil>=5.8.0 gpustat>=1.0.0 torch>=2.0.0 transformers>=4.30.0 -datasets>=2.10.0 \ No newline at end of file +datasets>=2.10.0 +huggingface_hub>=0.16.0 \ No newline at end of file diff --git a/benchmark_v2/run_benchmarks.py b/benchmark_v2/run_benchmarks.py index 26c816b9d16d..d04069887f2d 100755 --- a/benchmark_v2/run_benchmarks.py +++ b/benchmark_v2/run_benchmarks.py @@ -24,6 +24,7 @@ import logging import os import sys +import uuid from datetime import datetime from pathlib import Path from typing import Any, Optional @@ -160,7 +161,12 @@ def run_single_benchmark( return None -def generate_summary_report(output_dir: str, benchmark_results: dict[str, Any], logger: logging.Logger) -> str: +def generate_summary_report( + output_dir: str, + benchmark_results: dict[str, Any], + logger: logging.Logger, + benchmark_run_uuid: Optional[str] = None, +) -> str: """Generate a summary report of all benchmark runs.""" timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") summary_file = os.path.join(output_dir, f"benchmark_summary_{timestamp}.json") @@ -168,6 +174,7 @@ def generate_summary_report(output_dir: str, benchmark_results: dict[str, Any], summary_data = { "run_metadata": { "timestamp": datetime.utcnow().isoformat(), + "benchmark_run_uuid": benchmark_run_uuid, "total_benchmarks": len(benchmark_results), "successful_benchmarks": len([r for r in benchmark_results.values() if r is not None]), "failed_benchmarks": len([r for r in benchmark_results.values() if r is None]), @@ -183,9 +190,114 @@ def generate_summary_report(output_dir: str, benchmark_results: dict[str, Any], return summary_file +def upload_results_to_hf_dataset( + output_dir: str, + summary_file: str, + dataset_name: str, + run_id: Optional[str] = None, + token: Optional[str] = None, + logger: Optional[logging.Logger] = None, +) -> Optional[str]: + """ + Upload benchmark results to a HuggingFace Dataset. 
+ Based on upload_collated_report() from utils/collated_reports.py + Args: + output_dir: Local output directory containing results + summary_file: Path to the summary file + dataset_name: Name of the HuggingFace dataset to upload to + run_id: Unique run identifier (if None, will generate one) + token: HuggingFace token for authentication (if None, will use environment variables) + logger: Logger instance + Returns: + The run_id used for the upload, None if upload failed + """ + if logger is None: + logger = logging.getLogger(__name__) + + import os + + from huggingface_hub import HfApi + + api = HfApi() + + if run_id is None: + github_run_number = os.getenv("GITHUB_RUN_NUMBER") + github_run_id = os.getenv("GITHUB_RUN_ID") + if github_run_number and github_run_id: + run_id = f"{github_run_number}-{github_run_id}" + + date_folder = datetime.now().strftime("%Y-%m-%d") + + github_event_name = os.getenv("GITHUB_EVENT_NAME") + if github_event_name != "schedule": + # Non-scheduled runs go under a runs subfolder + repo_path = f"{date_folder}/runs/{run_id}/benchmark_results" + else: + # Scheduled runs go directly under the date + repo_path = f"{date_folder}/{run_id}/benchmark_results" + + logger.info(f"Uploading benchmark results to dataset '{dataset_name}' at path '{repo_path}'") + + try: + # Upload all files in the output directory + from pathlib import Path + + output_path = Path(output_dir) + + for file_path in output_path.rglob("*"): + if file_path.is_file(): + # Calculate relative path from output_dir + relative_path = file_path.relative_to(output_path) + path_in_repo = f"{repo_path}/{relative_path}" + + logger.debug(f"Uploading {file_path} to {path_in_repo}") + + api.upload_file( + path_or_fileobj=str(file_path), + path_in_repo=path_in_repo, + repo_id=dataset_name, + repo_type="dataset", + token=token, + commit_message=f"Upload benchmark results for run {run_id}", + ) + + logger.info( + f"Successfully uploaded results to: https://huggingface.co/datasets/{dataset_name}/tree/main/{repo_path}" + ) + + return run_id + + except Exception as upload_error: + logger.error(f"Failed to upload results: {upload_error}") + import traceback + + logger.debug(traceback.format_exc()) + return None + + def main(): """Main entry point for the benchmarking script.""" - parser = argparse.ArgumentParser(description="Run all benchmarks in the ./benches directory") + # Generate a unique UUID for this benchmark run + benchmark_run_uuid = str(uuid.uuid4())[:8] + + parser = argparse.ArgumentParser( + description="Run all benchmarks in the ./benches directory", + epilog=""" +Examples: + # Run all available benchmarks + python3 run_benchmarks.py + + # Run with specific model and upload to HuggingFace Dataset + python3 run_benchmarks.py --model-id meta-llama/Llama-2-7b-hf --upload-to-hf username/benchmark-results + + # Run with custom run ID and upload to HuggingFace Dataset + python3 run_benchmarks.py --run-id experiment_v1 --upload-to-hf org/benchmarks + + # Run only specific benchmarks with file logging + python3 run_benchmarks.py --include llama --enable-file-logging + """, # noqa: W293 + formatter_class=argparse.RawDescriptionHelpFormatter, + ) parser.add_argument( "--output-dir", @@ -228,20 +340,35 @@ def main(): parser.add_argument("--exclude", type=str, nargs="*", help="Exclude benchmarks matching these names") - parser.add_argument("--enable-mock", action="store_true", help="Enable mock benchmark (skipped by default)") - parser.add_argument("--enable-file-logging", action="store_true", help="Enable file logging 
(disabled by default)") parser.add_argument( "--commit-id", type=str, help="Git commit ID for metadata (if not provided, will auto-detect from git)" ) + parser.add_argument( + "--push-to-hub", + type=str, + help="Upload results to HuggingFace Dataset (provide dataset name, e.g., 'username/benchmark-results')", + ) + + parser.add_argument( + "--run-id", type=str, help="Custom run ID for organizing results (if not provided, will generate a unique ID)" + ) + + parser.add_argument( + "--token", + type=str, + help="HuggingFace token for dataset uploads (if not provided, will use HF_TOKEN environment variable)", + ) + args = parser.parse_args() # Setup logging logger = setup_logging(args.log_level, args.enable_file_logging) logger.info("Starting benchmark discovery and execution") + logger.info(f"Benchmark run UUID: {benchmark_run_uuid}") logger.info(f"Output directory: {args.output_dir}") logger.info(f"Benches directory: {args.benches_dir}") @@ -286,9 +413,6 @@ def main(): if args.model_id: benchmark_kwargs["model_id"] = args.model_id - # Add enable_mock flag for mock benchmark - benchmark_kwargs["enable_mock"] = args.enable_mock - # Add commit_id if provided if args.commit_id: benchmark_kwargs["commit_id"] = args.commit_id @@ -306,7 +430,28 @@ def main(): successful_count += 1 # Generate summary report - summary_file = generate_summary_report(args.output_dir, benchmark_results, logger) + summary_file = generate_summary_report(args.output_dir, benchmark_results, logger, benchmark_run_uuid) + + # Upload results to HuggingFace Dataset if requested + upload_run_id = None + if args.push_to_hub: + logger.info("=" * 60) + logger.info("UPLOADING TO HUGGINGFACE DATASET") + logger.info("=" * 60) + # Use provided run_id or fallback to benchmark run UUID + effective_run_id = args.run_id or benchmark_run_uuid + upload_run_id = upload_results_to_hf_dataset( + output_dir=args.output_dir, + summary_file=summary_file, + dataset_name=args.push_to_hub, + run_id=effective_run_id, + token=args.token, + logger=logger, + ) + if upload_run_id: + logger.info(f"Upload completed with run ID: {upload_run_id}") + else: + logger.warning("Upload failed - continuing with local results") # Final summary total_benchmarks = len(filtered_benchmarks) @@ -321,6 +466,16 @@ def main(): logger.info(f"Output directory: {args.output_dir}") logger.info(f"Summary report: {summary_file}") + if args.push_to_hub: + if upload_run_id: + logger.info(f"HuggingFace Dataset: {args.push_to_hub}") + logger.info(f"Run ID: {upload_run_id}") + logger.info( + f"View results: https://huggingface.co/datasets/{args.push_to_hub}/tree/main/{datetime.now().strftime('%Y-%m-%d')}/runs/{upload_run_id}" + ) + else: + logger.warning("Upload to HuggingFace Dataset failed") + if failed_count > 0: logger.warning(f"{failed_count} benchmark(s) failed. 
Check logs for details.") return 1 diff --git a/conftest.py b/conftest.py index 67064fbd5d3d..69dfb0b3bc20 100644 --- a/conftest.py +++ b/conftest.py @@ -54,7 +54,6 @@ "test_gradient_checkpointing_backward_compatibility", "test_gradient_checkpointing_enable_disable", "test_torch_save_load", - "test_initialization", "test_forward_signature", "test_model_get_set_embeddings", "test_model_main_input_name", @@ -64,8 +63,7 @@ "test_load_save_without_tied_weights", "test_tied_weights_keys", "test_model_weights_reload_no_missing_tied_weights", - "test_mismatched_shapes_have_properly_initialized_weights", - "test_matched_shapes_have_loaded_weights_when_some_mismatched_shapes_exist", + "test_can_load_ignoring_mismatched_shapes", "test_model_is_small", "test_tf_from_pt_safetensors", "test_flax_from_pt_safetensors", @@ -93,6 +91,8 @@ def pytest_configure(config): config.addinivalue_line("markers", "torch_compile_test: mark test which tests torch compile functionality") config.addinivalue_line("markers", "torch_export_test: mark test which tests torch export functionality") + os.environ['DISABLE_SAFETENSORS_CONVERSION'] = 'true' + def pytest_collection_modifyitems(items): for item in items: diff --git a/docker/consistency.dockerfile b/docker/consistency.dockerfile index e569307f92dc..08f23db55e94 100644 --- a/docker/consistency.dockerfile +++ b/docker/consistency.dockerfile @@ -1,4 +1,4 @@ -FROM python:3.9-slim +FROM python:3.10-slim ENV PYTHONDONTWRITEBYTECODE=1 USER root ARG REF=main diff --git a/docker/custom-tokenizers.dockerfile b/docker/custom-tokenizers.dockerfile index 00ab463f4b5a..c00a9edb7db2 100644 --- a/docker/custom-tokenizers.dockerfile +++ b/docker/custom-tokenizers.dockerfile @@ -1,4 +1,4 @@ -FROM python:3.9-slim +FROM python:3.10-slim ENV PYTHONDONTWRITEBYTECODE=1 ARG REF=main USER root diff --git a/docker/examples-torch.dockerfile b/docker/examples-torch.dockerfile index 4f8a694021b2..5960930ae48c 100644 --- a/docker/examples-torch.dockerfile +++ b/docker/examples-torch.dockerfile @@ -1,4 +1,4 @@ -FROM python:3.9-slim +FROM python:3.10-slim ENV PYTHONDONTWRITEBYTECODE=1 ARG REF=main USER root diff --git a/docker/exotic-models.dockerfile b/docker/exotic-models.dockerfile index d603a57c4c06..1e16ae77d4a9 100644 --- a/docker/exotic-models.dockerfile +++ b/docker/exotic-models.dockerfile @@ -1,4 +1,4 @@ -FROM python:3.9-slim +FROM python:3.10-slim ENV PYTHONDONTWRITEBYTECODE=1 ARG REF=main USER root diff --git a/docker/pipeline-torch.dockerfile b/docker/pipeline-torch.dockerfile index 6759f156687f..e434eeaed93f 100644 --- a/docker/pipeline-torch.dockerfile +++ b/docker/pipeline-torch.dockerfile @@ -1,4 +1,4 @@ -FROM python:3.9-slim +FROM python:3.10-slim ENV PYTHONDONTWRITEBYTECODE=1 ARG REF=main USER root diff --git a/docker/quality.dockerfile b/docker/quality.dockerfile index 7a619e315689..6455a27d642b 100644 --- a/docker/quality.dockerfile +++ b/docker/quality.dockerfile @@ -1,4 +1,4 @@ -FROM python:3.9-slim +FROM python:3.10-slim ENV PYTHONDONTWRITEBYTECODE=1 ARG REF=main USER root diff --git a/docker/torch-light.dockerfile b/docker/torch-light.dockerfile index d670b421be7f..14ba613bdb37 100644 --- a/docker/torch-light.dockerfile +++ b/docker/torch-light.dockerfile @@ -1,4 +1,4 @@ -FROM python:3.9-slim +FROM python:3.10-slim ENV PYTHONDONTWRITEBYTECODE=1 ARG REF=main USER root diff --git a/docker/transformers-pytorch-amd-gpu/Dockerfile b/docker/transformers-pytorch-amd-gpu/Dockerfile index 37542ffb8943..eba5b984cce4 100644 --- a/docker/transformers-pytorch-amd-gpu/Dockerfile +++ 
b/docker/transformers-pytorch-amd-gpu/Dockerfile @@ -38,3 +38,10 @@ RUN python3 -m pip uninstall -y kernels # On ROCm, torchcodec is required to decode audio files and 0.4 or 0.6 fails RUN python3 -m pip install --no-cache-dir "torchcodec==0.5" + +# Install flash attention from source. Tested with commit 6387433156558135a998d5568a9d74c1778666d8 +RUN git clone https://github.com/ROCm/flash-attention/ -b tridao && \ + cd flash-attention && \ + GPU_ARCHS="gfx942" python setup.py install + +RUN python3 -m pip install --no-cache-dir einops diff --git a/docker/transformers-quantization-latest-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile index deb6761db8e0..2b25ca091b5c 100755 --- a/docker/transformers-quantization-latest-gpu/Dockerfile +++ b/docker/transformers-quantization-latest-gpu/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04 +FROM nvidia/cuda:12.6.0-cudnn-devel-ubuntu22.04 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive @@ -9,9 +9,9 @@ SHELL ["sh", "-lc"] # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant # to be used as arguments for docker build (so far). -ARG PYTORCH='2.6.0' +ARG PYTORCH='2.8.0' # Example: `cu102`, `cu113`, etc. -ARG CUDA='cu121' +ARG CUDA='cu126' # Disable kernel mapping for quantization tests ENV DISABLE_KERNEL_MAPPING=1 @@ -30,31 +30,20 @@ RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio tor RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate -# needed in bnb and awq -RUN python3 -m pip install --no-cache-dir einops - -# Add bitsandbytes for mixed int8 testing -RUN python3 -m pip install --no-cache-dir bitsandbytes - -# Add gptqmodel for gtpq quantization testing, installed from source for pytorch==2.6.0 compatibility -RUN python3 -m pip install lm_eval -RUN git clone https://github.com/ModelCloud/GPTQModel.git && cd GPTQModel && pip install -v . --no-build-isolation - # Add optimum for gptq quantization testing RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum # Add PEFT RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/peft@main#egg=peft -# Add aqlm for quantization testing -RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2 +# needed in bnb and awq +RUN python3 -m pip install --no-cache-dir einops -# Add vptq for quantization testing -RUN pip install vptq +# Add bitsandbytes +RUN python3 -m pip install --no-cache-dir bitsandbytes -# Add spqr for quantization testing -# Commented for now as No matching distribution found we need to reach out to the authors -# RUN python3 -m pip install --no-cache-dir spqr_quant[gpu] +# # Add gptqmodel +# RUN python3 -m pip install --no-cache-dir gptqmodel # Add hqq for quantization testing RUN python3 -m pip install --no-cache-dir hqq @@ -63,25 +52,11 @@ RUN python3 -m pip install --no-cache-dir hqq RUN python3 -m pip install --no-cache-dir gguf # Add autoawq for quantization testing -# New release v0.2.8 RUN python3 -m pip install --no-cache-dir autoawq[kernels] # Add quanto for quantization testing RUN python3 -m pip install --no-cache-dir optimum-quanto -# Add eetq for quantization testing -RUN git clone https://github.com/NetEase-FuXi/EETQ.git && cd EETQ/ && git submodule update --init --recursive && pip install . 
- -# # Add flute-kernel and fast_hadamard_transform for quantization testing -# # Commented for now as they cause issues with the build -# # TODO: create a new workflow to test them -# RUN python3 -m pip install --no-cache-dir flute-kernel==0.4.1 -# RUN python3 -m pip install --no-cache-dir git+https://github.com/Dao-AILab/fast-hadamard-transform.git - -# Add fp-quant for quantization testing -# Requires py3.11 but our CI runs on 3.9 -# RUN python3 -m pip install --no-cache-dir "fp-quant>=0.1.6" - # Add compressed-tensors for quantization testing RUN python3 -m pip install --no-cache-dir compressed-tensors @@ -89,7 +64,10 @@ RUN python3 -m pip install --no-cache-dir compressed-tensors RUN python3 -m pip install --no-cache-dir amd-quark # Add AutoRound for quantization testing -RUN python3 -m pip install --no-cache-dir "auto-round>=0.5.0" +RUN python3 -m pip install --no-cache-dir auto-round + +# Add torchao for quantization testing +RUN python3 -m pip install --no-cache-dir torchao # Add transformers in editable mode RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch] @@ -103,3 +81,27 @@ RUN python3 -m pip uninstall -y flash-attn # When installing in editable mode, `transformers` is not recognized as a package. # this line must be added in order for python to be aware of transformers. RUN cd transformers && python3 setup.py develop + +# Add fp-quant for quantization testing +RUN python3 -m pip install --no-cache-dir "fp-quant>=0.2.0" + +# Low usage or incompatible lib, will enable later on + +# # Add aqlm for quantization testing +# RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2 + +# # Add vptq for quantization testing +# RUN pip install vptq + +# Add spqr for quantization testing +# Commented for now as No matching distribution found we need to reach out to the authors +# RUN python3 -m pip install --no-cache-dir spqr_quant[gpu] + +# # Add eetq for quantization testing +# RUN git clone https://github.com/NetEase-FuXi/EETQ.git && cd EETQ/ && git submodule update --init --recursive && pip install . + +# # Add flute-kernel and fast_hadamard_transform for quantization testing +# # Commented for now as they cause issues with the build +# # TODO: create a new workflow to test them +# RUN python3 -m pip install --no-cache-dir flute-kernel==0.4.1 +# RUN python3 -m pip install --no-cache-dir git+https://github.com/Dao-AILab/fast-hadamard-transform.git diff --git a/docs/TRANSLATING.md b/docs/TRANSLATING.md index 64dced450987..7a2da690945b 100644 --- a/docs/TRANSLATING.md +++ b/docs/TRANSLATING.md @@ -50,7 +50,7 @@ Begin translating the text! 1. Start with the `_toctree.yml` file that corresponds to your documentation chapter. This file is essential for rendering the table of contents on the website. - - If the `_toctree.yml` file doesn’t exist for your language, create one by copying the English version and removing unrelated sections. + - If the `_toctree.yml` file doesn't exist for your language, create one by copying the English version and removing unrelated sections. - Ensure it is placed in the `docs/source/LANG-ID/` directory. 
Here’s an example structure for the `_toctree.yml` file: diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index d7fa25e185eb..dab792a5f286 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -307,6 +307,8 @@ title: Glossary - local: philosophy title: Philosophy + - local: models_timeline + title: Models Timeline - local: notebooks title: Notebooks with examples - local: community @@ -411,6 +413,8 @@ title: Blenderbot Small - local: model_doc/bloom title: BLOOM + - local: model_doc/blt + title: BLT - local: model_doc/bort title: BORT - local: model_doc/byt5 @@ -441,6 +445,8 @@ title: DeBERTa - local: model_doc/deberta-v2 title: DeBERTa-v2 + - local: model_doc/deepseek_v2 + title: DeepSeek-V2 - local: model_doc/deepseek_v3 title: DeepSeek-V3 - local: model_doc/dialogpt @@ -763,12 +769,6 @@ title: D-FINE - local: model_doc/dab-detr title: DAB-DETR - - local: model_doc/deepseek_v2 - title: DeepSeek-V2 - - local: model_doc/deepseek_vl - title: DeepseekVL - - local: model_doc/deepseek_vl_hybrid - title: DeepseekVLHybrid - local: model_doc/deformable_detr title: Deformable DETR - local: model_doc/deit @@ -851,10 +851,16 @@ title: RT-DETR - local: model_doc/rt_detr_v2 title: RT-DETRv2 + - local: model_doc/sam2 + title: SAM2 - local: model_doc/segformer title: SegFormer - local: model_doc/seggpt title: SegGpt + - local: model_doc/sam + title: Segment Anything + - local: model_doc/sam_hq + title: Segment Anything High Quality - local: model_doc/superglue title: SuperGlue - local: model_doc/superpoint @@ -933,6 +939,8 @@ title: MusicGen - local: model_doc/musicgen_melody title: MusicGen Melody + - local: model_doc/parakeet + title: Parakeet - local: model_doc/pop2piano title: Pop2Piano - local: model_doc/seamless_m4t @@ -977,6 +985,8 @@ title: XLSR-Wav2Vec2 title: Audio models - sections: + - local: model_doc/sam2_video + title: SAM2 Video - local: model_doc/timesformer title: TimeSformer - local: model_doc/vjepa2 @@ -1021,10 +1031,18 @@ title: ColQwen2 - local: model_doc/data2vec title: Data2Vec + - local: model_doc/deepseek_vl + title: DeepseekVL + - local: model_doc/deepseek_vl_hybrid + title: DeepseekVLHybrid - local: model_doc/deplot title: DePlot - local: model_doc/donut title: Donut + - local: model_doc/edgetam + title: EdgeTAM + - local: model_doc/edgetam_video + title: EdgeTamVideo - local: model_doc/emu3 title: Emu3 - local: model_doc/evolla @@ -1077,6 +1095,8 @@ title: LayoutLMV3 - local: model_doc/layoutxlm title: LayoutXLM + - local: model_doc/lfm2_vl + title: LFM2-VL - local: model_doc/lilt title: LiLT - local: model_doc/llama4 @@ -1135,18 +1155,12 @@ title: Qwen2Audio - local: model_doc/qwen2_vl title: Qwen2VL + - local: model_doc/qwen3_omni_moe + title: Qwen3-Omni-MoE - local: model_doc/qwen3_vl title: Qwen3VL - local: model_doc/qwen3_vl_moe title: Qwen3VLMoe - - local: model_doc/sam2 - title: SAM2 - - local: model_doc/sam2_video - title: SAM2 Video - - local: model_doc/sam - title: Segment Anything - - local: model_doc/sam_hq - title: Segment Anything High Quality - local: model_doc/shieldgemma2 title: ShieldGemma2 - local: model_doc/siglip diff --git a/docs/source/en/accelerator_selection.md b/docs/source/en/accelerator_selection.md index 5d5bbc2675fa..3cd809cba6a2 100644 --- a/docs/source/en/accelerator_selection.md +++ b/docs/source/en/accelerator_selection.md @@ -69,7 +69,6 @@ CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ... Only GPUs 0 and 2 are "visible" to PyTorch and are mapped to `cuda:0` and `cuda:1` respectively. 
To reverse the order (use GPU 2 as `cuda:0` and GPU 0 as `cuda:1`): - ```bash CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ... ``` @@ -108,7 +107,6 @@ To reverse the order (use XPU 2 as `xpu:0` and XPU 0 as `xpu:1`): ZE_AFFINITY_MASK=2,0 torchrun trainer-program.py ... ``` - You can also control the order of Intel XPUs with: ```bash @@ -120,7 +118,5 @@ For more information about device enumeration and sorting on Intel XPU, please r - - > [!WARNING] > Environment variables can be exported instead of being added to the command line. This is not recommended because it can be confusing if you forget how the environment variable was set up and you end up using the wrong accelerators. Instead, it is common practice to set the environment variable for a specific training run on the same command line. diff --git a/docs/source/en/attention_interface.md b/docs/source/en/attention_interface.md index 407a47a7d353..621aa7409da0 100644 --- a/docs/source/en/attention_interface.md +++ b/docs/source/en/attention_interface.md @@ -193,4 +193,4 @@ def custom_attention_mask( It mostly works thanks to the `mask_function`, which is a `Callable` in the form of [torch's mask_mod functions](https://pytorch.org/blog/flexattention/), taking 4 indices as input and returning a boolean to indicate if this position should take part in the attention computation. -If you cannot use the `mask_function` to create your mask for some reason, you can try to work around it by doing something similar to our [torch export workaround](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/executorch.py). \ No newline at end of file +If you cannot use the `mask_function` to create your mask for some reason, you can try to work around it by doing something similar to our [torch export workaround](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/executorch.py). diff --git a/docs/source/en/auto_docstring.md b/docs/source/en/auto_docstring.md index 5fc4ed061ce1..e6c753419978 100644 --- a/docs/source/en/auto_docstring.md +++ b/docs/source/en/auto_docstring.md @@ -145,7 +145,6 @@ Arguments can also be passed directly to `@auto_docstring` for more control. Use The `Returns` and `Examples` parts of the docstring can also be manually specified. - ```python MODEL_COMMON_CUSTOM_ARGS = r""" common_arg_1 (`torch.Tensor`, *optional*, defaults to `default_value`): @@ -202,7 +201,6 @@ There are some rules for documenting different types of arguments and they're li If a standard argument behaves differently in your model, then you can override it locally in a `r""" """` block. This local definition has a higher priority. For example, the `labels` argument is often customized per model and typically requires overriding. - - New or custom arguments should be documented within an `r""" """` block after the signature if it is a function or in the `__init__` method's docstring if it is a class. ```py @@ -212,9 +210,9 @@ There are some rules for documenting different types of arguments and they're li This can span multiple lines. ``` - * Include `type` in backticks. - * Add *optional* if the argument is not required or has a default value. - * Add "defaults to X" if it has a default value. You don't need to add "defaults to `None`" if the default value is `None`. + * Include `type` in backticks. + * Add *optional* if the argument is not required or has a default value. + * Add "defaults to X" if it has a default value. 
You don't need to add "defaults to `None`" if the default value is `None`. These arguments can also be passed to `@auto_docstring` as a `custom_args` argument. It is used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file. diff --git a/docs/source/en/cache_explanation.md b/docs/source/en/cache_explanation.md index 0e192fd47f42..6d6718b8cab8 100644 --- a/docs/source/en/cache_explanation.md +++ b/docs/source/en/cache_explanation.md @@ -59,11 +59,9 @@ Refer to the table below to compare how caching improves efficiency. | without caching | with caching | |---|---| -| for each step, recompute all previous `K` and `V` | for each step, only compute current `K` and `V` +| for each step, recompute all previous `K` and `V` | for each step, only compute current `K` and `V` | attention cost per step is **quadratic** with sequence length | attention cost per step is **linear** with sequence length (memory grows linearly, but compute/token remains low) | - - ## Cache class A basic KV cache interface takes a key and value tensor for the current token and returns the updated `K` and `V` tensors. This is internally managed by a model's `forward` method. @@ -138,12 +136,11 @@ The cache position tracks where to insert new tokens in the attention cache. It Cache position is used internally for two purposes: -1. Selecting new tokens to process in the input sequence and ensuring only tokens that haven’t been cached yet are passed to the model's `forward`. +1. Selecting new tokens to process in the input sequence and ensuring only tokens that haven't been cached yet are passed to the model's `forward`. 2. Storing key/value pairs at the correct positions in the cache. This is especially important for fixed-size caches, that pre-allocates a specific cache length. The generation loop usually takes care of the cache position, but if you're writing a custom generation method, it is important that cache positions are accurate since they are used to write and read key/value states into fixed slots. - ```py import torch from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache, infer_device @@ -160,12 +157,12 @@ generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=10) ``` - ## Legacy cache format Before the [`Cache`] class, the cache used to be stored as a tuple of tuples of tensors. This format is dynamic because it grows as text is generated, similar to [`DynamicCache`]. The legacy format is essentially the same data structure but organized differently. + - It's a tuple of tuples, where each inner tuple contains the key and value tensors for a layer. - The tensors have the same shape `[batch_size, num_heads, seq_len, head_dim]`. - The format is less flexible and doesn't support features like quantization or offloading. diff --git a/docs/source/en/chat_extras.md b/docs/source/en/chat_extras.md index 53c431633c5e..f52825158272 100644 --- a/docs/source/en/chat_extras.md +++ b/docs/source/en/chat_extras.md @@ -16,7 +16,7 @@ rendered properly in your Markdown viewer. # Tool use -Chat models are commonly trained with support for "function-calling" or "tool-use". Tools are functions supplied by the user, which the model can choose to call as part of its response. For example, models could have access to a calculator tool to perform arithmetic without having to it internally. +Chat models are commonly trained with support for "function-calling" or "tool-use". 
Tools are functions supplied by the user, which the model can choose to call as part of its response. For example, models could have access to a calculator tool to perform arithmetic without having to perform the computation internally. This guide will demonstrate how to define tools, how to pass them to a chat model, and how to handle the model's output when it calls a tool. @@ -29,12 +29,11 @@ the arguments, argument types, and function docstring are parsed in order to gen Although passing Python functions is very convenient, the parser can only handle [Google-style](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) docstrings. Refer to the examples below for how to format a tool-ready function. - ```py def get_current_temperature(location: str, unit: str): """ Get the current temperature at a location. - + Args: location: The location to get the temperature for, in the format "City, Country" unit: The unit to return the temperature in. (choices: ["celsius", "fahrenheit"]) @@ -44,7 +43,7 @@ def get_current_temperature(location: str, unit: str): def get_current_wind_speed(location: str): """ Get the current wind speed in km/h at a given location. - + Args: location: The location to get the wind speed for, in the format "City, Country" """ @@ -103,7 +102,6 @@ Hold the call in the `tool_calls` key of an `assistant` message. This is the rec > [!WARNING] > Although `tool_calls` is similar to the OpenAI API, the OpenAI API uses a JSON string as its `tool_calls` format. This may cause errors or strange model behavior if used in Transformers, which expects a dict. - ```py tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}} messages.append({"role": "assistant", "tool_calls": [{"type": "function", "function": tool_call}]}) @@ -131,7 +129,6 @@ The temperature in Paris, France right now is 22°C.<|im_end|> > Although the key in the assistant message is called `tool_calls`, in most cases, models only emit a single tool call at a time. Some older models emit multiple tool calls at the same time, but this is a > significantly more complex process, as you need to handle multiple tool responses at once and disambiguate them, often using tool call IDs. Please refer to the model card to see exactly what format a model expects for tool calls. - ## JSON schemas Another way to define tools is by passing a [JSON schema](https://json-schema.org/learn/getting-started-step-by-step). 
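Before the JSON-schema route, here is a rough sketch of the plain-function route described above: a Google-style docstring-annotated function passed straight to `apply_chat_template` via `tools`. The checkpoint name is only an illustrative assumption (any chat model whose template supports tools should behave similarly), and the tool body returns a dummy value.

```py
from transformers import AutoTokenizer

# Illustrative checkpoint only; swap in any tool-capable chat model.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")

def get_current_temperature(location: str, unit: str):
    """
    Get the current temperature at a location.

    Args:
        location: The location to get the temperature for, in the format "City, Country"
        unit: The unit to return the temperature in. (choices: ["celsius", "fahrenheit"])
    """
    return 22.0  # dummy value; a real tool would query a weather service

messages = [
    {"role": "system", "content": "You are a bot that responds to weather queries."},
    {"role": "user", "content": "Hey, what's the temperature in Paris right now?"},
]

# The docstring and type hints are parsed into a JSON schema under the hood,
# so the chat template sees the same structure as a hand-written schema.
prompt = tokenizer.apply_chat_template(
    messages, tools=[get_current_temperature], add_generation_prompt=True, tokenize=False
)
print(prompt)
```

How the tool definition is rendered into the prompt is decided by the model's chat template, not by the function itself, so the printed output differs between checkpoints.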
@@ -147,7 +144,7 @@ from transformers.utils import get_json_schema def multiply(a: float, b: float): """ A function that multiplies two numbers - + Args: a: The first number to multiply b: The second number to multiply @@ -160,22 +157,22 @@ print(schema) ```json { - "type": "function", + "type": "function", "function": { - "name": "multiply", - "description": "A function that multiplies two numbers", + "name": "multiply", + "description": "A function that multiplies two numbers", "parameters": { - "type": "object", + "type": "object", "properties": { "a": { - "type": "number", + "type": "number", "description": "The first number to multiply" - }, + }, "b": { "type": "number", "description": "The second number to multiply" } - }, + }, "required": ["a", "b"] } } @@ -187,7 +184,7 @@ We won't go into the details of JSON schema itself here, since it's already [ver ```py # A simple function that takes no arguments current_time = { - "type": "function", + "type": "function", "function": { "name": "current_time", "description": "Get the current local time as a string.", @@ -203,18 +200,18 @@ multiply = { 'type': 'function', 'function': { 'name': 'multiply', - 'description': 'A function that multiplies two numbers', + 'description': 'A function that multiplies two numbers', 'parameters': { - 'type': 'object', + 'type': 'object', 'properties': { 'a': { 'type': 'number', 'description': 'The first number to multiply' - }, + }, 'b': { 'type': 'number', 'description': 'The second number to multiply' } - }, + }, 'required': ['a', 'b'] } } @@ -224,4 +221,4 @@ model_input = tokenizer.apply_chat_template( messages, tools = [current_time, multiply] ) -``` \ No newline at end of file +``` diff --git a/docs/source/en/chat_templating.md b/docs/source/en/chat_templating.md index 2f965657a420..1e83da188a03 100644 --- a/docs/source/en/chat_templating.md +++ b/docs/source/en/chat_templating.md @@ -16,13 +16,13 @@ rendered properly in your Markdown viewer. # Chat templates -The [chat basics](./conversations) guide covers how to store chat histories and generate text from chat models using [`TextGenerationPipeline`]. +The [chat basics](./conversations) guide covers how to store chat histories and generate text from chat models using [`TextGenerationPipeline`]. This guide is intended for more advanced users, and covers the underlying classes and methods, as well as the key concepts for understanding what's actually going on when you chat with a model. The critical insight needed to understand chat models is this: All causal LMs, whether chat-trained or not, continue a sequence of tokens. When causal LMs are trained, the training usually begins with "pre-training" on a huge corpus of text, which creates a "base" model. These base models are then often "fine-tuned" for chat, which means training them on data that is formatted as a sequence of messages. The chat is still just a sequence of tokens, though! The list of `role` and `content` dictionaries that you pass -to a chat model get converted to a token sequence, often with control tokens like `<|user|>` or `<|assistant|>` or `<|end_of_message|>`, which allow the model to see the chat structure. +to a chat model get converted to a token sequence, often with control tokens like `<|user|>` or `<|assistant|>` or `<|end_of_message|>`, which allow the model to see the chat structure. There are many possible chat formats, and different models may use different formats or control tokens, even if they were fine-tuned from the same base model! 
Don't panic, though - you don't need to memorize every possible chat format in order to use chat models. Chat models come with **chat templates**, which indicate how they expect chats to be formatted. @@ -43,6 +43,7 @@ chat = [ tokenizer.apply_chat_template(chat, tokenize=False) ``` + ```md [INST] Hello, how are you? [/INST]I'm doing great. How can I help you today? [INST] I'd like to show off how chat templating works! [/INST] ``` @@ -62,6 +63,7 @@ chat = [ tokenizer.apply_chat_template(chat, tokenize=False) ``` + ```md <|user|>\nHello, how are you?\n<|assistant|>\nI'm doing great. How can I help you today?\n<|user|>\nI'd like to show off how chat templating works!\n ``` @@ -75,9 +77,9 @@ Mistral-7B-Instruct uses `[INST]` and `[/INST]` tokens to indicate the start and The input to `apply_chat_template` should be structured as a list of dictionaries with `role` and `content` keys. The `role` key specifies the speaker, and the `content` key contains the message. The common roles are: - - `user` for messages from the user - - `assistant` for messages from the model - - `system` for directives on how the model should act (usually placed at the beginning of the chat) +- `user` for messages from the user +- `assistant` for messages from the model +- `system` for directives on how the model should act (usually placed at the beginning of the chat) [`apply_chat_template`] takes this list and returns a formatted sequence. Set `tokenize=True` if you want to tokenize the sequence. @@ -110,6 +112,7 @@ Pass the tokenized chat to [`~GenerationMixin.generate`] to generate a response. outputs = model.generate(tokenized_chat, max_new_tokens=128) print(tokenizer.decode(outputs[0])) ``` + ```md <|system|> You are a friendly chatbot who always responds in the style of a pirate @@ -121,13 +124,13 @@ Matey, I'm afraid I must inform ye that humans cannot eat helicopters. Helicopte > [!WARNING] > Some tokenizers add special `` and `` tokens. Chat templates should already include all the necessary special tokens, and adding additional special tokens is often incorrect or duplicated, hurting model performance. When you format text with `apply_chat_template(tokenize=False)`, make sure you set `add_special_tokens=False` if you tokenize later to avoid duplicating these tokens. -> This isn’t an issue if you use `apply_chat_template(tokenize=True)`, which means it's usually the safer option! +> This isn't an issue if you use `apply_chat_template(tokenize=True)`, which means it's usually the safer option! ### add_generation_prompt -You may have noticed the [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) argument in the above examples. +You may have noticed the [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) argument in the above examples. This argument adds tokens to the end of the chat that indicate the start of an `assistant` response. Remember: Beneath all the chat abstractions, chat models are still just language models that continue a sequence of tokens! -If you include tokens that tell it that it's now in an `assistant` response, it will correctly write a response, but if you don't include these tokens, the model may get confused and do something strange, like **continuing** the user's message instead of replying to it! 
+If you include tokens that tell it that it's now in an `assistant` response, it will correctly write a response, but if you don't include these tokens, the model may get confused and do something strange, like **continuing** the user's message instead of replying to it! Let's see an example to understand what `add_generation_prompt` is actually doing. First, let's format a chat without `add_generation_prompt`: @@ -135,6 +138,7 @@ Let's see an example to understand what `add_generation_prompt` is actually doin tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False) tokenized_chat ``` + ```md <|im_start|>user Hi there!<|im_end|> @@ -150,6 +154,7 @@ Now, let's format the same chat with `add_generation_prompt=True`: tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) tokenized_chat ``` + ```md <|im_start|>user Hi there!<|im_end|> @@ -163,7 +168,7 @@ Can I ask a question?<|im_end|> When `add_generation_prompt=True`, `<|im_start|>assistant` is added at the end to indicate the start of an `assistant` message. This lets the model know an `assistant` response is next. -Not all models require generation prompts, and some models, like [Llama](./model_doc/llama), don’t have any special tokens before the `assistant` response. In these cases, [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) has no effect. +Not all models require generation prompts, and some models, like [Llama](./model_doc/llama), don't have any special tokens before the `assistant` response. In these cases, [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) has no effect. ### continue_final_message @@ -182,14 +187,13 @@ model.generate(**formatted_chat) ``` > [!WARNING] -> You shouldn’t use [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) and [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) together. The former adds tokens that start a new message, while the latter removes end of sequence tokens. Using them together returns an error. - -[`TextGenerationPipeline`] sets [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) to `True` by default to start a new message. However, if the final message in the chat has the `assistant` role, it assumes the message is a prefill and switches to `continue_final_message=True`. This is because most models don’t support multiple consecutive assistant messages. To override this behavior, explicitly pass the [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) argument to the pipeline. 
+> You shouldn't use [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) and [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) together. The former adds tokens that start a new message, while the latter removes end of sequence tokens. Using them together returns an error. +[`TextGenerationPipeline`] sets [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) to `True` by default to start a new message. However, if the final message in the chat has the `assistant` role, it assumes the message is a prefill and switches to `continue_final_message=True`. This is because most models don't support multiple consecutive assistant messages. To override this behavior, explicitly pass the [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) argument to the pipeline. ## Model training -Training a model with a chat template is a good way to ensure the template matches the tokens the model was trained on. Apply the chat template as a preprocessing step to your dataset. Set `add_generation_prompt=False` because the additional tokens to prompt an assistant response aren’t helpful during training. +Training a model with a chat template is a good way to ensure the template matches the tokens the model was trained on. Apply the chat template as a preprocessing step to your dataset. Set `add_generation_prompt=False` because the additional tokens to prompt an assistant response aren't helpful during training. An example of preprocessing a dataset with a chat template is shown below. @@ -212,6 +216,7 @@ dataset = Dataset.from_dict({"chat": [chat1, chat2]}) dataset = dataset.map(lambda x: {"formatted_chat": tokenizer.apply_chat_template(x["chat"], tokenize=False, add_generation_prompt=False)}) print(dataset['formatted_chat'][0]) ``` + ```md <|user|> Which is bigger, the moon or the sun? diff --git a/docs/source/en/chat_templating_multimodal.md b/docs/source/en/chat_templating_multimodal.md index 79d01a96d9ad..d8cf3dfda3b7 100644 --- a/docs/source/en/chat_templating_multimodal.md +++ b/docs/source/en/chat_templating_multimodal.md @@ -18,8 +18,7 @@ rendered properly in your Markdown viewer. Multimodal chat models accept inputs like images, audio or video, in addition to text. The `content` key in a multimodal chat history is a list containing multiple items of different types. This is unlike text-only chat models whose `content` key is a single string. - -In the same way the [Tokenizer](./fast_tokenizer) class handles chat templates and tokenization for text-only models, +In the same way the [Tokenizer](./fast_tokenizer) class handles chat templates and tokenization for text-only models, the [Processor](./processors) class handles preprocessing, tokenization and chat templates for multimodal models. Their [`~ProcessorMixin.apply_chat_template`] methods are almost identical. This guide will show you how to chat with multimodal models with the high-level [`ImageTextToTextPipeline`] and at a lower level using the [`~ProcessorMixin.apply_chat_template`] and [`~GenerationMixin.generate`] methods. 
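To make the difference in message shape concrete, the sketch below contrasts a text-only chat with a multimodal one. It is only a schematic illustration: the image URL is a placeholder, and no model or processor is invoked.

```py
# Text-only chat: "content" is a plain string.
text_messages = [
    {"role": "user", "content": "What do you see in this image?"},
]

# Multimodal chat: "content" is a list of typed items, mixing media and text.
multimodal_messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://example.com/cats.png"},  # placeholder URL
            {"type": "text", "text": "What do you see in this image?"},
        ],
    },
]
```

Either structure is accepted by `apply_chat_template`; for the multimodal case, the processor additionally loads and preprocesses the referenced media before generation.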
@@ -46,7 +45,7 @@ messages = [ ] ``` -Create an [`ImageTextToTextPipeline`] and pass the chat to it. For large models, setting [device_map=“auto”](./models#big-model-inference) helps load the model quicker and automatically places it on the fastest device available. Setting the data type to [auto](./models#model-data-type) also helps save memory and improve speed. +Create an [`ImageTextToTextPipeline`] and pass the chat to it. For large models, setting [device_map="auto"](./models#big-model-inference) helps load the model quicker and automatically places it on the fastest device available. Setting the data type to [auto](./models#model-data-type) also helps save memory and improve speed. ```python import torch @@ -57,8 +56,7 @@ out = pipe(text=messages, max_new_tokens=128) print(out[0]['generated_text'][-1]['content']) ``` - -``` +```text Ahoy, me hearty! These be two feline friends, likely some tabby cats, taking a siesta on a cozy pink blanket. They're resting near remote controls, perhaps after watching some TV or just enjoying some quiet time together. Cats sure know how to find comfort and relaxation, don't they? ``` @@ -66,10 +64,9 @@ Aside from the gradual descent from pirate-speak into modern American English (i ## Using `apply_chat_template` -Like [text-only models](./chat_templating), use the [`~ProcessorMixin.apply_chat_template`] method to prepare the chat messages for multimodal models. +Like [text-only models](./chat_templating), use the [`~ProcessorMixin.apply_chat_template`] method to prepare the chat messages for multimodal models. This method handles the tokenization and formatting of the chat messages, including images and other media types. The resulting inputs are passed to the model for generation. - ```python from transformers import AutoProcessor, AutoModelForImageTextToText @@ -99,8 +96,7 @@ processed_chat = processor.apply_chat_template(messages, add_generation_prompt=T print(list(processed_chat.keys())) ``` - -``` +```text ['input_ids', 'attention_mask', 'pixel_values', 'image_grid_thw'] ``` @@ -113,14 +109,13 @@ print(processor.decode(out[0])) The decoded output contains the full conversation so far, including the user message and the placeholder tokens that contain the image information. You may need to trim the previous conversation from the output before displaying it to the user. - ## Video inputs Some vision models also support video inputs. The message format is very similar to the format for [image inputs](#image-inputs). - The content `"type"` should be `"video"` to indicate the content is a video. - For videos, it can be a link to the video (`"url"`) or it could be a file path (`"path"`). Videos loaded from a URL can only be decoded with [PyAV](https://pyav.basswood-io.com/docs/stable/) or [Decord](https://github.com/dmlc/decord). -- In addition to loading videos from a URL or file path, you can also pass decoded video data directly. This is useful if you’ve already preprocessed or decoded video frames elsewhere in memory (e.g., using OpenCV, decord, or torchvision). You don't need to save to files or store it in an URL. +- In addition to loading videos from a URL or file path, you can also pass decoded video data directly. This is useful if you've already preprocessed or decoded video frames elsewhere in memory (e.g., using OpenCV, decord, or torchvision). You don't need to save to files or store it in an URL. > [!WARNING] > Loading a video from `"url"` is only supported by the PyAV or Decord backends. 
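The decoded-frames case is covered by the example that follows; for the `"url"` and `"path"` cases listed above, a minimal message sketch might look like the following. The locations are placeholders, not real files.

```py
video_messages = [
    {
        "role": "user",
        "content": [
            # Either a remote video (decoded with PyAV or Decord)...
            {"type": "video", "url": "https://example.com/clip.mp4"},  # placeholder URL
            # ...or a local file path could be used instead:
            # {"type": "video", "path": "/data/clip.mp4"},
            {"type": "text", "text": "Describe what happens in this video."},
        ],
    },
]
```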
@@ -148,6 +143,7 @@ messages = [ ``` ### Example: Passing decoded video objects + ```python import numpy as np @@ -167,7 +163,9 @@ messages = [ }, ] ``` + You can also use existing (`"load_video()"`) function to load a video, edit the video in memory and pass it in the messages. + ```python # Make sure a video backend library (pyav, decord, or torchvision) is available. @@ -200,7 +198,6 @@ Pass `messages` to [`~ProcessorMixin.apply_chat_template`] to tokenize the input The `num_frames` parameter controls how many frames to uniformly sample from the video. Each checkpoint has a maximum frame count it was pretrained with and exceeding this count can significantly lower generation quality. It's important to choose a frame count that fits both the model capacity and your hardware resources. If `num_frames` isn't specified, the entire video is loaded without any frame sampling. - ```python processed_chat = processor.apply_chat_template( messages, @@ -265,4 +262,3 @@ print(processed_chat.keys()) - diff --git a/docs/source/en/chat_templating_writing.md b/docs/source/en/chat_templating_writing.md index a7da4b6597c8..8df0c5e671f3 100644 --- a/docs/source/en/chat_templating_writing.md +++ b/docs/source/en/chat_templating_writing.md @@ -18,7 +18,6 @@ rendered properly in your Markdown viewer. A chat template is a [Jinja](https://jinja.palletsprojects.com/en/stable/templates/) template stored in the tokenizer's [chat_template](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizer.chat_template) attribute. Jinja is a templating language that allows you to write Python-like code and syntax. - ```jinja {%- for message in messages %} {{- '<|' + message['role'] + |>\n' }} @@ -30,8 +29,8 @@ A chat template is a [Jinja](https://jinja.palletsprojects.com/en/stable/templat ``` If you stare at this for a while, you should realize that this is actually very like Python, albeit with some strange -`{%-` syntax. The template iterates over a list of messages, and for each message, it prints the role and content of -the message, followed by an end-of-sequence token. If `add_generation_prompt=True`, it adds +`{%-` syntax. The template iterates over a list of messages, and for each message, it prints the role and content of +the message, followed by an end-of-sequence token. If `add_generation_prompt=True`, it adds the starting header for an assistant message to the end of the conversation. Load the written template as a string and assign it to the tokenizer's `chat_template` attribute. Once set, the template is used whenever you call [`~PreTrainedTokenizerBase.apply_chat_template`]. It is also saved @@ -42,7 +41,7 @@ edit this file directly to change the template, which is often easier than manip The easiest way to start writing Jinja templates is to refer to existing templates. Use `print(tokenizer.chat_template)` on any chat model to see the template it's using. Try starting with simple models that don't call any tools or support RAG because tool-use models can have very complex templates. Finally, take a look at the [Jinja documentation](https://jinja.palletsprojects.com/en/stable/templates/#synopsis) for more details about formatting and syntax. -There are some specific tips and pitfalls you may encounter while writing chat templates specifically, though, and this section will cover some of them in more detail. 
+There are some specific tips and pitfalls you may encounter while writing chat templates specifically, though, and this section will cover some of them in more detail. ### Writing multimodal chat templates @@ -108,7 +107,6 @@ We strongly recommend using `-` to ensure only the intended content is printed. ### Special variables and callables - The only constants in a template are the `messages` variable and the `add_generation_prompt` boolean. However, you have access to **any other keyword arguments that are passed** to the [`~PreTrainedTokenizerBase.apply_chat_template`] method. @@ -133,7 +131,7 @@ Make the changes below to ensure compatibility across all Jinja implementations. ### Big templates -Newer models or models with features like [tool-calling](./chat_extras#tools) and [RAG](./chat_extras#retrieval-augmented-generation-rag) require larger templates that can be longer than 100 lines. It may be easier to write larger templates in a separate file. The line numbers in the separate file corresponds exactly to the line numbers in template parsing or execution errors, making it easier to debug any potential issues. +Newer models or models with features like [tool-calling](./chat_extras) and RAG require larger templates that can be longer than 100 lines. It may be easier to write larger templates in a separate file. The line numbers in the separate file corresponds exactly to the line numbers in template parsing or execution errors, making it easier to debug any potential issues. Write the template in a separate file and extract it to the chat template. @@ -166,22 +164,22 @@ The example below shows how a tool is defined in JSON schema format. ```json { - "type": "function", + "type": "function", "function": { - "name": "multiply", - "description": "A function that multiplies two numbers", + "name": "multiply", + "description": "A function that multiplies two numbers", "parameters": { - "type": "object", + "type": "object", "properties": { "a": { - "type": "number", + "type": "number", "description": "The first number to multiply" - }, + }, "b": { "type": "number", "description": "The second number to multiply" } - }, + }, "required": ["a", "b"] } } @@ -190,7 +188,7 @@ The example below shows how a tool is defined in JSON schema format. An example of handling tool definitions in a chat template is shown below. The specific tokens and layouts should be changed to match the ones the model was trained with. -``` +```jinja {%- if tools %} {%- for tool in tools %} {{- '' + tool['function']['name'] + '\n' }} @@ -228,7 +226,7 @@ Tool calls are generally passed in the `tool_calls` key of an `"assistant”` me A common pattern for handling tool calls is shown below. You can use this as a starting point, but make sure you template actually matches the format the model was trained with! -``` +```jinja {%- if message['role'] == 'assistant' and 'tool_calls' in message %} {%- for tool_call in message['tool_calls'] %} {{- '' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments']|tojson + '\n' }} @@ -251,7 +249,7 @@ Tool responses are message dicts with the `tool` role. They are much simpler tha Some templates may not even need the `name` key, in which case, you can write your template to only read the `content` key. 
-``` +```jinja {%- if message['role'] == 'tool' %} {{- "" + message['content'] + "" }} {%- endif %} diff --git a/docs/source/en/conversations.md b/docs/source/en/conversations.md index 0fed56c632d2..a36be2203a5f 100644 --- a/docs/source/en/conversations.md +++ b/docs/source/en/conversations.md @@ -48,7 +48,6 @@ transformers chat -h The chat is implemented on top of the [AutoClass](./model_doc/auto), using tooling from [text generation](./llm_tutorial) and [chat](./chat_templating). It uses the `transformers serve` CLI under the hood ([docs](./serving.md#serve-cli)). - ## TextGenerationPipeline [`TextGenerationPipeline`] is a high-level text generation class with a "chat mode". Chat mode is enabled when a conversational model is detected and the chat prompt is [properly formatted](./llm_tutorial#wrong-prompt-format). @@ -109,7 +108,7 @@ quantization_config = BitsAndBytesConfig(load_in_8bit=True) pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", model_kwargs={"quantization_config": quantization_config}) ``` -In general, model size and performance are directly correlated. Larger models are slower in addition to requiring more memory because each active parameter must be read from memory for every generated token. +In general, model size and performance are directly correlated. Larger models are slower in addition to requiring more memory because each active parameter must be read from memory for every generated token. This is a bottleneck for LLM text generation and the main options for improving generation speed are to either quantize a model or use hardware with higher memory bandwidth. Adding more compute power doesn't meaningfully help. You can also try techniques like [speculative decoding](./generation_strategies#speculative-decoding), where a smaller model generates candidate tokens that are verified by the larger model. If the candidate tokens are correct, the larger model can generate more than one token at a time. This significantly alleviates the bandwidth bottleneck and improves generation speed. diff --git a/docs/source/en/cursor.md b/docs/source/en/cursor.md index 18ebe803edfb..e56155a8e42c 100644 --- a/docs/source/en/cursor.md +++ b/docs/source/en/cursor.md @@ -21,9 +21,10 @@ where `port` is the port used by `transformers serve` (`8000` by default). On th You're now ready to set things up on the app side! In Cursor, while you can't set a new provider, you can change the endpoint for OpenAI requests in the model selection settings. First, navigate to "Settings" > "Cursor Settings", "Models" tab, and expand the "API Keys" collapsible. To set your `transformers serve` endpoint, follow this order: + 1. Unselect ALL models in the list above (e.g. `gpt4`, ...); 2. Add and select the model you want to use (e.g. `Qwen/Qwen3-4B`) -3. Add some random text to OpenAI API Key. This field won't be used, but it can’t be empty; +3. Add some random text to OpenAI API Key. This field won't be used, but it can't be empty; 4. Add the https address from `ngrok` to the "Override OpenAI Base URL" field, appending `/v1` to the address (i.e. `https://(...).ngrok-free.app/v1`); 5. Hit "Verify". @@ -38,5 +39,3 @@ You are now ready to use your local model in Cursor! For instance, if you toggle

- - diff --git a/docs/source/en/debugging.md b/docs/source/en/debugging.md index 09394d2229d1..bea40c282dee 100644 --- a/docs/source/en/debugging.md +++ b/docs/source/en/debugging.md @@ -35,7 +35,7 @@ pip install deepspeed PyTorch comes with its own CUDA toolkit, but to use DeepSpeed with PyTorch, you need to have an identical version of CUDA installed system-wide. For example, if you installed PyTorch with `cudatoolkit==10.2` in your Python environment, then you'll also need to have CUDA 10.2 installed everywhere. -The exact location can vary from system to system, but `usr/local/cuda-10.2` is the most common location on many Unix systems. When CUDA is correctly set up and added to your `PATH` environment variable, you can find the installation location with the following command. +The exact location can vary from system to system, but `/usr/local/cuda-10.2` is the most common location on many Unix systems. When CUDA is correctly set up and added to your `PATH` environment variable, you can find the installation location with the following command. ```bash which nvcc @@ -45,7 +45,7 @@ which nvcc You may also have more than one CUDA toolkit installed on your system. -```bash +```text /usr/local/cuda-10.2 /usr/local/cuda-11.0 ``` diff --git a/docs/source/en/deepspeed.md b/docs/source/en/deepspeed.md index 87ae0296e09c..642cc8a42d98 100644 --- a/docs/source/en/deepspeed.md +++ b/docs/source/en/deepspeed.md @@ -294,7 +294,7 @@ Consider running a [benchmark](https://github.com/microsoft/DeepSpeed/issues/998 The example ZeRO-3 and ZeRO-Infinity config below sets most of the parameter values to `auto`, but you can also manually set configure these values. -```yaml +```json { "fp16": { "enabled": "auto", @@ -383,7 +383,7 @@ Gradient checkpointing saves memory by only storing *some* of the intermediate a The batch size can be automatically configured or manually set. When you choose the `"auto"` option, [`Trainer`] sets `train_micro_batch_size_per_gpu` and `train_batch_size` to the value of `world_size * per_device_train_batch_size * gradient_accumulation_steps`. -```yaml +```json { "train_micro_batch_size_per_gpu": "auto", "train_batch_size": "auto" @@ -400,7 +400,7 @@ Reduce operations are lossy, for example, when gradients are averaged across mul Choose the communication data type by setting the `communication_data_type` parameter in the config file. For example, choosing fp32 adds a small amount of overhead but ensures the reduction operation is accumulated in fp32 and when it is ready, it's downcasted to whichever half-precision data type you're training in. -```yaml +```json { "communication_data_type": "fp32" } @@ -412,7 +412,7 @@ Gradient accumulation accumulates gradients over several mini-batches of data be Gradient accumulation can be automatically configured or manually set. When you choose the `"auto"` option, [`Trainer`] sets it to the value of `gradient_accumulation_steps`. -```yaml +```json { "gradient_accumulation_steps": "auto" } @@ -424,7 +424,7 @@ Gradient clipping is useful for preventing exploding gradients which can lead to Gradient clipping can be automatically configured or manually set. When you choose the `"auto"` option, [`Trainer`] sets it to the value of `max_grad_norm`. -```yaml +```json { "gradient_clipping": "auto" } @@ -439,7 +439,7 @@ Mixed precision accelerates training speed by performing some calculations in ha Train in fp32 if a model wasn't pretrained in mixed precision because it may cause underflow or overflow errors. 
Disable fp16, the default, in this case. -```yaml +```json { "fp16": { "enabled": false @@ -454,7 +454,7 @@ For Ampere GPUs and PyTorch 1.7+, the more efficient [tf32](https://pytorch.org/ To configure AMP-like fp16 mixed precision, set up the config as shown below with `"auto"` or your own values. [`Trainer`] automatically enables or disables fp16 based on the value of `fp16_backend`, and the rest of the config can be set by you. fp16 is enabled from the command line when the following arguments are passed: `--fp16`, `--fp16_backend amp` or `--fp16_full_eval`. -```yaml +```json { "fp16": { "enabled": "auto", @@ -471,7 +471,7 @@ For additional DeepSpeed fp16 training options, take a look at the [FP16 Trainin To configure Apex-like fp16 mixed precision, set up the config as shown below with `"auto"` or your own values. [`Trainer`] automatically configures `amp` based on the values of `fp16_backend` and `fp16_opt_level`. It can also be enabled from the command line when the following arguments are passed: `--fp16`, `--fp16_backend apex` or `--fp16_opt_level 01`. -```yaml +```json { "amp": { "enabled": "auto", @@ -486,11 +486,11 @@ To configure Apex-like fp16 mixed precision, set up the config as shown below wi > [!TIP] > bf16 requires DeepSpeed 0.6.0. -bf16 has the same dynamic range as fp32, and doesn’t require loss scaling unlike fp16. However, if you use [gradient accumulation](#gradient-accumulation) with bf16, gradients are accumulated in bf16 which may not be desirable because the lower precision can lead to lossy accumulation. +bf16 has the same dynamic range as fp32, and doesn't require loss scaling unlike fp16. However, if you use [gradient accumulation](#gradient-accumulation) with bf16, gradients are accumulated in bf16 which may not be desirable because the lower precision can lead to lossy accumulation. bf16 can be set up in the config file or enabled from the command line when the following arguments are passed: `--bf16` or `--bf16_full_eval`. -```yaml +```json { "bf16": { "enabled": "auto" @@ -514,7 +514,7 @@ DeepSpeed offers several [optimizers](https://www.deepspeed.ai/docs/config-json/ You can set the parameters to `"auto"` or manually input your own values. -```yaml +```json { "optimizer": { "type": "AdamW", @@ -530,7 +530,7 @@ You can set the parameters to `"auto"` or manually input your own values. Use an unsupported optimizer by adding the following to the top level configuration. -```yaml +```json { "zero_allow_untested_optimizer": true } @@ -538,7 +538,7 @@ Use an unsupported optimizer by adding the following to the top level configurat From DeepSpeed 0.8.3+, if you want to use offload, you'll also need to add the following to the top level configuration because offload works best with DeepSpeed's CPU Adam optimizer. -```yaml +```json { "zero_force_ds_cpu_optimizer": false } @@ -558,7 +558,7 @@ If you don't configure the scheduler in the config file, [`Trainer`] automatical You can set the parameters to `"auto"` or manually input your own values. -```yaml +```json { "scheduler": { "type": "WarmupDecayLR", @@ -581,7 +581,7 @@ You can set the parameters to `"auto"` or manually input your own values. Resume training with a Universal checkpoint by setting `load_universal` to `true` in the config file. -```yaml +```json { "checkpoint": { "load_universal": true @@ -640,7 +640,7 @@ deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \ A multi-node setup consists of multiple nodes, where each node has one of more GPUs running a workload. 
DeepSpeed expects a shared storage system, but if this is not the case, you need to adjust the config file to include a [checkpoint](https://www.deepspeed.ai/docs/config-json/#checkpoint-options) to allow loading without access to a shared filesystem. -```yaml +```json { "checkpoint": { "use_node_local_storage": true @@ -824,7 +824,7 @@ ZeRO-2 saves the model weights in fp16. To save the weights in fp16 for ZeRO-3, If you don't, [`Trainer`] won't save the weights in fp16 and won't create a `pytorch_model.bin` file. This is because DeepSpeed's state_dict contains a placeholder instead of the real weights, so you won't be able to load it. -```yaml +```json { "zero_optimization": { "stage": 3, @@ -986,7 +986,7 @@ NaN loss often occurs when a model is pretrained in bf16 and you try to use it w It is also possible that fp16 is causing overflow. For example, if your config file looks like the one below, you may see the following overflow errors in the logs. -```yaml +```json { "fp16": { "enabled": "auto", diff --git a/docs/source/en/fast_tokenizers.md b/docs/source/en/fast_tokenizers.md index 3e9db79cfc7f..7f3caaef3301 100644 --- a/docs/source/en/fast_tokenizers.md +++ b/docs/source/en/fast_tokenizers.md @@ -226,7 +226,7 @@ tokenizer = PreTrainedTokenizerFast.from_pretrained("config/save/dir") -A Transformers model expects the input to be a PyTorch or NumPy tensor. A tokenizers job is to preprocess text into those tensors. Specify the framework tensor type to return with the `return_tensors` parameter. +A Transformers model expects the input to be a PyTorch or NumPy tensor. A tokenizer's job is to preprocess text into those tensors. Specify the framework tensor type to return with the `return_tensors` parameter. ```py from transformers import AutoTokenizer diff --git a/docs/source/en/generation_strategies.md b/docs/source/en/generation_strategies.md index 63b70899af4d..d2d49e1f7028 100644 --- a/docs/source/en/generation_strategies.md +++ b/docs/source/en/generation_strategies.md @@ -229,6 +229,7 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True) ## Custom generation methods Custom generation methods enable specialized behavior such as: + - have the model continue thinking if it is uncertain; - roll back generation if the model gets stuck; - handle special tokens with custom logic; @@ -289,7 +290,7 @@ print(tokenizer.batch_decode(gen_out)[0]) If the custom method has pinned Python requirements that your environment doesn't meet, you'll get an exception about missing requirements. For instance, [transformers-community/custom_generate_bad_requirements](https://huggingface.co/transformers-community/custom_generate_bad_requirements) has an impossible set of requirements defined in its `custom_generate/requirements.txt` file, and you'll see the error message below if you try to run it. -``` +```text ImportError: Missing requirements in your local environment for `transformers-community/custom_generate_bad_requirements`: foo (installed: None) bar==0.0.0 (installed: None) @@ -301,6 +302,7 @@ Updating your Python requirements accordingly will remove this error message. ### Creating a custom generation method To create a new generation method, you need to create a new [**Model**](https://huggingface.co/new) repository and push a few files into it. + 1. The model you've designed your generation method with. 2. `custom_generate/generate.py`, which contains all the logic for your custom generation method. 3. 
`custom_generate/requirements.txt`, used to optionally add new Python requirements and/or lock specific versions to correctly use your method. @@ -308,7 +310,7 @@ To create a new generation method, you need to create a new [**Model**](https:// After you've added all required files, your repository should look like this -``` +```text your_repo/ ├── README.md # include the 'custom_generate' tag ├── config.json @@ -377,6 +379,7 @@ def generate(model, input_ids, generation_config=None, left_padding=None, **kwar ``` Follow the recommended practices below to ensure your custom generation method works as expected. + - Feel free to reuse the logic for validation and input preparation in the original [`~GenerationMixin.generate`]. - Pin the `transformers` version in the requirements if you use any private method/attribute in `model`. - Consider adding model validation, input validation, or even a separate test file to help users sanity-check your code in their environment. @@ -389,7 +392,6 @@ from .utils import some_function Only relative imports from the same-level `custom_generate` folder are supported. Parent/sibling folder imports are not valid. The `custom_generate` argument also works locally with any directory that contains a `custom_generate` structure. This is the recommended workflow for developing your custom generation method. - #### requirements.txt You can optionally specify additional Python requirements in a `requirements.txt` file inside the `custom_generate` folder. These are checked at runtime and an exception will be thrown if they're missing, nudging users to update their environment accordingly. @@ -400,7 +402,7 @@ The root level `README.md` in the model repository usually describes the model t For discoverability, we highly recommend you to add the `custom_generate` tag to your repository. To do so, the top of your `README.md` file should look like the example below. After you push the file, you should see the tag in your repository! -``` +```text --- library_name: transformers tags: @@ -411,13 +413,14 @@ tags: ``` Recommended practices: + - Document input and output differences in [`~GenerationMixin.generate`]. - Add self-contained examples to enable quick experimentation. - Describe soft-requirements such as if the method only works well with a certain family of models. -### Reusing `generate`’s input preparation +### Reusing `generate`'s input preparation -If you're adding a new decoding loop, you might want to preserve the input preparation present in `generate` (batch expansion, attention masks, logits processors, stopping criteria, etc.). You can also pass a **callable** to `custom_generate` to reuse [`~GenerationMixin.generate`]’s full preparation pipeline while overriding only the decoding loop. +If you're adding a new decoding loop, you might want to preserve the input preparation present in `generate` (batch expansion, attention masks, logits processors, stopping criteria, etc.). You can also pass a **callable** to `custom_generate` to reuse [`~GenerationMixin.generate`]'s full preparation pipeline while overriding only the decoding loop. ```py def custom_loop(model, input_ids, attention_mask, logits_processor, stopping_criteria, generation_config, **model_kwargs): @@ -438,11 +441,12 @@ output = model.generate( ``` > [!TIP] -> If you publish a `custom_generate` repository, your `generate` implementation can itself define a callable and pass it to `model.generate()`. 
This lets you customize the decoding loop while still benefiting from Transformers’ built-in input preparation logic. +> If you publish a `custom_generate` repository, your `generate` implementation can itself define a callable and pass it to `model.generate()`. This lets you customize the decoding loop while still benefiting from Transformers' built-in input preparation logic. ### Finding custom generation methods You can find all custom generation methods by [searching for their custom tag.](https://huggingface.co/models?other=custom_generate), `custom_generate`. In addition to the tag, we curate two collections of `custom_generate` methods: + - [Custom generation methods - Community](https://huggingface.co/collections/transformers-community/custom-generation-methods-community-6888fb1da0efbc592d3a8ab6) -- a collection of powerful methods contributed by the community; - [Custom generation methods - Tutorials](https://huggingface.co/collections/transformers-community/custom-generation-methods-tutorials-6823589657a94940ea02cfec) -- a collection of reference implementations for methods that previously were part of `transformers`, as well as tutorials for `custom_generate`. diff --git a/docs/source/en/glossary.md b/docs/source/en/glossary.md index 9e57c3fdc9f8..1c8d8ebc2146 100644 --- a/docs/source/en/glossary.md +++ b/docs/source/en/glossary.md @@ -185,9 +185,9 @@ See the [Fine-tune a pretrained model](https://huggingface.co/docs/transformers/ The model head refers to the last layer of a neural network that accepts the raw hidden states and projects them onto a different dimension. There is a different model head for each task. For example: - * [`GPT2ForSequenceClassification`] is a sequence classification head - a linear layer - on top of the base [`GPT2Model`]. - * [`ViTForImageClassification`] is an image classification head - a linear layer on top of the final hidden state of the `CLS` token - on top of the base [`ViTModel`]. - * [`Wav2Vec2ForCTC`] is a language modeling head with [CTC](#connectionist-temporal-classification-ctc) on top of the base [`Wav2Vec2Model`]. +* [`GPT2ForSequenceClassification`] is a sequence classification head - a linear layer - on top of the base [`GPT2Model`]. +* [`ViTForImageClassification`] is an image classification head - a linear layer on top of the final hidden state of the `CLS` token - on top of the base [`ViTModel`]. +* [`Wav2Vec2ForCTC`] is a language modeling head with [CTC](#connectionist-temporal-classification-ctc) on top of the base [`Wav2Vec2Model`]. ## I diff --git a/docs/source/en/how_to_hack_models.md b/docs/source/en/how_to_hack_models.md index 0a3c38a3e14f..d5ce5bde7901 100644 --- a/docs/source/en/how_to_hack_models.md +++ b/docs/source/en/how_to_hack_models.md @@ -149,4 +149,4 @@ Call [print_trainable_parameters](https://huggingface.co/docs/peft/package_refer ```py model.print_trainable_parameters() "trainable params: 589,824 || all params: 94,274,096 || trainable%: 0.6256" -``` \ No newline at end of file +``` diff --git a/docs/source/en/index.md b/docs/source/en/index.md index ab0677b5a54e..5d7faa886618 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -19,7 +19,6 @@ rendered properly in your Markdown viewer. - Transformers acts as the model-definition framework for state-of-the-art machine learning models in text, computer vision, audio, video, and multimodal model, for both inference and training. 
@@ -35,6 +34,10 @@ There are over 1M+ Transformers [model checkpoints](https://huggingface.co/model Explore the [Hub](https://huggingface.com/) today to find a model and use Transformers to help you get started right away. +Explore the [Models Timeline](./models_timeline) to discover the latest text, vision, audio and multimodal model architectures in Transformers. + + + ## Features Transformers provides everything you need for inference or training with state-of-the-art pretrained models. Some of the main features include: @@ -61,4 +64,4 @@ Transformers is designed for developers and machine learning engineers and resea ## Learn -If you're new to Transformers or want to learn more about transformer models, we recommend starting with the [LLM course](https://huggingface.co/learn/llm-course/chapter1/1?fw=pt). This comprehensive course covers everything from the fundamentals of how transformer models work to practical applications across various tasks. You'll learn the complete workflow, from curating high-quality datasets to fine-tuning large language models and implementing reasoning capabilities. The course contains both theoretical and hands-on exercises to build a solid foundational knowledge of transformer models as you learn. \ No newline at end of file +If you're new to Transformers or want to learn more about transformer models, we recommend starting with the [LLM course](https://huggingface.co/learn/llm-course/chapter1/1?fw=pt). This comprehensive course covers everything from the fundamentals of how transformer models work to practical applications across various tasks. You'll learn the complete workflow, from curating high-quality datasets to fine-tuning large language models and implementing reasoning capabilities. The course contains both theoretical and hands-on exercises to build a solid foundational knowledge of transformer models as you learn. diff --git a/docs/source/en/internal/file_utils.md b/docs/source/en/internal/file_utils.md index 31fbc5b88110..63db5756a622 100644 --- a/docs/source/en/internal/file_utils.md +++ b/docs/source/en/internal/file_utils.md @@ -20,7 +20,6 @@ This page lists all of Transformers general utility functions that are found in Most of those are only useful if you are studying the general code in the library. - ## Enums and namedtuples [[autodoc]] utils.ExplicitEnum diff --git a/docs/source/en/internal/generation_utils.md b/docs/source/en/internal/generation_utils.md index d47eba82d8cc..87b0111ff053 100644 --- a/docs/source/en/internal/generation_utils.md +++ b/docs/source/en/internal/generation_utils.md @@ -65,7 +65,6 @@ values. Here, for instance, it has two keys that are `sequences` and `scores`. We document here all output types. - [[autodoc]] generation.GenerateDecoderOnlyOutput [[autodoc]] generation.GenerateEncoderDecoderOutput @@ -74,13 +73,11 @@ We document here all output types. [[autodoc]] generation.GenerateBeamEncoderDecoderOutput - ## LogitsProcessor A [`LogitsProcessor`] can be used to modify the prediction scores of a language model head for generation. - [[autodoc]] AlternatingCodebooksLogitsProcessor - __call__ @@ -174,8 +171,6 @@ generation. [[autodoc]] WatermarkLogitsProcessor - __call__ - - ## StoppingCriteria A [`StoppingCriteria`] can be used to change when to stop generation (other than EOS token). Please note that this is exclusively available to our PyTorch implementations. 
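To make the `StoppingCriteria` entry above easier to digest, here is a minimal sketch of a custom criterion wired into `generate`. It is an illustration only: the checkpoint, the class name, and the stop condition are placeholders, and the exact return convention should be double-checked against the `StoppingCriteria` reference documented here.

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList


class StopOnSubstring(StoppingCriteria):
    """Illustrative criterion: stop a sequence once its decoded text contains a given substring."""

    def __init__(self, tokenizer, stop_string):
        self.tokenizer = tokenizer
        self.stop_string = stop_string

    def __call__(self, input_ids, scores, **kwargs):
        # Return one boolean per batch element, as expected by StoppingCriteriaList.
        done = [self.stop_string in self.tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]
        return torch.tensor(done, dtype=torch.bool, device=input_ids.device)


tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tokenizer("The three primary colors are", return_tensors="pt")

outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    stopping_criteria=StoppingCriteriaList([StopOnSubstring(tokenizer, "blue")]),
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

Recent releases also ship ready-made criteria (for example max-length and stop-string checks), so a hand-rolled class like this is only needed for bespoke stopping logic.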
@@ -300,7 +295,6 @@ A [`Constraint`] can be used to force the generation to include specific tokens - to_legacy_cache - from_legacy_cache - ## Watermark Utils [[autodoc]] WatermarkingConfig diff --git a/docs/source/en/internal/import_utils.md b/docs/source/en/internal/import_utils.md index 0d76c2bbe33a..4a9915378a1f 100644 --- a/docs/source/en/internal/import_utils.md +++ b/docs/source/en/internal/import_utils.md @@ -22,8 +22,8 @@ worked around. We don't want for all users of `transformers` to have to install we therefore mark those as soft dependencies rather than hard dependencies. The transformers toolkit is not made to error-out on import of a model that has a specific dependency; instead, an -object for which you are lacking a dependency will error-out when calling any method on it. As an example, if -`torchvision` isn't installed, the fast image processors will not be available. +object for which you are lacking a dependency will error-out when calling any method on it. As an example, if +`torchvision` isn't installed, the fast image processors will not be available. This object is still importable: @@ -60,7 +60,7 @@ PyTorch dependency **Tokenizers**: All files starting with `tokenization_` and ending with `_fast` have an automatic `tokenizers` dependency -**Vision**: All files starting with `image_processing_` have an automatic dependency to the `vision` dependency group; +**Vision**: All files starting with `image_processing_` have an automatic dependency to the `vision` dependency group; at the time of writing, this only contains the `pillow` dependency. **Vision + Torch + Torchvision**: All files starting with `image_processing_` and ending with `_fast` have an automatic @@ -71,7 +71,7 @@ All of these automatic dependencies are added on top of the explicit dependencie ### Explicit Object Dependencies We add a method called `requires` that is used to explicitly specify the dependencies of a given object. As an -example, the `Trainer` class has two hard dependencies: `torch` and `accelerate`. Here is how we specify these +example, the `Trainer` class has two hard dependencies: `torch` and `accelerate`. Here is how we specify these required dependencies: ```python diff --git a/docs/source/en/internal/model_debugging_utils.md b/docs/source/en/internal/model_debugging_utils.md index 262113575f42..553a5ce56845 100644 --- a/docs/source/en/internal/model_debugging_utils.md +++ b/docs/source/en/internal/model_debugging_utils.md @@ -21,10 +21,8 @@ provides for it. Most of those are only useful if you are adding new models in the library. - ## Model addition debuggers - ### Model addition debugger - context manager for model adders This context manager is a power user tool intended for model adders. It tracks all forward calls within a model forward @@ -72,7 +70,6 @@ with model_addition_debugger_context( ``` - ### Reading results The debugger generates two files from the forward call, both with the same base name, but ending either with @@ -221,9 +218,9 @@ path reference to the associated `.safetensors` file. Each tensor is written to the state dictionary. File names are constructed using the `module_path` as a prefix with a few possible postfixes that are built recursively. -* Module inputs are denoted with the `_inputs` and outputs by `_outputs`. -* `list` and `tuple` instances, such as `args` or function return values, will be postfixed with `_{index}`. -* `dict` instances will be postfixed with `_{key}`. +* Module inputs are denoted with the `_inputs` and outputs by `_outputs`. 
+* `list` and `tuple` instances, such as `args` or function return values, will be postfixed with `_{index}`. +* `dict` instances will be postfixed with `_{key}`. ### Comparing between implementations @@ -231,10 +228,8 @@ Once the forward passes of two models have been traced by the debugger, one can below: we can see slight differences between these two implementations' key projection layer. Inputs are mostly identical, but not quite. Looking through the file differences makes it easier to pinpoint which layer is wrong. - ![download-icon](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/files_difference_debugging.png) - ### Limitations and scope This feature will only work for torch-based models, and would require more work and case-by-case approach for say @@ -254,13 +249,14 @@ layers. This small util is a power user tool intended for model adders and maintainers. It lists all test methods existing in `test_modeling_common.py`, inherited by all model tester classes, and scans the repository to measure -how many tests are being skipped and for which models. +how many tests are being skipped and for which models. ### Rationale When porting models to transformers, tests fail as they should, and sometimes `test_modeling_common` feels irreconcilable with the peculiarities of our brand new model. But how can we be sure we're not breaking everything by adding a seemingly innocent skip? This utility: + - scans all test_modeling_common methods - looks for times where a method is skipped - returns a summary json you can load as a DataFrame/inspect @@ -269,8 +265,7 @@ This utility: ![download-icon](https://huggingface.co/datasets/huggingface/documentation-images/resolve/f7f671f69b88ce4967e19179172c248958d35742/transformers/tests_skipped_visualisation.png) - -### Usage +### Usage You can run the skipped test analyzer in two ways: @@ -286,7 +281,7 @@ python utils/scan_skipped_tests.py --output_dir path/to/output **Example output:** -``` +```text 🔬 Parsing 331 model test files once each... 📝 Aggregating 224 tests... (224/224) test_update_candidate_strategy_with_matches_1es_3d_is_nonecodet_schedule_fa_kwargs diff --git a/docs/source/en/internal/pipelines_utils.md b/docs/source/en/internal/pipelines_utils.md index 6ea6de9a61b8..23856e5639c3 100644 --- a/docs/source/en/internal/pipelines_utils.md +++ b/docs/source/en/internal/pipelines_utils.md @@ -20,7 +20,6 @@ This page lists all the utility functions the library provides for pipelines. Most of those are only useful if you are studying the code of the models in the library. - ## Argument handling [[autodoc]] pipelines.ArgumentHandler diff --git a/docs/source/en/jan.md b/docs/source/en/jan.md index ff580496c81b..95309f46cd04 100644 --- a/docs/source/en/jan.md +++ b/docs/source/en/jan.md @@ -25,7 +25,7 @@ You are now ready to chat! To conclude this example, let's look into a more advanced use-case. If you have a beefy machine to serve models with, but prefer using Jan on a different device, you need to add port forwarding. 
If you have `ssh` access from your Jan machine into your server, this can be accomplished by typing the following to your Jan machine's terminal -``` +```bash ssh -N -f -L 8000:localhost:8000 your_server_account@your_server_IP -p port_to_ssh_into_your_server ``` diff --git a/docs/source/en/kv_cache.md b/docs/source/en/kv_cache.md index f0a781cba4fc..f318c73d28a9 100644 --- a/docs/source/en/kv_cache.md +++ b/docs/source/en/kv_cache.md @@ -67,7 +67,7 @@ out = model.generate(**inputs, do_sample=False, max_new_tokens=20, past_key_valu ## Fixed-size cache -The default [`DynamicCache`] prevents you from taking advantage of most just-in-time (JIT) optimizations because the cache size isn't fixed. JIT optimizations enable you to maximize latency at the expense of memory usage. All of the following cache types are compatible with JIT optimizations like [torch.compile](./llm_optims#static-kv-cache-and-torchcompile) to accelerate generation. +The default [`DynamicCache`] prevents you from taking advantage of most just-in-time (JIT) optimizations because the cache size isn't fixed. JIT optimizations enable you to minimize latency at the expense of memory usage. All of the following cache types are compatible with JIT optimizations like [torch.compile](./llm_optims#static-kv-cache-and-torchcompile) to accelerate generation. A fixed-size cache ([`StaticCache`]) pre-allocates a specific maximum cache size for the kv pairs. You can generate up to the maximum cache size without needing to modify it. However, having a fixed (usually large) size for the key/value states means that while generating, a lot of tokens will actually be masked as they should not take part in the attention. So this trick allows you to easily `compile` the decoding stage, but it incurs a waste of tokens in the attention computation. As with all things, it's a trade-off which should be very good if you generate with several sequences of more or less the same length, but may be sub-optimal if you have, for example, one very large sequence and then only short sequences (as the fixed cache size would be large, a lot would be wasted on the short sequences). Make sure you understand the impact if you use it! @@ -213,7 +213,7 @@ A cache can also work in iterative generation settings where there is back-and-f For iterative generation with a cache, start by initializing an empty cache class and then you can feed in your new prompts. Keep track of dialogue history with a [chat template](./chat_templating). -The following example demonstrates [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf). If you’re using a different chat-style model, [`~PreTrainedTokenizer.apply_chat_template`] may process messages differently. It might cut out important tokens depending on how the Jinja template is written. +The following example demonstrates [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf). If you're using a different chat-style model, [`~PreTrainedTokenizer.apply_chat_template`] may process messages differently. It might cut out important tokens depending on how the Jinja template is written. For example, some models use special ` ... ` tokens during reasoning. These could get lost during re-encoding, causing indexing issues. You might need to manually remove or adjust extra tokens from the completions to keep things stable.
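Since the hunk above only touches the surrounding prose, here is a minimal sketch of the iterative pattern it describes: create an empty [`DynamicCache`] once, re-apply the chat template to the growing message history, and pass the same cache back into `generate` on every turn. The checkpoint and prompts are placeholders; the full Llama-2 example lives on the documentation page itself.

```py
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache

model_id = "HuggingFaceTB/SmolLM2-135M-Instruct"  # placeholder; any chat-style checkpoint works
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

past_key_values = DynamicCache()  # empty cache, reused across turns
messages = []

for user_turn in ["Hello, what is a KV cache?", "And why does it speed up decoding?"]:
    messages.append({"role": "user", "content": user_turn})
    inputs = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt", return_dict=True
    )
    input_length = inputs["input_ids"].shape[1]
    outputs = model.generate(**inputs, max_new_tokens=64, past_key_values=past_key_values)
    reply = tokenizer.decode(outputs[0, input_length:], skip_special_tokens=True)
    messages.append({"role": "assistant", "content": reply})
    print(reply)
```

As the surrounding text warns, templates that insert or strip special tokens between turns can desynchronize the cache, so completions may need manual cleanup before being appended to `messages`.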
diff --git a/docs/source/en/llm_tutorial.md b/docs/source/en/llm_tutorial.md index a08f57426b6a..0499335c2ace 100644 --- a/docs/source/en/llm_tutorial.md +++ b/docs/source/en/llm_tutorial.md @@ -35,6 +35,7 @@ Before you begin, it's helpful to install [bitsandbytes](https://hf.co/docs/bits ```bash !pip install -U transformers bitsandbytes ``` + Bitsandbytes supports multiple backends in addition to CUDA-based GPUs. Refer to the multi-backend installation [guide](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend) to learn more. Load a LLM with [`~PreTrainedModel.from_pretrained`] and add the following two parameters to reduce the memory requirements. @@ -92,6 +93,7 @@ model.generate(**inputs, num_beams=4, do_sample=True) ``` [`~GenerationMixin.generate`] can also be extended with external libraries or custom code: + 1. the `logits_processor` parameter accepts custom [`LogitsProcessor`] instances for manipulating the next token probability distribution; 2. the `stopping_criteria` parameters supports custom [`StoppingCriteria`] to stop text generation; 3. other custom generation methods can be loaded through the `custom_generate` flag ([docs](generation_strategies.md/#custom-decoding-methods)). @@ -154,7 +156,6 @@ print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) | `repetition_penalty` | `float` | Set it to `>1.0` if you're seeing the model repeat itself often. Larger values apply a larger penalty. | | `eos_token_id` | `list[int]` | The token(s) that will cause generation to stop. The default value is usually good, but you can specify a different token. | - ## Pitfalls The section below covers some common issues you may encounter during text generation and how to solve them. diff --git a/docs/source/en/llm_tutorial_optimization.md b/docs/source/en/llm_tutorial_optimization.md index 63d9308a84f4..d3095055472c 100644 --- a/docs/source/en/llm_tutorial_optimization.md +++ b/docs/source/en/llm_tutorial_optimization.md @@ -66,6 +66,7 @@ If you have access to an 8 x 80GB A100 node, you could load BLOOM as follows ```bash !pip install transformers accelerate bitsandbytes optimum ``` + ```python from transformers import AutoModelForCausalLM @@ -98,7 +99,8 @@ result ``` **Output**: -``` + +```text Here is a Python function that transforms bytes to Giga bytes:\n\n```python\ndef bytes_to_giga_bytes(bytes):\n return bytes / 1024 / 1024 / 1024\n```\n\nThis function takes a single ``` @@ -116,7 +118,8 @@ bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) ``` **Output**: -```bash + +```text 29.0260648727417 ``` @@ -127,7 +130,6 @@ Note that if we had tried to run the model in full float32 precision, a whopping If you are unsure in which format the model weights are stored on the Hub, you can always look into the checkpoint's config under `"dtype"`, *e.g.* [here](https://huggingface.co/meta-llama/Llama-2-7b-hf/blob/6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9/config.json#L21). It is recommended to set the model to the same precision type as written in the config when loading with `from_pretrained(..., dtype=...)` except when the original type is float32 in which case one can use both `float16` or `bfloat16` for inference. - Let's define a `flush(...)` function to free all allocated memory so that we can accurately measure the peak allocated GPU memory. ```python @@ -148,6 +150,7 @@ Let's call it now for the next experiment. 
```python flush() ``` + From the Accelerate library, you can also use a device-agnostic utility method called [release_memory](https://github.com/huggingface/accelerate/blob/29be4788629b772a3b722076e433b5b3b5c85da3/src/accelerate/utils/memory.py#L63), which takes various hardware backends like XPU, MLU, NPU, MPS, and more into account. ```python @@ -204,7 +207,8 @@ result ``` **Output**: -``` + +```text Here is a Python function that transforms bytes to Giga bytes:\n\n```python\ndef bytes_to_giga_bytes(bytes):\n return bytes / 1024 / 1024 / 1024\n```\n\nThis function takes a single ``` @@ -215,15 +219,16 @@ bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) ``` **Output**: -``` + +```text 15.219234466552734 ``` Significantly less! We're down to just a bit over 15 GBs and could therefore run this model on consumer GPUs like the 4090. We're seeing a very nice gain in memory efficiency and more or less no degradation to the model's output. However, we can also notice a slight slow-down during inference. - We delete the models and flush the memory again. + ```python del model del pipe @@ -245,7 +250,8 @@ result ``` **Output**: -``` + +```text Here is a Python function that transforms bytes to Giga bytes:\n\n```\ndef bytes_to_gigabytes(bytes):\n return bytes / 1024 / 1024 / 1024\n```\n\nThis function takes a single argument ``` @@ -256,7 +262,8 @@ bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) ``` **Output**: -``` + +```text 9.543574333190918 ``` @@ -270,6 +277,7 @@ Also note that inference here was again a bit slower compared to 8-bit quantizat del model del pipe ``` + ```python flush() ``` @@ -384,6 +392,7 @@ def alternating(list1, list2): ----- """ ``` + For demonstration purposes, we duplicate the system prompt by ten so that the input length is long enough to observe Flash Attention's memory savings. We append the original text prompt `"Question: Please write a function in Python that transforms bytes to Giga bytes.\n\nAnswer: Here"` @@ -413,7 +422,8 @@ result ``` **Output**: -``` + +```text Generated in 10.96854019165039 seconds. Sure. Here is a function that does that.\n\ndef bytes_to_giga(bytes):\n return bytes / 1024 / 1024 / 1024\n\nAnswer: Sure. Here is a function that does that.\n\ndef ```` @@ -429,7 +439,8 @@ bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) ``` **Output**: -```bash + +```text 37.668193340301514 ``` @@ -460,7 +471,8 @@ result ``` **Output**: -``` + +```text Generated in 3.0211617946624756 seconds. Sure. Here is a function that does that.\n\ndef bytes_to_giga(bytes):\n return bytes / 1024 / 1024 / 1024\n\nAnswer: Sure. Here is a function that does that.\n\ndef ``` @@ -474,7 +486,8 @@ bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) ``` **Output**: -``` + +```text 32.617331981658936 ``` @@ -604,7 +617,8 @@ generated_text ``` **Output**: -``` + +```text shape of input_ids torch.Size([1, 21]) shape of input_ids torch.Size([1, 22]) shape of input_ids torch.Size([1, 23]) @@ -641,7 +655,8 @@ generated_text ``` **Output**: -``` + +```text shape of input_ids torch.Size([1, 1]) length of key-value cache 20 shape of input_ids torch.Size([1, 1]) @@ -675,7 +690,7 @@ Note that, despite our advice to use key-value caches, your LLM output may be sl The key-value cache is especially useful for applications such as chat where multiple passes of auto-regressive decoding are required. Let's look at an example. -``` +```text User: How many people live in France? Assistant: Roughly 75 million people live in France User: And how many are in Germany? 
@@ -712,7 +727,8 @@ tokenizer.batch_decode(generation_output.sequences)[0][len(prompt):] ``` **Output**: -``` + +```text is a modified version of the function that returns Mega bytes instead. def bytes_to_megabytes(bytes): @@ -733,7 +749,8 @@ config = model.config ``` **Output**: -``` + +```text 7864320000 ``` @@ -773,7 +790,6 @@ The most notable application of GQA is [Llama-v2](https://huggingface.co/meta-ll > As a conclusion, it is strongly recommended to make use of either GQA or MQA if the LLM is deployed with auto-regressive decoding and is required to handle large input sequences as is the case for example for chat. - ## Conclusion The research community is constantly coming up with new, nifty ways to speed up inference time for ever-larger LLMs. As an example, one such promising research direction is [speculative decoding](https://huggingface.co/papers/2211.17192) where "easy tokens" are generated by smaller, faster language models and only "hard tokens" are generated by the LLM itself. Going into more detail is out of the scope of this notebook, but can be read upon in this [nice blog post](https://huggingface.co/blog/assisted-generation). diff --git a/docs/source/en/main_classes/callback.md b/docs/source/en/main_classes/callback.md index b29c9e7264ec..bc1413a94742 100644 --- a/docs/source/en/main_classes/callback.md +++ b/docs/source/en/main_classes/callback.md @@ -54,7 +54,6 @@ The main class that implements callbacks is [`TrainerCallback`]. It gets the Trainer's internal state via [`TrainerState`], and can take some actions on the training loop via [`TrainerControl`]. - ## Available Callbacks Here is the list of the available [`TrainerCallback`] in the library: diff --git a/docs/source/en/main_classes/configuration.md b/docs/source/en/main_classes/configuration.md index 0cfef06d3ce9..933621f6a144 100644 --- a/docs/source/en/main_classes/configuration.md +++ b/docs/source/en/main_classes/configuration.md @@ -24,7 +24,6 @@ Each derived config class implements model specific attributes. Common attribute `hidden_size`, `num_attention_heads`, and `num_hidden_layers`. Text models further implement: `vocab_size`. - ## PretrainedConfig [[autodoc]] PretrainedConfig diff --git a/docs/source/en/main_classes/data_collator.md b/docs/source/en/main_classes/data_collator.md index 2941338375be..33d156ec93fe 100644 --- a/docs/source/en/main_classes/data_collator.md +++ b/docs/source/en/main_classes/data_collator.md @@ -25,7 +25,6 @@ on the formed batch. Examples of use can be found in the [example scripts](../examples) or [example notebooks](../notebooks). - ## Default data collator [[autodoc]] data.data_collator.default_data_collator diff --git a/docs/source/en/main_classes/deepspeed.md b/docs/source/en/main_classes/deepspeed.md index 0b9e28656c09..b04949229da4 100644 --- a/docs/source/en/main_classes/deepspeed.md +++ b/docs/source/en/main_classes/deepspeed.md @@ -16,7 +16,7 @@ rendered properly in your Markdown viewer. # DeepSpeed -[DeepSpeed](https://github.com/deepspeedai/DeepSpeed), powered by Zero Redundancy Optimizer (ZeRO), is an optimization library for training and fitting very large models onto a GPU. It is available in several ZeRO stages, where each stage progressively saves more GPU memory by partitioning the optimizer state, gradients, parameters, and enabling offloading to a CPU or NVMe. DeepSpeed is integrated with the [`Trainer`] class and most of the setup is automatically taken care of for you. 
+[DeepSpeed](https://github.com/deepspeedai/DeepSpeed), powered by Zero Redundancy Optimizer (ZeRO), is an optimization library for training and fitting very large models onto a GPU. It is available in several ZeRO stages, where each stage progressively saves more GPU memory by partitioning the optimizer state, gradients, parameters, and enabling offloading to a CPU or NVMe. DeepSpeed is integrated with the [`Trainer`] class and most of the setup is automatically taken care of for you. However, if you want to use DeepSpeed without the [`Trainer`], Transformers provides a [`HfDeepSpeedConfig`] class. diff --git a/docs/source/en/main_classes/executorch.md b/docs/source/en/main_classes/executorch.md index 3178085c9135..3406309aa325 100644 --- a/docs/source/en/main_classes/executorch.md +++ b/docs/source/en/main_classes/executorch.md @@ -15,14 +15,12 @@ rendered properly in your Markdown viewer. --> - # ExecuTorch [`ExecuTorch`](https://github.com/pytorch/executorch) is an end-to-end solution for enabling on-device inference capabilities across mobile and edge devices including wearables, embedded devices and microcontrollers. It is part of the PyTorch ecosystem and supports the deployment of PyTorch models with a focus on portability, productivity, and performance. ExecuTorch introduces well defined entry points to perform model, device, and/or use-case specific optimizations such as backend delegation, user-defined compiler transformations, memory planning, and more. The first step in preparing a PyTorch model for execution on an edge device using ExecuTorch is to export the model. This is achieved through the use of a PyTorch API called [`torch.export`](https://pytorch.org/docs/stable/export.html). - ## ExecuTorch Integration An integration point is being developed to ensure that 🤗 Transformers can be exported using `torch.export`. The goal of this integration is not only to enable export but also to ensure that the exported artifact can be further lowered and optimized to run efficiently in `ExecuTorch`, particularly for mobile and edge use cases. diff --git a/docs/source/en/main_classes/feature_extractor.md b/docs/source/en/main_classes/feature_extractor.md index fd451a35481a..294ecad6309e 100644 --- a/docs/source/en/main_classes/feature_extractor.md +++ b/docs/source/en/main_classes/feature_extractor.md @@ -18,7 +18,6 @@ rendered properly in your Markdown viewer. A feature extractor is in charge of preparing input features for audio or vision models. This includes feature extraction from sequences, e.g., pre-processing audio files to generate Log-Mel Spectrogram features, feature extraction from images, e.g., cropping image files, but also padding, normalization, and conversion to NumPy and PyTorch tensors. - ## FeatureExtractionMixin [[autodoc]] feature_extraction_utils.FeatureExtractionMixin diff --git a/docs/source/en/main_classes/image_processor.md b/docs/source/en/main_classes/image_processor.md index 7dc9de60571f..61be0306630d 100644 --- a/docs/source/en/main_classes/image_processor.md +++ b/docs/source/en/main_classes/image_processor.md @@ -26,6 +26,7 @@ from transformers import AutoImageProcessor processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50", use_fast=True) ``` + Note that `use_fast` will be set to `True` by default in a future release. When using a fast image processor, you can also set the `device` argument to specify the device on which the processing should be done. 
By default, the processing is done on the same device as the inputs if the inputs are tensors, or on the CPU otherwise. @@ -57,7 +58,6 @@ Here are some speed comparisons between the base and fast image processors for t These benchmarks were run on an [AWS EC2 g5.2xlarge instance](https://aws.amazon.com/ec2/instance-types/g5/), utilizing an NVIDIA A10G Tensor Core GPU. - ## ImageProcessingMixin [[autodoc]] image_processing_utils.ImageProcessingMixin @@ -72,7 +72,6 @@ These benchmarks were run on an [AWS EC2 g5.2xlarge instance](https://aws.amazon [[autodoc]] image_processing_utils.BaseImageProcessor - ## BaseImageProcessorFast [[autodoc]] image_processing_utils_fast.BaseImageProcessorFast diff --git a/docs/source/en/main_classes/logging.md b/docs/source/en/main_classes/logging.md index 5cbdf9ae27ed..330c68218bf9 100644 --- a/docs/source/en/main_classes/logging.md +++ b/docs/source/en/main_classes/logging.md @@ -55,7 +55,6 @@ logger.info("INFO") logger.warning("WARN") ``` - All the methods of this logging module are documented below, the main ones are [`logging.get_verbosity`] to get the current level of verbosity in the logger and [`logging.set_verbosity`] to set the verbosity to the level of your choice. In order (from the least @@ -81,6 +80,7 @@ We use both in the `transformers` library. We leverage and adapt `logging`'s `ca management of these warning messages by the verbosity setters above. What does that mean for developers of the library? We should respect the following heuristics: + - `warnings` should be favored for developers of the library and libraries dependent on `transformers` - `logging` should be used for end-users of the library using it in every-day projects diff --git a/docs/source/en/main_classes/model.md b/docs/source/en/main_classes/model.md index d7768a905ce0..e3e77a8e2e13 100644 --- a/docs/source/en/main_classes/model.md +++ b/docs/source/en/main_classes/model.md @@ -26,7 +26,6 @@ file or directory, or from a pretrained model configuration provided by the libr The other methods that are common to each model are defined in [`~modeling_utils.ModuleUtilsMixin`] and [`~generation.GenerationMixin`]. - ## PreTrainedModel [[autodoc]] PreTrainedModel diff --git a/docs/source/en/main_classes/onnx.md b/docs/source/en/main_classes/onnx.md index 81d31c97e88d..5f8869948d2b 100644 --- a/docs/source/en/main_classes/onnx.md +++ b/docs/source/en/main_classes/onnx.md @@ -51,4 +51,3 @@ to export models for different types of topologies or tasks. ### FeaturesManager [[autodoc]] onnx.features.FeaturesManager - diff --git a/docs/source/en/main_classes/optimizer_schedules.md b/docs/source/en/main_classes/optimizer_schedules.md index 84d9ca7b907e..3bab249ab4ee 100644 --- a/docs/source/en/main_classes/optimizer_schedules.md +++ b/docs/source/en/main_classes/optimizer_schedules.md @@ -22,7 +22,6 @@ The `.optimization` module provides: - several schedules in the form of schedule objects that inherit from `_LRSchedule`: - a gradient accumulation class to accumulate the gradients of multiple batches - ## AdaFactor [[autodoc]] Adafactor diff --git a/docs/source/en/main_classes/output.md b/docs/source/en/main_classes/output.md index 295f99e21d10..8a9ae879fb19 100644 --- a/docs/source/en/main_classes/output.md +++ b/docs/source/en/main_classes/output.md @@ -47,7 +47,6 @@ However, this is not always the case. Some models apply normalization or subsequ - You can access each attribute as you would usually do, and if that attribute has not been returned by the model, you will get `None`. 
Here for instance `outputs.loss` is the loss computed by the model, and `outputs.attentions` is `None`. diff --git a/docs/source/en/main_classes/pipelines.md b/docs/source/en/main_classes/pipelines.md index 0e4cf55995bf..2a63deeba378 100644 --- a/docs/source/en/main_classes/pipelines.md +++ b/docs/source/en/main_classes/pipelines.md @@ -81,7 +81,6 @@ for out in tqdm(pipe(KeyDataset(dataset, "file"))): For ease of use, a generator is also possible: - ```python from transformers import pipeline @@ -160,7 +159,7 @@ for batch_size in [1, 8, 64, 256]: pass ``` -``` +```text # On GTX 970 ------------------------------ Streaming no batching @@ -196,8 +195,7 @@ This is a occasional very long sentence compared to the other. In that case, the tokens long, so the whole batch will be [64, 400] instead of [64, 4], leading to the high slowdown. Even worse, on bigger batches, the program simply crashes. - -``` +```text ------------------------------ Streaming no batching 100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [00:05<00:00, 183.69it/s] @@ -245,7 +243,6 @@ multiple forward pass of a model. Under normal circumstances, this would yield i In order to circumvent this issue, both of these pipelines are a bit specific, they are `ChunkPipeline` instead of regular `Pipeline`. In short: - ```python preprocessed = pipe.preprocess(inputs) model_outputs = pipe.forward(preprocessed) @@ -254,7 +251,6 @@ outputs = pipe.postprocess(model_outputs) Now becomes: - ```python all_model_outputs = [] for preprocessed in pipe.preprocess(inputs): @@ -282,7 +278,6 @@ If you want to override a specific pipeline. Don't hesitate to create an issue for your task at hand, the goal of the pipeline is to be easy to use and support most cases, so `transformers` could maybe support your use case. - If you want to try simply you can: - Subclass your pipeline of choice @@ -302,7 +297,6 @@ my_pipeline = pipeline(model="xxxx", pipeline_class=MyPipeline) That should enable you to do all the custom code you want. - ## Implementing a pipeline [Implementing a new pipeline](../add_new_pipeline) @@ -329,7 +323,6 @@ Pipelines available for audio tasks include the following. - __call__ - all - ### ZeroShotAudioClassificationPipeline [[autodoc]] ZeroShotAudioClassificationPipeline diff --git a/docs/source/en/main_classes/processors.md b/docs/source/en/main_classes/processors.md index 2c2e0cd31b72..44a2bceeca68 100644 --- a/docs/source/en/main_classes/processors.md +++ b/docs/source/en/main_classes/processors.md @@ -17,6 +17,7 @@ rendered properly in your Markdown viewer. # Processors Processors can mean two different things in the Transformers library: + - the objects that pre-process inputs for multi-modal models such as [Wav2Vec2](../model_doc/wav2vec2) (speech and text) or [CLIP](../model_doc/clip) (text and vision) - deprecated objects that were used in older versions of the library to preprocess data for GLUE or SQUAD. @@ -71,7 +72,6 @@ Additionally, the following method can be used to load values from a data file a [[autodoc]] data.processors.glue.glue_convert_examples_to_features - ## XNLI [The Cross-Lingual NLI Corpus (XNLI)](https://www.nyu.edu/projects/bowman/xnli/) is a benchmark that evaluates the @@ -88,7 +88,6 @@ Please note that since the gold labels are available on the test set, evaluation An example using these processors is given in the [run_xnli.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_xnli.py) script. 
- ## SQuAD [The Stanford Question Answering Dataset (SQuAD)](https://rajpurkar.github.io/SQuAD-explorer//) is a benchmark that @@ -115,11 +114,9 @@ Additionally, the following method can be used to convert SQuAD examples into [[autodoc]] data.processors.squad.squad_convert_examples_to_features - These processors as well as the aforementioned method can be used with files containing the data as well as with the *tensorflow_datasets* package. Examples are given below. - ### Example usage Here is an example using the processors as well as the conversion method using data files: diff --git a/docs/source/en/main_classes/text_generation.md b/docs/source/en/main_classes/text_generation.md index cb853f722e1d..d879669bcab8 100644 --- a/docs/source/en/main_classes/text_generation.md +++ b/docs/source/en/main_classes/text_generation.md @@ -30,15 +30,15 @@ like token streaming. ## GenerationConfig [[autodoc]] generation.GenerationConfig - - from_pretrained - - from_model_config - - save_pretrained - - update - - validate - - get_generation_mode + - from_pretrained + - from_model_config + - save_pretrained + - update + - validate + - get_generation_mode ## GenerationMixin [[autodoc]] GenerationMixin - - generate - - compute_transition_scores + - generate + - compute_transition_scores diff --git a/docs/source/en/main_classes/tokenizer.md b/docs/source/en/main_classes/tokenizer.md index 83d2ae5df6a7..52c9751226d4 100644 --- a/docs/source/en/main_classes/tokenizer.md +++ b/docs/source/en/main_classes/tokenizer.md @@ -22,7 +22,7 @@ Rust library [🤗 Tokenizers](https://github.com/huggingface/tokenizers). The " 1. a significant speed-up in particular when doing batched tokenization and 2. additional methods to map between the original string (character and words) and the token space (e.g. getting the - index of the token comprising a given character or the span of characters corresponding to a given token). + index of the token comprising a given character or the span of characters corresponding to a given token). The base classes [`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`] implement the common methods for encoding string inputs in model inputs (see below) and instantiating/saving python and @@ -50,12 +50,11 @@ several advanced alignment methods which can be used to map between the original token space (e.g., getting the index of the token comprising a given character or the span of characters corresponding to a given token). - # Multimodal Tokenizer Apart from that each tokenizer can be a "multimodal" tokenizer which means that the tokenizer will hold all relevant special tokens as part of tokenizer attributes for easier access. For example, if the tokenizer is loaded from a vision-language model like LLaVA, you will -be able to access `tokenizer.image_token_id` to obtain the special image token used as a placeholder. +be able to access `tokenizer.image_token_id` to obtain the special image token used as a placeholder. To enable extra special tokens for any type of tokenizer, you have to add the following lines and save the tokenizer. Extra special tokens do not have to be modality related and can be anything that the model often needs access to.
In the below code, tokenizer at `output_dir` will have direct access diff --git a/docs/source/en/main_classes/video_processor.md b/docs/source/en/main_classes/video_processor.md index ee69030ab1a1..29d29d0cb605 100644 --- a/docs/source/en/main_classes/video_processor.md +++ b/docs/source/en/main_classes/video_processor.md @@ -22,7 +22,6 @@ The video processor extends the functionality of image processors by allowing Vi When adding a new VLM or updating an existing one to enable distinct video preprocessing, saving and reloading the processor configuration will store the video related arguments in a dedicated file named `video_preprocessing_config.json`. Don't worry if you haven't updated your VLM, the processor will try to load video related configurations from a file named `preprocessing_config.json`. - ### Usage Example Here's an example of how to load a video processor with [`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf) model: @@ -59,7 +58,6 @@ The video processor can also sample video frames using the technique best suited - ```python from transformers import AutoVideoProcessor @@ -92,4 +90,3 @@ print(processed_video_inputs.pixel_values_videos.shape) ## BaseVideoProcessor [[autodoc]] video_processing_utils.BaseVideoProcessor - diff --git a/docs/source/en/model_doc/aimv2.md b/docs/source/en/model_doc/aimv2.md index 9d0abbaaf36b..acf9c4de12fe 100644 --- a/docs/source/en/model_doc/aimv2.md +++ b/docs/source/en/model_doc/aimv2.md @@ -25,7 +25,6 @@ The abstract from the paper is the following: *We introduce a novel method for pre-training of large-scale vision encoders. Building on recent advancements in autoregressive pre-training of vision models, we extend this framework to a multimodal setting, i.e., images and text. In this paper, we present AIMV2, a family of generalist vision encoders characterized by a straightforward pre-training process, scalability, and remarkable performance across a range of downstream tasks. This is achieved by pairing the vision encoder with a multimodal decoder that autoregressively generates raw image patches and text tokens. Our encoders excel not only in multimodal evaluations but also in vision benchmarks such as localization, grounding, and classification. Notably, our AIMV2-3B encoder achieves 89.5% accuracy on ImageNet-1k with a frozen trunk. Furthermore, AIMV2 consistently outperforms state-of-the-art contrastive models (e.g., CLIP, SigLIP) in multimodal image understanding across diverse settings.* - This model was contributed by [Yaswanth Gali](https://huggingface.co/yaswanthgali). The original code can be found [here](https://github.com/apple/ml-aim). diff --git a/docs/source/en/model_doc/align.md b/docs/source/en/model_doc/align.md index 7379c84fc3a9..275b510ccd5c 100644 --- a/docs/source/en/model_doc/align.md +++ b/docs/source/en/model_doc/align.md @@ -148,6 +148,7 @@ for label, score in zip(candidate_labels, probs): ``` ## Resources + - Refer to the [Kakao Brain’s Open Source ViT, ALIGN, and the New COYO Text-Image Dataset](https://huggingface.co/blog/vit-align) blog post for more details. 
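Beyond the zero-shot classification snippet shown earlier on this page, a common use of a dual-encoder like ALIGN is to compute image and text embeddings separately for retrieval. The sketch below assumes the `kakaobrain/align-base` checkpoint and the `get_image_features`/`get_text_features` helpers; treat the URL and captions as placeholders.

```py
import requests
import torch
from PIL import Image
from transformers import AlignModel, AlignProcessor

processor = AlignProcessor.from_pretrained("kakaobrain/align-base")
model = AlignModel.from_pretrained("kakaobrain/align-base")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
captions = ["two cats sleeping on a couch", "a plate of food"]

inputs = processor(images=image, text=captions, padding=True, return_tensors="pt")

with torch.no_grad():
    image_embeds = model.get_image_features(pixel_values=inputs["pixel_values"])
    text_embeds = model.get_text_features(
        input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]
    )

# Normalize and compare with cosine similarity; a higher score means a better image-caption match.
image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)
print(image_embeds @ text_embeds.T)
```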
## AlignConfig diff --git a/docs/source/en/model_doc/arcee.md b/docs/source/en/model_doc/arcee.md index a5335608edb1..ebedd73a4a46 100644 --- a/docs/source/en/model_doc/arcee.md +++ b/docs/source/en/model_doc/arcee.md @@ -102,4 +102,4 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True)) ## ArceeForTokenClassification [[autodoc]] ArceeForTokenClassification - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/aria.md b/docs/source/en/model_doc/aria.md index e5f4afa7b7ae..ddd0815aaa57 100644 --- a/docs/source/en/model_doc/aria.md +++ b/docs/source/en/model_doc/aria.md @@ -98,7 +98,7 @@ print(response) Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends. - + The example below uses [torchao](../quantization/torchao) to only quantize the weights to int4 and the [rhymes-ai/Aria-sequential_mlp](https://huggingface.co/rhymes-ai/Aria-sequential_mlp) checkpoint. This checkpoint replaces grouped GEMM with `torch.nn.Linear` layers for easier quantization. ```py @@ -142,7 +142,6 @@ response = processor.decode(output_ids, skip_special_tokens=True) print(response) ``` - ## AriaImageProcessor [[autodoc]] AriaImageProcessor diff --git a/docs/source/en/model_doc/audio-spectrogram-transformer.md b/docs/source/en/model_doc/audio-spectrogram-transformer.md index 40115810467a..bced0a4b2bcc 100644 --- a/docs/source/en/model_doc/audio-spectrogram-transformer.md +++ b/docs/source/en/model_doc/audio-spectrogram-transformer.md @@ -52,16 +52,16 @@ the authors compute the stats for a downstream dataset. ### Using Scaled Dot Product Attention (SDPA) -PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function -encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the -[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) +PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function +encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the +[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) page for more information. -SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set +SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set `attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. -``` +```py from transformers import ASTForAudioClassification model = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593", attn_implementation="sdpa", dtype=torch.float16) ... 
diff --git a/docs/source/en/model_doc/auto.md b/docs/source/en/model_doc/auto.md index 2f8cbc2009b3..c1db5e2541a6 100644 --- a/docs/source/en/model_doc/auto.md +++ b/docs/source/en/model_doc/auto.md @@ -23,7 +23,6 @@ automatically retrieve the relevant model given the name/path to the pretrained Instantiating one of [`AutoConfig`], [`AutoModel`], and [`AutoTokenizer`] will directly create a class of the relevant architecture. For instance - ```python model = AutoModel.from_pretrained("google-bert/bert-base-cased") ``` diff --git a/docs/source/en/model_doc/aya_vision.md b/docs/source/en/model_doc/aya_vision.md index 1f02b30344a2..d0822173e898 100644 --- a/docs/source/en/model_doc/aya_vision.md +++ b/docs/source/en/model_doc/aya_vision.md @@ -29,7 +29,7 @@ You can find all the original Aya Vision checkpoints under the [Aya Vision](http > [!TIP] > This model was contributed by [saurabhdash](https://huggingface.co/saurabhdash) and [yonigozlan](https://huggingface.co/yonigozlan). -> +> > Click on the Aya Vision models in the right sidebar for more examples of how to apply Aya Vision to different image-to-text tasks. The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class. diff --git a/docs/source/en/model_doc/bark.md b/docs/source/en/model_doc/bark.md index a5787ab234ee..6024b0e83ed5 100644 --- a/docs/source/en/model_doc/bark.md +++ b/docs/source/en/model_doc/bark.md @@ -76,7 +76,7 @@ Note that 🤗 Optimum must be installed before using this feature. [Here's how Flash Attention 2 is an even faster, optimized version of the previous optimization. -##### Installation +##### Installation First, check whether your hardware is compatible with Flash Attention 2. The latest list of compatible hardware can be found in the [official documentation](https://github.com/Dao-AILab/flash-attention#installation-and-features). If your hardware is not compatible with Flash Attention 2, you can still benefit from attention kernel optimisations through Better Transformer support covered [above](https://huggingface.co/docs/transformers/main/en/model_doc/bark#using-better-transformer). @@ -86,7 +86,6 @@ Next, [install](https://github.com/Dao-AILab/flash-attention#installation-and-fe pip install -U flash-attn --no-build-isolation ``` - ##### Usage To load a model using Flash Attention 2, we can pass the `attn_implementation="flash_attention_2"` flag to [`.from_pretrained`](https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained). We'll also load the model in half-precision (e.g. `torch.float16`), since it results in almost no degradation to audio quality but significantly lower memory usage and faster inference: @@ -97,7 +96,6 @@ model = BarkModel.from_pretrained("suno/bark-small", dtype=torch.float16, attn_i ##### Performance comparison - The following diagram shows the latency for the native attention implementation (no optimisation) against Better Transformer and Flash Attention 2. In all cases, we generate 400 semantic tokens on a 40GB A100 GPU with PyTorch 2.1. Flash Attention 2 is also consistently faster than Better Transformer, and its performance improves even more as batch sizes increase:
@@ -108,7 +106,6 @@ To put this into perspective, on an NVIDIA A100 and when generating 400 semantic At batch size 8, on an NVIDIA A100, Flash Attention 2 is also 10% faster than Better Transformer, and at batch size 16, 25%. - #### Combining optimization techniques You can combine optimization techniques, and use CPU offload, half-precision and Flash Attention 2 (or 🤗 Better Transformer) all at once. @@ -147,7 +144,7 @@ These presets are also uploaded in the hub [here](https://huggingface.co/suno/ba >>> audio_array = audio_array.cpu().numpy().squeeze() ``` -Bark can generate highly realistic, **multilingual** speech as well as other audio - including music, background noise and simple sound effects. +Bark can generate highly realistic, **multilingual** speech as well as other audio - including music, background noise and simple sound effects. ```python >>> # Multilingual speech - simplified Chinese @@ -165,7 +162,6 @@ Bark can generate highly realistic, **multilingual** speech as well as other aud The model can also produce **nonverbal communications** like laughing, sighing and crying. - ```python >>> # Adding non-speech cues to the input text >>> inputs = processor("Hello uh ... [clears throat], my dog is cute [laughter]") @@ -235,4 +231,3 @@ To save the audio, simply take the sample rate from the model config and some sc [[autodoc]] BarkSemanticConfig - all - diff --git a/docs/source/en/model_doc/bart.md b/docs/source/en/model_doc/bart.md index b0252ea92311..daa65d6afc0c 100644 --- a/docs/source/en/model_doc/bart.md +++ b/docs/source/en/model_doc/bart.md @@ -15,7 +15,6 @@ rendered properly in your Markdown viewer. --> *This model was released on 2019-10-29 and added to Hugging Face Transformers on 2020-11-16.* -
PyTorch @@ -24,7 +23,7 @@ rendered properly in your Markdown viewer.
# BART -[BART](https://huggingface.co/papers/1910.13461) is a sequence-to-sequence model that combines the pretraining objectives from BERT and GPT. It’s pretrained by corrupting text in different ways like deleting words, shuffling sentences, or masking tokens and learning how to fix it. The encoder encodes the corrupted document and the corrupted text is fixed by the decoder. As it learns to recover the original text, BART gets really good at both understanding and generating language. +[BART](https://huggingface.co/papers/1910.13461) is a sequence-to-sequence model that combines the pretraining objectives from BERT and GPT. It's pretrained by corrupting text in different ways like deleting words, shuffling sentences, or masking tokens and learning how to fix it. The encoder encodes the corrupted document and the corrupted text is fixed by the decoder. As it learns to recover the original text, BART gets really good at both understanding and generating language. You can find all the original BART checkpoints under the [AI at Meta](https://huggingface.co/facebook?search_models=bart) organization. @@ -46,6 +45,7 @@ pipeline = pipeline( pipeline("Plants create through a process known as photosynthesis.") ``` + @@ -89,7 +89,7 @@ echo -e "Plants create through a process known as photosynthesis." | tran - Inputs should be padded on the right because BERT uses absolute position embeddings. - The [facebook/bart-large-cnn](https://huggingface.co/facebook/bart-large-cnn) checkpoint doesn't include `mask_token_id` which means it can't perform mask-filling tasks. -- BART doesn’t use `token_type_ids` for sequence classification. Use [`BartTokenizer`] or [`~PreTrainedTokenizerBase.encode`] to get the proper splitting. +- BART doesn't use `token_type_ids` for sequence classification. Use [`BartTokenizer`] or [`~PreTrainedTokenizerBase.encode`] to get the proper splitting. - The forward pass of [`BartModel`] creates the `decoder_input_ids` if they're not passed. This can be different from other model APIs, but it is a useful feature for mask-filling tasks. - Model predictions are intended to be identical to the original implementation when `forced_bos_token_id=0`. This only works if the text passed to `fairseq.encode` begins with a space. - [`~GenerationMixin.generate`] should be used for conditional generation tasks like summarization. diff --git a/docs/source/en/model_doc/barthez.md b/docs/source/en/model_doc/barthez.md index 43b6521f1013..f7a100a4208c 100644 --- a/docs/source/en/model_doc/barthez.md +++ b/docs/source/en/model_doc/barthez.md @@ -31,7 +31,6 @@ You can find all of the original BARThez checkpoints under the [BARThez](https:/ > This model was contributed by [moussakam](https://huggingface.co/moussakam). > Refer to the [BART](./bart) docs for more usage examples. - The example below demonstrates how to predict the `` token with [`Pipeline`], [`AutoModel`], and from the command line. diff --git a/docs/source/en/model_doc/bartpho.md b/docs/source/en/model_doc/bartpho.md index 9e86a1b615d0..15e96c57669f 100644 --- a/docs/source/en/model_doc/bartpho.md +++ b/docs/source/en/model_doc/bartpho.md @@ -33,12 +33,9 @@ You can find all the original checkpoints under the [VinAI](https://huggingface. The example below demonstrates how to summarize text with [`Pipeline`] or the [`AutoModel`] class. 
- - - ```python import torch from transformers import pipeline @@ -98,8 +95,6 @@ transformers run --task summarization --model vinai/bartpho-word --device 0 - - ## Notes - BARTpho uses the large architecture of BART with an additional layer-normalization layer on top of the encoder and decoder. The BART-specific classes should be replaced with the mBART-specific classes. diff --git a/docs/source/en/model_doc/beit.md b/docs/source/en/model_doc/beit.md index b66021ec8d98..ee516a935ed4 100644 --- a/docs/source/en/model_doc/beit.md +++ b/docs/source/en/model_doc/beit.md @@ -87,7 +87,7 @@ page for more information. SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set `attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. -``` +```py from transformers import BeitForImageClassification model = BeitForImageClassification.from_pretrained("microsoft/beit-base-patch16-224", attn_implementation="sdpa", dtype=torch.float16) ... @@ -123,6 +123,7 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h - See also: [Image classification task guide](../tasks/image_classification) **Semantic segmentation** + - [Semantic segmentation task guide](../tasks/semantic_segmentation) If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. diff --git a/docs/source/en/model_doc/bert-generation.md b/docs/source/en/model_doc/bert-generation.md index 38cbe2137eb7..d57734b069ba 100644 --- a/docs/source/en/model_doc/bert-generation.md +++ b/docs/source/en/model_doc/bert-generation.md @@ -13,6 +13,7 @@ specific language governing permissions and limitations under the License. rendered properly in your Markdown viewer. --> +*This model was released on 2019-07-29 and added to Hugging Face Transformers on 2020-11-16.*
@@ -155,4 +156,4 @@ print(tokenizer.decode(outputs[0])) ## BertGenerationDecoder [[autodoc]] BertGenerationDecoder - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/bert-japanese.md b/docs/source/en/model_doc/bert-japanese.md index 812e5a455ad5..6599efa73e08 100644 --- a/docs/source/en/model_doc/bert-japanese.md +++ b/docs/source/en/model_doc/bert-japanese.md @@ -81,7 +81,6 @@ API reference information. - ## BertJapaneseTokenizer [[autodoc]] BertJapaneseTokenizer diff --git a/docs/source/en/model_doc/bertweet.md b/docs/source/en/model_doc/bertweet.md index 4dffe29168d3..20206da87e43 100644 --- a/docs/source/en/model_doc/bertweet.md +++ b/docs/source/en/model_doc/bertweet.md @@ -24,8 +24,7 @@ rendered properly in your Markdown viewer. ## BERTweet -[BERTweet](https://huggingface.co/papers/2005.10200) shares the same architecture as [BERT-base](./bert), but it’s pretrained like [RoBERTa](./roberta) on English Tweets. It performs really well on Tweet-related tasks like part-of-speech tagging, named entity recognition, and text classification. - +[BERTweet](https://huggingface.co/papers/2005.10200) shares the same architecture as [BERT-base](./bert), but it's pretrained like [RoBERTa](./roberta) on English Tweets. It performs really well on Tweet-related tasks like part-of-speech tagging, named entity recognition, and text classification. You can find all the original BERTweet checkpoints under the [VinAI Research](https://huggingface.co/vinai?search_models=BERTweet) organization. @@ -49,6 +48,7 @@ pipeline = pipeline( ) pipeline("Plants create through a process known as photosynthesis.") ``` + @@ -88,7 +88,8 @@ echo -e "Plants create through a process known as photosynthesis." | tran ## Notes -- Use the [`AutoTokenizer`] or [`BertweetTokenizer`] because it’s preloaded with a custom vocabulary adapted to tweet-specific tokens like hashtags (#), mentions (@), emojis, and common abbreviations. Make sure to also install the [emoji](https://pypi.org/project/emoji/) library. + +- Use the [`AutoTokenizer`] or [`BertweetTokenizer`] because it's preloaded with a custom vocabulary adapted to tweet-specific tokens like hashtags (#), mentions (@), emojis, and common abbreviations. Make sure to also install the [emoji](https://pypi.org/project/emoji/) library. - Inputs should be padded on the right (`padding="max_length"`) because BERT uses absolute position embeddings. ## BertweetTokenizer diff --git a/docs/source/en/model_doc/big_bird.md b/docs/source/en/model_doc/big_bird.md index 2d3b6d545faf..b4bfeefa516a 100644 --- a/docs/source/en/model_doc/big_bird.md +++ b/docs/source/en/model_doc/big_bird.md @@ -47,6 +47,7 @@ pipeline = pipeline( ) pipeline("Plants create [MASK] through a process known as photosynthesis.") ``` + @@ -81,10 +82,12 @@ print(f"The predicted token is: {predicted_token}") ```bash !echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers-cli run --task fill-mask --model google/bigbird-roberta-base --device 0 ``` + ## Notes + - Inputs should be padded on the right because BigBird uses absolute position embeddings. - BigBird supports `original_full` and `block_sparse` attention. If the input sequence length is less than 1024, it is recommended to use `original_full` since sparse patterns don't offer much benefit for smaller inputs. - The current implementation uses window size of 3 blocks and 2 global blocks, only supports the ITC-implementation, and doesn't support `num_random_blocks=0`. 
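A minimal sketch of the `original_full` vs. `block_sparse` note above for BigBird; the `google/bigbird-roberta-base` checkpoint is an assumption, and `attention_type`, `block_size`, and `num_random_blocks` are the configuration knobs involved.

```python
from transformers import BigBirdConfig, BigBirdForMaskedLM

# Short inputs (< 1024 tokens): full attention, since sparsity buys little here.
model = BigBirdForMaskedLM.from_pretrained(
    "google/bigbird-roberta-base", attention_type="original_full"
)

# Long inputs: block-sparse attention with explicit block settings.
config = BigBirdConfig.from_pretrained(
    "google/bigbird-roberta-base",
    attention_type="block_sparse",
    block_size=64,
    num_random_blocks=3,
)
model = BigBirdForMaskedLM.from_pretrained("google/bigbird-roberta-base", config=config)
```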
diff --git a/docs/source/en/model_doc/bigbird_pegasus.md b/docs/source/en/model_doc/bigbird_pegasus.md index cae1e8f779d4..c4a6d54b9442 100644 --- a/docs/source/en/model_doc/bigbird_pegasus.md +++ b/docs/source/en/model_doc/bigbird_pegasus.md @@ -52,6 +52,7 @@ Through photosynthesis, plants capture energy from sunlight using a green pigmen These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure. This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle.""") ``` + @@ -77,6 +78,7 @@ input_ids = tokenizer(input_text, return_tensors="pt").to(model.device) output = model.generate(**input_ids, cache_implementation="static") print(tokenizer.decode(output[0], skip_special_tokens=True)) ``` + diff --git a/docs/source/en/model_doc/biogpt.md b/docs/source/en/model_doc/biogpt.md index 60b84f015122..82c2cb0e8cd0 100644 --- a/docs/source/en/model_doc/biogpt.md +++ b/docs/source/en/model_doc/biogpt.md @@ -135,31 +135,26 @@ print(output) [[autodoc]] BioGptConfig - ## BioGptTokenizer [[autodoc]] BioGptTokenizer - save_vocabulary - ## BioGptModel [[autodoc]] BioGptModel - forward - ## BioGptForCausalLM [[autodoc]] BioGptForCausalLM - forward - ## BioGptForTokenClassification [[autodoc]] BioGptForTokenClassification - forward - ## BioGptForSequenceClassification [[autodoc]] BioGptForSequenceClassification diff --git a/docs/source/en/model_doc/bit.md b/docs/source/en/model_doc/bit.md index 5a6630566fca..5ed3b8f816ab 100644 --- a/docs/source/en/model_doc/bit.md +++ b/docs/source/en/model_doc/bit.md @@ -36,6 +36,7 @@ The original code can be found [here](https://github.com/google-research/big_tra ## Usage tips - BiT models are equivalent to ResNetv2 in terms of architecture, except that: 1) all batch normalization layers are replaced by [group normalization](https://huggingface.co/papers/1803.08494), + 2) [weight standardization](https://huggingface.co/papers/1903.10520) is used for convolutional layers. The authors show that the combination of both is useful for training with large batch sizes, and has a significant impact on transfer learning. @@ -72,4 +73,4 @@ If you're interested in submitting a resource to be included here, please feel f ## BitForImageClassification [[autodoc]] BitForImageClassification - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/bitnet.md b/docs/source/en/model_doc/bitnet.md index 6946ec65d437..c674f51fc305 100644 --- a/docs/source/en/model_doc/bitnet.md +++ b/docs/source/en/model_doc/bitnet.md @@ -35,33 +35,29 @@ Several versions of the model weights are available on Hugging Face: * [**`microsoft/bitnet-b1.58-2B-4T-gguf`**](https://huggingface.co/microsoft/bitnet-b1.58-2B-4T-gguf): Contains the model weights in GGUF format, compatible with the `bitnet.cpp` library for CPU inference. - ### Model Details - * **Architecture:** Transformer-based, modified with `BitLinear` layers (BitNet framework). - * Uses Rotary Position Embeddings (RoPE). - * Uses squared ReLU (ReLU²) activation in FFN layers. - * Employs [`subln`](https://proceedings.mlr.press/v202/wang23u.html) normalization. 
- * No bias terms in linear or normalization layers. + * Uses Rotary Position Embeddings (RoPE). + * Uses squared ReLU (ReLU²) activation in FFN layers. + * Employs [`subln`](https://proceedings.mlr.press/v202/wang23u.html) normalization. + * No bias terms in linear or normalization layers. * **Quantization:** Native 1.58-bit weights and 8-bit activations (W1.58A8). - * Weights are quantized to ternary values {-1, 0, +1} using absmean quantization during the forward pass. - * Activations are quantized to 8-bit integers using absmax quantization (per-token). - * **Crucially, the model was *trained from scratch* with this quantization scheme, not post-training quantized.** + * Weights are quantized to ternary values {-1, 0, +1} using absmean quantization during the forward pass. + * Activations are quantized to 8-bit integers using absmax quantization (per-token). + * **Crucially, the model was *trained from scratch* with this quantization scheme, not post-training quantized.** * **Parameters:** ~2 Billion * **Training Tokens:** 4 Trillion -* **Context Length:** Maximum sequence length of **4096 tokens**. - * *Recommendation:* For optimal performance on tasks requiring very long contexts (beyond the pre-training length or for specialized long-reasoning tasks), we recommend performing intermediate long-sequence adaptation/training before the final fine-tuning stage. +* **Context Length:** Maximum sequence length of **4096 tokens**. + * *Recommendation:* For optimal performance on tasks requiring very long contexts (beyond the pre-training length or for specialized long-reasoning tasks), we recommend performing intermediate long-sequence adaptation/training before the final fine-tuning stage. * **Training Stages:** - 1. **Pre-training:** Large-scale training on public text/code and synthetic math data using a two-stage learning rate and weight decay schedule. - 2. **Supervised Fine-tuning (SFT):** Fine-tuned on instruction-following and conversational datasets using sum loss aggregation and specific hyperparameter tuning. - 3. **Direct Preference Optimization (DPO):** Aligned with human preferences using preference pairs. + 1. **Pre-training:** Large-scale training on public text/code and synthetic math data using a two-stage learning rate and weight decay schedule. + 2. **Supervised Fine-tuning (SFT):** Fine-tuned on instruction-following and conversational datasets using sum loss aggregation and specific hyperparameter tuning. + 3. **Direct Preference Optimization (DPO):** Aligned with human preferences using preference pairs. * **Tokenizer:** LLaMA 3 Tokenizer (vocab size: 128,256). - ## Usage tips - **VERY IMPORTANT NOTE ON EFFICIENCY** > Please do NOT expect performance efficiency gains (in terms of speed, latency, or energy consumption) when using this model with the standard transformers library. @@ -106,7 +102,6 @@ response = tokenizer.decode(chat_outputs[0][chat_input.shape[-1]:], skip_special print("\nAssistant Response:", response) ``` - ## BitNetConfig [[autodoc]] BitNetConfig diff --git a/docs/source/en/model_doc/blenderbot-small.md b/docs/source/en/model_doc/blenderbot-small.md index 1967013208b0..830db710e039 100644 --- a/docs/source/en/model_doc/blenderbot-small.md +++ b/docs/source/en/model_doc/blenderbot-small.md @@ -55,7 +55,6 @@ found [here](https://github.com/facebookresearch/ParlAI). Blenderbot Small is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left. 
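A minimal sketch of the right-padding tip above for Blenderbot Small; the `facebook/blenderbot_small-90M` checkpoint is an assumption.

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot_small-90M")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/blenderbot_small-90M")

# Absolute position embeddings: pad batched inputs on the right, not the left.
tokenizer.padding_side = "right"
inputs = tokenizer(["How is the weather today?", "Hi!"], padding=True, return_tensors="pt")

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```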
- ## Resources - [Causal language modeling task guide](../tasks/language_modeling) diff --git a/docs/source/en/model_doc/blenderbot.md b/docs/source/en/model_doc/blenderbot.md index 99149c5d948f..168c744235d8 100644 --- a/docs/source/en/model_doc/blenderbot.md +++ b/docs/source/en/model_doc/blenderbot.md @@ -71,7 +71,6 @@ An example: `facebook/blenderbot_small_90M`, have a different architecture and consequently should be used with [BlenderbotSmall](blenderbot-small). - ## Resources - [Causal language modeling task guide](../tasks/language_modeling) diff --git a/docs/source/en/model_doc/blip-2.md b/docs/source/en/model_doc/blip-2.md index fe4e939c2dc8..faaaee7b0840 100644 --- a/docs/source/en/model_doc/blip-2.md +++ b/docs/source/en/model_doc/blip-2.md @@ -26,14 +26,14 @@ rendered properly in your Markdown viewer. The BLIP-2 model was proposed in [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://huggingface.co/papers/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi. BLIP-2 leverages frozen pre-trained image encoders and large language models (LLMs) by training a lightweight, 12-layer Transformer encoder in between them, achieving state-of-the-art performance on various vision-language tasks. Most notably, BLIP-2 improves upon [Flamingo](https://huggingface.co/papers/2204.14198), an 80 billion parameter model, by 8.7% -on zero-shot VQAv2 with 54x fewer trainable parameters. +on zero-shot VQAv2 with 54x fewer trainable parameters. The abstract from the paper is the following: *The cost of vision-and-language pre-training has become increasingly prohibitive due to end-to-end training of large-scale models. This paper proposes BLIP-2, a generic and efficient pre-training strategy that bootstraps vision-language pre-training from off-the-shelf frozen pre-trained image encoders and frozen large language models. BLIP-2 bridges the modality gap with a lightweight Querying Transformer, which is pre-trained in two stages. The first stage bootstraps vision-language representation learning from a frozen image encoder. The second stage bootstraps vision-to-language generative learning from a frozen language model. BLIP-2 achieves state-of-the-art performance on various vision-language tasks, despite having significantly fewer trainable parameters than existing methods. For example, our model outperforms Flamingo80B by 8.7% on zero-shot VQAv2 with 54x fewer trainable parameters. We also demonstrate the model's emerging capabilities of zero-shot image-to-text generation that can follow natural language instructions.* +alt="drawing" width="600"/> BLIP-2 architecture. Taken from the original paper. diff --git a/docs/source/en/model_doc/blip.md b/docs/source/en/model_doc/blip.md index 13a2a5731a5f..5e727050f6ee 100644 --- a/docs/source/en/model_doc/blip.md +++ b/docs/source/en/model_doc/blip.md @@ -25,7 +25,6 @@ rendered properly in your Markdown viewer. [BLIP](https://huggingface.co/papers/2201.12086) (Bootstrapped Language-Image Pretraining) is a vision-language pretraining (VLP) framework designed for *both* understanding and generation tasks. Most existing pretrained models are only good at one or the other. It uses a captioner to generate captions and a filter to remove the noisy captions. This increases training data quality and more effectively uses the messy web data. 
- You can find all the original BLIP checkpoints under the [BLIP](https://huggingface.co/collections/Salesforce/blip-models-65242f40f1491fbf6a9e9472) collection. > [!TIP] @@ -129,7 +128,7 @@ Refer to this [notebook](https://github.com/huggingface/notebooks/blob/main/exam ## BlipTextLMHeadModel [[autodoc]] BlipTextLMHeadModel -- forward + - forward ## BlipVisionModel diff --git a/docs/source/en/model_doc/bloom.md b/docs/source/en/model_doc/bloom.md index 805379338e32..51e2970c25f6 100644 --- a/docs/source/en/model_doc/bloom.md +++ b/docs/source/en/model_doc/bloom.md @@ -43,17 +43,19 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h - [`BloomForCausalLM`] is supported by this [causal language modeling example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#gpt-2gpt-and-causal-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb). See also: + - [Causal language modeling task guide](../tasks/language_modeling) - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) - [Question answering task guide](../tasks/question_answering) - ⚡️ Inference + - A blog on [Optimization story: Bloom inference](https://huggingface.co/blog/bloom-inference-optimization). - A blog on [Incredibly Fast BLOOM Inference with DeepSpeed and Accelerate](https://huggingface.co/blog/bloom-inference-pytorch-scripts). ⚙️ Training + - A blog on [The Technology Behind BLOOM Training](https://huggingface.co/blog/bloom-megatron-deepspeed). ## BloomConfig diff --git a/docs/source/en/model_doc/blt.md b/docs/source/en/model_doc/blt.md new file mode 100644 index 000000000000..254cf6c0f44a --- /dev/null +++ b/docs/source/en/model_doc/blt.md @@ -0,0 +1,97 @@ + +*This model was released on 2024-12-13 and added to Hugging Face Transformers on 2025-09-19.* + +
+
+ PyTorch + Flax + FlashAttention + SDPA +
+
+ +# Byte Latent Transformer (BLT) + +## Overview + +The BLT model was proposed in [Byte Latent Transformer: Patches Scale Better Than Tokens](https://huggingface.co/papers/2412.09871) by Artidoro Pagnoni, Ram Pasunuru, Pedro Rodriguez, John Nguyen, Benjamin Muller, Margaret Li, Chunting Zhou, Lili Yu, Jason Weston, Luke Zettlemoyer, Gargi Ghosh, Mike Lewis, Ari Holtzman, Srinivasan Iyer. +BLT is a byte-level LLM that achieves tokenization-level performance through entropy-based dynamic patching. + +The abstract from the paper is the following: + +*We introduce the Byte Latent Transformer (BLT), a new byte-level LLM architecture that, for the first time, matches tokenization-based LLM performance at scale with significant improvements in inference +efficiency and robustness. BLT encodes bytes into dynamically sized patches, which serve as the primary units of computation. Patches are segmented based on the entropy of the next byte, allocating +more compute and model capacity where increased data complexity demands it. We present the first flop controlled scaling study of byte-level models up to 8B parameters and 4T training bytes. Our results demonstrate the feasibility of scaling models trained on raw bytes without a fixed vocabulary. Both training and inference efficiency improve due to dynamically selecting long patches when data is predictable, along with qualitative improvements on reasoning and long tail generalization. Overall, for fixed inference costs, BLT shows significantly better scaling than tokenization-based models, by simultaneously growing both patch and model size.* + +## Usage tips + +- **Dual Model Architecture**: BLT consists of two separate trained models: + - **Patcher (Entropy Model)**: A smaller transformer model that predicts byte-level entropy to determine patch boundaries and segment input. + - **Main Transformer Model**: The primary model that processes the patches through a Local Encoder, Global Transformer, and Local Decoder. + +- **Dynamic Patching**: The model uses entropy-based dynamic patching where: + - High-entropy regions (complex data) get shorter patches with more computational attention + - Low-entropy regions (predictable data) get longer patches for efficiency + - This allows the model to allocate compute resources where they're most needed + +- **Local Encoder**: Processes byte sequences with cross-attention to patch embeddings +- **Global Transformer**: Processes patch-level representations with full attention across patches +- **Local Decoder**: Generates output with cross-attention back to the original byte sequence + +- **Byte-Level Tokenizer**: Unlike traditional tokenizers that use learned vocabularies, BLT's tokenizer simply converts text to UTF-8 bytes and maps each byte to a token ID. There is no need for a vocabulary. + +The model can be loaded via: + + + +```python +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM + +tokenizer = AutoTokenizer.from_pretrained("itazap/blt-1b-hf") +model = AutoModelForCausalLM.from_pretrained( + "itazap/blt-1b-hf", + device_map="auto", +) + +prompt = "my name is" +inputs = tokenizer(prompt, return_tensors="pt").to(model.device) + +generated_ids = model.generate( + **inputs, max_new_tokens=50, do_sample=False, use_cache=False +) + +print(tokenizer.decode(generated_ids[0])) +``` + + + +This model was contributed by [itazap](https://huggingface.co/). +The original code can be found [here]().
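A minimal sketch of the byte-level idea behind the tokenizer described above; this is plain Python only and ignores any special tokens or id offsets the actual BLT tokenizer may add.

```python
# Byte-level "tokenization": the sequence length is the number of UTF-8 bytes,
# not the number of words or subwords.
text = "héllo"
byte_ids = list(text.encode("utf-8"))
print(byte_ids)                         # [104, 195, 169, 108, 108, 111] – 6 bytes for 5 characters
print(bytes(byte_ids).decode("utf-8"))  # round-trips back to 'héllo'
```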
+ +## BltConfig + +[[autodoc]] BltConfig + +## BltModel + +[[autodoc]] BltModel + - forward + +## BltForCausalLM + +[[autodoc]] BltForCausalLM + - forward diff --git a/docs/source/en/model_doc/bridgetower.md index 6a2b09e263ab..861dd32c16fe 100644 --- a/docs/source/en/model_doc/bridgetower.md +++ b/docs/source/en/model_doc/bridgetower.md @@ -26,7 +26,7 @@ rendered properly in your Markdown viewer. The BridgeTower model was proposed in [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://huggingface.co/papers/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. The goal of this model is to build a bridge between each uni-modal encoder and the cross-modal encoder to enable comprehensive and detailed interaction at each layer of the cross-modal encoder thus achieving remarkable performance on various downstream tasks with almost negligible additional performance and computational costs. -This paper has been accepted to the [AAAI'23](https://aaai.org/Conferences/AAAI-23/) conference. +This paper has been accepted to the [AAAI'23](https://aaai.org/Conferences/AAAI-23/) conference. The abstract from the paper is the following: @@ -54,6 +54,7 @@ The [`BridgeTowerProcessor`] wraps [`RobertaTokenizer`] and [`BridgeTowerImagePr encode the text and prepare the images respectively. The following example shows how to run contrastive learning using [`BridgeTowerProcessor`] and [`BridgeTowerForContrastiveLearning`]. + ```python >>> from transformers import BridgeTowerProcessor, BridgeTowerForContrastiveLearning >>> import requests @@ -76,6 +77,7 @@ The following example shows how to run contrastive learning using [`BridgeTowerP ``` The following example shows how to run image-text retrieval using [`BridgeTowerProcessor`] and [`BridgeTowerForImageAndTextRetrieval`]. + ```python >>> from transformers import BridgeTowerProcessor, BridgeTowerForImageAndTextRetrieval >>> import requests @@ -130,7 +132,6 @@ Tips: - Please refer to [Table 5](https://huggingface.co/papers/2206.08657) for BridgeTower's performance on Image Retrieval and other downstream tasks. - The PyTorch version of this model is only available in torch 1.10 and higher. - ## BridgeTowerConfig [[autodoc]] BridgeTowerConfig @@ -177,4 +178,3 @@ Tips: [[autodoc]] BridgeTowerForImageAndTextRetrieval - forward - diff --git a/docs/source/en/model_doc/bros.md index aeb3dd76e52b..4ef3d3737ae2 100644 --- a/docs/source/en/model_doc/bros.md +++ b/docs/source/en/model_doc/bros.md @@ -57,7 +57,6 @@ def expand_and_normalize_bbox(bboxes, doc_width, doc_height): - [`~transformers.BrosForTokenClassification.forward`, `~transformers.BrosSpadeEEForTokenClassification.forward`, `~transformers.BrosSpadeELForTokenClassification.forward`] require not only `input_ids` and `bbox` but also `box_first_token_mask` for loss calculation. It is a mask to filter out non-first tokens of each box. You can obtain this mask by saving start token indices of bounding boxes when creating `input_ids` from words.
You can make `box_first_token_mask` with following code, - ```python def make_box_first_token_mask(bboxes, words, tokenizer, max_seq_length=512): @@ -102,7 +101,6 @@ def make_box_first_token_mask(bboxes, words, tokenizer, max_seq_length=512): [[autodoc]] BrosModel - forward - ## BrosForTokenClassification [[autodoc]] BrosForTokenClassification diff --git a/docs/source/en/model_doc/camembert.md b/docs/source/en/model_doc/camembert.md index ddce66f2dedb..8affbd73a570 100644 --- a/docs/source/en/model_doc/camembert.md +++ b/docs/source/en/model_doc/camembert.md @@ -16,10 +16,10 @@ rendered properly in your Markdown viewer. *This model was released on 2019-11-10 and added to Hugging Face Transformers on 2020-11-16.*
-
- PyTorch +
+ PyTorch SDPA -
+
# CamemBERT @@ -50,6 +50,7 @@ from transformers import pipeline pipeline = pipeline("fill-mask", model="camembert-base", dtype=torch.float16, device=0) pipeline("Le camembert est un délicieux fromage .") ``` +
@@ -72,6 +73,7 @@ predicted_token = tokenizer.decode(predicted_token_id) print(f"The predicted token is: {predicted_token}") ``` + @@ -84,7 +86,6 @@ echo -e "Le camembert est un délicieux fromage ." | transformers run --ta - Quantization reduces the memory burden of large models by representing weights in lower precision. Refer to the [Quantization](../quantization/overview) overview for available options. The example below uses [bitsandbytes](../quantization/bitsandbytes) quantization to quantize the weights to 8-bits. diff --git a/docs/source/en/model_doc/canine.md b/docs/source/en/model_doc/canine.md index e1d8bb7f7f68..29a926c305cd 100644 --- a/docs/source/en/model_doc/canine.md +++ b/docs/source/en/model_doc/canine.md @@ -23,7 +23,7 @@ rendered properly in your Markdown viewer. # CANINE -[CANINE](https://huggingface.co/papers/2103.06874) is a tokenization-free Transformer. It skips the usual step of splitting text into subwords or wordpieces and processes text character by character. That means it works directly with raw Unicode, making it especially useful for languages with complex or inconsistent tokenization rules and even noisy inputs like typos. Since working with characters means handling longer sequences, CANINE uses a smart trick. The model compresses the input early on (called downsampling) so the transformer doesn’t have to process every character individually. This keeps things fast and efficient. +[CANINE](https://huggingface.co/papers/2103.06874) is a tokenization-free Transformer. It skips the usual step of splitting text into subwords or wordpieces and processes text character by character. That means it works directly with raw Unicode, making it especially useful for languages with complex or inconsistent tokenization rules and even noisy inputs like typos. Since working with characters means handling longer sequences, CANINE uses a smart trick. The model compresses the input early on (called downsampling) so the transformer doesn't have to process every character individually. This keeps things fast and efficient. You can find all the original CANINE checkpoints under the [Google](https://huggingface.co/google?search_models=canine) organization. @@ -86,6 +86,7 @@ echo -e "Plant create energy through a process known as photosynthesis." | trans inputs = ["Life is like a box of chocolates.", "You never know what you gonna get."] encoding = tokenizer(inputs, padding="longest", truncation=True, return_tensors="pt") ``` + - CANINE is primarily designed to be fine-tuned on a downstream task. The pretrained model can be used for either masked language modeling or next sentence prediction. ## CanineConfig diff --git a/docs/source/en/model_doc/chameleon.md b/docs/source/en/model_doc/chameleon.md index eb71349115ed..dc573faa1112 100644 --- a/docs/source/en/model_doc/chameleon.md +++ b/docs/source/en/model_doc/chameleon.md @@ -28,7 +28,6 @@ rendered properly in your Markdown viewer. The Chameleon model was proposed in [Chameleon: Mixed-Modal Early-Fusion Foundation Models ](https://huggingface.co/papers/2405.09818) by META AI Chameleon Team. Chameleon is a Vision-Language Model that use vector quantization to tokenize images which enables the model to generate multimodal output. The model takes images and texts as input, including an interleaved format, and generates textual response. Image generation module is not released yet. 
- The abstract from the paper is the following: *We present Chameleon, a family of early-fusion token-based mixed-modal models capable of understanding and generating images and text in any arbitrary sequence. We outline a stable training @@ -43,7 +42,6 @@ including Gemini Pro and GPT-4V, according to human judgments on a new long-form generation evaluation, where either the prompt or outputs contain mixed sequences of both images and text. Chameleon marks a significant step forward in unified modeling of full multimodal documents* - drawing @@ -52,7 +50,6 @@ alt="drawing" width="600"/> This model was contributed by [joaogante](https://huggingface.co/joaogante) and [RaushanTurganbay](https://huggingface.co/RaushanTurganbay). The original code can be found [here](https://github.com/facebookresearch/chameleon). - ## Usage tips - We advise users to use `padding_side="left"` when computing batched generation as it leads to more accurate results. Simply make sure to set `processor.tokenizer.padding_side = "left"` before generating. diff --git a/docs/source/en/model_doc/chinese_clip.md b/docs/source/en/model_doc/chinese_clip.md index 7ed4d503c00f..96b094ccd91b 100644 --- a/docs/source/en/model_doc/chinese_clip.md +++ b/docs/source/en/model_doc/chinese_clip.md @@ -119,4 +119,4 @@ Currently, following scales of pretrained Chinese-CLIP models are available on ## ChineseCLIPVisionModel [[autodoc]] ChineseCLIPVisionModel - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/clipseg.md b/docs/source/en/model_doc/clipseg.md index e27d49ffe484..099fd4fb1bac 100644 --- a/docs/source/en/model_doc/clipseg.md +++ b/docs/source/en/model_doc/clipseg.md @@ -47,7 +47,7 @@ can be formulated. Finally, we find our system to adapt well to generalized queries involving affordances or properties* +alt="drawing" width="600"/> CLIPSeg overview. Taken from the original paper. @@ -106,4 +106,4 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h ## CLIPSegForImageSegmentation [[autodoc]] CLIPSegForImageSegmentation - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/clvp.md b/docs/source/en/model_doc/clvp.md index 926438a3c1f5..eead4a546435 100644 --- a/docs/source/en/model_doc/clvp.md +++ b/docs/source/en/model_doc/clvp.md @@ -29,29 +29,25 @@ The abstract from the paper is the following: *In recent years, the field of image generation has been revolutionized by the application of autoregressive transformers and DDPMs. These approaches model the process of image generation as a step-wise probabilistic processes and leverage large amounts of compute and data to learn the image distribution. This methodology of improving performance need not be confined to images. This paper describes a way to apply advances in the image generative domain to speech synthesis. The result is TorToise - an expressive, multi-voice text-to-speech system.* - This model was contributed by [Susnato Dhar](https://huggingface.co/susnato). The original code can be found [here](https://github.com/neonbjb/tortoise-tts). - ## Usage tips 1. CLVP is an integral part of the Tortoise TTS model. 2. CLVP can be used to compare different generated speech candidates with the provided text, and the best speech tokens are forwarded to the diffusion model. 3. The use of the [`ClvpModelForConditionalGeneration.generate()`] method is strongly recommended for tortoise usage. -4. 
Note that the CLVP model expects the audio to be sampled at 22.05 kHz contrary to other audio models which expects 16 kHz. - +4. Note that the CLVP model expects the audio to be sampled at 22.05 kHz contrary to other audio models which expects 16 kHz. ## Brief Explanation: - The [`ClvpTokenizer`] tokenizes the text input, and the [`ClvpFeatureExtractor`] extracts the log mel-spectrogram from the desired audio. - [`ClvpConditioningEncoder`] takes those text tokens and audio representations and converts them into embeddings conditioned on the text and audio. - The [`ClvpForCausalLM`] uses those embeddings to generate multiple speech candidates. -- Each speech candidate is passed through the speech encoder ([`ClvpEncoder`]) which converts them into a vector representation, and the text encoder ([`ClvpEncoder`]) converts the text tokens into the same latent space. -- At the end, we compare each speech vector with the text vector to see which speech vector is most similar to the text vector. +- Each speech candidate is passed through the speech encoder ([`ClvpEncoder`]) which converts them into a vector representation, and the text encoder ([`ClvpEncoder`]) converts the text tokens into the same latent space. +- At the end, we compare each speech vector with the text vector to see which speech vector is most similar to the text vector. - [`ClvpModelForConditionalGeneration.generate()`] compresses all of the logic described above into a single method. - Example : ```python @@ -74,7 +70,6 @@ Example : >>> generated_output = model.generate(**processor_output) ``` - ## ClvpConfig [[autodoc]] ClvpConfig @@ -128,4 +123,3 @@ Example : ## ClvpDecoder [[autodoc]] ClvpDecoder - diff --git a/docs/source/en/model_doc/code_llama.md b/docs/source/en/model_doc/code_llama.md index 60e9cb4c3cf2..a46e1f05b32a 100644 --- a/docs/source/en/model_doc/code_llama.md +++ b/docs/source/en/model_doc/code_llama.md @@ -143,6 +143,7 @@ visualizer("""def func(a, b): - Infilling is only available in the 7B and 13B base models, and not in the Python, Instruct, 34B, or 70B models. - Use the `` token where you want your input to be filled. The tokenizer splits this token to create a formatted input string that follows the [original training pattern](https://github.com/facebookresearch/codellama/blob/cb51c14ec761370ba2e2bc351374a79265d0465e/llama/generation.py#L402). This is more robust than preparing the pattern yourself. + ```py from transformers import LlamaForCausalLM, CodeLlamaTokenizer @@ -158,6 +159,7 @@ visualizer("""def func(a, b): filling = tokenizer.batch_decode(generated_ids[:, input_ids.shape[1]:], skip_special_tokens = True)[0] print(PROMPT.replace("", filling)) ``` + - Use `bfloat16` for further training or fine-tuning and `float16` for inference. - The `BOS` character is not used for infilling when encoding the prefix or suffix, but only at the beginning of each prompt. - The tokenizer is a byte-pair encoding model based on [SentencePiece](https://github.com/google/sentencepiece). During decoding, if the first token is the start of the word (for example, “Banana”), the tokenizer doesn’t prepend the prefix space to the string. 
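A minimal sketch of the precision note above for Code Llama; the `codellama/CodeLlama-7b-hf` checkpoint and `device_map="auto"` are assumptions.

```python
import torch
from transformers import LlamaForCausalLM

# bfloat16 for further training or fine-tuning ...
model = LlamaForCausalLM.from_pretrained(
    "codellama/CodeLlama-7b-hf", dtype=torch.bfloat16, device_map="auto"
)

# ... float16 for inference.
model = LlamaForCausalLM.from_pretrained(
    "codellama/CodeLlama-7b-hf", dtype=torch.float16, device_map="auto"
)
```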
diff --git a/docs/source/en/model_doc/codegen.md b/docs/source/en/model_doc/codegen.md index e5ad3863b67c..c341154921e3 100644 --- a/docs/source/en/model_doc/codegen.md +++ b/docs/source/en/model_doc/codegen.md @@ -29,7 +29,7 @@ CodeGen is an autoregressive language model for program synthesis trained sequen The abstract from the paper is the following: -*Program synthesis strives to generate a computer program as a solution to a given problem specification. We propose a conversational program synthesis approach via large language models, which addresses the challenges of searching over a vast program space and user intent specification faced in prior approaches. Our new approach casts the process of writing a specification and program as a multi-turn conversation between a user and a system. It treats program synthesis as a sequence prediction problem, in which the specification is expressed in natural language and the desired program is conditionally sampled. We train a family of large language models, called CodeGen, on natural language and programming language data. With weak supervision in the data and the scaling up of data size and model size, conversational capacities emerge from the simple autoregressive language modeling. To study the model behavior on conversational program synthesis, we develop a multi-turn programming benchmark (MTPB), where solving each problem requires multi-step synthesis via multi-turn conversation between the user and the model. Our findings show the emergence of conversational capabilities and the effectiveness of the proposed conversational program synthesis paradigm. In addition, our model CodeGen (with up to 16B parameters trained on TPU-v4) outperforms OpenAI's Codex on the HumanEval benchmark. We make the training library JaxFormer including checkpoints available as open source contribution: [this https URL](https://github.com/salesforce/codegen).* +*Program synthesis strives to generate a computer program as a solution to a given problem specification. We propose a conversational program synthesis approach via large language models, which addresses the challenges of searching over a vast program space and user intent specification faced in prior approaches. Our new approach casts the process of writing a specification and program as a multi-turn conversation between a user and a system. It treats program synthesis as a sequence prediction problem, in which the specification is expressed in natural language and the desired program is conditionally sampled. We train a family of large language models, called CodeGen, on natural language and programming language data. With weak supervision in the data and the scaling up of data size and model size, conversational capacities emerge from the simple autoregressive language modeling. To study the model behavior on conversational program synthesis, we develop a multi-turn programming benchmark (MTPB), where solving each problem requires multi-step synthesis via multi-turn conversation between the user and the model. Our findings show the emergence of conversational capabilities and the effectiveness of the proposed conversational program synthesis paradigm. In addition, our model CodeGen (with up to 16B parameters trained on TPU-v4) outperforms OpenAI's Codex on the HumanEval benchmark. 
We make the training library JaxFormer including checkpoints available as open source contribution: [this https URL](https://github.com/salesforce/codegen).* This model was contributed by [Hiroaki Hayashi](https://huggingface.co/rooa). The original code can be found [here](https://github.com/salesforce/codegen). @@ -39,7 +39,7 @@ The original code can be found [here](https://github.com/salesforce/codegen). * CodeGen model [checkpoints](https://huggingface.co/models?other=codegen) are available on different pre-training data with variable sizes. * The format is: `Salesforce/codegen-{size}-{data}`, where * `size`: `350M`, `2B`, `6B`, `16B` - * `data`: + * `data`: * `nl`: Pre-trained on the Pile * `multi`: Initialized with `nl`, then further pre-trained on multiple programming languages data * `mono`: Initialized with `multi`, then further pre-trained on Python data diff --git a/docs/source/en/model_doc/cohere.md b/docs/source/en/model_doc/cohere.md index 9fc6d266d69a..022a178b5cfa 100644 --- a/docs/source/en/model_doc/cohere.md +++ b/docs/source/en/model_doc/cohere.md @@ -22,14 +22,12 @@ rendered properly in your Markdown viewer.
- # Cohere Cohere [Command-R](https://cohere.com/blog/command-r) is a 35B parameter multilingual large language model designed for long context tasks like retrieval-augmented generation (RAG) and calling external APIs and tools. The model is specifically trained for grounded generation and supports both single-step and multi-step tool use. It supports a context length of 128K tokens. You can find all the original Command-R checkpoints under the [Command Models](https://huggingface.co/collections/CohereForAI/command-models-67652b401665205e17b192ad) collection. - > [!TIP] > Click on the Cohere models in the right sidebar for more examples of how to apply Cohere to different language tasks. @@ -123,9 +121,9 @@ visualizer("Plants create energy through a process known as")
- ## Notes -- Don’t use the dtype parameter in [`~AutoModel.from_pretrained`] if you’re using FlashAttention-2 because it only supports fp16 or bf16. You should use [Automatic Mixed Precision](https://pytorch.org/tutorials/recipes/recipes/amp_recipe.html), set fp16 or bf16 to True if using [`Trainer`], or use [torch.autocast](https://pytorch.org/docs/stable/amp.html#torch.autocast). + +- Don't use the dtype parameter in [`~AutoModel.from_pretrained`] if you're using FlashAttention-2 because it only supports fp16 or bf16. You should use [Automatic Mixed Precision](https://pytorch.org/tutorials/recipes/recipes/amp_recipe.html), set fp16 or bf16 to True if using [`Trainer`], or use [torch.autocast](https://pytorch.org/docs/stable/amp.html#torch.autocast). ## CohereConfig @@ -145,7 +143,6 @@ visualizer("Plants create energy through a process known as") [[autodoc]] CohereModel - forward - ## CohereForCausalLM [[autodoc]] CohereForCausalLM diff --git a/docs/source/en/model_doc/cohere2.md b/docs/source/en/model_doc/cohere2.md index bcfa05e98d19..52555d6ae558 100644 --- a/docs/source/en/model_doc/cohere2.md +++ b/docs/source/en/model_doc/cohere2.md @@ -22,7 +22,6 @@ rendered properly in your Markdown viewer.
- # Cohere 2 [Cohere Command R7B](https://cohere.com/blog/command-r7b) is an open weights research release of a 7B billion parameter model. It is a multilingual model trained on 23 languages and has a context window of 128k. The model features three layers with sliding window attention and ROPE for efficient local context modeling and relative positional encoding. A fourth layer uses global attention without positional embeddings, enabling unrestricted token interactions across the entire sequence. @@ -31,7 +30,6 @@ This model is optimized for speed, cost-performance, and compute resources. You can find all the original Command-R checkpoints under the [Command Models](https://huggingface.co/collections/CohereForAI/command-models-67652b401665205e17b192ad) collection. - > [!TIP] > Click on the Cohere models in the right sidebar for more examples of how to apply Cohere to different language tasks. @@ -136,7 +134,6 @@ print(tokenizer.decode(output[0], skip_special_tokens=True)) [[autodoc]] Cohere2Model - forward - ## Cohere2ForCausalLM [[autodoc]] Cohere2ForCausalLM diff --git a/docs/source/en/model_doc/cohere2_vision.md b/docs/source/en/model_doc/cohere2_vision.md index 2e12ff3e4767..e466ce6a5f09 100644 --- a/docs/source/en/model_doc/cohere2_vision.md +++ b/docs/source/en/model_doc/cohere2_vision.md @@ -113,6 +113,7 @@ outputs = pipe(text=messages, max_new_tokens=300, return_full_text=False) print(outputs) ``` + diff --git a/docs/source/en/model_doc/cpm.md b/docs/source/en/model_doc/cpm.md index ccfa1596bad4..275f5629db13 100644 --- a/docs/source/en/model_doc/cpm.md +++ b/docs/source/en/model_doc/cpm.md @@ -42,7 +42,6 @@ NLP tasks in the settings of few-shot (even zero-shot) learning.* This model was contributed by [canwenxu](https://huggingface.co/canwenxu). The original implementation can be found here: https://github.com/TsinghuaAI/CPM-Generate - CPM's architecture is the same as GPT-2, except for tokenization method. Refer to [GPT-2 documentation](gpt2) for @@ -50,7 +49,6 @@ API reference information. - ## CpmTokenizer [[autodoc]] CpmTokenizer diff --git a/docs/source/en/model_doc/cpmant.md b/docs/source/en/model_doc/cpmant.md index 6f13f785ac1e..bb70a369bb7f 100644 --- a/docs/source/en/model_doc/cpmant.md +++ b/docs/source/en/model_doc/cpmant.md @@ -45,8 +45,8 @@ This model was contributed by [OpenBMB](https://huggingface.co/openbmb). The ori [[autodoc]] CpmAntModel - all - + ## CpmAntForCausalLM [[autodoc]] CpmAntForCausalLM - - all \ No newline at end of file + - all diff --git a/docs/source/en/model_doc/csm.md b/docs/source/en/model_doc/csm.md index 1ee2b63dd715..162832470482 100644 --- a/docs/source/en/model_doc/csm.md +++ b/docs/source/en/model_doc/csm.md @@ -346,7 +346,6 @@ out.loss.backward() This model was contributed by [Eustache Le Bihan](https://huggingface.co/eustlb). The original code can be found [here](https://github.com/SesameAILabs/csm). - ## CsmConfig [[autodoc]] CsmConfig diff --git a/docs/source/en/model_doc/ctrl.md b/docs/source/en/model_doc/ctrl.md index e5b48d638b68..6244ee0a59ef 100644 --- a/docs/source/en/model_doc/ctrl.md +++ b/docs/source/en/model_doc/ctrl.md @@ -55,7 +55,6 @@ This model was contributed by [keskarnitishr](https://huggingface.co/keskarnitis pre-computed values in the context of text generation. See the [`forward`](model_doc/ctrl#transformers.CTRLModel.forward) method for more information on the usage of this argument. 
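A minimal sketch of the CTRL tip above about reusing pre-computed key/value states during generation; the `Salesforce/ctrl` checkpoint and the `Opinion` control code are assumptions.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
model = AutoModelForCausalLM.from_pretrained("Salesforce/ctrl")

inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
with torch.no_grad():
    out = model(**inputs, use_cache=True)

# Reuse the cached key/value states instead of re-encoding the whole prefix
# when feeding the next token.
next_token = out.logits[:, -1].argmax(-1, keepdim=True)
with torch.no_grad():
    out = model(input_ids=next_token, past_key_values=out.past_key_values, use_cache=True)
```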
- ## Resources - [Text classification task guide](../tasks/sequence_classification) diff --git a/docs/source/en/model_doc/d_fine.md b/docs/source/en/model_doc/d_fine.md index 9dffde75ebc7..05e855d333b5 100644 --- a/docs/source/en/model_doc/d_fine.md +++ b/docs/source/en/model_doc/d_fine.md @@ -24,13 +24,13 @@ Yansong Peng, Hebei Li, Peixi Wu, Yueyi Zhang, Xiaoyan Sun, Feng Wu The abstract from the paper is the following: -*We introduce D-FINE, a powerful real-time object detector that achieves outstanding localization precision by redefining the bounding box regression task in DETR models. D-FINE comprises two key components: Fine-grained Distribution Refinement (FDR) and Global Optimal Localization Self-Distillation (GO-LSD). +*We introduce D-FINE, a powerful real-time object detector that achieves outstanding localization precision by redefining the bounding box regression task in DETR models. D-FINE comprises two key components: Fine-grained Distribution Refinement (FDR) and Global Optimal Localization Self-Distillation (GO-LSD). FDR transforms the regression process from predicting fixed coordinates to iteratively refining probability distributions, providing a fine-grained intermediate representation that significantly enhances localization accuracy. GO-LSD is a bidirectional optimization strategy that transfers localization knowledge from refined distributions to shallower layers through self-distillation, while also simplifying the residual prediction tasks for deeper layers. Additionally, D-FINE incorporates lightweight optimizations in computationally intensive modules and operations, achieving a better balance between speed and accuracy. Specifically, D-FINE-L / X achieves 54.0% / 55.8% AP on the COCO dataset at 124 / 78 FPS on an NVIDIA T4 GPU. When pretrained on Objects365, D-FINE-L / X attains 57.1% / 59.3% AP, surpassing all existing real-time detectors. Furthermore, our method significantly enhances the performance of a wide range of DETR models by up to 5.3% AP with negligible extra parameters and training costs. Our code and pretrained models: this https URL.* -This model was contributed by [VladOS95-cyber](https://github.com/VladOS95-cyber). +This model was contributed by [VladOS95-cyber](https://github.com/VladOS95-cyber). The original code can be found [here](https://github.com/Peterande/D-FINE). 
-## Usage tips +## Usage tips ```python >>> import torch diff --git a/docs/source/en/model_doc/dab-detr.md b/docs/source/en/model_doc/dab-detr.md index 32b27d4b2479..e3262f140f4d 100644 --- a/docs/source/en/model_doc/dab-detr.md +++ b/docs/source/en/model_doc/dab-detr.md @@ -77,8 +77,10 @@ for result in results: box = [round(i, 2) for i in box.tolist()] print(f"{model.config.id2label[label]}: {score:.2f} {box}") ``` + This should output -``` + +```text cat: 0.87 [14.7, 49.39, 320.52, 469.28] remote: 0.86 [41.08, 72.37, 173.39, 117.2] cat: 0.86 [344.45, 19.43, 639.85, 367.86] @@ -89,6 +91,7 @@ couch: 0.59 [-0.04, 1.34, 639.9, 477.09] There are three other ways to instantiate a DAB-DETR model (depending on what you prefer): Option 1: Instantiate DAB-DETR with pre-trained weights for entire model + ```py >>> from transformers import DabDetrForObjectDetection @@ -96,19 +99,21 @@ Option 1: Instantiate DAB-DETR with pre-trained weights for entire model ``` Option 2: Instantiate DAB-DETR with randomly initialized weights for Transformer, but pre-trained weights for backbone + ```py >>> from transformers import DabDetrConfig, DabDetrForObjectDetection >>> config = DabDetrConfig() >>> model = DabDetrForObjectDetection(config) ``` + Option 3: Instantiate DAB-DETR with randomly initialized weights for backbone + Transformer + ```py >>> config = DabDetrConfig(use_pretrained_backbone=False) >>> model = DabDetrForObjectDetection(config) ``` - ## DabDetrConfig [[autodoc]] DabDetrConfig diff --git a/docs/source/en/model_doc/dac.md b/docs/source/en/model_doc/dac.md index e17cc69fc37a..94f70fdff32a 100644 --- a/docs/source/en/model_doc/dac.md +++ b/docs/source/en/model_doc/dac.md @@ -23,7 +23,6 @@ rendered properly in your Markdown viewer. ## Overview - The DAC model was proposed in [Descript Audio Codec: High-Fidelity Audio Compression with Improved RVQGAN](https://huggingface.co/papers/2306.06546) by Rithesh Kumar, Prem Seetharaman, Alejandro Luebs, Ishaan Kumar, Kundan Kumar. The Descript Audio Codec (DAC) model is a powerful tool for compressing audio data, making it highly efficient for storage and transmission. By compressing 44.1 KHz audio into tokens at just 8kbps bandwidth, the DAC model enables high-quality audio processing while significantly reducing the data footprint. This is particularly useful in scenarios where bandwidth is limited or storage space is at a premium, such as in streaming applications, remote conferencing, and archiving large audio datasets. @@ -35,7 +34,6 @@ The abstract from the paper is the following: This model was contributed by [Kamil Akesbi](https://huggingface.co/kamilakesbi). The original code can be found [here](https://github.com/descriptinc/descript-audio-codec/tree/main?tab=readme-ov-file). - ## Model structure The Descript Audio Codec (DAC) model is structured into three distinct stages: @@ -44,11 +42,11 @@ The Descript Audio Codec (DAC) model is structured into three distinct stages: 2. Residual Vector Quantizer (RVQ) Model: Working in tandem with the encoder, this model quantizes the latent codes of the audio, refining the compression and ensuring high-quality reconstruction. 3. Decoder Model: This final stage reconstructs the audio from its compressed form, restoring it to a state that closely resembles the original input. 
-## Usage example +## Usage example -Here is a quick example of how to encode and decode an audio using this model: +Here is a quick example of how to encode and decode an audio using this model: -```python +```python >>> from datasets import load_dataset, Audio >>> from transformers import DacModel, AutoProcessor >>> librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") diff --git a/docs/source/en/model_doc/data2vec.md b/docs/source/en/model_doc/data2vec.md index f975c0d35b35..4018a98bb69d 100644 --- a/docs/source/en/model_doc/data2vec.md +++ b/docs/source/en/model_doc/data2vec.md @@ -68,7 +68,7 @@ SDPA is used by default for `torch>=2.1.1` when an implementation is available, The SDPA implementation is currently available for the Data2VecAudio and Data2VecVision models. -``` +```py from transformers import Data2VecVisionForImageClassification model = Data2VecVisionForImageClassification.from_pretrained("facebook/data2vec-vision-base", attn_implementation="sdpa", dtype=torch.float16) ... @@ -104,6 +104,7 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h - [`Data2VecVisionForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). **Data2VecText documentation resources** + - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) - [Question answering task guide](../tasks/question_answering) @@ -112,10 +113,12 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h - [Multiple choice task guide](../tasks/multiple_choice) **Data2VecAudio documentation resources** + - [Audio classification task guide](../tasks/audio_classification) - [Automatic speech recognition task guide](../tasks/asr) **Data2VecVision documentation resources** + - [Image classification](../tasks/image_classification) - [Semantic segmentation](../tasks/semantic_segmentation) diff --git a/docs/source/en/model_doc/dbrx.md b/docs/source/en/model_doc/dbrx.md index 8b2e5ae75e34..a97e594e415a 100644 --- a/docs/source/en/model_doc/dbrx.md +++ b/docs/source/en/model_doc/dbrx.md @@ -35,7 +35,6 @@ We estimate that this data is at least 2x better token-for-token than the data w This new dataset was developed using the full suite of Databricks tools, including Apache Spark™ and Databricks notebooks for data processing, and Unity Catalog for data management and governance. We used curriculum learning for pretraining, changing the data mix during training in ways we found to substantially improve model quality. - More detailed information about DBRX Instruct and DBRX Base can be found in our [technical blog post](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm). This model was contributed by [eitan-turok](https://huggingface.co/eitanturok) and [abhi-db](https://huggingface.co/abhi-db). The original code can be found [here](https://github.com/databricks/dbrx-instruct), though this may not be up to date. @@ -65,6 +64,7 @@ print(tokenizer.decode(outputs[0])) ``` If you have flash-attention installed (`pip install flash-attn`), it is possible to generate faster. (The HuggingFace documentation for flash-attention can be found [here](https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2).) 
+ ```python from transformers import DbrxForCausalLM, AutoTokenizer import torch @@ -87,6 +87,7 @@ print(tokenizer.decode(outputs[0])) ``` You can also generate faster using the PyTorch scaled dot product attention. (The HuggingFace documentation for scaled dot product attention can be found [here](https://huggingface.co/docs/transformers/perf_infer_gpu_one#pytorch-scaled-dot-product-attention).) + ```python from transformers import DbrxForCausalLM, AutoTokenizer import torch @@ -112,15 +113,12 @@ print(tokenizer.decode(outputs[0])) [[autodoc]] DbrxConfig - ## DbrxModel [[autodoc]] DbrxModel - forward - ## DbrxForCausalLM [[autodoc]] DbrxForCausalLM - forward - diff --git a/docs/source/en/model_doc/deberta-v2.md b/docs/source/en/model_doc/deberta-v2.md index 7fc8bcdc5226..2c8b3ba956c3 100644 --- a/docs/source/en/model_doc/deberta-v2.md +++ b/docs/source/en/model_doc/deberta-v2.md @@ -21,14 +21,12 @@ rendered properly in your Markdown viewer. - # DeBERTa-v2 [DeBERTa-v2](https://huggingface.co/papers/2006.03654) improves on the original [DeBERTa](./deberta) architecture by using a SentencePiece-based tokenizer and a new vocabulary size of 128K. It also adds an additional convolutional layer within the first transformer layer to better learn local dependencies of input tokens. Finally, the position projection and content projection matrices are shared in the attention layer to reduce the number of parameters. You can find all the original [DeBERTa-v2] checkpoints under the [Microsoft](https://huggingface.co/microsoft?search_models=deberta-v2) organization. - > [!TIP] > This model was contributed by [Pengcheng He](https://huggingface.co/DeBERTa). > @@ -86,6 +84,7 @@ print(f"Predicted label: {predicted_label}") ```bash echo -e "DeBERTa-v2 is great at understanding context!" | transformers-cli run --task fill-mask --model microsoft/deberta-v2-xlarge-mnli --device 0 ``` + @@ -119,7 +118,6 @@ print(f"Predicted label: {predicted_label}") ``` - ## DebertaV2Config [[autodoc]] DebertaV2Config diff --git a/docs/source/en/model_doc/deberta.md b/docs/source/en/model_doc/deberta.md index 2d99bdbfd210..08be80c19ff0 100644 --- a/docs/source/en/model_doc/deberta.md +++ b/docs/source/en/model_doc/deberta.md @@ -31,7 +31,6 @@ Even with less training data than RoBERTa, DeBERTa manages to outperform it on s You can find all the original DeBERTa checkpoints under the [Microsoft](https://huggingface.co/microsoft?search_models=deberta) organization. - > [!TIP] > Click on the DeBERTa models in the right sidebar for more examples of how to apply DeBERTa to different language tasks. @@ -93,6 +92,7 @@ echo -e '{"text": "A soccer game with multiple people playing.", "text_pair": "S ## Notes + - DeBERTa uses **relative position embeddings**, so it does not require **right-padding** like BERT. - For best results, use DeBERTa on sentence-level or sentence-pair classification tasks like MNLI, RTE, or SST-2. - If you're using DeBERTa for token-level tasks like masked language modeling, make sure to load a checkpoint specifically pretrained or fine-tuned for token-level tasks. 
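A minimal sketch of the sentence-pair recommendation above for DeBERTa; the `microsoft/deberta-base-mnli` checkpoint is an assumption.

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base-mnli")
model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-base-mnli")

# Pass the premise and hypothesis together as a sentence pair.
inputs = tokenizer(
    "A soccer game with multiple people playing.",
    "Some people are playing a sport.",
    return_tensors="pt",
)
with torch.no_grad():
    logits = model(**inputs).logits
print(model.config.id2label[logits.argmax(-1).item()])
```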
diff --git a/docs/source/en/model_doc/decision_transformer.md b/docs/source/en/model_doc/decision_transformer.md index cdfcd42f9a34..349b8eaae2e7 100644 --- a/docs/source/en/model_doc/decision_transformer.md +++ b/docs/source/en/model_doc/decision_transformer.md @@ -28,14 +28,14 @@ by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael La The abstract from the paper is the following: -*We introduce a framework that abstracts Reinforcement Learning (RL) as a sequence modeling problem. +*We introduce a framework that abstracts Reinforcement Learning (RL) as a sequence modeling problem. This allows us to draw upon the simplicity and scalability of the Transformer architecture, and associated advances - in language modeling such as GPT-x and BERT. In particular, we present Decision Transformer, an architecture that - casts the problem of RL as conditional sequence modeling. Unlike prior approaches to RL that fit value functions or - compute policy gradients, Decision Transformer simply outputs the optimal actions by leveraging a causally masked - Transformer. By conditioning an autoregressive model on the desired return (reward), past states, and actions, our - Decision Transformer model can generate future actions that achieve the desired return. Despite its simplicity, - Decision Transformer matches or exceeds the performance of state-of-the-art model-free offline RL baselines on + in language modeling such as GPT-x and BERT. In particular, we present Decision Transformer, an architecture that + casts the problem of RL as conditional sequence modeling. Unlike prior approaches to RL that fit value functions or + compute policy gradients, Decision Transformer simply outputs the optimal actions by leveraging a causally masked + Transformer. By conditioning an autoregressive model on the desired return (reward), past states, and actions, our + Decision Transformer model can generate future actions that achieve the desired return. Despite its simplicity, + Decision Transformer matches or exceeds the performance of state-of-the-art model-free offline RL baselines on Atari, OpenAI Gym, and Key-to-Door tasks.* This version of the model is for tasks where the state is a vector. @@ -46,7 +46,6 @@ This model was contributed by [edbeeching](https://huggingface.co/edbeeching). T [[autodoc]] DecisionTransformerConfig - ## DecisionTransformerGPT2Model [[autodoc]] DecisionTransformerGPT2Model diff --git a/docs/source/en/model_doc/deepseek_v2.md b/docs/source/en/model_doc/deepseek_v2.md index bcdf65fbe8c0..fcff8521c071 100644 --- a/docs/source/en/model_doc/deepseek_v2.md +++ b/docs/source/en/model_doc/deepseek_v2.md @@ -47,4 +47,4 @@ The model uses Multi-head Latent Attention (MLA) and DeepSeekMoE architectures f ## DeepseekV2ForSequenceClassification [[autodoc]] DeepseekV2ForSequenceClassification - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/deepseek_v3.md b/docs/source/en/model_doc/deepseek_v3.md index d8eb2e942033..2f61408a79cd 100644 --- a/docs/source/en/model_doc/deepseek_v3.md +++ b/docs/source/en/model_doc/deepseek_v3.md @@ -26,17 +26,17 @@ We present DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 67 ## Limitations and call for contribution! 
-We are super happy to make this code community-powered, and would love to see how you can best optimize the following: +We are super happy to make this code community-powered, and would love to see how you can best optimize the following: - current implementation uses the "naive" attention compution (so not really MLA) -- current implementation loops through the experts. This should be replaced. Pointers to use `get_packed_weights` from `integrations/tensor_parallel`. +- current implementation loops through the experts. This should be replaced. Pointers to use `get_packed_weights` from `integrations/tensor_parallel`. - current implementation uses the eleuther formula for ROPE, using the original one would be more efficient! (should still follow our API) - static cache is not supported (this should be just a generation config issue / config shape issues) ### Usage tips The model uses Multi-head Latent Attention (MLA) and DeepSeekMoE architectures for efficient inference and cost-effective training. It employs an auxiliary-loss-free strategy for load balancing and multi-token prediction training objective. The model can be used for various language tasks after being pre-trained on 14.8 trillion tokens and going through Supervised Fine-Tuning and Reinforcement Learning stages. -You can run the model in `FP8` automatically, using 2 nodes of 8 H100 should be more than enough! +You can run the model in `FP8` automatically, using 2 nodes of 8 H100 should be more than enough! ```python # `run_deepseek_v1.py` @@ -61,9 +61,10 @@ outputs = model.generate(inputs, max_new_tokens=50) print(tokenizer.batch_decode(outputs)) print(time.time()-start) ``` -This generated: -`````` +This generated: + +``````text <|Assistant|> Okay, the user wants to demonstrate how chat templating works. Let me break down what that means. Chat templating is about structuring the conversation data, especially for models that need specific input formats. Maybe they're referring to something like how messages are formatted with roles (user, assistant, system) in APIs like OpenAI. @@ -137,7 +138,7 @@ Applying the template to our `messages` list would produce: This tells the model: 1. The conversation history (user/assistant turns). -2. The model’s turn to generate a response (`<|assistant|>` at the end). +2. The model's turn to generate a response (`<|assistant|>` at the end). --- @@ -157,18 +158,20 @@ Want to dive deeper or see a specific framework’s implementation (e.g., OpenAI `````` Use the following to run it + ```bash torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0|1 --rdzv-id an_id --rdzv-backend c10d --rdzv-endpoint master_addr:master_port run_deepseek_r1.py ``` -If you have: +If you have: + ```bash [rank0]: ncclInternalError: Internal check failed. [rank0]: Last error: [rank0]: Bootstrap : no socket interface found ``` -error, it means NCCL was probably not loaded. +error, it means NCCL was probably not loaded. ## DeepseekV3Config @@ -192,4 +195,4 @@ error, it means NCCL was probably not loaded. 
## DeepseekV3ForTokenClassification [[autodoc]] DeepseekV3ForTokenClassification - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/deepseek_vl.md b/docs/source/en/model_doc/deepseek_vl.md index 58695db8348c..710e6144bb0e 100644 --- a/docs/source/en/model_doc/deepseek_vl.md +++ b/docs/source/en/model_doc/deepseek_vl.md @@ -63,6 +63,7 @@ messages = [ pipe(text=messages, max_new_tokens=20, return_full_text=False) ``` + @@ -115,6 +116,7 @@ output_text = processor.batch_decode( print(output_text) ``` + @@ -138,9 +140,11 @@ model = DeepseekVLForConditionalGeneration.from_pretrained( quantization_config=quantization_config ) ``` + ### Notes - Do inference with multiple images in a single conversation. + ```py import torch from transformers import DeepseekVLForConditionalGeneration, AutoProcessor diff --git a/docs/source/en/model_doc/deepseek_vl_hybrid.md b/docs/source/en/model_doc/deepseek_vl_hybrid.md index d18ab7576adc..e779d0ac55f1 100644 --- a/docs/source/en/model_doc/deepseek_vl_hybrid.md +++ b/docs/source/en/model_doc/deepseek_vl_hybrid.md @@ -24,7 +24,7 @@ rendered properly in your Markdown viewer. # DeepseekVLHybrid -[Deepseek-VL-Hybrid](https://huggingface.co/papers/2403.05525) was introduced by the DeepSeek AI team. It is a vision-language model (VLM) designed to process both text and images for generating contextually relevant responses. The model leverages [LLaMA](./llama) as its text encoder, while [SigLip](./siglip) is used for encoding low-resolution images and [SAM (Segment Anything Model)](./sam) is incorporated to handle high-resolution image encoding, enhancing the model’s ability to process fine-grained visual details. Deepseek-VL-Hybrid is a variant of Deepseek-VL that uses [SAM (Segment Anything Model)](./sam) to handle high-resolution image encoding. +[Deepseek-VL-Hybrid](https://huggingface.co/papers/2403.05525) was introduced by the DeepSeek AI team. It is a vision-language model (VLM) designed to process both text and images for generating contextually relevant responses. The model leverages [LLaMA](./llama) as its text encoder, while [SigLip](./siglip) is used for encoding low-resolution images and [SAM (Segment Anything Model)](./sam) is incorporated to handle high-resolution image encoding, enhancing the model's ability to process fine-grained visual details. Deepseek-VL-Hybrid is a variant of Deepseek-VL that uses [SAM (Segment Anything Model)](./sam) to handle high-resolution image encoding. You can find all the original Deepseek-VL-Hybrid checkpoints under the [DeepSeek-community](https://huggingface.co/deepseek-community) organization. @@ -62,6 +62,7 @@ messages = [ pipe(text=messages, max_new_tokens=20, return_full_text=False) ``` + @@ -114,6 +115,7 @@ output_text = processor.batch_decode( print(output_text) ``` + @@ -137,9 +139,11 @@ model = DeepseekVLHybridForConditionalGeneration.from_pretrained( quantization_config=quantization_config ) ``` + ### Notes - Do inference with multiple images in a single conversation. + ```py import torch from transformers import DeepseekVLHybridForConditionalGeneration, AutoProcessor diff --git a/docs/source/en/model_doc/deformable_detr.md b/docs/source/en/model_doc/deformable_detr.md index da03770bcbe5..c83dede78086 100644 --- a/docs/source/en/model_doc/deformable_detr.md +++ b/docs/source/en/model_doc/deformable_detr.md @@ -16,9 +16,9 @@ rendered properly in your Markdown viewer. *This model was released on 2020-10-08 and added to Hugging Face Transformers on 2022-09-14.*
-
- PyTorch -
+
+ PyTorch +
# Deformable DETR diff --git a/docs/source/en/model_doc/deit.md b/docs/source/en/model_doc/deit.md index b40db07365a1..185a741d5b44 100644 --- a/docs/source/en/model_doc/deit.md +++ b/docs/source/en/model_doc/deit.md @@ -86,7 +86,7 @@ page for more information. SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set `attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. -``` +```py from transformers import DeiTForImageClassification model = DeiTForImageClassification.from_pretrained("facebook/deit-base-distilled-patch16-224", attn_implementation="sdpa", dtype=torch.float16) ... diff --git a/docs/source/en/model_doc/deplot.md b/docs/source/en/model_doc/deplot.md index 651ddcef7fe9..5a7d4d12dcd6 100644 --- a/docs/source/en/model_doc/deplot.md +++ b/docs/source/en/model_doc/deplot.md @@ -21,7 +21,7 @@ rendered properly in your Markdown viewer. PyTorch -## Overview +## Overview DePlot was proposed in the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://huggingface.co/papers/2212.10505) from Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun. @@ -36,8 +36,7 @@ DePlot is a Visual Question Answering subset of `Pix2Struct` architecture. It re Currently one checkpoint is available for DePlot: -- `google/deplot`: DePlot fine-tuned on ChartQA dataset - +- `google/deplot`: DePlot fine-tuned on ChartQA dataset ```python from transformers import AutoProcessor, Pix2StructForConditionalGeneration @@ -57,6 +56,7 @@ print(processor.decode(predictions[0], skip_special_tokens=True)) ## Fine-tuning To fine-tune DePlot, refer to the pix2struct [fine-tuning notebook](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_pix2struct.ipynb). For `Pix2Struct` models, we have found out that fine-tuning the model with Adafactor and cosine learning rate scheduler leads to faster convergence: + ```python from transformers.optimization import Adafactor, get_cosine_schedule_with_warmup @@ -68,4 +68,4 @@ scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=1000, nu DePlot is a model trained using `Pix2Struct` architecture. For API reference, see [`Pix2Struct` documentation](pix2struct). 
- \ No newline at end of file + diff --git a/docs/source/en/model_doc/depth_anything.md b/docs/source/en/model_doc/depth_anything.md index 5ac7007595ff..44774c961eaa 100644 --- a/docs/source/en/model_doc/depth_anything.md +++ b/docs/source/en/model_doc/depth_anything.md @@ -86,4 +86,4 @@ Image.fromarray(depth.astype("uint8")) ## DepthAnythingForDepthEstimation [[autodoc]] DepthAnythingForDepthEstimation - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/depth_anything_v2.md b/docs/source/en/model_doc/depth_anything_v2.md index e8637ba6192c..fbcf2248f658 100644 --- a/docs/source/en/model_doc/depth_anything_v2.md +++ b/docs/source/en/model_doc/depth_anything_v2.md @@ -110,4 +110,4 @@ If you're interested in submitting a resource to be included here, please feel f ## DepthAnythingForDepthEstimation [[autodoc]] DepthAnythingForDepthEstimation - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index 85423359ceb0..c19703cdccc3 100644 --- a/docs/source/en/model_doc/depth_pro.md +++ b/docs/source/en/model_doc/depth_pro.md @@ -84,12 +84,13 @@ alt="drawing" width="600"/> The `DepthProForDepthEstimation` model uses a `DepthProEncoder`, for encoding the input image and a `FeatureFusionStage` for fusing the output features from encoder. The `DepthProEncoder` further uses two encoders: + - `patch_encoder` - - Input image is scaled with multiple ratios, as specified in the `scaled_images_ratios` configuration. - - Each scaled image is split into smaller **patches** of size `patch_size` with overlapping areas determined by `scaled_images_overlap_ratios`. - - These patches are processed by the **`patch_encoder`** + - Input image is scaled with multiple ratios, as specified in the `scaled_images_ratios` configuration. + - Each scaled image is split into smaller **patches** of size `patch_size` with overlapping areas determined by `scaled_images_overlap_ratios`. + - These patches are processed by the **`patch_encoder`** - `image_encoder` - - Input image is also rescaled to `patch_size` and processed by the **`image_encoder`** + - Input image is also rescaled to `patch_size` and processed by the **`image_encoder`** Both these encoders can be configured via `patch_model_config` and `image_model_config` respectively, both of which are separate `Dinov2Model` by default. @@ -102,12 +103,14 @@ The network is supplemented with a focal length estimation head. A small convolu The `use_fov_model` parameter in `DepthProConfig` controls whether **FOV prediction** is enabled. By default, it is set to `False` to conserve memory and computation. When enabled, the **FOV encoder** is instantiated based on the `fov_model_config` parameter, which defaults to a `Dinov2Model`. The `use_fov_model` parameter can also be passed when initializing the `DepthProForDepthEstimation` model. The pretrained model at checkpoint `apple/DepthPro-hf` uses the FOV encoder. To use the pretrained-model without FOV encoder, set `use_fov_model=False` when loading the model, which saves computation. + ```py >>> from transformers import DepthProForDepthEstimation >>> model = DepthProForDepthEstimation.from_pretrained("apple/DepthPro-hf", use_fov_model=False) ``` To instantiate a new model with FOV encoder, set `use_fov_model=True` in the config. 
+ ```py >>> from transformers import DepthProConfig, DepthProForDepthEstimation >>> config = DepthProConfig(use_fov_model=True) @@ -115,6 +118,7 @@ To instantiate a new model with FOV encoder, set `use_fov_model=True` in the con ``` Or set `use_fov_model=True` when initializing the model, which overrides the value in config. + ```py >>> from transformers import DepthProConfig, DepthProForDepthEstimation >>> config = DepthProConfig() @@ -123,13 +127,13 @@ Or set `use_fov_model=True` when initializing the model, which overrides the val ### Using Scaled Dot Product Attention (SDPA) -PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function -encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the -[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) +PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function +encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the +[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) page for more information. -SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set +SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set `attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. ```py @@ -156,8 +160,8 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h - Official Implementation: [apple/ml-depth-pro](https://github.com/apple/ml-depth-pro) - DepthPro Inference Notebook: [DepthPro Inference](https://github.com/qubvel/transformers-notebooks/blob/main/notebooks/DepthPro_inference.ipynb) - DepthPro for Super Resolution and Image Segmentation - - Read blog on Medium: [Depth Pro: Beyond Depth](https://medium.com/@raoarmaghanshakir040/depth-pro-beyond-depth-9d822fc557ba) - - Code on Github: [geetu040/depthpro-beyond-depth](https://github.com/geetu040/depthpro-beyond-depth) + - Read blog on Medium: [Depth Pro: Beyond Depth](https://medium.com/@raoarmaghanshakir040/depth-pro-beyond-depth-9d822fc557ba) + - Code on Github: [geetu040/depthpro-beyond-depth](https://github.com/geetu040/depthpro-beyond-depth) If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. diff --git a/docs/source/en/model_doc/detr.md b/docs/source/en/model_doc/detr.md index 425ab0f04c51..46c9d3dadce6 100644 --- a/docs/source/en/model_doc/detr.md +++ b/docs/source/en/model_doc/detr.md @@ -16,9 +16,9 @@ rendered properly in your Markdown viewer. *This model was released on 2020-05-26 and added to Hugging Face Transformers on 2021-06-09.*
-
- PyTorch -
+
+ PyTorch +
# DETR @@ -113,6 +113,7 @@ DETR can be naturally extended to perform panoptic segmentation (which unifies s There are three other ways to instantiate a DETR model (depending on what you prefer): - Option 1: Instantiate DETR with pre-trained weights for entire model + ```python from transformers import DetrForObjectDetection @@ -120,6 +121,7 @@ model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50") ``` - Option 2: Instantiate DETR with randomly initialized weights for Transformer, but pre-trained weights for backbone + ```python from transformers import DetrConfig, DetrForObjectDetection @@ -128,6 +130,7 @@ model = DetrForObjectDetection(config) ``` - Option 3: Instantiate DETR with randomly initialized weights for backbone + Transformer + ```python config = DetrConfig(use_pretrained_backbone=False) model = DetrForObjectDetection(config) @@ -144,7 +147,7 @@ As a summary, consider the following table: | **Postprocessing** (i.e. converting the output of the model to Pascal VOC format) | [`~transformers.DetrImageProcessor.post_process`] | [`~transformers.DetrImageProcessor.post_process_segmentation`] | [`~transformers.DetrImageProcessor.post_process_segmentation`], [`~transformers.DetrImageProcessor.post_process_panoptic`] | | **evaluators** | `CocoEvaluator` with `iou_types="bbox"` | `CocoEvaluator` with `iou_types="bbox"` or `"segm"` | `CocoEvaluator` with `iou_tupes="bbox"` or `"segm"`, `PanopticEvaluator` | -- In short, one should prepare the data either in COCO detection or COCO panoptic format, then use [`~transformers.DetrImageProcessor`] to create `pixel_values`, `pixel_mask` and optional `labels`, which can then be used to train (or fine-tune) a model. +- In short, one should prepare the data either in COCO detection or COCO panoptic format, then use [`~transformers.DetrImageProcessor`] to create `pixel_values`, `pixel_mask` and optional `labels`, which can then be used to train (or fine-tune) a model. - For evaluation, one should first convert the outputs of the model using one of the postprocessing methods of [`~transformers.DetrImageProcessor`]. These can be provided to either `CocoEvaluator` or `PanopticEvaluator`, which allow you to calculate metrics like mean Average Precision (mAP) and Panoptic Quality (PQ). The latter objects are implemented in the [original repository](https://github.com/facebookresearch/detr). See the [example notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DETR) for more info regarding evaluation. ## Resources diff --git a/docs/source/en/model_doc/dia.md b/docs/source/en/model_doc/dia.md index 1a07e8831ee7..bab0cb4a72d3 100644 --- a/docs/source/en/model_doc/dia.md +++ b/docs/source/en/model_doc/dia.md @@ -117,11 +117,9 @@ out = model(**inputs) out.loss.backward() ``` - This model was contributed by [Jaeyong Sung](https://huggingface.co/buttercrab), [Arthur Zucker](https://huggingface.co/ArthurZ), and [Anton Vlasjuk](https://huggingface.co/AntonV). The original code can be found [here](https://github.com/nari-labs/dia/). - ## DiaConfig [[autodoc]] DiaConfig diff --git a/docs/source/en/model_doc/diffllama.md b/docs/source/en/model_doc/diffllama.md index 406bae43c5f2..79b8314d0ae2 100644 --- a/docs/source/en/model_doc/diffllama.md +++ b/docs/source/en/model_doc/diffllama.md @@ -35,7 +35,6 @@ The abstract from the paper is the following: ### Usage tips The hyperparameters of this model is the same as Llama model. 
- ## DiffLlamaConfig [[autodoc]] DiffLlamaConfig diff --git a/docs/source/en/model_doc/dinat.md b/docs/source/en/model_doc/dinat.md index e6d3385003cb..89f0f5cb6572 100644 --- a/docs/source/en/model_doc/dinat.md +++ b/docs/source/en/model_doc/dinat.md @@ -65,6 +65,7 @@ DiNAT can be used as a *backbone*. When `output_hidden_states = True`, it will output both `hidden_states` and `reshaped_hidden_states`. The `reshaped_hidden_states` have a shape of `(batch, num_channels, height, width)` rather than `(batch_size, height, width, num_channels)`. Notes: + - DiNAT depends on [NATTEN](https://github.com/SHI-Labs/NATTEN/)'s implementation of Neighborhood Attention and Dilated Neighborhood Attention. You can install it with pre-built wheels for Linux by referring to [shi-labs.com/natten](https://shi-labs.com/natten), or build on your system by running `pip install natten`. Note that the latter will likely take time to compile. NATTEN does not support Windows devices yet. diff --git a/docs/source/en/model_doc/dinov2.md b/docs/source/en/model_doc/dinov2.md index 59256756acfd..0968641326af 100644 --- a/docs/source/en/model_doc/dinov2.md +++ b/docs/source/en/model_doc/dinov2.md @@ -19,7 +19,6 @@ specific language governing permissions and limitations under the License. - # DINOv2 [DINOv2](https://huggingface.co/papers/2304.07193) is a vision foundation model that uses [ViT](./vit) as a feature extractor for multiple downstream tasks like image classification and depth estimation. It focuses on stabilizing and accelerating training through techniques like a faster memory-efficient attention, sequence packing, improved stochastic depth, Fully Sharded Data Parallel (FSDP), and model distillation. diff --git a/docs/source/en/model_doc/dinov2_with_registers.md b/docs/source/en/model_doc/dinov2_with_registers.md index f89de76d2168..d6b9c08f2f8f 100644 --- a/docs/source/en/model_doc/dinov2_with_registers.md +++ b/docs/source/en/model_doc/dinov2_with_registers.md @@ -24,7 +24,8 @@ The [Vision Transformer](vit) (ViT) is a transformer encoder model (BERT-like) o Next, people figured out ways to make ViT work really well on self-supervised image feature extraction (i.e. learning meaningful features, also called embeddings) on images without requiring any labels. Some example papers here include [DINOv2](dinov2) and [MAE](vit_mae). -The authors of DINOv2 noticed that ViTs have artifacts in attention maps. It’s due to the model using some image patches as “registers”. The authors propose a fix: just add some new tokens (called "register" tokens), which you only use during pre-training (and throw away afterwards). This results in: +The authors of DINOv2 noticed that ViTs have artifacts in attention maps. It's due to the model using some image patches as “registers”. The authors propose a fix: just add some new tokens (called "register" tokens), which you only use during pre-training (and throw away afterwards). This results in: + - no artifacts - interpretable attention maps - and improved performances. @@ -45,7 +46,6 @@ Tips: This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/facebookresearch/dinov2). 
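For reference, a minimal feature-extraction sketch is shown below. The checkpoint id `facebook/dinov2-with-registers-base` is an assumption; substitute whichever DINOv2-with-registers checkpoint size you are using.

```python
# Minimal sketch: extract image features with DINOv2 with Registers.
# The checkpoint id is assumed; replace it with the model size you need.
import requests
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModel

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base")
model = AutoModel.from_pretrained("facebook/dinov2-with-registers-base")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Per-token embeddings (class token, register tokens, and patch tokens)
print(outputs.last_hidden_state.shape)
```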
- ## Dinov2WithRegistersConfig [[autodoc]] Dinov2WithRegistersConfig @@ -58,4 +58,4 @@ The original code can be found [here](https://github.com/facebookresearch/dinov2 ## Dinov2WithRegistersForImageClassification [[autodoc]] Dinov2WithRegistersForImageClassification - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/dinov3.md b/docs/source/en/model_doc/dinov3.md index a11a8fd10cca..94e531651566 100644 --- a/docs/source/en/model_doc/dinov3.md +++ b/docs/source/en/model_doc/dinov3.md @@ -19,7 +19,6 @@ specific language governing permissions and limitations under the License. - # DINOv3 [DINOv3](https://huggingface.co/papers/2508.10104) is a family of versatile vision foundation models that outperforms the specialized state of the art across a broad range of settings, without fine-tuning. DINOv3 produces high-quality dense features that achieve outstanding performance on various vision tasks, significantly surpassing previous self- and weakly-supervised foundation models. diff --git a/docs/source/en/model_doc/dit.md b/docs/source/en/model_doc/dit.md index 3027905fe38b..574ffe3ef11a 100644 --- a/docs/source/en/model_doc/dit.md +++ b/docs/source/en/model_doc/dit.md @@ -85,6 +85,7 @@ print(f"The predicted class label is: {predicted_class_label}") ## Notes - The pretrained DiT weights can be loaded in a [BEiT] model with a modeling head to predict visual tokens. + ```py from transformers import BeitForMaskedImageModeling diff --git a/docs/source/en/model_doc/doge.md b/docs/source/en/model_doc/doge.md index 6221940d5d5a..b2e44356ddc4 100644 --- a/docs/source/en/model_doc/doge.md +++ b/docs/source/en/model_doc/doge.md @@ -17,7 +17,6 @@ rendered properly in your Markdown viewer. # Doge - ## Overview Doge is a series of small language models based on the [Doge](https://github.com/SmallDoges/small-doge) architecture, aiming to combine the advantages of state-space and self-attention algorithms, calculate dynamic masks from cached value states using the zero-order hold method, and solve the problem of existing mainstream language models getting lost in context. It uses the `wsd_scheduler` scheduler to pre-train on the `smollm-corpus`, and can continue training on new datasets or add sparse activation feedforward networks from stable stage checkpoints. @@ -28,7 +27,6 @@ As shown in the figure below, the sequence transformation part of the Doge archi Checkout all Doge model checkpoints [here](https://huggingface.co/collections/SmallDoge/doge-slm-679cc991f027c4a3abbded4a). - ## Usage
@@ -44,6 +42,7 @@ inputs = tokenizer("Hey how are you doing?", return_tensors="pt") outputs = model.generate(**inputs, max_new_tokens=100) print(tokenizer.batch_decode(outputs)) ``` +
@@ -82,6 +81,7 @@ outputs = model.generate( streamer=steamer ) ``` +
## DogeConfig @@ -101,4 +101,4 @@ outputs = model.generate( ## DogeForSequenceClassification [[autodoc]] DogeForSequenceClassification - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/donut.md b/docs/source/en/model_doc/donut.md index f06b6804d6e4..e582dab748ae 100644 --- a/docs/source/en/model_doc/donut.md +++ b/docs/source/en/model_doc/donut.md @@ -22,7 +22,7 @@ specific language governing permissions and limitations under the License. --> # Donut -[Donut (Document Understanding Transformer)](https://huggingface.co/papers/2111.15664) is a visual document understanding model that doesn't require an Optical Character Recognition (OCR) engine. Unlike traditional approaches that extract text using OCR before processing, Donut employs an end-to-end Transformer-based architecture to directly analyze document images. This eliminates OCR-related inefficiencies making it more accurate and adaptable to diverse languages and formats. +[Donut (Document Understanding Transformer)](https://huggingface.co/papers/2111.15664) is a visual document understanding model that doesn't require an Optical Character Recognition (OCR) engine. Unlike traditional approaches that extract text using OCR before processing, Donut employs an end-to-end Transformer-based architecture to directly analyze document images. This eliminates OCR-related inefficiencies making it more accurate and adaptable to diverse languages and formats. Donut features vision encoder ([Swin](./swin)) and a text decoder ([BART](./bart)). Swin converts document images into embeddings and BART processes them into meaningful text sequences. diff --git a/docs/source/en/model_doc/dots1.md b/docs/source/en/model_doc/dots1.md index 337cad8cb4c7..316ab3b1f5b9 100644 --- a/docs/source/en/model_doc/dots1.md +++ b/docs/source/en/model_doc/dots1.md @@ -25,7 +25,6 @@ The abstract from the report is the following: *Mixture of Experts (MoE) models have emerged as a promising paradigm for scaling language models efficiently by activating only a subset of parameters for each input token. In this report, we present dots.llm1, a large-scale MoE model that activates 14B parameters out of a total of 142B parameters, delivering performance on par with state-of-the-art models while reducing training and inference costs. Leveraging our meticulously crafted and efficient data processing pipeline, dots.llm1 achieves performance comparable to Qwen2.5-72B after pretraining on high-quality corpus and post-training to fully unlock its capabilities. Notably, no synthetic data is used during pretraining. To foster further research, we open-source intermediate training checkpoints spanning the entire training process, providing valuable insights into the learning dynamics of large language models.* - ## Dots1Config [[autodoc]] Dots1Config diff --git a/docs/source/en/model_doc/dpr.md b/docs/source/en/model_doc/dpr.md index 5fe48bc47e7b..18b060cb111d 100644 --- a/docs/source/en/model_doc/dpr.md +++ b/docs/source/en/model_doc/dpr.md @@ -44,9 +44,9 @@ This model was contributed by [lhoestq](https://huggingface.co/lhoestq). The ori - DPR consists in three models: - * Question encoder: encode questions as vectors - * Context encoder: encode contexts as vectors - * Reader: extract the answer of the questions inside retrieved contexts, along with a relevance score (high if the inferred span actually answers the question). 
+ * Question encoder: encode questions as vectors + * Context encoder: encode contexts as vectors + * Reader: extract the answer of the questions inside retrieved contexts, along with a relevance score (high if the inferred span actually answers the question). ## DPRConfig diff --git a/docs/source/en/model_doc/edgetam.md b/docs/source/en/model_doc/edgetam.md new file mode 100644 index 000000000000..780ccb3f70b3 --- /dev/null +++ b/docs/source/en/model_doc/edgetam.md @@ -0,0 +1,331 @@ + +*This model was released on 2025-01-13 and added to Hugging Face Transformers on 2025-09-29.* +
+
+ PyTorch + SDPA + FlashAttention +
+
+ +# EdgeTAM + +## Overview + +The EdgeTAM model was proposed in [EdgeTAM: On-Device Track Anything Model](https://huggingface.co/papers/2501.07256) Chong Zhou, Chenchen Zhu, Yunyang Xiong, Saksham Suri, Fanyi Xiao, Lemeng Wu, Raghuraman Krishnamoorthi, Bo Dai, Chen Change Loy, Vikas Chandra, Bilge Soran. + +EdgeTAM is an efficient adaptation of SAM 2 that introduces a 2D Spatial Perceiver architecture to optimize memory attention mechanisms for real-time video segmentation on mobile devices. + +The abstract from the paper is the following: + +*On top of Segment Anything Model (SAM), SAM 2 further extends its capability from image to video inputs through a memory bank mechanism and obtains a remarkable performance compared with previous methods, making it a foundation model for video segmentation task. In this paper, we aim at making SAM 2 much more efficient so that it even runs on mobile devices while maintaining a comparable performance. Despite several works optimizing SAM for better efficiency, we find they are not sufficient for SAM 2 because they all focus on compressing the image encoder, while our benchmark shows that the newly introduced memory attention blocks are also the latency bottleneck. Given this observation, we propose EdgeTAM, which leverages a novel 2D Spatial Perceiver to reduce the computational cost. In particular, the proposed 2D Spatial Perceiver encodes the densely stored frame-level memories with a lightweight Transformer that contains a fixed set of learnable queries. Given that video segmentation is a dense prediction task, we find preserving the spatial structure of the memories is essential so that the queries are split into global-level and patch-level groups. We also propose a distillation pipeline that further improves the performance without inference overhead. As a result, EdgeTAM achieves 87.7, 70.0, 72.3, and 71.7 J&F on DAVIS 2017, MOSE, SA-V val, and SA-V test, while running at 16 FPS on iPhone 15 Pro Max.* + +This model was contributed by [yonigozlan](https://huggingface.co/yonigozlan). +The original code can be found [here](https://github.com/facebookresearch/EdgeTAM). 
+ +## Usage example + +### Automatic Mask Generation with Pipeline + +EdgeTAM can be used for automatic mask generation to segment all objects in an image using the `mask-generation` pipeline: + +```python +>>> from transformers import pipeline + +>>> generator = pipeline("mask-generation", model="yonigozlan/edgetam-1", device=0) +>>> image_url = "https://huggingface.co/datasets/hf-internal-testing/sam2-fixtures/resolve/main/truck.jpg" +>>> outputs = generator(image_url, points_per_batch=64) + +>>> len(outputs["masks"]) # Number of masks generated +39 +``` + +### Basic Image Segmentation + +#### Single Point Click + +You can segment objects by providing a single point click on the object you want to segment: + +```python +>>> from transformers import Sam2Processor, EdgeTamModel, infer_device +>>> import torch +>>> from PIL import Image +>>> import requests + +>>> device = infer_device() + +>>> model = EdgeTamModel.from_pretrained("yonigozlan/edgetam-1").to(device) +>>> processor = Sam2Processor.from_pretrained("yonigozlan/edgetam-1") + +>>> image_url = "https://huggingface.co/datasets/hf-internal-testing/sam2-fixtures/resolve/main/truck.jpg" +>>> raw_image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB") + +>>> input_points = [[[[500, 375]]]] # Single point click, 4 dimensions (image_dim, object_dim, point_per_object_dim, coordinates) +>>> input_labels = [[[1]]] # 1 for positive click, 0 for negative click, 3 dimensions (image_dim, object_dim, point_label) + +>>> inputs = processor(images=raw_image, input_points=input_points, input_labels=input_labels, return_tensors="pt").to(model.device) + +>>> with torch.no_grad(): +... outputs = model(**inputs) + +>>> masks = processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"])[0] + +>>> # The model outputs multiple mask predictions ranked by quality score +>>> print(f"Generated {masks.shape[1]} masks with shape {masks.shape}") +Generated 3 masks with shape torch.Size([1, 3, 1200, 1800]) +>>> print(f"IoU scores: {outputs.iou_scores.squeeze()}") +IoU scores: tensor([0.0463, 0.4859, 0.7616], device='cuda:0') +``` + +#### Multiple Points for Refinement + +You can provide multiple points to refine the segmentation: + +```python +>>> # Add both positive and negative points to refine the mask +>>> input_points = [[[[500, 375], [1125, 625]]]] # Multiple points for refinement +>>> input_labels = [[[1, 1]]] # Both positive clicks + +>>> inputs = processor(images=raw_image, input_points=input_points, input_labels=input_labels, return_tensors="pt").to(device) + +>>> with torch.no_grad(): +... outputs = model(**inputs) + +>>> masks = processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"])[0] +>>> print(f"IoU scores: {outputs.iou_scores.squeeze()}") +IoU scores: tensor([0.8362, 0.6900, 0.2120], device='cuda:0') +``` + +#### Bounding Box Input + +EdgeTAM also supports bounding box inputs for segmentation: + +```python +>>> # Define bounding box as [x_min, y_min, x_max, y_max] +>>> input_boxes = [[[75, 275, 1725, 850]]] + +>>> inputs = processor(images=raw_image, input_boxes=input_boxes, return_tensors="pt").to(device) + +>>> with torch.no_grad(): +... 
outputs = model(**inputs) + +>>> masks = processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"])[0] +>>> print(f"IoU scores: {outputs.iou_scores.squeeze()}") +IoU scores: tensor([0.9301, 0.9348, 0.6605], device='cuda:0') +``` + +#### Multiple Objects Segmentation + +You can segment multiple objects simultaneously: + +```python +>>> # Define points for two different objects +>>> input_points = [[[[500, 375]], [[650, 750]]]] # Points for two objects in same image +>>> input_labels = [[[1], [1]]] # Positive clicks for both objects + +>>> inputs = processor(images=raw_image, input_points=input_points, input_labels=input_labels, return_tensors="pt").to(device) + +>>> with torch.no_grad(): +... outputs = model(**inputs, multimask_output=False) + +>>> # Each object gets its own mask +>>> masks = processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"])[0] +>>> print(f"Generated masks for {masks.shape[0]} objects") +Generated masks for 2 objects +>>> print(f"IoU scores: {outputs.iou_scores.squeeze()}") +IoU scores: tensor([0.7616, 0.9465], device='cuda:0') +``` + +### Batch Inference + +#### Batched Images + +Process multiple images simultaneously for improved efficiency: + +```python +>>> from transformers import Sam2Processor, EdgeTamModel, infer_device +>>> import torch +>>> from PIL import Image +>>> import requests + +>>> device = infer_device() + +>>> model = EdgeTamModel.from_pretrained("yonigozlan/edgetam-1").to(device) +>>> processor = Sam2Processor.from_pretrained("yonigozlan/edgetam-1") + +>>> # Load multiple images +>>> image_urls = [ +... "https://huggingface.co/datasets/hf-internal-testing/sam2-fixtures/resolve/main/truck.jpg", +... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/dog-sam.png" +... ] +>>> raw_images = [Image.open(requests.get(url, stream=True).raw).convert("RGB") for url in image_urls] + +>>> # Single point per image +>>> input_points = [[[[500, 375]]], [[[770, 200]]]] # One point for each image +>>> input_labels = [[[1]], [[1]]] # Positive clicks for both images + +>>> inputs = processor(images=raw_images, input_points=input_points, input_labels=input_labels, return_tensors="pt").to(model.device) + +>>> with torch.no_grad(): +... outputs = model(**inputs, multimask_output=False) + +>>> # Post-process masks for each image +>>> all_masks = processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"]) +>>> print(f"Processed {len(all_masks)} images, each with {all_masks[0].shape[0]} objects") +Processed 2 images, each with 1 objects +>>> print(f"IoU scores: {outputs.iou_scores.squeeze()}") +IoU scores: tensor([0.7618, 0.7999], device='cuda:0') +``` + +#### Batched Objects per Image + +Segment multiple objects within each image using batch inference: + +```python +>>> # Multiple objects per image - different numbers of objects per image +>>> input_points = [ +... [[[500, 375]], [[650, 750]]], # Truck image: 2 objects +... [[[770, 200]]] # Dog image: 1 object +... ] +>>> input_labels = [ +... [[1], [1]], # Truck image: positive clicks for both objects +... [[1]] # Dog image: positive click for the object +... ] + +>>> inputs = processor(images=raw_images, input_points=input_points, input_labels=input_labels, return_tensors="pt").to(device) + +>>> with torch.no_grad(): +... 
outputs = model(**inputs, multimask_output=False) + +>>> all_masks = processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"]) +``` + +#### Batched Images with Batched Objects and Multiple Points + +Handle complex batch scenarios with multiple points per object: + +```python +>>> # Add groceries image for more complex example +>>> groceries_url = "https://huggingface.co/datasets/hf-internal-testing/sam2-fixtures/resolve/main/groceries.jpg" +>>> groceries_image = Image.open(requests.get(groceries_url, stream=True).raw).convert("RGB") +>>> raw_images = [raw_images[0], groceries_image] # Use truck and groceries images + +>>> # Complex batching: multiple images, multiple objects, multiple points per object +>>> input_points = [ +... [[[500, 375]], [[650, 750]]], # Truck image: 2 objects with 1 point each +... [[[400, 300]], [[630, 300], [550, 300]]] # Groceries image: obj1 has 1 point, obj2 has 2 points +... ] +>>> input_labels = [ +... [[1], [1]], # Truck image: positive clicks +... [[1], [1, 1]] # Groceries image: positive clicks for refinement +... ] + +>>> inputs = processor(images=raw_images, input_points=input_points, input_labels=input_labels, return_tensors="pt").to(device) + +>>> with torch.no_grad(): +... outputs = model(**inputs, multimask_output=False) + +>>> all_masks = processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"]) +``` + +#### Batched Bounding Boxes + +Process multiple images with bounding box inputs: + +```python +>>> # Multiple bounding boxes per image (using truck and groceries images) +>>> input_boxes = [ +... [[75, 275, 1725, 850], [425, 600, 700, 875], [1375, 550, 1650, 800], [1240, 675, 1400, 750]], # Truck image: 4 boxes +... [[450, 170, 520, 350], [350, 190, 450, 350], [500, 170, 580, 350], [580, 170, 640, 350]] # Groceries image: 4 boxes +... ] + +>>> # Update images for this example +>>> raw_images = [raw_images[0], groceries_image] # truck and groceries + +>>> inputs = processor(images=raw_images, input_boxes=input_boxes, return_tensors="pt").to(device) + +>>> with torch.no_grad(): +... outputs = model(**inputs, multimask_output=False) + +>>> all_masks = processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"]) +>>> print(f"Processed {len(input_boxes)} images with {len(input_boxes[0])} and {len(input_boxes[1])} boxes respectively") +Processed 2 images with 4 and 4 boxes respectively +>>> print(f"IoU scores: {outputs.iou_scores.squeeze()}") +IoU scores: tensor([0.9301, 0.9348, 0.6605, 0.9465], device='cuda:0') +``` + +### Using Previous Masks as Input + +EdgeTAM can use masks from previous predictions as input to refine segmentation: + +```python +>>> # Get initial segmentation +>>> input_points = [[[[500, 375]]]] +>>> input_labels = [[[1]]] +>>> inputs = processor(images=raw_image, input_points=input_points, input_labels=input_labels, return_tensors="pt").to(device) + +>>> with torch.no_grad(): +... outputs = model(**inputs) + +>>> # Use the best mask as input for refinement +>>> mask_input = outputs.pred_masks[:, :, torch.argmax(outputs.iou_scores.squeeze())] + +>>> # Add additional points with the mask input +>>> new_input_points = [[[[500, 375], [450, 300]]]] +>>> new_input_labels = [[[1, 1]]] +>>> inputs = processor( +... input_points=new_input_points, +... input_labels=new_input_labels, +... original_sizes=inputs["original_sizes"], +... return_tensors="pt", +... ).to(device) + +>>> with torch.no_grad(): +... refined_outputs = model( +... **inputs, +... input_masks=mask_input, +... 
image_embeddings=outputs.image_embeddings, +... multimask_output=False, +... ) +``` + + +## EdgeTamConfig + +[[autodoc]] EdgeTamConfig + +## EdgeTamVisionConfig + +[[autodoc]] EdgeTamVisionConfig + +## EdgeTamMaskDecoderConfig + +[[autodoc]] EdgeTamMaskDecoderConfig + +## EdgeTamPromptEncoderConfig + +[[autodoc]] EdgeTamPromptEncoderConfig + +## EdgeTamVisionModel + +[[autodoc]] EdgeTamVisionModel + - forward + +## EdgeTamModel + +[[autodoc]] EdgeTamModel + - forward diff --git a/docs/source/en/model_doc/edgetam_video.md b/docs/source/en/model_doc/edgetam_video.md new file mode 100644 index 000000000000..381bace4dbe0 --- /dev/null +++ b/docs/source/en/model_doc/edgetam_video.md @@ -0,0 +1,297 @@ + +*This model was released on 2025-01-13 and added to Hugging Face Transformers on 2025-09-29.* + + +
+
+ PyTorch + SDPA + FlashAttention +
+
+ +# EdgeTAMVideo + +## Overview + +The EdgeTAM model was proposed in [EdgeTAM: On-Device Track Anything Model](https://huggingface.co/papers/2501.07256) Chong Zhou, Chenchen Zhu, Yunyang Xiong, Saksham Suri, Fanyi Xiao, Lemeng Wu, Raghuraman Krishnamoorthi, Bo Dai, Chen Change Loy, Vikas Chandra, Bilge Soran. + +EdgeTAM is an efficient adaptation of SAM 2 that introduces a 2D Spatial Perceiver architecture to optimize memory attention mechanisms for real-time video segmentation on mobile devices. + +The abstract from the paper is the following: + +*On top of Segment Anything Model (SAM), SAM 2 further extends its capability from image to video inputs through a memory bank mechanism and obtains a remarkable performance compared with previous methods, making it a foundation model for video segmentation task. In this paper, we aim at making SAM 2 much more efficient so that it even runs on mobile devices while maintaining a comparable performance. Despite several works optimizing SAM for better efficiency, we find they are not sufficient for SAM 2 because they all focus on compressing the image encoder, while our benchmark shows that the newly introduced memory attention blocks are also the latency bottleneck. Given this observation, we propose EdgeTAM, which leverages a novel 2D Spatial Perceiver to reduce the computational cost. In particular, the proposed 2D Spatial Perceiver encodes the densely stored frame-level memories with a lightweight Transformer that contains a fixed set of learnable queries. Given that video segmentation is a dense prediction task, we find preserving the spatial structure of the memories is essential so that the queries are split into global-level and patch-level groups. We also propose a distillation pipeline that further improves the performance without inference overhead. As a result, EdgeTAM achieves 87.7, 70.0, 72.3, and 71.7 J&F on DAVIS 2017, MOSE, SA-V val, and SA-V test, while running at 16 FPS on iPhone 15 Pro Max.* + +This model was contributed by [yonigozlan](https://huggingface.co/yonigozlan). +The original code can be found [here](https://github.com/facebookresearch/EdgeTAM). + +## Usage example + +### Video Segmentation and Tracking + +EdgeTAM Video's key strength is its ability to track objects across video frames efficiently on mobile devices. Here's how to use it for video segmentation: + +#### Basic Video Tracking + +```python +>>> from transformers import EdgeTamVideoModel, Sam2VideoProcessor, infer_device +>>> import torch + +>>> device = infer_device() +>>> model = EdgeTamVideoModel.from_pretrained("yonigozlan/edgetam-video-1").to(device, dtype=torch.bfloat16) +>>> processor = Sam2VideoProcessor.from_pretrained("yonigozlan/edgetam-video-1") + +>>> # Load video frames (example assumes you have a list of PIL Images) +>>> # video_frames = [Image.open(f"frame_{i:05d}.jpg") for i in range(num_frames)] + +>>> # For this example, we'll use the video loading utility +>>> from transformers.video_utils import load_video +>>> video_url = "https://huggingface.co/datasets/hf-internal-testing/sam2-fixtures/resolve/main/bedroom.mp4" +>>> video_frames, _ = load_video(video_url) + +>>> # Initialize video inference session +>>> inference_session = processor.init_video_session( +... video=video_frames, +... inference_device=device, +... dtype=torch.bfloat16, +... 
) + +>>> # Add click on first frame to select object +>>> ann_frame_idx = 0 +>>> ann_obj_id = 1 +>>> points = [[[[210, 350]]]] +>>> labels = [[[1]]] + +>>> processor.add_inputs_to_inference_session( +... inference_session=inference_session, +... frame_idx=ann_frame_idx, +... obj_ids=ann_obj_id, +... input_points=points, +... input_labels=labels, +... ) + +>>> # Segment the object on the first frame +>>> outputs = model( +... inference_session=inference_session, +... frame_idx=ann_frame_idx, +... ) +>>> video_res_masks = processor.post_process_masks( +... [outputs.pred_masks], original_sizes=[[inference_session.video_height, inference_session.video_width]], binarize=False +... )[0] +>>> print(f"Segmentation shape: {video_res_masks.shape}") +Segmentation shape: torch.Size([1, 1, 540, 960]) + +>>> # Propagate through the entire video +>>> video_segments = {} +>>> for sam2_video_output in model.propagate_in_video_iterator(inference_session): +... video_res_masks = processor.post_process_masks( +... [sam2_video_output.pred_masks], original_sizes=[[inference_session.video_height, inference_session.video_width]], binarize=False +... )[0] +... video_segments[sam2_video_output.frame_idx] = video_res_masks + +>>> print(f"Tracked object through {len(video_segments)} frames") +Tracked object through 200 frames +``` + +#### Multi-Object Video Tracking + +Track multiple objects simultaneously across video frames: + +```python +>>> # Reset for new tracking session +>>> inference_session.reset_inference_session() + +>>> # Add multiple objects on the first frame +>>> ann_frame_idx = 0 +>>> obj_ids = [2, 3] +>>> input_points = [[[[200, 300]], [[400, 150]]]] # Points for two objects (batched) +>>> input_labels = [[[1], [1]]] + +>>> processor.add_inputs_to_inference_session( +... inference_session=inference_session, +... frame_idx=ann_frame_idx, +... obj_ids=obj_ids, +... input_points=input_points, +... input_labels=input_labels, +... ) + +>>> # Get masks for both objects on first frame +>>> outputs = model( +... inference_session=inference_session, +... frame_idx=ann_frame_idx, +... ) + +>>> # Propagate both objects through video +>>> video_segments = {} +>>> for sam2_video_output in model.propagate_in_video_iterator(inference_session): +... video_res_masks = processor.post_process_masks( +... [sam2_video_output.pred_masks], original_sizes=[[inference_session.video_height, inference_session.video_width]], binarize=False +... )[0] +... video_segments[sam2_video_output.frame_idx] = { +... obj_id: video_res_masks[i] +... for i, obj_id in enumerate(inference_session.obj_ids) +... } + +>>> print(f"Tracked {len(inference_session.obj_ids)} objects through {len(video_segments)} frames") +Tracked 2 objects through 200 frames +``` + +#### Refining Video Segmentation + +You can add additional clicks on any frame to refine the tracking: + +```python +>>> # Add refinement click on a later frame +>>> refine_frame_idx = 50 +>>> ann_obj_id = 2 # Refining first object +>>> points = [[[[220, 280]]]] # Additional point +>>> labels = [[[1]]] # Positive click + +>>> processor.add_inputs_to_inference_session( +... inference_session=inference_session, +... frame_idx=refine_frame_idx, +... obj_ids=ann_obj_id, +... input_points=points, +... input_labels=labels, +... ) + +>>> # Re-propagate with the additional information +>>> video_segments = {} +>>> for sam2_video_output in model.propagate_in_video_iterator(inference_session): +... video_res_masks = processor.post_process_masks( +... 
[sam2_video_output.pred_masks], original_sizes=[[inference_session.video_height, inference_session.video_width]], binarize=False +... )[0] +... video_segments[sam2_video_output.frame_idx] = video_res_masks +``` + +### Streaming Video Inference + +For real-time applications, EdgeTAM Video supports processing video frames as they arrive: + +```python +>>> # Initialize session for streaming +>>> inference_session = processor.init_video_session( +... inference_device=device, +... dtype=torch.bfloat16, +... ) + +>>> # Process frames one by one +>>> for frame_idx, frame in enumerate(video_frames[:10]): # Process first 10 frames +... inputs = processor(images=frame, device=device, return_tensors="pt") +... +... if frame_idx == 0: +... # Add point input on first frame +... processor.add_inputs_to_inference_session( +... inference_session=inference_session, +... frame_idx=0, +... obj_ids=1, +... input_points=[[[[210, 350], [250, 220]]]], +... input_labels=[[[1, 1]]], +... original_size=inputs.original_sizes[0], # need to be provided when using streaming video inference +... ) +... +... # Process current frame +... sam2_video_output = model(inference_session=inference_session, frame=inputs.pixel_values[0]) +... +... video_res_masks = processor.post_process_masks( +... [sam2_video_output.pred_masks], original_sizes=inputs.original_sizes, binarize=False +... )[0] +... print(f"Frame {frame_idx}: mask shape {video_res_masks.shape}") + +Frame 0: mask shape torch.Size([1, 1, 540, 960]) +... +``` + +#### Video Batch Processing for Multiple Objects + +Track multiple objects simultaneously in video by adding them all at once: + +```python +>>> # Initialize video session +>>> inference_session = processor.init_video_session( +... video=video_frames, +... inference_device=device, +... dtype=torch.bfloat16, +... ) + +>>> # Add multiple objects on the first frame using batch processing +>>> ann_frame_idx = 0 +>>> obj_ids = [2, 3] # Track two different objects +>>> input_points = [ +... [[[200, 300], [230, 250], [275, 175]], [[400, 150]]] +... ] # Object 2: 3 points (2 positive, 1 negative); Object 3: 1 point +>>> input_labels = [ +... [[1, 1, 0], [1]] +... ] # Object 2: positive, positive, negative; Object 3: positive + +>>> processor.add_inputs_to_inference_session( +... inference_session=inference_session, +... frame_idx=ann_frame_idx, +... obj_ids=obj_ids, +... input_points=input_points, +... input_labels=input_labels, +... ) + +>>> # Get masks for all objects on the first frame +>>> outputs = model( +... inference_session=inference_session, +... frame_idx=ann_frame_idx, +... ) +>>> video_res_masks = processor.post_process_masks( +... [outputs.pred_masks], original_sizes=[[inference_session.video_height, inference_session.video_width]], binarize=False +... )[0] +>>> print(f"Generated masks for {video_res_masks.shape[0]} objects") +Generated masks for 2 objects + +>>> # Propagate all objects through the video +>>> video_segments = {} +>>> for sam2_video_output in model.propagate_in_video_iterator(inference_session): +... video_res_masks = processor.post_process_masks( +... [sam2_video_output.pred_masks], original_sizes=[[inference_session.video_height, inference_session.video_width]], binarize=False +... )[0] +... video_segments[sam2_video_output.frame_idx] = { +... obj_id: video_res_masks[i] +... for i, obj_id in enumerate(inference_session.obj_ids) +... 
} + +>>> print(f"Tracked {len(inference_session.obj_ids)} objects through {len(video_segments)} frames") +Tracked 2 objects through 200 frames +``` + +## EdgeTamVideoMaskDecoderConfig + +[[autodoc]] EdgeTamVideoMaskDecoderConfig + +## EdgeTamVideoPromptEncoderConfig + +[[autodoc]] EdgeTamVideoPromptEncoderConfig + +## EdgeTamVideoConfig + +[[autodoc]] EdgeTamVideoConfig + +## EdgeTamVideoInferenceSession + +[[autodoc]] EdgeTamVideoInferenceSession + +## EdgeTamVideoModel + +[[autodoc]] EdgeTamVideoModel + - forward diff --git a/docs/source/en/model_doc/efficientloftr.md b/docs/source/en/model_doc/efficientloftr.md index 2994ae83262d..4efd87502b67 100644 --- a/docs/source/en/model_doc/efficientloftr.md +++ b/docs/source/en/model_doc/efficientloftr.md @@ -45,6 +45,7 @@ results = keypoint_matcher([url_0, url_1], threshold=0.9) print(results[0]) # {'keypoint_image_0': {'x': ..., 'y': ...}, 'keypoint_image_1': {'x': ..., 'y': ...}, 'score': ...} ``` + @@ -143,26 +144,23 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size ## EfficientLoFTRImageProcessor [[autodoc]] EfficientLoFTRImageProcessor - -- preprocess -- post_process_keypoint_matching -- visualize_keypoint_matching + - preprocess + - post_process_keypoint_matching + - visualize_keypoint_matching ## EfficientLoFTRImageProcessorFast [[autodoc]] EfficientLoFTRImageProcessorFast - -- preprocess -- post_process_keypoint_matching -- visualize_keypoint_matching + - preprocess + - post_process_keypoint_matching + - visualize_keypoint_matching ## EfficientLoFTRModel [[autodoc]] EfficientLoFTRModel - -- forward + - forward ## EfficientLoFTRForKeypointMatching @@ -171,4 +169,4 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size - forward - \ No newline at end of file + diff --git a/docs/source/en/model_doc/efficientnet.md b/docs/source/en/model_doc/efficientnet.md index 859923126a9d..b4fbe8225625 100644 --- a/docs/source/en/model_doc/efficientnet.md +++ b/docs/source/en/model_doc/efficientnet.md @@ -23,7 +23,7 @@ rendered properly in your Markdown viewer. ## Overview -The EfficientNet model was proposed in [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://huggingface.co/papers/1905.11946) +The EfficientNet model was proposed in [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://huggingface.co/papers/1905.11946) by Mingxing Tan and Quoc V. Le. EfficientNets are a family of image classification models, which achieve state-of-the-art accuracy, yet being an order-of-magnitude smaller and faster than previous models. The abstract from the paper is the following: @@ -34,7 +34,6 @@ To go even further, we use neural architecture search to design a new baseline n This model was contributed by [adirik](https://huggingface.co/adirik). The original code can be found [here](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet). - ## EfficientNetConfig [[autodoc]] EfficientNetConfig @@ -58,4 +57,3 @@ The original code can be found [here](https://github.com/tensorflow/tpu/tree/mas [[autodoc]] EfficientNetForImageClassification - forward - diff --git a/docs/source/en/model_doc/emu3.md b/docs/source/en/model_doc/emu3.md index 799de2f0c5c0..0c95bc6d9877 100644 --- a/docs/source/en/model_doc/emu3.md +++ b/docs/source/en/model_doc/emu3.md @@ -27,8 +27,7 @@ rendered properly in your Markdown viewer. 
The Emu3 model was proposed in [Emu3: Next-Token Prediction is All You Need](https://huggingface.co/papers/2409.18869) by Xinlong Wang, Xiaosong Zhang, Zhengxiong Luo, Quan Sun, Yufeng Cui, Jinsheng Wang, Fan Zhang, Yueze Wang, Zhen Li, Qiying Yu, Yingli Zhao, Yulong Ao, Xuebin Min, Tao Li, Boya Wu, Bo Zhao, Bowen Zhang, Liangdong Wang, Guang Liu, Zheqi He, Xi Yang, Jingjing Liu, Yonghua Lin, Tiejun Huang, Zhongyuan Wang. -Emu3 is a multimodal LLM that uses vector quantization to tokenize images into discrete tokens. Discretized image tokens are later fused with text token ids for image and text generation. The model can additionally generate images by predicting image token ids. - +Emu3 is a multimodal LLM that uses vector quantization to tokenize images into discrete tokens. Discretized image tokens are later fused with text token ids for image and text generation. The model can additionally generate images by predicting image token ids. The abstract from the paper is the following: @@ -45,11 +44,9 @@ Tips: > [!TIP] > Emu3 implementation in Transformers uses a special image token to indicate where to merge image embeddings. The special image token isn't new and uses one of the reserved tokens: `<|extra_0|>`. You have to add `` to your prompt in the place where the image should be embedded for correct generation. - This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanTurganbay). The original code can be found [here](https://github.com/baaivision/Emu3). - ## Usage example ### Text generation inference @@ -143,7 +140,6 @@ for i, image in enumerate(images['pixel_values']): ``` - ## Emu3Config [[autodoc]] Emu3Config diff --git a/docs/source/en/model_doc/encodec.md b/docs/source/en/model_doc/encodec.md index 890991730391..9fc6c2c97e94 100644 --- a/docs/source/en/model_doc/encodec.md +++ b/docs/source/en/model_doc/encodec.md @@ -29,14 +29,14 @@ The abstract from the paper is the following: *We introduce a state-of-the-art real-time, high-fidelity, audio codec leveraging neural networks. It consists in a streaming encoder-decoder architecture with quantized latent space trained in an end-to-end fashion. We simplify and speed-up the training by using a single multiscale spectrogram adversary that efficiently reduces artifacts and produce high-quality samples. We introduce a novel loss balancer mechanism to stabilize training: the weight of a loss now defines the fraction of the overall gradient it should represent, thus decoupling the choice of this hyper-parameter from the typical scale of the loss. Finally, we study how lightweight Transformer models can be used to further compress the obtained representation by up to 40%, while staying faster than real time. We provide a detailed description of the key design choices of the proposed model including: training objective, architectural changes and a study of various perceptual loss functions. We present an extensive subjective evaluation (MUSHRA tests) together with an ablation study for a range of bandwidths and audio domains, including speech, noisy-reverberant speech, and music. Our approach is superior to the baselines methods across all evaluated settings, considering both 24 kHz monophonic and 48 kHz stereophonic audio.* -This model was contributed by [Matthijs](https://huggingface.co/Matthijs), [Patrick Von Platen](https://huggingface.co/patrickvonplaten) and [Arthur Zucker](https://huggingface.co/ArthurZ). 
+This model was contributed by [Matthijs](https://huggingface.co/Matthijs), [Patrick Von Platen](https://huggingface.co/patrickvonplaten) and [Arthur Zucker](https://huggingface.co/ArthurZ). The original code can be found [here](https://github.com/facebookresearch/encodec). -## Usage example +## Usage example Here is a quick example of how to encode and decode an audio using this model: -```python +```python >>> from datasets import load_dataset, Audio >>> from transformers import EncodecModel, AutoProcessor >>> librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") diff --git a/docs/source/en/model_doc/eomt.md b/docs/source/en/model_doc/eomt.md index 754b88e2c330..7ff1419b3814 100644 --- a/docs/source/en/model_doc/eomt.md +++ b/docs/source/en/model_doc/eomt.md @@ -39,7 +39,6 @@ Architecturally, EoMT introduces a small set of **learned queries** and a lightw alt="drawing" width="500"/> - The model supports semantic, instance, and panoptic segmentation using a unified architecture and task-specific post-processing. ## Usage Examples @@ -208,4 +207,4 @@ plt.show() ## EomtForUniversalSegmentation [[autodoc]] EomtForUniversalSegmentation - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/ernie4_5.md b/docs/source/en/model_doc/ernie4_5.md index e48073bbe6c0..bf71049148d3 100644 --- a/docs/source/en/model_doc/ernie4_5.md +++ b/docs/source/en/model_doc/ernie4_5.md @@ -38,7 +38,6 @@ Other models from the family can be found at [Ernie 4.5 Moe](./ernie4_5_moe). - ## Usage Tips ### Generate text @@ -84,7 +83,6 @@ generate_text = tokenizer.decode(output_ids, skip_special_tokens=True) This model was contributed by [Anton Vlasjuk](https://huggingface.co/AntonV). The original code can be found [here](https://github.com/PaddlePaddle/ERNIE). - ## Ernie4_5Config [[autodoc]] Ernie4_5Config diff --git a/docs/source/en/model_doc/ernie4_5_moe.md b/docs/source/en/model_doc/ernie4_5_moe.md index 20c4dcfd5435..fb6b8d791bec 100644 --- a/docs/source/en/model_doc/ernie4_5_moe.md +++ b/docs/source/en/model_doc/ernie4_5_moe.md @@ -40,7 +40,6 @@ Other models from the family can be found at [Ernie 4.5](./ernie4_5). - ## Usage Tips ### Generate text @@ -167,7 +166,6 @@ generate_text = tokenizer.decode(output_ids, skip_special_tokens=True) This model was contributed by [Anton Vlasjuk](https://huggingface.co/AntonV). The original code can be found [here](https://github.com/PaddlePaddle/ERNIE). - ## Ernie4_5_MoeConfig [[autodoc]] Ernie4_5_MoeConfig diff --git a/docs/source/en/model_doc/ernie_m.md b/docs/source/en/model_doc/ernie_m.md index 508fe2f596b2..e044614e7644 100644 --- a/docs/source/en/model_doc/ernie_m.md +++ b/docs/source/en/model_doc/ernie_m.md @@ -40,7 +40,6 @@ The abstract from the paper is the following: *Recent studies have demonstrated that pre-trained cross-lingual models achieve impressive performance in downstream cross-lingual tasks. This improvement benefits from learning a large amount of monolingual and parallel corpora. Although it is generally acknowledged that parallel corpora are critical for improving the model performance, existing methods are often constrained by the size of parallel corpora, especially for lowresource languages. In this paper, we propose ERNIE-M, a new training method that encourages the model to align the representation of multiple languages with monolingual corpora, to overcome the constraint that the parallel corpus size places on the model performance. 
Our key insight is to integrate back-translation into the pre-training process. We generate pseudo-parallel sentence pairs on a monolingual corpus to enable the learning of semantic alignments between different languages, thereby enhancing the semantic modeling of cross-lingual models. Experimental results show that ERNIE-M outperforms existing cross-lingual models and delivers new state-of-the-art results in various cross-lingual downstream tasks.* This model was contributed by [Susnato Dhar](https://huggingface.co/susnato). The original code can be found [here](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/paddlenlp/transformers/ernie_m). - ## Usage tips - Ernie-M is a BERT-like model so it is a stacked Transformer Encoder. @@ -59,7 +58,6 @@ This model was contributed by [Susnato Dhar](https://huggingface.co/susnato). Th [[autodoc]] ErnieMConfig - ## ErnieMTokenizer [[autodoc]] ErnieMTokenizer @@ -68,7 +66,6 @@ This model was contributed by [Susnato Dhar](https://huggingface.co/susnato). Th - create_token_type_ids_from_sequences - save_vocabulary - ## ErnieMModel [[autodoc]] ErnieMModel @@ -79,19 +76,16 @@ This model was contributed by [Susnato Dhar](https://huggingface.co/susnato). Th [[autodoc]] ErnieMForSequenceClassification - forward - ## ErnieMForMultipleChoice [[autodoc]] ErnieMForMultipleChoice - forward - ## ErnieMForTokenClassification [[autodoc]] ErnieMForTokenClassification - forward - ## ErnieMForQuestionAnswering [[autodoc]] ErnieMForQuestionAnswering diff --git a/docs/source/en/model_doc/esm.md b/docs/source/en/model_doc/esm.md index e83e2d5aa1da..a6190a71f020 100644 --- a/docs/source/en/model_doc/esm.md +++ b/docs/source/en/model_doc/esm.md @@ -44,12 +44,10 @@ sequence alignment (MSA) step at inference time, which means that ESMFold checkp they do not require a database of known protein sequences and structures with associated external query tools to make predictions, and are much faster as a result. - The abstract from "Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences" is - *In the field of artificial intelligence, a combination of scale in data and model capacity enabled by unsupervised learning has led to major advances in representation learning and statistical generation. In the life sciences, the anticipated growth of sequencing promises unprecedented data on natural sequence diversity. Protein language modeling @@ -63,7 +61,6 @@ can be identified by linear projections. Representation learning produces featur applications, enabling state-of-the-art supervised prediction of mutational effect and secondary structure and improving state-of-the-art features for long-range contact prediction.* - The abstract from "Language models of protein sequences at the scale of evolution enable accurate structure prediction" is diff --git a/docs/source/en/model_doc/evolla.md b/docs/source/en/model_doc/evolla.md index a39103a06d12..ea8605050599 100644 --- a/docs/source/en/model_doc/evolla.md +++ b/docs/source/en/model_doc/evolla.md @@ -25,7 +25,7 @@ Evolla is an advanced 80-billion-parameter protein-language generative model des The abstract from the paper is the following: -*Proteins, nature’s intricate molecular machines, are the products of billions of years of evolution and play fundamental roles in sustaining life. Yet, deciphering their molecular language - that is, understanding how protein sequences and structures encode and determine biological functions - remains a corner-stone challenge in modern biology. 
Here, we introduce Evolla, an 80 billion frontier protein-language generative model designed to decode the molecular language of proteins. By integrating information from protein sequences, structures, and user queries, Evolla generates precise and contextually nuanced insights into protein function. A key innovation of Evolla lies in its training on an unprecedented AI-generated dataset: 546 million protein question-answer pairs and 150 billion word tokens, designed to reflect the immense complexity and functional diversity of proteins. Post-pretraining, Evolla integrates Direct Preference Optimization (DPO) to refine the model based on preference signals and Retrieval-Augmented Generation (RAG) for external knowledge incorporation, improving response quality and relevance. To evaluate its performance, we propose a novel framework, Instructional Response Space (IRS), demonstrating that Evolla delivers expert-level insights, advancing research in proteomics and functional genomics while shedding light on the molecular logic encoded in proteins. The online demo is available at http://www.chat-protein.com/.* +*Proteins, nature's intricate molecular machines, are the products of billions of years of evolution and play fundamental roles in sustaining life. Yet, deciphering their molecular language - that is, understanding how protein sequences and structures encode and determine biological functions - remains a corner-stone challenge in modern biology. Here, we introduce Evolla, an 80 billion frontier protein-language generative model designed to decode the molecular language of proteins. By integrating information from protein sequences, structures, and user queries, Evolla generates precise and contextually nuanced insights into protein function. A key innovation of Evolla lies in its training on an unprecedented AI-generated dataset: 546 million protein question-answer pairs and 150 billion word tokens, designed to reflect the immense complexity and functional diversity of proteins. Post-pretraining, Evolla integrates Direct Preference Optimization (DPO) to refine the model based on preference signals and Retrieval-Augmented Generation (RAG) for external knowledge incorporation, improving response quality and relevance. To evaluate its performance, we propose a novel framework, Instructional Response Space (IRS), demonstrating that Evolla delivers expert-level insights, advancing research in proteomics and functional genomics while shedding light on the molecular logic encoded in proteins. The online demo is available at http://www.chat-protein.com/.* Examples: @@ -75,7 +75,6 @@ Tips: - This model was contributed by [Xibin Bayes Zhou](https://huggingface.co/XibinBayesZhou). - The original code can be found [here](https://github.com/westlake-repl/Evolla). - ## EvollaConfig [[autodoc]] EvollaConfig diff --git a/docs/source/en/model_doc/exaone4.md b/docs/source/en/model_doc/exaone4.md index 69d7ee0b2a81..9482f5be2c06 100644 --- a/docs/source/en/model_doc/exaone4.md +++ b/docs/source/en/model_doc/exaone4.md @@ -20,7 +20,7 @@ rendered properly in your Markdown viewer. ## Overview **[EXAONE 4.0](https://github.com/LG-AI-EXAONE/EXAONE-4.0)** model is the language model, which integrates a **Non-reasoning mode** and **Reasoning mode** to achieve both the excellent usability of [EXAONE 3.5](https://github.com/LG-AI-EXAONE/EXAONE-3.5) and the advanced reasoning abilities of [EXAONE Deep](https://github.com/LG-AI-EXAONE/EXAONE-Deep). 
To pave the way for the agentic AI era, EXAONE 4.0 incorporates essential features such as agentic tool use, and its multilingual capabilities are extended -to support Spanish in addition to English and Korean. +to support Spanish in addition to English and Korean. The EXAONE 4.0 model series consists of two sizes: a mid-size **32B** model optimized for high performance, and a small-size **1.2B** model designed for on-device applications. @@ -33,7 +33,6 @@ For more details, please refer to our [technical report](https://huggingface.co/ All model weights including quantized versions are available at [Huggingface Collections](https://huggingface.co/collections/LGAI-EXAONE/exaone-40-686b2e0069800c835ed48375). - ## Model Details ### Model Specifications @@ -57,7 +56,6 @@ All model weights including quantized versions are available at [Huggingface Col | Tied word embedding | False | True | | Knowledge cut-off | Nov. 2024 | Nov. 2024 | - ## Usage tips ### Non-reasoning mode @@ -206,4 +204,4 @@ print(tokenizer.decode(output[0])) ## Exaone4ForQuestionAnswering [[autodoc]] Exaone4ForQuestionAnswering - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/falcon3.md b/docs/source/en/model_doc/falcon3.md index 368a5457ab6d..3d79a4e225dd 100644 --- a/docs/source/en/model_doc/falcon3.md +++ b/docs/source/en/model_doc/falcon3.md @@ -30,5 +30,6 @@ Depth up-scaling for improved reasoning: Building on recent studies on the effec Knowledge distillation for better tiny models: To provide compact and efficient alternatives, we developed Falcon3-1B-Base and Falcon3-3B-Base by leveraging pruning and knowledge distillation techniques, using less than 100GT of curated high-quality data, thereby redefining pre-training efficiency. ## Resources + - [Blog post](https://huggingface.co/blog/falcon3) - [Models on Huggingface](https://huggingface.co/collections/tiiuae/falcon3-67605ae03578be86e4e87026) diff --git a/docs/source/en/model_doc/falcon_h1.md b/docs/source/en/model_doc/falcon_h1.md index 981c00bd626b..48a647cd3797 100644 --- a/docs/source/en/model_doc/falcon_h1.md +++ b/docs/source/en/model_doc/falcon_h1.md @@ -21,7 +21,6 @@ The [FalconH1](https://huggingface.co/blog/tiiuae/falcon-h1) model was developed This model was contributed by [DhiyaEddine](https://huggingface.co/DhiyaEddine), [ybelkada](https://huggingface.co/ybelkada), [JingweiZuo](https://huggingface.co/JingweiZuo), [IlyasChahed](https://huggingface.co/IChahed), and [MaksimVelikanov](https://huggingface.co/yellowvm). The original code can be found [here](https://github.com/tiiuae/Falcon-H1). - ## FalconH1Config | Model | Depth | Dim | Attn Heads | KV | Mamba Heads | d_head | d_state | Ctx Len | @@ -33,8 +32,6 @@ The original code can be found [here](https://github.com/tiiuae/Falcon-H1). | H1 7B | 44 | 3072 | 12 | 2 | 24 | 128 / 128 | 256 | 256K | | H1 34B | 72 | 5120 | 20 | 4 | 32 | 128 / 128 | 256 | 256K | - - [[autodoc]] FalconH1Config -*This model was released on 2025-07-09 and added to Hugging Face Transformers on 2025-09-15.* +*This model was released on 2025-07-09 and added to Hugging Face Transformers on 2025-09-18.*
PyTorch @@ -90,6 +89,7 @@ echo -e "Plants create energy through a process known as" | transformers-cli run Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends. The example below uses [torchao](../quantization/torchao) to only quantize the weights to 4-bits. + ```py #pip install torchao @@ -119,7 +119,6 @@ print(tokenizer.decode(output[0], skip_special_tokens=True)) ``` - ## FlexOlmoConfig [[autodoc]] FlexOlmoConfig diff --git a/docs/source/en/model_doc/florence2.md b/docs/source/en/model_doc/florence2.md index 77e8de10c31b..b7171e1faabd 100644 --- a/docs/source/en/model_doc/florence2.md +++ b/docs/source/en/model_doc/florence2.md @@ -138,21 +138,21 @@ print(parsed_answer) ## Notes - Florence-2 is a prompt-based model. You need to provide a task prompt to tell the model what to do. Supported tasks are: - - `` - - `` - - `` - - `` - - `` - - `` - - `` - - `` - - `` - - `` - - `` - - `` - - `` - - `` - - `` + - `` + - `` + - `` + - `` + - `` + - `` + - `` + - `` + - `` + - `` + - `` + - `` + - `` + - `` + - `` - The raw output of the model is a string that needs to be parsed. The [`Florence2Processor`] has a [`~Florence2Processor.post_process_generation`] method that can parse the string into a more usable format, like bounding boxes and labels for object detection. ## Resources diff --git a/docs/source/en/model_doc/fnet.md b/docs/source/en/model_doc/fnet.md index 79a4e9e4434d..e89a410b105b 100644 --- a/docs/source/en/model_doc/fnet.md +++ b/docs/source/en/model_doc/fnet.md @@ -46,8 +46,8 @@ This model was contributed by [gchhablani](https://huggingface.co/gchhablani). T ## Usage tips -The model was trained without an attention mask as it is based on Fourier Transform. The model was trained with -maximum sequence length 512 which includes pad tokens. Hence, it is highly recommended to use the same maximum +The model was trained without an attention mask as it is based on Fourier Transform. The model was trained with +maximum sequence length 512 which includes pad tokens. Hence, it is highly recommended to use the same maximum sequence length for fine-tuning and inference. ## Resources diff --git a/docs/source/en/model_doc/fsmt.md b/docs/source/en/model_doc/fsmt.md index 27c7d3a899c4..13a99ae40da7 100644 --- a/docs/source/en/model_doc/fsmt.md +++ b/docs/source/en/model_doc/fsmt.md @@ -41,7 +41,6 @@ This model was contributed by [stas](https://huggingface.co/stas). The original either. Its tokenizer is very similar to [`XLMTokenizer`] and the main model is derived from [`BartModel`]. - ## FSMTConfig [[autodoc]] FSMTConfig diff --git a/docs/source/en/model_doc/funnel.md b/docs/source/en/model_doc/funnel.md index 611e17fba8ce..57b011b9400c 100644 --- a/docs/source/en/model_doc/funnel.md +++ b/docs/source/en/model_doc/funnel.md @@ -67,7 +67,6 @@ This model was contributed by [sgugger](https://huggingface.co/sgugger). 
The ori - [Masked language modeling task guide](../tasks/masked_language_modeling) - [Multiple choice task guide](../tasks/multiple_choice) - ## FunnelConfig [[autodoc]] FunnelConfig diff --git a/docs/source/en/model_doc/fuyu.md b/docs/source/en/model_doc/fuyu.md index 140216e2abc7..34202b022f7e 100644 --- a/docs/source/en/model_doc/fuyu.md +++ b/docs/source/en/model_doc/fuyu.md @@ -40,7 +40,6 @@ Finetuning the model in `float16` is not recommended and known to produce `nan`, - Tips: - To convert the model, you need to clone the original repository using `git clone https://github.com/persimmon-ai-labs/adept-inference`, then get the checkpoints: @@ -55,10 +54,12 @@ python src/transformers/models/fuyu/convert_fuyu_weights_to_hf.py --input_dir / ``` For the chat model: + ```bash wget https://axtkn4xl5cip.objectstorage.us-phoenix-1.oci.customer-oci.com/n/axtkn4xl5cip/b/adept-public-data/o/8b_chat_model_release.tar tar -xvf 8b_base_model_release.tar ``` + Then, model can be loaded via: ```py @@ -99,7 +100,6 @@ The `LlamaTokenizer` is used as it is a standard wrapper around sentencepiece. - The authors suggest to use the following prompt for image captioning: `f"Generate a coco-style caption.\\n"` - ## FuyuConfig [[autodoc]] FuyuConfig diff --git a/docs/source/en/model_doc/gemma.md b/docs/source/en/model_doc/gemma.md index d22d28d41c4b..f1c088caf300 100644 --- a/docs/source/en/model_doc/gemma.md +++ b/docs/source/en/model_doc/gemma.md @@ -33,7 +33,6 @@ The instruction-tuned variant was fine-tuned with supervised learning on instruc You can find all the original Gemma checkpoints under the [Gemma](https://huggingface.co/collections/google/gemma-release-65d5efbccdbb8c4202ec078b) release. - > [!TIP] > Click on the Gemma models in the right sidebar for more examples of how to apply Gemma to different language tasks. @@ -163,7 +162,6 @@ visualizer("LLMs generate text through a process known as") [[autodoc]] GemmaTokenizer - ## GemmaTokenizerFast [[autodoc]] GemmaTokenizerFast diff --git a/docs/source/en/model_doc/gemma2.md b/docs/source/en/model_doc/gemma2.md index 680de41d0380..f9189b5d3a20 100644 --- a/docs/source/en/model_doc/gemma2.md +++ b/docs/source/en/model_doc/gemma2.md @@ -40,7 +40,6 @@ The example below demonstrates how to chat with the model with [`Pipeline`] or t - ```python import torch from transformers import pipeline @@ -81,9 +80,10 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True)) -``` +```bash echo -e "Explain quantum computing simply." | transformers run --task text-generation --model google/gemma-2-2b --device 0 ``` + @@ -113,7 +113,6 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True)) Use the [AttentionMaskVisualizer](https://github.com/huggingface/transformers/blob/beb9b5b02246b9b7ee81ddf938f93f44cfeaad19/src/transformers/utils/attention_visualizer.py#L139) to better understand what tokens the model can and cannot attend to. - ```python from transformers.utils.attention_visualizer import AttentionMaskVisualizer visualizer = AttentionMaskVisualizer("google/gemma-2b") diff --git a/docs/source/en/model_doc/gemma3.md b/docs/source/en/model_doc/gemma3.md index c14b79080fcd..3c69cc1604ff 100644 --- a/docs/source/en/model_doc/gemma3.md +++ b/docs/source/en/model_doc/gemma3.md @@ -195,6 +195,7 @@ visualizer("What is shown in this image?") }, ] ``` + - Text passed to the processor should have a `` token wherever an image should be inserted. - The processor has its own [`~ProcessorMixin.apply_chat_template`] method to convert chat messages to model inputs. 
- By default, images aren't cropped and only the base image is forwarded to the model. In high resolution images or images with non-square aspect ratios, artifacts can result because the vision encoder uses a fixed resolution of 896x896. To prevent these artifacts and improve performance during inference, set `do_pan_and_scan=True` to crop the image into multiple smaller patches and concatenate them with the base image embedding. You can disable pan and scan for faster inference. @@ -209,6 +210,7 @@ visualizer("What is shown in this image?") + do_pan_and_scan=True, ).to(model.device) ``` + - For Gemma-3 1B checkpoint trained in text-only mode, use [`AutoModelForCausalLM`] instead. ```py diff --git a/docs/source/en/model_doc/gemma3n.md b/docs/source/en/model_doc/gemma3n.md index b43379cf3fd4..8012ed675a2a 100644 --- a/docs/source/en/model_doc/gemma3n.md +++ b/docs/source/en/model_doc/gemma3n.md @@ -121,9 +121,9 @@ echo -e "Plants create energy through a process known as" | transformers run --t ## Notes -- Use [`Gemma3nForConditionalGeneration`] for image-audio-and-text, image-and-text, image-and-audio, audio-and-text, +- Use [`Gemma3nForConditionalGeneration`] for image-audio-and-text, image-and-text, image-and-audio, audio-and-text, image-only and audio-only inputs. -- Gemma 3n supports multiple images per input, but make sure the images are correctly batched before passing them to +- Gemma 3n supports multiple images per input, but make sure the images are correctly batched before passing them to the processor. Each batch should be a list of one or more images. ```py @@ -147,11 +147,12 @@ echo -e "Plants create energy through a process known as" | transformers run --t }, ] ``` -- Text passed to the processor should have a `` token wherever an image should be inserted. -- Gemma 3n accept at most one target audio clip per input, though multiple audio clips can be provided in few-shot + +- Text passed to the processor should have a `` token wherever an image should be inserted. +- Gemma 3n accept at most one target audio clip per input, though multiple audio clips can be provided in few-shot prompts, for example. -- Text passed to the processor should have a `` token wherever an audio clip should be inserted. -- The processor has its own [`~ProcessorMixin.apply_chat_template`] method to convert chat messages to model inputs. +- Text passed to the processor should have a `` token wherever an audio clip should be inserted. +- The processor has its own [`~ProcessorMixin.apply_chat_template`] method to convert chat messages to model inputs. ## Gemma3nAudioFeatureExtractor diff --git a/docs/source/en/model_doc/git.md b/docs/source/en/model_doc/git.md index a2aa0901b21f..06a65a6dd896 100644 --- a/docs/source/en/model_doc/git.md +++ b/docs/source/en/model_doc/git.md @@ -81,4 +81,4 @@ The resource should ideally demonstrate something new instead of duplicating an ## GitForCausalLM [[autodoc]] GitForCausalLM - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/glm.md b/docs/source/en/model_doc/glm.md index ca50c32da21b..87daea7289a9 100644 --- a/docs/source/en/model_doc/glm.md +++ b/docs/source/en/model_doc/glm.md @@ -53,7 +53,6 @@ Tips: - This model was contributed by [THUDM](https://huggingface.co/THUDM). The most recent code can be found [here](https://github.com/thudm/GLM-4). 
- ## Usage tips `GLM-4` can be found on the [Huggingface Hub](https://huggingface.co/collections/THUDM/glm-4-665fcf188c414b03c2f7e3b7) diff --git a/docs/source/en/model_doc/glm4.md b/docs/source/en/model_doc/glm4.md index a10926bd5a09..05786d8096fe 100644 --- a/docs/source/en/model_doc/glm4.md +++ b/docs/source/en/model_doc/glm4.md @@ -21,12 +21,12 @@ rendered properly in your Markdown viewer. The GLM family welcomes new members [GLM-4-0414](https://huggingface.co/papers/2406.12793) series models. -The **GLM-4-32B-0414** series models, featuring 32 billion parameters. Its performance is comparable to OpenAI’s GPT -series and DeepSeek’s V3/R1 series. It also supports very user-friendly local deployment features. GLM-4-32B-Base-0414 +The **GLM-4-32B-0414** series models, featuring 32 billion parameters. Its performance is comparable to OpenAI's GPT +series and DeepSeek's V3/R1 series. It also supports very user-friendly local deployment features. GLM-4-32B-Base-0414 was pre-trained on 15T of high-quality data, including substantial reasoning-type synthetic data. This lays the foundation for subsequent reinforcement learning extensions. In the post-training stage, we employed human preference alignment for dialogue scenarios. Additionally, using techniques like rejection sampling and reinforcement learning, we -enhanced the model’s performance in instruction following, engineering code, and function calling, thus strengthening +enhanced the model's performance in instruction following, engineering code, and function calling, thus strengthening the atomic capabilities required for agent tasks. GLM-4-32B-0414 achieves good results in engineering code, Artifact generation, function calling, search-based Q&A, and report generation. In particular, on several benchmarks, such as code generation or specific Q&A tasks, GLM-4-32B-Base-0414 achieves comparable performance with those larger models like diff --git a/docs/source/en/model_doc/glm4v.md b/docs/source/en/model_doc/glm4v.md index be78c73b3fb4..1f80d4b2584e 100644 --- a/docs/source/en/model_doc/glm4v.md +++ b/docs/source/en/model_doc/glm4v.md @@ -75,6 +75,7 @@ messages = [ ] pipe(text=messages,max_new_tokens=20, return_full_text=False) ``` + @@ -123,6 +124,7 @@ output_text = processor.batch_decode( ) print(output_text) ``` + diff --git a/docs/source/en/model_doc/glm4v_moe.md b/docs/source/en/model_doc/glm4v_moe.md index 0388cc9eb61d..c814fdb5becd 100644 --- a/docs/source/en/model_doc/glm4v_moe.md +++ b/docs/source/en/model_doc/glm4v_moe.md @@ -35,6 +35,7 @@ Through our open-source work, we aim to explore the technological frontier toget ![bench_45](https://raw.githubusercontent.com/zai-org/GLM-V/refs/heads/main/resources/bench_45v.jpeg) Beyond benchmark performance, GLM-4.5V focuses on real-world usability. Through efficient hybrid training, it can handle diverse types of visual content, enabling full-spectrum vision reasoning, including: + - **Image reasoning** (scene understanding, complex multi-image analysis, spatial recognition) - **Video understanding** (long video segmentation and event recognition) - **GUI tasks** (screen reading, icon recognition, desktop operation assistance) diff --git a/docs/source/en/model_doc/got_ocr2.md b/docs/source/en/model_doc/got_ocr2.md index 026273aa158b..f8d6d69b0f6d 100644 --- a/docs/source/en/model_doc/got_ocr2.md +++ b/docs/source/en/model_doc/got_ocr2.md @@ -34,7 +34,6 @@ alt="drawing" width="600"/> GOT-OCR2 training stages. Taken from the original paper. 
- Tips: GOT-OCR2 works on a wide range of tasks, including plain document OCR, scene text OCR, formatted document OCR, and even OCR for tables, charts, mathematical formulas, geometric shapes, molecular formulas and sheet music. While this implementation of the model will only output plain text, the outputs can be further processed to render the desired format, with packages like `pdftex`, `mathpix`, `matplotlib`, `tikz`, `verovio` or `pyecharts`. @@ -129,7 +128,6 @@ GOT-OCR2 can also generate formatted text, such as markdown or LaTeX. Here is an Although it might be reasonable in most cases to use a “for loop” for multi-page processing, some text data with formatting across several pages make it necessary to process all pages at once. GOT introduces a multi-page OCR (without “for loop”) feature, where multiple pages can be processed by the model at once, with the output being one continuous text. Here is an example of how to process multiple pages at once: - ```python >>> import torch >>> from transformers import AutoProcessor, AutoModelForImageTextToText, infer_device @@ -254,6 +252,7 @@ Here is an example of how to process sheet music: >>> with open("output.svg", "w") as f: >>> f.write(svg) ``` + drawing @@ -285,4 +284,3 @@ alt="drawing" width="600"/> [[autodoc]] GotOcr2ForConditionalGeneration - forward - diff --git a/docs/source/en/model_doc/gpt2.md b/docs/source/en/model_doc/gpt2.md index 1645a92f6346..2740bfb33393 100644 --- a/docs/source/en/model_doc/gpt2.md +++ b/docs/source/en/model_doc/gpt2.md @@ -23,7 +23,6 @@ rendered properly in your Markdown viewer.
- # GPT-2 [GPT-2](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) is a scaled up version of GPT, a causal transformer language model, with 10x more parameters and training data. The model was pretrained on a 40GB dataset to predict the next word in a sequence based on all the previous words. This approach enabled the model to perform many downstream tasks in a zero-shot setting. The blog post released by OpenAI can be found [here](https://openai.com/index/better-language-models/). @@ -47,6 +46,7 @@ from transformers import pipeline pipeline = pipeline(task="text-generation", model="openai-community/gpt2", dtype=torch.float16, device=0) pipeline("Hello, I'm a language model") ``` +
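The same generation can be run with the [`AutoModel`] classes. The snippet below is a minimal sketch (dtype, device placement, and generation settings are assumptions, not taken from this page):

```python
# Minimal sketch: GPT-2 generation with AutoModelForCausalLM; settings here are illustrative.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", dtype=torch.float16, device_map="auto")

inputs = tokenizer("Hello, I'm a language model", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=20, do_sample=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```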
@@ -75,7 +75,7 @@ echo -e "Hello, I'm a language model" | transformers run --task text-generation

One can also serve the model using vLLM with the `transformers backend`.

-```
+```bash
vllm serve openai-community/gpt2 --model-impl transformers
```

diff --git a/docs/source/en/model_doc/gpt_bigcode.md b/docs/source/en/model_doc/gpt_bigcode.md
index a16536cbbe5c..26764c38356b 100644
--- a/docs/source/en/model_doc/gpt_bigcode.md
+++ b/docs/source/en/model_doc/gpt_bigcode.md
@@ -36,6 +36,7 @@ The model is an optimized [GPT2 model](https://huggingface.co/docs/transformers/
## Implementation details

The main differences compared to GPT2.
+
- Added support for Multi-Query Attention.
- Use `gelu_pytorch_tanh` instead of classic `gelu`.
- Avoid unnecessary synchronizations (this has since been added to GPT2 in #20061, but wasn't in the reference codebase).
@@ -47,7 +48,6 @@ The main differences compared to GPT2.
- Merge the key and value caches into one (this changes the format of layer_past/ present, does it risk creating problems?)
- Use the memory layout (self.num_heads, 3, self.head_dim) instead of `(3, self.num_heads, self.head_dim)` for the QKV tensor with MHA. (prevents an overhead with the merged key and values, but makes the checkpoints incompatible with the original openai-community/gpt2 model).
-
You can read more about the optimizations in the [original pull request](https://github.com/huggingface/transformers/pull/22575)

> [!NOTE]
@@ -91,7 +91,6 @@ Below is an expected speedup diagram that compares pure inference time between th


-
## GPTBigCodeConfig

[[autodoc]] GPTBigCodeConfig
diff --git a/docs/source/en/model_doc/gpt_neo.md b/docs/source/en/model_doc/gpt_neo.md
index f3de04d0e550..b0d13cf780b3 100644
--- a/docs/source/en/model_doc/gpt_neo.md
+++ b/docs/source/en/model_doc/gpt_neo.md
@@ -22,12 +22,10 @@ rendered properly in your Markdown viewer.


-
## GPT-Neo

[GPT-Neo](https://zenodo.org/records/5297715) is an open-source alternative to GPT-2 and GPT-3 models, built with Mesh TensorFlow for TPUs. GPT-Neo uses local attention in every other layer for more efficiency. It is trained on the [Pile](https://huggingface.co/datasets/EleutherAI/pile), a diverse dataset consisting of 22 smaller high-quality datasets. The original github repository can be found [here](https://github.com/EleutherAI/gpt-neo/tree/v1.1)

-
You can find all the original GPT-Neo checkpoints under the [EleutherAI](https://huggingface.co/EleutherAI?search_models=gpt-neo) organization.

> [!TIP]
@@ -45,6 +43,7 @@ from transformers import pipeline
pipeline = pipeline(task="text-generation", model="EleutherAI/gpt-neo-1.3B", dtype=torch.float16, device=0)
pipeline("Hello, I'm a language model")
```
+

diff --git a/docs/source/en/model_doc/gpt_neox.md b/docs/source/en/model_doc/gpt_neox.md
index a24fc6aa1d71..fb2ff7093040 100644
--- a/docs/source/en/model_doc/gpt_neox.md
+++ b/docs/source/en/model_doc/gpt_neox.md
@@ -71,7 +71,7 @@ The `generate()` method can be used to generate text using GPT Neo model.

Flash Attention 2 is a faster, optimized version of the model.

-### Installation 
+### Installation

First, check whether your hardware is compatible with Flash Attention 2. The latest list of compatible hardware can be found in the [official documentation](https://github.com/Dao-AILab/flash-attention#installation-and-features).
If your hardware is not compatible with Flash Attention 2, you can still benefit from attention kernel optimisations through Better Transformer support covered [above](https://huggingface.co/docs/transformers/main/en/model_doc/bark#using-better-transformer).

@@ -92,7 +92,6 @@ model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/gpt-neox-20b", dtype=torc
...
```

-
### Expected speedups

Below is an expected speedup diagram that compares pure inference time between the native implementation in transformers using `stockmark/gpt-neox-japanese-1.4b` checkpoint and the Flash Attention 2 version of the model using a sequence length of 2048.

@@ -101,7 +100,6 @@ Below is an expected speedup diagram that compares pure inference time between t


-
## Using Scaled Dot Product Attention (SDPA)
PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
@@ -162,7 +160,6 @@ following speedups during training and inference.
| 4 | 1024 | 11.765 | 11.303 | 4.09 | 2558.96 | 2546.04 | 0.508 |
| 4 | 2048 | 19.568 | 17.735 | 10.33 | 4175.5 | 4165.26 | 0.246 |

-
## Resources

- [Causal language modeling task guide](../tasks/language_modeling)
diff --git a/docs/source/en/model_doc/gpt_neox_japanese.md b/docs/source/en/model_doc/gpt_neox_japanese.md
index 7b22484b9a76..bf786f7561d4 100644
--- a/docs/source/en/model_doc/gpt_neox_japanese.md
+++ b/docs/source/en/model_doc/gpt_neox_japanese.md
@@ -27,8 +27,6 @@ rendered properly in your Markdown viewer.
GPT-NeoX-Japanese, a Japanese language model based on [GPT-NeoX](./gpt_neox). Japanese uses three types of characters (hiragana, katakana, kanji) and has a huge vocabulary. This model uses [BPEEncoder V2](https://github.com/tanreinama/Japanese-BPEEncoder_V2), a sub-word tokenizer to handle the different characters.
-
-
The model also removes some bias parameters for better performance.

You can find all the original GPT-NeoX-Japanese checkpoints under the [ABEJA](https://huggingface.co/abeja/models?search=gpt-neo-x) organization.
diff --git a/docs/source/en/model_doc/gpt_oss.md b/docs/source/en/model_doc/gpt_oss.md
index 136ebeb29570..60741d8473fa 100644
--- a/docs/source/en/model_doc/gpt_oss.md
+++ b/docs/source/en/model_doc/gpt_oss.md
@@ -35,13 +35,14 @@ The abstract from the paper is the following:

**

Tips:

+- **Attention Sinks with Flex Attention**: When using flex attention, attention sinks require special handling. Unlike standard attention implementations, where sinks can be added directly to the attention scores, the flex attention `score_mod` function operates on individual score elements rather than the full attention matrix. Therefore, the attention sink renormalization has to be applied after the flex attention computation, by renormalizing the outputs using the log-sum-exp (LSE) values returned by flex attention.
+
This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/).
The original code can be found [here]().
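To make the renormalization described in the tip above concrete, here is a minimal, self-contained sketch. It is not the Transformers implementation: the shapes are made up and `sinks` stands in for the model's learned per-head sink logits.

```python
# Illustrative only: fold attention sinks into flex attention outputs after the fact,
# using the log-sum-exp (LSE) that flex_attention can return alongside the output.
import torch
from torch.nn.attention.flex_attention import flex_attention

batch, heads, q_len, kv_len, head_dim = 1, 4, 8, 8, 16
query = torch.randn(batch, heads, q_len, head_dim)
key = torch.randn(batch, heads, kv_len, head_dim)
value = torch.randn(batch, heads, kv_len, head_dim)
sinks = torch.randn(heads)  # stand-in for the learned per-head sink logits

attn_out, lse = flex_attention(query, key, value, return_lse=True)  # lse: (batch, heads, q_len)

# Adding a sink logit s to the softmax denominator scales each row by
# exp(lse) / (exp(lse) + exp(s)) = sigmoid(lse - s), so it can be applied post hoc.
renorm = torch.sigmoid(lse - sinks.view(1, -1, 1))
attn_out = attn_out * renorm.unsqueeze(-1)
```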
- ## GptOssConfig [[autodoc]] GptOssConfig diff --git a/docs/source/en/model_doc/gptj.md b/docs/source/en/model_doc/gptj.md index 59e84daea5c5..7b81ee12d270 100644 --- a/docs/source/en/model_doc/gptj.md +++ b/docs/source/en/model_doc/gptj.md @@ -133,6 +133,7 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h - [`GPTJForCausalLM`] is supported by this [causal language modeling example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#gpt-2gpt-and-causal-language-modeling), [text generation example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-generation), and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb). **Documentation resources** + - [Text classification task guide](../tasks/sequence_classification) - [Question answering task guide](../tasks/question_answering) - [Causal language modeling task guide](../tasks/language_modeling) diff --git a/docs/source/en/model_doc/granite.md b/docs/source/en/model_doc/granite.md index 3f99caf7f685..475021c37168 100644 --- a/docs/source/en/model_doc/granite.md +++ b/docs/source/en/model_doc/granite.md @@ -15,7 +15,6 @@ rendered properly in your Markdown viewer. --> *This model was released on 2024-08-23 and added to Hugging Face Transformers on 2024-08-27.* -
PyTorch FlashAttention @@ -69,12 +68,14 @@ inputs = tokenizer("Explain quantum computing in simple terms", return_tensors=" outputs = model.generate(**inputs, max_length=50, cache_implementation="static") print(tokenizer.decode(outputs[0], skip_special_tokens=True)) ``` + ```python echo -e "Explain quantum computing simply." | transformers-cli run --task text-generation --model ibm-granite/granite-3.3-8b-instruct --device 0 ``` + diff --git a/docs/source/en/model_doc/granite_speech.md b/docs/source/en/model_doc/granite_speech.md index 5de42ff993f8..1d05ee346b67 100644 --- a/docs/source/en/model_doc/granite_speech.md +++ b/docs/source/en/model_doc/granite_speech.md @@ -32,13 +32,12 @@ The [Granite Speech](https://huggingface.co/papers/2505.08699) model ([blog post 4. LoRA adapter(s): The Granite Speech model contains a modality specific LoRA, which will be enabled when audio features are provided, and disabled otherwise. - Note that most of the aforementioned components are implemented generically to enable compatibility and potential integration with other model architectures in transformers. - This model was contributed by [Alexander Brooks](https://huggingface.co/abrooks9944), [Avihu Dekel](https://huggingface.co/Avihu), and [George Saon](https://huggingface.co/gsaon). ## Usage tips + - This model bundles its own LoRA adapter, which will be automatically loaded and enabled/disabled as needed during inference calls. Be sure to install [PEFT](https://github.com/huggingface/peft) to ensure the LoRA is correctly applied! @@ -47,22 +46,18 @@ This model was contributed by [Alexander Brooks](https://huggingface.co/abrooks9 [[autodoc]] GraniteSpeechConfig - ## GraniteSpeechEncoderConfig [[autodoc]] GraniteSpeechEncoderConfig - ## GraniteSpeechProcessor [[autodoc]] GraniteSpeechProcessor - ## GraniteSpeechFeatureExtractor [[autodoc]] GraniteSpeechFeatureExtractor - ## GraniteSpeechForConditionalGeneration [[autodoc]] GraniteSpeechForConditionalGeneration diff --git a/docs/source/en/model_doc/granitemoe.md b/docs/source/en/model_doc/granitemoe.md index 71c266a76b51..32616c07a289 100644 --- a/docs/source/en/model_doc/granitemoe.md +++ b/docs/source/en/model_doc/granitemoe.md @@ -65,7 +65,6 @@ for i in output: This model was contributed by [mayank-mishra](https://huggingface.co/mayank-mishra). - ## GraniteMoeConfig [[autodoc]] GraniteMoeConfig diff --git a/docs/source/en/model_doc/granitemoehybrid.md b/docs/source/en/model_doc/granitemoehybrid.md index 27b6e85d9e95..cb3db122e65d 100644 --- a/docs/source/en/model_doc/granitemoehybrid.md +++ b/docs/source/en/model_doc/granitemoehybrid.md @@ -19,10 +19,8 @@ rendered properly in your Markdown viewer. ## Overview - The [GraniteMoeHybrid](https://www.ibm.com/new/announcements/ibm-granite-4-0-tiny-preview-sneak-peek) model builds on top of GraniteMoeSharedModel and Bamba. Its decoding layers consist of state space layers or MoE attention layers with shared experts. By default, the attention layers do not use positional encoding. - ```python from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/docs/source/en/model_doc/granitemoeshared.md b/docs/source/en/model_doc/granitemoeshared.md index d09ab5766faa..9db702c9f705 100644 --- a/docs/source/en/model_doc/granitemoeshared.md +++ b/docs/source/en/model_doc/granitemoeshared.md @@ -19,7 +19,6 @@ rendered properly in your Markdown viewer. 
## Overview - The GraniteMoe model was proposed in [Power Scheduler: A Batch Size and Token Number Agnostic Learning Rate Scheduler](https://huggingface.co/papers/2408.13359) by Yikang Shen, Matthew Stallone, Mayank Mishra, Gaoyuan Zhang, Shawn Tan, Aditya Prasad, Adriana Meza Soria, David D. Cox and Rameswar Panda. Additionally this class GraniteMoeSharedModel adds shared experts for Moe. @@ -51,7 +50,6 @@ for i in output: This HF implementation is contributed by [Mayank Mishra](https://huggingface.co/mayank-mishra), [Shawn Tan](https://huggingface.co/shawntan) and [Sukriti Sharma](https://huggingface.co/SukritiSharma). - ## GraniteMoeSharedConfig [[autodoc]] GraniteMoeSharedConfig @@ -64,4 +62,4 @@ This HF implementation is contributed by [Mayank Mishra](https://huggingface.co/ ## GraniteMoeSharedForCausalLM [[autodoc]] GraniteMoeSharedForCausalLM - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/granitevision.md b/docs/source/en/model_doc/granitevision.md index b138c66f79d8..b95982ee81f9 100644 --- a/docs/source/en/model_doc/granitevision.md +++ b/docs/source/en/model_doc/granitevision.md @@ -22,14 +22,17 @@ rendered properly in your Markdown viewer. The [Granite Vision](https://www.ibm.com/new/announcements/ibm-granite-3-1-powerful-performance-long-context-and-more) model is a variant of [LLaVA-NeXT](llava_next), leveraging a [Granite](granite) language model alongside a [SigLIP](SigLIP) visual encoder. It utilizes multiple concatenated vision hidden states as its image features, similar to [VipLlava](vipllava). It also uses a larger set of image grid pinpoints than the original LlaVa-NeXT models to support additional aspect ratios. Tips: + - This model is loaded into Transformers as an instance of LlaVA-Next. The usage and tips from [LLaVA-NeXT](llava_next) apply to this model as well. - You can apply the chat template on the tokenizer / processor in the same way as well. Example chat format: + ```bash "<|user|>\nWhat’s shown in this image?\n<|assistant|>\nThis image shows a red stop sign.<|end_of_text|><|user|>\nDescribe the image in more details.\n<|assistant|>\n" ``` Sample inference: + ```python from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration, infer_device diff --git a/docs/source/en/model_doc/helium.md b/docs/source/en/model_doc/helium.md index ba06feb18fbe..10748f27be43 100644 --- a/docs/source/en/model_doc/helium.md +++ b/docs/source/en/model_doc/helium.md @@ -27,7 +27,6 @@ rendered properly in your Markdown viewer. Helium was proposed in [Announcing Helium-1 Preview](https://kyutai.org/2025/01/13/helium.html) by the Kyutai Team. - Helium-1 preview is a lightweight language model with 2B parameters, targeting edge and mobile devices. It supports the following languages: English, French, German, Italian, Portuguese, Spanish. @@ -36,9 +35,6 @@ It supports the following languages: English, French, German, Italian, Portugues - **Language(s) (NLP):** English, French, German, Italian, Portuguese, Spanish - **License:** CC-BY 4.0 - - - ## Evaluation @@ -47,7 +43,7 @@ It supports the following languages: English, French, German, Italian, Portugues -The model was evaluated on MMLU, TriviaQA, NaturalQuestions, ARC Easy & Challenge, Open Book QA, Common Sense QA, +The model was evaluated on MMLU, TriviaQA, NaturalQuestions, ARC Easy & Challenge, Open Book QA, Common Sense QA, Physical Interaction QA, Social Interaction QA, HellaSwag, WinoGrande, Multilingual Knowledge QA, FLORES 200. 
#### Metrics @@ -92,7 +88,6 @@ We report BLEU on FLORES. || HS | 58.6 | 40.8 | 60.5 | 61.1 | 51.4 | || MKQA | 16.0 | 7.9 | 18.5 | 20.6 | 10.6 | - ## Technical Specifications ### Model Architecture and Objective @@ -110,12 +105,11 @@ Tips: - This model was contributed by [Laurent Mazare](https://huggingface.co/lmz) - ## Usage tips `Helium` can be found on the [Huggingface Hub](https://huggingface.co/models?other=helium) -In the following, we demonstrate how to use `helium-1-preview` for the inference. +In the following, we demonstrate how to use `helium-1-preview` for the inference. ```python >>> from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/docs/source/en/model_doc/herbert.md b/docs/source/en/model_doc/herbert.md index 718a1a3df0bb..aa6a4bf96adf 100644 --- a/docs/source/en/model_doc/herbert.md +++ b/docs/source/en/model_doc/herbert.md @@ -45,7 +45,6 @@ models.* This model was contributed by [rmroczkowski](https://huggingface.co/rmroczkowski). The original code can be found [here](https://github.com/allegro/HerBERT). - ## Usage example ```python diff --git a/docs/source/en/model_doc/hgnet_v2.md b/docs/source/en/model_doc/hgnet_v2.md index 7461a19a0327..8e7791ce71ea 100644 --- a/docs/source/en/model_doc/hgnet_v2.md +++ b/docs/source/en/model_doc/hgnet_v2.md @@ -81,14 +81,12 @@ print(f"The predicted class label is: {predicted_class_label}") [[autodoc]] HGNetV2Config - ## HGNetV2Backbone [[autodoc]] HGNetV2Backbone - forward - ## HGNetV2ForImageClassification [[autodoc]] HGNetV2ForImageClassification - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/hiera.md b/docs/source/en/model_doc/hiera.md index 9f4627dd53f1..b8fd9c141839 100644 --- a/docs/source/en/model_doc/hiera.md +++ b/docs/source/en/model_doc/hiera.md @@ -25,7 +25,7 @@ rendered properly in your Markdown viewer. Hiera was proposed in [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://huggingface.co/papers/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer -The paper introduces "Hiera," a hierarchical Vision Transformer that simplifies the architecture of modern hierarchical vision transformers by removing unnecessary components without compromising on accuracy or efficiency. Unlike traditional transformers that add complex vision-specific components to improve supervised classification performance, Hiera demonstrates that such additions, often termed "bells-and-whistles," are not essential for high accuracy. By leveraging a strong visual pretext task (MAE) for pretraining, Hiera retains simplicity and achieves superior accuracy and speed both in inference and training across various image and video recognition tasks. The approach suggests that spatial biases required for vision tasks can be effectively learned through proper pretraining, eliminating the need for added architectural complexity. +The paper introduces "Hiera," a hierarchical Vision Transformer that simplifies the architecture of modern hierarchical vision transformers by removing unnecessary components without compromising on accuracy or efficiency. Unlike traditional transformers that add complex vision-specific components to improve supervised classification performance, Hiera demonstrates that such additions, often termed "bells-and-whistles," are not essential for high accuracy. 
By leveraging a strong visual pretext task (MAE) for pretraining, Hiera retains simplicity and achieves superior accuracy and speed both in inference and training across various image and video recognition tasks. The approach suggests that spatial biases required for vision tasks can be effectively learned through proper pretraining, eliminating the need for added architectural complexity. The abstract from the paper is the following: diff --git a/docs/source/en/model_doc/hubert.md b/docs/source/en/model_doc/hubert.md index 18c8062da36e..5a072214406c 100644 --- a/docs/source/en/model_doc/hubert.md +++ b/docs/source/en/model_doc/hubert.md @@ -115,6 +115,7 @@ print(transcription[0]) - HuBERT models expect raw audio input as a 1D float array sampled at 16kHz. - If you want to use a `head_mask`, use the model with `attn_implementation="eager"`. + ```python model = HubertModel.from_pretrained("facebook/hubert-base-ls960", attn_implementation="eager") ``` diff --git a/docs/source/en/model_doc/hunyuan_v1_dense.md b/docs/source/en/model_doc/hunyuan_v1_dense.md index f87ca422c8ed..84f9e44e5225 100644 --- a/docs/source/en/model_doc/hunyuan_v1_dense.md +++ b/docs/source/en/model_doc/hunyuan_v1_dense.md @@ -13,6 +13,7 @@ specific language governing permissions and limitations under the License. rendered properly in your Markdown viewer. --> +*This model was released on {release_date} and added to Hugging Face Transformers on 2025-08-22.* # HunYuanDenseV1 @@ -24,7 +25,6 @@ To be released with the official model launch. To be released with the official model launch. - ## Usage tips To be released with the official model launch. @@ -47,4 +47,3 @@ To be released with the official model launch. [[autodoc]] HunYuanDenseV1ForSequenceClassification - forward - diff --git a/docs/source/en/model_doc/hunyuan_v1_moe.md b/docs/source/en/model_doc/hunyuan_v1_moe.md index c66846cc0881..e9bff74fe1bc 100644 --- a/docs/source/en/model_doc/hunyuan_v1_moe.md +++ b/docs/source/en/model_doc/hunyuan_v1_moe.md @@ -13,6 +13,7 @@ specific language governing permissions and limitations under the License. rendered properly in your Markdown viewer. --> +*This model was released on {release_date} and added to Hugging Face Transformers on 2025-08-22.* # HunYuanMoEV1 @@ -24,7 +25,6 @@ To be released with the official model launch. To be released with the official model launch. - ## Usage tips To be released with the official model launch. @@ -47,4 +47,3 @@ To be released with the official model launch. [[autodoc]] HunYuanMoEV1ForSequenceClassification - forward - diff --git a/docs/source/en/model_doc/idefics.md b/docs/source/en/model_doc/idefics.md index 6296e7226604..fdb6e5de4659 100644 --- a/docs/source/en/model_doc/idefics.md +++ b/docs/source/en/model_doc/idefics.md @@ -34,7 +34,6 @@ The abstract from the paper is the following: This model was contributed by [HuggingFaceM4](https://huggingface.co/HuggingFaceM4). The original code can be found [here](). (TODO: don't have a public link yet). - IDEFICS modeling code in Transformers is for finetuning and inferencing the pre-trained IDEFICS models. 
@@ -43,7 +42,6 @@ To train a new IDEFICS model from scratch use the m4 codebase (a link will be pr - ## IdeficsConfig [[autodoc]] IdeficsConfig diff --git a/docs/source/en/model_doc/idefics2.md b/docs/source/en/model_doc/idefics2.md index 63dd1ec8277d..696ad7c5d2bd 100644 --- a/docs/source/en/model_doc/idefics2.md +++ b/docs/source/en/model_doc/idefics2.md @@ -202,19 +202,16 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] Idefics2Config - ## Idefics2Model [[autodoc]] Idefics2Model - forward - ## Idefics2ForConditionalGeneration [[autodoc]] Idefics2ForConditionalGeneration - forward - ## Idefics2ImageProcessor [[autodoc]] Idefics2ImageProcessor - preprocess diff --git a/docs/source/en/model_doc/idefics3.md b/docs/source/en/model_doc/idefics3.md index b3e199e2b882..0c8f46a9aeef 100644 --- a/docs/source/en/model_doc/idefics3.md +++ b/docs/source/en/model_doc/idefics3.md @@ -45,6 +45,7 @@ If `do_resize` is set to `True`, the model resizes images so that the longest ed The default resizing behavior can be customized by passing a dictionary to the `size` parameter. For example, `{"longest_edge": 4 * 364}` is the default, but you can change it to a different value if needed. Here’s how to control resizing and set a custom size: + ```python image_processor = Idefics3ImageProcessor(do_resize=True, size={"longest_edge": 2 * 364}, max_image_size=364) ``` @@ -53,7 +54,6 @@ Additionally, the `max_image_size` parameter, which controls the size of each sq This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts) and [andimarafioti](https://huggingface.co/andito). - ## Idefics3Config [[autodoc]] Idefics3Config @@ -76,7 +76,6 @@ This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts) [[autodoc]] Idefics3ForConditionalGeneration - forward - ## Idefics3ImageProcessor [[autodoc]] Idefics3ImageProcessor - preprocess diff --git a/docs/source/en/model_doc/ijepa.md b/docs/source/en/model_doc/ijepa.md index 9d7c7874f1a5..a81e7c3ab281 100644 --- a/docs/source/en/model_doc/ijepa.md +++ b/docs/source/en/model_doc/ijepa.md @@ -31,10 +31,8 @@ You can find the original I-JEPA checkpoints under the [AI at Meta](https://hugg > [!TIP] > This model was contributed by [jmtzt](https://huggingface.co/jmtzt). - - > Click on the I-JEPA models in the right sidebar for more examples of how to apply I-JEPA to different image representation and classification tasks. The example below demonstrates how to extract image features with [`Pipeline`] or the [`AutoModel`] class. @@ -88,10 +86,10 @@ embed_2 = infer(image_2) similarity = cosine_similarity(embed_1, embed_2) print(similarity) ``` + - Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends. The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to 4-bits. 
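For reference, a minimal sketch of such a 4-bit load might look like the following (the checkpoint name and quantization settings are assumptions, not taken from this page):

```python
# Sketch: 4-bit I-JEPA load with bitsandbytes; checkpoint and settings are assumptions.
import torch
from transformers import AutoModel, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
model = AutoModel.from_pretrained(
    "facebook/ijepa_vitg16_22k",
    quantization_config=quantization_config,
    device_map="auto",
)
```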
@@ -142,4 +140,3 @@ print(similarity) [[autodoc]] IJepaForImageClassification - forward - diff --git a/docs/source/en/model_doc/informer.md b/docs/source/en/model_doc/informer.md index 7e79399cbc57..a9cea0f09cab 100644 --- a/docs/source/en/model_doc/informer.md +++ b/docs/source/en/model_doc/informer.md @@ -52,4 +52,4 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h ## InformerForPrediction [[autodoc]] InformerForPrediction - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/instructblip.md b/docs/source/en/model_doc/instructblip.md index b0669f1c065f..ac84a71d887e 100644 --- a/docs/source/en/model_doc/instructblip.md +++ b/docs/source/en/model_doc/instructblip.md @@ -59,7 +59,6 @@ The attributes can be obtained from model config, as `model.config.num_query_tok [[autodoc]] InstructBlipProcessor - ## InstructBlipVisionModel [[autodoc]] InstructBlipVisionModel @@ -78,4 +77,4 @@ The attributes can be obtained from model config, as `model.config.num_query_tok [[autodoc]] InstructBlipForConditionalGeneration - forward - - generate \ No newline at end of file + - generate diff --git a/docs/source/en/model_doc/instructblipvideo.md b/docs/source/en/model_doc/instructblipvideo.md index e34b454a1237..d4d868b7f90e 100644 --- a/docs/source/en/model_doc/instructblipvideo.md +++ b/docs/source/en/model_doc/instructblipvideo.md @@ -59,7 +59,6 @@ The attributes can be obtained from model config, as `model.config.num_query_tok [[autodoc]] InstructBlipVideoProcessor - ## InstructBlipVideoVideoProcessor [[autodoc]] InstructBlipVideoVideoProcessor diff --git a/docs/source/en/model_doc/internvl.md b/docs/source/en/model_doc/internvl.md index bf760fdbdd71..7e9fea7f4f20 100644 --- a/docs/source/en/model_doc/internvl.md +++ b/docs/source/en/model_doc/internvl.md @@ -15,7 +15,6 @@ rendered properly in your Markdown viewer. --> *This model was released on 2025-04-14 and added to Hugging Face Transformers on 2025-04-18.* -
PyTorch @@ -32,19 +31,14 @@ The abstract from the paper is the following: *We introduce InternVL3, a significant advancement in the InternVL series featuring a native multimodal pre-training paradigm. Rather than adapting a text-only large language model (LLM) into a multimodal large language model (MLLM) that supports visual inputs, InternVL3 jointly acquires multimodal and linguistic capabilities from both diverse multimodal data and pure-text corpora during a single pre-training stage. This unified training paradigm effectively addresses the complexities and alignment challenges commonly encountered in conventional post-hoc training pipelines for MLLMs. To further improve performance and scalability, InternVL3 incorporates variable visual position encoding (V2PE) to support extended multimodal contexts, employs advanced post-training techniques such as supervised fine-tuning (SFT) and mixed preference optimization (MPO), and adopts test-time scaling strategies alongside an optimized training infrastructure. Extensive empirical evaluations demonstrate that InternVL3 delivers superior performance across a wide range of multi-modal tasks. In particular, InternVL3-78B achieves a score of 72.2 on the MMMU benchmark, setting a new state-of-the-art among open-source MLLMs. Its capabilities remain highly competitive with leading proprietary models, including ChatGPT-4o, Claude 3.5 Sonnet, and Gemini 2.5 Pro, while also maintaining strong pure-language proficiency. In pursuit of open-science principles, we will publicly release both the training data and model weights to foster further research and development in next-generation MLLMs.* - drawing Overview of InternVL3 models architecture, which is the same as InternVL2.5. Taken from the original checkpoint. - - drawing Comparison of InternVL3 performance on OpenCompass against other SOTA VLLMs. Taken from the original checkpoint. - - This model was contributed by [yonigozlan](https://huggingface.co/yonigozlan). The original code can be found [here](https://github.com/OpenGVLab/InternVL). @@ -75,6 +69,7 @@ Here is how you can use the `image-text-to-text` pipeline to perform inference w >>> outputs[0]["generated_text"] 'The image showcases a vibrant scene of nature, featuring several flowers and a bee. \n\n1. **Foreground Flowers**: \n - The primary focus is on a large, pink cosmos flower with a prominent yellow center. The petals are soft and slightly r' ``` + ### Inference on a single image This example demonstrates how to perform inference on a single image with the InternVL models using chat templates. @@ -112,7 +107,6 @@ This example demonstrates how to perform inference on a single image with the In ### Text-only generation This example shows how to generate text using the InternVL model without providing any image input. 
- ```python >>> from transformers import AutoProcessor, AutoModelForImageTextToText >>> import torch diff --git a/docs/source/en/model_doc/jamba.md b/docs/source/en/model_doc/jamba.md index 0aa06b16e90f..f85d08c5f64d 100644 --- a/docs/source/en/model_doc/jamba.md +++ b/docs/source/en/model_doc/jamba.md @@ -75,6 +75,7 @@ input_ids = tokenizer("Plants create energy through a process known as", return_ output = model.generate(**input_ids, cache_implementation="static") print(tokenizer.decode(output[0], skip_special_tokens=True)) ``` + @@ -140,19 +141,16 @@ print(assistant_response) [[autodoc]] JambaConfig - ## JambaModel [[autodoc]] JambaModel - forward - ## JambaForCausalLM [[autodoc]] JambaForCausalLM - forward - ## JambaForSequenceClassification [[autodoc]] transformers.JambaForSequenceClassification diff --git a/docs/source/en/model_doc/jetmoe.md b/docs/source/en/model_doc/jetmoe.md index 059fb956ce23..3fca2c2d6764 100644 --- a/docs/source/en/model_doc/jetmoe.md +++ b/docs/source/en/model_doc/jetmoe.md @@ -27,15 +27,14 @@ rendered properly in your Markdown viewer. **JetMoe-8B** is an 8B Mixture-of-Experts (MoE) language model developed by [Yikang Shen](https://scholar.google.com.hk/citations?user=qff5rRYAAAAJ) and [MyShell](https://myshell.ai/). JetMoe project aims to provide a LLaMA2-level performance and efficient language model with a limited budget. -To achieve this goal, JetMoe uses a sparsely activated architecture inspired by the [ModuleFormer](https://huggingface.co/papers/2306.04640). +To achieve this goal, JetMoe uses a sparsely activated architecture inspired by the [ModuleFormer](https://huggingface.co/papers/2306.04640). Each JetMoe block consists of two MoE layers: Mixture of Attention Heads and Mixture of MLP Experts. Given the input tokens, it activates a subset of its experts to process them. -This sparse activation schema enables JetMoe to achieve much better training throughput than similar size dense models. +This sparse activation schema enables JetMoe to achieve much better training throughput than similar size dense models. The training throughput of JetMoe-8B is around 100B tokens per day on a cluster of 96 H100 GPUs with a straightforward 3-way pipeline parallelism strategy. This model was contributed by [Yikang Shen](https://huggingface.co/YikangS). - ## JetMoeConfig [[autodoc]] JetMoeConfig diff --git a/docs/source/en/model_doc/kosmos2_5.md b/docs/source/en/model_doc/kosmos2_5.md index 530f1d459ae7..911eea26debd 100644 --- a/docs/source/en/model_doc/kosmos2_5.md +++ b/docs/source/en/model_doc/kosmos2_5.md @@ -9,7 +9,7 @@ Unless required by applicable law or agreed to in writing, software distributed an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. --> -*This model was released on {release_date} and added to Hugging Face Transformers on 2025-08-19.* +*This model was released on 2023-09-20 and added to Hugging Face Transformers on 2025-08-19.*
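Circling back to the JetMoe description in the jetmoe.md hunk above: the sparsely activated blocks are exposed through the standard causal-LM API, so a minimal generation sketch looks like the following; the `jetmoe/jetmoe-8b` checkpoint id and the generation settings are assumptions for illustration.

```python
# Hedged sketch: greedy generation with JetMoe via the standard causal-LM API.
# The checkpoint id and generation settings are illustrative assumptions.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "jetmoe/jetmoe-8b"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, dtype=torch.bfloat16, device_map="auto")

inputs = tokenizer("The capital of France is", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=20, do_sample=False)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```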
@@ -19,7 +19,6 @@ specific language governing permissions and limitations under the License.
- # KOSMOS-2.5 The Kosmos-2.5 model was proposed in [KOSMOS-2.5: A Multimodal Literate Model](https://huggingface.co/papers/2309.11419/) by Microsoft. @@ -159,7 +158,6 @@ image.save("output.png")
- ## Chat version The authors also released Kosmos-2.5 Chat, which is a chat version optimized for document understanding. You can use it like so: diff --git a/docs/source/en/model_doc/kyutai_speech_to_text.md b/docs/source/en/model_doc/kyutai_speech_to_text.md index 30497e69594c..f3482c37ae05 100644 --- a/docs/source/en/model_doc/kyutai_speech_to_text.md +++ b/docs/source/en/model_doc/kyutai_speech_to_text.md @@ -15,10 +15,11 @@ rendered properly in your Markdown viewer. --> *This model was released on 2025-06-17 and added to Hugging Face Transformers on 2025-06-25.* -# Kyutai Speech-To-Text +# Kyutai Speech-To-Text ## Overview -[Kyutai STT](https://kyutai.org/next/stt) is a speech-to-text model architecture based on the [Mimi codec](https://huggingface.co/docs/transformers/en/model_doc/mimi), which encodes audio into discrete tokens in a streaming fashion, and a [Moshi-like](https://huggingface.co/docs/transformers/en/model_doc/moshi) autoregressive decoder. Kyutai’s lab has released two model checkpoints: +[Kyutai STT](https://kyutai.org/next/stt) is a speech-to-text model architecture based on the [Mimi codec](https://huggingface.co/docs/transformers/en/model_doc/mimi), which encodes audio into discrete tokens in a streaming fashion, and a [Moshi-like](https://huggingface.co/docs/transformers/en/model_doc/moshi) autoregressive decoder. Kyutai's lab has released two model checkpoints: + - [kyutai/stt-1b-en_fr](https://huggingface.co/kyutai/stt-1b-en_fr): a 1B-parameter model capable of transcribing both English and French - [kyutai/stt-2.6b-en](https://huggingface.co/kyutai/stt-2.6b-en): a 2.6B-parameter model focused solely on English, optimized for maximum transcription accuracy @@ -98,7 +99,6 @@ for output in decoded_outputs: This model was contributed by [Eustache Le Bihan](https://huggingface.co/eustlb). The original code can be found [here](https://github.com/kyutai-labs/moshi). - ## KyutaiSpeechToTextConfig [[autodoc]] KyutaiSpeechToTextConfig diff --git a/docs/source/en/model_doc/layoutlm.md b/docs/source/en/model_doc/layoutlm.md index 708a5bc1ab40..88dde323e299 100644 --- a/docs/source/en/model_doc/layoutlm.md +++ b/docs/source/en/model_doc/layoutlm.md @@ -116,7 +116,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h - Refer to this [notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb) for an example of how to fine-tune LayoutLM for token classification. - Read [Deploy LayoutLM with Hugging Face Inference Endpoints](https://www.philschmid.de/inference-endpoints-layoutlm) to learn how to deploy LayoutLM. - ## LayoutLMConfig [[autodoc]] LayoutLMConfig diff --git a/docs/source/en/model_doc/layoutlmv2.md b/docs/source/en/model_doc/layoutlmv2.md index c376c04ad76e..f74d3b4294ee 100644 --- a/docs/source/en/model_doc/layoutlmv2.md +++ b/docs/source/en/model_doc/layoutlmv2.md @@ -55,10 +55,12 @@ this https URL.* LayoutLMv2 depends on `detectron2`, `torchvision` and `tesseract`. Run the following to install them: + ```bash python -m pip install 'git+https://github.com/facebookresearch/detectron2.git' python -m pip install torchvision tesseract ``` + (If you are developing for LayoutLMv2, note that passing the doctests also requires the installation of these packages.) 
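Before the usage tips below, a quick sanity check that the extra LayoutLMv2 dependencies resolve can save a debugging round-trip; this snippet is an illustrative addition rather than part of the layoutlmv2.md page, and the OCR path additionally expects the Tesseract engine to be available on the system.

```python
# Hedged sanity check: confirm the LayoutLMv2 extras import cleanly.
# Illustrative only; not taken from the layoutlmv2.md page itself.
import detectron2
import torchvision

print("detectron2:", detectron2.__version__)
print("torchvision:", torchvision.__version__)
```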
## Usage tips @@ -145,7 +147,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h - See also: [Question answering task guide](../tasks/question_answering) - See also: [Document question answering task guide](../tasks/document_question_answering) - - A notebook on how to [finetune LayoutLMv2 for token-classification on CORD dataset](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLMv2/CORD/Fine_tuning_LayoutLMv2ForTokenClassification_on_CORD.ipynb). diff --git a/docs/source/en/model_doc/layoutlmv3.md b/docs/source/en/model_doc/layoutlmv3.md index 9bb75e7772b7..b9964fa3f86c 100644 --- a/docs/source/en/model_doc/layoutlmv3.md +++ b/docs/source/en/model_doc/layoutlmv3.md @@ -37,8 +37,8 @@ This model was contributed by [nielsr](https://huggingface.co/nielsr). The origi ## Usage tips - In terms of data processing, LayoutLMv3 is identical to its predecessor [LayoutLMv2](layoutlmv2), except that: - - images need to be resized and normalized with channels in regular RGB format. LayoutLMv2 on the other hand normalizes the images internally and expects the channels in BGR format. - - text is tokenized using byte-pair encoding (BPE), as opposed to WordPiece. + - images need to be resized and normalized with channels in regular RGB format. LayoutLMv2 on the other hand normalizes the images internally and expects the channels in BGR format. + - text is tokenized using byte-pair encoding (BPE), as opposed to WordPiece. Due to these differences in data preprocessing, one can use [`LayoutLMv3Processor`] which internally combines a [`LayoutLMv3ImageProcessor`] (for the image modality) and a [`LayoutLMv3Tokenizer`]/[`LayoutLMv3TokenizerFast`] (for the text modality) to prepare all data for the model. - Regarding usage of [`LayoutLMv3Processor`], we refer to the [usage guide](layoutlmv2#usage-layoutlmv2processor) of its predecessor. @@ -73,6 +73,7 @@ LayoutLMv3 is nearly identical to LayoutLMv2, so we've also included LayoutLMv2 - [Question answering task guide](../tasks/question_answering) **Document question answering** + - [Document question answering task guide](../tasks/document_question_answering) ## LayoutLMv3Config diff --git a/docs/source/en/model_doc/led.md b/docs/source/en/model_doc/led.md index 8a732ae85cff..b0d4f08943e9 100644 --- a/docs/source/en/model_doc/led.md +++ b/docs/source/en/model_doc/led.md @@ -89,6 +89,7 @@ print(tokenizer.decode(output[0], skip_special_tokens=True)) ```bash !echo -e "Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet. Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts." | transformers-cli run --task summarization --model allenai/led-base-16384 --device 0 ``` + diff --git a/docs/source/en/model_doc/lfm2.md b/docs/source/en/model_doc/lfm2.md index 3ea0936b96be..58f1d754588d 100644 --- a/docs/source/en/model_doc/lfm2.md +++ b/docs/source/en/model_doc/lfm2.md @@ -23,7 +23,7 @@ rendered properly in your Markdown viewer. 
## Overview -[LFM2](https://www.liquid.ai/blog/liquid-foundation-models-v2-our-second-series-of-generative-ai-models) represents a new generation of Liquid Foundation Models developed by [Liquid AI](https://liquid.ai/), specifically designed for edge AI and on-device deployment. +[LFM2](https://www.liquid.ai/blog/liquid-foundation-models-v2-our-second-series-of-generative-ai-models) represents a new generation of Liquid Foundation Models developed by [Liquid AI](https://liquid.ai/), specifically designed for edge AI and on-device deployment. The models are available in three sizes (350M, 700M, and 1.2B parameters) and are engineered to run efficiently on CPU, GPU, and NPU hardware, making them particularly well-suited for applications requiring low latency, offline operation, and privacy. @@ -82,4 +82,4 @@ print(tokenizer.decode(output[0], skip_special_tokens=False)) ## Lfm2ForCausalLM [[autodoc]] Lfm2ForCausalLM - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/lfm2_vl.md b/docs/source/en/model_doc/lfm2_vl.md new file mode 100644 index 000000000000..fb6b2ad8a4e2 --- /dev/null +++ b/docs/source/en/model_doc/lfm2_vl.md @@ -0,0 +1,98 @@ + +*This model was released on {release_date} and added to Hugging Face Transformers on 2025-09-18.* + +
+PyTorch +
+ +# LFM2-VL + +## Overview + +[LFM2-VL](https://www.liquid.ai/blog/lfm2-vl-efficient-vision-language-models) is the first series of vision-language foundation models developed by [Liquid AI](https://liquid.ai/). These multimodal models are designed for low-latency and device-aware deployment. LFM2-VL extends the LFM2 family of open-weight Liquid Foundation Models (LFMs) into the vision-language space, supporting both text and image inputs with variable resolutions. + +## Architecture + +LFM2-VL consists of three main components: a language model backbone, a vision encoder, and a multimodal projector. LFM2-VL builds upon the LFM2 backbone, inheriting from either LFM2-1.2B (for LFM2-VL-1.6B) or LFM2-350M (for LFM2-VL-450M). For the vision tower, LFM2-VL uses SigLIP2 NaFlex encoders to convert input images into token sequences. Two variants are implemented: + +* Shape-optimized (400M) for more fine-grained vision capabilities for LFM2-VL-1.6B +* Base (86M) for fast image processing for LFM2-VL-450M + +The encoder processes images at their native resolution up to 512×512 pixels, efficiently handling smaller images without upscaling and supporting non-standard aspect ratios without distortion. Larger images are split into non-overlapping square patches of 512×512 each, preserving detail. In LFM2-VL-1.6B, the model also receives a thumbnail (a small, downscaled version of the original image capturing the overall scene) to enhance global context understanding and alignment. Special tokens mark each patch’s position and indicate the thumbnail’s start. The multimodal connector is a 2-layer MLP with pixel unshuffle to reduce the image token count. + +## Example + +The following example shows how to generate an answer using the `AutoModelForImageTextToText` class.
+ +```python +from transformers import AutoProcessor, AutoModelForImageTextToText + +# Load model and processor +model_id = "LiquidAI/LFM2-VL-1.6B" +model = AutoModelForImageTextToText.from_pretrained( + model_id, + device_map="auto", + dtype="bfloat16", +) +processor = AutoProcessor.from_pretrained(model_id) + +# Load image and create conversation +conversation = [ + { + "role": "user", + "content": [ + {"type": "image", "image": "https://www.ilankelman.org/stopsigns/australia.jpg"}, + {"type": "text", "text": "What is in this image?"}, + ], + }, +] + +# Generate answer +inputs = processor.apply_chat_template( + conversation, + add_generation_prompt=True, + return_tensors="pt", + return_dict=True, + tokenize=True, +).to(model.device) + +outputs = model.generate(**inputs, max_new_tokens=64) +processor.batch_decode(outputs, skip_special_tokens=True)[0] + +``` + +## Lfm2VlImageProcessorFast + +[[autodoc]] Lfm2VlImageProcessorFast + +## Lfm2VlProcessor + +[[autodoc]] Lfm2VlProcessor + +## Lfm2VlConfig + +[[autodoc]] Lfm2VlConfig + +## Lfm2VlModel + +[[autodoc]] Lfm2VlModel + - forward + +## Lfm2VlForConditionalGeneration + +[[autodoc]] Lfm2VlForConditionalGeneration + - forward diff --git a/docs/source/en/model_doc/lightglue.md index 13ac58a1b842..2a173a8e1422 100644 --- a/docs/source/en/model_doc/lightglue.md +++ b/docs/source/en/model_doc/lightglue.md @@ -143,10 +143,9 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size ## LightGlueImageProcessor [[autodoc]] LightGlueImageProcessor - -- preprocess -- post_process_keypoint_matching -- visualize_keypoint_matching + - preprocess + - post_process_keypoint_matching + - visualize_keypoint_matching diff --git a/docs/source/en/model_doc/lilt.md index 54475e7cb3b5..407e4aad3c40 100644 --- a/docs/source/en/model_doc/lilt.md +++ b/docs/source/en/model_doc/lilt.md @@ -62,6 +62,7 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h - Demo notebooks for LiLT can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/LiLT). **Documentation resources** + - [Text classification task guide](../tasks/sequence_classification) - [Token classification task guide](../tasks/token_classification) - [Question answering task guide](../tasks/question_answering) diff --git a/docs/source/en/model_doc/llama2.md index 96c733d88fa4..c66667f235f6 100644 --- a/docs/source/en/model_doc/llama2.md +++ b/docs/source/en/model_doc/llama2.md @@ -130,11 +130,13 @@ visualizer("Plants create energy through a process known as") # update model config with padding token model.config.pad_token_id ``` + - It is recommended to initialize the `embed_tokens` layer with the following code to ensure encoding the padding token outputs zeros. ```py self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.config.padding_idx) ``` + - The tokenizer is a byte-pair encoding model based on [SentencePiece](https://github.com/google/sentencepiece). During decoding, if the first token is the start of the word (for example, "Banana"), the tokenizer doesn't prepend the prefix space to the string. - Don't use the `dtype` parameter in [`~AutoModel.from_pretrained`] if you're using FlashAttention-2 because it only supports fp16 or bf16.
You should use [Automatic Mixed Precision](https://pytorch.org/tutorials/recipes/recipes/amp_recipe.html), set fp16 or bf16 to `True` if using [`Trainer`], or use [torch.autocast](https://pytorch.org/docs/stable/amp.html#torch.autocast). @@ -142,7 +144,6 @@ visualizer("Plants create energy through a process known as") [[autodoc]] LlamaConfig - ## LlamaTokenizer [[autodoc]] LlamaTokenizer @@ -165,7 +166,6 @@ visualizer("Plants create energy through a process known as") [[autodoc]] LlamaModel - forward - ## LlamaForCausalLM [[autodoc]] LlamaForCausalLM diff --git a/docs/source/en/model_doc/llama3.md index 1764617a7d4f..4f98d9c778a5 100644 --- a/docs/source/en/model_doc/llama3.md +++ b/docs/source/en/model_doc/llama3.md @@ -60,7 +60,7 @@ Tips: - Weights for the Llama3 models can be obtained by filling out [this form](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) - The architecture is exactly the same as Llama2. -- The tokenizer is a BPE model based on [tiktoken](https://github.com/openai/tiktoken) (vs the one based on sentencepiece implementation for Llama2). The main difference that it ignores BPE merge rules when an input token is part of the vocab. This means that if no merge exist to produce `"hugging"`, instead of having the smallest units, like `["hug","ging"] form 2 tokens, if `"hugging"` is part of the vocab, it will be automatically returned as a token. +- The tokenizer is a BPE model based on [tiktoken](https://github.com/openai/tiktoken) (vs the one based on sentencepiece implementation for Llama2). The main difference is that it ignores BPE merge rules when an input token is part of the vocab. This means that if no merge exists to produce `"hugging"`, instead of having the smallest units, like `["hug","ging"]` form 2 tokens, if `"hugging"` is part of the vocab, it will be automatically returned as a token. - The original model uses `pad_id = -1` which means that there is no padding token. We can't have the same logic, make sure to add a padding token using `tokenizer.add_special_tokens({"pad_token":""})` and resize the token embedding accordingly. You should also set the `model.config.pad_token_id`. The `embed_tokens` layer of the model is initialized with `self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.config.padding_idx)`, which makes sure that encoding the padding token will output zeros, so passing it when initializing is recommended. - The original checkpoint can be converted using the [conversion script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py). The script can be called with the following (example) command: diff --git a/docs/source/en/model_doc/llama4.md index 28e168b90439..ee7f2e2a54f5 100644 --- a/docs/source/en/model_doc/llama4.md +++ b/docs/source/en/model_doc/llama4.md @@ -17,7 +17,6 @@ rendered properly in your Markdown viewer. # Llama4 -
PyTorch @@ -28,9 +27,11 @@ rendered properly in your Markdown viewer. [Llama 4](https://ai.meta.com/blog/llama-4-multimodal-intelligence/), developed by Meta, introduces a new auto-regressive Mixture-of-Experts (MoE) architecture. This generation includes two models: + - The highly capable Llama 4 Maverick with 17B active parameters out of ~400B total, with 128 experts. - The efficient Llama 4 Scout also has 17B active parameters out of ~109B total, using just 16 experts. - + Both models leverage early fusion for native multimodality, enabling them to process text and image inputs. Maverick and Scout are both trained on up to 40 trillion tokens on data encompassing 200 languages (with specific fine-tuning support for 12 languages including Arabic, Spanish, German, and Hindi). @@ -53,7 +54,6 @@ The examples below demonstrates how to generate with [`Pipeline`] or the [`AutoM showcasing how to toggle the right attributes to enable very long-context generations, as some flavors of Llama 4 have context lengths going up to 10 million tokens. - @@ -255,7 +255,6 @@ Updating the default attention function can significantly improve compute perfor As of release, the Llama 4 model supports the following attention methods: `eager`, `flex_attention`, `sdpa`. We recommend using `flex_attention` for best results. Switching attention mechanism is done at the model initialization step: - @@ -278,6 +277,7 @@ model = Llama4ForConditionalGeneration.from_pretrained( dtype=torch.bfloat16, ) ``` + The `sdpa` attention method is generally more compute-efficient than the `eager` method. @@ -293,6 +293,7 @@ model = Llama4ForConditionalGeneration.from_pretrained( dtype=torch.bfloat16, ) ``` + The `eager` attention method is set by default, so no need for anything different when loading the model: @@ -307,10 +308,10 @@ model = Llama4ForConditionalGeneration.from_pretrained( dtype=torch.bfloat16, ) ``` + - ### Quantization Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for available quantization backends. @@ -318,8 +319,6 @@ At time of release, both FBGEMM and LLM-Compressor are supported; more quantizat See below for examples using both: - - Here is an example loading an BF16 model in FP8 using the FBGEMM approach: @@ -378,6 +377,7 @@ outputs = model.generate(**inputs.to(model.device), max_new_tokens=100) outputs = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:]) print(outputs[0]) ``` + @@ -423,24 +423,24 @@ model = Llama4ForConditionalGeneration.from_pretrained( ## Llama4ForConditionalGeneration [[autodoc]] Llama4ForConditionalGeneration -- forward + - forward ## Llama4ForCausalLM [[autodoc]] Llama4ForCausalLM -- forward + - forward ## Llama4TextModel [[autodoc]] Llama4TextModel -- forward + - forward ## Llama4ForCausalLM [[autodoc]] Llama4ForCausalLM -- forward + - forward ## Llama4VisionModel [[autodoc]] Llama4VisionModel -- forward + - forward diff --git a/docs/source/en/model_doc/llava.md b/docs/source/en/model_doc/llava.md index 1d7427b9015e..e387fb4b54c7 100644 --- a/docs/source/en/model_doc/llava.md +++ b/docs/source/en/model_doc/llava.md @@ -47,27 +47,24 @@ The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/ - Note the model has not been explicitly trained to process multiple images in the same prompt, although this is technically possible, you may experience inaccurate results. 
- > [!NOTE] -> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. +> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be failure when merging the embeddings. The attributes can be obtained from model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token or `0` if nothing extra is added to the vision patches. - ### Formatting Prompts with Chat Templates -Each **checkpoint** is trained with a specific prompt format, depending on the underlying large language model backbone. To ensure correct formatting, use the processor’s `apply_chat_template` method. +Each **checkpoint** is trained with a specific prompt format, depending on the underlying large language model backbone. To ensure correct formatting, use the processor's `apply_chat_template` method. **Important:** + - You must construct a conversation history — passing a plain string won't work. - Each message should be a dictionary with `"role"` and `"content"` keys. - The `"content"` should be a list of dictionaries for different modalities like `"text"` and `"image"`. - -Here’s an example of how to structure your input. +Here's an example of how to structure your input. We will use [llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf) and a conversation history of text and image. Each content field has to be a list of dicts, as follows: - ```python from transformers import AutoProcessor @@ -104,6 +101,7 @@ print(text_prompt) - If you want to construct a chat prompt yourself, below is a list of prompt formats accepted by each llava checkpoint: [llava-interleave models](https://huggingface.co/collections/llava-hf/llava-interleave-668e19a97da0036aad4a2f19) requires the following format: + ```bash "<|im_start|>user \nWhat is shown in this image?<|im_end|><|im_start|>assistant" ``` @@ -115,6 +113,7 @@ For multiple turns conversation: ``` [llava-1.5 models](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0) requires the following format: + ```bash "USER: \n ASSISTANT:" ``` @@ -127,12 +126,10 @@ For multiple turns conversation: 🚀 **Bonus:** If you're using `transformers>=4.49.0`, you can also get a vectorized output from `apply_chat_template`. See the **Usage Examples** below for more details on how to use it. 
- ## Usage examples ### Single input inference - ```python import torch from transformers import AutoProcessor, LlavaForConditionalGeneration @@ -164,7 +161,6 @@ generate_ids = model.generate(**inputs, max_new_tokens=30) processor.batch_decode(generate_ids, skip_special_tokens=True) ``` - ### Batched inference LLaVa also supports batched inference. Here is how you can do it: @@ -214,7 +210,6 @@ generate_ids = model.generate(**inputs, max_new_tokens=30) processor.batch_decode(generate_ids, skip_special_tokens=True) ``` - ## Note regarding reproducing original implementation In order to match the logits of the [original implementation](https://github.com/haotian-liu/LLaVA/tree/main), one needs to additionally specify `do_pad=True` when instantiating `LlavaImageProcessor`: @@ -238,7 +233,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h - A [Google Colab demo](https://colab.research.google.com/drive/1qsl6cd2c8gGtEW1xV5io7S8NHh-Cp1TV?usp=sharing) on how to run Llava on a free-tier Google colab instance leveraging 4-bit inference. - A [similar notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LLaVa/Inference_with_LLaVa_for_multimodal_generation.ipynb) showcasing batched inference. 🌎 - ## LlavaConfig [[autodoc]] LlavaConfig diff --git a/docs/source/en/model_doc/llava_next.md b/docs/source/en/model_doc/llava_next.md index e7ff4c896e25..3857f154cf4b 100644 --- a/docs/source/en/model_doc/llava_next.md +++ b/docs/source/en/model_doc/llava_next.md @@ -141,7 +141,6 @@ with torch.inference_mode(): print(processor.decode(output[0], skip_special_tokens=True)) ``` - ## Notes * Different checkpoints (Mistral, Vicuna, etc.) require a specific prompt format depending on the underlying LLM. Always use [`~ProcessorMixin.apply_chat_template`] to ensure correct formatting. Refer to the [Templates](../chat_templating) guide for more details. @@ -189,7 +188,6 @@ output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) ``` - ## LlavaNextConfig [[autodoc]] LlavaNextConfig diff --git a/docs/source/en/model_doc/llava_next_video.md b/docs/source/en/model_doc/llava_next_video.md index 9379c1cc2ed6..61aa7e1ffc51 100644 --- a/docs/source/en/model_doc/llava_next_video.md +++ b/docs/source/en/model_doc/llava_next_video.md @@ -30,19 +30,17 @@ The LLaVa-NeXT-Video model was proposed in [LLaVA-NeXT: A Strong Zero-shot Video [LLaVA-NeXT](llava_next) surprisingly has strong performance in understanding video content in zero-shot fashion with the AnyRes technique that it uses. The AnyRes technique naturally represents a high-resolution image into multiple images. This technique is naturally generalizable to represent videos because videos can be considered as a set of frames (similar to a set of images in LLaVa-NeXT). The current version of LLaVA-NeXT makes use of AnyRes and trains with supervised fine-tuning (SFT) on top of LLaVA-Next on video data to achieves better video understanding capabilities.The model is a current SOTA among open-source models on [VideoMME bench](https://huggingface.co/papers/2405.21075). - The introduction from the blog is the following: On January 30, 2024, we released LLaVA-NeXT, an open-source Large Multimodal Model (LMM) that has been trained exclusively on text-image data. 
With the proposed AnyRes technique, it boosts capabilities in reasoning, OCR, and world knowledge, demonstrating remarkable performance across a spectrum of image-based multimodal understanding tasks, and even exceeding Gemini-Pro on several image benchmarks, e.g. MMMU and MathVista. -**In today’s exploration, we delve into the performance of LLaVA-NeXT within the realm of video understanding tasks. We reveal that LLaVA-NeXT surprisingly has strong performance in understanding video content. The current version of LLaVA-NeXT for videos has several improvements: +**In today's exploration, we delve into the performance of LLaVA-NeXT within the realm of video understanding tasks. We reveal that LLaVA-NeXT surprisingly has strong performance in understanding video content. The current version of LLaVA-NeXT for videos has several improvements: - Zero-shot video representation capabilities with AnyRes: The AnyRes technique naturally represents a high-resolution image into multiple images that a pre-trained VIT is able to digest, and forms them into a concatenated sequence. This technique is naturally generalizable to represent videos (consisting of multiple frames), allowing the image-only-trained LLaVA-Next model to perform surprisingly well on video tasks. Notably, this is the first time that LMMs show strong zero-shot modality transfer ability. - Inference with length generalization improves on longer videos. The linear scaling technique enables length generalization, allowing LLaVA-NeXT to effectively handle long-video beyond the limitation of the "max_token_length" of the LLM. - Strong video understanding ability. (1) LLaVA-Next-Image, which combines the above two techniques, yields superior zero-shot performance than open-source LMMs tuned on videos. (2) LLaVA-Next-Video, further supervised fine-tuning (SFT) LLaVA-Next-Image on video data, achieves better video understanding capabilities compared to LLaVA-Next-Image. (3) LLaVA-Next-Video-DPO, which aligns the model response with AI feedback using direct preference optimization (DPO), showing significant performance boost. - Efficient deployment and inference with SGLang. It allows 5x faster inference on video tasks, allowing more scalable serving such as million-level video re-captioning. See instructions in our repo.** - This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanTurganbay). The original code can be found [here](https://github.com/LLaVA-VL/LLaVA-NeXT/tree/inference). @@ -56,24 +54,22 @@ The original code can be found [here](https://github.com/LLaVA-VL/LLaVA-NeXT/tre - > [!NOTE] -> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. +> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. 
Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be failure when merging the embeddings. The attributes can be obtained from model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token or `0` if nothing extra is added to the vision patches. - ### Formatting Prompts with Chat Templates -Each **checkpoint** is trained with a specific prompt format, depending on the underlying large language model backbone. To ensure correct formatting, use the processor’s `apply_chat_template` method. +Each **checkpoint** is trained with a specific prompt format, depending on the underlying large language model backbone. To ensure correct formatting, use the processor's `apply_chat_template` method. **Important:** + - You must construct a conversation history — passing a plain string won't work. - Each message should be a dictionary with `"role"` and `"content"` keys. - The `"content"` should be a list of dictionaries for different modalities like `"text"` and `"image"`. - -Here’s an example of how to structure your input. We will use [LLaVA-NeXT-Video-7B-hf](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf) and a conversation history of videos and images. +Here's an example of how to structure your input. We will use [LLaVA-NeXT-Video-7B-hf](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf) and a conversation history of videos and images. ```python from transformers import LlavaNextVideoProcessor @@ -116,8 +112,6 @@ print(text_prompt) 🚀 **Bonus:** If you're using `transformers>=4.49.0`, you can also get a vectorized output from `apply_chat_template`. See the **Usage Examples** below for more details on how to use it. - - ## Usage example ### Single Media Mode @@ -153,10 +147,9 @@ out = model.generate(**inputs, max_new_tokens=60) processor.batch_decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True) ``` - ### Mixed Media Mode -The model can also generate from an interleaved image-video inputs. However note, that it was not trained in interleaved image-video setting which might affect the performance. Below is an example usage for mixed media input, add the following lines to the above code snippet: +The model can also generate from an interleaved image-video inputs. However note, that it was not trained in interleaved image-video setting which might affect the performance. Below is an example usage for mixed media input, add the following lines to the above code snippet: ```python @@ -196,7 +189,7 @@ processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokeniza ### Quantization using Bitsandbytes for memory efficiency -The model can be loaded in lower bits, significantly reducing memory burden while maintaining the performance of the original model. This allows for efficient deployment on resource-constrained cases. +The model can be loaded in lower bits, significantly reducing memory burden while maintaining the performance of the original model. This allows for efficient deployment on resource-constrained cases. First, make sure to install bitsandbytes by running `pip install bitsandbytes` and to have access to a GPU/accelerator that is supported by the library. 
@@ -210,7 +203,6 @@ We value your feedback to help identify bugs before the full release! Check out Then simply load the quantized model by adding [`BitsAndBytesConfig`](../main_classes/quantization#transformers.BitsAndBytesConfig) as shown below: - ```python from transformers import LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor @@ -224,7 +216,6 @@ quantization_config = BitsAndBytesConfig( model = LlavaNextVideoForConditionalGeneration.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf", quantization_config=quantization_config, device_map="auto") ``` - ### Flash-Attention 2 to speed-up generation Additionally, we can greatly speed-up model inference by using [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. @@ -249,8 +240,6 @@ model = LlavaNextVideoForConditionalGeneration.from_pretrained( ).to(0) ``` - - ## LlavaNextVideoConfig [[autodoc]] LlavaNextVideoConfig diff --git a/docs/source/en/model_doc/llava_onevision.md b/docs/source/en/model_doc/llava_onevision.md index e546530922ad..08bc075495b0 100644 --- a/docs/source/en/model_doc/llava_onevision.md +++ b/docs/source/en/model_doc/llava_onevision.md @@ -54,18 +54,17 @@ Tips: - ### Formatting Prompts with Chat Templates Each **checkpoint** is trained with a specific prompt format, depending on the underlying large language model backbone. To ensure correct formatting, use the processor’s `apply_chat_template` method. **Important:** + - You must construct a conversation history — passing a plain string won't work. - Each message should be a dictionary with `"role"` and `"content"` keys. - The `"content"` should be a list of dictionaries for different modalities like `"text"` and `"image"`. - -Here’s an example of how to structure your input. +Here’s an example of how to structure your input. We will use [llava-onevision-qwen2-7b-si-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-7b-si-hf) and a conversation history of text and image. Each content field has to be a list of dicts, as follows: ```python @@ -103,11 +102,9 @@ print(text_prompt) 🚀 **Bonus:** If you're using `transformers>=4.49.0`, you can also get a vectorized output from `apply_chat_template`. See the **Usage Examples** below for more details on how to use it. - This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanTurganbay). The original code can be found [here](https://github.com/LLaVA-VL/LLaVA-NeXT/tree/main). - ## Usage example ### Single image inference @@ -293,7 +290,6 @@ model = LlavaOnevisionForConditionalGeneration.from_pretrained( ).to(0) ``` - ## LlavaOnevisionConfig [[autodoc]] LlavaOnevisionConfig diff --git a/docs/source/en/model_doc/longcat_flash.md b/docs/source/en/model_doc/longcat_flash.md index b2c2d7a00646..651f3386f161 100644 --- a/docs/source/en/model_doc/longcat_flash.md +++ b/docs/source/en/model_doc/longcat_flash.md @@ -12,12 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. - ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be rendered properly in your Markdown viewer. 
--> -*This model was released on 2025-09-01 and added to Hugging Face Transformers on 2025-09-15.* - +*This model was released on 2025-09-01 and added to Hugging Face Transformers on 2025-09-17.* # LongCatFlash @@ -44,6 +42,7 @@ The original code can be found [here](https://huggingface.co/meituan-longcat/Lon ## Usage examples The model is large: you will need 2x8 H100 to run inference. + ```python # launch_longcat.py from transformers import LongcatFlashForCausalLM, AutoTokenizer @@ -70,13 +69,14 @@ outputs = model.generate(inputs, max_new_tokens=30) print(tokenizer.batch_decode(outputs)) ``` -To run with TP, you will need torchrun: +To run with TP, you will need torchrun: ```bash torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 | 1 --rdzv-id --rdzv-backend c10d --rdzv-endpoint $NODE_ID:$NODE_PORT --log-dir ./logs_longcat launch_longcat.py ``` And you'll get a nice generation: + ```json [Round 0] USER:Hello! What is the capital of France? What can you tell me about it? ASSISTANT:Hello! 😊 The capital of France is Paris, one of the most famous and beloved cities in the world. Here’s a quick overview of what makes Paris special: 1. Iconic Landmarks diff --git a/docs/source/en/model_doc/longformer.md b/docs/source/en/model_doc/longformer.md index c80294ab7a04..b8375998a06b 100644 --- a/docs/source/en/model_doc/longformer.md +++ b/docs/source/en/model_doc/longformer.md @@ -85,7 +85,6 @@ echo -e "San Francisco 49ers cornerback Shawntae Spencer will miss the rest of t - ## Notes - Longformer is based on [RoBERTa](https://huggingface.co/docs/transformers/en/model_doc/roberta) and doesn't have `token_type_ids`. You don't need to indicate which token belongs to which segment. You only need to separate the segments with the separation token `` or `tokenizer.sep_token`. diff --git a/docs/source/en/model_doc/longt5.md b/docs/source/en/model_doc/longt5.md index bd22d757a74f..a197de15a576 100644 --- a/docs/source/en/model_doc/longt5.md +++ b/docs/source/en/model_doc/longt5.md @@ -29,7 +29,6 @@ encoder-decoder transformer pre-trained in a text-to-text denoising generative s T5 model, and it enables using one of the two different efficient attention mechanisms - (1) Local attention, or (2) Transient-Global attention. - The abstract from the paper is the following: *Recent work has shown that either (1) increasing the input length or (2) increasing model size can improve the @@ -95,7 +94,6 @@ The complexity of this mechanism is `O(l(r + l/k))`. >>> rouge.compute(predictions=result["predicted_abstract"], references=result["abstract"]) ``` - ## Resources - [Translation task guide](../tasks/translation) diff --git a/docs/source/en/model_doc/m2m_100.md b/docs/source/en/model_doc/m2m_100.md index 29d43af97a2f..f9ac7e5ebe92 100644 --- a/docs/source/en/model_doc/m2m_100.md +++ b/docs/source/en/model_doc/m2m_100.md @@ -44,7 +44,6 @@ open-source our scripts so that others may reproduce the data, evaluation, and f This model was contributed by [valhalla](https://huggingface.co/valhalla). - ## Usage tips and examples M2M100 is a multilingual encoder-decoder (seq-to-seq) model primarily intended for translation tasks. As the model is @@ -76,9 +75,9 @@ loss = model(**model_inputs).loss # forward pass **Generation** -M2M100 uses the `eos_token_id` as the `decoder_start_token_id` for generation with the target language id -being forced as the first generated token. To force the target language id as the first generated token, pass the -*forced_bos_token_id* parameter to the *generate* method. 
The following example shows how to translate between +M2M100 uses the `eos_token_id` as the `decoder_start_token_id` for generation with the target language id +being forced as the first generated token. To force the target language id as the first generated token, pass the +*forced_bos_token_id* parameter to the *generate* method. The following example shows how to translate between Hindi to French and Chinese to English using the *facebook/m2m100_418M* checkpoint. ```python @@ -136,7 +135,7 @@ Hindi to French and Chinese to English using the *facebook/m2m100_418M* checkpoi Flash Attention 2 is a faster, optimized version of the attention scores computation which relies on `cuda` kernels. -### Installation +### Installation First, check whether your hardware is compatible with Flash Attention 2. The latest list of compatible hardware can be found in the [official documentation](https://github.com/Dao-AILab/flash-attention#installation-and-features). diff --git a/docs/source/en/model_doc/mamba.md b/docs/source/en/model_doc/mamba.md index d243bcf7e40d..031e353c93da 100644 --- a/docs/source/en/model_doc/mamba.md +++ b/docs/source/en/model_doc/mamba.md @@ -27,7 +27,6 @@ rendered properly in your Markdown viewer. You can find all the original Mamba checkpoints under the [State Space Models](https://huggingface.co/state-spaces) organization. - > [!TIP] > This model was contributed by [Molbap](https://huggingface.co/Molbap) and [AntonV](https://huggingface.co/AntonV). > Click on the Mamba models in the right sidebar for more examples of how to apply Mamba to different language tasks. @@ -93,6 +92,7 @@ input_ids = tokenizer("Plants create energy through a process known as", return_ output = model.generate(**input_ids) print(tokenizer.decode(output[0], skip_special_tokens=True)) ``` + ## Notes - The current implementation uses the original CUDA kernels. The FlashAttention equivalent implementation is hosted in the [mamba-ssm](https://github.com/state-spaces/mamba) and [causal_conv1d](https://github.com/Dao-AILab/causal-conv1d) repositories. Make sure to install them if your hardware supports it! diff --git a/docs/source/en/model_doc/mamba2.md b/docs/source/en/model_doc/mamba2.md index f8532f3cfbe6..f1750ef2e2f5 100644 --- a/docs/source/en/model_doc/mamba2.md +++ b/docs/source/en/model_doc/mamba2.md @@ -1,4 +1,4 @@ - +*This model was released on {release_date} and added to Hugging Face Transformers on 2025-09-11.*
@@ -29,7 +30,6 @@ rendered properly in your Markdown viewer. This architecture turns out to coincide with Qwen2, with the main difference being the presence of biases in attention projections in Ministral. - You can find the Ministral checkpoints under the [Mistral AI](https://huggingface.co/mistralai) organization. ## Usage @@ -83,4 +83,4 @@ The example below demonstrates how to use Ministral for text generation: ## MinistralForQuestionAnswering [[autodoc]] MinistralForQuestionAnswering -- forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/mistral.md b/docs/source/en/model_doc/mistral.md index 3714f45e55a0..4c598fc79a71 100644 --- a/docs/source/en/model_doc/mistral.md +++ b/docs/source/en/model_doc/mistral.md @@ -86,7 +86,6 @@ echo -e "My favorite condiment is" | transformers chat mistralai/Mistral-7B-v0.3 - Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends. The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to 4-bits. @@ -164,4 +163,4 @@ Use the [AttentionMaskVisualizer](https://github.com/huggingface/transformers/bl ## MistralForQuestionAnswering [[autodoc]] MistralForQuestionAnswering -- forward + - forward diff --git a/docs/source/en/model_doc/mistral3.md b/docs/source/en/model_doc/mistral3.md index 54af880ed467..4ac264ac9854 100644 --- a/docs/source/en/model_doc/mistral3.md +++ b/docs/source/en/model_doc/mistral3.md @@ -27,7 +27,6 @@ rendered properly in your Markdown viewer. You can find the original Mistral 3 checkpoints under the [Mistral AI](https://huggingface.co/mistralai/models?search=mistral-small-3) organization. - > [!TIP] > This model was contributed by [cyrilvallez](https://huggingface.co/cyrilvallez) and [yonigozlan](https://huggingface.co/yonigozlan). > Click on the Mistral3 models in the right sidebar for more examples of how to apply Mistral3 to different tasks. @@ -62,6 +61,7 @@ outputs = pipeline(text=messages, max_new_tokens=50, return_full_text=False) outputs[0]["generated_text"] 'The image depicts a vibrant and lush garden scene featuring a variety of wildflowers and plants. The central focus is on a large, pinkish-purple flower, likely a Greater Celandine (Chelidonium majus), with a' ``` + @@ -100,13 +100,15 @@ decoded_output = processor.decode(generate_ids[0, inputs["input_ids"].shape[1] : decoded_output 'The image depicts a vibrant and lush garden scene featuring a variety of wildflowers and plants. The central focus is on a large, pinkish-purple flower, likely a Greater Celandine (Chelidonium majus), with a' ``` + -## Notes +## Notes + +- Mistral 3 supports text-only generation. -- Mistral 3 supports text-only generation. -```py +```py import torch from transformers import AutoProcessor, AutoModelForImageTextToText, infer_device @@ -136,13 +138,16 @@ print(decoded_output) 5. Je me casse, à plus! ``` + /\_/\ ( o.o ) > ^ < + ```" ```` -- Mistral 3 accepts batched image and text inputs. +- Mistral 3 accepts batched image and text inputs. + ```py import torch from transformers import AutoProcessor, AutoModelForImageTextToText, infer_device @@ -184,7 +189,7 @@ messages = [ , "Describe this imageThe image depicts a vibrant street scene in what appears to be a Chinatown district. The focal point is a traditional Chinese"] ``` -- Mistral 3 also supported batched image and text inputs with a different number of images for each text. 
The example below quantizes the model with bitsandbytes. +- Mistral 3 also supported batched image and text inputs with a different number of images for each text. The example below quantizes the model with bitsandbytes. ```py import torch diff --git a/docs/source/en/model_doc/mixtral.md b/docs/source/en/model_doc/mixtral.md index ff501cd1a84d..1e9574145aa1 100644 --- a/docs/source/en/model_doc/mixtral.md +++ b/docs/source/en/model_doc/mixtral.md @@ -39,9 +39,10 @@ Mixtral-8x7B is the second large language model (LLM) released by [mistral.ai](h Mixtral-8x7B is a decoder-only Transformer with the following architectural choices: - Mixtral is a Mixture of Experts (MoE) model with 8 experts per MLP, with a total of 45 billion parameters. To learn more about mixture-of-experts, refer to the [blog post](https://huggingface.co/blog/moe). -- Despite the model having 45 billion parameters, the compute required for a single forward pass is the same as that of a 14 billion parameter model. This is because even though each of the experts have to be loaded in RAM (70B like ram requirement) each token from the hidden states are dispatched twice (top 2 routing) and thus the compute (the operation required at each forward computation) is just 2 X sequence_length. +- Despite the model having 45 billion parameters, the compute required for a single forward pass is the same as that of a 14 billion parameter model. This is because even though each of the experts have to be loaded in RAM (70B like ram requirement) each token from the hidden states are dispatched twice (top 2 routing) and thus the compute (the operation required at each forward computation) is just 2 X sequence_length. The following implementation details are shared with Mistral AI's first model [Mistral-7B](mistral): + - Sliding Window Attention - Trained with 8k context length and fixed cache size, with a theoretical attention span of 128K tokens - GQA (Grouped Query Attention) - allowing faster inference and lower cache size. - Byte-fallback BPE tokenizer - ensures that characters are never mapped to out of vocabulary tokens. @@ -55,6 +56,7 @@ For more details refer to the [release blog post](https://mistral.ai/news/mixtra ## Usage tips The Mistral team has released 2 checkpoints: + - a base model, [Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1), which has been pre-trained to predict the next token on internet-scale data. - an instruction tuned model, [Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1), which is the base model optimized for chat purposes using supervised fine-tuning (SFT) and direct preference optimization (DPO). @@ -138,8 +140,8 @@ Below is a expected speedup diagram that compares pure inference time between th ### Sliding window Attention -The current implementation supports the sliding window attention mechanism and memory efficient cache management. -To enable sliding window attention, just make sure to have a `flash-attn` version that is compatible with sliding window attention (`>=2.3.0`). +The current implementation supports the sliding window attention mechanism and memory efficient cache management. +To enable sliding window attention, just make sure to have a `flash-attn` version that is compatible with sliding window attention (`>=2.3.0`). 
The Flash Attention-2 model uses also a more memory efficient cache slicing mechanism - as recommended per the official implementation of Mistral model that use rolling cache mechanism we keep the cache size fixed (`self.config.sliding_window`), support batched generation only for `padding_side="left"` and use the absolute position of the current token to compute the positional embedding. diff --git a/docs/source/en/model_doc/mlcd.md b/docs/source/en/model_doc/mlcd.md index 1ce785ee76bb..7ff2fb434da0 100644 --- a/docs/source/en/model_doc/mlcd.md +++ b/docs/source/en/model_doc/mlcd.md @@ -32,9 +32,9 @@ Tips: - We adopted the official [LLaVA-NeXT](https://github.com/LLaVA-VL/LLaVA-NeXT) and the official training dataset [LLaVA-NeXT-Data](https://huggingface.co/datasets/lmms-lab/LLaVA-NeXT-Data) for evaluating the foundational visual models. -- The language model is [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct). +- The language model is [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct). -Result: +Result: | Vision Tower | RoPE2D | ChartQA | DocVQA | InfoVQA | OCRBench | MMMU | | :-------------------------------------------------------------------------------------------- | :----: | :-------- | :-------- | :-------- | :--------- | :-------- | @@ -45,7 +45,6 @@ Result: | **[MLCD (ViT-bigG-14-336px)](https://huggingface.co/DeepGlint-AI/mlcd-vit-bigG-patch14-336)** | √ | 71.07 | 79.63 | 44.38 | 572.00 | 46.78 | | **[MLCD (ViT-bigG-14-448px)](https://huggingface.co/DeepGlint-AI/mlcd-vit-bigG-patch14-448)** | √ | **73.80** | **83.34** | **46.59** | **582.00** | 46.00 | - ## Usage ```python diff --git a/docs/source/en/model_doc/mllama.md b/docs/source/en/model_doc/mllama.md index 1ea7f172bb3a..a0fc5db41cfe 100644 --- a/docs/source/en/model_doc/mllama.md +++ b/docs/source/en/model_doc/mllama.md @@ -35,15 +35,12 @@ The [Llama 3.2-Vision](https://ai.meta.com/blog/llama-3-2-connect-2024-vision-ed - The text passed to the processor should have the `"<|image|>"` tokens where the images should be inserted. - The processor has its own `apply_chat_template` method to convert chat messages to text that can then be passed as text to the processor. If you're using `transformers>=4.49.0`, you can also get a vectorized output from `apply_chat_template`. See the **Usage Examples** below for more details on how to use it. - - Mllama has an extra token used as a placeholder for image positions in the text. It means that input ids and an input embedding layer will have an extra token. But since the weights for input and output embeddings are not tied, the `lm_head` layer has one less token and will fail if you want to calculate loss on image tokens or apply some logit processors. In case you are training, make sure to mask out special `"<|image|>"` tokens in the `labels` as the model should not be trained on predicting them. Otherwise if you see CUDA-side index errors when generating, use the below code to expand the `lm_head` by one more token. 
- ```python old_embeddings = model.get_output_embeddings() @@ -52,12 +49,13 @@ resized_embeddings = model._get_resized_lm_head(old_embeddings, new_num_tokens=n resized_embeddings.requires_grad_(old_embeddings.weight.requires_grad) model.set_output_embeddings(resized_embeddings) ``` - + ## Usage Example #### Instruct model + ```python import torch from transformers import MllamaForConditionalGeneration, AutoProcessor @@ -83,6 +81,7 @@ print(processor.decode(output[0])) ``` #### Base model + ```python import requests import torch @@ -102,7 +101,6 @@ output = model.generate(**inputs, do_sample=False, max_new_tokens=25) print(processor.decode(output[0], skip_special_tokens=True)) ``` - ## MllamaConfig [[autodoc]] MllamaConfig @@ -111,7 +109,6 @@ print(processor.decode(output[0], skip_special_tokens=True)) [[autodoc]] MllamaProcessor - ## MllamaImageProcessor [[autodoc]] MllamaImageProcessor diff --git a/docs/source/en/model_doc/mm-grounding-dino.md b/docs/source/en/model_doc/mm-grounding-dino.md index e411ef5defb6..0d628c3b31de 100644 --- a/docs/source/en/model_doc/mm-grounding-dino.md +++ b/docs/source/en/model_doc/mm-grounding-dino.md @@ -100,7 +100,6 @@ for box, score, labels in zip(result["boxes"], result["scores"], result["labels" | [mm_grounding_dino_tiny_o365v1_goldg_v3det](https://huggingface.co/openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_v3det) | O365,GoldG,V3Det | 33.0 | 36.0 | 45.9 | 40.5(+11.7) | 21.5 | 25.5 | 40.2 | 30.6(+10.5) | | [mm_grounding_dino_tiny_o365v1_goldg_grit_v3det](https://huggingface.co/openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_grit_v3det) | O365,GoldG,GRIT,V3Det | 34.2 | 37.4 | 46.2 | 41.4(+12.6) | 23.6 | 27.6 | 40.5 | 31.9(+11.8) | - - This implementation also supports inference for [LLMDet](https://github.com/iSEE-Laboratory/LLMDet). Here's a table of LLMDet models and their performance on LVIS (results from [official repo](https://github.com/iSEE-Laboratory/LLMDet)): | Model | Pre-Train Data | MiniVal APr | MiniVal APc | MiniVal APf | MiniVal AP | Val1.0 APr | Val1.0 APc | Val1.0 APf | Val1.0 AP | @@ -109,7 +108,6 @@ for box, score, labels in zip(result["boxes"], result["scores"], result["labels" | [llmdet_base](https://huggingface.co/iSEE-Laboratory/llmdet_base) | (O365,GoldG,V3Det) + GroundingCap-1M | 48.3 | 40.8 | 43.1 | 54.3 | 38.5 | 28.2 | 34.3 | 47.8 | | [llmdet_large](https://huggingface.co/iSEE-Laboratory/llmdet_large) | (O365V2,OpenImageV6,GoldG) + GroundingCap-1M | 51.1 | 45.1 | 46.1 | 56.6 | 42.0 | 31.6 | 38.8 | 50.2 | - ## MMGroundingDinoConfig [[autodoc]] MMGroundingDinoConfig diff --git a/docs/source/en/model_doc/mms.md b/docs/source/en/model_doc/mms.md index 3ac351d0ddcb..171beaf440d1 100644 --- a/docs/source/en/model_doc/mms.md +++ b/docs/source/en/model_doc/mms.md @@ -376,6 +376,7 @@ detected_lang = model.config.id2label[lang_id] ``` To see all the supported languages of a checkpoint, you can print out the language ids as follows: + ```py processor.id2label.values() ``` diff --git a/docs/source/en/model_doc/mobilebert.md b/docs/source/en/model_doc/mobilebert.md index 4e3cc2e5d647..08486ace56eb 100644 --- a/docs/source/en/model_doc/mobilebert.md +++ b/docs/source/en/model_doc/mobilebert.md @@ -15,7 +15,6 @@ rendered properly in your Markdown viewer. --> *This model was released on 2020-04-06 and added to Hugging Face Transformers on 2020-11-16.* -
PyTorch @@ -47,6 +46,7 @@ pipeline = pipeline( ) pipeline("The capital of France is [MASK].") ``` + @@ -85,7 +85,6 @@ echo -e "The capital of France is [MASK]." | transformers run --task fill-mask - - ## Notes - Inputs should be padded on the right because BERT uses absolute position embeddings. diff --git a/docs/source/en/model_doc/mobilenet_v1.md b/docs/source/en/model_doc/mobilenet_v1.md index c77bef730423..eea159bdd738 100644 --- a/docs/source/en/model_doc/mobilenet_v1.md +++ b/docs/source/en/model_doc/mobilenet_v1.md @@ -32,7 +32,6 @@ You can all the original MobileNet checkpoints under the [Google](https://huggin The example below demonstrates how to classify an image with [`Pipeline`] or the [`AutoModel`] class. - @@ -84,23 +83,24 @@ print(f"The predicted class label is: {predicted_class_label}") - ## Notes -- Checkpoint names follow the pattern `mobilenet_v1_{depth_multiplier}_{resolution}`, like `mobilenet_v1_1.0_224`. `1.0` is the depth multiplier and `224` is the image resolution. -- While trained on images of a specific sizes, the model architecture works with images of different sizes (minimum 32x32). The [`MobileNetV1ImageProcessor`] handles the necessary preprocessing. -- MobileNet is pretrained on [ImageNet-1k](https://huggingface.co/datasets/imagenet-1k), a dataset with 1000 classes. However, the model actually predicts 1001 classes. The additional class is an extra "background" class (index 0). -- The original TensorFlow checkpoints determines the padding amount at inference because it depends on the input image size. To use the native PyTorch padding behavior, set `tf_padding=False` in [`MobileNetV1Config`]. +- Checkpoint names follow the pattern `mobilenet_v1_{depth_multiplier}_{resolution}`, like `mobilenet_v1_1.0_224`. `1.0` is the depth multiplier and `224` is the image resolution. +- While trained on images of a specific sizes, the model architecture works with images of different sizes (minimum 32x32). The [`MobileNetV1ImageProcessor`] handles the necessary preprocessing. +- MobileNet is pretrained on [ImageNet-1k](https://huggingface.co/datasets/imagenet-1k), a dataset with 1000 classes. However, the model actually predicts 1001 classes. The additional class is an extra "background" class (index 0). +- The original TensorFlow checkpoints determines the padding amount at inference because it depends on the input image size. To use the native PyTorch padding behavior, set `tf_padding=False` in [`MobileNetV1Config`]. + ```python from transformers import MobileNetV1Config config = MobileNetV1Config.from_pretrained("google/mobilenet_v1_1.0_224", tf_padding=True) ``` -- The Transformers implementation does not support the following features. - - Uses global average pooling instead of the optional 7x7 average pooling with stride 2. For larger inputs, this gives a pooled output that is larger than a 1x1 pixel. - - Does not support other `output_stride` values (fixed at 32). For smaller `output_strides`, the original implementation uses dilated convolution to prevent spatial resolution from being reduced further. (which would require dilated convolutions). - - `output_hidden_states=True` returns *all* intermediate hidden states. It is not possible to extract the output from specific layers for other downstream purposes. - - Does not include the quantized models from the original checkpoints because they include "FakeQuantization" operations to unquantize the weights. + +- The Transformers implementation does not support the following features. 
+ - Uses global average pooling instead of the optional 7x7 average pooling with stride 2. For larger inputs, this gives a pooled output that is larger than a 1x1 pixel. + - Does not support other `output_stride` values (fixed at 32). For smaller `output_strides`, the original implementation uses dilated convolution to prevent spatial resolution from being reduced further. (which would require dilated convolutions). + - `output_hidden_states=True` returns *all* intermediate hidden states. It is not possible to extract the output from specific layers for other downstream purposes. + - Does not include the quantized models from the original checkpoints because they include "FakeQuantization" operations to unquantize the weights. ## MobileNetV1Config diff --git a/docs/source/en/model_doc/mobilenet_v2.md b/docs/source/en/model_doc/mobilenet_v2.md index 3e1379e3f079..bf94454e438d 100644 --- a/docs/source/en/model_doc/mobilenet_v2.md +++ b/docs/source/en/model_doc/mobilenet_v2.md @@ -30,10 +30,8 @@ You can all the original MobileNet checkpoints under the [Google](https://huggin > [!TIP] > Click on the MobileNet V2 models in the right sidebar for more examples of how to apply MobileNet to different vision tasks. - The examples below demonstrate how to classify an image with [`Pipeline`] or the [`AutoModel`] class. - @@ -82,24 +80,25 @@ print(f"The predicted class label is: {predicted_class_label}") - ## Notes -- Classification checkpoint names follow the pattern `mobilenet_v2_{depth_multiplier}_{resolution}`, like `mobilenet_v2_1.4_224`. `1.4` is the depth multiplier and `224` is the image resolution. Segmentation checkpoint names follow the pattern `deeplabv3_mobilenet_v2_{depth_multiplier}_{resolution}`. -- While trained on images of a specific sizes, the model architecture works with images of different sizes (minimum 32x32). The [`MobileNetV2ImageProcessor`] handles the necessary preprocessing. -- MobileNet is pretrained on [ImageNet-1k](https://huggingface.co/datasets/imagenet-1k), a dataset with 1000 classes. However, the model actually predicts 1001 classes. The additional class is an extra "background" class (index 0). -- The segmentation models use a [DeepLabV3+](https://huggingface.co/papers/1802.02611) head which is often pretrained on datasets like [PASCAL VOC](https://huggingface.co/datasets/merve/pascal-voc). -- The original TensorFlow checkpoints determines the padding amount at inference because it depends on the input image size. To use the native PyTorch padding behavior, set `tf_padding=False` in [`MobileNetV2Config`]. +- Classification checkpoint names follow the pattern `mobilenet_v2_{depth_multiplier}_{resolution}`, like `mobilenet_v2_1.4_224`. `1.4` is the depth multiplier and `224` is the image resolution. Segmentation checkpoint names follow the pattern `deeplabv3_mobilenet_v2_{depth_multiplier}_{resolution}`. +- While trained on images of a specific sizes, the model architecture works with images of different sizes (minimum 32x32). The [`MobileNetV2ImageProcessor`] handles the necessary preprocessing. +- MobileNet is pretrained on [ImageNet-1k](https://huggingface.co/datasets/imagenet-1k), a dataset with 1000 classes. However, the model actually predicts 1001 classes. The additional class is an extra "background" class (index 0). +- The segmentation models use a [DeepLabV3+](https://huggingface.co/papers/1802.02611) head which is often pretrained on datasets like [PASCAL VOC](https://huggingface.co/datasets/merve/pascal-voc). 
+- The original TensorFlow checkpoints determines the padding amount at inference because it depends on the input image size. To use the native PyTorch padding behavior, set `tf_padding=False` in [`MobileNetV2Config`]. + ```python from transformers import MobileNetV2Config config = MobileNetV2Config.from_pretrained("google/mobilenet_v2_1.4_224", tf_padding=True) ``` -- The Transformers implementation does not support the following features. - - Uses global average pooling instead of the optional 7x7 average pooling with stride 2. For larger inputs, this gives a pooled output that is larger than a 1x1 pixel. - - `output_hidden_states=True` returns *all* intermediate hidden states. It is not possible to extract the output from specific layers for other downstream purposes. - - Does not include the quantized models from the original checkpoints because they include "FakeQuantization" operations to unquantize the weights. - - For segmentation models, the final convolution layer of the backbone is computed even though the DeepLabV3+ head doesn't use it. + +- The Transformers implementation does not support the following features. + - Uses global average pooling instead of the optional 7x7 average pooling with stride 2. For larger inputs, this gives a pooled output that is larger than a 1x1 pixel. + - `output_hidden_states=True` returns *all* intermediate hidden states. It is not possible to extract the output from specific layers for other downstream purposes. + - Does not include the quantized models from the original checkpoints because they include "FakeQuantization" operations to unquantize the weights. + - For segmentation models, the final convolution layer of the backbone is computed even though the DeepLabV3+ head doesn't use it. ## MobileNetV2Config diff --git a/docs/source/en/model_doc/mobilevit.md b/docs/source/en/model_doc/mobilevit.md index b4a51bd200f2..ca0a35f6ece8 100644 --- a/docs/source/en/model_doc/mobilevit.md +++ b/docs/source/en/model_doc/mobilevit.md @@ -11,11 +11,8 @@ Unless required by applicable law or agreed to in writing, software distributed --> *This model was released on 2021-10-05 and added to Hugging Face Transformers on 2022-06-29.* - - # MobileViT -
PyTorch

@@ -24,21 +21,18 @@ Unless required by applicable law or agreed to in writing, software distributed

[MobileViT](https://huggingface.co/papers/2110.02178) is a lightweight vision transformer for mobile devices that merges the efficiency and inductive biases of CNNs with the global context modeling of transformers. It treats transformers as convolutions, enabling global information processing without the heavy computational cost of standard ViTs.

-
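As a quick illustration of the model in use, here is a minimal sketch of semantic segmentation with a MobileViT + DeepLabV3 checkpoint; the `apple/deeplabv3-mobilevit-small` checkpoint id and the input image URL are assumptions chosen for the example.

```python
import torch
import requests
from PIL import Image
from transformers import AutoImageProcessor, MobileViTForSemanticSegmentation

# assumed checkpoint: a MobileViT backbone with a DeepLabV3 segmentation head
processor = AutoImageProcessor.from_pretrained("apple/deeplabv3-mobilevit-small")
model = MobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-small")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # example image, assumed
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # (batch_size, num_labels, height, width) at reduced resolution

# per-pixel class predictions at the model's output resolution
segmentation = logits.argmax(dim=1)
print(segmentation.shape)
```

Note that the logits typically come out at a lower resolution than the input, so upsample them (for example with `torch.nn.functional.interpolate`) before overlaying them on the original image.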
- You can find all the original MobileViT checkpoints under the [Apple](https://huggingface.co/apple/models?search=mobilevit) organization. - > [!TIP] +> > - This model was contributed by [matthijs](https://huggingface.co/Matthijs). > > Click on the MobileViT models in the right sidebar for more examples of how to apply MobileViT to different vision tasks. - The example below demonstrates how to do [Image Classification] with [`Pipeline`] and the [`AutoModel`] class. @@ -92,7 +86,6 @@ print(f"The predicted class label is:{predicted_class_label}") - ## Notes - Does **not** operate on sequential data, it's purely designed for image tasks. @@ -102,8 +95,6 @@ print(f"The predicted class label is:{predicted_class_label}") - The classification models are pretrained on [ImageNet-1k](https://huggingface.co/datasets/imagenet-1k). - The segmentation models use a [DeepLabV3](https://huggingface.co/papers/1706.05587) head and are pretrained on [PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/). - - ## MobileViTConfig [[autodoc]] MobileViTConfig diff --git a/docs/source/en/model_doc/modernbert-decoder.md b/docs/source/en/model_doc/modernbert-decoder.md index 013b9d24b5f4..1ab96700659b 100644 --- a/docs/source/en/model_doc/modernbert-decoder.md +++ b/docs/source/en/model_doc/modernbert-decoder.md @@ -36,7 +36,7 @@ You can find all the original ModernBERT Decoder checkpoints under the [jhu-clsp > > Click on the ModernBERT Decoder models in the right sidebar for more examples of how to apply ModernBERT Decoder to different text generation tasks. -The example below demonstrates how to use ModernBERT Decoder for text generation with [`Pipeline`], [`AutoModel`] (with and without quantization), and from the command line. +The example below demonstrates how to use ModernBERT Decoder for text generation with [`Pipeline`], [`AutoModel`] (with and without quantization), and from the command line. @@ -119,7 +119,7 @@ print(f"Prediction probabilities: {predictions}") -``` +```py import torch from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig @@ -151,6 +151,7 @@ with torch.no_grad(): generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True) print(f"Generated text: {generated_text}") ``` + @@ -162,7 +163,6 @@ echo "The future of artificial intelligence is" | transformers run --task text-g - ## ModernBertDecoderConfig [[autodoc]] ModernBertDecoderConfig diff --git a/docs/source/en/model_doc/moonshine.md b/docs/source/en/model_doc/moonshine.md index 7abe123b88e2..b85a174a86fb 100644 --- a/docs/source/en/model_doc/moonshine.md +++ b/docs/source/en/model_doc/moonshine.md @@ -83,6 +83,7 @@ predicted_ids = model.generate(**input_features, cache_implementation="static") transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True) transcription[0] ``` + @@ -101,4 +102,3 @@ transcription[0] [[autodoc]] MoonshineForConditionalGeneration - forward - generate - diff --git a/docs/source/en/model_doc/moshi.md b/docs/source/en/model_doc/moshi.md index e17a1b7b8b14..885623b26e52 100644 --- a/docs/source/en/model_doc/moshi.md +++ b/docs/source/en/model_doc/moshi.md @@ -35,9 +35,10 @@ Moshi is a speech-text foundation model that casts spoken dialogue as speech-to- The abstract from the paper is the following: -*We introduce Moshi, a speech-text foundation model and full-duplex spoken dialogue framework. 
Current systems for spoken dialogue rely on pipelines of independent components, namely voice activity detection, speech recognition, textual dialogue and text-to-speech. Such frameworks cannot emulate the experience of real conversations. First, their complexity induces a latency of several seconds between interactions. Second, text being the intermediate modality for dialogue, non-linguistic information that modifies meaning— such as emotion or non-speech sounds— is lost in the interaction. Finally, they rely on a segmentation into speaker turns, which does not take into account overlapping speech, interruptions and interjections. Moshi solves these independent issues altogether by casting spoken dialogue as speech-to-speech generation. Starting from a text language model backbone, Moshi generates speech as tokens from the residual quantizer of a neural audio codec, while modeling separately its own speech and that of the user into parallel streams. This allows for the removal of explicit speaker turns, and the modeling of arbitrary conversational dynamics. We moreover extend the hierarchical semantic-to-acoustic token generation of previous work to first predict time-aligned text tokens as a prefix to audio tokens. Not only this “Inner Monologue” method significantly improves the linguistic quality of generated speech, but we also illustrate how it can provide streaming speech recognition and text-to-speech. Our resulting model is the first real-time full-duplex spoken large language model, with a theoretical latency of 160ms, 200ms in practice, and is available at github.com/kyutai-labs/moshi.* +*We introduce Moshi, a speech-text foundation model and full-duplex spoken dialogue framework. Current systems for spoken dialogue rely on pipelines of independent components, namely voice activity detection, speech recognition, textual dialogue and text-to-speech. Such frameworks cannot emulate the experience of real conversations. First, their complexity induces a latency of several seconds between interactions. Second, text being the intermediate modality for dialogue, non-linguistic information that modifies meaning— such as emotion or non-speech sounds— is lost in the interaction. Finally, they rely on a segmentation into speaker turns, which does not take into account overlapping speech, interruptions and interjections. Moshi solves these independent issues altogether by casting spoken dialogue as speech-to-speech generation. Starting from a text language model backbone, Moshi generates speech as tokens from the residual quantizer of a neural audio codec, while modeling separately its own speech and that of the user into parallel streams. This allows for the removal of explicit speaker turns, and the modeling of arbitrary conversational dynamics. We moreover extend the hierarchical semantic-to-acoustic token generation of previous work to first predict time-aligned text tokens as a prefix to audio tokens. Not only this “Inner Monologue” method significantly improves the linguistic quality of generated speech, but we also illustrate how it can provide streaming speech recognition and text-to-speech. Our resulting model is the first real-time full-duplex spoken large language model, with a theoretical latency of 160ms, 200ms in practice, and is available at github.com/kyutai-labs/moshi.* Moshi deals with 3 streams of information: + 1. The user's audio 2. Moshi's audio 3. Moshi's textual output @@ -49,7 +50,7 @@ Moshi's made of 3 components: **1. 
The main decoder (Helium in the paper)** -It corresponds to [`MoshiForCausalLM`]. It is strictly a classic text LLM, that uses an architecture similar to [` ~GemmaForCausalLM`]. In other words, it takes text tokens, embeds them, pass them through the decoder and a language head, to get text logits. +It corresponds to [`MoshiForCausalLM`]. It is strictly a classic text LLM, that uses an architecture similar to [`~GemmaForCausalLM`]. In other words, it takes text tokens, embeds them, pass them through the decoder and a language head, to get text logits. **2. The depth decoder** @@ -63,15 +64,14 @@ Note that each timestamp - i.e each codebook - gets its own set of Linear Layers It's the audio encoder from Kyutai, that has recently been integrated to transformers, which is used to "tokenize" audio. It has the same use that [`~EncodecModel`] has in [`~MusicgenModel`]. - ## Tips: -The original checkpoints can be converted using the conversion script `src/transformers/models/moshi/convert_moshi_transformers.py` - +The original checkpoints can be converted using the conversion script `src/transformers/models/moshi/convert_moshi_transformers.py` ### How to use the model: This implementation has two main aims: + 1. quickly test model generation by simplifying the original API 2. simplify training. A training guide will come soon, but user contributions are welcomed! @@ -86,6 +86,7 @@ It is designed for intermediate use. We strongly recommend using the original [i Moshi is a streaming auto-regressive model with two streams of audio. To put it differently, one audio stream corresponds to what the model said/will say and the other audio stream corresponds to what the user said/will say. [`MoshiForConditionalGeneration.generate`] thus needs 3 inputs: + 1. `input_ids` - corresponding to the text token history 2. `moshi_input_values` or `moshi_audio_codes`- corresponding to the model audio history 3. `user_input_values` or `user_audio_codes` - corresponding to the user audio history @@ -93,6 +94,7 @@ Moshi is a streaming auto-regressive model with two streams of audio. To put it These three inputs must be synchronized. Meaning that their lengths must correspond to the same number of tokens. You can dynamically use the 3 inputs depending on what you want to test: + 1. Simply check the model response to an user prompt - in that case, `input_ids` can be filled with pad tokens and `user_input_values` can be a zero tensor of the same shape than the user prompt. 2. Test more complex behaviour - in that case, you must be careful about how the input tokens are synchronized with the audios. @@ -108,12 +110,9 @@ To follow the example of the following image, `"Hello, I'm Moshi"` could be tran
- [`MoshiForConditionalGeneration.generate`] then auto-regressively feeds to itself its own audio stream, but since it doesn't have access to the user input stream while using `transformers`, it will thus **assume that the user is producing blank audio**. - - -```python +```python >>> from datasets import load_dataset, Audio >>> import torch, math >>> from transformers import MoshiForConditionalGeneration, AutoFeatureExtractor, AutoTokenizer, infer_device @@ -149,7 +148,7 @@ To follow the example of the following image, `"Hello, I'm Moshi"` could be tran Most of the work has to be done during data creation/pre-processing, because of the need to align/synchronize streams. Once it's done, you can simply forward `text_labels` and `audio_labels` to [`MoshiForConditionalGeneration.forward`], alongside the usual inputs, to get the model loss. - + A training guide will come soon, but user contributions are welcomed! ### How does the model forward the inputs / generate: @@ -162,13 +161,10 @@ A training guide will come soon, but user contributions are welcomed! 3. The depth decoder switches the dimension on which we forward / generate (codebooks instead of time). It uses the token generated from `text logits` and the `temporal context` to auto-regressively generate audio codebooks. - This model was contributed by [Yoach Lacombe (ylacombe)](https://huggingface.co/ylacombe). The original code can be found [here](https://github.com/kyutai-labs/moshi). - - ## MoshiConfig [[autodoc]] MoshiConfig diff --git a/docs/source/en/model_doc/mpt.md b/docs/source/en/model_doc/mpt.md index 9482e6a91958..60d14641177c 100644 --- a/docs/source/en/model_doc/mpt.md +++ b/docs/source/en/model_doc/mpt.md @@ -23,11 +23,11 @@ rendered properly in your Markdown viewer. ## Overview -The MPT model was proposed by the [MosaicML](https://www.mosaicml.com/) team and released with multiple sizes and finetuned variants. The MPT models are a series of open source and commercially usable LLMs pre-trained on 1T tokens. +The MPT model was proposed by the [MosaicML](https://www.mosaicml.com/) team and released with multiple sizes and finetuned variants. The MPT models are a series of open source and commercially usable LLMs pre-trained on 1T tokens. -MPT models are GPT-style decoder-only transformers with several improvements: performance-optimized layer implementations, architecture changes that provide greater training stability, and the elimination of context length limits by replacing positional embeddings with ALiBi. +MPT models are GPT-style decoder-only transformers with several improvements: performance-optimized layer implementations, architecture changes that provide greater training stability, and the elimination of context length limits by replacing positional embeddings with ALiBi. -- MPT base: MPT base pre-trained models on next token prediction +- MPT base: MPT base pre-trained models on next token prediction - MPT instruct: MPT base models fine-tuned on instruction based tasks - MPT storywriter: MPT base models fine-tuned for 2500 steps on 65k-token excerpts of fiction books contained in the books3 corpus, this enables the model to handle very long sequences diff --git a/docs/source/en/model_doc/mra.md b/docs/source/en/model_doc/mra.md index ed11d1d9e04f..422ed3cec515 100644 --- a/docs/source/en/model_doc/mra.md +++ b/docs/source/en/model_doc/mra.md @@ -64,4 +64,4 @@ The original code can be found [here](https://github.com/mlpen/mra-attention). 
## MraForQuestionAnswering [[autodoc]] MraForQuestionAnswering - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/mt5.md b/docs/source/en/model_doc/mt5.md index fa02ee4c3c08..4e652458e1b3 100644 --- a/docs/source/en/model_doc/mt5.md +++ b/docs/source/en/model_doc/mt5.md @@ -133,7 +133,6 @@ print(tokenizer.decode(output[0], skip_special_tokens=True)) See [`T5Tokenizer`] for all details. - ## MT5TokenizerFast [[autodoc]] MT5TokenizerFast diff --git a/docs/source/en/model_doc/musicgen.md b/docs/source/en/model_doc/musicgen.md index 7e91b2265fe3..1b0e8868ac82 100644 --- a/docs/source/en/model_doc/musicgen.md +++ b/docs/source/en/model_doc/musicgen.md @@ -77,9 +77,9 @@ Generation is limited by the sinusoidal positional embeddings to 30 second input than 30 seconds of audio (1503 tokens), and input audio passed by Audio-Prompted Generation contributes to this limit so, given an input of 20 seconds of audio, MusicGen cannot generate more than 10 seconds of additional audio. -Transformers supports both mono (1-channel) and stereo (2-channel) variants of MusicGen. The mono channel versions -generate a single set of codebooks. The stereo versions generate 2 sets of codebooks, 1 for each channel (left/right), -and each set of codebooks is decoded independently through the audio compression model. The audio streams for each +Transformers supports both mono (1-channel) and stereo (2-channel) variants of MusicGen. The mono channel versions +generate a single set of codebooks. The stereo versions generate 2 sets of codebooks, 1 for each channel (left/right), +and each set of codebooks is decoded independently through the audio compression model. The audio streams for each channel are combined to give the final stereo output. ### Unconditional Generation @@ -208,7 +208,7 @@ For batched audio-prompted generation, the generated `audio_values` can be post- ### Generation Configuration -The default parameters that control the generation process, such as sampling, guidance scale and number of generated +The default parameters that control the generation process, such as sampling, guidance scale and number of generated tokens, can be found in the model's generation config, and updated as desired: ```python @@ -226,20 +226,21 @@ tokens, can be found in the model's generation config, and updated as desired: >>> model.generation_config.max_length = 256 ``` -Note that any arguments passed to the generate method will **supersede** those in the generation config, so setting -`do_sample=False` in the call to generate will supersede the setting of `model.generation_config.do_sample` in the +Note that any arguments passed to the generate method will **supersede** those in the generation config, so setting +`do_sample=False` in the call to generate will supersede the setting of `model.generation_config.do_sample` in the generation config. ## Model Structure The MusicGen model can be de-composed into three distinct stages: + 1. Text encoder: maps the text inputs to a sequence of hidden-state representations. The pre-trained MusicGen models use a frozen text encoder from either T5 or Flan-T5 2. MusicGen decoder: a language model (LM) that auto-regressively generates audio tokens (or codes) conditional on the encoder hidden-state representations 3. 
Audio encoder/decoder: used to encode an audio prompt to use as prompt tokens, and recover the audio waveform from the audio tokens predicted by the decoder Thus, the MusicGen model can either be used as a standalone decoder model, corresponding to the class [`MusicgenForCausalLM`], or as a composite model that includes the text encoder and audio encoder/decoder, corresponding to the class -[`MusicgenForConditionalGeneration`]. If only the decoder needs to be loaded from the pre-trained checkpoint, it can be loaded by first +[`MusicgenForConditionalGeneration`]. If only the decoder needs to be loaded from the pre-trained checkpoint, it can be loaded by first specifying the correct config, or be accessed through the `.decoder` attribute of the composite model: ```python @@ -259,6 +260,7 @@ be combined with the frozen text encoder and audio encoder/decoders to recover t model. Tips: + * MusicGen is trained on the 32kHz checkpoint of Encodec. You should ensure you use a compatible version of the Encodec model. * Sampling mode tends to deliver better results than greedy - you can toggle sampling with the variable `do_sample` in the call to [`MusicgenForConditionalGeneration.generate`] diff --git a/docs/source/en/model_doc/musicgen_melody.md b/docs/source/en/model_doc/musicgen_melody.md index d2cd51bbcf2c..baf21adaab21 100644 --- a/docs/source/en/model_doc/musicgen_melody.md +++ b/docs/source/en/model_doc/musicgen_melody.md @@ -35,13 +35,12 @@ The abstract from the paper is the following: *We tackle the task of conditional music generation. We introduce MusicGen, a single Language Model (LM) that operates over several streams of compressed discrete music representation, i.e., tokens. Unlike prior work, MusicGen is comprised of a single-stage transformer LM together with efficient token interleaving patterns, which eliminates the need for cascading several models, e.g., hierarchically or upsampling. Following this approach, we demonstrate how MusicGen can generate high-quality samples, while being conditioned on textual description or melodic features, allowing better controls over the generated output. We conduct extensive empirical evaluation, considering both automatic and human studies, showing the proposed approach is superior to the evaluated baselines on a standard text-to-music benchmark. Through ablation studies, we shed light over the importance of each of the components comprising MusicGen.* - This model was contributed by [ylacombe](https://huggingface.co/ylacombe). The original code can be found [here](https://github.com/facebookresearch/audiocraft). The pre-trained checkpoints can be found on the [Hugging Face Hub](https://huggingface.co/models?sort=downloads&search=facebook%2Fmusicgen). - ## Difference with [MusicGen](https://huggingface.co/docs/transformers/main/en/model_doc/musicgen) There are two key differences with MusicGen: + 1. The audio prompt is used here as a conditional signal for the generated audio sample, whereas it's used for audio continuation in [MusicGen](https://huggingface.co/docs/transformers/main/en/model_doc/musicgen). 2. Conditional text and audio signals are concatenated to the decoder's hidden states instead of being used as a cross-attention signal, as in MusicGen. @@ -54,19 +53,19 @@ MusicGen Melody is compatible with two generation modes: greedy and sampling. In Transformers supports both mono (1-channel) and stereo (2-channel) variants of MusicGen Melody. The mono channel versions generate a single set of codebooks. 
The stereo versions generate 2 sets of codebooks, 1 for each channel (left/right), and each set of codebooks is decoded independently through the audio compression model. The audio streams for each channel are combined to give the final stereo output. - #### Audio Conditional Generation The model can generate an audio sample conditioned on a text and an audio prompt through use of the [`MusicgenMelodyProcessor`] to pre-process the inputs. In the following examples, we load an audio file using the 🤗 Datasets library, which can be pip installed through the command below: -``` +```bash pip install --upgrade pip pip install datasets[audio] ``` The audio file we are about to use is loaded as follows: + ```python >>> from datasets import load_dataset @@ -147,10 +146,9 @@ Or save them as a `.wav` file using a third-party library, e.g. `soundfile`: >>> sf.write("musicgen_out.wav", audio_values[0].T.numpy(), sampling_rate) ``` - ### Text-only Conditional Generation -The same [`MusicgenMelodyProcessor`] can be used to pre-process a text-only prompt. +The same [`MusicgenMelodyProcessor`] can be used to pre-process a text-only prompt. ```python >>> from transformers import AutoProcessor, MusicgenMelodyForConditionalGeneration @@ -168,7 +166,6 @@ The same [`MusicgenMelodyProcessor`] can be used to pre-process a text-only prom The `guidance_scale` is used in classifier free guidance (CFG), setting the weighting between the conditional logits (which are predicted from the text prompts) and the unconditional logits (which are predicted from an unconditional or 'null' prompt). Higher guidance scale encourages the model to generate samples that are more closely linked to the input prompt, usually at the expense of poorer audio quality. CFG is enabled by setting `guidance_scale > 1`. For best results, use `guidance_scale=3` (default). - You can also generate in batch: ```python @@ -231,6 +228,7 @@ Note that any arguments passed to the generate method will **supersede** those i ## Model Structure The MusicGen model can be de-composed into three distinct stages: + 1. Text encoder: maps the text inputs to a sequence of hidden-state representations. The pre-trained MusicGen models use a frozen text encoder from either T5 or Flan-T5. 2. MusicGen Melody decoder: a language model (LM) that auto-regressively generates audio tokens (or codes) conditional on the encoder hidden-state representations 3. Audio decoder: used to recover the audio waveform from the audio tokens predicted by the decoder. @@ -260,10 +258,10 @@ python src/transformers/models/musicgen_melody/convert_musicgen_melody_transform ``` Tips: + * MusicGen is trained on the 32kHz checkpoint of Encodec. You should ensure you use a compatible version of the Encodec model. * Sampling mode tends to deliver better results than greedy - you can toggle sampling with the variable `do_sample` in the call to [`MusicgenMelodyForConditionalGeneration.generate`] - ## MusicgenMelodyDecoderConfig [[autodoc]] MusicgenMelodyDecoderConfig @@ -294,4 +292,4 @@ Tips: ## MusicgenMelodyForConditionalGeneration [[autodoc]] MusicgenMelodyForConditionalGeneration - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/mvp.md b/docs/source/en/model_doc/mvp.md index 2cce9bd6cac1..26aa2f29b76d 100644 --- a/docs/source/en/model_doc/mvp.md +++ b/docs/source/en/model_doc/mvp.md @@ -25,7 +25,6 @@ rendered properly in your Markdown viewer. 
The MVP model was proposed in [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://huggingface.co/papers/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. - According to the abstract, - MVP follows a standard Transformer encoder-decoder architecture. @@ -67,6 +66,7 @@ For summarization, it is an example to use MVP and MVP with summarization-specif ``` For data-to-text generation, it is an example to use MVP and multi-task pre-trained variants. + ```python >>> from transformers import MvpTokenizerFast, MvpForConditionalGeneration diff --git a/docs/source/en/model_doc/myt5.md b/docs/source/en/model_doc/myt5.md index 409735751252..35ab716a8e71 100644 --- a/docs/source/en/model_doc/myt5.md +++ b/docs/source/en/model_doc/myt5.md @@ -44,4 +44,3 @@ The original code can be found [here](https://github.com/tomlimi/MYTE). ## MyT5Tokenizer [[autodoc]] MyT5Tokenizer - diff --git a/docs/source/en/model_doc/nat.md b/docs/source/en/model_doc/nat.md index dadcae6f17f0..36662173f2f4 100644 --- a/docs/source/en/model_doc/nat.md +++ b/docs/source/en/model_doc/nat.md @@ -68,6 +68,7 @@ The `reshaped_hidden_states` have a shape of `(batch, num_channels, height, widt `(batch_size, height, width, num_channels)`. Notes: + - NAT depends on [NATTEN](https://github.com/SHI-Labs/NATTEN/)'s implementation of Neighborhood Attention. You can install it with pre-built wheels for Linux by referring to [shi-labs.com/natten](https://shi-labs.com/natten), or build on your system by running `pip install natten`. diff --git a/docs/source/en/model_doc/nemotron.md b/docs/source/en/model_doc/nemotron.md index 360a6ba22267..50f6f99eae2f 100644 --- a/docs/source/en/model_doc/nemotron.md +++ b/docs/source/en/model_doc/nemotron.md @@ -97,7 +97,6 @@ Minitron is released under the [NVIDIA Open Model License Agreement](https://dev | :------------- | :------------- | :------------- | :------------- | :------------- | | 75.0 | 74.0 | 24.1 | 50.9 | 29.5 - *Code generation performance*. 
Evaluated using [HumanEval](https://github.com/openai/human-eval): | p@1, 0-Shot | @@ -109,7 +108,8 @@ Please refer to our [paper](https://huggingface.co/papers/2407.14679) for the fu ### Citation If you find our work helpful, please consider citing our paper: -``` + +```bibtex @article{minitron2024, title={Compact Language Models via Pruning and Knowledge Distillation}, author={Saurav Muralidharan and Sharath Turuvekere Sreenivas and Raviraj Joshi and Marcin Chochowski and Mostofa Patwary and Mohammad Shoeybi and Bryan Catanzaro and Jan Kautz and Pavlo Molchanov}, @@ -123,13 +123,11 @@ If you find our work helpful, please consider citing our paper: [[autodoc]] NemotronConfig - ## NemotronModel [[autodoc]] NemotronModel - forward - ## NemotronForCausalLM [[autodoc]] NemotronForCausalLM @@ -140,13 +138,11 @@ If you find our work helpful, please consider citing our paper: [[autodoc]] NemotronForSequenceClassification - forward - ## NemotronForQuestionAnswering [[autodoc]] NemotronForQuestionAnswering - forward - ## NemotronForTokenClassification [[autodoc]] NemotronForTokenClassification diff --git a/docs/source/en/model_doc/nllb-moe.md b/docs/source/en/model_doc/nllb-moe.md index f1456ee402dd..d8c44a5fc0f8 100644 --- a/docs/source/en/model_doc/nllb-moe.md +++ b/docs/source/en/model_doc/nllb-moe.md @@ -110,7 +110,6 @@ See example below for a translation from romanian to german: - [Translation task guide](../tasks/translation) - [Summarization task guide](../tasks/summarization) - ## NllbMoeConfig [[autodoc]] NllbMoeConfig @@ -135,4 +134,3 @@ See example below for a translation from romanian to german: [[autodoc]] NllbMoeForConditionalGeneration - forward - diff --git a/docs/source/en/model_doc/nllb.md b/docs/source/en/model_doc/nllb.md index 6f12a3aa746b..f44c03dcfdd3 100644 --- a/docs/source/en/model_doc/nllb.md +++ b/docs/source/en/model_doc/nllb.md @@ -29,7 +29,6 @@ rendered properly in your Markdown viewer. [NLLB: No Language Left Behind](https://huggingface.co/papers/2207.04672) is a multilingual translation model. It's trained on data using data mining techniques tailored for low-resource languages and supports over 200 languages. NLLB features a conditional compute architecture using a Sparsely Gated Mixture of Experts. - You can find all the original NLLB checkpoints under the [AI at Meta](https://huggingface.co/facebook/models?search=nllb) organization. > [!TIP] @@ -129,9 +128,10 @@ visualizer("UN Chief says there is no military solution in Syria") >>> tokenizer = NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", legacy_behaviour=True) ``` - - For non-English languages, specify the language's [BCP-47](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200) code with the `src_lang` keyword as shown below. +- For non-English languages, specify the language's [BCP-47](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200) code with the `src_lang` keyword as shown below. + +- See example below for a translation from Romanian to German. - - See example below for a translation from Romanian to German. 
```python >>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer diff --git a/docs/source/en/model_doc/olmo2.md b/docs/source/en/model_doc/olmo2.md index 158909c085c3..ba2c93d3ab26 100644 --- a/docs/source/en/model_doc/olmo2.md +++ b/docs/source/en/model_doc/olmo2.md @@ -87,6 +87,7 @@ echo -e "Plants create energy through a process known as" | transformers-cli run Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends. The example below uses [torchao](../quantization/torchao) to only quantize the weights to 4-bits. + ```py #pip install torchao @@ -116,7 +117,6 @@ print(tokenizer.decode(output[0], skip_special_tokens=True)) ``` - ## Notes - OLMo2 uses RMSNorm instead of standard layer norm. The RMSNorm is applied to attention queries and keys, and it is applied after the attention and feedforward layers rather than before. @@ -129,7 +129,6 @@ print(tokenizer.decode(output[0], skip_special_tokens=True)) model = AutoModelForCausalLM.from_pretrained("allenai/OLMo-2-0425-1B", revision="stage1-step140000-tokens294B") ``` - ## Olmo2Config [[autodoc]] Olmo2Config diff --git a/docs/source/en/model_doc/olmo3.md b/docs/source/en/model_doc/olmo3.md index e320181925ca..07a3cc3ebed9 100644 --- a/docs/source/en/model_doc/olmo3.md +++ b/docs/source/en/model_doc/olmo3.md @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. - ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be rendered properly in your Markdown viewer. --> -*This model was released on {release_date} and added to Hugging Face Transformers on 2025-09-08.* +*This model was released on {release_date} and added to Hugging Face Transformers on 2025-09-16.* +
PyTorch @@ -46,7 +46,7 @@ pipe = pipeline( dtype=torch.bfloat16, device=0, ) - + result = pipe("Plants create energy through a process known as") print(result) ``` @@ -87,6 +87,7 @@ echo -e "Plants create energy through a process known as" | transformers-cli run Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends. The example below uses [torchao](../quantization/torchao) to only quantize the weights to 4-bits. + ```py #pip install torchao @@ -116,18 +117,16 @@ print(tokenizer.decode(output[0], skip_special_tokens=True)) ``` - ## Notes -- Load specific intermediate checkpoints by adding the `revision` parameter to [`~PreTrainedModel.from_pretrained`]. +- Load specific intermediate checkpoints by adding the `revision` parameter to [`~PreTrainedModel.from_pretrained`]. ```py from transformers import AutoModelForCausalLM - + model = AutoModelForCausalLM.from_pretrained("allenai/TBA", revision="stage1-step140000-tokens294B") ``` - ## Olmo3Config [[autodoc]] Olmo3Config @@ -144,4 +143,4 @@ print(tokenizer.decode(output[0], skip_special_tokens=True)) ## Olmo3PreTrainedModel [[autodoc]] Olmo3PreTrainedModel - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/oneformer.md b/docs/source/en/model_doc/oneformer.md index c4b3bd142fe0..7f5d32bc55a8 100644 --- a/docs/source/en/model_doc/oneformer.md +++ b/docs/source/en/model_doc/oneformer.md @@ -39,7 +39,7 @@ This model was contributed by [Jitesh Jain](https://huggingface.co/praeclarumjj3 ## Usage tips -- OneFormer requires two inputs during inference: *image* and *task token*. +- OneFormer requires two inputs during inference: *image* and *task token*. - During training, OneFormer only uses panoptic annotations. - If you want to train the model in a distributed environment across multiple nodes, then one should update the `get_num_masks` function inside in the `OneFormerLoss` class of `modeling_oneformer.py`. When training on multiple nodes, this should be diff --git a/docs/source/en/model_doc/openai-gpt.md b/docs/source/en/model_doc/openai-gpt.md index b45b205e2592..04d37d89cc49 100644 --- a/docs/source/en/model_doc/openai-gpt.md +++ b/docs/source/en/model_doc/openai-gpt.md @@ -15,7 +15,6 @@ rendered properly in your Markdown viewer. --> *This model was released on 2018-06-11 and added to Hugging Face Transformers on 2023-06-20.* -
PyTorch @@ -24,8 +23,6 @@ rendered properly in your Markdown viewer.
- - # GPT [GPT (Generative Pre-trained Transformer)](https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf) ([blog post](https://openai.com/index/language-unsupervised/)) focuses on effectively learning text representations and transferring them to tasks. This model trains the Transformer decoder to predict the next word, and then fine-tuned on labeled data. @@ -39,12 +36,9 @@ You can find all the original GPT checkpoints under the [OpenAI community](https The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`], and from the command line. - - - ```python import torch from transformers import pipeline @@ -75,6 +69,7 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True)) echo -e "The future of AI is" | transformers run --task text-generation --model openai-community/openai-gpt --device 0 ``` + @@ -89,22 +84,22 @@ echo -e "The future of AI is" | transformers run --task text-generation --model ## OpenAIGPTModel [[autodoc]] OpenAIGPTModel -- forward + - forward ## OpenAIGPTLMHeadModel [[autodoc]] OpenAIGPTLMHeadModel -- forward + - forward ## OpenAIGPTDoubleHeadsModel [[autodoc]] OpenAIGPTDoubleHeadsModel -- forward + - forward ## OpenAIGPTForSequenceClassification [[autodoc]] OpenAIGPTForSequenceClassification -- forward + - forward ## OpenAIGPTTokenizer diff --git a/docs/source/en/model_doc/opt.md b/docs/source/en/model_doc/opt.md index e645956f1ece..7c65689594e4 100644 --- a/docs/source/en/model_doc/opt.md +++ b/docs/source/en/model_doc/opt.md @@ -36,7 +36,6 @@ You can find all the original OPT checkpoints under the [OPT](https://huggingfac The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`], and from the command line. - @@ -65,12 +64,14 @@ model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device) generated_ids = model.generate(**model_inputs, max_new_tokens=30, do_sample=False) tokenizer.batch_decode(generated_ids)[0] ``` + ```py echo -e "Plants create energy through a process known as" | transformers run --task text-generation --model facebook/opt-125m --device 0 ``` + diff --git a/docs/source/en/model_doc/ovis2.md b/docs/source/en/model_doc/ovis2.md index ab1d761f19ed..731ebbb83f08 100644 --- a/docs/source/en/model_doc/ovis2.md +++ b/docs/source/en/model_doc/ovis2.md @@ -13,12 +13,13 @@ specific language governing permissions and limitations under the License. rendered properly in your Markdown viewer. --> +*This model was released on 2024-05-31 and added to Hugging Face Transformers on 2025-08-18.* # Ovis2 ## Overview -The [Ovis2](https://github.com/AIDC-AI/Ovis) is an updated version of the [Ovis](https://huggingface.co/papers/2405.20797) model developed by the AIDC-AI team at Alibaba International Digital Commerce Group. +The [Ovis2](https://github.com/AIDC-AI/Ovis) is an updated version of the [Ovis](https://huggingface.co/papers/2405.20797) model developed by the AIDC-AI team at Alibaba International Digital Commerce Group. Ovis2 is the latest advancement in multi-modal large language models (MLLMs), succeeding Ovis1.6. It retains the architectural design of the Ovis series, which focuses on aligning visual and textual embeddings, and introduces major improvements in data curation and training methods. 
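Assuming Ovis2 registers with the generic image-text-to-text Auto classes like other recent multimodal additions, a minimal usage sketch could look like the following; the `AIDC-AI/Ovis2-2B-hf` checkpoint id and the image URL are assumptions for illustration.

```python
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor

# assumed checkpoint id; any transformers-format Ovis2 checkpoint would follow the same pattern
model_id = "AIDC-AI/Ovis2-2B-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForImageTextToText.from_pretrained(model_id, dtype=torch.bfloat16, device_map="auto")

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# the processor's chat template handles image loading and prompt formatting
inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
).to(model.device, dtype=torch.bfloat16)

output = model.generate(**inputs, max_new_tokens=50)
# decode only the newly generated tokens
print(processor.batch_decode(output[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0])
```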
diff --git a/docs/source/en/model_doc/paligemma.md b/docs/source/en/model_doc/paligemma.md index 58aa622a0d37..fa7c193da453 100644 --- a/docs/source/en/model_doc/paligemma.md +++ b/docs/source/en/model_doc/paligemma.md @@ -140,6 +140,7 @@ visualizer(" What is in this image?") answer = "a pallas cat" inputs = processor(images=image, text=prompt, suffix=answer, return_tensors="pt") ``` + - PaliGemma can support multiple input images if it is fine-tuned to accept multiple images. For example, the [NLVR2](https://huggingface.co/google/paligemma-3b-ft-nlvr2-448) checkpoint supports multiple images. Pass the images as a list to the processor. ```py diff --git a/docs/source/en/model_doc/parakeet.md b/docs/source/en/model_doc/parakeet.md new file mode 100644 index 000000000000..4cb72e7e4585 --- /dev/null +++ b/docs/source/en/model_doc/parakeet.md @@ -0,0 +1,221 @@ + +*This model was released on {release_date} and added to Hugging Face Transformers on 2025-09-25.* + +
+PyTorch +SDPA +
+ +# Parakeet + +## Overview + +Parakeet models, [introduced by NVIDIA NeMo](https://developer.nvidia.com/blog/pushing-the-boundaries-of-speech-recognition-with-nemo-parakeet-asr-models/), are models that combine a [Fast Conformer](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/asr/models.html#fast-conformer) encoder with connectionist temporal classification (CTC), recurrent neural network transducer (RNNT) or token and duration transducer (TDT) decoder for automatic speech recognition. + +**Model Architecture** + +- **Fast Conformer Encoder**: A linearly scalable Conformer architecture that processes mel-spectrogram features and reduces sequence length through subsampling. This is more efficient version of the Conformer Encoder found in [FastSpeech2Conformer](./fastspeech2_conformer.md) (see [`ParakeetEncoder`] for the encoder implementation and details). +- [**ParakeetForCTC**](#parakeetforctc): a Fast Conformer Encoder + a CTC decoder + - **CTC Decoder**: Simple but effective decoder consisting of: + - 1D convolution projection from encoder hidden size to vocabulary size (for optimal NeMo compatibility). + - CTC loss computation for training. + - Greedy CTC decoding for inference. + +The original implementation can be found in [NVIDIA NeMo](https://github.com/NVIDIA/NeMo). +Model checkpoints are to be found under [the NVIDIA organization](https://huggingface.co/nvidia/models?search=parakeet). + +This model was contributed by [Nithin Rao Koluguri](https://huggingface.co/nithinraok), [Eustache Le Bihan](https://huggingface.co/eustlb) and [Eric Bezzam](https://huggingface.co/bezzam). + +## Usage + +### Basic usage + + + + +```py +from transformers import pipeline + +pipe = pipeline("automatic-speech-recognition", model="nvidia/parakeet-ctc-1.1b") +out = pipe("https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/bcn_weather.mp3") +print(out) +``` + + + + +```py +from transformers import AutoModelForCTC, AutoProcessor +from datasets import load_dataset, Audio +import torch + +device = "cuda" if torch.cuda.is_available() else "cpu" + +processor = AutoProcessor.from_pretrained("nvidia/parakeet-ctc-1.1b") +model = AutoModelForCTC.from_pretrained("nvidia/parakeet-ctc-1.1b", dtype="auto", device_map=device) + +ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") +ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate)) +speech_samples = [el['array'] for el in ds["audio"][:5]] + +inputs = processor(speech_samples, sampling_rate=processor.feature_extractor.sampling_rate) +inputs.to(model.device, dtype=model.dtype) +outputs = model.generate(**inputs) +print(processor.batch_decode(outputs)) +``` + + + + +### Making The Model Go Brrr + +Parakeet supports full-graph compilation with CUDA graphs! This optimization is most effective when you know the maximum audio length you want to transcribe. The key idea is using static input shapes to avoid recompilation. For example, if you know your audio will be under 30 seconds, you can use the processor to pad all inputs to 30 seconds, preparing consistent input features and attention masks. See the example below! 
+ +```python +from transformers import AutoModelForCTC, AutoProcessor +from datasets import load_dataset, Audio +import torch + +device = "cuda" if torch.cuda.is_available() else "cpu" + +processor = AutoProcessor.from_pretrained("nvidia/parakeet-ctc-1.1b") +model = AutoModelForCTC.from_pretrained("nvidia/parakeet-ctc-1.1b", dtype="auto", device_map=device) + +ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") +ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate)) +speech_samples = [el['array'] for el in ds["audio"][:5]] + +# Compile the generate method with fullgraph and CUDA graphs +model.generate = torch.compile(model.generate, fullgraph=True, mode="reduce-overhead") + +# let's define processor kwargs to pad to 30 seconds +processor_kwargs = { + "padding": "max_length", + "max_length": 30 * processor.feature_extractor.sampling_rate, +} + +# Define a timing context using CUDA events +class TimerContext: + def __init__(self, name="Execution"): + self.name = name + self.start_event = None + self.end_event = None + + def __enter__(self): + # Use CUDA events for more accurate GPU timing + self.start_event = torch.cuda.Event(enable_timing=True) + self.end_event = torch.cuda.Event(enable_timing=True) + self.start_event.record() + return self + + def __exit__(self, *args): + self.end_event.record() + torch.cuda.synchronize() + elapsed_time = self.start_event.elapsed_time(self.end_event) / 1000.0 + print(f"{self.name} time: {elapsed_time:.4f} seconds") + + +inputs = processor(speech_samples[0], **processor_kwargs) +inputs.to(device, dtype=model.dtype) +print("\n" + "="*50) +print("First generation - compiling...") +# Generate with the compiled model +with TimerContext("First generation"): + outputs = model.generate(**inputs) +print(processor.batch_decode(outputs)) + +inputs = processor(speech_samples[1], **processor_kwargs) +inputs.to(device, dtype=model.dtype) +print("\n" + "="*50) +print("Second generation - recording CUDA graphs...") +with TimerContext("Second generation"): + outputs = model.generate(**inputs) +print(processor.batch_decode(outputs)) + +inputs = processor(speech_samples[2], **processor_kwargs) +inputs.to(device, dtype=model.dtype) +print("\n" + "="*50) +print("Third generation - fast !!!") +with TimerContext("Third generation"): + outputs = model.generate(**inputs) +print(processor.batch_decode(outputs)) + +inputs = processor(speech_samples[3], **processor_kwargs) +inputs.to(device, dtype=model.dtype) +print("\n" + "="*50) +print("Fourth generation - still fast !!!") +with TimerContext("Fourth generation"): + outputs = model.generate(**inputs) +print(processor.batch_decode(outputs)) +``` + +### Training + +```python +from transformers import AutoModelForCTC, AutoProcessor +from datasets import load_dataset, Audio +import torch + +device = "cuda" if torch.cuda.is_available() else "cpu" + +processor = AutoProcessor.from_pretrained("nvidia/parakeet-ctc-1.1b") +model = AutoModelForCTC.from_pretrained("nvidia/parakeet-ctc-1.1b", dtype="auto", device_map=device) + +ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") +ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate)) +speech_samples = [el['array'] for el in ds["audio"][:5]] +text_samples = [el for el in ds["text"][:5]] + +# passing `text` to the processor will prepare inputs' `labels` key +inputs = processor(audio=speech_samples, text=text_samples, 
sampling_rate=processor.feature_extractor.sampling_rate) +inputs.to(device, dtype=model.dtype) + +outputs = model(**inputs) +outputs.loss.backward() +``` + +## ParakeetTokenizerFast + +[[autodoc]] ParakeetTokenizerFast + +## ParakeetFeatureExtractor + +[[autodoc]] ParakeetFeatureExtractor + - __call__ + +## ParakeetProcessor + +[[autodoc]] ParakeetProcessor + - __call__ + - batch_decode + - decode + +## ParakeetEncoderConfig + +[[autodoc]] ParakeetEncoderConfig + +## ParakeetCTCConfig + +[[autodoc]] ParakeetCTCConfig + +## ParakeetEncoder + +[[autodoc]] ParakeetEncoder + +## ParakeetForCTC + +[[autodoc]] ParakeetForCTC diff --git a/docs/source/en/model_doc/patchtsmixer.md b/docs/source/en/model_doc/patchtsmixer.md index 5541f4d80936..4a9ddef46416 100644 --- a/docs/source/en/model_doc/patchtsmixer.md +++ b/docs/source/en/model_doc/patchtsmixer.md @@ -25,15 +25,13 @@ rendered properly in your Markdown viewer. The PatchTSMixer model was proposed in [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://huggingface.co/papers/2306.09364) by Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong and Jayant Kalagnanam. - PatchTSMixer is a lightweight time-series modeling approach based on the MLP-Mixer architecture. In this HuggingFace implementation, we provide PatchTSMixer's capabilities to effortlessly facilitate lightweight mixing across patches, channels, and hidden features for effective multivariate time-series modeling. It also supports various attention mechanisms starting from simple gated attention to more complex self-attention blocks that can be customized accordingly. The model can be pretrained and subsequently used for various downstream tasks such as forecasting, classification and regression. - The abstract from the paper is the following: *TSMixer is a lightweight neural architecture exclusively composed of multi-layer perceptron (MLP) modules designed for multivariate forecasting and representation learning on patched time series. Our model draws inspiration from the success of MLP-Mixer models in computer vision. We demonstrate the challenges involved in adapting Vision MLP-Mixer for time series and introduce empirically validated components to enhance accuracy. This includes a novel design paradigm of attaching online reconciliation heads to the MLP-Mixer backbone, for explicitly modeling the time-series properties such as hierarchy and channel-correlations. We also propose a Hybrid channel modeling approach to effectively handle noisy channel interactions and generalization across diverse datasets, a common challenge in existing patch channel-mixing methods. Additionally, a simple gated attention mechanism is introduced in the backbone to prioritize important features. By incorporating these lightweight components, we significantly enhance the learning capability of simple MLP structures, outperforming complex Transformer models with minimal computing usage. Moreover, TSMixer's modular design enables compatibility with both supervised and masked self-supervised learning methods, making it a promising building block for time-series Foundation Models. TSMixer outperforms state-of-the-art MLP and Transformer models in forecasting by a considerable margin of 8-60%. 
It also outperforms the latest strong benchmarks of Patch-Transformer models (by 1-2%) with a significant reduction in memory and runtime (2-3X).* -This model was contributed by [ajati](https://huggingface.co/ajati), [vijaye12](https://huggingface.co/vijaye12), +This model was contributed by [ajati](https://huggingface.co/ajati), [vijaye12](https://huggingface.co/vijaye12), [gsinthong](https://huggingface.co/gsinthong), [namctin](https://huggingface.co/namctin), [wmgifford](https://huggingface.co/wmgifford), [kashif](https://huggingface.co/kashif). @@ -68,32 +66,27 @@ The model can also be used for time series classification and time series regres [[autodoc]] PatchTSMixerConfig - ## PatchTSMixerModel [[autodoc]] PatchTSMixerModel - forward - ## PatchTSMixerForPrediction [[autodoc]] PatchTSMixerForPrediction - forward - ## PatchTSMixerForTimeSeriesClassification [[autodoc]] PatchTSMixerForTimeSeriesClassification - forward - ## PatchTSMixerForPretraining [[autodoc]] PatchTSMixerForPretraining - forward - ## PatchTSMixerForRegression [[autodoc]] PatchTSMixerForRegression - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/pegasus_x.md b/docs/source/en/model_doc/pegasus_x.md index 791618c67d30..4f42b787b925 100644 --- a/docs/source/en/model_doc/pegasus_x.md +++ b/docs/source/en/model_doc/pegasus_x.md @@ -53,6 +53,7 @@ Through photosynthesis, plants capture energy from sunlight using a green pigmen These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure. This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle.""") ``` + @@ -78,12 +79,14 @@ input_ids = tokenizer(input_text, return_tensors="pt").to(model.device) output = model.generate(**input_ids, cache_implementation="static") print(tokenizer.decode(output[0], skip_special_tokens=True)) ``` + ```bash echo -e "Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet. Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts." | transformers-cli run --task summarization --model google/pegasus-x-large --device 0 ``` + diff --git a/docs/source/en/model_doc/perception_lm.md b/docs/source/en/model_doc/perception_lm.md index ee6b63fce6fd..7d3d608253fc 100644 --- a/docs/source/en/model_doc/perception_lm.md +++ b/docs/source/en/model_doc/perception_lm.md @@ -38,11 +38,9 @@ video captions. Additionally, we introduce PLM–VideoBench, a suite for evaluat understanding tasks focusing on the ability to reason about “what”, “where”, “when”, and “how” of a video. We make our work fully reproducible by providing data, training recipes, code & models.* - This model was contributed by [shumingh](https://huggingface.co/shumingh). The original code can be found [here](https://github.com/facebookresearch/perception_models). 
- ## PerceptionLMConfig [[autodoc]] PerceptionLMConfig diff --git a/docs/source/en/model_doc/persimmon.md b/docs/source/en/model_doc/persimmon.md index 764c959879ad..854eaee835df 100644 --- a/docs/source/en/model_doc/persimmon.md +++ b/docs/source/en/model_doc/persimmon.md @@ -39,7 +39,7 @@ The original code can be found [here](https://github.com/persimmon-ai-labs/adept The `Persimmon` models were trained using `bfloat16`, but the original inference uses `float16` The checkpoints uploaded on the hub use `dtype = 'float16'` which will be -used by the `AutoModel` API to cast the checkpoints from `torch.float32` to `torch.float16`. +used by the `AutoModel` API to cast the checkpoints from `torch.float32` to `torch.float16`. The `dtype` of the online weights is mostly irrelevant, unless you are using `dtype="auto"` when initializing a model using `model = AutoModelForCausalLM.from_pretrained("path", dtype = "auto")`. The reason is that the model will first be downloaded ( using the `dtype` of the checkpoints online) then it will be cast to the default `dtype` of `torch` (becomes `torch.float32`). Users should specify the `dtype` they want, and if they don't it will be `torch.float32`. @@ -47,7 +47,6 @@ Finetuning the model in `float16` is not recommended and known to produce `nan`, - Tips: - To convert the model, you need to clone the original repository using `git clone https://github.com/persimmon-ai-labs/adept-inference`, then get the checkpoints: @@ -62,6 +61,7 @@ python src/transformers/models/persimmon/convert_persimmon_weights_to_hf.py --i ``` For the chat model: + ```bash wget https://axtkn4xl5cip.objectstorage.us-phoenix-1.oci.customer-oci.com/n/axtkn4xl5cip/b/adept-public-data/o/8b_chat_model_release.tar tar -xvf 8b_base_model_release.tar @@ -76,13 +76,11 @@ model = PersimmonForCausalLM.from_pretrained("/output/path") tokenizer = PersimmonTokenizer.from_pretrained("/output/path") ``` - - Perismmon uses a `sentencepiece` based tokenizer, with a `Unigram` model. It supports bytefallback, which is only available in `tokenizers==0.14.0` for the fast tokenizer. The `LlamaTokenizer` is used as it is a standard wrapper around sentencepiece. The `chat` template will be updated with the templating functions in a follow up PR! - The authors suggest to use the following prompt format for the chat mode: `f"human: {prompt}\n\nadept:"` - ## PersimmonConfig [[autodoc]] PersimmonConfig diff --git a/docs/source/en/model_doc/phimoe.md b/docs/source/en/model_doc/phimoe.md index 319cbc470b91..64a12e3820ae 100644 --- a/docs/source/en/model_doc/phimoe.md +++ b/docs/source/en/model_doc/phimoe.md @@ -45,12 +45,14 @@ The original code for PhiMoE can be found [here](https://huggingface.co/microsof Phi-3.5-MoE-instruct has been integrated in the development version (4.44.2.dev) of `transformers`. Until the official version is released through `pip`, ensure that you are doing the following: + * When loading the model, ensure that `trust_remote_code=True` is passed as an argument of the `from_pretrained()` function. The current `transformers` version can be verified with: `pip list | grep transformers`. 
Examples of required packages: -``` + +```bash flash_attn==2.5.8 torch==2.3.1 accelerate==0.31.0 diff --git a/docs/source/en/model_doc/pix2struct.md b/docs/source/en/model_doc/pix2struct.md index c43c9b3b92ed..412d2c2fef95 100644 --- a/docs/source/en/model_doc/pix2struct.md +++ b/docs/source/en/model_doc/pix2struct.md @@ -79,4 +79,4 @@ The original code can be found [here](https://github.com/google-research/pix2str ## Pix2StructForConditionalGeneration [[autodoc]] Pix2StructForConditionalGeneration - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/pixtral.md b/docs/source/en/model_doc/pixtral.md index 55ba09084292..bb175973bd23 100644 --- a/docs/source/en/model_doc/pixtral.md +++ b/docs/source/en/model_doc/pixtral.md @@ -15,7 +15,6 @@ rendered properly in your Markdown viewer. --> *This model was released on 2024-09-17 and added to Hugging Face Transformers on 2024-09-14.* -
PyTorch diff --git a/docs/source/en/model_doc/plbart.md b/docs/source/en/model_doc/plbart.md index d8ce330cb0f7..b3459299437e 100644 --- a/docs/source/en/model_doc/plbart.md +++ b/docs/source/en/model_doc/plbart.md @@ -120,4 +120,4 @@ it's passed with the `text_target` keyword argument. ## PLBartForCausalLM [[autodoc]] PLBartForCausalLM - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/pop2piano.md b/docs/source/en/model_doc/pop2piano.md index 5f68b1805000..c934d8789037 100644 --- a/docs/source/en/model_doc/pop2piano.md +++ b/docs/source/en/model_doc/pop2piano.md @@ -21,14 +21,14 @@ specific language governing permissions and limitations under the License. The Pop2Piano model was proposed in [Pop2Piano : Pop Audio-based Piano Cover Generation](https://huggingface.co/papers/2211.00895) by Jongho Choi and Kyogu Lee. -Piano covers of pop music are widely enjoyed, but generating them from music is not a trivial task. It requires great -expertise with playing piano as well as knowing different characteristics and melodies of a song. With Pop2Piano you -can directly generate a cover from a song's audio waveform. It is the first model to directly generate a piano cover -from pop audio without melody and chord extraction modules. - -Pop2Piano is an encoder-decoder Transformer model based on [T5](https://huggingface.co/papers/1910.10683). The input audio -is transformed to its waveform and passed to the encoder, which transforms it to a latent representation. The decoder -uses these latent representations to generate token ids in an autoregressive way. Each token id corresponds to one of four +Piano covers of pop music are widely enjoyed, but generating them from music is not a trivial task. It requires great +expertise with playing piano as well as knowing different characteristics and melodies of a song. With Pop2Piano you +can directly generate a cover from a song's audio waveform. It is the first model to directly generate a piano cover +from pop audio without melody and chord extraction modules. + +Pop2Piano is an encoder-decoder Transformer model based on [T5](https://huggingface.co/papers/1910.10683). The input audio +is transformed to its waveform and passed to the encoder, which transforms it to a latent representation. The decoder +uses these latent representations to generate token ids in an autoregressive way. Each token id corresponds to one of four different token types: time, velocity, note and 'special'. The token ids are then decoded to their equivalent MIDI file. The abstract from the paper is the following: @@ -53,10 +53,13 @@ The original code can be found [here](https://github.com/sweetcocoa/pop2piano). ## Usage tips * To use Pop2Piano, you will need to install the 🤗 Transformers library, as well as the following third party modules: + ```bash pip install pretty-midi==0.2.9 essentia==2.1b6.dev1034 librosa scipy ``` + Please note that you may need to restart your runtime after installation. + * Pop2Piano is an Encoder-Decoder based model like T5. * Pop2Piano can be used to generate midi-audio files for a given audio sequence. * Choosing different composers in `Pop2PianoForConditionalGeneration.generate()` can lead to variety of different results. @@ -131,7 +134,6 @@ Please note that you may need to restart your runtime after installation. 
>>> tokenizer_output[1].write("./Outputs/midi_output2.mid") ``` - - Example of processing multiple audio files in batch (Using `Pop2PianoFeatureExtractor` and `Pop2PianoTokenizer`): ```python @@ -166,7 +168,6 @@ Please note that you may need to restart your runtime after installation. >>> tokenizer_output[1].write("./Outputs/midi_output2.mid") ``` - ## Pop2PianoConfig [[autodoc]] Pop2PianoConfig diff --git a/docs/source/en/model_doc/prompt_depth_anything.md b/docs/source/en/model_doc/prompt_depth_anything.md index 5af13c5d630e..d4b6f4cc2598 100644 --- a/docs/source/en/model_doc/prompt_depth_anything.md +++ b/docs/source/en/model_doc/prompt_depth_anything.md @@ -19,8 +19,7 @@ rendered properly in your Markdown viewer. ## Overview -The Prompt Depth Anything model was introduced in [Prompting Depth Anything for 4K Resolution Accurate Metric Depth Estimation](https://huggingface.co/papers/2412.14015) by Haotong Lin, Sida Peng, Jingxiao Chen, Songyou Peng, Jiaming Sun, Minghuan Liu, Hujun Bao, Jiashi Feng, Xiaowei Zhou, Bingyi Kang. - +The Prompt Depth Anything model was introduced in [Prompting Depth Anything for 4K Resolution Accurate Metric Depth Estimation](https://huggingface.co/papers/2412.14015) by Haotong Lin, Sida Peng, Jingxiao Chen, Songyou Peng, Jiaming Sun, Minghuan Liu, Hujun Bao, Jiashi Feng, Xiaowei Zhou, Bingyi Kang. The abstract from the paper is as follows: @@ -100,4 +99,4 @@ If you are interested in submitting a resource to be included here, please feel [[autodoc]] PromptDepthAnythingImageProcessorFast - preprocess - - post_process_depth_estimation \ No newline at end of file + - post_process_depth_estimation diff --git a/docs/source/en/model_doc/pvt.md b/docs/source/en/model_doc/pvt.md index e7902affe5f4..38858db55529 100644 --- a/docs/source/en/model_doc/pvt.md +++ b/docs/source/en/model_doc/pvt.md @@ -29,23 +29,22 @@ is used to further reduce the resource consumption when learning high-resolution The abstract from the paper is the following: -*Although convolutional neural networks (CNNs) have achieved great success in computer vision, this work investigates a -simpler, convolution-free backbone network useful for many dense prediction tasks. Unlike the recently proposed Vision -Transformer (ViT) that was designed for image classification specifically, we introduce the Pyramid Vision Transformer -(PVT), which overcomes the difficulties of porting Transformer to various dense prediction tasks. PVT has several -merits compared to current state of the arts. Different from ViT that typically yields low resolution outputs and -incurs high computational and memory costs, PVT not only can be trained on dense partitions of an image to achieve high -output resolution, which is important for dense prediction, but also uses a progressive shrinking pyramid to reduce the -computations of large feature maps. PVT inherits the advantages of both CNN and Transformer, making it a unified -backbone for various vision tasks without convolutions, where it can be used as a direct replacement for CNN backbones. +*Although convolutional neural networks (CNNs) have achieved great success in computer vision, this work investigates a +simpler, convolution-free backbone network useful for many dense prediction tasks. Unlike the recently proposed Vision +Transformer (ViT) that was designed for image classification specifically, we introduce the Pyramid Vision Transformer +(PVT), which overcomes the difficulties of porting Transformer to various dense prediction tasks. 
PVT has several +merits compared to current state of the arts. Different from ViT that typically yields low resolution outputs and +incurs high computational and memory costs, PVT not only can be trained on dense partitions of an image to achieve high +output resolution, which is important for dense prediction, but also uses a progressive shrinking pyramid to reduce the +computations of large feature maps. PVT inherits the advantages of both CNN and Transformer, making it a unified +backbone for various vision tasks without convolutions, where it can be used as a direct replacement for CNN backbones. We validate PVT through extensive experiments, showing that it boosts the performance of many downstream tasks, including -object detection, instance and semantic segmentation. For example, with a comparable number of parameters, PVT+RetinaNet -achieves 40.4 AP on the COCO dataset, surpassing ResNet50+RetinNet (36.3 AP) by 4.1 absolute AP (see Figure 2). We hope +object detection, instance and semantic segmentation. For example, with a comparable number of parameters, PVT+RetinaNet +achieves 40.4 AP on the COCO dataset, surpassing ResNet50+RetinNet (36.3 AP) by 4.1 absolute AP (see Figure 2). We hope that PVT could serve as an alternative and useful backbone for pixel-level predictions and facilitate future research.* This model was contributed by [Xrenya](https://huggingface.co/Xrenya). The original code can be found [here](https://github.com/whai362/PVT). - - PVTv1 on ImageNet-1K | **Model variant** |**Size** |**Acc@1**|**Params (M)**| @@ -55,7 +54,6 @@ This model was contributed by [Xrenya](https://huggingface.co/Xrenya). The origi | PVT-Medium | 224 | 81.2 | 44.2 | | PVT-Large | 224 | 81.7 | 61.4 | - ## PvtConfig [[autodoc]] PvtConfig diff --git a/docs/source/en/model_doc/pvt_v2.md b/docs/source/en/model_doc/pvt_v2.md index 0d0ee3cca751..5be8998f4cc2 100644 --- a/docs/source/en/model_doc/pvt_v2.md +++ b/docs/source/en/model_doc/pvt_v2.md @@ -26,7 +26,7 @@ The PVTv2 encoder structure has been successfully deployed to achieve state-of-t PVTv2 belongs to a family of models called [hierarchical transformers](https://natecibik.medium.com/the-rise-of-vision-transformers-f623c980419f) , which make adaptations to transformer layers in order to generate multi-scale feature maps. Unlike the columnal structure of Vision Transformer ([ViT](https://huggingface.co/papers/2010.11929)) which loses fine-grained detail, multi-scale feature maps are known preserve this detail and aid performance in dense prediction tasks. In the case of PVTv2, this is achieved by generating image patch tokens using 2D convolution with overlapping kernels in each encoder layer. -The multi-scale features of hierarchical transformers allow them to be easily swapped in for traditional workhorse computer vision backbone models like ResNet in larger architectures. Both Segformer and Panoptic Segformer demonstrated that configurations using PVTv2 for a backbone consistently outperformed those with similarly sized ResNet backbones. +The multi-scale features of hierarchical transformers allow them to be easily swapped in for traditional workhorse computer vision backbone models like ResNet in larger architectures. Both Segformer and Panoptic Segformer demonstrated that configurations using PVTv2 for a backbone consistently outperformed those with similarly sized ResNet backbones. 
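+
+As a minimal sketch of that backbone swap, multi-scale feature maps can be pulled out of a PVTv2 encoder through the generic `AutoBackbone` API; the `OpenGVLab/pvt_v2_b0` checkpoint id, the stage indices and the dummy input below are illustrative assumptions rather than a prescribed setup.
+
+```python
+import torch
+from transformers import AutoBackbone
+
+# Load PVTv2 as a generic backbone and request the feature maps of all four stages.
+backbone = AutoBackbone.from_pretrained("OpenGVLab/pvt_v2_b0", out_indices=(0, 1, 2, 3))
+
+# Stand-in for a preprocessed image batch (batch, channels, height, width).
+pixel_values = torch.randn(1, 3, 224, 224)
+with torch.no_grad():
+    outputs = backbone(pixel_values)
+
+# One feature map per requested stage, with progressively smaller spatial size and
+# more channels: the multi-scale pyramid that a detection or segmentation head
+# would consume in place of ResNet features.
+for i, feature_map in enumerate(outputs.feature_maps):
+    print(f"stage {i}: {tuple(feature_map.shape)}")
+```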
Another powerful feature of the PVTv2 is the complexity reduction in the self-attention layers called Spatial Reduction Attention (SRA), which uses 2D convolution layers to project hidden states to a smaller resolution before attending to them with the queries, improving the $O(n^2)$ complexity of self-attention to $O(n^2/R)$, with $R$ being the spatial reduction ratio (`sr_ratio`, aka kernel size and stride in the 2D convolution). @@ -48,6 +48,7 @@ This model was contributed by [FoamoftheSea](https://huggingface.co/FoamoftheSea - ImageNet pretrained weights for all model sizes can be found on the [hub](https://huggingface.co/models?other=pvt_v2). The best way to get started with the PVTv2 is to load the pretrained checkpoint with the size of your choosing using `AutoModelForImageClassification`: + ```python import requests import torch @@ -99,7 +100,6 @@ outputs = model(torch.tensor(processed["pixel_values"])) | PVT-V2-B4 | 224 | 83.6 | 62.6 | | PVT-V2-B5 | 224 | 83.8 | 82.0 | - ## PvtV2Config [[autodoc]] PvtV2Config diff --git a/docs/source/en/model_doc/qdqbert.md b/docs/source/en/model_doc/qdqbert.md index 4c934d92d5fc..b791b4b2afe6 100644 --- a/docs/source/en/model_doc/qdqbert.md +++ b/docs/source/en/model_doc/qdqbert.md @@ -115,7 +115,7 @@ tensors. After setting up the tensor quantizers, one can use the following examp The goal of exporting to ONNX is to deploy inference by [TensorRT](https://developer.nvidia.com/tensorrt). Fake quantization will be broken into a pair of QuantizeLinear/DequantizeLinear ONNX ops. After setting static member of -TensorQuantizer to use Pytorch’s own fake quantization functions, fake quantized model can be exported to ONNX, follow +TensorQuantizer to use Pytorch's own fake quantization functions, fake quantized model can be exported to ONNX, follow the instructions in [torch.onnx](https://pytorch.org/docs/stable/onnx.html). Example: ```python diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index 3f872302cc27..feeb69959b21 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -142,7 +142,6 @@ outputs = model.generate(**inputs, max_new_tokens=100) print(tokenizer.decode(outputs[0], skip_special_tokens=True)) ``` - ## Notes - Ensure your Transformers library version is up-to-date. Qwen2 requires Transformers>=4.37.0 for full support. diff --git a/docs/source/en/model_doc/qwen2_5_omni.md b/docs/source/en/model_doc/qwen2_5_omni.md index e124f7cdb421..e2e0dc348a1c 100644 --- a/docs/source/en/model_doc/qwen2_5_omni.md +++ b/docs/source/en/model_doc/qwen2_5_omni.md @@ -29,9 +29,7 @@ The [Qwen2.5-Omni](https://qwenlm.github.io/blog/qwen2.5-omni/) model is a unifi The abstract from the technical report is the following: -*We present Qwen2.5-Omni, an end-to-end multimodal model designed to perceive diverse modalities, including text, images, audio, and video, while simultaneously generating text and natural speech responses in a streaming manner. To enable the streaming of multimodal information inputs, both audio and visual encoders utilize a block-wise processing approach. This strategy effectively decouples the handling of long sequences of multimodal data, assigning the perceptual responsibilities to the multimodal encoder and entrusting the modeling of extended sequences to a large language model. Such a division of labor enhances the fusion of different modalities via the shared attention mechanism. 
To synchronize the timestamps of video inputs with audio, we organized the audio and video sequentially in an interleaved manner and propose a novel position embedding approach, named TMRoPE (Time-aligned Multimodal RoPE). To concurrently generate text and speech while avoiding interference between the two modalities, we propose Thinker-Talker architecture. In this framework, Thinker functions as a large language model tasked with text generation, while Talker is a dual-track autoregressive model that directly utilizes the hidden representations from the Thinker to produce audio tokens as output. Both the Thinker and Talker models are designed to be trained and inferred in an end-to-end manner. For decoding audio tokens in a streaming manner, we introduce a sliding-window DiT that restricts the receptive field, aiming to reduce the initial package delay. Qwen2.5-Omni outperforms the similarly sized Qwen2-VL and Qwen2-Audio in both image and audio capabilities. Furthermore, Qwen2.5-Omni achieves state-of-the-art performance on multimodal benchmarks like Omni-Bench. Notably, Qwen2.5-Omni is the first open-source model to achieve a level of performance in end-to-end speech instruction following that is comparable to its capabilities with text inputs, as evidenced by benchmarks such as MMLU and GSM8K. As for speech generation, Qwen2.5-Omni’s streaming Talker outperform most existing streaming and non-streaming alternatives in robustness and naturalness.* - - +*We present Qwen2.5-Omni, an end-to-end multimodal model designed to perceive diverse modalities, including text, images, audio, and video, while simultaneously generating text and natural speech responses in a streaming manner. To enable the streaming of multimodal information inputs, both audio and visual encoders utilize a block-wise processing approach. This strategy effectively decouples the handling of long sequences of multimodal data, assigning the perceptual responsibilities to the multimodal encoder and entrusting the modeling of extended sequences to a large language model. Such a division of labor enhances the fusion of different modalities via the shared attention mechanism. To synchronize the timestamps of video inputs with audio, we organized the audio and video sequentially in an interleaved manner and propose a novel position embedding approach, named TMRoPE (Time-aligned Multimodal RoPE). To concurrently generate text and speech while avoiding interference between the two modalities, we propose Thinker-Talker architecture. In this framework, Thinker functions as a large language model tasked with text generation, while Talker is a dual-track autoregressive model that directly utilizes the hidden representations from the Thinker to produce audio tokens as output. Both the Thinker and Talker models are designed to be trained and inferred in an end-to-end manner. For decoding audio tokens in a streaming manner, we introduce a sliding-window DiT that restricts the receptive field, aiming to reduce the initial package delay. Qwen2.5-Omni outperforms the similarly sized Qwen2-VL and Qwen2-Audio in both image and audio capabilities. Furthermore, Qwen2.5-Omni achieves state-of-the-art performance on multimodal benchmarks like Omni-Bench. Notably, Qwen2.5-Omni is the first open-source model to achieve a level of performance in end-to-end speech instruction following that is comparable to its capabilities with text inputs, as evidenced by benchmarks such as MMLU and GSM8K. 
As for speech generation, Qwen2.5-Omni's streaming Talker outperform most existing streaming and non-streaming alternatives in robustness and naturalness.* ## Notes @@ -40,7 +38,6 @@ The abstract from the technical report is the following: - In case out out-of-memory errors hwen working with video input, decrease `processor.max_pixels`. By default the maximum is set to a very arge value and high resolution visuals will not be resized, unless resolution exceeds `processor.max_pixels`. - The processor has its own [`~ProcessorMixin.apply_chat_template`] method to convert chat messages to model inputs. - ## Usage example `Qwen2.5-Omni` can be found on the [Huggingface Hub](https://huggingface.co/Qwen). @@ -275,7 +272,8 @@ processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B", min_pixels=min #### Prompt for audio output If users need audio output, the system prompt must be set as "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.", otherwise the audio output may not work as expected. -``` + +```python { "role": "system", "content": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.", @@ -285,6 +283,7 @@ If users need audio output, the system prompt must be set as "You are Qwen, a vi #### Use audio output or not The model supports both text and audio outputs, if users do not need audio outputs, they can set `enable_audio_output` in the `from_pretrained` function. This option will save about `~2GB` of GPU memory but the `return_audio` option for `generate` function will only allow to be set at `False`. + ```python model = Qwen2_5OmniForConditionalGeneration.from_pretrained( "Qwen/Qwen2.5-Omni-7B", @@ -341,8 +340,6 @@ model = Qwen2_5OmniForConditionalGeneration.from_pretrained( ) ``` - - ## Qwen2_5OmniConfig [[autodoc]] Qwen2_5OmniConfig diff --git a/docs/source/en/model_doc/qwen2_5_vl.md b/docs/source/en/model_doc/qwen2_5_vl.md index 62527ea4963a..7f682bf80201 100644 --- a/docs/source/en/model_doc/qwen2_5_vl.md +++ b/docs/source/en/model_doc/qwen2_5_vl.md @@ -26,7 +26,6 @@ rendered properly in your Markdown viewer. [Qwen2.5-VL](https://huggingface.co/papers/2502.13923) is a multimodal vision-language model, available in 3B, 7B, and 72B parameters, pretrained on 4.1T tokens. The model introduces window attention in the ViT encoder to accelerate training and inference, dynamic FPS sampling on the spatial and temporal dimensions for better video understanding across different sampling rates, and an upgraded MRoPE (multi-resolutional rotary positional encoding) mechanism to better capture and learn temporal dynamics. - You can find all the original Qwen2.5-VL checkpoints under the [Qwen2.5-VL](https://huggingface.co/collections/Qwen/qwen25-vl-6795ffac22b334a837c0f9a5) collection. > [!TIP] @@ -61,6 +60,7 @@ messages = [ pipe(text=messages,max_new_tokens=20, return_full_text=False) ``` + @@ -110,6 +110,7 @@ output_text = processor.batch_decode( ) print(output_text) ``` + @@ -130,9 +131,11 @@ model = Qwen2_5_VLForConditionalGeneration.from_pretrained( ) ``` + ### Notes - Use Qwen2.5-VL for video inputs by setting `"type": "video"` as shown below. 
+ ```python conversation = [ { @@ -159,8 +162,10 @@ model = Qwen2_5_VLForConditionalGeneration.from_pretrained( output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) print(output_text) ``` + - Use Qwen2.5-VL for a mixed batch of inputs (images, videos, text). Add labels when handling multiple images or videos for better reference as show below. + ```python import torch from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor @@ -221,14 +226,15 @@ model = Qwen2_5_VLForConditionalGeneration.from_pretrained( max_pixels = 2048*2048 processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels) ``` - + Higher resolution can require more compute whereas reducing the resolution can save memory as follows: - + ```python min_pixels = 256*28*28 max_pixels = 1024*28*28 processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels) ``` + ## Qwen2_5_VLConfig [[autodoc]] Qwen2_5_VLConfig diff --git a/docs/source/en/model_doc/qwen2_audio.md b/docs/source/en/model_doc/qwen2_audio.md index 7cdcd52119c0..9b9dd43a919d 100644 --- a/docs/source/en/model_doc/qwen2_audio.md +++ b/docs/source/en/model_doc/qwen2_audio.md @@ -36,7 +36,6 @@ The abstract from the paper is the following: *We introduce the latest progress of Qwen-Audio, a large-scale audio-language model called Qwen2-Audio, which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions. In contrast to complex hierarchical tags, we have simplified the pre-training process by utilizing natural language prompts for different data and tasks, and have further expanded the data volume. We have boosted the instruction-following capability of Qwen2-Audio and implemented two distinct audio interaction modes for voice chat and audio analysis. In the voice chat mode, users can freely engage in voice interactions with Qwen2-Audio without text input. In the audio analysis mode, users could provide audio and text instructions for analysis during the interaction. Note that we do not use any system prompts to switch between voice chat and audio analysis modes. Qwen2-Audio is capable of intelligently comprehending the content within audio and following voice commands to respond appropriately. For instance, in an audio segment that simultaneously contains sounds, multi-speaker conversations, and a voice command, Qwen2-Audio can directly understand the command and provide an interpretation and response to the audio. Additionally, DPO has optimized the model's performance in terms of factuality and adherence to desired behavior. According to the evaluation results from AIR-Bench, Qwen2-Audio outperformed previous SOTAs, such as Gemini-1.5-pro, in tests focused on audio-centric instruction-following capabilities. Qwen2-Audio is open-sourced with the aim of fostering the advancement of the multi-modal language community. 
* - ## Usage tips `Qwen2-Audio-7B` and `Qwen2-Audio-7B-Instruct` can be found on the [Huggingface Hub](https://huggingface.co/Qwen) @@ -79,6 +78,7 @@ In the following, we demonstrate how to use `Qwen2-Audio-7B-Instruct` for the in ### Voice Chat Inference In the voice chat mode, users can freely engage in voice interactions with Qwen2-Audio without text input: + ```python from io import BytesIO from urllib.request import urlopen @@ -119,6 +119,7 @@ response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_ ### Audio Analysis Inference In the audio analysis, users could provide both audio and text instructions for analysis: + ```python from io import BytesIO from urllib.request import urlopen @@ -167,6 +168,7 @@ response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_ ### Batch Inference We also support batch inference: + ```python from io import BytesIO from urllib.request import urlopen diff --git a/docs/source/en/model_doc/qwen2_moe.md b/docs/source/en/model_doc/qwen2_moe.md index b8a3fe65d310..9d55de63e16d 100644 --- a/docs/source/en/model_doc/qwen2_moe.md +++ b/docs/source/en/model_doc/qwen2_moe.md @@ -24,7 +24,6 @@ rendered properly in your Markdown viewer. # Qwen2MoE - [Qwen2MoE](https://huggingface.co/papers/2407.10671) is a Mixture-of-Experts (MoE) variant of [Qwen2](./qwen2), available as a base model and an aligned chat model. It uses SwiGLU activation, group query attention and a mixture of sliding window attention and full attention. The tokenizer can also be adapted to multiple languages and codes. The MoE architecture uses upcyled models from the dense language models. For example, Qwen1.5-MoE-A2.7B is upcycled from Qwen-1.8B. It has 14.3B parameters but only 2.7B parameters are activated during runtime. @@ -57,6 +56,7 @@ messages = [ outputs = pipe(messages, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95) print(outputs[0]["generated_text"][-1]['content']) ``` + @@ -100,14 +100,14 @@ generated_ids = [ response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] print(response) ``` - + + ```bash transformers chat Qwen/Qwen1.5-MoE-A2.7B-Chat --dtype auto --attn_implementation flash_attention_2 ``` - - + Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends. diff --git a/docs/source/en/model_doc/qwen2_vl.md b/docs/source/en/model_doc/qwen2_vl.md index 8ff09ca57238..59dc25b5e085 100644 --- a/docs/source/en/model_doc/qwen2_vl.md +++ b/docs/source/en/model_doc/qwen2_vl.md @@ -25,7 +25,7 @@ rendered properly in your Markdown viewer. ## Overview -The [Qwen2-VL](https://huggingface.co/papers/2409.12191) ([blog post](https://qwenlm.github.io/blog/qwen2-vl/)) model is a major update to [Qwen-VL](https://huggingface.co/papers/2308.12966) from the Qwen team at Alibaba Research. +The [Qwen2-VL](https://huggingface.co/papers/2409.12191) ([blog post](https://qwenlm.github.io/blog/qwen2-vl/)) model is a major update to [Qwen-VL](https://huggingface.co/papers/2308.12966) from the Qwen team at Alibaba Research. The abstract from the blog is the following: @@ -203,8 +203,8 @@ min_pixels = 256*28*28 max_pixels = 1024*28*28 processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels) ``` -This ensures each image gets encoded using a number between 256-1024 tokens. 
The 28 comes from the fact that the model uses a patch size of 14 and a temporal patch size of 2 (14 x 2 = 28). +This ensures each image gets encoded using a number between 256-1024 tokens. The 28 comes from the fact that the model uses a patch size of 14 and a temporal patch size of 2 (14 x 2 = 28). #### Multiple Image Inputs @@ -307,7 +307,7 @@ model = Qwen2VLForConditionalGeneration.from_pretrained( [[autodoc]] Qwen2VLTextModel - forward - + ## Qwen2VLModel [[autodoc]] Qwen2VLModel diff --git a/docs/source/en/model_doc/qwen3.md b/docs/source/en/model_doc/qwen3.md index 87e6ba500f96..0141388fb97f 100644 --- a/docs/source/en/model_doc/qwen3.md +++ b/docs/source/en/model_doc/qwen3.md @@ -25,7 +25,6 @@ rendered properly in your Markdown viewer. To be released with the official model launch. - ## Usage tips To be released with the official model launch. diff --git a/docs/source/en/model_doc/qwen3_next.md b/docs/source/en/model_doc/qwen3_next.md index f2e003182ee7..62b52e3d6d5e 100644 --- a/docs/source/en/model_doc/qwen3_next.md +++ b/docs/source/en/model_doc/qwen3_next.md @@ -13,18 +13,21 @@ specific language governing permissions and limitations under the License. rendered properly in your Markdown viewer. --> +*This model was released on {release_date} and added to Hugging Face Transformers on 2025-09-10.* + ## Overview -The Qwen3-Next series represents our next-generation foundation models, optimized for extreme context length and large-scale parameter efficiency. +The Qwen3-Next series represents our next-generation foundation models, optimized for extreme context length and large-scale parameter efficiency. The series introduces a suite of architectural innovations designed to maximize performance while minimizing computational cost: -- **Hybrid Attention**: Replaces standard attention with the combination of **Gated DeltaNet** and **Gated Attention**, enabling efficient context modeling. + +- **Hybrid Attention**: Replaces standard attention with the combination of **Gated DeltaNet** and **Gated Attention**, enabling efficient context modeling. - **High-Sparsity MoE**: Achieves an extreme low activation ratio as 1:50 in MoE layers — drastically reducing FLOPs per token while preserving model capacity. - **Multi-Token Prediction(MTP)**: Boosts pretraining model performance, and accelerates inference. -- **Other Optimizations**: Includes techniques such as **zero-centered and weight-decayed layernorm**, **Gated Attention**, and other stabilizing enhancements for robust training. +- **Other Optimizations**: Includes techniques such as **zero-centered and weight-decayed layernorm**, **Gated Attention**, and other stabilizing enhancements for robust training. Built on this architecture, we trained and open-sourced Qwen3-Next-80B-A3B — 80B total parameters, only 3B active — achieving extreme sparsity and efficiency. -Despite its ultra-efficiency, it outperforms Qwen3-32B on downstream tasks — while requiring **less than 1/10 of the training cost**. +Despite its ultra-efficiency, it outperforms Qwen3-32B on downstream tasks — while requiring **less than 1/10 of the training cost**. Moreover, it delivers over **10x higher inference throughput** than Qwen3-32B when handling contexts longer than 32K tokens. For more details, please visit our blog [Qwen3-Next](qwen3_next) ([blog post](https://qwenlm.github.io/blog/qwen3_next/)). 
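+
+To put the sparsity figures above into perspective, here is a small back-of-the-envelope sketch. The split between always-active (dense) parameters and expert parameters is an illustrative assumption, not the published Qwen3-Next configuration.
+
+```python
+# Rough arithmetic only: shows how a ~1:50 expert activation ratio can turn
+# 80B total parameters into roughly 3B active parameters per token.
+total_params = 80e9        # total parameters of Qwen3-Next-80B-A3B
+activation_ratio = 1 / 50  # fraction of expert parameters routed to per token
+dense_share = 0.02         # assumed share of always-active (non-expert) parameters
+
+moe_params = total_params * (1 - dense_share)
+active_params = total_params * dense_share + moe_params * activation_ratio
+print(f"Active parameters per token: ~{active_params / 1e9:.1f}B")  # ~3.2B
+```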
@@ -60,7 +63,7 @@ generated_ids = model.generate( **model_inputs, max_new_tokens=512 ) -output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() +output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() content = tokenizer.decode(output_ids, skip_special_tokens=True) diff --git a/docs/source/en/model_doc/qwen3_omni_moe.md b/docs/source/en/model_doc/qwen3_omni_moe.md new file mode 100644 index 000000000000..9b7fa18d3812 --- /dev/null +++ b/docs/source/en/model_doc/qwen3_omni_moe.md @@ -0,0 +1,409 @@ + +*This model was released on 2025-03-26 and added to Hugging Face Transformers on 2025-09-21.* + +# Qwen3-Omni-MOE + +
+PyTorch +FlashAttention +SDPA +
+ +## Overview + +The Qwen3-Omni-MOE model is a unified multiple modalities model proposed in [Qwen3-Omni Technical Report](https://huggingface.co/papers/2509.17765) from Qwen team, Alibaba Group. + +The abstract from the technical report is the following: + +*We present Qwen3-Omni, a single multimodal model that, for the first time, maintains state-of-the-art performance across text, image, audio, and video without any degradation relative to single-modal counterparts. Qwen3-Omni matches the performance of same-sized single-modal models within the Qwen series and excels particularly on audio tasks. Across 36 audio and audio-visual benchmarks, Qwen3-Omni achieves open-source SOTA on 32 benchmarks and overall SOTA on 22, outperforming strong closed-source models such as Gemini-2.5-Pro, Seed-ASR, and GPT-4o-Transcribe. Qwen3-Omni adopts a Thinker-Talker MoE architecture that unifies perception and generation across text, images, audio, and video, yielding fluent text and natural real-time speech. It supports text interaction in 119 languages, speech understanding in 19 languages, and speech generation in 10 languages. To reduce first-packet latency in streaming synthesis, Talker autoregressively predicts discrete speech codecs using a multi-codebook scheme. Leveraging the representational capacity of these codebooks, we replace computationally intensive block-wise diffusion with a lightweight causal ConvNet, enabling streaming from the first codec frame. In cold-start settings, Qwen3-Omni achieves a theoretical end-to-end first-packet latency of 234 ms. To further strengthen multimodal reasoning, we introduce a Thinking model that explicitly reasons over inputs from any modality. Since the research community currently lacks a general-purpose audio captioning model, we fine-tuned Qwen3-Omni-30B-A3B to obtain Qwen3-Omni-30B-A3B-Captioner, which produces detailed, low-hallucination captions for arbitrary audio inputs. Qwen3-Omni-30B-A3B, Qwen3-Omni-30B-A3B-Thinking, and Qwen3-Omni-30B-A3B-Captioner are publicly released under the Apache 2.0 license. + +## Notes + +- Use [`Qwen3OmniMoeForConditionalGeneration`] to generate audio and text output. To generate only one output type, use [`Qwen3OmniMoeThinkerForConditionalGeneration`] for text-only and [`Qwen3OmniMoeTalkerForConditionalGeneration`] for audio-only outputs. +- Audio generation with [`Qwen3OmniMoeForConditionalGeneration`] supports only single batch size at the moment. +- In case out out-of-memory errors hwen working with video input, decrease `processor.max_pixels`. By default the maximum is set to a very arge value and high resolution visuals will not be resized, unless resolution exceeds `processor.max_pixels`. +- The processor has its own [`~ProcessorMixin.apply_chat_template`] method to convert chat messages to model inputs. + +## Usage example + +`Qwen3-Omni` can be found on the [Huggingface Hub](https://huggingface.co/Qwen). + +### Single Media inference + +The model can accept text, images, audio and videos as input. Here's an example code for inference. 
+
+```python
+import soundfile as sf
+from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor
+
+model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
+    "Qwen/Qwen3-Omni-30B-A3B-Instruct",
+    dtype="auto",
+    device_map="auto"
+)
+processor = Qwen3OmniMoeProcessor.from_pretrained("Qwen/Qwen3-Omni-30B-A3B-Instruct")
+
+conversations = [
+    {
+        "role": "system",
+        "content": [
+            {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}
+        ],
+    },
+    {
+        "role": "user",
+        "content": [
+            {"type": "video", "video": "/path/to/video.mp4"},
+            {"type": "text", "text": "What can you hear and see in this video?"},
+        ],
+    },
+]
+
+inputs = processor.apply_chat_template(
+    conversations,
+    load_audio_from_video=True,
+    add_generation_prompt=True,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt",
+    video_fps=1,
+
+    # kwargs to be passed to `Qwen3OmniMoeProcessor`
+    padding=True,
+    use_audio_in_video=True,
+).to(model.device)
+
+# Generation params for audio or text can be different and have to be prefixed with `thinker_` or `talker_`
+text_ids, audio = model.generate(**inputs, use_audio_in_video=True, thinker_do_sample=False, talker_do_sample=True)
+text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+
+sf.write(
+    "output.wav",
+    audio.reshape(-1).detach().cpu().numpy(),
+    samplerate=24000,
+)
+print(text)
+```
+
+### Text-only generation
+
+To generate only text output and save compute by not loading the audio generation model, we can use the `Qwen3OmniMoeThinkerForConditionalGeneration` model.
+
+```python
+from transformers import Qwen3OmniMoeThinkerForConditionalGeneration, Qwen3OmniMoeProcessor
+
+model = Qwen3OmniMoeThinkerForConditionalGeneration.from_pretrained(
+    "Qwen/Qwen3-Omni-30B-A3B-Instruct",
+    dtype="auto",
+    device_map="auto",
+)
+processor = Qwen3OmniMoeProcessor.from_pretrained("Qwen/Qwen3-Omni-30B-A3B-Instruct")
+
+conversations = [
+    {
+        "role": "system",
+        "content": [
+            {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}
+        ],
+    },
+    {
+        "role": "user",
+        "content": [
+            {"type": "video", "video": "/path/to/video.mp4"},
+            {"type": "text", "text": "What can you hear and see in this video?"},
+        ],
+    },
+]
+
+inputs = processor.apply_chat_template(
+    conversations,
+    load_audio_from_video=True,
+    add_generation_prompt=True,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt",
+    video_fps=1,
+
+    # kwargs to be passed to `Qwen3OmniMoeProcessor`
+    padding=True,
+    use_audio_in_video=True,
+).to(model.device)
+
+text_ids = model.generate(**inputs, use_audio_in_video=True)
+text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+print(text)
+```
+
+### Batch Mixed Media Inference
+
+The model can batch inputs that mix samples of various types, such as text, images, audio and videos, when using the `Qwen3OmniMoeThinkerForConditionalGeneration` model. Here is an example.
+ +```python +import soundfile as sf +from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor + +model = Qwen3OmniMoeForConditionalGeneration.from_pretrained( + "Qwen/Qwen3-Omni-30B-A3B-Instruct", + dtype="auto", + device_map="auto" +) +processor = Qwen3OmniMoeProcessor.from_pretrained("Qwen/Qwen3-Omni-30B-A3B-Instruct") + +# Conversation with video only +conversation1 = [ + { + "role": "system", + "content": [ + {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."} + ], + }, + { + "role": "user", + "content": [ + {"type": "video", "path": "/path/to/video.mp4"}, + ] + } +] + +# Conversation with audio only +conversation2 = [ + { + "role": "system", + "content": [ + {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."} + ], + }, + { + "role": "user", + "content": [ + {"type": "audio", "path": "/path/to/audio.wav"}, + ] + } +] + +# Conversation with pure text +conversation3 = [ + { + "role": "system", + "content": [ + {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."} + ], + }, + { + "role": "user", + "content": [{"type": "text", "text": "who are you?"}], + } +] + + +# Conversation with mixed media +conversation4 = [ + { + "role": "system", + "content": [ + {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."} + ], + }, + { + "role": "user", + "content": [ + {"type": "image", "path": "/path/to/image.jpg"}, + {"type": "video", "path": "/path/to/video.mp4"}, + {"type": "audio", "path": "/path/to/audio.wav"}, + {"type": "text", "text": "What are the elements can you see and hear in these medias?"}, + ], + } +] + +conversations = [conversation1, conversation2, conversation3, conversation4] + +inputs = processor.apply_chat_template( + conversations, + load_audio_from_video=True, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt", + video_fps=1, + + # kwargs to be passed to `Qwen3OmniMoeProcessor` + padding=True, + use_audio_in_video=True, +).to(model.thinker.device) + +text_ids = model.generate(**inputs, use_audio_in_video=True) +text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) + +print(text) +``` + +### Usage Tips + +#### Image Resolution trade-off + +The model supports a wide range of resolution inputs. By default, it uses the native resolution for input, but higher resolutions can enhance performance at the cost of more computation. Users can set the minimum and maximum number of pixels to achieve an optimal configuration for their needs. + +```python +min_pixels = 128*28*28 +max_pixels = 768*28*28 +processor = AutoProcessor.from_pretrained("Qwen/Qwen3-Omni-30B-A3B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels) +``` + +#### Prompt for audio output +If users need audio output, the system prompt must be set as "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.", otherwise the audio output may not work as expected. 
+ +```json +{ + "role": "system", + "content": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.", +} +``` + +#### Use audio output or not + +The model supports both text and audio outputs, if users do not need audio outputs, they can set `enable_audio_output` in the `from_pretrained` function. This option will save about `~2GB` of GPU memory but the `return_audio` option for `generate` function will only allow to be set at `False`. + +```python +model = Qwen3OmniMoeForConditionalGeneration.from_pretrained( + "Qwen/Qwen3-Omni-30B-A3B-Instruct", + dtype="auto", + device_map="auto", + enable_audio_output=False, +) +``` + +In order to obtain a flexible experience, we recommend that users set `enable_audio_output` at `True` when initializing the model through `from_pretrained` function, and then decide whether to return audio when `generate` function is called. When `return_audio` is set to `False`, the model will only return text outputs to get text responses faster. + +```python +model = Qwen3OmniMoeForConditionalGeneration.from_pretrained( + "Qwen/Qwen3-Omni-30B-A3B-Instruct", + dtype="auto", + device_map="auto", + enable_audio_output=True, +) +... +text_ids = model.generate(**inputs, return_audio=False) +``` + +#### Change voice type of output audio +Qwen3-Omni-MOE supports the ability to change the voice of the output audio. Users can use the `spk` parameter of `generate` function to specify the voice type. The `"Qwen/Qwen3-Omni-30B-A3B-Instruct"` checkpoint support two voice types: `Chelsie` and `Ethan`, while `Chelsie` is a female voice and `Ethan` is a male voice. By default, if `spk` is not specified, the default voice type is `Chelsie`. + +```python +text_ids, audio = model.generate(**inputs, spk="Chelsie") +``` + +```python +text_ids, audio = model.generate(**inputs, spk="Ethan") +``` + +#### Flash-Attention 2 to speed up generation + +First, make sure to install the latest version of Flash Attention 2: + +```bash +pip install -U flash-attn --no-build-isolation +``` + +Also, you should have hardware that is compatible with FlashAttention 2. Read more about it in the official documentation of the [flash attention repository](https://github.com/Dao-AILab/flash-attention). FlashAttention-2 can only be used when a model is loaded in `torch.float16` or `torch.bfloat16`. 
+ +To load and run a model using FlashAttention-2, add `attn_implementation="flash_attention_2"` when loading the model: + +```python +from transformers import Qwen3OmniMoeForConditionalGeneration + +model = Qwen3OmniMoeForConditionalGeneration.from_pretrained( + "Qwen/Qwen3-Omni-30B-A3B-Instruct", + device_map="auto", + dtype=torch.bfloat16, + attn_implementation="flash_attention_2", +) +``` + +## Qwen3OmniMoeConfig + +[[autodoc]] Qwen3OmniMoeConfig + +## Qwen3OmniMoeThinkerConfig + +[[autodoc]] Qwen3OmniMoeThinkerConfig + +## Qwen3OmniMoeTalkerConfig + +[[autodoc]] Qwen3OmniMoeTalkerConfig + +## Qwen3OmniMoeForConditionalGeneration + +[[autodoc]] Qwen3OmniMoeForConditionalGeneration + +## Qwen3OmniMoeThinkerTextModel + +[[autodoc]] Qwen3OmniMoeThinkerTextModel + +## Qwen3OmniMoeThinkerForConditionalGeneration + +[[autodoc]] Qwen3OmniMoeThinkerForConditionalGeneration + +## Qwen3OmniMoeTalkerForConditionalGeneration + +[[autodoc]] Qwen3OmniMoeTalkerForConditionalGeneration + +## Qwen3OmniMoePreTrainedModel + +[[autodoc]] Qwen3OmniMoePreTrainedModel + +## Qwen3OmniMoePreTrainedModelForConditionalGeneration + +[[autodoc]] Qwen3OmniMoePreTrainedModelForConditionalGeneration + +## Qwen3OmniMoeTalkerModel + +[[autodoc]] Qwen3OmniMoeTalkerModel + +## Qwen3OmniMoeThinkerTextPreTrainedModel + +[[autodoc]] Qwen3OmniMoeThinkerTextPreTrainedModel + +## Qwen3OmniMoeProcessor + +[[autodoc]] Qwen3OmniMoeProcessor + +## Qwen3OmniMoeCode2Wav + +[[autodoc]] Qwen3OmniMoeCode2Wav + +## Qwen3OmniMoeCode2WavDecoderBlock + +[[autodoc]] Qwen3OmniMoeCode2WavDecoderBlock + +## Qwen3OmniMoeCode2WavTransformerModel + +[[autodoc]] Qwen3OmniMoeCode2WavTransformerModel + +## Qwen3OmniMoeTalkerCodePredictorModel + +[[autodoc]] Qwen3OmniMoeTalkerCodePredictorModel + +## Qwen3OmniMoeTalkerCodePredictorModelForConditionalGeneration + +[[autodoc]] Qwen3OmniMoeTalkerCodePredictorModelForConditionalGeneration diff --git a/docs/source/en/model_doc/qwen3_vl.md b/docs/source/en/model_doc/qwen3_vl.md index 9e90363a1eba..33c8c7e96aee 100644 --- a/docs/source/en/model_doc/qwen3_vl.md +++ b/docs/source/en/model_doc/qwen3_vl.md @@ -13,7 +13,7 @@ specific language governing permissions and limitations under the License. rendered properly in your Markdown viewer. --> -*This model was released on None and added to Hugging Face Transformers on 2025-08-16.* +*This model was released on 2025-09-23 and added to Hugging Face Transformers on 2025-09-15.*
@@ -77,6 +77,7 @@ output_text = processor.batch_decode( ) print(output_text) ``` + diff --git a/docs/source/en/model_doc/qwen3_vl_moe.md b/docs/source/en/model_doc/qwen3_vl_moe.md index 76d046efff2d..771f6d411cf2 100644 --- a/docs/source/en/model_doc/qwen3_vl_moe.md +++ b/docs/source/en/model_doc/qwen3_vl_moe.md @@ -13,7 +13,7 @@ specific language governing permissions and limitations under the License. rendered properly in your Markdown viewer. --> -*This model was released on None and added to Hugging Face Transformers on 2025-08-17.* +*This model was released on 2025-02-19 and added to Hugging Face Transformers on 2025-09-15.*
@@ -77,6 +77,7 @@ output_text = processor.batch_decode( ) print(output_text) ``` + diff --git a/docs/source/en/model_doc/recurrent_gemma.md b/docs/source/en/model_doc/recurrent_gemma.md index 1cd4e784a5bd..2d7c940e00a9 100644 --- a/docs/source/en/model_doc/recurrent_gemma.md +++ b/docs/source/en/model_doc/recurrent_gemma.md @@ -31,16 +31,14 @@ The abstract from the paper is the following: Tips: -- The original checkpoints can be converted using the conversion script [`src/transformers/models/recurrent_gemma/convert_recurrent_gemma_weights_to_hf.py`](https://github.com/huggingface/transformers/blob/main/src/transformers/models/recurrent_gemma/convert_recurrent_gemma_to_hf.py). +- The original checkpoints can be converted using the conversion script [`src/transformers/models/recurrent_gemma/convert_recurrent_gemma_weights_to_hf.py`](https://github.com/huggingface/transformers/blob/main/src/transformers/models/recurrent_gemma/convert_recurrent_gemma_to_hf.py). This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ). The original code can be found [here](https://github.com/google-deepmind/recurrentgemma). - ## RecurrentGemmaConfig [[autodoc]] RecurrentGemmaConfig - ## RecurrentGemmaModel [[autodoc]] RecurrentGemmaModel @@ -50,4 +48,3 @@ This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ). T [[autodoc]] RecurrentGemmaForCausalLM - forward - diff --git a/docs/source/en/model_doc/reformer.md b/docs/source/en/model_doc/reformer.md index f94134609d2b..c556e01ba13c 100644 --- a/docs/source/en/model_doc/reformer.md +++ b/docs/source/en/model_doc/reformer.md @@ -41,8 +41,8 @@ found [here](https://github.com/google/trax/tree/master/trax/models/reformer). ## Usage tips - Reformer does **not** work with *torch.nn.DataParallel* due to a bug in PyTorch, see [issue #36035](https://github.com/pytorch/pytorch/issues/36035). -- Use Axial position encoding (see below for more details). It’s a mechanism to avoid having a huge positional encoding matrix (when the sequence length is very big) by factorizing it into smaller matrices. -- Replace traditional attention by LSH (local-sensitive hashing) attention (see below for more details). It’s a technique to avoid computing the full product query-key in the attention layers. +- Use Axial position encoding (see below for more details). It's a mechanism to avoid having a huge positional encoding matrix (when the sequence length is very big) by factorizing it into smaller matrices. +- Replace traditional attention by LSH (local-sensitive hashing) attention (see below for more details). It's a technique to avoid computing the full product query-key in the attention layers. - Avoid storing the intermediate results of each layer by using reversible transformer layers to obtain them during the backward pass (subtracting the residuals from the input of the next layer gives them back) or recomputing them for results inside a given layer (less efficient than storing them but saves memory). - Compute the feedforward operations by chunks and not on the whole batch. @@ -89,7 +89,6 @@ equal to `config.hidden_size` and `config.axial_pos_shape` is set to a tuple \\( product has to be equal to `config.max_embedding_size`, which during training has to be equal to the *sequence length* of the `input_ids`. - ### LSH Self Attention In Locality sensitive hashing (LSH) self attention the key and query projection weights are tied. 
Therefore, the key @@ -122,7 +121,6 @@ Using LSH self attention, the memory and time complexity of the query-key matmul \\(\mathcal{O}(n_s \times n_s)\\) to \\(\mathcal{O}(n_s \times \log(n_s))\\), which usually represents the memory and time bottleneck in a transformer model, with \\(n_s\\) being the sequence length. - ### Local Self Attention Local self attention is essentially a "normal" self attention layer with key, query and value projections, but is @@ -134,7 +132,6 @@ Using Local self attention, the memory and time complexity of the query-key matm \\(\mathcal{O}(n_s \times n_s)\\) to \\(\mathcal{O}(n_s \times \log(n_s))\\), which usually represents the memory and time bottleneck in a transformer model, with \\(n_s\\) being the sequence length. - ### Training During training, we must ensure that the sequence length is set to a value that can be divided by the least common diff --git a/docs/source/en/model_doc/retribert.md b/docs/source/en/model_doc/retribert.md index 871bdc6e8c86..829fed24215f 100644 --- a/docs/source/en/model_doc/retribert.md +++ b/docs/source/en/model_doc/retribert.md @@ -39,7 +39,6 @@ pair of BERT encoders with lower-dimension projection for dense semantic indexin This model was contributed by [yjernite](https://huggingface.co/yjernite). Code to train and use the model can be found [here](https://github.com/huggingface/transformers/tree/main/examples/research-projects/distillation). - ## RetriBertConfig [[autodoc]] RetriBertConfig diff --git a/docs/source/en/model_doc/roberta.md b/docs/source/en/model_doc/roberta.md index da393646442a..43414fac4c88 100644 --- a/docs/source/en/model_doc/roberta.md +++ b/docs/source/en/model_doc/roberta.md @@ -28,7 +28,6 @@ rendered properly in your Markdown viewer. You can find all the original RoBERTa checkpoints under the [Facebook AI](https://huggingface.co/FacebookAI) organization. - > [!TIP] > Click on the RoBERTa models in the right sidebar for more examples of how to apply RoBERTa to different language tasks. diff --git a/docs/source/en/model_doc/rt_detr.md b/docs/source/en/model_doc/rt_detr.md index 02accfd6d9f7..d4c85f63fc37 100644 --- a/docs/source/en/model_doc/rt_detr.md +++ b/docs/source/en/model_doc/rt_detr.md @@ -23,7 +23,6 @@ rendered properly in your Markdown viewer. ## Overview - The RT-DETR model was proposed in [DETRs Beat YOLOs on Real-time Object Detection](https://huggingface.co/papers/2304.08069) by Wenyu Lv, Yian Zhao, Shangliang Xu, Jinman Wei, Guanzhong Wang, Cheng Cui, Yuning Du, Qingqing Dang, Yi Liu. RT-DETR is an object detection model that stands for "Real-Time DEtection Transformer." This model is designed to perform object detection tasks with a focus on achieving real-time performance while maintaining high accuracy. Leveraging the transformer architecture, which has gained significant popularity in various fields of deep learning, RT-DETR processes images to identify and locate multiple objects within them. @@ -39,7 +38,6 @@ alt="drawing" width="600"/> The model version was contributed by [rafaelpadilla](https://huggingface.co/rafaelpadilla) and [sangbumchoi](https://github.com/SangbumChoi). The original code can be found [here](https://github.com/lyuwenyu/RT-DETR/). - ## Usage tips Initially, an image is processed using a pre-trained convolutional neural network, specifically a Resnet-D variant as referenced in the original code. This network extracts features from the final three layers of the architecture. 
Following this, a hybrid encoder is employed to convert the multi-scale features into a sequential array of image features. Then, a decoder, equipped with auxiliary prediction heads is used to refine the object queries. This process facilitates the direct generation of bounding boxes, eliminating the need for any additional post-processing to acquire the logits and coordinates for the bounding boxes. diff --git a/docs/source/en/model_doc/rt_detr_v2.md b/docs/source/en/model_doc/rt_detr_v2.md index f5eb54625c84..3f814ce0d649 100644 --- a/docs/source/en/model_doc/rt_detr_v2.md +++ b/docs/source/en/model_doc/rt_detr_v2.md @@ -34,9 +34,9 @@ The abstract from the paper is the following: This model was contributed by [jadechoghari](https://huggingface.co/jadechoghari). The original code can be found [here](https://github.com/lyuwenyu/RT-DETR). -## Usage tips +## Usage tips -This second version of RT-DETR improves how the decoder finds objects in an image. +This second version of RT-DETR improves how the decoder finds objects in an image. - **better sampling** – adjusts offsets so the model looks at the right areas - **flexible attention** – can use smooth (bilinear) or fixed (discrete) sampling @@ -85,17 +85,15 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h - See also: [Object detection task guide](../tasks/object_detection). - Notebooks for [inference](https://github.com/qubvel/transformers-notebooks/blob/main/notebooks/RT_DETR_v2_inference.ipynb) and [fine-tuning](https://github.com/qubvel/transformers-notebooks/blob/main/notebooks/RT_DETR_v2_finetune_on_a_custom_dataset.ipynb) RT-DETRv2 on a custom dataset (🌎). - ## RTDetrV2Config [[autodoc]] RTDetrV2Config - ## RTDetrV2Model [[autodoc]] RTDetrV2Model - forward - + ## RTDetrV2ForObjectDetection [[autodoc]] RTDetrV2ForObjectDetection diff --git a/docs/source/en/model_doc/rwkv.md b/docs/source/en/model_doc/rwkv.md index 4d9d6bbb8860..9b5d64fedbb7 100644 --- a/docs/source/en/model_doc/rwkv.md +++ b/docs/source/en/model_doc/rwkv.md @@ -58,7 +58,7 @@ torch.allclose(torch.cat([output_one, output_two], dim=1), output_whole, atol=1e If you want to make sure the model stops generating when `'\n\n'` is detected, we recommend using the following stopping criteria: -```python +```python from transformers import StoppingCriteria class RwkvStoppingCriteria(StoppingCriteria): @@ -152,4 +152,4 @@ $$D_{i} = e^{u + K_{i} - q} + e^{M_{i}} \tilde{D}_{i} \hbox{ where } q = \max( which finally gives us -$$O_{i} = \sigma(R_{i}) \frac{N_{i}}{D_{i}}$$ \ No newline at end of file +$$O_{i} = \sigma(R_{i}) \frac{N_{i}}{D_{i}}$$ diff --git a/docs/source/en/model_doc/sam.md b/docs/source/en/model_doc/sam.md index 49a58254630a..65286eb8428d 100644 --- a/docs/source/en/model_doc/sam.md +++ b/docs/source/en/model_doc/sam.md @@ -41,7 +41,6 @@ Tips: - Fine-tuning the model is not supported yet - According to the paper, textual input should be also supported. However, at this time of writing this seems not to be supported according to [the official repository](https://github.com/facebookresearch/segment-anything/issues/4#issuecomment-1497626844). - This model was contributed by [ybelkada](https://huggingface.co/ybelkada) and [ArthurZ](https://huggingface.co/ArthurZ). The original code can be found [here](https://github.com/facebookresearch/segment-anything). 
@@ -98,6 +97,7 @@ masks = processor.image_processor.post_process_masks( ) scores = outputs.iou_scores ``` + ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with SAM. diff --git a/docs/source/en/model_doc/sam_hq.md b/docs/source/en/model_doc/sam_hq.md index 2bd14229c37c..9dea1de7a77e 100644 --- a/docs/source/en/model_doc/sam_hq.md +++ b/docs/source/en/model_doc/sam_hq.md @@ -25,7 +25,6 @@ The model is an enhancement to the original SAM model that produces significantl ![example image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-output.png) - SAM-HQ introduces several key improvements over the original SAM model: 1. High-Quality Output Token: A learnable token injected into SAM's mask decoder for higher quality mask prediction @@ -105,7 +104,6 @@ masks = processor.image_processor.post_process_masks( scores = outputs.iou_scores ``` - ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with SAM-HQ: @@ -137,7 +135,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] SamHQVisionModel - ## SamHQModel [[autodoc]] SamHQModel diff --git a/docs/source/en/model_doc/seamless_m4t.md b/docs/source/en/model_doc/seamless_m4t.md index c6f3a56f9ba1..e7fc00d047c3 100644 --- a/docs/source/en/model_doc/seamless_m4t.md +++ b/docs/source/en/model_doc/seamless_m4t.md @@ -67,7 +67,6 @@ Here is how to use the processor to process text and audio: >>> text_inputs = processor(text = "Hello, my dog is cute", src_lang="eng", return_tensors="pt") ``` - ### Speech [`SeamlessM4TModel`] can *seamlessly* generate text or speech with few or no changes. Let's target Russian voice translation: @@ -84,7 +83,7 @@ With basically the same code, I've translated English text and Arabic speech to Similarly, you can generate translated text from audio files or from text with the same model. You only have to pass `generate_speech=False` to [`SeamlessM4TModel.generate`]. This time, let's translate to French. -```python +```python >>> # from audio >>> output_tokens = model.generate(**audio_inputs, tgt_lang="fra", generate_speech=False) >>> translated_text_from_audio = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True) @@ -96,11 +95,10 @@ This time, let's translate to French. ### Tips - #### 1. Use dedicated models [`SeamlessM4TModel`] is transformers top level model to generate speech and text, but you can also use dedicated models that perform the task without additional components, thus reducing the memory footprint. -For example, you can replace the audio-to-audio generation snippet with the model dedicated to the S2ST task, the rest is exactly the same code: +For example, you can replace the audio-to-audio generation snippet with the model dedicated to the S2ST task, the rest is exactly the same code: ```python >>> from transformers import SeamlessM4TForSpeechToSpeech @@ -130,7 +128,6 @@ Use `return_intermediate_token_ids=True` with [`SeamlessM4TModel`] to return bot ## Model architecture - SeamlessM4T features a versatile architecture that smoothly handles the sequential generation of text and speech. This setup comprises two sequence-to-sequence (seq2seq) models. The first model translates the input modality into translated text, while the second model generates speech tokens, known as "unit tokens," from the translated text. 
Each modality has its own dedicated encoder with a unique architecture. Additionally, for speech output, a vocoder inspired by the [HiFi-GAN](https://huggingface.co/papers/2010.05646) architecture is placed on top of the second seq2seq model. @@ -142,7 +139,6 @@ Here's how the generation process works: - If speech generation is required, the second seq2seq model, following a standard encoder-decoder structure, generates unit tokens. - These unit tokens are then passed through the final vocoder to produce the actual speech. - This model was contributed by [ylacombe](https://huggingface.co/ylacombe). The original code can be found [here](https://github.com/facebookresearch/seamless_communication). ## SeamlessM4TModel @@ -150,19 +146,16 @@ This model was contributed by [ylacombe](https://huggingface.co/ylacombe). The o [[autodoc]] SeamlessM4TModel - generate - ## SeamlessM4TForTextToSpeech [[autodoc]] SeamlessM4TForTextToSpeech - generate - ## SeamlessM4TForSpeechToSpeech [[autodoc]] SeamlessM4TForSpeechToSpeech - generate - ## SeamlessM4TForTextToText [[autodoc]] transformers.SeamlessM4TForTextToText @@ -179,7 +172,6 @@ This model was contributed by [ylacombe](https://huggingface.co/ylacombe). The o [[autodoc]] SeamlessM4TConfig - ## SeamlessM4TTokenizer [[autodoc]] SeamlessM4TTokenizer @@ -189,7 +181,6 @@ This model was contributed by [ylacombe](https://huggingface.co/ylacombe). The o - create_token_type_ids_from_sequences - save_vocabulary - ## SeamlessM4TTokenizerFast [[autodoc]] SeamlessM4TTokenizerFast @@ -209,7 +200,6 @@ This model was contributed by [ylacombe](https://huggingface.co/ylacombe). The o [[autodoc]] SeamlessM4TCodeHifiGan - ## SeamlessM4THifiGan [[autodoc]] SeamlessM4THifiGan @@ -221,5 +211,3 @@ This model was contributed by [ylacombe](https://huggingface.co/ylacombe). The o ## SeamlessM4TTextToUnitForConditionalGeneration [[autodoc]] SeamlessM4TTextToUnitForConditionalGeneration - - diff --git a/docs/source/en/model_doc/seamless_m4t_v2.md b/docs/source/en/model_doc/seamless_m4t_v2.md index 8a4ab82d2e98..4a32199243ab 100644 --- a/docs/source/en/model_doc/seamless_m4t_v2.md +++ b/docs/source/en/model_doc/seamless_m4t_v2.md @@ -35,7 +35,7 @@ SeamlessM4T-v2 enables multiple tasks without relying on separate models: The abstract from the paper is the following: -*Recent advancements in automatic speech translation have dramatically expanded language coverage, improved multimodal capabilities, and enabled a wide range of tasks and functionalities. That said, large-scale automatic speech translation systems today lack key features that help machine-mediated communication feel seamless when compared to human-to-human dialogue. In this work, we introduce a family of models that enable end-to-end expressive and multilingual translations in a streaming fashion. First, we contribute an improved version of the massively multilingual and multimodal SeamlessM4T model—SeamlessM4T v2. This newer model, incorporating an updated UnitY2 framework, was trained on more low-resource language data. The expanded version of SeamlessAlign adds 114,800 hours of automatically aligned data for a total of 76 languages. SeamlessM4T v2 provides the foundation on which our two newest models, SeamlessExpressive and SeamlessStreaming, are initiated. SeamlessExpressive enables translation that preserves vocal styles and prosody. 
Compared to previous efforts in expressive speech research, our work addresses certain underexplored aspects of prosody, such as speech rate and pauses, while also preserving the style of one’s voice. As for SeamlessStreaming, our model leverages the Efficient Monotonic Multihead Attention (EMMA) mechanism to generate low-latency target translations without waiting for complete source utterances. As the first of its kind, SeamlessStreaming enables simultaneous speech-to-speech/text translation for multiple source and target languages. To understand the performance of these models, we combined novel and modified versions of existing automatic metrics to evaluate prosody, latency, and robustness. For human evaluations, we adapted existing protocols tailored for measuring the most relevant attributes in the preservation of meaning, naturalness, and expressivity. To ensure that our models can be used safely and responsibly, we implemented the first known red-teaming effort for multimodal machine translation, a system for the detection and mitigation of added toxicity, a systematic evaluation of gender bias, and an inaudible localized watermarking mechanism designed to dampen the impact of deepfakes. Consequently, we bring major components from SeamlessExpressive and SeamlessStreaming together to form Seamless, the first publicly available system that unlocks expressive cross-lingual communication in real-time. In sum, Seamless gives us a pivotal look at the technical foundation needed to turn the Universal Speech Translator from a science fiction concept into a real-world technology. Finally, contributions in this work—including models, code, and a watermark detector—are publicly released and accessible at the link below.* +*Recent advancements in automatic speech translation have dramatically expanded language coverage, improved multimodal capabilities, and enabled a wide range of tasks and functionalities. That said, large-scale automatic speech translation systems today lack key features that help machine-mediated communication feel seamless when compared to human-to-human dialogue. In this work, we introduce a family of models that enable end-to-end expressive and multilingual translations in a streaming fashion. First, we contribute an improved version of the massively multilingual and multimodal SeamlessM4T model—SeamlessM4T v2. This newer model, incorporating an updated UnitY2 framework, was trained on more low-resource language data. The expanded version of SeamlessAlign adds 114,800 hours of automatically aligned data for a total of 76 languages. SeamlessM4T v2 provides the foundation on which our two newest models, SeamlessExpressive and SeamlessStreaming, are initiated. SeamlessExpressive enables translation that preserves vocal styles and prosody. Compared to previous efforts in expressive speech research, our work addresses certain underexplored aspects of prosody, such as speech rate and pauses, while also preserving the style of one's voice. As for SeamlessStreaming, our model leverages the Efficient Monotonic Multihead Attention (EMMA) mechanism to generate low-latency target translations without waiting for complete source utterances. As the first of its kind, SeamlessStreaming enables simultaneous speech-to-speech/text translation for multiple source and target languages. To understand the performance of these models, we combined novel and modified versions of existing automatic metrics to evaluate prosody, latency, and robustness. 
For human evaluations, we adapted existing protocols tailored for measuring the most relevant attributes in the preservation of meaning, naturalness, and expressivity. To ensure that our models can be used safely and responsibly, we implemented the first known red-teaming effort for multimodal machine translation, a system for the detection and mitigation of added toxicity, a systematic evaluation of gender bias, and an inaudible localized watermarking mechanism designed to dampen the impact of deepfakes. Consequently, we bring major components from SeamlessExpressive and SeamlessStreaming together to form Seamless, the first publicly available system that unlocks expressive cross-lingual communication in real-time. In sum, Seamless gives us a pivotal look at the technical foundation needed to turn the Universal Speech Translator from a science fiction concept into a real-world technology. Finally, contributions in this work—including models, code, and a watermark detector—are publicly released and accessible at the link below.* ## Usage @@ -67,7 +67,6 @@ Here is how to use the processor to process text and audio: >>> text_inputs = processor(text = "Hello, my dog is cute", src_lang="eng", return_tensors="pt") ``` - ### Speech [`SeamlessM4Tv2Model`] can *seamlessly* generate text or speech with few or no changes. Let's target Russian voice translation: @@ -84,7 +83,7 @@ With basically the same code, I've translated English text and Arabic speech to Similarly, you can generate translated text from audio files or from text with the same model. You only have to pass `generate_speech=False` to [`SeamlessM4Tv2Model.generate`]. This time, let's translate to French. -```python +```python >>> # from audio >>> output_tokens = model.generate(**audio_inputs, tgt_lang="fra", generate_speech=False) >>> translated_text_from_audio = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True) @@ -96,11 +95,10 @@ This time, let's translate to French. ### Tips - #### 1. Use dedicated models [`SeamlessM4Tv2Model`] is transformers top level model to generate speech and text, but you can also use dedicated models that perform the task without additional components, thus reducing the memory footprint. -For example, you can replace the audio-to-audio generation snippet with the model dedicated to the S2ST task, the rest is exactly the same code: +For example, you can replace the audio-to-audio generation snippet with the model dedicated to the S2ST task, the rest is exactly the same code: ```python >>> from transformers import SeamlessM4Tv2ForSpeechToSpeech @@ -141,6 +139,7 @@ The architecture of this new version differs from the first in a few aspects: #### Improvements on the second-pass model The second seq2seq model, named text-to-unit model, is now non-auto regressive, meaning that it computes units in a **single forward pass**. This achievement is made possible by: + - the use of **character-level embeddings**, meaning that each character of the predicted translated text has its own embeddings, which are then used to predict the unit tokens. - the use of an intermediate duration predictor, that predicts speech duration at the **character-level** on the predicted translated text. - the use of a new text-to-unit decoder mixing convolutions and self-attention to handle longer context. 
@@ -148,6 +147,7 @@ The second seq2seq model, named text-to-unit model, is now non-auto regressive, #### Difference in the speech encoder The speech encoder, which is used during the first-pass generation process to predict the translated text, differs mainly from the previous speech encoder through these mechanisms: + - the use of chunked attention mask to prevent attention across chunks, ensuring that each position attends only to positions within its own chunk and a fixed number of previous chunks. - the use of relative position embeddings which only considers distance between sequence elements rather than absolute positions. Please refer to [Self-Attentionwith Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155) for more details. - the use of a causal depth-wise convolution instead of a non-causal one. @@ -161,7 +161,6 @@ Here's how the generation process works: - If speech generation is required, the second seq2seq model, generates unit tokens in an non auto-regressive way. - These unit tokens are then passed through the final vocoder to produce the actual speech. - This model was contributed by [ylacombe](https://huggingface.co/ylacombe). The original code can be found [here](https://github.com/facebookresearch/seamless_communication). ## SeamlessM4Tv2Model @@ -169,19 +168,16 @@ This model was contributed by [ylacombe](https://huggingface.co/ylacombe). The o [[autodoc]] SeamlessM4Tv2Model - generate - ## SeamlessM4Tv2ForTextToSpeech [[autodoc]] SeamlessM4Tv2ForTextToSpeech - generate - ## SeamlessM4Tv2ForSpeechToSpeech [[autodoc]] SeamlessM4Tv2ForSpeechToSpeech - generate - ## SeamlessM4Tv2ForTextToText [[autodoc]] transformers.SeamlessM4Tv2ForTextToText diff --git a/docs/source/en/model_doc/seed_oss.md b/docs/source/en/model_doc/seed_oss.md index 0f0dacb2be90..dbcddcb5f2c7 100644 --- a/docs/source/en/model_doc/seed_oss.md +++ b/docs/source/en/model_doc/seed_oss.md @@ -1,17 +1,20 @@ - + +*This model was released on {release_date} and added to Hugging Face Transformers on 2025-08-22.* # SeedOss @@ -54,4 +57,4 @@ To be released with the official model launch. ## SeedOssForQuestionAnswering [[autodoc]] SeedOssForQuestionAnswering - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/segformer.md b/docs/source/en/model_doc/segformer.md index 756c98d45f08..a6b407e58793 100644 --- a/docs/source/en/model_doc/segformer.md +++ b/docs/source/en/model_doc/segformer.md @@ -71,8 +71,6 @@ logits = outputs.logits # shape [batch, num_labels, height, width] - - ## Notes - SegFormer works with **any input size**, padding inputs to be divisible by `config.patch_sizes`. diff --git a/docs/source/en/model_doc/seggpt.md b/docs/source/en/model_doc/seggpt.md index 9e8c08cf2d2e..356b0f7abcf6 100644 --- a/docs/source/en/model_doc/seggpt.md +++ b/docs/source/en/model_doc/seggpt.md @@ -30,6 +30,7 @@ The abstract from the paper is the following: *We present SegGPT, a generalist model for segmenting everything in context. We unify various segmentation tasks into a generalist in-context learning framework that accommodates different kinds of segmentation data by transforming them into the same format of images. The training of SegGPT is formulated as an in-context coloring problem with random color mapping for each data sample. The objective is to accomplish diverse tasks according to the context, rather than relying on specific colors. 
After training, SegGPT can perform arbitrary segmentation tasks in images or videos via in-context inference, such as object instance, stuff, part, contour, and text. SegGPT is evaluated on a broad range of tasks, including few-shot semantic segmentation, video object segmentation, semantic segmentation, and panoptic segmentation. Our results show strong capabilities in segmenting in-domain and out-of* Tips: + - One can use [`SegGptImageProcessor`] to prepare image input, prompt and mask to the model. - One can either use segmentation maps or RGB images as prompt masks. If using the latter make sure to set `do_convert_rgb=False` in the `preprocess` method. - It's highly advisable to pass `num_labels` when using `segmentation_maps` (not considering background) during preprocessing and postprocessing with [`SegGptImageProcessor`] for your use case. @@ -74,7 +75,6 @@ mask = image_processor.post_process_semantic_segmentation(outputs, target_sizes, This model was contributed by [EduardoPacheco](https://huggingface.co/EduardoPacheco). The original code can be found [here]([(https://github.com/baaivision/Painter/tree/main)). - ## SegGptConfig [[autodoc]] SegGptConfig diff --git a/docs/source/en/model_doc/shieldgemma2.md b/docs/source/en/model_doc/shieldgemma2.md index 99ffde6288ff..6a67c2d61b5a 100644 --- a/docs/source/en/model_doc/shieldgemma2.md +++ b/docs/source/en/model_doc/shieldgemma2.md @@ -22,9 +22,9 @@ rendered properly in your Markdown viewer. The ShieldGemma 2 model was proposed in a [technical report](https://huggingface.co/papers/2504.01081) by Google. ShieldGemma 2, built on [Gemma 3](https://ai.google.dev/gemma/docs/core/model_card_3), is a 4 billion (4B) parameter model that checks the safety of both synthetic and natural images against key categories to help you build robust datasets and models. With this addition to the Gemma family of models, researchers and developers can now easily minimize the risk of harmful content in their models across key areas of harm as defined below: -- No Sexually Explicit content: The image shall not contain content that depicts explicit or graphic sexual acts (e.g., pornography, erotic nudity, depictions of rape or sexual assault). -- No Dangerous Content: The image shall not contain content that facilitates or encourages activities that could cause real-world harm (e.g., building firearms and explosive devices, promotion of terrorism, instructions for suicide). -- No Violence/Gore content: The image shall not contain content that depicts shocking, sensational, or gratuitous violence (e.g., excessive blood and gore, gratuitous violence against animals, extreme injury or moment of death). +- No Sexually Explicit content: The image shall not contain content that depicts explicit or graphic sexual acts (e.g., pornography, erotic nudity, depictions of rape or sexual assault). +- No Dangerous Content: The image shall not contain content that facilitates or encourages activities that could cause real-world harm (e.g., building firearms and explosive devices, promotion of terrorism, instructions for suicide). +- No Violence/Gore content: The image shall not contain content that depicts shocking, sensational, or gratuitous violence (e.g., excessive blood and gore, gratuitous violence against animals, extreme injury or moment of death). We recommend using ShieldGemma 2 as an input filter to vision language models, or as an output filter of image generation systems. 
To train a robust image safety model, we curated training datasets of natural and synthetic images and instruction-tuned Gemma 3 to demonstrate strong performance. @@ -86,7 +86,6 @@ output = model(**inputs) print(output.probabilities) ``` - ## ShieldGemma2Processor [[autodoc]] ShieldGemma2Processor diff --git a/docs/source/en/model_doc/siglip.md b/docs/source/en/model_doc/siglip.md index c0eb9a8ac6b5..bf9c0a460348 100644 --- a/docs/source/en/model_doc/siglip.md +++ b/docs/source/en/model_doc/siglip.md @@ -31,7 +31,6 @@ Unlike CLIP, SigLIP employs a pairwise sigmoid loss on image-text pairs during t You can find all the original SigLIP checkpoints under the [SigLIP](https://huggingface.co/collections/google/siglip-659d5e62f0ae1a57ae0e83ba) collection. - > [!TIP] > Click on the SigLIP models in the right sidebar for more examples of how to apply SigLIP to different image and text tasks. @@ -107,12 +106,14 @@ logits_per_image = outputs.logits_per_image probs = torch.sigmoid(logits_per_image) print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'") ``` + ## Notes - Training is supported for DDP and FSDP on single-node multi-GPU setups. However, it does not use [torch.distributed](https://pytorch.org/tutorials/beginner/dist_overview.html) utilities which may limit the scalability of batch size. - When using the standalone [`SiglipTokenizer`] or [`SiglipProcessor`], make sure to pass `padding="max_length"` because that is how the model was trained. - To get the same results as the [`Pipeline`], a prompt template of `"This is a photo of {label}."` should be passed to the processor. - Toggle the `attn_implementation` parameter to either `"sdpa"` or `"flash_attention_2"` to use a more memory-efficient attention. + ```py # pip install -U flash-attn --no-build-isolation @@ -126,7 +127,6 @@ print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'") ) ``` - ## SiglipConfig [[autodoc]] SiglipConfig @@ -179,7 +179,6 @@ print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'") [[autodoc]] SiglipVisionModel - forward - ## SiglipForImageClassification [[autodoc]] SiglipForImageClassification diff --git a/docs/source/en/model_doc/siglip2.md b/docs/source/en/model_doc/siglip2.md index f2684c6defcf..6a058f8907a4 100644 --- a/docs/source/en/model_doc/siglip2.md +++ b/docs/source/en/model_doc/siglip2.md @@ -32,7 +32,6 @@ rendered properly in your Markdown viewer. - NaFlex supports different resolutions and maintains the native image aspect ratio - FixRes supports fixed resolutions and is backwards compatible with [SigLIP](./siglip) - You can find all the original SigLIP2 checkpoints under the [SigLIP2](https://huggingface.co/collections/google/siglip2-67b5dcef38c175486e240107) collection. > [!TIP] @@ -157,6 +156,7 @@ print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'") NaFlex resizes the input image so the height and width are multiples of the patch size after resizing. It keeps the aspect ratio distortion as low as possible and produces a sequence length of at most the desired target sequence length (`max_num_patches`). After resizing, the image is split into a sequence of patches and a mask with padding information is added. - Toggle the `attn_implementation` parameter to either `"sdpa"` or `"flash_attention_2"` to use a more memory-efficient attention. 
+ ```py # pip install -U flash-attn --no-build-isolation @@ -169,6 +169,7 @@ print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'") device_map=device, ) ``` + ## Siglip2Config [[autodoc]] Siglip2Config diff --git a/docs/source/en/model_doc/smollm3.md b/docs/source/en/model_doc/smollm3.md index da98a15e33b5..db2ddd336013 100644 --- a/docs/source/en/model_doc/smollm3.md +++ b/docs/source/en/model_doc/smollm3.md @@ -139,7 +139,6 @@ outputs = model.generate(**inputs, max_new_tokens=100) print(tokenizer.decode(outputs[0], skip_special_tokens=True)) ``` - ## Notes - Ensure your Transformers library version is up-to-date. SmolLM3 requires Transformers>=4.53.0 for full support. diff --git a/docs/source/en/model_doc/smolvlm.md b/docs/source/en/model_doc/smolvlm.md index c9a886ac8769..61400bac177b 100644 --- a/docs/source/en/model_doc/smolvlm.md +++ b/docs/source/en/model_doc/smolvlm.md @@ -38,7 +38,8 @@ Videos should not be upsampled. If `do_resize` is set to `True`, the model resizes images so that the longest edge is 4*512 pixels by default. The default resizing behavior can be customized by passing a dictionary to the `size` parameter. For example, `{"longest_edge": 4 * 512}` is the default, but you can change it to a different value if needed. -Here’s how to control resizing and set a custom size: +Here's how to control resizing and set a custom size: + ```python image_processor = SmolVLMImageProcessor(do_resize=True, size={"longest_edge": 2 * 512}, max_image_size=512) ``` @@ -47,8 +48,6 @@ Additionally, the `max_image_size` parameter, which controls the size of each sq This model was contributed by [orrzohar](https://huggingface.co/orrzohar). - - ## Usage example ### Single Media inference diff --git a/docs/source/en/model_doc/stablelm.md b/docs/source/en/model_doc/stablelm.md index 29f32a0004e2..e47598a8f852 100644 --- a/docs/source/en/model_doc/stablelm.md +++ b/docs/source/en/model_doc/stablelm.md @@ -92,7 +92,6 @@ Now, to run the model with Flash Attention 2, refer to the snippet below: ['The weather is always wonderful in Costa Rica, which makes it a prime destination for retirees. That’s where the Pensionado program comes in, offering'] ``` - ## StableLmConfig [[autodoc]] StableLmConfig diff --git a/docs/source/en/model_doc/starcoder2.md b/docs/source/en/model_doc/starcoder2.md index 2d27aed399cd..b67e5dedd2cc 100644 --- a/docs/source/en/model_doc/starcoder2.md +++ b/docs/source/en/model_doc/starcoder2.md @@ -34,7 +34,7 @@ The abstract of the paper is the following: ## License The models are licensed under the [BigCode OpenRAIL-M v1 license agreement](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement). - + ## Usage tips The StarCoder2 models can be found in the [HuggingFace hub](https://huggingface.co/collections/bigcode/starcoder2-65de6da6e87db3383572be1a). You can find some examples for inference and fine-tuning in StarCoder2's [GitHub repo](https://github.com/bigcode-project/starcoder2). 
diff --git a/docs/source/en/model_doc/superglue.md b/docs/source/en/model_doc/superglue.md index 3e42b002ec6a..061f3ec2b9fb 100644 --- a/docs/source/en/model_doc/superglue.md +++ b/docs/source/en/model_doc/superglue.md @@ -143,10 +143,9 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size ## SuperGlueImageProcessor [[autodoc]] SuperGlueImageProcessor - -- preprocess -- post_process_keypoint_matching -- visualize_keypoint_matching + - preprocess + - post_process_keypoint_matching + - visualize_keypoint_matching @@ -157,4 +156,4 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size - forward - \ No newline at end of file + diff --git a/docs/source/en/model_doc/superpoint.md b/docs/source/en/model_doc/superpoint.md index b86f7fd4aa77..3efd5ecf90f2 100644 --- a/docs/source/en/model_doc/superpoint.md +++ b/docs/source/en/model_doc/superpoint.md @@ -33,8 +33,6 @@ You can find all the original SuperPoint checkpoints under the [Magic Leap Commu > > Click on the SuperPoint models in the right sidebar for more examples of how to apply SuperPoint to different computer vision tasks. - - The example below demonstrates how to detect interest points in an image with the [`AutoModel`] class. @@ -101,6 +99,7 @@ processed_outputs = processor.post_process_keypoint_detection(outputs, [image_si ``` - You can then print the keypoints on the image of your choice to visualize the result: + ```py import matplotlib.pyplot as plt plt.axis("off") @@ -130,16 +129,15 @@ processed_outputs = processor.post_process_keypoint_detection(outputs, [image_si ## SuperPointImageProcessor [[autodoc]] SuperPointImageProcessor - -- preprocess + - preprocess ## SuperPointImageProcessorFast [[autodoc]] SuperPointImageProcessorFast -- preprocess -- post_process_keypoint_detection + - preprocess + - post_process_keypoint_detection ## SuperPointForKeypointDetection [[autodoc]] SuperPointForKeypointDetection -- forward + - forward diff --git a/docs/source/en/model_doc/swin.md b/docs/source/en/model_doc/swin.md index f6a994ef69bc..81142f6c4111 100644 --- a/docs/source/en/model_doc/swin.md +++ b/docs/source/en/model_doc/swin.md @@ -47,6 +47,7 @@ pipeline = pipeline( ) pipeline("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg") ``` + @@ -79,6 +80,7 @@ class_labels = model.config.id2label predicted_class_label = class_labels[predicted_class_id] print(f"The predicted class label is: {predicted_class_label}") ``` + diff --git a/docs/source/en/model_doc/swinv2.md b/docs/source/en/model_doc/swinv2.md index 507b79fc7cf1..0dc008767ac3 100644 --- a/docs/source/en/model_doc/swinv2.md +++ b/docs/source/en/model_doc/swinv2.md @@ -81,7 +81,7 @@ print(f"The predicted class label is: {predicted_class_label}") ## Notes -- Swin Transformer V2 can pad the inputs for any input height and width divisible by `32`. +- Swin Transformer V2 can pad the inputs for any input height and width divisible by `32`. - Swin Transformer V2 can be used as a [backbone](../backbones). When `output_hidden_states = True`, it outputs both `hidden_states` and `reshaped_hidden_states`. The `reshaped_hidden_states` have a shape of `(batch, num_channels, height, width)` rather than `(batch_size, sequence_length, num_channels)`. 
## Swinv2Config diff --git a/docs/source/en/model_doc/switch_transformers.md b/docs/source/en/model_doc/switch_transformers.md index efa6bd499dbc..5eb27a9e7d8c 100644 --- a/docs/source/en/model_doc/switch_transformers.md +++ b/docs/source/en/model_doc/switch_transformers.md @@ -27,7 +27,6 @@ rendered properly in your Markdown viewer. You can find all the original Switch Transformers checkpoints under the [Switch Transformer](https://huggingface.co/collections/google/switch-transformers-release-6548c35c6507968374b56d1f) collection. - > [!TIP] > This model was contributed by [ybelkada](https://huggingface.co/ybelkada) and [ArthurZ](https://huggingface.co/ArthurZ). > @@ -99,7 +98,6 @@ outputs = model.generate(input_ids) print(tokenizer.decode(outputs[0])) ``` - ## SwitchTransformersConfig [[autodoc]] SwitchTransformersConfig diff --git a/docs/source/en/model_doc/t5gemma.md b/docs/source/en/model_doc/t5gemma.md index aa8d3b7880ed..80880cf6559d 100644 --- a/docs/source/en/model_doc/t5gemma.md +++ b/docs/source/en/model_doc/t5gemma.md @@ -39,7 +39,6 @@ The example below demonstrates how to chat with the model with [`Pipeline`] or t - ```python import torch from transformers import pipeline @@ -86,9 +85,10 @@ print(tokenizer.decode(outputs[0])) -``` +```bash echo -e "Write me a poem about Machine Learning. Answer:" | transformers run --task text2text-generation --model google/t5gemma-2b-2b-prefixlm --device 0 ``` + diff --git a/docs/source/en/model_doc/t5v1.1.md b/docs/source/en/model_doc/t5v1.1.md index 4ad072addcc0..62787d5f9d62 100644 --- a/docs/source/en/model_doc/t5v1.1.md +++ b/docs/source/en/model_doc/t5v1.1.md @@ -68,7 +68,6 @@ Google has released the following variants: - [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl). - Refer to [T5's documentation page](t5) for all API reference, tips, code examples and notebooks. diff --git a/docs/source/en/model_doc/table-transformer.md b/docs/source/en/model_doc/table-transformer.md index b35df2aec311..c982d3059072 100644 --- a/docs/source/en/model_doc/table-transformer.md +++ b/docs/source/en/model_doc/table-transformer.md @@ -43,8 +43,8 @@ alt="drawing" width="600"/> Table detection and table structure recognition clarified. Taken from the original paper. -The authors released 2 models, one for [table detection](https://huggingface.co/microsoft/table-transformer-detection) in -documents, one for [table structure recognition](https://huggingface.co/microsoft/table-transformer-structure-recognition) +The authors released 2 models, one for [table detection](https://huggingface.co/microsoft/table-transformer-detection) in +documents, one for [table structure recognition](https://huggingface.co/microsoft/table-transformer-structure-recognition) (the task of recognizing the individual rows, columns etc. in a table). This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be diff --git a/docs/source/en/model_doc/tapas.md b/docs/source/en/model_doc/tapas.md index 4dfac5edce37..09c624c7fb7e 100644 --- a/docs/source/en/model_doc/tapas.md +++ b/docs/source/en/model_doc/tapas.md @@ -30,6 +30,7 @@ token types that encode tabular structure. TAPAS is pre-trained on the masked la millions of tables from English Wikipedia and corresponding texts. For question answering, TAPAS has 2 heads on top: a cell selection head and an aggregation head, for (optionally) performing aggregations (such as counting or summing) among selected cells. 
TAPAS has been fine-tuned on several datasets: + - [SQA](https://www.microsoft.com/en-us/download/details.aspx?id=54253) (Sequential Question Answering by Microsoft) - [WTQ](https://github.com/ppasupat/WikiTableQuestions) (Wiki Table Questions by Stanford University) - [WikiSQL](https://github.com/salesforce/WikiSQL) (by Salesforce). @@ -76,7 +77,6 @@ To summarize: | Weak supervision for aggregation | WTQ | Questions might involve aggregation, and the model must learn this given only the answer as supervision | | Strong supervision for aggregation | WikiSQL-supervised | Questions might involve aggregation, and the model must learn this given the gold aggregation operator | - Initializing a model with a pre-trained base and randomly initialized classification heads from the hub can be done as shown below. ```py @@ -105,7 +105,6 @@ Of course, you don't necessarily have to follow one of these three ways in which >>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) ``` - What you can also do is start from an already fine-tuned checkpoint. A note here is that the already fine-tuned checkpoint on WTQ has some issues due to the L2-loss which is somewhat brittle. See [here](https://github.com/google-research/tapas/issues/91#issuecomment-735719340) for more info. For a list of all pre-trained and fine-tuned TAPAS checkpoints available on HuggingFace's hub, see [here](https://huggingface.co/models?search=tapas). @@ -128,7 +127,6 @@ The tables themselves should be present in a folder, each table being a separate **STEP 3: Convert your data into tensors using TapasTokenizer** - Third, given that you've prepared your data in this TSV/CSV format (and corresponding CSV files containing the tabular data), you can then use [`TapasTokenizer`] to convert table-question pairs into `input_ids`, `attention_mask`, `token_type_ids` and so on. Again, based on which of the three cases you picked above, [`TapasForQuestionAnswering`] requires different inputs to be fine-tuned: @@ -214,13 +212,11 @@ Of course, this only shows how to encode a single training example. It is advise >>> train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32) ``` - Note that here, we encode each table-question pair independently. This is fine as long as your dataset is **not conversational**. In case your dataset involves conversational questions (such as in SQA), then you should first group together the `queries`, `answer_coordinates` and `answer_text` per table (in the order of their `position` index) and batch encode each table with its questions. This will make sure that the `prev_labels` token types (see docs of [`TapasTokenizer`]) are set correctly. See [this notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) for more info. **STEP 4: Train (fine-tune) the model - You can then fine-tune [`TapasForQuestionAnswering`] as follows (shown here for the weak supervision for aggregation case): ```py @@ -272,10 +268,8 @@ You can then fine-tune [`TapasForQuestionAnswering`] as follows (shown here for ... optimizer.step() ``` - ## Usage: inference - Here we explain how you can use [`TapasForQuestionAnswering`] for inference (i.e. making predictions on new data). For inference, only `input_ids`, `attention_mask` and `token_type_ids` (which you can obtain using [`TapasTokenizer`]) have to be provided to the model to obtain the logits. 
Next, you can use the handy [`~models.tapas.tokenization_tapas.convert_logits_to_predictions`] method to convert these into predicted coordinates and optional aggregation indices. However, note that inference is **different** depending on whether or not the setup is conversational. In a non-conversational set-up, inference can be done in parallel on all table-question pairs of a batch. Here's an example of that: @@ -333,7 +327,6 @@ What is the total number of movies? Predicted answer: SUM > 87, 53, 69 ``` - In case of a conversational set-up, then each table-question pair must be provided **sequentially** to the model, such that the `prev_labels` token types can be overwritten by the predicted `labels` of the previous table-question pair. Again, more info can be found in [this notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb). ## Resources diff --git a/docs/source/en/model_doc/tapex.md b/docs/source/en/model_doc/tapex.md index 0a10826ee1af..606d8940c4ed 100644 --- a/docs/source/en/model_doc/tapex.md +++ b/docs/source/en/model_doc/tapex.md @@ -37,6 +37,7 @@ Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. TAPE which it can be fine-tuned to answer natural language questions related to tabular data, as well as performing table fact checking. TAPEX has been fine-tuned on several datasets: + - [SQA](https://www.microsoft.com/en-us/download/details.aspx?id=54253) (Sequential Question Answering by Microsoft) - [WTQ](https://github.com/ppasupat/WikiTableQuestions) (Wiki Table Questions by Stanford University) - [WikiSQL](https://github.com/salesforce/WikiSQL) (by Salesforce) diff --git a/docs/source/en/model_doc/textnet.md b/docs/source/en/model_doc/textnet.md index 9c29a8b16bee..c986b17dbff0 100644 --- a/docs/source/en/model_doc/textnet.md +++ b/docs/source/en/model_doc/textnet.md @@ -34,7 +34,7 @@ This model was contributed by [Raghavan](https://huggingface.co/Raghavan), [jade ## Usage tips -TextNet is mainly used as a backbone network for the architecture search of text detection. Each stage of the backbone network is comprised of a stride-2 convolution and searchable blocks. +TextNet is mainly used as a backbone network for the architecture search of text detection. Each stage of the backbone network is comprised of a stride-2 convolution and searchable blocks. Specifically, we present a layer-level candidate set, defined as {conv3×3, conv1×3, conv3×1, identity}. As the 1×3 and 3×1 convolutions have asymmetric kernels and oriented structure priors, they may help to capture the features of extreme aspect-ratio and rotated text lines. TextNet is the backbone for Fast, but can also be used as an efficient text/image classification, we add a `TextNetForImageClassification` as is it would allow people to train an image classifier on top of the pre-trained textnet weights @@ -62,4 +62,3 @@ TextNet is the backbone for Fast, but can also be used as an efficient text/imag [[autodoc]] TextNetForImageClassification - forward - diff --git a/docs/source/en/model_doc/time_series_transformer.md b/docs/source/en/model_doc/time_series_transformer.md index c38671f00fb3..36a68af80ca8 100644 --- a/docs/source/en/model_doc/time_series_transformer.md +++ b/docs/source/en/model_doc/time_series_transformer.md @@ -35,16 +35,16 @@ point forecasting model. 
This means that the model learns a distribution, from w and a decoder, which predicts a `prediction_length` of time series values into the future (called `future_values`). During training, one needs to provide pairs of (`past_values` and `future_values`) to the model. - In addition to the raw (`past_values` and `future_values`), one typically provides additional features to the model. These can be the following: - - `past_time_features`: temporal features which the model will add to `past_values`. These serve as "positional encodings" for the Transformer encoder. + - `past_time_features`: temporal features which the model will add to `past_values`. These serve as "positional encodings" for the Transformer encoder. Examples are "day of the month", "month of the year", etc. as scalar values (and then stacked together as a vector). e.g. if a given time-series value was obtained on the 11th of August, then one could have [11, 8] as time feature vector (11 being "day of the month", 8 being "month of the year"). - - `future_time_features`: temporal features which the model will add to `future_values`. These serve as "positional encodings" for the Transformer decoder. + - `future_time_features`: temporal features which the model will add to `future_values`. These serve as "positional encodings" for the Transformer decoder. Examples are "day of the month", "month of the year", etc. as scalar values (and then stacked together as a vector). e.g. if a given time-series value was obtained on the 11th of August, then one could have [11, 8] as time feature vector (11 being "day of the month", 8 being "month of the year"). - - `static_categorical_features`: categorical features which are static over time (i.e., have the same value for all `past_values` and `future_values`). + - `static_categorical_features`: categorical features which are static over time (i.e., have the same value for all `past_values` and `future_values`). An example here is the store ID or region ID that identifies a given time-series. Note that these features need to be known for ALL data points (also those in the future). - - `static_real_features`: real-valued features which are static over time (i.e., have the same value for all `past_values` and `future_values`). + - `static_real_features`: real-valued features which are static over time (i.e., have the same value for all `past_values` and `future_values`). An example here is the image representation of the product for which you have the time-series values (like the [ResNet](resnet) embedding of a "shoe" picture, if your time-series is about the sales of shoes). Note that these features need to be known for ALL data points (also those in the future). @@ -61,7 +61,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h - Check out the Time Series Transformer blog-post in HuggingFace blog: [Probabilistic Time Series Forecasting with 🤗 Transformers](https://huggingface.co/blog/time-series-transformers) - ## TimeSeriesTransformerConfig [[autodoc]] TimeSeriesTransformerConfig diff --git a/docs/source/en/model_doc/timesfm.md b/docs/source/en/model_doc/timesfm.md index 83dee48e71be..e8938202ee9e 100644 --- a/docs/source/en/model_doc/timesfm.md +++ b/docs/source/en/model_doc/timesfm.md @@ -25,16 +25,13 @@ rendered properly in your Markdown viewer. 
TimesFM (Time Series Foundation Model) is a pretrained time-series foundation model proposed in [A decoder-only foundation model for time-series forecasting](https://huggingface.co/papers/2310.10688) by Abhimanyu Das, Weihao Kong, Rajat Sen, and Yichen Zhou. It is a decoder only model that uses non-overlapping patches of time-series data as input and outputs some output patch length prediction in an autoregressive fashion. - The abstract from the paper is the following: *Motivated by recent advances in large language models for Natural Language Processing (NLP), we design a time-series foundation model for forecasting whose out-of-the-box zero-shot performance on a variety of public datasets comes close to the accuracy of state-of-the-art supervised forecasting models for each individual dataset. Our model is based on pretraining a patched-decoder style attention model on a large time-series corpus, and can work well across different forecasting history lengths, prediction lengths and temporal granularities.* - This model was contributed by [kashif](https://huggingface.co/kashif). The original code can be found [here](https://github.com/google-research/timesfm). - To use the model: ```python diff --git a/docs/source/en/model_doc/timesformer.md b/docs/source/en/model_doc/timesformer.md index 59e9ee71817d..1d87158d72e1 100644 --- a/docs/source/en/model_doc/timesformer.md +++ b/docs/source/en/model_doc/timesformer.md @@ -54,4 +54,4 @@ the number of input frames per clip changes based on the model size so you shoul ## TimesformerForVideoClassification [[autodoc]] TimesformerForVideoClassification - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/transfo-xl.md b/docs/source/en/model_doc/transfo-xl.md index 5d9b92f7946f..0bd1b0f57e1d 100644 --- a/docs/source/en/model_doc/transfo-xl.md +++ b/docs/source/en/model_doc/transfo-xl.md @@ -90,7 +90,6 @@ This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The o - Basically, the hidden states of the previous segment are concatenated to the current input to compute the attention scores. This allows the model to pay attention to information that was in the previous segment as well as the current one. By stacking multiple attention layers, the receptive field can be increased to multiple previous segments. - This changes the positional embeddings to positional relative embeddings (as the regular positional embeddings would give the same results in the current input and the current hidden state at a given position) and needs to make some adjustments in the way attention scores are computed. - TransformerXL does **not** work with *torch.nn.DataParallel* due to a bug in PyTorch, see [issue #36035](https://github.com/pytorch/pytorch/issues/36035) diff --git a/docs/source/en/model_doc/trocr.md b/docs/source/en/model_doc/trocr.md index 6346977dafa1..da5c71edde36 100644 --- a/docs/source/en/model_doc/trocr.md +++ b/docs/source/en/model_doc/trocr.md @@ -14,8 +14,6 @@ rendered properly in your Markdown viewer. specific language governing permissions and limitations under the License. --> *This model was released on 2021-09-21 and added to Hugging Face Transformers on 2021-10-13.* - -
PyTorch @@ -32,13 +30,11 @@ You can find all the original TrOCR checkpoints under the [Microsoft](https://hu alt="drawing" width="600"/> TrOCR architecture. Taken from the original paper. - > [!TIP] > This model was contributed by [nielsr](https://huggingface.co/nielsr). > > Click on the TrOCR models in the right sidebar for more examples of how to apply TrOCR to different image and text tasks. - The example below demonstrates how to perform optical character recognition (OCR) with the [`AutoModel`] class. @@ -113,7 +109,6 @@ print(generated_text) - A notebook on [inference with TrOCR](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Inference_with_TrOCR_%2B_Gradio_demo.ipynb) and Gradio demo. - A notebook on [evaluating TrOCR on the IAM test set](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Evaluating_TrOCR_base_handwritten_on_the_IAM_test_set.ipynb). - ## TrOCRConfig [[autodoc]] TrOCRConfig diff --git a/docs/source/en/model_doc/tvp.md b/docs/source/en/model_doc/tvp.md index 49a538ffa8c4..2df4da02555a 100644 --- a/docs/source/en/model_doc/tvp.md +++ b/docs/source/en/model_doc/tvp.md @@ -47,6 +47,7 @@ The [`TvpProcessor`] wraps [`BertTokenizer`] and [`TvpImageProcessor`] into a si encode the text and prepare the images respectively. The following example shows how to run temporal video grounding using [`TvpProcessor`] and [`TvpForVideoGrounding`]. + ```python import av import cv2 @@ -165,7 +166,6 @@ Tips: - Checkpoints for pre-trained [tvp-base](https://huggingface.co/Intel/tvp-base) is released. - Please refer to [Table 2](https://huggingface.co/papers/2303.04995) for TVP's performance on Temporal Video Grounding task. - ## TvpConfig [[autodoc]] TvpConfig diff --git a/docs/source/en/model_doc/udop.md b/docs/source/en/model_doc/udop.md index eb400cc39d5f..cc370accf3e3 100644 --- a/docs/source/en/model_doc/udop.md +++ b/docs/source/en/model_doc/udop.md @@ -115,4 +115,4 @@ to fine-tune UDOP on a custom dataset as well as inference. 🌎 ## UdopEncoderModel [[autodoc]] UdopEncoderModel - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/umt5.md b/docs/source/en/model_doc/umt5.md index 349dcecf03cc..784cc9974df1 100644 --- a/docs/source/en/model_doc/umt5.md +++ b/docs/source/en/model_doc/umt5.md @@ -39,7 +39,7 @@ Google has released the following variants: This model was contributed by [agemagician](https://huggingface.co/agemagician) and [stefan-it](https://huggingface.co/stefan-it). The original code can be found [here](https://github.com/google-research/t5x). -## Usage tips +## Usage tips - UMT5 was only pre-trained on [mC4](https://huggingface.co/datasets/mc4) excluding any supervised training. Therefore, this model has to be fine-tuned before it is usable on a downstream task, unlike the original T5 model. @@ -67,7 +67,7 @@ The conversion script is also different because the model was saved in t5x's lat ['nyone who drink a alcohol A A. This I'] ``` - + Refer to [T5's documentation page](t5) for more tips, code examples and notebooks. 
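The UMT5 hunk above only shows the sampled output of a generation example. A hedged reconstruction of that kind of span-filling call is sketched below with the `google/umt5-small` checkpoint; since sampling is used, the exact text will differ from the `['nyone who drink a alcohol A A. This I']` string shown in the diff.

```python
from transformers import AutoTokenizer, UMT5ForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")

# Span-corruption style prompt: sentinel tokens mark the spans the model should fill in.
inputs = tokenizer(
    "A <extra_id_0> walks into a bar and orders a <extra_id_1> with <extra_id_2> pinch of salt.",
    return_tensors="pt",
)
outputs = model.generate(**inputs, max_new_tokens=20, do_sample=True, top_p=0.95)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```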
@@ -105,4 +105,3 @@ Refer to [T5's documentation page](t5) for more tips, code examples and notebook [[autodoc]] UMT5ForQuestionAnswering - forward - diff --git a/docs/source/en/model_doc/univnet.md b/docs/source/en/model_doc/univnet.md index e20bc5c405e8..4329846ab7f9 100644 --- a/docs/source/en/model_doc/univnet.md +++ b/docs/source/en/model_doc/univnet.md @@ -69,7 +69,6 @@ write("sample_audio.wav", feature_extractor.sampling_rate, audio) This model was contributed by [dg845](https://huggingface.co/dg845). To the best of my knowledge, there is no official code release, but an unofficial implementation can be found at [maum-ai/univnet](https://github.com/maum-ai/univnet) with pretrained checkpoints [here](https://github.com/maum-ai/univnet#pre-trained-model). - ## UnivNetConfig [[autodoc]] UnivNetConfig @@ -82,4 +81,4 @@ To the best of my knowledge, there is no official code release, but an unofficia ## UnivNetModel [[autodoc]] UnivNetModel - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/upernet.md b/docs/source/en/model_doc/upernet.md index 2c2e50fc560d..900b5635fc16 100644 --- a/docs/source/en/model_doc/upernet.md +++ b/docs/source/en/model_doc/upernet.md @@ -81,4 +81,4 @@ If you're interested in submitting a resource to be included here, please feel f ## UperNetForSemanticSegmentation [[autodoc]] UperNetForSemanticSegmentation - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/van.md b/docs/source/en/model_doc/van.md index 0e07e314bee9..0a4ded430211 100644 --- a/docs/source/en/model_doc/van.md +++ b/docs/source/en/model_doc/van.md @@ -74,4 +74,3 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] VanForImageClassification - forward - diff --git a/docs/source/en/model_doc/vaultgemma.md b/docs/source/en/model_doc/vaultgemma.md index c9eb36124fca..deada15dc0f7 100644 --- a/docs/source/en/model_doc/vaultgemma.md +++ b/docs/source/en/model_doc/vaultgemma.md @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. - ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be rendered properly in your Markdown viewer. --> +*This model was released on 2016-07-01 and added to Hugging Face Transformers on 2025-09-12.* # VaultGemma @@ -30,7 +30,7 @@ sequence length. VaultGemma was trained from scratch with sequence-level differential privacy (DP). Its training data includes the same mixture as the [Gemma 2 models](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315), consisting of a number of documents of varying lengths. Additionally, it is trained using -[DP stochastic gradient descent (DP-SGD)](https://arxiv.org/abs/1607.00133) and provides a +[DP stochastic gradient descent (DP-SGD)](https://huggingface.co/papers/1607.00133) and provides a (ε ≤ 2.0, δ ≤ 1.1e-10)-sequence-level DP guarantee, where a sequence consists of 1024 consecutive tokens extracted from heterogeneous data sources. Specifically, the privacy unit of the guarantee is for the sequences after sampling and packing of the mixture. @@ -44,7 +44,6 @@ command line. - ```python from transformers import pipeline @@ -82,7 +81,7 @@ print(tokenizer.decode(outputs[0])) -``` +```bash echo -e "Write me a poem about Machine Learning. 
Answer:" | transformers run --task text2text-generation --model google/vaultgemma-1b-pt --device 0 ``` diff --git a/docs/source/en/model_doc/video_llava.md b/docs/source/en/model_doc/video_llava.md index 6b09367f37c8..2e1bf19abdc6 100644 --- a/docs/source/en/model_doc/video_llava.md +++ b/docs/source/en/model_doc/video_llava.md @@ -27,7 +27,6 @@ rendered properly in your Markdown viewer. Video-LLaVa is an open-source multimodal LLM trained by fine-tuning LlamA/Vicuna on multimodal instruction-following data generated by Llava1.5 and VideChat. It is an auto-regressive language model, based on the transformer architecture. Video-LLaVa unifies visual representations to the language feature space, and enables an LLM to perform visual reasoning capabilities on both images and videos simultaneously. - The Video-LLaVA model was proposed in [Video-LLaVA: Learning United Visual Representation by Alignment Before Projection](https://huggingface.co/papers/2311.10122) by Bin Lin, Yang Ye, Bin Zhu, Jiaxi Cui, Munang Ning, Peng Jin, Li Yuan. The abstract from the paper is the following: @@ -55,18 +54,16 @@ for the LLM* - Note the model has not been explicitly trained to process multiple images/videos in the same prompt, although this is technically possible, you may experience inaccurate results. -- Note that the video inputs should have exactly 8 frames at the input, since the models were trained in that setting. +- Note that the video inputs should have exactly 8 frames at the input, since the models were trained in that setting. This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanTurganbay). The original code can be found [here](https://github.com/PKU-YuanGroup/Video-LLaVA). - > [!NOTE] -> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. +> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be failure when merging the embeddings. The attributes can be obtained from model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token or `0` if nothing extra is added to the vision patches. - ## Usage example ### Single Media Mode @@ -126,7 +123,7 @@ For multiple turns conversation change the prompt format to: ### Mixed Media Mode -The model can also generate from an interleaved image-video inputs. However note, that it was not trained in interleaved image-video setting which might affect the performance. 
Below is an example usage for mixed media input, add the following lines to the above code snippet: +The model can also generate from an interleaved image-video inputs. However note, that it was not trained in interleaved image-video setting which might affect the performance. Below is an example usage for mixed media input, add the following lines to the above code snippet: ```python from PIL import Image @@ -150,7 +147,7 @@ processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokeniza ### Quantization using Bitsandbytes for memory efficiency -The model can be loaded in lower bits, significantly reducing memory burden while maintaining the performance of the original model. his allows for efficient deployment on resource-constrained cases. +The model can be loaded in lower bits, significantly reducing memory burden while maintaining the performance of the original model. his allows for efficient deployment on resource-constrained cases. First make sure to install bitsandbytes by running `pip install bitsandbytes` and to have access to a GPU/accelerator that is supported by the library. @@ -164,7 +161,6 @@ We value your feedback to help identify bugs before the full release! Check out Load the quantized model by simply adding [`BitsAndBytesConfig`](../main_classes/quantization#transformers.BitsAndBytesConfig) as shown below: - ```python from transformers import VideoLlavaForConditionalGeneration, BitsAndBytesConfig @@ -178,7 +174,6 @@ quantization_config = BitsAndBytesConfig( model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", quantization_config=quantization_config, device_map="auto") ``` - ### Flash-Attention 2 to speed-up generation Additionally, we can greatly speed-up model inference by using [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. @@ -203,7 +198,6 @@ model = VideoLlavaForConditionalGeneration.from_pretrained( ).to(0) ``` - ## VideoLlavaConfig [[autodoc]] VideoLlavaConfig @@ -212,7 +206,6 @@ model = VideoLlavaForConditionalGeneration.from_pretrained( [[autodoc]] VideoLlavaImageProcessor - ## VideoLlavaVideoProcessor [[autodoc]] VideoLlavaVideoProcessor diff --git a/docs/source/en/model_doc/videomae.md b/docs/source/en/model_doc/videomae.md index e0ebbaa42885..eb02fc48bb40 100644 --- a/docs/source/en/model_doc/videomae.md +++ b/docs/source/en/model_doc/videomae.md @@ -42,16 +42,16 @@ The original code can be found [here](https://github.com/MCG-NJU/VideoMAE). ## Using Scaled Dot Product Attention (SDPA) -PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function -encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the -[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) +PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function +encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the +[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) page for more information. 
-SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set +SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set `attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. -``` +```py from transformers import VideoMAEForVideoClassification model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics", attn_implementation="sdpa", dtype=torch.float16) ... @@ -75,6 +75,7 @@ you're interested in submitting a resource to be included here, please feel free review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. **Video classification** + - [A notebook](https://github.com/huggingface/notebooks/blob/main/examples/video_classification.ipynb) that shows how to fine-tune a VideoMAE model on a custom dataset. - [Video classification task guide](../tasks/video_classification) diff --git a/docs/source/en/model_doc/vipllava.md b/docs/source/en/model_doc/vipllava.md index 0d0a209c27a6..a6554c91b57c 100644 --- a/docs/source/en/model_doc/vipllava.md +++ b/docs/source/en/model_doc/vipllava.md @@ -37,7 +37,6 @@ The original code can be found [here](https://github.com/mu-cai/ViP-LLaVA). This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada) - ## Usage tips: - The architecture is similar than llava architecture except that the multi-modal projector takes a set of concatenated vision hidden states and has an additional layernorm layer on that module. @@ -47,11 +46,10 @@ This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada) - Note the model has not been explicitly trained to process multiple images in the same prompt, although this is technically possible, you may experience inaccurate results. > [!NOTE] -> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. +> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be failure when merging the embeddings. The attributes can be obtained from model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token or `0` if nothing extra is added to the vision patches. - - For better results, we recommend users to use the processor's `apply_chat_template()` method to format your prompt correctly. For that you need to construct a conversation history, passing in a plain string will not format your prompt. 
Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries, for "text" and "image" modalities, as follows: ```python @@ -88,16 +86,17 @@ print(text_prompt) ``` - If you want to construct a chat prompt yourself, below is a list of prompt formats accepted by VipLLaVa checkpoints: + ```bash A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: \n###Assistant: ``` For multiple turns conversation: + ```bash A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: \n###Assistant: ###Human: ###Assistant: ``` - ## VipLlavaConfig [[autodoc]] VipLlavaConfig diff --git a/docs/source/en/model_doc/visual_bert.md b/docs/source/en/model_doc/visual_bert.md index 7a7ac24e4dbf..a9912144c4f9 100644 --- a/docs/source/en/model_doc/visual_bert.md +++ b/docs/source/en/model_doc/visual_bert.md @@ -27,7 +27,6 @@ rendered properly in your Markdown viewer. You can find all the original VisualBERT checkpoints under the [UCLA NLP](https://huggingface.co/uclanlp/models?search=visualbert) organization. - > [!TIP] > This model was contributed by [gchhablani](https://huggingface.co/gchhablani). > Click on the VisualBERT models in the right sidebar for more examples of how to apply VisualBERT to different image and language tasks. diff --git a/docs/source/en/model_doc/vit_hybrid.md b/docs/source/en/model_doc/vit_hybrid.md index 86c2c7229f58..c10d1c489b76 100644 --- a/docs/source/en/model_doc/vit_hybrid.md +++ b/docs/source/en/model_doc/vit_hybrid.md @@ -55,16 +55,16 @@ found [here](https://github.com/google-research/vision_transformer). ## Using Scaled Dot Product Attention (SDPA) -PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function -encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the -[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) +PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function +encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the +[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) page for more information. -SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set +SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set `attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. -``` +```py from transformers import ViTHybridForImageClassification model = ViTHybridForImageClassification.from_pretrained("google/vit-hybrid-base-bit-384", attn_implementation="sdpa", dtype=torch.float16) ... diff --git a/docs/source/en/model_doc/vit_mae.md b/docs/source/en/model_doc/vit_mae.md index b8b9867e8812..0547594ae118 100644 --- a/docs/source/en/model_doc/vit_mae.md +++ b/docs/source/en/model_doc/vit_mae.md @@ -15,7 +15,6 @@ rendered properly in your Markdown viewer. 
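Following the role/content structure described above, a minimal sketch of building a ViP-LLaVA prompt with the processor's chat template is shown here; it assumes the `llava-hf/vip-llava-7b-hf` checkpoint and that its processor ships a chat template.

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What is shown in this image?"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
print(prompt)  # expands to the ###Human: ... ###Assistant: format listed above
```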
--> *This model was released on 2021-11-11 and added to Hugging Face Transformers on 2022-01-18.* -
PyTorch @@ -67,6 +66,7 @@ reconstruction = outputs.logits ## Notes + - ViTMAE is typically used in two stages. Self-supervised pretraining with [`ViTMAEForPreTraining`], and then discarding the decoder and fine-tuning the encoder. After fine-tuning, the weights can be plugged into a model like [`ViTForImageClassification`]. - Use [`ViTImageProcessor`] for input preparation. diff --git a/docs/source/en/model_doc/vit_msn.md b/docs/source/en/model_doc/vit_msn.md index 5b727f34256c..d7a8172a18f3 100644 --- a/docs/source/en/model_doc/vit_msn.md +++ b/docs/source/en/model_doc/vit_msn.md @@ -40,11 +40,11 @@ while producing representations of a high semantic level that perform competitiv on ImageNet-1K, with only 5,000 annotated images, our base MSN model achieves 72.4% top-1 accuracy, and with 1% of ImageNet-1K labels, we achieve 75.7% top-1 accuracy, setting a new state-of-the-art for self-supervised learning on this benchmark.* -drawing +drawing MSN architecture. Taken from the original paper. -This model was contributed by [sayakpaul](https://huggingface.co/sayakpaul). The original code can be found [here](https://github.com/facebookresearch/msn). +This model was contributed by [sayakpaul](https://huggingface.co/sayakpaul). The original code can be found [here](https://github.com/facebookresearch/msn). ## Usage tips @@ -58,16 +58,16 @@ labels when fine-tuned. ### Using Scaled Dot Product Attention (SDPA) -PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function -encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the -[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) +PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function +encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the +[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) page for more information. -SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set +SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set `attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. -``` +```py from transformers import ViTMSNForImageClassification model = ViTMSNForImageClassification.from_pretrained("facebook/vit-msn-base", attn_implementation="sdpa", dtype=torch.float16) ... diff --git a/docs/source/en/model_doc/vitdet.md b/docs/source/en/model_doc/vitdet.md index 539ae5e376c8..a1250f1bb909 100644 --- a/docs/source/en/model_doc/vitdet.md +++ b/docs/source/en/model_doc/vitdet.md @@ -40,4 +40,4 @@ Tips: ## VitDetModel [[autodoc]] VitDetModel - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/vitmatte.md b/docs/source/en/model_doc/vitmatte.md index 519a2dd74d66..0584df8e67a5 100644 --- a/docs/source/en/model_doc/vitmatte.md +++ b/docs/source/en/model_doc/vitmatte.md @@ -62,4 +62,4 @@ The model expects both the image and trimap (concatenated) as input. 
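The ViTMAE note above mentions plugging the pretrained encoder into a classifier. One hedged way to do that is to load the MAE checkpoint directly into [`ViTForImageClassification`], letting the unused decoder weights be dropped and a fresh head be initialized; the `num_labels` value is just an example.

```python
from transformers import ViTForImageClassification

# Reuse the self-supervised MAE encoder for fine-tuning; expect warnings about the newly
# initialized classification head and the ignored decoder weights.
model = ViTForImageClassification.from_pretrained("facebook/vit-mae-base", num_labels=10)
```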
Use [`ViTMa ## VitMatteForImageMatting [[autodoc]] VitMatteForImageMatting - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/vits.md b/docs/source/en/model_doc/vits.md index 2c1777b77f18..96dc93892470 100644 --- a/docs/source/en/model_doc/vits.md +++ b/docs/source/en/model_doc/vits.md @@ -149,11 +149,10 @@ Audio(waveform, rate=model.config.sampling_rate) ## VitsTokenizer [[autodoc]] VitsTokenizer -- __call__ -- save_vocabulary + - __call__ + - save_vocabulary ## VitsModel [[autodoc]] VitsModel -- forward - + - forward diff --git a/docs/source/en/model_doc/vivit.md b/docs/source/en/model_doc/vivit.md index 041f80f61ae6..fc127fa6f595 100644 --- a/docs/source/en/model_doc/vivit.md +++ b/docs/source/en/model_doc/vivit.md @@ -32,16 +32,16 @@ This model was contributed by [jegormeister](https://huggingface.co/jegormeister ### Using Scaled Dot Product Attention (SDPA) -PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function -encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the -[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) +PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function +encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the +[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) page for more information. -SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set +SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set `attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. -``` +```py from transformers import VivitModel model = VivitModel.from_pretrained("google/vivit-b-16x2-kinetics400", attn_implementation="sdpa", dtype=torch.float16) ... @@ -56,8 +56,6 @@ On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` |---------------------:|-------------:|----------:|--------------:|----------------------:|---------------------:|-----------------:| | 100 | 1 | True | 7.122 | 2575.28 | 5932.54 | 130.364 | - - ### Inference | num_batches | batch_size | is cuda | is half | Speedup (%) | Mem eager (MB) | Mem BT (MB) | Mem saved (%) | |---------------|--------------|-----------|-----------|---------------|------------------|---------------|-----------------| @@ -65,7 +63,6 @@ On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` | 20 | 2 | True | False | 17.146 | 1234.75 | 447.175 | 176.122 | | 20 | 4 | True | False | 18.093 | 2275.82 | 709.864 | 220.6 | | 20 | 8 | True | False | 19.284 | 4358.19 | 1233.24 | 253.393 | - ## VivitConfig diff --git a/docs/source/en/model_doc/vjepa2.md b/docs/source/en/model_doc/vjepa2.md index 93960f051893..049c7ff98f21 100644 --- a/docs/source/en/model_doc/vjepa2.md +++ b/docs/source/en/model_doc/vjepa2.md @@ -15,7 +15,6 @@ rendered properly in your Markdown viewer. --> *This model was released on 2025-06-11 and added to Hugging Face Transformers on 2025-06-11.* -
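To complement the ViViT SDPA discussion above, here is a small sketch of a forward pass with a dummy clip; it assumes the `google/vivit-b-16x2-kinetics400` checkpoint, which expects 32 RGB frames at 224x224 after preprocessing.

```python
import torch
from transformers import VivitForVideoClassification

model = VivitForVideoClassification.from_pretrained(
    "google/vivit-b-16x2-kinetics400", attn_implementation="sdpa"
).eval()

pixel_values = torch.randn(1, 32, 3, 224, 224)  # dummy clip standing in for real, preprocessed frames
with torch.no_grad():
    logits = model(pixel_values=pixel_values).logits
print(model.config.id2label[logits.argmax(-1).item()])
```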
PyTorch @@ -34,7 +33,6 @@ rendered properly in your Markdown viewer. You can find all original V-JEPA2 checkpoints under the [V-JEPA 2](https://huggingface.co/collections/facebook/v-jepa-2-6841bad8413014e185b497a6) collection. - This model was contributed by [koustuvs](https://huggingface.co/koustuvs), [yonigozlan](https://huggingface.co/yonigozlan) and [qubvel](https://huggingface.co/qubvel-hf). The original code can be found [here](https://github.com/facebookresearch/vjepa2). ## Usage example diff --git a/docs/source/en/model_doc/voxtral.md b/docs/source/en/model_doc/voxtral.md index 71f0661c8276..3dd2fc9e0d31 100644 --- a/docs/source/en/model_doc/voxtral.md +++ b/docs/source/en/model_doc/voxtral.md @@ -22,6 +22,7 @@ Voxtral is an upgrade of [Ministral 3B and Mistral Small 3B](https://mistral.ai/ You can read more in Mistral's [realease blog post](https://mistral.ai/news/voxtral). The model is available in two checkpoints: + - 3B: [mistralai/Voxtral-Mini-3B-2507](https://huggingface.co/mistralai/Voxtral-Mini-3B-2507) - 24B: [mistralai/Voxtral-Small-24B-2507](https://huggingface.co/mistralai/Voxtral-Small-24B-2507) @@ -43,6 +44,7 @@ Voxtral builds on Ministral-3B by adding audio processing capabilities: The model supports audio-text instructions, including multi-turn and multi-audio interactions, all processed in batches. ➡️ audio + text instruction + ```python import torch from transformers import VoxtralForConditionalGeneration, AutoProcessor, infer_device @@ -78,7 +80,8 @@ print(decoded_outputs[0]) print("=" * 80) ``` -➡️ multi-audio + text instruction +➡️ multi-audio + text instruction + ```python import torch from transformers import VoxtralForConditionalGeneration, AutoProcessor, infer_device @@ -119,6 +122,7 @@ print("=" * 80) ``` ➡️ multi-turn: + ```python import torch from transformers import VoxtralForConditionalGeneration, AutoProcessor, infer_device @@ -173,6 +177,7 @@ print("=" * 80) ``` ➡️ text only: + ```python import torch from transformers import VoxtralForConditionalGeneration, AutoProcessor, infer_device @@ -208,6 +213,7 @@ print("=" * 80) ``` ➡️ audio only: + ```python import torch from transformers import VoxtralForConditionalGeneration, AutoProcessor, infer_device @@ -243,6 +249,7 @@ print("=" * 80) ``` ➡️ batched inference! + ```python import torch from transformers import VoxtralForConditionalGeneration, AutoProcessor, infer_device() diff --git a/docs/source/en/model_doc/wav2vec2-bert.md b/docs/source/en/model_doc/wav2vec2-bert.md index 4edb67498aaa..23409b0898c3 100644 --- a/docs/source/en/model_doc/wav2vec2-bert.md +++ b/docs/source/en/model_doc/wav2vec2-bert.md @@ -31,7 +31,7 @@ The official results of the model can be found in Section 3.2.1 of the paper. The abstract from the paper is the following: -*Recent advancements in automatic speech translation have dramatically expanded language coverage, improved multimodal capabilities, and enabled a wide range of tasks and functionalities. That said, large-scale automatic speech translation systems today lack key features that help machine-mediated communication feel seamless when compared to human-to-human dialogue. In this work, we introduce a family of models that enable end-to-end expressive and multilingual translations in a streaming fashion. First, we contribute an improved version of the massively multilingual and multimodal SeamlessM4T model—SeamlessM4T v2. This newer model, incorporating an updated UnitY2 framework, was trained on more low-resource language data. 
The expanded version of SeamlessAlign adds 114,800 hours of automatically aligned data for a total of 76 languages. SeamlessM4T v2 provides the foundation on which our two newest models, SeamlessExpressive and SeamlessStreaming, are initiated. SeamlessExpressive enables translation that preserves vocal styles and prosody. Compared to previous efforts in expressive speech research, our work addresses certain underexplored aspects of prosody, such as speech rate and pauses, while also preserving the style of one’s voice. As for SeamlessStreaming, our model leverages the Efficient Monotonic Multihead Attention (EMMA) mechanism to generate low-latency target translations without waiting for complete source utterances. As the first of its kind, SeamlessStreaming enables simultaneous speech-to-speech/text translation for multiple source and target languages. To understand the performance of these models, we combined novel and modified versions of existing automatic metrics to evaluate prosody, latency, and robustness. For human evaluations, we adapted existing protocols tailored for measuring the most relevant attributes in the preservation of meaning, naturalness, and expressivity. To ensure that our models can be used safely and responsibly, we implemented the first known red-teaming effort for multimodal machine translation, a system for the detection and mitigation of added toxicity, a systematic evaluation of gender bias, and an inaudible localized watermarking mechanism designed to dampen the impact of deepfakes. Consequently, we bring major components from SeamlessExpressive and SeamlessStreaming together to form Seamless, the first publicly available system that unlocks expressive cross-lingual communication in real-time. In sum, Seamless gives us a pivotal look at the technical foundation needed to turn the Universal Speech Translator from a science fiction concept into a real-world technology. Finally, contributions in this work—including models, code, and a watermark detector—are publicly released and accessible at the link below.* +*Recent advancements in automatic speech translation have dramatically expanded language coverage, improved multimodal capabilities, and enabled a wide range of tasks and functionalities. That said, large-scale automatic speech translation systems today lack key features that help machine-mediated communication feel seamless when compared to human-to-human dialogue. In this work, we introduce a family of models that enable end-to-end expressive and multilingual translations in a streaming fashion. First, we contribute an improved version of the massively multilingual and multimodal SeamlessM4T model—SeamlessM4T v2. This newer model, incorporating an updated UnitY2 framework, was trained on more low-resource language data. The expanded version of SeamlessAlign adds 114,800 hours of automatically aligned data for a total of 76 languages. SeamlessM4T v2 provides the foundation on which our two newest models, SeamlessExpressive and SeamlessStreaming, are initiated. SeamlessExpressive enables translation that preserves vocal styles and prosody. Compared to previous efforts in expressive speech research, our work addresses certain underexplored aspects of prosody, such as speech rate and pauses, while also preserving the style of one's voice. As for SeamlessStreaming, our model leverages the Efficient Monotonic Multihead Attention (EMMA) mechanism to generate low-latency target translations without waiting for complete source utterances. 
As the first of its kind, SeamlessStreaming enables simultaneous speech-to-speech/text translation for multiple source and target languages. To understand the performance of these models, we combined novel and modified versions of existing automatic metrics to evaluate prosody, latency, and robustness. For human evaluations, we adapted existing protocols tailored for measuring the most relevant attributes in the preservation of meaning, naturalness, and expressivity. To ensure that our models can be used safely and responsibly, we implemented the first known red-teaming effort for multimodal machine translation, a system for the detection and mitigation of added toxicity, a systematic evaluation of gender bias, and an inaudible localized watermarking mechanism designed to dampen the impact of deepfakes. Consequently, we bring major components from SeamlessExpressive and SeamlessStreaming together to form Seamless, the first publicly available system that unlocks expressive cross-lingual communication in real-time. In sum, Seamless gives us a pivotal look at the technical foundation needed to turn the Universal Speech Translator from a science fiction concept into a real-world technology. Finally, contributions in this work—including models, code, and a watermark detector—are publicly released and accessible at the link below.* This model was contributed by [ylacombe](https://huggingface.co/ylacombe). The original code can be found [here](https://github.com/facebookresearch/seamless_communication). @@ -54,7 +54,6 @@ This model was contributed by [ylacombe](https://huggingface.co/ylacombe). The o - [`Wav2Vec2BertForSequenceClassification`] can be used by adapting this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/audio-classification). - See also: [Audio classification task guide](../tasks/audio_classification) - ## Wav2Vec2BertConfig [[autodoc]] Wav2Vec2BertConfig diff --git a/docs/source/en/model_doc/wav2vec2-conformer.md b/docs/source/en/model_doc/wav2vec2-conformer.md index e2a56b450df3..663b6163011b 100644 --- a/docs/source/en/model_doc/wav2vec2-conformer.md +++ b/docs/source/en/model_doc/wav2vec2-conformer.md @@ -38,7 +38,7 @@ Note: Meta (FAIR) released a new version of [Wav2Vec2-BERT 2.0](https://huggingf - Wav2Vec2-Conformer follows the same architecture as Wav2Vec2, but replaces the *Attention*-block with a *Conformer*-block as introduced in [Conformer: Convolution-augmented Transformer for Speech Recognition](https://huggingface.co/papers/2005.08100). -- For the same number of layers, Wav2Vec2-Conformer requires more parameters than Wav2Vec2, but also yields +- For the same number of layers, Wav2Vec2-Conformer requires more parameters than Wav2Vec2, but also yields an improved word error rate. - Wav2Vec2-Conformer uses the same tokenizer and feature extractor as Wav2Vec2. - Wav2Vec2-Conformer can use either no relative position embeddings, Transformer-XL-like position embeddings, or diff --git a/docs/source/en/model_doc/wav2vec2.md b/docs/source/en/model_doc/wav2vec2.md index 6c4772f90bc8..1f5f4a905767 100644 --- a/docs/source/en/model_doc/wav2vec2.md +++ b/docs/source/en/model_doc/wav2vec2.md @@ -80,13 +80,10 @@ model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-960h-lv60-self", Below is an expected speedup diagram comparing the pure inference time between the native implementation in transformers of the `facebook/wav2vec2-large-960h-lv60-self` model and the flash-attention-2 and sdpa (scale-dot-product-attention) versions. . 
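The `from_pretrained` call in the Wav2Vec2 hunk above is truncated. A plausible completion, not necessarily the exact call in the file, loads the benchmarked checkpoint with SDPA in half precision; swap in `attn_implementation="flash_attention_2"` when flash-attn is installed and a supported GPU is available.

```python
import torch
from transformers import Wav2Vec2Model

model = Wav2Vec2Model.from_pretrained(
    "facebook/wav2vec2-large-960h-lv60-self",
    dtype=torch.float16,
    attn_implementation="sdpa",
).to("cuda")
```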
We show the average speedup obtained on the `librispeech_asr` `clean` validation split: -
- - ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Wav2Vec2. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. diff --git a/docs/source/en/model_doc/wav2vec2_phoneme.md b/docs/source/en/model_doc/wav2vec2_phoneme.md index fe989def3bdd..206ea048c023 100644 --- a/docs/source/en/model_doc/wav2vec2_phoneme.md +++ b/docs/source/en/model_doc/wav2vec2_phoneme.md @@ -53,7 +53,6 @@ The original code can be found [here](https://github.com/pytorch/fairseq/tree/ma - By default, the model outputs a sequence of phonemes. In order to transform the phonemes to a sequence of words one should make use of a dictionary and language model. - Wav2Vec2Phoneme's architecture is based on the Wav2Vec2 model, for API reference, check out [`Wav2Vec2`](wav2vec2)'s documentation page @@ -64,7 +63,7 @@ except for the tokenizer. ## Wav2Vec2PhonemeCTCTokenizer [[autodoc]] Wav2Vec2PhonemeCTCTokenizer - - __call__ - - batch_decode - - decode - - phonemize + - __call__ + - batch_decode + - decode + - phonemize diff --git a/docs/source/en/model_doc/whisper.md b/docs/source/en/model_doc/whisper.md index 673085ac3e7d..5e19e870bddc 100644 --- a/docs/source/en/model_doc/whisper.md +++ b/docs/source/en/model_doc/whisper.md @@ -15,7 +15,6 @@ rendered properly in your Markdown viewer. --> *This model was released on 2022-12-06 and added to Hugging Face Transformers on 2022-10-05.* -
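Since the tip above notes that Wav2Vec2Phoneme emits phonemes rather than words, a hedged transcription sketch follows; it assumes the `facebook/wav2vec2-lv-60-espeak-cv-ft` checkpoint and the usual CTC argmax decoding.

```python
import torch
from datasets import load_dataset
from transformers import AutoFeatureExtractor, AutoTokenizer, Wav2Vec2ForCTC

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
inputs = feature_extractor(ds[0]["audio"]["array"], sampling_rate=16_000, return_tensors="pt")

with torch.no_grad():
    predicted_ids = model(**inputs).logits.argmax(dim=-1)
print(tokenizer.batch_decode(predicted_ids))  # phonemes; map them to words with a lexicon and language model
```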
PyTorch diff --git a/docs/source/en/model_doc/xcodec.md b/docs/source/en/model_doc/xcodec.md index c4a0b92a26f6..957a74093484 100644 --- a/docs/source/en/model_doc/xcodec.md +++ b/docs/source/en/model_doc/xcodec.md @@ -33,9 +33,10 @@ The X-Codec model is a neural audio codec that integrates semantic information f The abstract of the paper states the following: -*Recent advancements in audio generation have been significantly propelled by the capabilities of Large Language Models (LLMs). The existing research on audio LLM has primarily focused on enhancing the architecture and scale of audio language models, as well as leveraging larger datasets, and generally, acoustic codecs, such as EnCodec, are used for audio tokenization. However, these codecs were originally designed for audio compression, which may lead to suboptimal performance in the context of audio LLM. Our research aims to address the shortcomings of current audio LLM codecs, particularly their challenges in maintaining semantic integrity in generated audio. For instance, existing methods like VALL-E, which condition acoustic token generation on text transcriptions, often suffer from content inaccuracies and elevated word error rates (WER) due to semantic misinterpretations of acoustic tokens, resulting in word skipping and errors. To overcome these issues, we propose a straightforward yet effective approach called X-Codec. X-Codec incorporates semantic features from a pre-trained semantic encoder before the Residual Vector Quantization (RVQ) stage and introduces a semantic reconstruction loss after RVQ. By enhancing the semantic ability of the codec, X-Codec significantly reduces WER in speech synthesis tasks and extends these benefits to non-speech applications, including music and sound generation. Our experiments in text-to-speech, music continuation, and text-to-sound tasks demonstrate that integrating semantic information substantially improves the overall performance of language models in audio generation.* +*Recent advancements in audio generation have been significantly propelled by the capabilities of Large Language Models (LLMs). The existing research on audio LLM has primarily focused on enhancing the architecture and scale of audio language models, as well as leveraging larger datasets, and generally, acoustic codecs, such as EnCodec, are used for audio tokenization. However, these codecs were originally designed for audio compression, which may lead to suboptimal performance in the context of audio LLM. Our research aims to address the shortcomings of current audio LLM codecs, particularly their challenges in maintaining semantic integrity in generated audio. For instance, existing methods like VALL-E, which condition acoustic token generation on text transcriptions, often suffer from content inaccuracies and elevated word error rates (WER) due to semantic misinterpretations of acoustic tokens, resulting in word skipping and errors. To overcome these issues, we propose a straightforward yet effective approach called X-Codec. X-Codec incorporates semantic features from a pre-trained semantic encoder before the Residual Vector Quantization (RVQ) stage and introduces a semantic reconstruction loss after RVQ. By enhancing the semantic ability of the codec, X-Codec significantly reduces WER in speech synthesis tasks and extends these benefits to non-speech applications, including music and sound generation. 
Our experiments in text-to-speech, music continuation, and text-to-sound tasks demonstrate that integrating semantic information substantially improves the overall performance of language models in audio generation.* Model cards: + - [xcodec-hubert-librispeech](https://huggingface.co/hf-audio/xcodec-hubert-librispeech) (for speech) - [xcodec-wavlm-mls](https://huggingface.co/hf-audio/xcodec-wavlm-mls) (for speech) - [xcodec-wavlm-more-data](https://huggingface.co/hf-audio/xcodec-wavlm-more-data) (for speech) @@ -46,12 +47,11 @@ This model was contributed by [Manal El Aidouni](https://huggingface.co/Manel). Demos can be found on this [page](https://x-codec-audio.github.io/). - -## Usage example +## Usage example Here is a quick example of how to encode and decode an audio using this model: -```python +```python from datasets import load_dataset, Audio from transformers import XcodecModel, AutoFeatureExtractor dummy_dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") @@ -75,6 +75,7 @@ audio_values = decoder_outputs.audio_values audio_values = model(inputs["input_values"]).audio_values ``` + To listen to the original and reconstructed audio, run the snippet below and then open the generated `original.wav` and `reconstruction.wav` files in your music player to compare. ```python @@ -88,15 +89,13 @@ sf.write("original.wav", original, sampling_rate) sf.write("reconstruction.wav", reconstruction.T, sampling_rate) ``` - ## XcodecConfig [[autodoc]] XcodecConfig - ## XcodecModel [[autodoc]] XcodecModel - decode - encode - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/xglm.md b/docs/source/en/model_doc/xglm.md index d82bba7d23f9..9372b52af1f7 100644 --- a/docs/source/en/model_doc/xglm.md +++ b/docs/source/en/model_doc/xglm.md @@ -44,7 +44,6 @@ showing in particular that it enables cross-lingual in-context learning on some on surface form robustness and adaptation to tasks that do not have a natural cloze form. Finally, we evaluate our models in social value tasks such as hate speech detection in five languages and find it has limitations similar to comparable sized GPT-3 models.* - This model was contributed by [Suraj](https://huggingface.co/valhalla). The original code can be found [here](https://github.com/pytorch/fairseq/tree/main/examples/xglm). ## Resources diff --git a/docs/source/en/model_doc/xlm-prophetnet.md b/docs/source/en/model_doc/xlm-prophetnet.md index 4dad4c0afa78..fbf47d8c422a 100644 --- a/docs/source/en/model_doc/xlm-prophetnet.md +++ b/docs/source/en/model_doc/xlm-prophetnet.md @@ -41,7 +41,6 @@ You can do so by running the following command: `pip install -U transformers==4. 
**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) and assign @patrickvonplaten - ## Overview The XLM-ProphetNet model was proposed in [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training,](https://huggingface.co/papers/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei diff --git a/docs/source/en/model_doc/xlm-roberta-xl.md b/docs/source/en/model_doc/xlm-roberta-xl.md index 988107fdacc6..97dc6f1a7445 100644 --- a/docs/source/en/model_doc/xlm-roberta-xl.md +++ b/docs/source/en/model_doc/xlm-roberta-xl.md @@ -77,6 +77,7 @@ predicted_token = tokenizer.decode(predicted_token_id) print(f"The predicted token is: {predicted_token}") ``` + @@ -84,6 +85,7 @@ print(f"The predicted token is: {predicted_token}") ```bash echo -e "Plants create through a process known as photosynthesis." | transformers-cli run --task fill-mask --model facebook/xlm-roberta-xl --device 0 ``` + diff --git a/docs/source/en/model_doc/xlm-roberta.md b/docs/source/en/model_doc/xlm-roberta.md index a662742c2674..3a4b8e682603 100644 --- a/docs/source/en/model_doc/xlm-roberta.md +++ b/docs/source/en/model_doc/xlm-roberta.md @@ -87,6 +87,7 @@ print(f"The predicted token is: {predicted_token}") ```bash echo -e "Plants create through a process known as photosynthesis." | transformers-cli run --task fill-mask --model FacebookAI/xlm-roberta-base --device 0 ``` + diff --git a/docs/source/en/model_doc/xlm.md b/docs/source/en/model_doc/xlm.md index dc51fa4be4cd..11c00f4ec8ed 100644 --- a/docs/source/en/model_doc/xlm.md +++ b/docs/source/en/model_doc/xlm.md @@ -79,6 +79,7 @@ print(f"Predicted token: {predicted_token}") ```bash echo -e "Plants create through a process known as photosynthesis." | transformers-cli run --task fill-mask --model FacebookAI/xlm-mlm-en-2048 --device 0 ``` + diff --git a/docs/source/en/model_doc/xlstm.md b/docs/source/en/model_doc/xlstm.md index b239d631fbbc..e1ba3195eccf 100644 --- a/docs/source/en/model_doc/xlstm.md +++ b/docs/source/en/model_doc/xlstm.md @@ -15,7 +15,6 @@ rendered properly in your Markdown viewer. --> *This model was released on 2024-05-07 and added to Hugging Face Transformers on 2025-07-25.* - # xLSTM ## Overview @@ -32,7 +31,6 @@ The abstract from the paper is the following: This model was contributed by [NX-AI](https://huggingface.co/NX-AI). The original code can be found [here](https://github.com/NX-AI/xlstm). - ## xLSTMConfig [[autodoc]] xLSTMConfig diff --git a/docs/source/en/model_doc/xmod.md b/docs/source/en/model_doc/xmod.md index 0593e9940bd6..624b7ebb2d23 100644 --- a/docs/source/en/model_doc/xmod.md +++ b/docs/source/en/model_doc/xmod.md @@ -36,6 +36,7 @@ The original code can be found [here](https://github.com/facebookresearch/fairse ## Usage tips Tips: + - X-MOD is similar to [XLM-R](xlm-roberta), but a difference is that the input language needs to be specified so that the correct language adapter can be activated. - The main models – base and large – have adapters for 81 languages. @@ -44,6 +45,7 @@ Tips: ### Input language There are two ways to specify the input language: + 1. 
By setting a default language before using the model: ```python diff --git a/docs/source/en/model_doc/yolos.md b/docs/source/en/model_doc/yolos.md index 5c31b539e59c..4a75b2ed020f 100644 --- a/docs/source/en/model_doc/yolos.md +++ b/docs/source/en/model_doc/yolos.md @@ -26,14 +26,12 @@ rendered properly in your Markdown viewer. [YOLOS](https://huggingface.co/papers/2106.00666) uses a [Vision Transformer (ViT)](./vit) for object detection with minimal modifications and region priors. It can achieve performance comparable to specialized object detection models and frameworks with knowledge about 2D spatial structures. - You can find all the original YOLOS checkpoints under the [HUST Vision Lab](https://huggingface.co/hustvl/models?search=yolos) organization. drawing YOLOS architecture. Taken from the original paper. - > [!TIP] > This model was contributed by [nielsr](https://huggingface.co/nielsr). > Click on the YOLOS models in the right sidebar for more examples of how to apply YOLOS to different object detection tasks. @@ -98,8 +96,8 @@ for score, label, box in zip(filtered_scores, filtered_labels, pixel_boxes): - ## Notes + - Use [`YolosImageProcessor`] for preparing images (and optional targets) for the model. Contrary to [DETR](./detr), YOLOS doesn't require a `pixel_mask`. ## Resources diff --git a/docs/source/en/model_doc/yoso.md b/docs/source/en/model_doc/yoso.md index f07e5aba0827..211b0dcf8091 100644 --- a/docs/source/en/model_doc/yoso.md +++ b/docs/source/en/model_doc/yoso.md @@ -26,20 +26,20 @@ rendered properly in your Markdown viewer. The YOSO model was proposed in [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://huggingface.co/papers/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh. YOSO approximates standard softmax self-attention via a Bernoulli sampling scheme based on Locality Sensitive Hashing (LSH). In principle, all the Bernoulli random variables can be sampled with -a single hash. +a single hash. The abstract from the paper is the following: -*Transformer-based models are widely used in natural language processing (NLP).
Central to the transformer model is +the self-attention mechanism, which captures the interactions of token pairs in the input sequences and depends quadratically +on the sequence length. Training such models on longer sequences is expensive. In this paper, we show that a Bernoulli sampling +attention mechanism based on Locality Sensitive Hashing (LSH), decreases the quadratic complexity of such models to linear. +We bypass the quadratic cost by considering self-attention as a sum of individual tokens associated with Bernoulli random +variables that can, in principle, be sampled at once by a single hash (although in practice, this number may be a small constant). +This leads to an efficient sampling scheme to estimate self-attention which relies on specific modifications of +LSH (to enable deployment on GPU architectures). We evaluate our algorithm on the GLUE benchmark with standard 512 sequence +length where we see favorable performance relative to a standard pretrained Transformer. On the Long Range Arena (LRA) benchmark, +for evaluating performance on long sequences, our method achieves results consistent with softmax self-attention but with sizable speed-ups and memory savings and often outperforms other efficient self-attention methods. Our code is available at this https URL* This model was contributed by [novice03](https://huggingface.co/novice03). The original code can be found [here](https://github.com/mlpen/YOSO). @@ -50,12 +50,12 @@ This model was contributed by [novice03](https://huggingface.co/novice03). The o in parallel on a GPU. - The kernels provide a `fast_hash` function, which approximates the random projections of the queries and keys using the Fast Hadamard Transform. Using these hash codes, the `lsh_cumulation` function approximates self-attention via LSH-based Bernoulli sampling. -- To use the custom kernels, the user should set `config.use_expectation = False`. To ensure that the kernels are compiled successfully, -the user must install the correct version of PyTorch and cudatoolkit. By default, `config.use_expectation = True`, which uses YOSO-E and +- To use the custom kernels, the user should set `config.use_expectation = False`. To ensure that the kernels are compiled successfully, +the user must install the correct version of PyTorch and cudatoolkit. By default, `config.use_expectation = True`, which uses YOSO-E and does not require compiling CUDA kernels. +alt="drawing" width="600"/> YOSO Attention Algorithm. Taken from the original paper. @@ -99,4 +99,4 @@ alt="drawing" width="600"/> ## YosoForQuestionAnswering [[autodoc]] YosoForQuestionAnswering - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/zamba.md b/docs/source/en/model_doc/zamba.md index bb9740807703..847f0532e2a7 100644 --- a/docs/source/en/model_doc/zamba.md +++ b/docs/source/en/model_doc/zamba.md @@ -24,7 +24,6 @@ rendered properly in your Markdown viewer. This model was contributed by [pglo](https://huggingface.co/pglo). - ## Model details Zamba-7B-v1 is a hybrid between state-space models (Specifically [Mamba](https://github.com/state-spaces/mamba)) and transformer, and was trained using next-token prediction. Zamba uses a shared transformer layer after every 6 mamba blocks. It uses the [Mistral v0.1 tokenizer](https://huggingface.co/mistralai/Mistral-7B-v0.1). We came to this architecture after a series of ablations at small scales. Zamba-7B-v1 was pre-trained on 1T tokens of text and code data. 
@@ -33,23 +32,24 @@ Zamba-7B-v1 is a hybrid between state-space models (Specifically [Mamba](https:/ ## Quick start - ### Presequities Zamba requires you use `transformers` version 4.46.0 or higher: + ```bash pip install transformers>=4.45.0 ``` In order to run optimized Mamba implementations, you first need to install `mamba-ssm` and `causal-conv1d`: + ```bash pip install mamba-ssm causal-conv1d>=1.2.0 ``` + You also have to have the model on a CUDA device. You can run the model not using the optimized Mamba kernels, but it is **not** recommended as it will result in significantly lower latencies. In order to do that, you'll need to specify `use_mamba_kernels=False` when loading the model. - ## Inference ```python @@ -66,39 +66,33 @@ outputs = model.generate(**input_ids, max_new_tokens=100) print(tokenizer.decode(outputs[0])) ``` - ## Model card The model cards can be found at: -* [Zamba-7B](https://huggingface.co/Zyphra/Zamba-7B-v1) +* [Zamba-7B](https://huggingface.co/Zyphra/Zamba-7B-v1) ## Issues For issues with model output, or community discussion, please use the Hugging Face community [forum](https://huggingface.co/Zyphra/Zamba-7B-v1/discussions) - ## License The model weights are open-sourced via an Apache 2.0 license. - ## ZambaConfig [[autodoc]] ZambaConfig - ## ZambaModel [[autodoc]] ZambaModel - forward - ## ZambaForCausalLM [[autodoc]] ZambaForCausalLM - forward - ## ZambaForSequenceClassification [[autodoc]] transformers.ZambaForSequenceClassification diff --git a/docs/source/en/model_doc/zamba2.md b/docs/source/en/model_doc/zamba2.md index 1d911a59c277..c9d3d3d1de75 100644 --- a/docs/source/en/model_doc/zamba2.md +++ b/docs/source/en/model_doc/zamba2.md @@ -26,19 +26,18 @@ rendered properly in your Markdown viewer. This model was contributed by [pglo](https://huggingface.co/pglo). - ## Model details -[Zamba2-1.2B](https://www.zyphra.com/post/zamba2-mini), [Zamba2-2.7B](https://www.zyphra.com/post/zamba2-small) and [Zamba2-7B](https://www.zyphra.com/post/zamba2-7b) are hybrid models combining state-space models (Specifically [Mamba](https://github.com/state-spaces/mamba)) and transformer, and were trained using next-token prediction. Zamba2 uses shared transformer layers after every 6 mamba blocks. It uses the [Mistral v0.1 tokenizer](https://huggingface.co/mistralai/Mistral-7B-v0.1). We came to this architecture after a series of ablations at small scales. Zamba2-1.2B, Zamba2-2.7B and Zamba2-7B were pre-trained on 2T and 3T tokens, respectively. +[Zamba2-1.2B](https://www.zyphra.com/post/zamba2-mini), [Zamba2-2.7B](https://www.zyphra.com/post/zamba2-small) and [Zamba2-7B](https://www.zyphra.com/post/zamba2-7b) are hybrid models combining state-space models (Specifically [Mamba2](https://github.com/state-spaces/mamba)) and transformer, and were trained using next-token prediction. Zamba2 uses shared transformer layers after every 6 mamba blocks. It uses the [Mistral v0.1 tokenizer](https://huggingface.co/mistralai/Mistral-7B-v0.1). We came to this architecture after a series of ablations at small scales. Zamba2-1.2B, Zamba2-2.7B and Zamba2-7B were pre-trained on 2T and 3T tokens, respectively. 
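The Zamba2 hunk that follows only keeps the `generate` and `decode` context lines, so here is a hedged sketch of the surrounding quick-start code; the `Zyphra/Zamba2-1.2B` checkpoint and the bfloat16/`device_map` choices are assumptions, and the larger variants load the same way.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba2-1.2B")
model = AutoModelForCausalLM.from_pretrained("Zyphra/Zamba2-1.2B", device_map="cuda", dtype=torch.bfloat16)

input_ids = tokenizer("What factors contributed to the fall of the Roman Empire?", return_tensors="pt").to("cuda")
outputs = model.generate(**input_ids, max_new_tokens=100)
print(tokenizer.decode(outputs[0]))
```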
## Quick start - ### Presequities Zamba2 requires you use `transformers` version 4.48.0 or higher: + ```bash pip install transformers>=4.48.0 ``` @@ -59,41 +58,35 @@ outputs = model.generate(**input_ids, max_new_tokens=100) print(tokenizer.decode(outputs[0])) ``` - ## Model card The model cards can be found at: + * [Zamba2-1.2B](https://huggingface.co/Zyphra/Zamba2-1.2B) * [Zamba2-2.7B](https://huggingface.co/Zyphra/Zamba2-2.7B) * [Zamba2-7B](https://huggingface.co/Zyphra/Zamba2-7B) - ## Issues For issues with model output, or community discussion, please use the Hugging Face community [forum](https://huggingface.co/Zyphra/Zamba2-7B/discussions) - ## License The model weights are open-sourced via an Apache 2.0 license. - ## Zamba2Config [[autodoc]] Zamba2Config - ## Zamba2Model [[autodoc]] Zamba2Model - forward - ## Zamba2ForCausalLM [[autodoc]] Zamba2ForCausalLM - forward - ## Zamba2ForSequenceClassification [[autodoc]] transformers.Zamba2ForSequenceClassification diff --git a/docs/source/en/model_doc/zoedepth.md b/docs/source/en/model_doc/zoedepth.md index 367c630a3224..92840a770462 100644 --- a/docs/source/en/model_doc/zoedepth.md +++ b/docs/source/en/model_doc/zoedepth.md @@ -15,7 +15,6 @@ rendered properly in your Markdown viewer. --> *This model was released on 2023-02-23 and added to Hugging Face Transformers on 2024-07-08.* -
PyTorch @@ -97,6 +96,7 @@ Image.fromarray(depth.astype("uint8")) ## Notes - In the [original implementation](https://github.com/isl-org/ZoeDepth/blob/edb6daf45458569e24f50250ef1ed08c015f17a7/zoedepth/models/depth_model.py#L131) ZoeDepth performs inference on both the original and flipped images and averages the results. The `post_process_depth_estimation` function handles this by passing the flipped outputs to the optional `outputs_flipped` argument as shown below. + ```py with torch.no_grad(): outputs = model(pixel_values) @@ -107,8 +107,9 @@ Image.fromarray(depth.astype("uint8")) outputs_flipped=outputs_flipped, ) ``` - + ## Resources + - Refer to this [notebook](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/ZoeDepth) for an inference example. ## ZoeDepthConfig diff --git a/docs/source/en/model_memory_anatomy.md b/docs/source/en/model_memory_anatomy.md index 7ef53f40566e..f0a215b05c1b 100644 --- a/docs/source/en/model_memory_anatomy.md +++ b/docs/source/en/model_memory_anatomy.md @@ -16,24 +16,23 @@ limitations under the License. # Model training anatomy -To understand performance optimization techniques that one can apply to improve efficiency of model training -speed and memory utilization, it's helpful to get familiar with how GPU is utilized during training, and how compute +To understand performance optimization techniques that one can apply to improve efficiency of model training +speed and memory utilization, it's helpful to get familiar with how GPU is utilized during training, and how compute intensity varies depending on an operation performed. -Let's start by exploring a motivating example of GPU utilization and the training run of a model. For the demonstration, -we'll need to install a few libraries: +Let's start by exploring a motivating example of GPU utilization and the training run of a model. For the demonstration, +we'll need to install a few libraries: ```bash -pip install transformers datasets accelerate nvidia-ml-py3 +pip install transformers datasets accelerate nvidia-ml-py ``` -The `nvidia-ml-py3` library allows us to monitor the memory usage of the models from within Python. You might be familiar +The `nvidia-ml-py` library allows us to monitor the memory usage of the models from within Python. You might be familiar with the `nvidia-smi` command in the terminal - this library allows to access the same information in Python directly. -Then, we create some dummy data: random token IDs between 100 and 30000 and binary labels for a classifier. +Then, we create some dummy data: random token IDs between 100 and 30000 and binary labels for a classifier. In total, we get 512 sequences each with length 512 and store them in a [`~datasets.Dataset`] with PyTorch format. - ```py >>> import numpy as np >>> from datasets import Dataset @@ -74,9 +73,9 @@ Let's verify that we start with a free GPU memory: GPU memory occupied: 0 MB. ``` -That looks good: the GPU memory is not occupied as we would expect before we load any models. If that's not the case on -your machine make sure to stop all processes that are using GPU memory. However, not all free GPU memory can be used by -the user. When a model is loaded to the GPU the kernels are also loaded, which can take up 1-2GB of memory. To see how +That looks good: the GPU memory is not occupied as we would expect before we load any models. If that's not the case on +your machine make sure to stop all processes that are using GPU memory. However, not all free GPU memory can be used by +the user. 
When a model is loaded to the GPU the kernels are also loaded, which can take up 1-2GB of memory. To see how much it is we load a tiny tensor into the GPU which triggers the kernels to be loaded as well. ```py @@ -92,10 +91,9 @@ We see that the kernels alone take up 1.3GB of GPU memory. Now let's see how muc ## Load Model -First, we load the `google-bert/bert-large-uncased` model. We load the model weights directly to the GPU so that we can check +First, we load the `google-bert/bert-large-uncased` model. We load the model weights directly to the GPU so that we can check how much space just the weights use. - ```py >>> from transformers import AutoModelForSequenceClassification @@ -105,17 +103,16 @@ how much space just the weights use. GPU memory occupied: 2631 MB. ``` -We can see that the model weights alone take up 1.3 GB of GPU memory. The exact number depends on the specific -GPU you are using. Note that on newer GPUs a model can sometimes take up more space since the weights are loaded in an -optimized fashion that speeds up the usage of the model. Now we can also quickly check if we get the same result +We can see that the model weights alone take up 1.3 GB of GPU memory. The exact number depends on the specific +GPU you are using. Note that on newer GPUs a model can sometimes take up more space since the weights are loaded in an +optimized fashion that speeds up the usage of the model. Now we can also quickly check if we get the same result as with `nvidia-smi` CLI: - ```bash nvidia-smi ``` -```bash +```text Tue Jan 11 08:58:05 2022 +-----------------------------------------------------------------------------+ | NVIDIA-SMI 460.91.03 Driver Version: 460.91.03 CUDA Version: 11.2 | @@ -138,8 +135,8 @@ Tue Jan 11 08:58:05 2022 +-----------------------------------------------------------------------------+ ``` -We get the same number as before and you can also see that we are using a V100 GPU with 16GB of memory. So now we can -start training the model and see how the GPU memory consumption changes. First, we set up a few standard training +We get the same number as before and you can also see that we are using a V100 GPU with 16GB of memory. So now we can +start training the model and see how the GPU memory consumption changes. First, we set up a few standard training arguments: ```py @@ -154,7 +151,7 @@ default_args = { - If you plan to run multiple experiments, in order to properly clear the memory between experiments, restart the Python + If you plan to run multiple experiments, in order to properly clear the memory between experiments, restart the Python kernel between experiments. @@ -175,15 +172,15 @@ Let's use the [`Trainer`] and train the model without using any GPU performance >>> print_summary(result) ``` -``` +```text Time: 57.82 Samples/second: 8.86 GPU memory occupied: 14949 MB. ``` -We see that already a relatively small batch size almost fills up our GPU's entire memory. However, a larger batch size +We see that already a relatively small batch size almost fills up our GPU's entire memory. However, a larger batch size can often result in faster model convergence or better end performance. So ideally we want to tune the batch size to our -model's needs and not to the GPU limitations. What's interesting is that we use much more memory than the size of the model. +model's needs and not to the GPU limitations. What's interesting is that we use much more memory than the size of the model. 
To understand a bit better why this is the case let's have a look at a model's operations and memory needs. ## Anatomy of Model's Operations @@ -206,10 +203,9 @@ This knowledge can be helpful to know when analyzing performance bottlenecks. This summary is derived from [Data Movement Is All You Need: A Case Study on Optimizing Transformers 2020](https://huggingface.co/papers/2007.00072) - ## Anatomy of Model's Memory -We've seen that training the model uses much more memory than just putting the model on the GPU. This is because there +We've seen that training the model uses much more memory than just putting the model on the GPU. This is because there are many components during training that use GPU memory. The components on GPU memory are the following: 1. model weights @@ -219,8 +215,8 @@ are many components during training that use GPU memory. The components on GPU m 5. temporary buffers 6. functionality-specific memory -A typical model trained in mixed precision with AdamW requires 18 bytes per model parameter plus activation memory. For -inference there are no optimizer states and gradients, so we can subtract those. And thus we end up with 6 bytes per +A typical model trained in mixed precision with AdamW requires 18 bytes per model parameter plus activation memory. For +inference there are no optimizer states and gradients, so we can subtract those. And thus we end up with 6 bytes per model parameter for mixed precision inference, plus activation memory. Let's look at the details. @@ -244,29 +240,29 @@ Let's look at the details. - size depends on many factors, the key ones being sequence length, hidden size and batch size. -There are the input and output that are being passed and returned by the forward and the backward functions and the +There are the input and output that are being passed and returned by the forward and the backward functions and the forward activations saved for gradient computation. **Temporary Memory** -Additionally, there are all kinds of temporary variables which get released once the calculation is done, but in the -moment these could require additional memory and could push to OOM. Therefore, when coding it's crucial to think +Additionally, there are all kinds of temporary variables which get released once the calculation is done, but in the +moment these could require additional memory and could push to OOM. Therefore, when coding it's crucial to think strategically about such temporary variables and sometimes to explicitly free those as soon as they are no longer needed. **Functionality-specific memory** -Then, your software could have special memory needs. For example, when generating text using beam search, the software +Then, your software could have special memory needs. For example, when generating text using beam search, the software needs to maintain multiple copies of inputs and outputs. **`forward` vs `backward` Execution Speed** -For convolutions and linear layers there are 2x flops in the backward compared to the forward, which generally translates -into ~2x slower (sometimes more, because sizes in the backward tend to be more awkward). Activations are usually -bandwidth-limited, and it’s typical for an activation to have to read more data in the backward than in the forward -(e.g. 
activation forward reads once, writes once, activation backward reads twice, gradOutput and output of the forward, +For convolutions and linear layers there are 2x flops in the backward compared to the forward, which generally translates +into ~2x slower (sometimes more, because sizes in the backward tend to be more awkward). Activations are usually +bandwidth-limited, and it's typical for an activation to have to read more data in the backward than in the forward +(e.g. activation forward reads once, writes once, activation backward reads twice, gradOutput and output of the forward, and writes once, gradInput). -As you can see, there are potentially a few places where we could save GPU memory or speed up operations. -Now that you understand what affects GPU utilization and computation speed, refer to -the [Methods and tools for efficient training on a single GPU](perf_train_gpu_one) documentation page to learn about -performance optimization techniques. +As you can see, there are potentially a few places where we could save GPU memory or speed up operations. +Now that you understand what affects GPU utilization and computation speed, refer to +the [Methods and tools for efficient training on a single GPU](perf_train_gpu_one) documentation page to learn about +performance optimization techniques. diff --git a/docs/source/en/models.md b/docs/source/en/models.md index fdfcfba6585a..ae5572c0c77a 100644 --- a/docs/source/en/models.md +++ b/docs/source/en/models.md @@ -45,7 +45,6 @@ There are two general types of models you can load: 1. A barebones model, like [`AutoModel`] or [`LlamaModel`], that outputs hidden states. 2. A model with a specific *head* attached, like [`AutoModelForCausalLM`] or [`LlamaForCausalLM`], for performing specific tasks. - ## Model classes To get a pretrained model, you need to load the weights into the model. This is done by calling [`~PreTrainedModel.from_pretrained`] which accepts weights from the Hugging Face Hub or a local directory. @@ -111,7 +110,6 @@ You need enough memory to hold two copies of the model weights (random and pretr Transformers reduces some of these memory-related challenges with fast initialization, sharded checkpoints, Accelerate's [Big Model Inference](https://hf.co/docs/accelerate/usage_guides/big_modeling) feature, and supporting lower bit data types. - ### Sharded checkpoints The [`~PreTrainedModel.save_pretrained`] method automatically shards checkpoints larger than 10GB. diff --git a/docs/source/en/models_timeline.md b/docs/source/en/models_timeline.md new file mode 100644 index 000000000000..61514d08ea47 --- /dev/null +++ b/docs/source/en/models_timeline.md @@ -0,0 +1,28 @@ + + +# Models Timeline + +The [Models Timeline](https://huggingface.co/spaces/yonigozlan/Transformers-Timeline) is an interactive chart of how architectures in Transformers have changed over time. You can scroll through models in order, spanning text, vision, audio, video, and multimodal use cases. + +Use the filters to narrow models by modality or task. Set custom date ranges to focus on models added during specific periods. Click a model card to see its capabilities, supported tasks, and documentation. + + diff --git a/docs/source/en/modular_transformers.md b/docs/source/en/modular_transformers.md index 39d29f8a6cd4..17001cc81ee9 100644 --- a/docs/source/en/modular_transformers.md +++ b/docs/source/en/modular_transformers.md @@ -82,7 +82,7 @@ class RobertaForMaskedLM(BertForMaskedLM): If you don't use the defined dependency, you'll receive the following error. 
-``` +```text ValueError: You defined `RobertaEmbeddings` in the modular_roberta.py, it should be used when you define `BertModel`, as it is one of it's direct dependencies. Make sure you use it in the `__init__` function. ``` diff --git a/docs/source/en/open_webui.md b/docs/source/en/open_webui.md index 9042131631e7..2946fc95f145 100644 --- a/docs/source/en/open_webui.md +++ b/docs/source/en/open_webui.md @@ -9,6 +9,7 @@ transformers serve --enable-cors ``` Before you can speak into Open WebUI, you need to update its settings to use your server for speech to text (STT) tasks. Launch Open WebUI, and navigate to the audio tab inside the admin settings. If you're using Open WebUI with the default ports, [this link (default)](http://localhost:3000/admin/settings/audio) or [this link (python deployment)](http://localhost:8080/admin/settings/audio) will take you there. Do the following changes there: + 1. Change the type of "Speech-to-Text Engine" to "OpenAI"; 2. Update the address to your server's address -- `http://localhost:8000/v1` by default; 3. Type your model of choice into the "STT Model" field, e.g. `openai/whisper-large-v3` ([available models](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=trending)). diff --git a/docs/source/en/pad_truncation.md b/docs/source/en/pad_truncation.md index 345f86283d12..45b2509e86de 100644 --- a/docs/source/en/pad_truncation.md +++ b/docs/source/en/pad_truncation.md @@ -22,25 +22,25 @@ In most cases, padding your batch to the length of the longest sequence and trun The `padding` argument controls padding. It can be a boolean or a string: - - `True` or `'longest'`: pad to the longest sequence in the batch (no padding is applied if you only provide +- `True` or `'longest'`: pad to the longest sequence in the batch (no padding is applied if you only provide a single sequence). - - `'max_length'`: pad to a length specified by the `max_length` argument or the maximum length accepted +- `'max_length'`: pad to a length specified by the `max_length` argument or the maximum length accepted by the model if no `max_length` is provided (`max_length=None`). Padding will still be applied if you only provide a single sequence. - - `False` or `'do_not_pad'`: no padding is applied. This is the default behavior. +- `False` or `'do_not_pad'`: no padding is applied. This is the default behavior. The `truncation` argument controls truncation. It can be a boolean or a string: - - `True` or `'longest_first'`: truncate to a maximum length specified by the `max_length` argument or +- `True` or `'longest_first'`: truncate to a maximum length specified by the `max_length` argument or the maximum length accepted by the model if no `max_length` is provided (`max_length=None`). This will truncate token by token, removing a token from the longest sequence in the pair until the proper length is reached. - - `'only_second'`: truncate to a maximum length specified by the `max_length` argument or the maximum +- `'only_second'`: truncate to a maximum length specified by the `max_length` argument or the maximum length accepted by the model if no `max_length` is provided (`max_length=None`). This will only truncate the second sentence of a pair if a pair of sequences (or a batch of pairs of sequences) is provided. 
- - `'only_first'`: truncate to a maximum length specified by the `max_length` argument or the maximum +- `'only_first'`: truncate to a maximum length specified by the `max_length` argument or the maximum length accepted by the model if no `max_length` is provided (`max_length=None`). This will only truncate the first sentence of a pair if a pair of sequences (or a batch of pairs of sequences) is provided. - - `False` or `'do_not_truncate'`: no truncation is applied. This is the default behavior. +- `False` or `'do_not_truncate'`: no truncation is applied. This is the default behavior. The `max_length` argument controls the length of the padding and truncation. It can be an integer or `None`, in which case it will default to the maximum length the model can accept. If the model has no specific maximum input length, truncation or padding to `max_length` is deactivated. diff --git a/docs/source/en/perf_infer_gpu_multi.md b/docs/source/en/perf_infer_gpu_multi.md index 01823dd5b200..21d1817e302b 100644 --- a/docs/source/en/perf_infer_gpu_multi.md +++ b/docs/source/en/perf_infer_gpu_multi.md @@ -45,13 +45,7 @@ This guide shows how to enable tensor parallelism with Transformers and differen ## Partitioning a model -Transformers supports tensor parallelism if a model has a `tp_plan`. There are two plans to partition a model. - -- The `auto` tensor parallelism plan partitions a model (see the supported models above) based on a predefined configuration. -- You can also manually specify your own partitioning plan and pass it to the `tp_plan` parameter in [`~PreTrainedModel.from_pretrained`]. - - - +Transformers supports tensor parallelism if a model has a `tp_plan`. Set `tp_plan="auto"` to automatically use a tensor parallelism plan based on a model's predefined configuration. ```py import os @@ -78,32 +72,6 @@ Launch the inference script above on [torchrun](https://pytorch.org/docs/stable/ torchrun --nproc-per-node 4 demo.py ``` - - - -Define a tensor parallel plan for each layer in `tp_plan` and pass it to [`~PreTrainedModel.from_pretrained`]. The example below uses a combination of column and row partitioning. Refer to the [Partitioning strategies](#partitioning-strategies) section to learn about other supported partitioning strategies. - -> [!WARNING] -> Manually specifying your own partitioning plan requires a good understanding of the model architecture and how the partitioning strategies interact together. If you are not sure about the partitioning strategies, the resulting model can be very slow, even failing or incorrect. Refer to the [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=tensor_parallelism) to learn more. - -```py -from transformers import AutoModelForCausalLM - -tp_plan = { - "model.layers.*.self_attn.q_proj": "colwise", - "model.layers.*.self_attn.k_proj": "colwise", - "model.layers.*.self_attn.v_proj": "colwise", - "model.layers.*.self_attn.o_proj": "rowwise", - ... -} - -model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, tp_plan=tp_plan) -print(model._tp_plan) -``` - - - - ## Partitioning strategies All partitioning strategies are defined in the [`ParallelInterface`] class which maps a string to the strategy implementation. You don't need to interact with this class directly since all the strategies are set with `tp_plan` in [`~PreTrainedModel.from_pretrained`], but it is useful for checking what strategies are available. 
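As a quick sanity check on the `tp_plan="auto"` partitioning described above, the resolved plan can be printed after loading. This is a sketch meant to be launched with `torchrun`, and the checkpoint id is only a placeholder.

```python
import torch
from transformers import AutoModelForCausalLM

# Placeholder checkpoint; run with e.g. `torchrun --nproc-per-node 4 inspect_tp.py`.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    dtype=torch.bfloat16,
    tp_plan="auto",
)

# Maps module name patterns to partitioning strategies such as "colwise" and "rowwise".
print(model._tp_plan)
```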
diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 33fe9358fe7d..ed6c2b4a8d1a 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -69,7 +69,7 @@ Learn in more detail the concepts underlying 8-bit quantization in the [Gentle I Set up a [`BitsAndBytesConfig`] and set `load_in_4bit=True` to load a model in 4-bit precision. The [`BitsAndBytesConfig`] is passed to the `quantization_config` parameter in [`~PreTrainedModel.from_pretrained`]. -Allow Accelerate to automatically distribute the model across your available hardware by setting `device_map=“auto”`. +Allow Accelerate to automatically distribute the model across your available hardware by setting `device_map="auto"`. Place all inputs on the same device as the model. diff --git a/docs/source/en/perf_train_gaudi.md b/docs/source/en/perf_train_gaudi.md index 2ba792d484a3..0e5140d731ec 100644 --- a/docs/source/en/perf_train_gaudi.md +++ b/docs/source/en/perf_train_gaudi.md @@ -20,14 +20,17 @@ The Intel Gaudi AI accelerator family includes [Intel Gaudi 1](https://habana.ai [`TrainingArguments`], [`Trainer`] and [`Pipeline`] detect and set the backend device to `hpu` if an Intel Gaudi device is available. No additional changes are required to enable training and inference on your device. Some modeling code in Transformers is not optimized for HPU lazy mode. If you encounter any errors, set the environment variable below to use eager mode: -``` -PT_HPU_LAZY_MODE=0 + +```bash +export PT_HPU_LAZY_MODE=0 ``` In some cases, you'll also need to enable int64 support to avoid casting issues with long integers: + +```bash +export PT_ENABLE_INT64_SUPPORT=1 ``` -PT_ENABLE_INT64_SUPPORT=1 -``` + Refer to the [Gaudi docs](https://docs.habana.ai/en/latest/index.html) for more details. > [!TIP] diff --git a/docs/source/en/philosophy.md b/docs/source/en/philosophy.md index 7cfa46458b75..e98b1fa57bd9 100644 --- a/docs/source/en/philosophy.md +++ b/docs/source/en/philosophy.md @@ -26,24 +26,24 @@ The library was designed with two strong goals in mind: 1. Be as easy and fast to use as possible: - - We strongly limited the number of user-facing abstractions to learn, in fact, there are almost no abstractions, +- We strongly limited the number of user-facing abstractions to learn, in fact, there are almost no abstractions, just three standard classes required to use each model: [configuration](main_classes/configuration), [models](main_classes/model), and a preprocessing class ([tokenizer](main_classes/tokenizer) for NLP, [image processor](main_classes/image_processor) for vision, [feature extractor](main_classes/feature_extractor) for audio, and [processor](main_classes/processors) for multimodal inputs). - - All of these classes can be initialized in a simple and unified way from pretrained instances by using a common +- All of these classes can be initialized in a simple and unified way from pretrained instances by using a common `from_pretrained()` method which downloads (if needed), caches and loads the related class instance and associated data (configurations' hyperparameters, tokenizers' vocabulary, and models' weights) from a pretrained checkpoint provided on [Hugging Face Hub](https://huggingface.co/models) or your own saved checkpoint. 
- - On top of those three base classes, the library provides two APIs: [`pipeline`] for quickly +- On top of those three base classes, the library provides two APIs: [`pipeline`] for quickly using a model for inference on a given task and [`Trainer`] to quickly train or fine-tune a PyTorch model. - - As a consequence, this library is NOT a modular toolbox of building blocks for neural nets. If you want to +- As a consequence, this library is NOT a modular toolbox of building blocks for neural nets. If you want to extend or build upon the library, just use regular Python or PyTorch and inherit from the base classes of the library to reuse functionalities like model loading and saving. If you'd like to learn more about our coding philosophy for models, check out our [Repeat Yourself](https://huggingface.co/blog/transformers-design-philosophy) blog post. 2. Provide state-of-the-art models with performances as close as possible to the original models: - - We provide at least one example for each architecture which reproduces a result provided by the official authors +- We provide at least one example for each architecture which reproduces a result provided by the official authors of said architecture. - - The code is usually as close to the original code base as possible which means some PyTorch code may be not as +- The code is usually as close to the original code base as possible which means some PyTorch code may be not as *pytorchic* as it could be as a result of being converted from other Deep Learning frameworks. A few other goals: diff --git a/docs/source/en/pipeline_gradio.md b/docs/source/en/pipeline_gradio.md index 0cd65665d33d..b53bcc8bd184 100644 --- a/docs/source/en/pipeline_gradio.md +++ b/docs/source/en/pipeline_gradio.md @@ -45,8 +45,8 @@ gr.Interface.from_pipeline(pipeline).launch(share=True) The Space below is created with the code above and hosted on Spaces. diff --git a/docs/source/en/pipeline_webserver.md b/docs/source/en/pipeline_webserver.md index 0112d116c47d..37d245483b94 100644 --- a/docs/source/en/pipeline_webserver.md +++ b/docs/source/en/pipeline_webserver.md @@ -82,6 +82,7 @@ Query the server with a POST request. ```bash curl -X POST -d "Paris is the [MASK] of France." http://localhost:8000/ ``` + This should return the output below. ```bash diff --git a/docs/source/en/pr_checks.md b/docs/source/en/pr_checks.md index a5634c29ee49..5fdbbbab05bc 100644 --- a/docs/source/en/pr_checks.md +++ b/docs/source/en/pr_checks.md @@ -21,6 +21,7 @@ rendered properly in your Markdown viewer. # Checks on a Pull Request When you open a pull request on 🤗 Transformers, a fair number of checks will be run to make sure the patch you are adding is not breaking anything existing. Those checks are of four types: + - regular tests - documentation build - code and documentation style @@ -52,7 +53,6 @@ or for an editable install: pip install -e .[quality] ``` - ## Tests All the jobs that begin with `ci/circleci: run_tests_` run parts of the Transformers testing suite. Each of those jobs focuses on a part of the library in a certain environment: for instance `ci/circleci: run_tests_pipelines` runs the pipeline tests in an environment where all pipeline-related requirements are installed. 
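The webserver example above queries the endpoint with `curl`; the same request can be sent from Python. A small sketch, assuming the `requests` package is installed (it is not a Transformers dependency) and that the server from that guide is running locally.

```python
import requests

# Equivalent to: curl -X POST -d "Paris is the [MASK] of France." http://localhost:8000/
response = requests.post(
    "http://localhost:8000/",
    data="Paris is the [MASK] of France.".encode("utf-8"),
)
print(response.text)
```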
@@ -195,6 +195,7 @@ Another way when the patterns are just different casings of the same replacement ``` In this case, the code is copied from `BertForSequenceClassification` by replacing: + - `Bert` by `MobileBert` (for instance when using `MobileBertModel` in the init) - `bert` by `mobilebert` (for instance when defining `self.mobilebert`) - `BERT` by `MOBILEBERT` (in the constant `MOBILEBERT_INPUTS_DOCSTRING`) diff --git a/docs/source/en/quantization/auto_round.md b/docs/source/en/quantization/auto_round.md index 15abf9faa846..7526597ee86f 100644 --- a/docs/source/en/quantization/auto_round.md +++ b/docs/source/en/quantization/auto_round.md @@ -11,18 +11,17 @@ rendered properly in your Markdown viewer. # AutoRound -[AutoRound](https://github.com/intel/auto-round) is an advanced quantization algorithm that delivers strong accuracy, even at 2-bit precision. -It leverages sign gradient descent to fine-tune both rounding values and min-max clipping thresholds in just 200 steps. Designed for broad compatibility, it seamlessly supports a wide range of LLMs and is actively expanding to cover more VLMs as well. +[AutoRound](https://github.com/intel/auto-round) is an advanced quantization algorithm that delivers strong accuracy, even at 2-bit precision. +It leverages sign gradient descent to fine-tune both rounding values and min-max clipping thresholds in just 200 steps. Designed for broad compatibility, it seamlessly supports a wide range of LLMs and is actively expanding to cover more VLMs as well. It also supports quantization and inference across multiple hardware platforms, including CPU, XPU, and CUDA. -AutoRound also offers a variety of useful features, including mixed-bit tuning and inference, lm-head quantization, support for exporting to formats like GPTQ/AWQ/GGUF, and flexible tuning recipes. +AutoRound also offers a variety of useful features, including mixed-bit tuning and inference, lm-head quantization, support for exporting to formats like GPTQ/AWQ/GGUF, and flexible tuning recipes. For a comprehensive overview and the latest updates, check out the AutoRound [README](https://github.com/intel/auto-round). -AutoRound was originally developed as part of the [Intel Neural Compressor](https://github.com/intel/neural-compressor), serving as a general-purpose model compression library for deep learning. -It has since evolved into a standalone library focused specifically on low-precision optimization for large language models (LLMs). +AutoRound was originally developed as part of the [Intel Neural Compressor](https://github.com/intel/neural-compressor), serving as a general-purpose model compression library for deep learning. +It has since evolved into a standalone library focused specifically on low-precision optimization for large language models (LLMs). AutoRound remains fully integrated with the Intel Neural Compressor, and you can explore the repository for more details. - ## Installation ```bash @@ -51,6 +50,7 @@ Currently, only offline mode is supported to generate quantized models. ### Command Line Usage + ```bash auto-round \ --model facebook/opt-125m \ @@ -59,7 +59,7 @@ auto-round \ --output_dir ./tmp_autoround ``` -AutoRound also offer another two recipes, `auto-round-best` and `auto-round-light`, designed for optimal accuracy and improved speed, respectively. +AutoRound also offer another two recipes, `auto-round-best` and `auto-round-light`, designed for optimal accuracy and improved speed, respectively. For 2 bits, we recommend using `auto-round-best` or `auto-round`. 
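The `auto-round` CLI call above has a Python equivalent built on the same `AutoRound` class used later on this page; a condensed sketch is below, and the exact constructor arguments may differ between auto-round versions.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound

model_name = "facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Mirrors: auto-round --model facebook/opt-125m --bits 4 --group_size 128
autoround = AutoRound(model, tokenizer, bits=4, group_size=128)
autoround.quantize_and_save("./tmp_autoround", format="auto_round")
```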
@@ -99,6 +99,7 @@ autoround.quantize_and_save(output_dir, format='auto_round') ### AutoRoundBest recipe This setting provides the best accuracy in most scenarios but is 4–5× slower than the standard AutoRound recipe. It is especially recommended for 2-bit quantization and is a good choice if sufficient resources are available. + ```python from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound @@ -121,6 +122,7 @@ autoround = AutoRound( output_dir = "./tmp_autoround" autoround.quantize_and_save(output_dir, format='auto_round') ``` + @@ -230,7 +232,7 @@ print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=Fal AutoRound automatically selects the backend for each layer based on compatibility. In general, the priority order is Marlin > ExLLaMAV2 > Triton, but the final choice depends on factors such as group size, bit width, packing format, hardware device, and other implementation details. For more details, please refer to [backends](https://github.com/intel/auto-round?tab=readme-ov-file#specify-backend), -The backend may not always be the most suitable for certain devices. +The backend may not always be the most suitable for certain devices. You can specify your preferred backend such as "ipex" for CPU, "ipex/triton" for XPU, "marlin/exllamav2/triton" for CUDA, according to your needs or hardware compatibility. Please note that additional corresponding libraries may be required. ```python @@ -247,7 +249,6 @@ print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=Fal - ### Convert GPTQ/AWQ to AutoRound @@ -277,7 +278,6 @@ the [transformers](https://github.com/huggingface/transformers/issues) repositor If you encounter any issues with auto-round, please open an issue on the [AutoRound](https://github.com/intel/auto-round/issues) repository. - ## Acknowledgement Special thanks to open-source low precision libraries such as AutoGPTQ, AutoAWQ, GPTQModel, Triton, Marlin, and ExLLaMAV2 for providing low-precision CUDA kernels, which are leveraged in AutoRound. diff --git a/docs/source/en/quantization/awq.md b/docs/source/en/quantization/awq.md index b6437e2588a8..b2cf4b9ecdf6 100644 --- a/docs/source/en/quantization/awq.md +++ b/docs/source/en/quantization/awq.md @@ -25,6 +25,7 @@ Run the command below to install autoawq ```bash pip install autoawq ``` + > [!WARNING] > AutoAWQ downgrades Transformers to version 4.47.1. If you want to do inference with AutoAWQ, you may need to reinstall your Transformers' version after installing AutoAWQ. diff --git a/docs/source/en/quantization/bitnet.md b/docs/source/en/quantization/bitnet.md index 922210b2137b..31474e1d3213 100644 --- a/docs/source/en/quantization/bitnet.md +++ b/docs/source/en/quantization/bitnet.md @@ -41,7 +41,7 @@ model = AutoModelForCausalLM.from_pretrained(path, device_map="auto") ## Kernels -`@torch.compile` is used to unpack the weights and perform the forward pass. It’s very straightforward to implement and delivers significant speed improvements. Additional optimized kernels will be integrated in future versions. +`@torch.compile` is used to unpack the weights and perform the forward pass. It's very straightforward to implement and delivers significant speed improvements. Additional optimized kernels will be integrated in future versions. 
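Following the kernel note above, generation with a pre-quantized BitNet checkpoint looks like any other causal LM. A minimal sketch, where `path` stands in for whichever 1.58-bit checkpoint you load and the prompt is a placeholder.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "<pre-quantized BitNet checkpoint>"  # same `path` as in the loading example above
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForCausalLM.from_pretrained(path, device_map="auto")

inputs = tokenizer("What is quantization?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```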
## Resources diff --git a/docs/source/en/quantization/bitsandbytes.md b/docs/source/en/quantization/bitsandbytes.md index 60c3c2dfebf9..81238c0707e7 100644 --- a/docs/source/en/quantization/bitsandbytes.md +++ b/docs/source/en/quantization/bitsandbytes.md @@ -16,7 +16,7 @@ rendered properly in your Markdown viewer. # Bitsandbytes -The [bitsandbytes](https://github.com/bitsandbytes-foundation/bitsandbytes) library provides quantization tools for LLMs through a lightweight Python wrapper around CUDA functions. It enables working with large models using limited computational resources by reducing their memory footprint. +The [bitsandbytes](https://github.com/bitsandbytes-foundation/bitsandbytes) library provides quantization tools for LLMs through a lightweight Python wrapper around hardware accelerator functions. It enables working with large models using limited computational resources by reducing their memory footprint. At its core, bitsandbytes provides: @@ -32,36 +32,38 @@ bitsandbytes offers two main quantization features: > **Note:** For a user-friendly quantization experience, you can use the `bitsandbytes` [community space](https://huggingface.co/spaces/bnb-community/bnb-my-repo). - Run the command below to install bitsandbytes. ```bash pip install --upgrade transformers accelerate bitsandbytes ``` + To compile from source, follow the instructions in the [bitsandbytes installation guide](https://huggingface.co/docs/bitsandbytes/main/en/installation). ## Hardware Compatibility -bitsandbytes is currently only supported on CUDA GPUs for CUDA versions 11.0 - 12.8. However, there's an ongoing multi-backend effort under development, which is currently in alpha. If you're interested in providing feedback or testing, check out the [bitsandbytes repository](https://github.com/bitsandbytes-foundation/bitsandbytes) for more information. +bitsandbytes is supported on NVIDIA GPUs for CUDA versions 11.8 - 13.0, Intel XPU, Intel Gaudi (HPU), and CPU. There is an ongoing effort to support additional platforms. If you're interested in providing feedback or testing, check out the [bitsandbytes repository](https://github.com/bitsandbytes-foundation/bitsandbytes) for more information. -### CUDA +### NVIDIA GPUs (CUDA) + +This backend is supported on Linux x86-64, Linux aarch64, and Windows platforms. | Feature | Minimum Hardware Requirement | |---------|-------------------------------| -| 8-bit optimizers | NVIDIA Maxwell (GTX 900 series, TITAN X, M40) or newer GPUs * | -| LLM.int8() | NVIDIA Turing (RTX 20 series, T4) or newer GPUs | -| NF4/FP4 quantization | NVIDIA Maxwell (GTX 900 series, TITAN X, M40) or newer GPUs * | +| 8-bit optimizers | NVIDIA Pascal (GTX 10X0 series, P100) or newer GPUs * | +| LLM.int8() | NVIDIA Turing (RTX 20X0 series, T4) or newer GPUs | +| NF4/FP4 quantization | NVIDIA Pascal (GTX 10X0 series, P100) or newer GPUs * | + +### Intel GPUs (XPU) -### Multi-backend +This backend is supported on Linux x86-64 and Windows x86-64 platforms. 
-| Backend | Supported Versions | Python versions | Architecture Support | Status | -|---------|-------------------|----------------|---------------------|---------| -| AMD ROCm | 6.1+ | 3.10+ | minimum CDNA - gfx90a, RDNA - gfx1100 | Alpha | -| Apple Silicon (MPS) | WIP | 3.10+ | M1/M2 chips | Planned | -| Intel CPU | v2.4.0+ (ipex) | 3.10+ | Intel CPU | Alpha | -| Intel GPU | v2.4.0+ (ipex) | 3.10+ | Intel GPU | Experimental | -| Ascend NPU | 2.1.0+ (torch_npu) | 3.10+ | Ascend NPU | Experimental | +### Intel Gaudi (HPU) -> **Note:** Bitsandbytes is moving away from the multi-backend approach towards using [Pytorch Custom Operators](https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html), as the main mechanism for supporting new hardware, and dispatching to the correct backend. +This backend is supported on Linux x86-64 for Gaudi2 and Gaudi3. + +### CPU + +This backend is supported on Linux x86-64, Linux aarch64, and Windows x86-64 platforms. ## Quantization Examples @@ -116,6 +118,7 @@ model = AutoModelForCausalLM.from_pretrained( model.push_to_hub("bloom-560m-8bit") ``` +
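Once the 8-bit model above has been pushed with `push_to_hub("bloom-560m-8bit")`, it can be reloaded without re-specifying the quantization settings because they are stored in the checkpoint's config. A short sketch with a hypothetical repo id.

```python
from transformers import AutoModelForCausalLM

# Hypothetical repo id produced by the push_to_hub call above.
model = AutoModelForCausalLM.from_pretrained("<your-username>/bloom-560m-8bit", device_map="auto")
```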
@@ -166,6 +169,7 @@ model = AutoModelForCausalLM.from_pretrained( model.push_to_hub("bloom-560m-4bit") ``` +
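For the 4-bit path, a compact sketch of a typical setup is below; the NF4 type, bfloat16 compute dtype, and nested quantization flag are common defaults rather than values taken from this page, and the `bigscience/bloom-560m` checkpoint is assumed to match the `bloom-560m-4bit` naming used above.

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",              # NF4 data type
    bnb_4bit_compute_dtype=torch.bfloat16,  # run matmuls in bf16
    bnb_4bit_use_double_quant=True,         # quantize the quantization constants too
)

model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-560m",
    quantization_config=quantization_config,
    device_map="auto",
)
```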
diff --git a/docs/source/en/quantization/compressed_tensors.md b/docs/source/en/quantization/compressed_tensors.md index a3b01a1b4489..4f55f008aa8d 100644 --- a/docs/source/en/quantization/compressed_tensors.md +++ b/docs/source/en/quantization/compressed_tensors.md @@ -65,11 +65,11 @@ print(f"{mem_params/2**30:.4f} GB") ## Model checkpoint -compressed-tensor models are defined through its configuration entry. The following example is taken from the [nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf](https://huggingface.co/nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf/blob/main/config.json) `config.json` file. +Compressed-tensor models are defined through its configuration entry. The following example is taken from the [nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf](https://huggingface.co/nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf/blob/main/config.json) `config.json` file. There are a lot of entries to allow for flexible expression both during and after compression, but the entries for loading and inference can be simplified to focus on just a few key entries. -```yaml +```json "quantization_config": { "config_groups": { "group_0": { @@ -97,31 +97,31 @@ The config file specifies the quantization of a config group (`group_0`), which For a more detailed look at the model weights, use the [safetensors viewer](https://huggingface.co/nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf?show_file_info=model.safetensors.index.json) on the model card to see the quantized weights, input scale, and weight scale for all [nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) modules. -| Tensors | Shape | Precision | +| Tensors | Shape | Precision | | ------- | ----- | --------- | -model.layers.0.input_layernorm.weight | [4 096] | BF16 -model.layers.0.mlp.down_proj.input_scale | [1] | BF16 -model.layers.0.mlp.down_proj.weight | [4 096, 14 336] | F8_E4M3 -model.layers.0.mlp.down_proj.weight_scale | [1] | BF16 -model.layers.0.mlp.gate_proj.input_scale | [1] | BF16 -model.layers.0.mlp.gate_proj.weight | [14 336, 4 096] | F8_E4M3 -model.layers.0.mlp.gate_proj.weight_scale | [1] | BF16 -model.layers.0.mlp.up_proj.input_scale| [1] |BF16 -model.layers.0.mlp.up_proj.weight | [14 336, 4 096] | F8_E4M3 -model.layers.0.mlp.up_proj.weight_scale | [1] | BF16 -model.layers.0.post_attention_layernorm.weight | [4 096] |BF16 -model.layers.0.self_attn.k_proj.input_scale | [1] | BF16 -model.layers.0.self_attn.k_proj.weight | [1 024, 4 096]| F8_E4M3 -model.layers.0.self_attn.k_proj.weight_scale |[1] | BF16 -model.layers.0.self_attn.o_proj.input_scale | [1] | BF16 -model.layers.0.self_attn.o_proj.weight | [4 096, 4 096] | F8_E4M3 -model.layers.0.self_attn.o_proj.weight_scale | [1] | BF16 -model.layers.0.self_attn.q_proj.input_scale | [1] | BF16 -model.layers.0.self_attn.q_proj.weight | [4 096, 4 096] | F8_E4M3 -model.layers.0.self_attn.q_proj.weight_scale | [1] | BF16 -model.layers.0.self_attn.v_proj.input_scale | [1] | BF16 -model.layers.0.self_attn.v_proj.weight | [1 024, 4 096] | F8_E4M3 -model.layers.0.self_attn.v_proj.weight_scale | [1] | BF16 +|model.layers.0.input_layernorm.weight | [4 096] | BF16| +|model.layers.0.mlp.down_proj.input_scale | [1] | BF16| +|model.layers.0.mlp.down_proj.weight | [4 096, 14 336] | F8_E4M3| +|model.layers.0.mlp.down_proj.weight_scale | [1] | BF16| +|model.layers.0.mlp.gate_proj.input_scale | [1] | BF16| +|model.layers.0.mlp.gate_proj.weight | [14 336, 4 096] | F8_E4M3| +|model.layers.0.mlp.gate_proj.weight_scale | [1] | BF16| +|model.layers.0.mlp.up_proj.input_scale| [1] |BF16| 
+|model.layers.0.mlp.up_proj.weight | [14 336, 4 096] | F8_E4M3| +|model.layers.0.mlp.up_proj.weight_scale | [1] | BF16| +|model.layers.0.post_attention_layernorm.weight | [4 096] |BF16| +|model.layers.0.self_attn.k_proj.input_scale | [1] | BF16| +|model.layers.0.self_attn.k_proj.weight | [1 024, 4 096]| F8_E4M3| +|model.layers.0.self_attn.k_proj.weight_scale |[1] | BF16| +|model.layers.0.self_attn.o_proj.input_scale | [1] | BF16| +|model.layers.0.self_attn.o_proj.weight | [4 096, 4 096] | F8_E4M3| +|model.layers.0.self_attn.o_proj.weight_scale | [1] | BF16| +|model.layers.0.self_attn.q_proj.input_scale | [1] | BF16| +|model.layers.0.self_attn.q_proj.weight | [4 096, 4 096] | F8_E4M3| +|model.layers.0.self_attn.q_proj.weight_scale | [1] | BF16| +|model.layers.0.self_attn.v_proj.input_scale | [1] | BF16| +|model.layers.0.self_attn.v_proj.weight | [1 024, 4 096] | F8_E4M3| +|model.layers.0.self_attn.v_proj.weight_scale | [1] | BF16| When loading a compressed-tensors model with the [`~quantizers.HFQuantizer`] integration, all the [nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) modules specified in the quantization config are replaced by [CompressedLinear](https://github.com/neuralmagic/compressed-tensors/blob/975cb223b19fcac2b98a4271d17668462d4d6e1d/src/compressed_tensors/linear/compressed_linear.py#L30) modules that manage the compressed weights and forward pass for inference. The `lm_head` module is still kept as an unquantized nn.Linear module. diff --git a/docs/source/en/quantization/concept_guide.md b/docs/source/en/quantization/concept_guide.md index ff300b9d48a5..df3a2bdc6f2a 100644 --- a/docs/source/en/quantization/concept_guide.md +++ b/docs/source/en/quantization/concept_guide.md @@ -18,12 +18,11 @@ rendered properly in your Markdown viewer. Quantization reduces the memory footprint and computational cost of large machine learning models like those found in the Transformers library. It achieves this by representing the model's weights and or activations with lower-precision data types (like 8-bit integers or int8) instead of the standard 32-bit floating-point (float32). - Reducing a model's precision offers several significant benefits: -- Smaller model size: Lower-precision data types require less storage space. An int8 model, for example, is roughly 4 times smaller than its float32 counterpart. -- Faster inference: Operations on lower-precision data types, especially integers, can be significantly faster on compatible hardware (CPUs and GPUs often have specialized instructions for int8 operations). This leads to lower latency. -- Reduced energy consumption: Faster computations and smaller memory transfers often translate to lower power usage. +- Smaller model size: Lower-precision data types require less storage space. An int8 model, for example, is roughly 4 times smaller than its float32 counterpart. +- Faster inference: Operations on lower-precision data types, especially integers, can be significantly faster on compatible hardware (CPUs and GPUs often have specialized instructions for int8 operations). This leads to lower latency. +- Reduced energy consumption: Faster computations and smaller memory transfers often translate to lower power usage. The primary trade-off in quantization is *efficiency* vs. *accuracy*. Reducing precision saves resources but inevitably introduces small errors (quantization noise). 
The goal is to minimize this error using appropriate schemes (affine/symmetric), granularity (per-tensor/channel), and techniques (PTQ/QAT) so that the model's performance on its target task degrades as little as possible. @@ -46,8 +45,7 @@ The most common method is *affine quantization*. For a given float32 tensor (lik There are two main ways to perform this mapping, *symmetric* and *asymmetric*. The choice between symmetric and asymmetric quantization determines how the float32 range is mapped to the int8 range. - Symmetric: This method assumes the original float32 range is symmetric around zero ( \\([ -a, a ]\\) ). This range is mapped symmetrically to the int8 range, for example, \\([-127, 127]\\). A key characteristic is that the float32 value \\(0.0\\) maps directly to the int8 value \\(0\\). This only requires one parameter, the **scale ( \\(S\\) )**, to define the mapping. It can simplify computations, but it might be less accurate if the original data distribution isn't naturally centered around zero. -- Asymmetric (Affine): This method does not assume the data is centered around zero. It maps the exact range \\([val_{min}, val_{max}]\\) from float32 to the full int8 range, like \\([-128, 127]\\). This requires two parameters, a **scale ( \\(S\\) )** and a **zero-point ( \\(Z\\) )**. - +- Asymmetric (Affine): This method does not assume the data is centered around zero. It maps the exact range \\([val_{min}, val_{max}]\\) from float32 to the full int8 range, like \\([-128, 127]\\). This requires two parameters, a **scale ( \\(S\\) )** and a **zero-point ( \\(Z\\) )**. scale ( \\(S\\) ): A positive float32 number representing the ratio between the float32 and the int8 range. @@ -134,8 +132,7 @@ There are two main types of quantization techniques. ## Quantization in Transformers -Transformers integrates several quantization backends such as bitsandbytes, torchao, compressed-tensors, and more (refer to the quantization [overview](./overview) for more backends). - +Transformers integrates several quantization backends such as bitsandbytes, torchao, compressed-tensors, and more (refer to the quantization [overview](./overview) for more backends). All backends are unified under the [`HfQuantizer`] API and associated [`QuantizationConfig`] classes. You can integrate your own custom quantization backends by implementing a custom [`HfQuantizer`] and [`QuantizationConfig`], as shown in the [Contribution](./contribute) guide. @@ -165,7 +162,6 @@ model = AutoModelForCausalLM.from_pretrained( ) ``` - ## Resources To explore quantization and related performance optimization concepts more deeply, check out the following resources. 
@@ -175,4 +171,4 @@ To explore quantization and related performance optimization concepts more deepl - [Introduction to Quantization cooked in 🤗 with 💗🧑‍🍳](https://huggingface.co/blog/merve/quantization) - [EfficientML.ai Lecture 5 - Quantization Part I](https://www.youtube.com/watch?v=RP23-dRVDWM) - [Making Deep Learning Go Brrrr From First Principles](https://horace.io/brrr_intro.html) -- [Accelerating Generative AI with PyTorch Part 2: LLM Optimizations](https://pytorch.org/blog/accelerating-generative-ai-2/) \ No newline at end of file +- [Accelerating Generative AI with PyTorch Part 2: LLM Optimizations](https://pytorch.org/blog/accelerating-generative-ai-2/) diff --git a/docs/source/en/quantization/finegrained_fp8.md b/docs/source/en/quantization/finegrained_fp8.md index bbf273d8d933..1afd1505029b 100644 --- a/docs/source/en/quantization/finegrained_fp8.md +++ b/docs/source/en/quantization/finegrained_fp8.md @@ -59,4 +59,4 @@ Use [`~PreTrainedModel.save_pretrained`] to save the quantized model and reload quant_path = "/path/to/save/quantized/model" model.save_pretrained(quant_path) model = AutoModelForCausalLM.from_pretrained(quant_path, device_map="auto") -``` \ No newline at end of file +``` diff --git a/docs/source/en/quantization/fp_quant.md b/docs/source/en/quantization/fp_quant.md index 7c12fb870531..4888795a6d77 100644 --- a/docs/source/en/quantization/fp_quant.md +++ b/docs/source/en/quantization/fp_quant.md @@ -18,7 +18,9 @@ rendered properly in your Markdown viewer. [FP-Quant](https://github.com/IST-DASLab/FP-Quant) is a family of quantization algorithms tailored for the Blackwell generation of Nvidia GPUs. The goal is to allow for efficient post-training quantization (PTQ) and quantization-aware training (QAT) of LLMs in the [MXFP4 and NVFP4 data-types](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf). -Currently, only PTQ with MXFP4 is supported. Models can either be quantized on the fly with `quantization_config=FPQuantConfig()`: +This integration accompanies the [**Bridging the Gap Between Promise and Performance for Microscaling FP4 Quantization**](https://arxiv.org/abs/2509.23202) pre-print. + +Currently, QAT is only supported with `pseudoquantization=True`. Models can either be quantized on the fly with `quantization_config=FPQuantConfig()`: ```python from transformers import AutoModelForCausalLM, AutoTokenizer, FPQuantConfig @@ -34,6 +36,8 @@ model = AutoModelForCausalLM.from_pretrained( or pre-processed with GPTQ for better quality (see [FP Format Quantization Harness](https://github.com/IST-DASLab/FP-Quant)). +You can choose between MXFP4 and NVFP4 with `FPQuantConfig(forward_dtype="mxfp4")`. NVFP4 provides better quality but uses a little more memory. + A **Blackwell-generation GPU is required** to run the kernels. Runtime support for FP-Quant is implemented through the [QuTLASS](https://github.com/IST-DASLab/qutlass) library and a lightweight PyTorch interface lib [`fp_quant`](https://github.com/IST-DASLab/FP-Quant/tree/master/inference_lib). We recommend installing the former **from source** and the latter with `pip install fp_quant`. Users **without a Blackwell-generation GPU** , can use the method with `quantization_config=FPQuantConfig(pseudoquant=True)` without having to install [QuTLASS](https://github.com/IST-DASLab/qutlass). This would provide no speedups but would fully emulate the effect of quantization.
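To make the scale and zero-point definitions from the concept guide above concrete, here is a small self-contained sketch of asymmetric (affine) int8 quantization of a float32 tensor; the values are illustrative only.

```python
import numpy as np

# Illustrative float32 values quantized with the affine scheme described in the concept guide.
x = np.array([-0.62, -0.10, 0.0, 0.37, 1.25], dtype=np.float32)

qmin, qmax = -128, 127
scale = (x.max() - x.min()) / (qmax - qmin)       # S
zero_point = int(round(qmin - x.min() / scale))   # Z, the int8 value that represents 0.0

x_q = np.clip(np.round(x / scale) + zero_point, qmin, qmax).astype(np.int8)   # quantize
x_dq = (x_q.astype(np.float32) - zero_point) * scale                          # dequantize

print(scale, zero_point)          # mapping parameters
print(np.abs(x - x_dq).max())     # quantization noise, roughly bounded by scale / 2
```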
diff --git a/docs/source/en/quantization/mxfp4.md b/docs/source/en/quantization/mxfp4.md index a2b9f7634c8d..dd313c5555ed 100644 --- a/docs/source/en/quantization/mxfp4.md +++ b/docs/source/en/quantization/mxfp4.md @@ -16,7 +16,7 @@ rendered properly in your Markdown viewer. # MXFP4 -Note: MXFP4 quantisation currently only works for OpenAI GPT-OSS 120b and 20b. +Note: MXFP4 quantisation currently only works for OpenAI GPT-OSS 120b and 20b. MXFP4 is a 4-bit floating point format that dramatically reduces the memory requirements of large models. Large models (GPT-OSS-120B) can fit on a single 80GB GPU and smaller models (GPT-OSS-20B) only require 16GB of memory. It uses blockwise scaling to preserve it's range and accuracy, which typically becomes degraded at lower precisions. @@ -25,7 +25,6 @@ To use MXPF4, make sure your hardware meets the following requirements. - Install Accelerate, kernels, and Triton ≥ 3.4. Only manually install Triton ≥ 3.4 if you're using PyTorch 2.7 because it is already supported in PyTorch 2.8. - NVIDIA GPU Compute Capability ≥ 7.5 which includes Tesla GPUs and newer. Use [get_device_capability](https://docs.pytorch.org/docs/stable/generated/torch.cuda.get_device_capability.html) to check Compute Capability. - ```python from torch import cuda cuda.get_device_capability() @@ -54,7 +53,6 @@ print(cfg.quantization_config) # } ``` - ## MXFP4 kernels Transformers automatically pulls the MXFP4-aware Triton kernels from the community repository when you load a model that needs them. The kernels are stored in your local cache and used during the forward pass. @@ -67,7 +65,6 @@ You can use [hf cache scan](https://huggingface.co/docs/huggingface_hub/en/guide hf cache scan ``` - ```shell REPO ID REPO TYPE SIZE ON DISK -------------------------------- --------- ------------ diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md index ceab195b2b59..0a8dee1e33ae 100644 --- a/docs/source/en/quantization/overview.md +++ b/docs/source/en/quantization/overview.md @@ -27,14 +27,14 @@ Use the Space below to help you pick a quantization method depending on your har | [AQLM](./aqlm) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 🟢 | 1/2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM | | [AutoRound](./auto_round) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 🔴 | 2/3/4/8 | 🔴 | 🟢 | 🟢 | https://github.com/intel/auto-round | | [AWQ](./awq) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | ? | 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ | -| [bitsandbytes](./bitsandbytes) | 🟢 | 🟡 | 🟢 | 🟡 | 🔴 | 🟡 | 🟢 | 4/8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes | +| [bitsandbytes](./bitsandbytes) | 🟢 | 🟢 | 🟢 | 🟡 | 🟡 | 🟢 | 🟢 | 4/8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes | | [compressed-tensors](./compressed_tensors) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 1/8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors | | [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | ? 
| 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ | | [FP-Quant](./fp_quant) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 4 | 🔴 | 🟢 | 🟢 | https://github.com/IST-DASLab/FP-Quant | | [GGUF / GGML (llama.cpp)](../gguf) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🟢 | 🔴 | 1/8 | 🔴 | [See Notes](../gguf) | [See Notes](../gguf) | https://github.com/ggerganov/llama.cpp | | [GPTQModel](./gptq) | 🔴 | 🟢 | 🟢 | 🟢 | 🟢 | 🟢 | 🔴 | 2/3/4/8 | 🟢 | 🟢 | 🟢 | https://github.com/ModelCloud/GPTQModel | | [AutoGPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 2/3/4/8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ | -| [HIGGS](./higgs) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 2/4 | 🔴 | 🟢 | 🟢 | https://github.com/HanGuo97/flute | +| [HIGGS](./higgs) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 2/4 | 🔴 | 🟢 | 🟢 | https://github.com/HanGuo97/flute | | [HQQ](./hqq) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 🟢 | 1/8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ | | [optimum-quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🟢 | 🟢 | 2/4/8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto | | [FBGEMM_FP8](./fbgemm_fp8) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM | @@ -53,7 +53,7 @@ If you are new to quantization, we recommend checking out these beginner-friendl ## User-Friendly Quantization Tools -If you are looking for a user-friendly quantization experience, you can use the following community spaces and notebooks: +If you are looking for a user-friendly quantization experience, you can use the following community spaces and notebooks: * [Bitsandbytes Space](https://huggingface.co/spaces/bnb-community/bnb-my-repo) * [GGUF Space](https://huggingface.co/spaces/ggml-org/gguf-my-repo) diff --git a/docs/source/en/quantization/quanto.md b/docs/source/en/quantization/quanto.md index b3cf58b5b6ad..f58f93025f45 100644 --- a/docs/source/en/quantization/quanto.md +++ b/docs/source/en/quantization/quanto.md @@ -66,4 +66,4 @@ model = torch.compile(model) Read the [Quanto: a PyTorch quantization backend for Optimum](https://huggingface.co/blog/quanto-introduction) blog post to learn more about the library design and benchmarks. -For more hands-on examples, take a look at the Quanto [notebook](https://colab.research.google.com/drive/16CXfVmtdQvciSh9BopZUDYcmXCDpvgrT?usp=sharing). \ No newline at end of file +For more hands-on examples, take a look at the Quanto [notebook](https://colab.research.google.com/drive/16CXfVmtdQvciSh9BopZUDYcmXCDpvgrT?usp=sharing). diff --git a/docs/source/en/quantization/selecting.md b/docs/source/en/quantization/selecting.md index 7653e946dd80..e2c7bdf27076 100644 --- a/docs/source/en/quantization/selecting.md +++ b/docs/source/en/quantization/selecting.md @@ -26,7 +26,7 @@ Consider the quantization methods below for inference. | quantization method | use case | |---|---| -| bitsandbytes | ease of use and QLoRA fine-tuning on NVIDIA GPUs | +| bitsandbytes | ease of use and QLoRA fine-tuning on NVIDIA and Intel GPUs | | compressed-tensors | loading specific quantized formats (FP8, Sparse) | | GPTQModel or AWQ | good 4-bit accuracy with upfront calibration | | HQQ | fast on the fly quantization without calibration | @@ -112,22 +112,22 @@ Consider the quantization method below during fine-tuning to save memory. ### bitsandbytes[[training]] -* **Description:** The standard method for QLoRA fine-tuning via PEFT. -* **Pros:** Enables fine-tuning large models on consumer GPUs; widely supported and documented for PEFT. -* **Cons:** Primarily for NVIDIA GPUs. +* **Description:** The standard method for QLoRA fine-tuning via PEFT. 
+* **Pros:** Enables fine-tuning large models on consumer GPUs; widely supported and documented for PEFT. +* **Cons:** Primarily for NVIDIA GPUs. Other methods offer PEFT compatibility, though bitsandbytes is the most established and straightforward path for QLoRA. -See the [bitsandbytes documentation](./bitsandbytes#qlora) and [PEFT Docs](https://huggingface.co/docs/peft/developer_guides/quantization#aqlm-quantization) for more details. +See the [bitsandbytes documentation](./bitsandbytes#qlora) and [PEFT Docs](https://huggingface.co/docs/peft/developer_guides/quantization#aqlm-quantization) for more details. ## Research Methods like [AQLM](./aqlm), [SpQR](./spqr), [VPTQ](./vptq), [HIGGS](./higgs), etc., push the boundaries of compression (< 2-bit) or explore novel techniques. -* Consider these if: - * You need extreme compression (sub-4-bit). - * You are conducting research or require state-of-the-art results from their respective papers. - * You have significant compute resources available for potentially complex quantization procedures. +* Consider these if: + * You need extreme compression (sub-4-bit). + * You are conducting research or require state-of-the-art results from their respective papers. + * You have significant compute resources available for potentially complex quantization procedures. We recommend consulting each method's documentation and associated papers carefully before choosing one for use in production. ## Benchmark Comparison @@ -154,4 +154,4 @@ The key takeaways are: | **Sub-4-bit** (VPTQ, AQLM, 2-bit GPTQ) | Extreme (>4x) | Noticeable drop, especially at 2-bit | Quantization times can be very long (AQLM, VPTQ). Performance varies. | > [!TIP] -> Always benchmark the performance (accuracy and speed) of the quantized model on your specific task and hardware to ensure it meets your requirements. Refer to the individual documentation pages linked above for detailed usage instructions. \ No newline at end of file +> Always benchmark the performance (accuracy and speed) of the quantized model on your specific task and hardware to ensure it meets your requirements. Refer to the individual documentation pages linked above for detailed usage instructions. diff --git a/docs/source/en/quantization/torchao.md b/docs/source/en/quantization/torchao.md index 6427866d0229..8778f9f3e5ea 100644 --- a/docs/source/en/quantization/torchao.md +++ b/docs/source/en/quantization/torchao.md @@ -30,7 +30,6 @@ See the table below for additional torchao features. > [!TIP] > Refer to the torchao [README.md](https://github.com/pytorch/ao#torchao-pytorch-architecture-optimization) for more details about the library. - torchao supports the [quantization techniques](https://github.com/pytorch/ao/blob/main/torchao/quantization/README.md) below. - A16W8 Float8 Dynamic Quantization @@ -43,7 +42,6 @@ torchao supports the [quantization techniques](https://github.com/pytorch/ao/blo torchao also supports module-level configuration by specifying a dictionary mapping the fully qualified name of a module to its corresponding quantization config. This allows skipping quantization for certain layers and using different quantization configs for different modules. - Check the table below to see if your hardware is compatible. | Component | Compatibility | @@ -52,8 +50,6 @@ Check the table below to see if your hardware is compatible. | XPU Versions | ✅ pytorch2.8 | | CPU | ✅ change `device_map="cpu"` (see examples below) | - - Install torchao from PyPi or the PyTorch index with the following commands. 
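The compatibility table above maps onto the `device_map` argument used in the examples below. As a minimal sketch (not part of the upstream guide; the helper name `pick_device_map` and the backend checks are assumptions), the value could be chosen programmatically:

```py
import torch

def pick_device_map() -> str:
    # CUDA and Intel XPU builds can let Accelerate place the weights automatically.
    if torch.cuda.is_available():
        return "auto"
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "auto"
    # CPU-only environments need an explicit map, as the table notes.
    return "cpu"

print(pick_device_map())
```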
@@ -64,13 +60,15 @@ Install torchao from PyPi or the PyTorch index with the following commands. # Stable release from Pypi which will default to CUDA 12.6 pip install --upgrade torchao transformers ``` + Stable Release from the PyTorch index - + ```bash pip install torchao --index-url https://download.pytorch.org/whl/cu126 # options are cpu/cu118/cu126/cu128 ``` + @@ -118,6 +116,7 @@ input_ids = tokenizer(input_text, return_tensors="pt").to(model.device) output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") print(tokenizer.decode(output[0], skip_special_tokens=True)) ``` + @@ -146,6 +145,7 @@ input_ids = tokenizer(input_text, return_tensors="pt").to(model.device) output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") print(tokenizer.decode(output[0], skip_special_tokens=True)) ``` + @@ -177,13 +177,14 @@ input_ids = tokenizer(input_text, return_tensors="pt").to(model.device) output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") print(tokenizer.decode(output[0], skip_special_tokens=True)) ``` + ### A100 GPU - + ```py import torch from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer @@ -210,6 +211,7 @@ input_ids = tokenizer(input_text, return_tensors="pt").to(model.device) output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") print(tokenizer.decode(output[0], skip_special_tokens=True)) ``` + @@ -245,6 +247,7 @@ input_ids = tokenizer(input_text, return_tensors="pt").to(model.device) output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") print(tokenizer.decode(output[0], skip_special_tokens=True)) ``` + @@ -276,13 +279,14 @@ input_ids = tokenizer(input_text, return_tensors="pt").to(model.device) output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") print(tokenizer.decode(output[0], skip_special_tokens=True)) ``` + ### Intel XPU - + ```py import torch from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer @@ -309,6 +313,7 @@ input_ids = tokenizer(input_text, return_tensors="pt").to(model.device) output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") print(tokenizer.decode(output[0], skip_special_tokens=True)) ``` + @@ -340,14 +345,14 @@ input_ids = tokenizer(input_text, return_tensors="pt").to(model.device) output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") print(tokenizer.decode(output[0], skip_special_tokens=True)) ``` + - ### CPU - + ```py import torch from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer @@ -373,6 +378,7 @@ input_ids = tokenizer(input_text, return_tensors="pt") output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") print(tokenizer.decode(output[0], skip_special_tokens=True)) ``` + @@ -404,12 +410,14 @@ input_ids = tokenizer(input_text, return_tensors="pt") output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") print(tokenizer.decode(output[0], skip_special_tokens=True)) ``` + ### Per Module Quantization #### 1. Skip quantization for certain layers With `ModuleFqnToConfig` we can specify a default configuration for all layers while skipping quantization for certain layers. 
+ ```py import torch from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig @@ -438,6 +446,7 @@ print(output_text) ``` #### 2. Quantizing different layers with different quantization configs + ```py import torch from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig @@ -485,7 +494,6 @@ Note: autoquant is for GPU only right now. Create a [`TorchAoConfig`] and set to `"autoquant"`. Set the `cache_implementation` to `"static"` to automatically [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) the forward method. Finally, call `finalize_autoquant` on the quantized model to finalize the quantization and log the input shapes. - ```py import torch from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer @@ -509,7 +517,6 @@ quantized_model.finalize_autoquant() print(tokenizer.decode(output[0], skip_special_tokens=True)) ``` - ## Serialization torchao implements [torch.Tensor subclasses](https://pytorch.org/docs/stable/notes/extending.html#subclassing-torch-tensor) for maximum flexibility in supporting new quantized torch.Tensor formats. [Safetensors](https://huggingface.co/docs/safetensors/en/index) serialization and deserialization does not work with torchao. @@ -518,15 +525,16 @@ To avoid arbitrary user code execution, torchao sets `weights_only=True` in [tor - + ```py # don't serialize model with Safetensors output_dir = "llama3-8b-int4wo-128" quantized_model.save_pretrained("llama3-8b-int4wo-128", safe_serialization=False) ``` + - + ```py # don't serialize model with Safetensors USER_ID = "your_huggingface_user_id" @@ -534,13 +542,14 @@ REPO_ID = "llama3-8b-int4wo-128" quantized_model.push_to_hub(f"{USER_ID}/llama3-8b-int4wo-128", safe_serialization=False) tokenizer.push_to_hub(f"{USER_ID}/llama3-8b-int4wo-128") ``` + - ## Loading quantized models Loading a quantized model depends on the quantization scheme. For quantization schemes, like int8 and float8, you can quantize the model on any device and also load it on any device. The example below demonstrates quantizing a model on the CPU and then loading it on CUDA or XPU. + ```py import torch from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer @@ -574,6 +583,7 @@ output = reloaded_model.generate(**input_ids, max_new_tokens=10) print(tokenizer.decode(output[0], skip_special_tokens=True)) ``` + For int4, the model can only be loaded on the same device it was quantized on because the layout is specific to the device. The example below demonstrates quantizing and loading a model on the CPU. ```py @@ -641,8 +651,6 @@ print(tokenizer.decode(output[0], skip_special_tokens=True)) > > All configuration objects accept parameters for customization (e.g., `group_size`, `scheme`, `layout`). - - ## Resources For a better sense of expected performance, view the [benchmarks](https://github.com/pytorch/ao/tree/main/torchao/quantization#benchmarks) for various models with CUDA and XPU backends. You can also run the code below to benchmark a model yourself. diff --git a/docs/source/en/run_scripts.md b/docs/source/en/run_scripts.md index c3a4787575c0..594eb84b02a1 100644 --- a/docs/source/en/run_scripts.md +++ b/docs/source/en/run_scripts.md @@ -52,6 +52,7 @@ Start with a smaller dataset by including the `max_train_samples`, `max_eval_sam > [!WARNING] > Not all example scripts support the `max_predict_samples` parameter. Run the command below to check whether a script supports it or not. 
+> > ```bash > examples/pytorch/summarization/run_summarization.py -h > ``` @@ -104,7 +105,7 @@ torchrun \ ... ``` -PyTorch supports TPUs, hardware designed to accelerate performance, through the [PyTorch/XLA](https://github.com/pytorch/xla/blob/master/README.md) package. Launch the `xla_spawn.py` script and use `num _cores` to set the number of TPU cores to train with. +PyTorch supports TPUs, hardware designed to accelerate performance, through the [PyTorch/XLA](https://github.com/pytorch/xla/blob/master/README.md) package. Launch the `xla_spawn.py` script and use `num_cores` to set the number of TPU cores to train with. ```bash python xla_spawn.py --num_cores 8 pytorch/summarization/run_summarization.py \ diff --git a/docs/source/en/serialization.md b/docs/source/en/serialization.md index 831f163bed18..1fefe08d5ca9 100644 --- a/docs/source/en/serialization.md +++ b/docs/source/en/serialization.md @@ -38,6 +38,7 @@ pip install optimum[exporters] > [!TIP] > Refer to the [Export a model to ONNX with optimum.exporters.onnx](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli) guide for all available arguments or with the command below. +> > ```bash > optimum-cli export onnx --help > ``` @@ -50,7 +51,7 @@ optimum-cli export onnx --model distilbert/distilbert-base-uncased-distilled-squ You should see logs indicating the progress and showing where the resulting `model.onnx` is saved. -```bash +```text Validating ONNX model distilbert_base_uncased_squad_onnx/model.onnx... -[✓] ONNX model output names match reference model (start_logits, end_logits) - Validating ONNX Model output "start_logits": diff --git a/docs/source/en/serving.md b/docs/source/en/serving.md index f421a284950a..4287c5d2d5ec 100644 --- a/docs/source/en/serving.md +++ b/docs/source/en/serving.md @@ -16,7 +16,7 @@ rendered properly in your Markdown viewer. # Serving -Transformer models can be efficiently deployed using libraries such as vLLM, Text Generation Inference (TGI), and others. These libraries are designed for production-grade user-facing services, and can scale to multiple servers and millions of concurrent users. Refer to [Transformers as Backend for Inference Servers](./transformers_as_backends) for usage examples. +Transformer models can be efficiently deployed using libraries such as vLLM, Text Generation Inference (TGI), and others. These libraries are designed for production-grade user-facing services, and can scale to multiple servers and millions of concurrent users. Refer to [Transformers as Backend for Inference Servers](./transformers_as_backend) for usage examples. > [!TIP] > Responses API is now supported as an experimental API! Read more about it [here](#responses-api). @@ -24,19 +24,20 @@ Transformer models can be efficiently deployed using libraries such as vLLM, Tex You can also serve transformer models with the `transformers serve` CLI. With Continuous Batching, `serve` now delivers solid throughput and latency well suited for evaluation, experimentation, and moderate-load local or self-hosted deployments. While vLLM, SGLang, or other inference engines remain our recommendations for large-scale production, `serve` avoids the extra runtime and operational overhead, and is on track to gain more production-oriented features. 
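Because `transformers serve` exposes an OpenAI-compatible API, any OpenAI SDK client can talk to it. Below is a minimal sketch, assuming the server is already running on the default local address `http://localhost:8000/v1` and that `Qwen/Qwen2.5-0.5B-Instruct` stands in for whatever checkpoint you want the server to load; both are illustrative assumptions rather than fixed values.

```py
from openai import OpenAI

# Point the standard OpenAI client at the locally running `transformers serve` instance.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")

response = client.chat.completions.create(
    model="Qwen/Qwen2.5-0.5B-Instruct",  # illustrative model id; use any checkpoint your server can load
    messages=[{"role": "user", "content": "Say hello in one short sentence."}],
    max_tokens=32,
)
print(response.choices[0].message.content)
```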
In this document, we dive into the different supported endpoints and modalities; we also cover the setup of several user interfaces that can be used on top of `transformers serve` in the following guides: -- [Jan (text and MCP user interface)](./jan.md) -- [Cursor (IDE)](./cursor.md) -- [Open WebUI (text, image, speech user interface)](./open_webui.md) -- [Tiny-Agents (text and MCP CLI tool)](./tiny_agents.md) +- [Jan (text and MCP user interface)](./jan) +- [Cursor (IDE)](./cursor) +- [Open WebUI (text, image, speech user interface)](./open_webui) +- [Tiny-Agents (text and MCP CLI tool)](./tiny_agents) ## Serve CLI > [!WARNING] > This section is experimental and subject to change in future versions -You can serve models of diverse modalities supported by `transformers` with the `transformers serve` CLI. It spawns a local server that offers compatibility with the OpenAI SDK, which is the _de facto_ standard for LLM conversations and other related tasks. This way, you can use the server from many third party applications, or test it using the `transformers chat` CLI ([docs](conversations.md#chat-cli)). +You can serve models of diverse modalities supported by `transformers` with the `transformers serve` CLI. It spawns a local server that offers compatibility with the OpenAI SDK, which is the _de facto_ standard for LLM conversations and other related tasks. This way, you can use the server from many third party applications, or test it using the `transformers chat` CLI ([docs](conversations#chat-cli)). The server supports the following REST APIs: + - `/v1/chat/completions` - `/v1/responses` - `/v1/audio/transcriptions` @@ -356,7 +357,6 @@ ResponseCompletedEvent(response=Response(id='resp_req_0', created_at=1754060400. - ## MCP integration The `transformers serve` server is also an MCP client, so it can interact with MCP tools in agentic use cases. This, of course, requires the use of an LLM that is designed to use tools. @@ -382,7 +382,6 @@ transformers serve \ --attn_implementation sdpa_paged ``` - ### Performance tips - Use an efficient attention backend when available: @@ -401,5 +400,3 @@ transformers serve \ - `--load_in_4bit`/`--load_in_8bit` can reduce memory footprint for LoRA setups - `--force-model ` avoids per-request model hints and helps produce stable, repeatable runs - - diff --git a/docs/source/en/tasks/audio_classification.md b/docs/source/en/tasks/audio_classification.md index 973f95e1e955..844b5caec052 100644 --- a/docs/source/en/tasks/audio_classification.md +++ b/docs/source/en/tasks/audio_classification.md @@ -212,7 +212,6 @@ At this point, only three steps remain: 2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. 3. Call [`~Trainer.train`] to fine-tune your model. - ```py >>> training_args = TrainingArguments( ... output_dir="my_awesome_mind_model", diff --git a/docs/source/en/tasks/document_question_answering.md b/docs/source/en/tasks/document_question_answering.md index d83e025c4090..2c729f76adcb 100644 --- a/docs/source/en/tasks/document_question_answering.md +++ b/docs/source/en/tasks/document_question_answering.md @@ -104,6 +104,7 @@ yourself with the features. 
``` Here's what the individual fields represent: + * `id`: the example's id * `image`: a PIL.Image.Image object containing the document image * `query`: the question string - natural language asked question, in several languages @@ -257,6 +258,7 @@ Once examples are encoded, however, they will look like this: ``` We'll need to find the position of the answer in the encoded input. + * `token_type_ids` tells us which tokens are part of the question, and which ones are part of the document's words. * `tokenizer.cls_token_id` will help find the special token at the beginning of the input. * `word_ids` will help match the answer found in the original `words` to the same answer in the full encoded input and determine @@ -365,6 +367,7 @@ of the Hugging Face course for inspiration. Congratulations! You've successfully navigated the toughest part of this guide and now you are ready to train your own model. Training involves the following steps: + * Load the model with [`AutoModelForDocumentQuestionAnswering`] using the same checkpoint as in the preprocessing. * Define your training hyperparameters in [`TrainingArguments`]. * Define a function to batch examples together, here the [`DefaultDataCollator`] will do just fine @@ -439,6 +442,7 @@ Now that you have finetuned a LayoutLMv2 model, and uploaded it to the 🤗 Hub, way to try out your finetuned model for inference is to use it in a [`Pipeline`]. Let's take an example: + ```py >>> example = dataset["test"][2] >>> question = example["query"]["en"] @@ -464,6 +468,7 @@ document question answering with your model, and pass the image + question combi ``` You can also manually replicate the results of the pipeline if you'd like: + 1. Take an image and a question, prepare them for the model using the processor from your model. 2. Forward the result or preprocessing through the model. 3. The model returns `start_logits` and `end_logits`, which indicate which token is at the start of the answer and diff --git a/docs/source/en/tasks/idefics.md b/docs/source/en/tasks/idefics.md index 3f8915f3cc99..b03c7bccd9c2 100644 --- a/docs/source/en/tasks/idefics.md +++ b/docs/source/en/tasks/idefics.md @@ -18,26 +18,27 @@ rendered properly in your Markdown viewer. [[open-in-colab]] -While individual tasks can be tackled by fine-tuning specialized models, an alternative approach -that has recently emerged and gained popularity is to use large models for a diverse set of tasks without fine-tuning. -For instance, large language models can handle such NLP tasks as summarization, translation, classification, and more. -This approach is no longer limited to a single modality, such as text, and in this guide, we will illustrate how you can -solve image-text tasks with a large multimodal model called IDEFICS. - -[IDEFICS](../model_doc/idefics) is an open-access vision and language model based on [Flamingo](https://huggingface.co/papers/2204.14198), -a state-of-the-art visual language model initially developed by DeepMind. The model accepts arbitrary sequences of image -and text inputs and generates coherent text as output. It can answer questions about images, describe visual content, -create stories grounded in multiple images, and so on. IDEFICS comes in two variants - [80 billion parameters](https://huggingface.co/HuggingFaceM4/idefics-80b) -and [9 billion parameters](https://huggingface.co/HuggingFaceM4/idefics-9b), both of which are available on the 🤗 Hub. 
For each variant, you can also find fine-tuned instructed +While individual tasks can be tackled by fine-tuning specialized models, an alternative approach +that has recently emerged and gained popularity is to use large models for a diverse set of tasks without fine-tuning. +For instance, large language models can handle such NLP tasks as summarization, translation, classification, and more. +This approach is no longer limited to a single modality, such as text, and in this guide, we will illustrate how you can +solve image-text tasks with a large multimodal model called IDEFICS. + +[IDEFICS](../model_doc/idefics) is an open-access vision and language model based on [Flamingo](https://huggingface.co/papers/2204.14198), +a state-of-the-art visual language model initially developed by DeepMind. The model accepts arbitrary sequences of image +and text inputs and generates coherent text as output. It can answer questions about images, describe visual content, +create stories grounded in multiple images, and so on. IDEFICS comes in two variants - [80 billion parameters](https://huggingface.co/HuggingFaceM4/idefics-80b) +and [9 billion parameters](https://huggingface.co/HuggingFaceM4/idefics-9b), both of which are available on the 🤗 Hub. For each variant, you can also find fine-tuned instructed versions of the model adapted for conversational use cases. -This model is exceptionally versatile and can be used for a wide range of image and multimodal tasks. However, -being a large model means it requires significant computational resources and infrastructure. It is up to you to decide whether -this approach suits your use case better than fine-tuning specialized models for each individual task. +This model is exceptionally versatile and can be used for a wide range of image and multimodal tasks. However, +being a large model means it requires significant computational resources and infrastructure. It is up to you to decide whether +this approach suits your use case better than fine-tuning specialized models for each individual task. + +In this guide, you'll learn how to: -In this guide, you'll learn how to: - [Load IDEFICS](#loading-the-model) and [load the quantized version of the model](#quantized-model) -- Use IDEFICS for: +- Use IDEFICS for: - [Image captioning](#image-captioning) - [Prompted image captioning](#prompted-image-captioning) - [Few-shot prompting](#few-shot-prompting) @@ -47,7 +48,7 @@ In this guide, you'll learn how to: - [Run inference in batch mode](#running-inference-in-batch-mode) - [Run IDEFICS instruct for conversational use](#idefics-instruct-for-conversational-use) -Before you begin, make sure you have all the necessary libraries installed. +Before you begin, make sure you have all the necessary libraries installed. ```bash pip install -q bitsandbytes sentencepiece accelerate transformers @@ -59,14 +60,14 @@ To run the following examples with a non-quantized version of the model checkpoi ## Loading the model -Let's start by loading the model's 9 billion parameters checkpoint: +Let's start by loading the model's 9 billion parameters checkpoint: ```py >>> checkpoint = "HuggingFaceM4/idefics-9b" ``` -Just like for other Transformers models, you need to load a processor and the model itself from the checkpoint. -The IDEFICS processor wraps a [`LlamaTokenizer`] and IDEFICS image processor into a single processor to take care of +Just like for other Transformers models, you need to load a processor and the model itself from the checkpoint. 
+The IDEFICS processor wraps a [`LlamaTokenizer`] and IDEFICS image processor into a single processor to take care of preparing text and image inputs for the model. ```py @@ -79,13 +80,13 @@ preparing text and image inputs for the model. >>> model = IdeficsForVisionText2Text.from_pretrained(checkpoint, dtype=torch.bfloat16, device_map="auto") ``` -Setting `device_map` to `"auto"` will automatically determine how to load and store the model weights in the most optimized +Setting `device_map` to `"auto"` will automatically determine how to load and store the model weights in the most optimized manner given existing devices. ### Quantized model -If high-memory device availability is an issue, you can load the quantized version of the model. To load the model and the -processor in 4bit precision, pass a `BitsAndBytesConfig` to the `from_pretrained` method and the model will be compressed +If high-memory device availability is an issue, you can load the quantized version of the model. To load the model and the +processor in 4bit precision, pass a `BitsAndBytesConfig` to the `from_pretrained` method and the model will be compressed on the fly while loading. ```py @@ -109,8 +110,8 @@ on the fly while loading. Now that you have the model loaded in one of the suggested ways, let's move on to exploring tasks that you can use IDEFICS for. ## Image captioning -Image captioning is the task of predicting a caption for a given image. A common application is to aid visually impaired -people navigate through different situations, for instance, explore image content online. +Image captioning is the task of predicting a caption for a given image. A common application is to aid visually impaired +people navigate through different situations, for instance, explore image content online. To illustrate the task, get an image to be captioned, e.g.: @@ -118,10 +119,10 @@ To illustrate the task, get an image to be captioned, e.g.: Image of a puppy in a flower bed
-Photo by [Hendo Wang](https://unsplash.com/@hendoo). +Photo by [Hendo Wang](https://unsplash.com/@hendoo). -IDEFICS accepts text and image prompts. However, to caption an image, you do not have to provide a text prompt to the -model, only the preprocessed input image. Without a text prompt, the model will start generating text from the +IDEFICS accepts text and image prompts. However, to caption an image, you do not have to provide a text prompt to the +model, only the preprocessed input image. Without a text prompt, the model will start generating text from the BOS (beginning-of-sequence) token thus creating a caption. As image input to the model, you can use either an image object (`PIL.Image`) or a url from which the image can be retrieved. @@ -142,15 +143,15 @@ A puppy in a flower bed -It is a good idea to include the `bad_words_ids` in the call to `generate` to avoid errors arising when increasing -the `max_new_tokens`: the model will want to generate a new `` or `` token when there +It is a good idea to include the `bad_words_ids` in the call to `generate` to avoid errors arising when increasing +the `max_new_tokens`: the model will want to generate a new `` or `` token when there is no image being generated by the model. You can set it on-the-fly as in this guide, or store in the `GenerationConfig` as described in the [Text generation strategies](../generation_strategies) guide. ## Prompted image captioning -You can extend image captioning by providing a text prompt, which the model will continue given the image. Let's take +You can extend image captioning by providing a text prompt, which the model will continue given the image. Let's take another image to illustrate:
@@ -158,7 +159,7 @@ another image to illustrate:
Photo by [Denys Nevozhai](https://unsplash.com/@dnevozhai). - + Textual and image prompts can be passed to the model's processor as a single list to create appropriate inputs. ```py @@ -178,12 +179,12 @@ This is an image of the Eiffel Tower in Paris, France. ## Few-shot prompting -While IDEFICS demonstrates great zero-shot results, your task may require a certain format of the caption, or come with +While IDEFICS demonstrates great zero-shot results, your task may require a certain format of the caption, or come with other restrictions or requirements that increase task's complexity. Few-shot prompting can be used to enable in-context learning. -By providing examples in the prompt, you can steer the model to generate results that mimic the format of given examples. +By providing examples in the prompt, you can steer the model to generate results that mimic the format of given examples. -Let's use the previous image of the Eiffel Tower as an example for the model and build a prompt that demonstrates to the model -that in addition to learning what the object in an image is, we would also like to get some interesting information about it. +Let's use the previous image of the Eiffel Tower as an example for the model and build a prompt that demonstrates to the model +that in addition to learning what the object in an image is, we would also like to get some interesting information about it. Then, let's see, if we can get the same response format for an image of the Statue of Liberty:
@@ -213,24 +214,24 @@ User: Describe this image. Assistant: An image of the Statue of Liberty. Fun fact: the Statue of Liberty is 151 feet tall. ``` -Notice that just from a single example (i.e., 1-shot) the model has learned how to perform the task. For more complex tasks, +Notice that just from a single example (i.e., 1-shot) the model has learned how to perform the task. For more complex tasks, feel free to experiment with a larger number of examples (e.g., 3-shot, 5-shot, etc.). ## Visual question answering -Visual Question Answering (VQA) is the task of answering open-ended questions based on an image. Similar to image -captioning it can be used in accessibility applications, but also in education (reasoning about visual materials), customer +Visual Question Answering (VQA) is the task of answering open-ended questions based on an image. Similar to image +captioning it can be used in accessibility applications, but also in education (reasoning about visual materials), customer service (questions about products based on images), and image retrieval. -Let's get a new image for this task: +Let's get a new image for this task:
Image of a couple having a picnic
-Photo by [Jarritos Mexican Soda](https://unsplash.com/@jarritos). +Photo by [Jarritos Mexican Soda](https://unsplash.com/@jarritos). -You can steer the model from image captioning to visual question answering by prompting it with appropriate instructions: +You can steer the model from image captioning to visual question answering by prompting it with appropriate instructions: ```py >>> prompt = [ @@ -251,11 +252,11 @@ Instruction: Provide an answer to the question. Use the image to answer. ## Image classification -IDEFICS is capable of classifying images into different categories without being explicitly trained on data containing -labeled examples from those specific categories. Given a list of categories and using its image and text understanding -capabilities, the model can infer which category the image likely belongs to. +IDEFICS is capable of classifying images into different categories without being explicitly trained on data containing +labeled examples from those specific categories. Given a list of categories and using its image and text understanding +capabilities, the model can infer which category the image likely belongs to. -Say, we have this image of a vegetable stand: +Say, we have this image of a vegetable stand:
Image of a vegetable stand @@ -286,10 +287,10 @@ In the example above we instruct the model to classify the image into a single c ## Image-guided text generation -For more creative applications, you can use image-guided text generation to generate text based on an image. This can be -useful to create descriptions of products, ads, descriptions of a scene, etc. +For more creative applications, you can use image-guided text generation to generate text based on an image. This can be +useful to create descriptions of products, ads, descriptions of a scene, etc. -Let's prompt IDEFICS to write a story based on a simple image of a red door: +Let's prompt IDEFICS to write a story based on a simple image of a red door:
Image of a red door with a pumpkin on the steps @@ -333,14 +334,14 @@ Looks like IDEFICS noticed the pumpkin on the doorstep and went with a spooky Ha -For longer outputs like this, you will greatly benefit from tweaking the text generation strategy. This can help -you significantly improve the quality of the generated output. Check out [Text generation strategies](../generation_strategies) -to learn more. +For longer outputs like this, you will greatly benefit from tweaking the text generation strategy. This can help +you significantly improve the quality of the generated output. Check out [Text generation strategies](../generation_strategies) +to learn more. ## Running inference in batch mode -All of the earlier sections illustrated IDEFICS for a single example. In a very similar fashion, you can run inference +All of the earlier sections illustrated IDEFICS for a single example. In a very similar fashion, you can run inference for a batch of examples by passing a list of prompts: ```py @@ -375,13 +376,13 @@ This is an image of a vegetable stand. ## IDEFICS instruct for conversational use -For conversational use cases, you can find fine-tuned instructed versions of the model on the 🤗 Hub: +For conversational use cases, you can find fine-tuned instructed versions of the model on the 🤗 Hub: `HuggingFaceM4/idefics-80b-instruct` and `HuggingFaceM4/idefics-9b-instruct`. -These checkpoints are the result of fine-tuning the respective base models on a mixture of supervised and instruction +These checkpoints are the result of fine-tuning the respective base models on a mixture of supervised and instruction fine-tuning datasets, which boosts the downstream performance while making the models more usable in conversational settings. -The use and prompting for the conversational use is very similar to using the base models: +The use and prompting for the conversational use is very similar to using the base models: ```py >>> import torch diff --git a/docs/source/en/tasks/image_captioning.md b/docs/source/en/tasks/image_captioning.md index f9716f29a204..4b4b3ba5fa36 100644 --- a/docs/source/en/tasks/image_captioning.md +++ b/docs/source/en/tasks/image_captioning.md @@ -14,7 +14,6 @@ rendered properly in your Markdown viewer. --> - # Image captioning [[open-in-colab]] @@ -26,7 +25,7 @@ helps to improve content accessibility for people by describing images to them. This guide will show you how to: * Fine-tune an image captioning model. -* Use the fine-tuned model for inference. +* Use the fine-tuned model for inference. Before you begin, make sure you have all the necessary libraries installed: @@ -37,7 +36,6 @@ pip install jiwer -q We encourage you to log in to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to log in: - ```python from huggingface_hub import notebook_login @@ -47,8 +45,7 @@ notebook_login() ## Load the Pokémon BLIP captions dataset Use the 🤗 Dataset library to load a dataset that consists of {image-caption} pairs. To create your own image captioning dataset -in PyTorch, you can follow [this notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/GIT/Fine_tune_GIT_on_an_image_captioning_dataset.ipynb). - +in PyTorch, you can follow [this notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/GIT/Fine_tune_GIT_on_an_image_captioning_dataset.ipynb). 
```python from datasets import load_dataset @@ -56,6 +53,7 @@ from datasets import load_dataset ds = load_dataset("lambdalabs/pokemon-blip-captions") ds ``` + ```bash DatasetDict({ train: Dataset({ @@ -69,12 +67,11 @@ The dataset has two features, `image` and `text`. -Many image captioning datasets contain multiple captions per image. In those cases, a common strategy is to randomly sample a caption amongst the available ones during training. +Many image captioning datasets contain multiple captions per image. In those cases, a common strategy is to randomly sample a caption amongst the available ones during training. -Split the dataset’s train split into a train and test set with the [`~datasets.Dataset.train_test_split`] method: - +Split the dataset's train split into a train and test set with the [`~datasets.Dataset.train_test_split`] method: ```python ds = ds["train"].train_test_split(test_size=0.1) @@ -82,8 +79,7 @@ train_ds = ds["train"] test_ds = ds["test"] ``` -Let's visualize a couple of samples from the training set. - +Let's visualize a couple of samples from the training set. ```python from textwrap import wrap @@ -106,7 +102,7 @@ sample_images_to_visualize = [np.array(train_ds[i]["image"]) for i in range(5)] sample_captions = [train_ds[i]["text"] for i in range(5)] plot_images(sample_images_to_visualize, sample_captions) ``` - +
Sample training images
@@ -115,7 +111,7 @@ plot_images(sample_images_to_visualize, sample_captions) Since the dataset has two modalities (image and text), the pre-processing pipeline will preprocess images and the captions. -To do so, load the processor class associated with the model you are about to fine-tune. +To do so, load the processor class associated with the model you are about to fine-tune. ```python from transformers import AutoProcessor @@ -124,7 +120,7 @@ checkpoint = "microsoft/git-base" processor = AutoProcessor.from_pretrained(checkpoint) ``` -The processor will internally pre-process the image (which includes resizing, and pixel scaling) and tokenize the caption. +The processor will internally pre-process the image (which includes resizing, and pixel scaling) and tokenize the caption. ```python def transforms(example_batch): @@ -139,13 +135,12 @@ train_ds.set_transform(transforms) test_ds.set_transform(transforms) ``` -With the dataset ready, you can now set up the model for fine-tuning. +With the dataset ready, you can now set up the model for fine-tuning. ## Load a base model Load the ["microsoft/git-base"](https://huggingface.co/microsoft/git-base) into a [`AutoModelForCausalLM`](https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoModelForCausalLM) object. - ```python from transformers import AutoModelForCausalLM @@ -154,10 +149,9 @@ model = AutoModelForCausalLM.from_pretrained(checkpoint) ## Evaluate -Image captioning models are typically evaluated with the [Rouge Score](https://huggingface.co/spaces/evaluate-metric/rouge) or [Word Error Rate](https://huggingface.co/spaces/evaluate-metric/wer). For this guide, you will use the Word Error Rate (WER). - -We use the 🤗 Evaluate library to do so. For potential limitations and other gotchas of the WER, refer to [this guide](https://huggingface.co/spaces/evaluate-metric/wer). +Image captioning models are typically evaluated with the [Rouge Score](https://huggingface.co/spaces/evaluate-metric/rouge) or [Word Error Rate](https://huggingface.co/spaces/evaluate-metric/wer). For this guide, you will use the Word Error Rate (WER). +We use the 🤗 Evaluate library to do so. For potential limitations and other gotchas of the WER, refer to [this guide](https://huggingface.co/spaces/evaluate-metric/wer). ```python from evaluate import load @@ -177,11 +171,10 @@ def compute_metrics(eval_pred): ## Train! -Now, you are ready to start fine-tuning the model. You will use the 🤗 [`Trainer`] for this. +Now, you are ready to start fine-tuning the model. You will use the 🤗 [`Trainer`] for this. First, define the training arguments using [`TrainingArguments`]. - ```python from transformers import TrainingArguments, Trainer @@ -208,7 +201,7 @@ training_args = TrainingArguments( ) ``` -Then pass them along with the datasets and the model to 🤗 Trainer. +Then pass them along with the datasets and the model to 🤗 Trainer. ```python trainer = Trainer( @@ -222,7 +215,7 @@ trainer = Trainer( To start training, simply call [`~Trainer.train`] on the [`Trainer`] object. -```python +```python trainer.train() ``` @@ -230,7 +223,6 @@ You should see the training loss drop smoothly as training progresses. Once training is completed, share your model to the Hub with the [`~Trainer.push_to_hub`] method so everyone can use your model: - ```python trainer.push_to_hub() ``` @@ -239,7 +231,6 @@ trainer.push_to_hub() Take a sample image from `test_ds` to test the model. - ```python from PIL import Image import requests @@ -252,7 +243,7 @@ image
Test image
- + Prepare image for the model. ```python @@ -263,13 +254,14 @@ inputs = processor(images=image, return_tensors="pt").to(device) pixel_values = inputs.pixel_values ``` -Call [`generate`] and decode the predictions. +Call [`generate`] and decode the predictions. ```python generated_ids = model.generate(pixel_values=pixel_values, max_length=50) generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] print(generated_caption) ``` + ```bash a drawing of a pink and blue pokemon ``` diff --git a/docs/source/en/tasks/image_classification.md b/docs/source/en/tasks/image_classification.md index 39b013f129cc..4754a91bd482 100644 --- a/docs/source/en/tasks/image_classification.md +++ b/docs/source/en/tasks/image_classification.md @@ -175,7 +175,6 @@ Your `compute_metrics` function is ready to go now, and you'll return to it when ## Train - If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! @@ -238,7 +237,6 @@ Once training is completed, share your model to the Hub with the [`~transformers >>> trainer.push_to_hub() ``` - For a more in-depth example of how to finetune a model for image classification, take a look at the corresponding [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). diff --git a/docs/source/en/tasks/image_feature_extraction.md b/docs/source/en/tasks/image_feature_extraction.md index 455a2b425d41..e08ba89e4dd8 100644 --- a/docs/source/en/tasks/image_feature_extraction.md +++ b/docs/source/en/tasks/image_feature_extraction.md @@ -27,7 +27,7 @@ In this guide, you will: ## Image Similarity using `image-feature-extraction` Pipeline -We have two images of cats sitting on top of fish nets, one of them is generated. +We have two images of cats sitting on top of fish nets, one of them is generated. ```python from PIL import Image @@ -66,7 +66,7 @@ print(outputs) # [[[-0.03909236937761307, 0.43381670117378235, -0.06913255900144577, ``` -To get the similarity score, we need to pass them to a similarity function. +To get the similarity score, we need to pass them to a similarity function. ```python from torch.nn.functional import cosine_similarity @@ -131,4 +131,3 @@ print(similarity_score) # tensor([0.6061], device='cuda:0', grad_fn=) ``` - diff --git a/docs/source/en/tasks/image_text_to_text.md b/docs/source/en/tasks/image_text_to_text.md index b34f4edf90f6..8820a534030c 100644 --- a/docs/source/en/tasks/image_text_to_text.md +++ b/docs/source/en/tasks/image_text_to_text.md @@ -23,6 +23,7 @@ Image-text-to-text models, also known as vision language models (VLMs), are lang In this guide, we provide a brief overview of VLMs and show how to use them with Transformers for inference. To begin with, there are multiple types of VLMs: + - base models used for fine-tuning - chat fine-tuned models for conversation - instruction fine-tuned models @@ -63,7 +64,6 @@ The image inputs look like the following. A bee on a pink flower
- ```python from PIL import Image import requests @@ -76,7 +76,6 @@ images = [Image.open(requests.get(img_urls[0], stream=True).raw), Below is an example of the chat template. We can feed conversation turns and the last message as an input by appending it at the end of the template. - ```python messages = [ { @@ -207,7 +206,6 @@ We can use [text streaming](./generation_strategies#streaming) for a better gene Assume we have an application that keeps chat history and takes in the new user input. We will preprocess the inputs as usual and initialize [`TextIteratorStreamer`] to handle the generation in a separate thread. This allows you to stream the generated text tokens in real-time. Any generation arguments can be passed to [`TextIteratorStreamer`]. - ```python import time from transformers import TextIteratorStreamer diff --git a/docs/source/en/tasks/image_to_image.md b/docs/source/en/tasks/image_to_image.md index da6a57ac9aa9..55380e9b0d1e 100644 --- a/docs/source/en/tasks/image_to_image.md +++ b/docs/source/en/tasks/image_to_image.md @@ -18,9 +18,10 @@ rendered properly in your Markdown viewer. [[open-in-colab]] -Image-to-Image task is the task where an application receives an image and outputs another image. This has various subtasks, including image enhancement (super resolution, low light enhancement, deraining and so on), image inpainting, and more. +Image-to-Image task is the task where an application receives an image and outputs another image. This has various subtasks, including image enhancement (super resolution, low light enhancement, deraining and so on), image inpainting, and more. This guide will show you how to: + - Use an image-to-image pipeline for super resolution task, - Run image-to-image models for same task without a pipeline. @@ -32,7 +33,7 @@ Let's begin by installing the necessary libraries. pip install transformers ``` -We can now initialize the pipeline with a [Swin2SR model](https://huggingface.co/caidas/swin2SR-lightweight-x2-64). We can then infer with the pipeline by calling it with an image. As of now, only [Swin2SR models](https://huggingface.co/models?sort=trending&search=swin2sr) are supported in this pipeline. +We can now initialize the pipeline with a [Swin2SR model](https://huggingface.co/caidas/swin2SR-lightweight-x2-64). We can then infer with the pipeline by calling it with an image. As of now, only [Swin2SR models](https://huggingface.co/models?sort=trending&search=swin2sr) are supported in this pipeline. ```python from transformers import pipeline, infer_device @@ -53,19 +54,22 @@ image = Image.open(requests.get(url, stream=True).raw) print(image.size) ``` + ```bash # (532, 432) ``` +
Photo of a cat
-We can now do inference with the pipeline. We will get an upscaled version of the cat image. +We can now do inference with the pipeline. We will get an upscaled version of the cat image. ```python upscaled = pipe(image) print(upscaled.size) ``` + ```bash # (1072, 880) ``` @@ -79,7 +83,7 @@ model = Swin2SRForImageSuperResolution.from_pretrained("caidas/swin2SR-lightweig processor = Swin2SRImageProcessor("caidas/swin2SR-lightweight-x2-64") ``` -`pipeline` abstracts away the preprocessing and postprocessing steps that we have to do ourselves, so let's preprocess the image. We will pass the image to the processor and then move the pixel values to GPU. +`pipeline` abstracts away the preprocessing and postprocessing steps that we have to do ourselves, so let's preprocess the image. We will pass the image to the processor and then move the pixel values to GPU. ```python pixel_values = processor(image, return_tensors="pt").pixel_values @@ -96,9 +100,10 @@ import torch with torch.no_grad(): outputs = model(pixel_values) ``` -Output is an object of type `ImageSuperResolutionOutput` that looks like below 👇 -``` +Output is an object of type `ImageSuperResolutionOutput` that looks like below 👇 + +```text (loss=None, reconstruction=tensor([[[[0.8270, 0.8269, 0.8275, ..., 0.7463, 0.7446, 0.7453], [0.8287, 0.8278, 0.8283, ..., 0.7451, 0.7448, 0.7457], [0.8280, 0.8273, 0.8269, ..., 0.7447, 0.7446, 0.7452], @@ -108,6 +113,7 @@ Output is an object of type `ImageSuperResolutionOutput` that looks like below [0.5927, 0.5914, 0.5922, ..., 0.0664, 0.0694, 0.0718]]]], device='cuda:0'), hidden_states=None, attentions=None) ``` + We need to get the `reconstruction` and post-process it for visualization. Let's see how it looks like. ```python @@ -128,6 +134,7 @@ output = np.moveaxis(output, source=0, destination=-1) output = (output * 255.0).round().astype(np.uint8) Image.fromarray(output) ``` +
Upscaled photo of a cat
diff --git a/docs/source/en/tasks/keypoint_detection.md b/docs/source/en/tasks/keypoint_detection.md index 3a5871d01a2b..c850c67ae153 100644 --- a/docs/source/en/tasks/keypoint_detection.md +++ b/docs/source/en/tasks/keypoint_detection.md @@ -18,7 +18,7 @@ rendered properly in your Markdown viewer. [[open-in-colab]] -Keypoint detection identifies and locates specific points of interest within an image. These keypoints, also known as landmarks, represent meaningful features of objects, such as facial features or object parts. These models take an image input and return the following outputs: +Keypoint detection identifies and locates specific points of interest within an image. These keypoints, also known as landmarks, represent meaningful features of objects, such as facial features or object parts. These models take an image input and return the following outputs: - **Keypoints and Scores**: Points of interest and their confidence scores. - **Descriptors**: A representation of the image region surrounding each keypoint, capturing its texture, gradient, orientation and other properties. @@ -36,15 +36,14 @@ model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/sup Let's test the model on the images below.
- Bee - Cats
- ```python import torch from PIL import Image @@ -93,7 +92,7 @@ image_sizes = [(image.size[1], image.size[0]) for image in images] outputs = processor.post_process_keypoint_detection(outputs, image_sizes) ``` -The outputs are now a list of dictionaries where each dictionary is a processed output of keypoints, scores and descriptors. +The outputs are now a list of dictionaries where each dictionary is a processed output of keypoints, scores and descriptors. ```python [{'keypoints': tensor([[ 226, 57], @@ -144,11 +143,10 @@ for i in range(len(images)): Below you can see the outputs.
- Bee - Cats
- diff --git a/docs/source/en/tasks/keypoint_matching.md b/docs/source/en/tasks/keypoint_matching.md index f7065f315211..7183c308c27a 100644 --- a/docs/source/en/tasks/keypoint_matching.md +++ b/docs/source/en/tasks/keypoint_matching.md @@ -34,15 +34,15 @@ model = AutoModelForKeypointMatching.from_pretrained("zju-community/matchanythin Load two images that have the same object of interest. The second photo is taken a second apart, it's colors are edited, and it is further cropped and rotated.
- Bee - Bee edited
-```python +```python from transformers.image_utils import load_image image1 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg") image2 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee_edited.jpg") @@ -69,7 +69,7 @@ print(outputs) Here's the outputs. -``` +```text [{'keypoints0': tensor([[4514, 550], [4813, 683], [1972, 1547], @@ -82,16 +82,16 @@ Here's the outputs. [1521, 2560]], dtype=torch.int32), 'matching_scores': tensor([0.2189, 0.2073, 0.2414, ... ])}] -``` +``` We have trimmed the output but there's 401 matches! ```python len(outputs[0]["keypoints0"]) # 401 -``` +``` -We can visualize them using the processor's [`~EfficientLoFTRImageProcessor.visualize_keypoint_matching`] method. +We can visualize them using the processor's [`~EfficientLoFTRImageProcessor.visualize_keypoint_matching`] method. ```python plot_images = processor.visualize_keypoint_matching(images, outputs) @@ -100,7 +100,7 @@ plot_images ![Matched Image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/matched_bees.png) -Optionally, you can use the [`Pipeline`] API and set the task to `keypoint-matching`. +Optionally, you can use the [`Pipeline`] API and set the task to `keypoint-matching`. ```python from transformers import pipeline diff --git a/docs/source/en/tasks/knowledge_distillation_for_image_classification.md b/docs/source/en/tasks/knowledge_distillation_for_image_classification.md index 7c4a684d3c05..d4b3dd8511df 100644 --- a/docs/source/en/tasks/knowledge_distillation_for_image_classification.md +++ b/docs/source/en/tasks/knowledge_distillation_for_image_classification.md @@ -52,7 +52,6 @@ processed_datasets = dataset.map(process, batched=True) Essentially, we want the student model (a randomly initialized MobileNet) to mimic the teacher model (fine-tuned vision transformer). To achieve this, we first get the logits output from the teacher and the student. Then, we divide each of them by the parameter `temperature` which controls the importance of each soft target. A parameter called `lambda` weighs the importance of the distillation loss. In this example, we will use `temperature=5` and `lambda=0.5`. We will use the Kullback-Leibler Divergence loss to compute the divergence between the student and teacher. Given two data P and Q, KL Divergence explains how much extra information we need to represent P using Q. If two are identical, their KL divergence is zero, as there's no other information needed to explain P from Q. Thus, in the context of knowledge distillation, KL divergence is useful. - ```python from transformers import TrainingArguments, Trainer, infer_device import torch diff --git a/docs/source/en/tasks/mask_generation.md b/docs/source/en/tasks/mask_generation.md index 5f66e68c2452..817cb9819e7d 100644 --- a/docs/source/en/tasks/mask_generation.md +++ b/docs/source/en/tasks/mask_generation.md @@ -16,24 +16,26 @@ rendered properly in your Markdown viewer. # Mask Generation -Mask generation is the task of generating semantically meaningful masks for an image. -This task is very similar to [image segmentation](semantic_segmentation), but many differences exist. Image segmentation models are trained on labeled datasets and are limited to the classes they have seen during training; they return a set of masks and corresponding classes, given an image. +Mask generation is the task of generating semantically meaningful masks for an image. 
+This task is very similar to [image segmentation](semantic_segmentation), but many differences exist. Image segmentation models are trained on labeled datasets and are limited to the classes they have seen during training; they return a set of masks and corresponding classes, given an image. -Mask generation models are trained on large amounts of data and operate in two modes. -- Prompting mode: In this mode, the model takes in an image and a prompt, where a prompt can be a 2D point location (XY coordinates) in the image within an object or a bounding box surrounding an object. In prompting mode, the model only returns the mask over the object -that the prompt is pointing out. -- Segment Everything mode: In segment everything, given an image, the model generates every mask in the image. To do so, a grid of points is generated and overlaid on the image for inference. +Mask generation models are trained on large amounts of data and operate in two modes. -Mask generation task is supported by [Segment Anything Model (SAM)](model_doc/sam). It's a powerful model that consists of a Vision Transformer-based image encoder, a prompt encoder, and a two-way transformer mask decoder. Images and prompts are encoded, and the decoder takes these embeddings and generates valid masks. +- Prompting mode: In this mode, the model takes in an image and a prompt, where a prompt can be a 2D point location (XY coordinates) in the image within an object or a bounding box surrounding an object. In prompting mode, the model only returns the mask over the object +that the prompt is pointing out. +- Segment Everything mode: In segment everything, given an image, the model generates every mask in the image. To do so, a grid of points is generated and overlaid on the image for inference. + +Mask generation task is supported by [Segment Anything Model (SAM)](model_doc/sam). It's a powerful model that consists of a Vision Transformer-based image encoder, a prompt encoder, and a two-way transformer mask decoder. Images and prompts are encoded, and the decoder takes these embeddings and generates valid masks.
SAM Architecture
-SAM serves as a powerful foundation model for segmentation as it has large data coverage. It is trained on -[SA-1B](https://ai.meta.com/datasets/segment-anything/), a dataset with 1 million images and 1.1 billion masks. +SAM serves as a powerful foundation model for segmentation as it has large data coverage. It is trained on +[SA-1B](https://ai.meta.com/datasets/segment-anything/), a dataset with 1 million images and 1.1 billion masks. In this guide, you will learn how to: + - Infer in segment everything mode with batching, - Infer in point prompting mode, - Infer in box prompting mode. @@ -114,7 +116,6 @@ Below is the original image in grayscale with colorful maps overlaid. Very impre Visualized
- ## Model Inference ### Point Prompting @@ -132,7 +133,7 @@ processor = SamProcessor.from_pretrained("facebook/sam-vit-base") To do point prompting, pass the input point to the processor, then take the processor output and pass it to the model for inference. To post-process the model output, pass the outputs and -`original_sizes` and `reshaped_input_sizes` we take from the processor's initial output. We need to pass these +`original_sizes` and `reshaped_input_sizes` we take from the processor's initial output. We need to pass these since the processor resizes the image, and the output needs to be extrapolated. ```python @@ -143,6 +144,7 @@ with torch.no_grad(): outputs = model(**inputs) masks = processor.image_processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()) ``` + We can visualize the three masks in the `masks` output. ```python @@ -177,10 +179,9 @@ plt.show() ### Box Prompting You can also do box prompting in a similar fashion to point prompting. You can simply pass the input box in the format of a list -`[x_min, y_min, x_max, y_max]` format along with the image to the `processor`. Take the processor output and directly pass it +`[x_min, y_min, x_max, y_max]` format along with the image to the `processor`. Take the processor output and directly pass it to the model, then post-process the output again. - ```python # bounding box around the bee box = [2350, 1600, 2850, 2100] @@ -219,7 +220,7 @@ plt.show() Visualized Bbox
-You can see the inference output below. +You can see the inference output below. ```python fig, ax = plt.subplots() @@ -233,4 +234,3 @@ plt.show()
Visualized Inference
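To tie the point-prompting steps from the `mask_generation.md` hunks above together, here is a compact, self-contained sketch of the full flow. It reuses the `facebook/sam-vit-base` checkpoint and the bee image from the guide; the prompt point coordinates are an arbitrary placeholder, so treat this as an illustration of the API rather than the guide's exact example.

```python
import torch
import requests
from PIL import Image
from transformers import SamModel, SamProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"
model = SamModel.from_pretrained("facebook/sam-vit-base").to(device)
processor = SamProcessor.from_pretrained("facebook/sam-vit-base")

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

# One prompt point per image, given as [[x, y]] pixel coordinates (placeholder values).
input_points = [[[2592, 1728]]]
inputs = processor(image, input_points=input_points, return_tensors="pt").to(device)

with torch.no_grad():
    outputs = model(**inputs)

# Post-process with the original and reshaped sizes so the masks are extrapolated
# back to the original resolution, as described above.
masks = processor.image_processor.post_process_masks(
    outputs.pred_masks.cpu(),
    inputs["original_sizes"].cpu(),
    inputs["reshaped_input_sizes"].cpu(),
)
print(masks[0].shape)  # three candidate masks for the single prompt point
```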
- diff --git a/docs/source/en/tasks/masked_language_modeling.md b/docs/source/en/tasks/masked_language_modeling.md index 3c024739d738..619374f91dae 100644 --- a/docs/source/en/tasks/masked_language_modeling.md +++ b/docs/source/en/tasks/masked_language_modeling.md @@ -150,6 +150,7 @@ To apply this preprocessing function over the entire dataset, use the 🤗 Datas This dataset contains the token sequences, but some of these are longer than the maximum input length for the model. You can now use a second preprocessing function to + - concatenate all the sequences - split the concatenated sequences into shorter chunks defined by `block_size`, which should be both shorter than the maximum input length and short enough for your GPU RAM. diff --git a/docs/source/en/tasks/monocular_depth_estimation.md b/docs/source/en/tasks/monocular_depth_estimation.md index c90abce1cd57..aef9bd22c4d3 100644 --- a/docs/source/en/tasks/monocular_depth_estimation.md +++ b/docs/source/en/tasks/monocular_depth_estimation.md @@ -23,7 +23,7 @@ a single camera viewpoint. Monocular depth estimation has various applications, including 3D reconstruction, augmented reality, autonomous driving, and robotics. It is a challenging task as it requires the model to understand the complex relationships between objects in the scene and the corresponding depth information, which can be affected by factors such as lighting conditions, -occlusion, and texture. +occlusion, and texture. There are two main depth estimation categories: @@ -143,7 +143,7 @@ Let's post-process the results to remove any padding and resize the depth map to

In the original implementation, the ZoeDepth model performs inference on both the original and flipped images and averages the results. The `post_process_depth_estimation` function can handle this for us if we pass the flipped outputs to the optional `outputs_flipped` argument:

->>> with torch.no_grad():   
+>>> with torch.no_grad():
 ...     outputs = model(pixel_values)
 ...     outputs_flipped = model(pixel_values=torch.flip(inputs.pixel_values, dims=[3]))
 >>> post_processed_output = image_processor.post_process_depth_estimation(
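The hunk above is cut off mid-call, so for completeness here is a hedged, self-contained sketch of the flip-and-average pattern it documents. The `Intel/zoedepth-nyu-kitti` checkpoint and the example image URL are assumptions for illustration; the key point is flipping `pixel_values` along the width axis and handing the second forward pass to `outputs_flipped`.

```python
import torch
import requests
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForDepthEstimation

checkpoint = "Intel/zoedepth-nyu-kitti"  # assumed ZoeDepth checkpoint
image_processor = AutoImageProcessor.from_pretrained(checkpoint)
model = AutoModelForDepthEstimation.from_pretrained(checkpoint)

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
inputs = image_processor(images=image, return_tensors="pt")

with torch.no_grad():
    outputs = model(pixel_values=inputs.pixel_values)
    # dim=3 is the width axis of the (batch, channels, height, width) tensor
    outputs_flipped = model(pixel_values=torch.flip(inputs.pixel_values, dims=[3]))

post_processed_output = image_processor.post_process_depth_estimation(
    outputs,
    source_sizes=[(image.height, image.width)],
    outputs_flipped=outputs_flipped,  # the two predictions are averaged during post-processing
)
predicted_depth = post_processed_output[0]["predicted_depth"]
print(predicted_depth.shape)
```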
diff --git a/docs/source/en/tasks/multiple_choice.md b/docs/source/en/tasks/multiple_choice.md
index 3f4c9d4637fb..d35f108ecce5 100644
--- a/docs/source/en/tasks/multiple_choice.md
+++ b/docs/source/en/tasks/multiple_choice.md
@@ -113,6 +113,7 @@ To apply the preprocessing function over the entire dataset, use 🤗 Datasets [
 ```
 
 To create a batch of examples, it's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length. [`DataCollatorForMultipleChoice`] flattens all the model inputs, applies padding, and then unflattens the results.
+
 ```py
 >>> from transformers import DataCollatorForMultipleChoice
 >>> collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
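As a quick illustration of the dynamic padding described above, the sketch below builds two toy multiple-choice examples of different lengths and collates them. The BERT checkpoint, prompts, endings, and labels are made-up placeholders; the point is that each batch is padded only to the longest sequence it contains.

```python
from transformers import AutoTokenizer, DataCollatorForMultipleChoice

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")  # assumed checkpoint
collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)

prompts = ["The weather is", "She opened the door and"]
endings = [
    ["sunny.", "raining quite heavily today.", "cold.", "windy."],
    ["smiled.", "saw absolutely nothing at all inside.", "left.", "froze."],
]

features = []
for prompt, choices in zip(prompts, endings):
    encoding = tokenizer([prompt] * len(choices), choices)  # one (prompt, ending) pair per choice
    features.append({**encoding, "label": 0})               # dummy label for illustration

batch = collator(features)
# Padded to the longest sequence in *this* batch, then unflattened per choice.
print(batch["input_ids"].shape)  # (num_examples, num_choices, longest_sequence_in_batch)
```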
@@ -197,7 +198,6 @@ Once training is completed, share your model to the Hub with the [`~transformers
 >>> trainer.push_to_hub()
 ```
 
-
 
 
 For a more in-depth example of how to finetune a model for multiple choice, take a look at the corresponding
diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md
index 394e77104b74..ef2a86190bbc 100644
--- a/docs/source/en/tasks/object_detection.md
+++ b/docs/source/en/tasks/object_detection.md
@@ -121,6 +121,7 @@ To get familiar with the data, explore what the examples look like.
 ```
 
 The examples in the dataset have the following fields:
+
 - `image_id`: the example image id
 - `image`: a `PIL.Image.Image` object containing the image
 - `width`: width of the image
@@ -171,11 +172,11 @@ To get an even better understanding of the data, visualize an example in the dat
 
 >>> image
 ```
+
 
CPPE-5 Image Example
- To visualize the bounding boxes with associated labels, you can get the labels from the dataset's metadata, specifically the `category` field. You'll also want to create dictionaries that map a label id to a label class (`id2label`) and the other way around (`label2id`). @@ -216,6 +217,7 @@ Instantiate the image processor from the same checkpoint as the model you want t ``` Before passing the images to the `image_processor`, apply two preprocessing transformations to the dataset: + - Augmenting images - Reformatting annotations to meet DETR expectations @@ -505,6 +507,7 @@ The images in this dataset are still quite large, even after resizing. This mean require at least one GPU. Training involves the following steps: + 1. Load the model with [`AutoModelForObjectDetection`] using the same checkpoint as in the preprocessing. 2. Define your training hyperparameters in [`TrainingArguments`]. 3. Pass the training arguments to [`Trainer`] along with the model, dataset, image processor, and data collator. @@ -527,9 +530,10 @@ and `id2label` maps that you created earlier from the dataset's metadata. Additi In the [`TrainingArguments`] use `output_dir` to specify where to save your model, then configure hyperparameters as you see fit. For `num_train_epochs=30` training will take about 35 minutes in Google Colab T4 GPU, increase the number of epoch to get better results. Important notes: - - Do not remove unused columns because this will drop the image column. Without the image column, you + +- Do not remove unused columns because this will drop the image column. Without the image column, you can't create `pixel_values`. For this reason, set `remove_unused_columns` to `False`. - - Set `eval_do_concat_batches=False` to get proper evaluation results. Images have different number of target boxes, if batches are concatenated we will not be able to determine which boxes belongs to particular image. +- Set `eval_do_concat_batches=False` to get proper evaluation results. Images have different number of target boxes, if batches are concatenated we will not be able to determine which boxes belongs to particular image. If you wish to share your model by pushing to the Hub, set `push_to_hub` to `True` (you must be signed in to Hugging Face to upload your model). @@ -576,6 +580,7 @@ Finally, bring everything together, and call [`~transformers.Trainer.train`]: >>> trainer.train() ``` +
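To make the two flags called out above concrete, here is a hedged sketch of a [`TrainingArguments`] setup that follows those notes. The output directory and hyperparameter values are illustrative placeholders rather than the guide's exact configuration.

```python
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="detr-cppe5-finetuned",  # placeholder name
    num_train_epochs=30,
    per_device_train_batch_size=8,
    learning_rate=5e-5,
    remove_unused_columns=False,   # keep the `image` column so `pixel_values` can be built
    eval_do_concat_batches=False,  # keep per-image predictions separate during evaluation
    eval_strategy="epoch",
    push_to_hub=False,             # set to True to upload the checkpoint after training
)
```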
@@ -1487,6 +1492,7 @@ Now that you have finetuned a model, evaluated it, and uploaded it to the Huggin ``` Load model and image processor from the Hugging Face Hub (skip to use already trained in this session): + ```py >>> from transformers import infer_device diff --git a/docs/source/en/tasks/prompting.md b/docs/source/en/tasks/prompting.md index eb8e61d67aaf..2678792c5f3d 100644 --- a/docs/source/en/tasks/prompting.md +++ b/docs/source/en/tasks/prompting.md @@ -80,7 +80,7 @@ This section covers a few prompting techniques. ### Few-shot prompting -Few-shot prompting improves accuracy and performance by including specific examples of what a model should generate given an input. The explicit examples give the model a better understanding of the task and the output format you’re looking for. Try experimenting with different numbers of examples (2, 4, 8, etc.) to see how it affects performance. The example below provides the model with 1 example (1-shot) of the output format (a date in MM/DD/YYYY format) it should return. +Few-shot prompting improves accuracy and performance by including specific examples of what a model should generate given an input. The explicit examples give the model a better understanding of the task and the output format you're looking for. Try experimenting with different numbers of examples (2, 4, 8, etc.) to see how it affects performance. The example below provides the model with 1 example (1-shot) of the output format (a date in MM/DD/YYYY format) it should return. ```python from transformers import pipeline @@ -127,7 +127,6 @@ for output in outputs: print(f"Result: {output['generated_text']}") ``` - While the basic few-shot prompting approach embedded examples within a single text string, the chat template format offers the following benefits. - The model may have a potentially improved understanding because it can better recognize the pattern and the expected roles of user input and assistant output. diff --git a/docs/source/en/tasks/semantic_segmentation.md b/docs/source/en/tasks/semantic_segmentation.md index 5d3c8e70aa1f..de88a0af6866 100644 --- a/docs/source/en/tasks/semantic_segmentation.md +++ b/docs/source/en/tasks/semantic_segmentation.md @@ -23,6 +23,7 @@ rendered properly in your Markdown viewer. Image segmentation models separate areas corresponding to different areas of interest in an image. These models work by assigning a label to each pixel. There are several types of segmentation: semantic segmentation, instance segmentation, and panoptic segmentation. In this guide, we will: + 1. [Take a look at different types of segmentation](#types-of-segmentation). 2. [Have an end-to-end fine-tuning example for semantic segmentation](#fine-tuning-a-model-for-segmentation). @@ -69,6 +70,7 @@ results ``` The segmentation pipeline output includes a mask for every predicted class. + ```bash [{'score': None, 'label': 'road', @@ -107,6 +109,7 @@ Taking a look at the mask for the car class, we can see every car is classified ```python results[-1]["mask"] ``` +
Semantic Segmentation Output
@@ -135,11 +138,13 @@ As you can see below, there are multiple cars classified, and there's no classif 'label': 'person', 'mask': }] ``` + Checking out one of the car masks below. ```python results[2]["mask"] ``` +
Semantic Segmentation Output
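If it helps to check a mask programmatically rather than visually, the short sketch below converts one of the returned masks to a NumPy array. It assumes `results` is the output of the instance segmentation pipeline shown in the hunk above; the pipeline returns each mask as a single-channel PIL image with 255 on the instance and 0 elsewhere.

```python
import numpy as np

mask = np.array(results[2]["mask"])  # assumes `results` from the pipeline call above
print(mask.shape, mask.dtype)        # (height, width) uint8
print(f"instance covers {100 * (mask > 0).mean():.1f}% of the image")
```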
@@ -151,6 +156,7 @@ panoptic_segmentation = pipeline("image-segmentation", "facebook/mask2former-swi results = panoptic_segmentation(image) results ``` + As you can see below, we have more classes. We will later illustrate to see that every pixel is classified into one of the classes. ```bash @@ -206,7 +212,6 @@ To see all architectures and checkpoints compatible with this task, we recommend - ### Load SceneParse150 dataset Start by loading a smaller subset of the SceneParse150 dataset from the 🤗 Datasets library. This'll give you a chance to experiment and make sure everything works before spending more time training on the full dataset. @@ -473,7 +478,6 @@ Reload the dataset and load an image for inference. Image of bedroom
- We will now see how to infer without a pipeline. Process the image with an image processor and place the `pixel_values` on a GPU: ```py @@ -503,7 +507,6 @@ Next, rescale the logits to the original image size: >>> pred_seg = upsampled_logits.argmax(dim=1)[0] ``` - To visualize the results, load the [dataset color palette](https://github.com/tensorflow/models/blob/3f1ca33afe3c1631b733ea7e40c294273b9e406d/research/deeplab/utils/get_dataset_colormap.py#L51) as `ade_palette()` that maps each class to their RGB values. ```py diff --git a/docs/source/en/tasks/summarization.md b/docs/source/en/tasks/summarization.md index c57097421fbc..b2f2beebc806 100644 --- a/docs/source/en/tasks/summarization.md +++ b/docs/source/en/tasks/summarization.md @@ -213,7 +213,6 @@ Once training is completed, share your model to the Hub with the [`~transformers >>> trainer.push_to_hub() ``` - For a more in-depth example of how to finetune a model for summarization, take a look at the corresponding diff --git a/docs/source/en/tasks/token_classification.md b/docs/source/en/tasks/token_classification.md index 49b0fcf216b8..5096298affd1 100644 --- a/docs/source/en/tasks/token_classification.md +++ b/docs/source/en/tasks/token_classification.md @@ -242,7 +242,6 @@ Before you start training your model, create a map of the expected ids to their ... } ``` - If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! @@ -298,7 +297,6 @@ Once training is completed, share your model to the Hub with the [`~transformers >>> trainer.push_to_hub() ``` - For a more in-depth example of how to finetune a model for token classification, take a look at the corresponding diff --git a/docs/source/en/tasks/video_classification.md b/docs/source/en/tasks/video_classification.md index b387a8320dfc..bae638bd84ed 100644 --- a/docs/source/en/tasks/video_classification.md +++ b/docs/source/en/tasks/video_classification.md @@ -363,7 +363,6 @@ Leverage [`Trainer`](https://huggingface.co/docs/transformers/main_classes/train Most of the training arguments are self-explanatory, but one that is quite important here is `remove_unused_columns=False`. This one will drop any features not used by the model's call function. By default it's `True` because usually it's ideal to drop unused feature columns, making it easier to unpack inputs into the model's call function. But, in this case, you need the unused features ('video' in particular) in order to create `pixel_values` (which is a mandatory key our model expects in its inputs). - ```py >>> from transformers import TrainingArguments, Trainer @@ -477,7 +476,6 @@ The simplest way to try out your fine-tuned model for inference is to use it in You can also manually replicate the results of the `pipeline` if you'd like. - ```py >>> def run_inference(model, video): ... # (num_frames, num_channels, height, width) diff --git a/docs/source/en/tasks/video_text_to_text.md b/docs/source/en/tasks/video_text_to_text.md index 0e0191af5884..58ca97e9a56c 100644 --- a/docs/source/en/tasks/video_text_to_text.md +++ b/docs/source/en/tasks/video_text_to_text.md @@ -18,13 +18,14 @@ rendered properly in your Markdown viewer. [[open-in-colab]] -Video-text-to-text models, also known as video language models or vision language models with video input, are language models that take a video input. These models can tackle various tasks, from video question answering to video captioning. 
+Video-text-to-text models, also known as video language models or vision language models with video input, are language models that take a video input. These models can tackle various tasks, from video question answering to video captioning. -These models have nearly the same architecture as [image-text-to-text](../image_text_to_text) models except for some changes to accept video data, since video data is essentially image frames with temporal dependencies. Some image-text-to-text models take in multiple images, but this alone is inadequate for a model to accept videos. Moreover, video-text-to-text models are often trained with all vision modalities. Each example might have videos, multiple videos, images and multiple images. Some of these models can also take interleaved inputs. For example, you can refer to a specific video inside a string of text by adding a video token in text like "What is happening in this video? `
Pass the image and the candidate object labels to look for to the pipeline. -Here we pass the image directly; other suitable options include a local path to an image or an image url. We also pass text descriptions for all items we want to query the image for. +Here we pass the image directly; other suitable options include a local path to an image or an image url. We also pass text descriptions for all items we want to query the image for. ```py >>> predictions = detector( diff --git a/docs/source/en/testing.md b/docs/source/en/testing.md index 497c6b019311..01658aa2beb7 100644 --- a/docs/source/en/testing.md +++ b/docs/source/en/testing.md @@ -16,7 +16,6 @@ rendered properly in your Markdown viewer. # Testing - Let's take a look at how 🤗 Transformers models are tested and how you can write new tests and improve the existing ones. There are 2 test suites in the repository: @@ -51,12 +50,8 @@ RUN_SLOW=1 pytest examples/ The results can be observed [here](https://github.com/huggingface/transformers/actions). - - ## Running tests - - ### Choosing which tests to run This document goes into many details of how tests can be run. If after reading everything, you need even more details @@ -89,8 +84,6 @@ which tells pytest to: - do not capture output - run in verbose mode - - ### Getting the list of all tests All tests of the test suite: @@ -187,7 +180,6 @@ Sometimes you need to run `accelerate` tests on your models. For that you can ju RUN_SLOW=1 pytest -m accelerate_tests tests/models/opt/test_modeling_opt.py ``` - ### Run documentation tests In order to test whether the documentation examples are correct, you should check that the `doctests` are passing. @@ -217,9 +209,11 @@ Example: ``` Just run the following line to automatically test every docstring example in the desired file: + ```bash pytest --doctest-modules ``` + If the file has a markdown extension, you should add the `--doctest-glob="*.md"` argument. ### Run only modified tests @@ -266,12 +260,10 @@ or `pytest.ini`/``tox.ini`` files: looponfailroots = transformers tests ``` -This would lead to only looking for file changes in the respective directories, specified relatively to the ini-file’s -directory. +This would lead to only looking for file changes in the respective directories, specified relatively to the ini-file's directory. [pytest-watch](https://github.com/joeyespo/pytest-watch) is an alternative implementation of this functionality. - ### Skip a test module If you want to run all test modules, except a few you can exclude them by giving an explicit list of tests to run. For @@ -307,7 +299,6 @@ It's good to repeat the tests several times, in sequence, randomly, or in sets, inter-dependency and state-related bugs (tear down). And the straightforward multiple repetition is just good to detect some problems that get uncovered by randomness of DL. - #### Repeat tests - [pytest-flakefinder](https://github.com/dropbox/pytest-flakefinder): @@ -403,8 +394,6 @@ pytest -p no:sugar or uninstall it. 
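Circling back to the `zero_shot_object_detection.md` hunk near the top of this block, here is a hedged, self-contained sketch of calling the detector with an image and candidate labels. The OWL-ViT checkpoint, image URL, and label list are assumptions for illustration and may differ from the guide's own example.

```python
from transformers import pipeline

# Assumed zero-shot object detection checkpoint.
detector = pipeline(task="zero-shot-object-detection", model="google/owlvit-base-patch32")

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"
predictions = detector(url, candidate_labels=["bee", "flower", "leaf"])

for prediction in predictions:
    print(prediction["label"], round(prediction["score"], 3), prediction["box"])
```

Each prediction is a dict with a `label`, a `score`, and a `box` given as `xmin`/`ymin`/`xmax`/`ymax` pixel coordinates.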
- - #### Report each sub-test name and its progress For a single or a group of tests via `pytest` (after `pip install pytest-pspec`): @@ -457,7 +446,6 @@ decorators are used to set the requirements of tests CPU/GPU/XPU/TPU-wise: Let's depict the GPU requirements in the following table: - | n gpus | decorator | |--------|--------------------------------| | `>= 0` | `@require_torch` | @@ -466,7 +454,6 @@ Let's depict the GPU requirements in the following table: | `< 2` | `@require_torch_non_multi_gpu` | | `< 3` | `@require_torch_up_to_2_gpus` | - For example, here is a test that must be run only when there are 2 or more GPUs available and pytorch is installed: ```python no-style @@ -520,6 +507,7 @@ Certain devices will require an additional import after importing `torch` for th ```bash TRANSFORMERS_TEST_BACKEND="torch_npu" pytest tests/utils/test_logging.py ``` + Alternative backends may also require the replacement of device-specific functions. For example `torch.cuda.manual_seed` may need to be replaced with a device-specific seed setter like `torch.npu.manual_seed` or `torch.xpu.manual_seed` to correctly set a random seed on the device. To specify a new backend with backend-specific device functions when running the test suite, create a Python device specification file `spec.py` in the format: ```python @@ -536,6 +524,7 @@ MANUAL_SEED_FN = torch.npu.manual_seed EMPTY_CACHE_FN = torch.npu.empty_cache DEVICE_COUNT_FN = torch.npu.device_count ``` + This format also allows for specification of any additional imports required. To use this file to replace equivalent methods in the test suite, set the environment variable `TRANSFORMERS_TEST_DEVICE_SPEC` to the path of the spec file, e.g. `TRANSFORMERS_TEST_DEVICE_SPEC=spec.py`. Currently, only `MANUAL_SEED_FN`, `EMPTY_CACHE_FN` and `DEVICE_COUNT_FN` are supported for device-specific dispatch. @@ -610,7 +599,6 @@ You can read [here](https://docs.pytest.org/en/stable/unittest.html) which featu thing to remember is that most `pytest` fixtures don't work. Neither parametrization, but we use the module `parameterized` that works in a similar way. - ### Parametrization Often, there is a need to run the same test multiple times, but with different arguments. It could be done from within @@ -719,8 +707,6 @@ pytest test_this2.py::test_floor[negative--1.5--2.0] test_this2.py::test_floor[i as in the previous example. - - ### Files and directories In tests often we need to know where things are relative to the current test file, and it's not trivial since the test @@ -843,7 +829,6 @@ otherwise. If you need to temporary override `sys.path` to import from another test for example, you can use the `ExtendSysPath` context manager. Example: - ```python import os from transformers.testing_utils import ExtendSysPath @@ -860,13 +845,13 @@ commit it to the main repository we need make sure it's skipped during `make tes Methods: -- A **skip** means that you expect your test to pass only if some conditions are met, otherwise pytest should skip +- A **skip** means that you expect your test to pass only if some conditions are met, otherwise pytest should skip running the test altogether. Common examples are skipping windows-only tests on non-windows platforms, or skipping tests that depend on an external resource which is not available at the moment (for example a database). -- A **xfail** means that you expect a test to fail for some reason. A common example is a test for a feature not yet +- A **xfail** means that you expect a test to fail for some reason. 
A common example is a test for a feature not yet implemented, or a bug not yet fixed. When a test passes despite being expected to fail (marked with - pytest.mark.xfail), it’s an xpass and will be reported in the test summary. + pytest.mark.xfail), it's an xpass and will be reported in the test summary. One of the important differences between the two is that `skip` doesn't run the test, and `xfail` does. So if the code that's buggy causes some bad state that will affect other tests, do not use `xfail`. @@ -893,7 +878,6 @@ or the `xfail` way: def test_feature_x(): ``` - Here's how to skip a test based on internal checks within the test: ```python @@ -924,7 +908,7 @@ def test_feature_x(): docutils = pytest.importorskip("docutils", minversion="0.3") ``` -- Skip a test based on a condition: +- Skip a test based on a condition: ```python no-style @pytest.mark.skipif(sys.version_info < (3,6), reason="requires python3.6 or higher") @@ -1018,7 +1002,6 @@ That report is also useful to find slow outliers that aren't marked as such, or If you notice that the test suite starts getting slow on CI, the top listing of this report will show the slowest tests. - ### Testing the stdout/stderr output In order to test functions that write to `stdout` and/or `stderr`, the test can access those streams using the @@ -1141,7 +1124,6 @@ print(cs.err, cs.out) Also, to aid debugging test issues, by default these context managers automatically replay the captured streams on exit from the context. - ### Capturing logger stream If you need to validate the output of a logger, you can use `CaptureLogger`: @@ -1193,7 +1175,6 @@ called if anything. This helper method creates a copy of the `os.environ` object, so the original remains intact. - ### Getting reproducible results In some situations you may want to remove randomness for your tests. To get identical reproducible results set, you @@ -1241,9 +1222,6 @@ To trigger a self-push workflow CI job, you must: 4. Then you can see the job appear [here](https://github.com/huggingface/transformers/actions/workflows/self-push.yml). It may not run right away if there is a backlog. - - - ## Testing Experimental CI Features Testing CI features can be potentially problematic as it can interfere with the normal CI functioning. Therefore if a @@ -1306,7 +1284,7 @@ You can vote for this feature and see where it is at these CI-specific threads: ## DeepSpeed integration -For a PR that involves the DeepSpeed integration, keep in mind our CircleCI PR CI setup doesn't have GPUs. Tests requiring GPUs are run on a different CI nightly. This means if you get a passing CI report in your PR, it doesn’t mean the DeepSpeed tests pass. +For a PR that involves the DeepSpeed integration, keep in mind our CircleCI PR CI setup doesn't have GPUs. Tests requiring GPUs are run on a different CI nightly. This means if you get a passing CI report in your PR, it doesn't mean the DeepSpeed tests pass. To run DeepSpeed tests: diff --git a/docs/source/en/tiny_agents.md b/docs/source/en/tiny_agents.md index dc53d05a4bff..7266f0236a63 100644 --- a/docs/source/en/tiny_agents.md +++ b/docs/source/en/tiny_agents.md @@ -42,4 +42,3 @@ Image URL: https://evalstate-flux1-schnell.hf.space/gradio_api/file=/tmp/gradio/ I have generated an image of a cat on the moon using the Flux 1 Schnell Image Generator. The image is 1024x1024 pixels and was created with 4 inference steps. Let me know if you would like to make any changes or need further assistance! 
``` - diff --git a/docs/source/en/tokenizer_summary.md b/docs/source/en/tokenizer_summary.md index 801948f35d87..34bc16628cad 100644 --- a/docs/source/en/tokenizer_summary.md +++ b/docs/source/en/tokenizer_summary.md @@ -42,7 +42,7 @@ For instance, let's look at the sentence `"Don't you love 🤗 Transformers? We A simple way of tokenizing this text is to split it by spaces, which would give: -``` +```text ["Don't", "you", "love", "🤗", "Transformers?", "We", "sure", "do."] ``` @@ -52,7 +52,7 @@ punctuation into account so that a model does not have to learn a different repr punctuation symbol that could follow it, which would explode the number of representations the model has to learn. Taking punctuation into account, tokenizing our exemplary text would give: -``` +```text ["Don", "'", "t", "you", "love", "🤗", "Transformers", "?", "We", "sure", "do", "."] ``` @@ -65,7 +65,7 @@ input that was tokenized with the same rules that were used to tokenize its trai [spaCy](https://spacy.io/) and [Moses](http://www.statmt.org/moses/?n=Development.GetStarted) are two popular rule-based tokenizers. Applying them on our example, *spaCy* and *Moses* would output something like: -``` +```text ["Do", "n't", "you", "love", "🤗", "Transformers", "?", "We", "sure", "do", "."] ``` @@ -154,14 +154,14 @@ define before training the tokenizer. As an example, let's assume that after pre-tokenization, the following set of words including their frequency has been determined: -``` +```text ("hug", 10), ("pug", 5), ("pun", 12), ("bun", 4), ("hugs", 5) ``` Consequently, the base vocabulary is `["b", "g", "h", "n", "p", "s", "u"]`. Splitting all words into symbols of the base vocabulary, we obtain: -``` +```text ("h" "u" "g", 10), ("p" "u" "g", 5), ("p" "u" "n", 12), ("b" "u" "n", 4), ("h" "u" "g" "s", 5) ``` @@ -172,7 +172,7 @@ the example above `"h"` followed by `"u"` is present _10 + 5 = 15_ times (10 tim `"u"` symbols followed by a `"g"` symbol together. Next, `"ug"` is added to the vocabulary. The set of words then becomes -``` +```text ("h" "ug", 10), ("p" "ug", 5), ("p" "u" "n", 12), ("b" "u" "n", 4), ("h" "ug" "s", 5) ``` @@ -183,7 +183,7 @@ BPE then identifies the next most common symbol pair. It's `"u"` followed by `"n At this stage, the vocabulary is `["b", "g", "h", "n", "p", "s", "u", "ug", "un", "hug"]` and our set of unique words is represented as -``` +```text ("hug", 10), ("p" "ug", 5), ("p" "un", 12), ("b" "un", 4), ("hug" "s", 5) ``` @@ -246,7 +246,7 @@ reached the desired size. The Unigram algorithm always keeps the base characters Because Unigram is not based on merge rules (in contrast to BPE and WordPiece), the algorithm has several ways of tokenizing new text after training. As an example, if a trained Unigram tokenizer exhibits the vocabulary: -``` +```text ["b", "g", "h", "n", "p", "s", "u", "ug", "un", "hug"], ``` diff --git a/docs/source/en/trainer.md b/docs/source/en/trainer.md index 48325da6893c..32f14bc41da3 100644 --- a/docs/source/en/trainer.md +++ b/docs/source/en/trainer.md @@ -346,7 +346,6 @@ use_cpu: false - Run [accelerate_launch](https://hf.co/docs/accelerate/package_reference/cli#accelerate-launch) to start training with the configurations set in `config_file.yaml`. This file is saved to the Accelerate cache folder and automatically loaded when you run `accelerate_launch`. The example below launches the [run_glue.py](../../../examples/pytorch/text-classification/run_glue) script with the FSDP configuration shown earlier. 
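Putting the BPE merge walk-through from the `tokenizer_summary.md` hunks above into code: the toy sketch below counts symbol pairs weighted by word frequency and applies the most frequent merge, reproducing the `("u", "g")` then `("u", "n")` merges from the example corpus. This is a pure-Python illustration of the algorithm, not the 🤗 Tokenizers implementation.

```python
from collections import Counter

# Word frequencies from the example, with each word split into base-vocabulary symbols.
corpus = {("h", "u", "g"): 10, ("p", "u", "g"): 5, ("p", "u", "n"): 12, ("b", "u", "n"): 4, ("h", "u", "g", "s"): 5}

def most_frequent_pair(words):
    """Count adjacent symbol pairs, weighted by word frequency, and return the top one."""
    pairs = Counter()
    for symbols, freq in words.items():
        for left, right in zip(symbols, symbols[1:]):
            pairs[(left, right)] += freq
    return pairs.most_common(1)[0]

def merge(words, pair):
    """Replace every occurrence of `pair` with the merged symbol."""
    merged = {}
    for symbols, freq in words.items():
        out, i = [], 0
        while i < len(symbols):
            if i < len(symbols) - 1 and (symbols[i], symbols[i + 1]) == pair:
                out.append(symbols[i] + symbols[i + 1])
                i += 2
            else:
                out.append(symbols[i])
                i += 1
        merged[tuple(out)] = freq
    return merged

pair, count = most_frequent_pair(corpus)
print(pair, count)                  # ('u', 'g') 20 -> "ug" joins the vocabulary
corpus = merge(corpus, pair)
print(most_frequent_pair(corpus))   # (('u', 'n'), 16) -> "un" is merged next
```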
Parameters from the `config_file.yaml` file can also be directly set in the command line. diff --git a/docs/source/en/training.md b/docs/source/en/training.md index ed992e8152d9..ccee25704fa3 100644 --- a/docs/source/en/training.md +++ b/docs/source/en/training.md @@ -52,6 +52,7 @@ dataset = dataset.map(tokenize, batched=True) > [!TIP] > Fine-tune on a smaller subset of the full dataset to reduce the time it takes. The results won't be as good compared to fine-tuning on the full dataset, but it is useful to make sure everything works as expected first before committing to training on the full dataset. +> > ```py > small_train = dataset["train"].shuffle(seed=42).select(range(1000)) > small_eval = dataset["test"].shuffle(seed=42).select(range(1000)) diff --git a/docs/source/en/transformers_as_backend.md b/docs/source/en/transformers_as_backend.md index 422cc4a121e9..ce5152c2a4a7 100644 --- a/docs/source/en/transformers_as_backend.md +++ b/docs/source/en/transformers_as_backend.md @@ -26,12 +26,13 @@ This guide shows how to use Transformers' models as a backend to some popular in [vLLM](https://github.com/vllm-project/vllm) is a high-performance inference engine optimized for serving LLMs at scale. It supports many Transformers' models, including all decoder-only LLMs and several vision-language models (VLMs). VLMs currently support image inputs only, with video support planned. -vLLM automatically selects the best backend, and if a model isn’t natively supported, it falls back to the Transformers model. To explicitly use a Transformers' model, set `model_impl="transformers"`. +vLLM automatically selects the best backend, and if a model isn't natively supported, it falls back to the Transformers model. To explicitly use a Transformers' model, set `model_impl="transformers"`. ```python from vllm import LLM llm = LLM(model="meta-llama/Llama-3.2-1B", model_impl="transformers") ``` + Add `--model-impl transformers` to `vllm serve` to launch a server with a Transformers' model. ```bash @@ -42,12 +43,11 @@ vllm serve meta-llama/Llama-3.2-1B \ Refer to the [vLLM docs](https://docs.vllm.ai/en/latest/models/supported_models.html#transformers) for more usage examples and tips on using a Transformers as the backend. - ## SGLang [SGLang](https://github.com/InternLM/sglang) is a high-performance, OpenAI-compatible server and runtime designed for chat-based LLMs. It offers fast inference, role-based conversation handling, and support for custom pipelines, making it great for building real-world LLM apps. -SGLang automatically falls back to the Transformers backend if a model isn’t natively supported. To explicitly use a Transformers' model, set `impl="transformers"`. +SGLang automatically falls back to the Transformers backend if a model isn't natively supported. To explicitly use a Transformers' model, set `impl="transformers"`. ```python import sglang as sgl @@ -57,12 +57,6 @@ print(llm.generate(["The capital of France is"], {"max_new_tokens": 20})[0]) ``` Add `impl transformers` to `sglang.launch_server` to launch a server with a Transformers' model. - - - - - - ```bash python3 -m sglang.launch_server \ @@ -133,7 +127,7 @@ class MyModel(PreTrainedModel): 3. This step is optional, but if you want to support tensor parallel and/or pipeline parallel features, add the following keys to the config. * `base_model_tp_plan` enables [tensor parallelism](./perf_infer_gpu_multi) by mapping fully qualified layer name patterns to tensor parallel styles. 
Only the `"colwise"` and `"rowwise"` partitioning strategies are currently supported. * `base_model_pp_plan` enables pipeline parallelism by mapping direct child layer names to tuples of lists of strings. The list in the first element of the tuple contains the names of the input arguments. The list in the last element of the tuple contains the names of the variables the layer outputs to in the modeling code. - + Expand the code below for an example.
@@ -158,6 +152,7 @@ class MyConfig(PretrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } ``` +
### Multimodal models @@ -200,8 +195,8 @@ class MyMultimodalModelForConditionalGeneration(MyMultimodalPreTrainedModel, Gen self.model = MyMultimodalModel(config) self.lm_head = nn.Linear(hidden_dim, vocab_size) ``` -
+ 2. A multimodal model config must be nested with the following fields. * text_config: decoder language model config @@ -210,7 +205,7 @@ class MyMultimodalModelForConditionalGeneration(MyMultimodalPreTrainedModel, Gen 3. A multimodal model's processing class must have the `self.image_token` and `self.image_token_ids` attributes. These are placeholder tokens used to indicate image positions in the input. The placeholder token is the same token used in the input prompt and to mask scatter image features. - The processing class also needs ` self._get_num_multimodal_tokens` method to compute the number of placeholder tokens needed for multimodal inputs with given sizes and to return a [`MultiModalData`] object. The placeholder for row and column tokens don't count as image placeholders. Only the tokens that are actually replaced by image features are computed. + The processing class also needs `self._get_num_multimodal_tokens` method to compute the number of placeholder tokens needed for multimodal inputs with given sizes and to return a [`MultiModalData`] object. The placeholder for row and column tokens don't count as image placeholders. Only the tokens that are actually replaced by image features are computed. Finally, when `return_mm_token_type_ids=True`, the class has to return `mm_token_type_ids` to indicate whether each position is a text token (`0`) or image placeholder token (`1`). Each image's token type IDs must be contiguous with no breaks between consecutive ones. @@ -246,6 +241,7 @@ class MyMultimodalProcessor(ProcessorMixin): vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches}) return MultiModalData(**vision_data) ``` + ## Resources diff --git a/docs/source/en/troubleshooting.md b/docs/source/en/troubleshooting.md index 7998881d3648..0cc5829d2e8d 100644 --- a/docs/source/en/troubleshooting.md +++ b/docs/source/en/troubleshooting.md @@ -34,12 +34,11 @@ Sometimes errors occur, but we are here to help! This guide covers some of the m For more details about troubleshooting and getting help, take a look at [Chapter 8](https://huggingface.co/course/chapter8/1?fw=pt) of the Hugging Face course. - ## Firewalled environments Some GPU instances on cloud and intranet setups are firewalled to external connections, resulting in a connection error. When your script attempts to download model weights or datasets, the download will hang and then timeout with the following message: -``` +```text ValueError: Connection error, and we cannot find the requested files in the cached path. Please try again or make sure your Internet connection is on. ``` @@ -50,7 +49,7 @@ In this case, you should try to run 🤗 Transformers on [offline mode](installa Training large models with millions of parameters can be challenging without the appropriate hardware. A common error you may encounter when the GPU runs out of memory is: -``` +```text CUDA out of memory. Tried to allocate 256.00 MiB (GPU 0; 11.17 GiB total capacity; 9.70 GiB already allocated; 179.81 MiB free; 9.85 GiB reserved in total by PyTorch) ``` @@ -69,7 +68,7 @@ Refer to the Performance [guide](performance) for more details about memory-savi Another common error you may encounter, especially if it is a newly released model, is `ImportError`: -``` +```text ImportError: cannot import name 'ImageGPTImageProcessor' from 'transformers' (unknown location) ``` @@ -83,7 +82,7 @@ pip install transformers --upgrade Sometimes you may run into a generic CUDA error about an error in the device code. 
-``` +```text RuntimeError: CUDA error: device-side assert triggered ``` diff --git a/docs/source/en/video_processors.md b/docs/source/en/video_processors.md index 4f44914c8cfc..2b26d9f9fc7f 100644 --- a/docs/source/en/video_processors.md +++ b/docs/source/en/video_processors.md @@ -14,17 +14,16 @@ rendered properly in your Markdown viewer. --> - # Video Processor -A **Video Processor** is a utility responsible for preparing input features for video models, as well as handling the post-processing of their outputs. It provides transformations such as resizing, normalization, and conversion into PyTorch. +A **Video Processor** is a utility responsible for preparing input features for video models, as well as handling the post-processing of their outputs. It provides transformations such as resizing, normalization, and conversion into PyTorch. The video processor extends the functionality of image processors by allowing the models to handle videos with a distinct set of arguments compared to images. It serves as the bridge between raw video data and the model, ensuring that input features are optimized for the VLM. Use [`~BaseVideoProcessor.from_pretrained`] to load a video processors configuration (image size, whether to normalize and rescale, etc.) from a video model on the Hugging Face [Hub](https://hf.co) or local directory. The configuration for each pretrained model should be saved in a [video_preprocessor_config.json] file but older models might have the config saved in [preprocessor_config.json](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf/blob/main/preprocessor_config.json) file. Note that the latter is less preferred and will be removed in the future. +## Usage Example -### Usage Example Here's an example of how to load a video processor with [`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf) model: ```python diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index df2d53c49a96..2412e497556f 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -607,6 +607,8 @@ title: LED - local: in_translation title: LFM2 + - local: in_translation + title: LFM2-VL - local: model_doc/llama title: LLaMA - local: model_doc/llama2 diff --git a/docs/source/zh/main_classes/deepspeed.md b/docs/source/zh/main_classes/deepspeed.md index 7cdf3b62e427..a8863896235f 100644 --- a/docs/source/zh/main_classes/deepspeed.md +++ b/docs/source/zh/main_classes/deepspeed.md @@ -236,7 +236,7 @@ deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \ } ``` -这会启用`optimizer offload `和一些其他重要功能。您可以尝试不同的buffer大小,有关详细信息,请参见下面的讨论。 +这会启用`optimizer offload`和一些其他重要功能。您可以尝试不同的buffer大小,有关详细信息,请参见下面的讨论。 关于这种启用类型的实际使用示例,请参阅 [此帖](https://github.com/huggingface/transformers/issues/8771#issuecomment-759176685)。 diff --git a/docs/source/zh/pipeline_tutorial.md b/docs/source/zh/pipeline_tutorial.md index 92fbcbba31e4..7c497c6f1c65 100644 --- a/docs/source/zh/pipeline_tutorial.md +++ b/docs/source/zh/pipeline_tutorial.md @@ -306,5 +306,5 @@ pipe = pipeline(model="facebook/opt-1.3b", device_map="auto", model_kwargs={"loa output = pipe("This is a cool example!", do_sample=True, top_p=0.95) ``` -请注意,您可以将`checkpoint `替换为任何支持大模型加载的Hugging Face模型,比如BLOOM! +请注意,您可以将`checkpoint`替换为任何支持大模型加载的Hugging Face模型,比如BLOOM! 
diff --git a/docs/source/zh/tasks/asr.md b/docs/source/zh/tasks/asr.md index 3b66888bc107..228ba55c0d0e 100644 --- a/docs/source/zh/tasks/asr.md +++ b/docs/source/zh/tasks/asr.md @@ -83,7 +83,7 @@ DatasetDict({ }) ``` -虽然数据集包含 `lang_id `和 `english_transcription` 等许多有用的信息,但在本指南中, +虽然数据集包含 `lang_id` 和 `english_transcription` 等许多有用的信息,但在本指南中, 您将专注于 `audio` 和 `transcription`。使用 [`~datasets.Dataset.remove_columns`] 方法删除其他列: ```py @@ -167,7 +167,7 @@ Wav2Vec2 分词器仅训练了大写字符,因此您需要确保文本与分 它还会动态地将您的文本和标签填充到其批次中最长元素的长度(而不是整个数据集),以使它们具有统一的长度。 虽然可以通过在 `tokenizer` 函数中设置 `padding=True` 来填充文本,但动态填充更有效。 -与其他数据整理器不同,这个特定的数据整理器需要对 `input_values` 和 `labels `应用不同的填充方法: +与其他数据整理器不同,这个特定的数据整理器需要对 `input_values` 和 `labels` 应用不同的填充方法: ```py >>> import torch diff --git a/examples/legacy/pytorch-lightning/run_ner.py b/examples/legacy/pytorch-lightning/run_ner.py index 144759d36aac..6cbb138f023f 100644 --- a/examples/legacy/pytorch-lightning/run_ner.py +++ b/examples/legacy/pytorch-lightning/run_ner.py @@ -72,12 +72,12 @@ def prepare_data(self): self.labels, args.max_seq_length, self.tokenizer, - cls_token_at_end=bool(self.config.model_type in ["xlnet"]), + cls_token_at_end=bool(self.config.model_type == "xlnet"), cls_token=self.tokenizer.cls_token, - cls_token_segment_id=2 if self.config.model_type in ["xlnet"] else 0, + cls_token_segment_id=2 if self.config.model_type == "xlnet" else 0, sep_token=self.tokenizer.sep_token, sep_token_extra=False, - pad_on_left=bool(self.config.model_type in ["xlnet"]), + pad_on_left=bool(self.config.model_type == "xlnet"), pad_token=self.tokenizer.pad_token_id, pad_token_segment_id=self.tokenizer.pad_token_type_id, pad_token_label_id=self.pad_token_label_id, diff --git a/examples/legacy/run_chinese_ref.py b/examples/legacy/run_chinese_ref.py index e63096d05244..7cb6caccefe1 100755 --- a/examples/legacy/run_chinese_ref.py +++ b/examples/legacy/run_chinese_ref.py @@ -55,7 +55,7 @@ def get_chinese_word(tokens: list[str]): def add_sub_symbol(bert_tokens: list[str], chinese_word_set: set()): if not chinese_word_set: return bert_tokens - max_word_len = max([len(w) for w in chinese_word_set]) + max_word_len = max(len(w) for w in chinese_word_set) bert_word = bert_tokens start, end = 0, len(bert_word) diff --git a/examples/legacy/token-classification/utils_ner.py b/examples/legacy/token-classification/utils_ner.py index 0c1725b59b4e..833984bc0ec3 100644 --- a/examples/legacy/token-classification/utils_ner.py +++ b/examples/legacy/token-classification/utils_ner.py @@ -251,10 +251,10 @@ def __init__( labels, max_seq_length, tokenizer, - cls_token_at_end=bool(model_type in ["xlnet"]), + cls_token_at_end=bool(model_type == "xlnet"), # xlnet has a cls token at the end cls_token=tokenizer.cls_token, - cls_token_segment_id=2 if model_type in ["xlnet"] else 0, + cls_token_segment_id=2 if model_type == "xlnet" else 0, sep_token=tokenizer.sep_token, sep_token_extra=False, # roberta uses an extra separator b/w pairs of sentences, cf. 
github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py index ad82f4c401e8..dc76764cdd5b 100644 --- a/examples/pytorch/audio-classification/run_audio_classification.py +++ b/examples/pytorch/audio-classification/run_audio_classification.py @@ -15,7 +15,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "datasets[audio]>=1.14.0", # "evaluate", # "librosa", @@ -48,14 +48,14 @@ set_seed, ) from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt") @@ -218,10 +218,6 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_audio_classification", model_args, data_args) - # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/examples/pytorch/continuous_batching.py b/examples/pytorch/continuous_batching.py index 2b0d506eb895..cf5379fc619c 100644 --- a/examples/pytorch/continuous_batching.py +++ b/examples/pytorch/continuous_batching.py @@ -40,7 +40,8 @@ def generate_simple( attn_impl = { "sdpa_paged": "sdpa", "eager_paged": "eager", - "flash_paged": "flash_attention_2", + "paged_attention": "eager", # TODO: this does not work on AMD docker + "flash_paged": "flash_attention_2", # TODO: this does not work on AMD docker }[attn_impl] model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype=torch.bfloat16, attn_implementation=attn_impl) diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py index 8b0b42252a2e..e754e2f8a87e 100644 --- a/examples/pytorch/contrastive-image-text/run_clip.py +++ b/examples/pytorch/contrastive-image-text/run_clip.py @@ -15,7 +15,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "torch>=1.5.0", # "torchvision>=0.6.0", # "datasets>=1.8.0", @@ -56,14 +56,14 @@ set_seed, ) from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") @@ -247,10 +247,6 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # Sending telemetry. 
Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_clip", model_args, data_args) - # 2. Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index 9693d4b1c84a..748e07dabe90 100755 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -14,7 +14,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "accelerate>=0.12.0", # "torch>=1.5.0", # "torchvision>=0.6.0", @@ -59,7 +59,7 @@ set_seed, ) from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version @@ -68,7 +68,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") @@ -201,10 +201,6 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_image_classification", model_args, data_args) - # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index 656310424c17..39cdc137fa95 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -14,7 +14,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "accelerate>=0.12.0", # "torch>=1.5.0", # "torchvision>=0.6.0", @@ -56,12 +56,12 @@ import transformers from transformers import AutoConfig, AutoImageProcessor, AutoModelForImageClassification, SchedulerType, get_scheduler -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") logger = get_logger(__name__) @@ -234,10 +234,6 @@ def parse_args(): def main(): args = parse_args() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_image_classification_no_trainer", args) - # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. 
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index d0ea39e780b5..2cc5af1d062e 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -14,7 +14,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "torch>=1.5.0", # "torchvision>=0.6.0", # "datasets>=1.8.0", @@ -42,7 +42,7 @@ ViTMAEForPreTraining, ) from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") @@ -193,10 +193,6 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_mae", model_args, data_args) - # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py index 746126596fbe..7f4b2d0a142c 100644 --- a/examples/pytorch/image-pretraining/run_mim.py +++ b/examples/pytorch/image-pretraining/run_mim.py @@ -14,7 +14,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "torch>=1.5.0", # "torchvision>=0.6.0", # "datasets>=1.8.0", @@ -45,7 +45,7 @@ TrainingArguments, ) from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version @@ -56,7 +56,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") @@ -257,10 +257,6 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. 
- send_example_telemetry("run_mim", model_args, data_args) - # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/examples/pytorch/image-pretraining/run_mim_no_trainer.py b/examples/pytorch/image-pretraining/run_mim_no_trainer.py index 92c4d2242f76..3cbcf3d9d22a 100644 --- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py +++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py @@ -14,7 +14,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "torch>=1.5.0", # "torchvision>=0.6.0", # "datasets>=1.8.0", @@ -49,7 +49,7 @@ SchedulerType, get_scheduler, ) -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version @@ -61,7 +61,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") @@ -384,10 +384,6 @@ def collate_fn(examples): def main(): args = parse_args() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_mim_no_trainer", args) - # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation.py b/examples/pytorch/instance-segmentation/run_instance_segmentation.py index 992d9854d078..cc5e88d9e2dc 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation.py @@ -14,7 +14,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "albumentations >= 1.4.16", # "timm", # "datasets", @@ -50,14 +50,14 @@ from transformers.image_processing_utils import BatchFeature from transformers.trainer import EvalPrediction from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") @@ -367,10 +367,6 @@ def main(): training_args.batch_eval_metrics = True training_args.remove_unused_columns = False - # # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # # information sent is the one passed as arguments along with your Python/PyTorch versions. 
- send_example_telemetry("run_instance_segmentation", args) - # Setup logging and log on each process the small summary: setup_logging(training_args) logger.warning( diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py index c538508b7b74..48190a8d4950 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py @@ -14,7 +14,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "albumentations >= 1.4.16", # "timm", # "datasets", @@ -56,14 +56,14 @@ get_scheduler, ) from transformers.image_processing_utils import BatchFeature -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") @@ -413,10 +413,6 @@ def handle_repository_creation(accelerator: Accelerator, args: argparse.Namespac def main(): args = parse_args() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_instance_segmentation_no_trainer", args) - # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index 69099bb79306..8d28a134427d 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -15,7 +15,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "albumentations >= 1.4.16", # "accelerate >= 0.12.0", # "torch >= 1.3", @@ -64,12 +64,12 @@ ) from transformers.testing_utils import CaptureLogger from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") @@ -292,10 +292,6 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. 
- send_example_telemetry("run_clm", model_args, data_args) - # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 874d95393f70..4da87f1d0250 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -15,7 +15,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "albumentations >= 1.4.16", # "accelerate >= 0.12.0", # "torch >= 1.3", @@ -66,12 +66,12 @@ default_data_collator, get_scheduler, ) -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") logger = get_logger(__name__) @@ -268,10 +268,6 @@ def parse_args(): def main(): args = parse_args() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_clm_no_trainer", args) - # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment diff --git a/examples/pytorch/language-modeling/run_fim.py b/examples/pytorch/language-modeling/run_fim.py index 46b759e03002..ca491864f665 100644 --- a/examples/pytorch/language-modeling/run_fim.py +++ b/examples/pytorch/language-modeling/run_fim.py @@ -15,7 +15,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "albumentations >= 1.4.16", # "accelerate >= 0.12.0", # "torch >= 1.3", @@ -67,12 +67,12 @@ from transformers.integrations import is_deepspeed_zero3_enabled from transformers.testing_utils import CaptureLogger from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") @@ -319,10 +319,6 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. 
- send_example_telemetry("run_fim", model_args, data_args) - # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/examples/pytorch/language-modeling/run_fim_no_trainer.py b/examples/pytorch/language-modeling/run_fim_no_trainer.py index 67a94f1fae30..a83a5887e264 100644 --- a/examples/pytorch/language-modeling/run_fim_no_trainer.py +++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py @@ -15,7 +15,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "albumentations >= 1.4.16", # "accelerate >= 0.12.0", # "torch >= 1.3", @@ -69,12 +69,12 @@ is_torch_xla_available, ) from transformers.integrations import is_deepspeed_zero3_enabled -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") logger = get_logger(__name__) @@ -328,10 +328,6 @@ def parse_args(): def main(): args = parse_args() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_fim_no_trainer", args) - # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index 5ba9262f451b..0773dda736bb 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -15,7 +15,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "albumentations >= 1.4.16", # "accelerate >= 0.12.0", # "torch >= 1.3", @@ -63,12 +63,12 @@ set_seed, ) from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") @@ -264,10 +264,6 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. 
- send_example_telemetry("run_mlm", model_args, data_args) - # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 501da0cff932..f553bc05b7d8 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -15,7 +15,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "albumentations >= 1.4.16", # "accelerate >= 0.12.0", # "torch >= 1.3", @@ -66,12 +66,12 @@ SchedulerType, get_scheduler, ) -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") logger = get_logger(__name__) require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") @@ -275,10 +275,6 @@ def parse_args(): def main(): args = parse_args() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_mlm_no_trainer", args) - # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index fd29c6a630d7..acef677ae6f7 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -15,7 +15,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "albumentations >= 1.4.16", # "accelerate >= 0.12.0", # "torch >= 1.3", @@ -56,12 +56,12 @@ set_seed, ) from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") @@ -244,10 +244,6 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. 
- send_example_telemetry("run_plm", model_args, data_args) - # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index 585ac54febb2..b0582d967a9b 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -15,7 +15,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "accelerate >= 0.12.0", # "sentencepiece != 0.1.92", # "protobuf", @@ -53,11 +53,11 @@ set_seed, ) from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") logger = logging.getLogger(__name__) @@ -188,10 +188,6 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_swag", model_args, data_args) - # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 5d19486da0e1..6c8176f0c98d 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -15,7 +15,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "accelerate >= 0.12.0", # "sentencepiece != 0.1.92", # "protobuf", @@ -61,11 +61,11 @@ default_data_collator, get_scheduler, ) -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") logger = get_logger(__name__) # You should update this to your particular problem to have better documentation of `model_type` @@ -238,10 +238,6 @@ def parse_args(): def main(): args = parse_args() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_swag_no_trainer", args) - # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. 
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment diff --git a/examples/pytorch/object-detection/run_object_detection.py b/examples/pytorch/object-detection/run_object_detection.py index f615488c7099..c7b6af1f3c08 100644 --- a/examples/pytorch/object-detection/run_object_detection.py +++ b/examples/pytorch/object-detection/run_object_detection.py @@ -14,7 +14,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "albumentations >= 1.4.16", # "timm", # "datasets>=4.0", @@ -52,14 +52,14 @@ from transformers.image_transforms import center_to_corners_format from transformers.trainer import EvalPrediction from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/object-detection/requirements.txt") @@ -349,10 +349,6 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_object_detection", model_args, data_args) - # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/examples/pytorch/object-detection/run_object_detection_no_trainer.py b/examples/pytorch/object-detection/run_object_detection_no_trainer.py index f90bf1bbd3c0..9c64bf5d732a 100644 --- a/examples/pytorch/object-detection/run_object_detection_no_trainer.py +++ b/examples/pytorch/object-detection/run_object_detection_no_trainer.py @@ -14,7 +14,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "albumentations >= 1.4.16", # "timm", # "datasets>=4.0", @@ -58,12 +58,12 @@ ) from transformers.image_processing_utils import BatchFeature from transformers.image_transforms import center_to_corners_format -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") logging.basicConfig(level=logging.INFO) logger = get_logger(__name__) @@ -411,10 +411,6 @@ def parse_args(): def main(): args = parse_args() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_object_detection_no_trainer", args) - # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. 
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index 5a639696f6cd..4f3b38409a52 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -44,12 +44,12 @@ set_seed, ) from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") @@ -237,10 +237,6 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_qa", model_args, data_args) - # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py index b778d9fc67ee..14bc2448d18e 100755 --- a/examples/pytorch/question-answering/run_qa_beam_search.py +++ b/examples/pytorch/question-answering/run_qa_beam_search.py @@ -42,12 +42,12 @@ set_seed, ) from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") @@ -235,10 +235,6 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_qa_beam_search", model_args, data_args) - # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index 9fd3ce223220..6c93819ee947 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -49,12 +49,12 @@ default_data_collator, get_scheduler, ) -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.57.0.dev0") +check_min_version("4.57.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") @@ -299,10 +299,6 @@ def parse_args(): def main(): args = parse_args() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_qa_beam_search_no_trainer", args) - # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers # in the environment diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index dc1b9743e634..ddd5d574f3a1 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -51,12 +51,12 @@ default_data_collator, get_scheduler, ) -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") @@ -338,10 +338,6 @@ def parse_args(): def main(): args = parse_args() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_qa_no_trainer", args) - # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. 
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment @@ -954,7 +950,7 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): all_start_logits.append(accelerator.gather_for_metrics(start_logits).cpu().numpy()) all_end_logits.append(accelerator.gather_for_metrics(end_logits).cpu().numpy()) - max_len = max([x.shape[1] for x in all_start_logits]) # Get the max_length of the tensor + max_len = max(x.shape[1] for x in all_start_logits) # Get the max_length of the tensor # concatenate the numpy array start_logits_concat = create_and_fill_np_array(all_start_logits, eval_dataset, max_len) @@ -993,7 +989,7 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): all_start_logits.append(accelerator.gather_for_metrics(start_logits).cpu().numpy()) all_end_logits.append(accelerator.gather_for_metrics(end_logits).cpu().numpy()) - max_len = max([x.shape[1] for x in all_start_logits]) # Get the max_length of the tensor + max_len = max(x.shape[1] for x in all_start_logits) # Get the max_length of the tensor # concatenate the numpy array start_logits_concat = create_and_fill_np_array(all_start_logits, predict_dataset, max_len) end_logits_concat = create_and_fill_np_array(all_end_logits, predict_dataset, max_len) diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py index 408d4d23f59c..83a1614fcfbc 100644 --- a/examples/pytorch/question-answering/run_seq2seq_qa.py +++ b/examples/pytorch/question-answering/run_seq2seq_qa.py @@ -40,12 +40,12 @@ set_seed, ) from transformers.trainer_utils import EvalLoopOutput, EvalPrediction, get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") @@ -282,10 +282,6 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. 
- send_example_telemetry("run_seq2seq_qa", model_args, data_args) - # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py index ea678c094aef..21752fae045a 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py @@ -14,7 +14,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "datasets >= 2.0.0", # "torch >= 1.3", # "accelerate", @@ -53,7 +53,7 @@ default_data_collator, ) from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version @@ -62,7 +62,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt") @@ -197,10 +197,6 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_semantic_segmentation", model_args, data_args) - # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index 97a3a249d484..f36f5a366b63 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -14,7 +14,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "datasets >= 2.0.0", # "torch >= 1.3", # "accelerate", @@ -57,12 +57,12 @@ default_data_collator, get_scheduler, ) -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") logger = get_logger(__name__) @@ -253,10 +253,6 @@ def parse_args(): def main(): args = parse_args() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_semantic_segmentation_no_trainer", args) - # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. 
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment diff --git a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py index f30fd1676a3a..0ec5f038244c 100755 --- a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py +++ b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py @@ -14,7 +14,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "datasets[audio] >= 1.12.0", # "torch >= 1.5", # "torchaudio", @@ -53,7 +53,6 @@ set_seed, ) from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices, _sample_negative_indices -from transformers.utils import send_example_telemetry logger = get_logger(__name__) @@ -410,10 +409,6 @@ def main(): # We now keep distinct sets of args, for a cleaner separation of concerns. args = parse_args() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_wav2vec2_pretraining_no_trainer", args) - # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. accelerator = Accelerator() logger.info(accelerator.state, main_process_only=False) diff --git a/examples/pytorch/speech-recognition/README.md b/examples/pytorch/speech-recognition/README.md index 2889919655f4..41df41880b5a 100644 --- a/examples/pytorch/speech-recognition/README.md +++ b/examples/pytorch/speech-recognition/README.md @@ -66,7 +66,7 @@ The following command shows how to fine-tune [XLSR-Wav2Vec2](https://huggingface ```bash python run_speech_recognition_ctc.py \ - --dataset_name="common_voice" \ + --dataset_name="mozilla-foundation/common_voice_17_0" \ --model_name_or_path="facebook/wav2vec2-large-xlsr-53" \ --dataset_config_name="tr" \ --output_dir="./wav2vec2-common_voice-tr-demo" \ @@ -102,7 +102,7 @@ The following command shows how to fine-tune [XLSR-Wav2Vec2](https://huggingface ```bash torchrun \ --nproc_per_node 8 run_speech_recognition_ctc.py \ - --dataset_name="common_voice" \ + --dataset_name="mozilla-foundation/common_voice_17_0" \ --model_name_or_path="facebook/wav2vec2-large-xlsr-53" \ --dataset_config_name="tr" \ --output_dir="./wav2vec2-common_voice-tr-demo-dist" \ @@ -149,7 +149,7 @@ However, the `--shuffle_buffer_size` argument controls how many examples we can ```bash **torchrun \ --nproc_per_node 4 run_speech_recognition_ctc_streaming.py \ - --dataset_name="common_voice" \ + --dataset_name="mozilla-foundation/common_voice_17_0" \ --model_name_or_path="facebook/wav2vec2-xls-r-300m" \ --tokenizer_name_or_path="anton-l/wav2vec2-tokenizer-turkish" \ --dataset_config_name="tr" \ @@ -314,7 +314,7 @@ below 27%. For an example run, you can have a look at [`patrickvonplaten/wav2vec2-common_voice-tr-mms-demo`](https://huggingface.co/patrickvonplaten/wav2vec2-common_voice-tr-mms-demo). 
-If you'd like to train another adapter model with the same base model, you can simply re-use the same `--output_dir`, +If you'd like to train another adapter model with the same base model, you can simply reuse the same `--output_dir`, but make sure to pass the `--output_dir` folder also to `--tokenizer_name_or_path` so that the vocabulary is not overwritten but **extended**. Assuming you would like to train adapter weights on Swedish in addition to Turkish and save the adapter weights in the same model repo, you can run: diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index 4532bc511e9f..2fbbc9e52a73 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -15,7 +15,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "datasets[audio] >= 1.18.0", # "torch >= 1.5", # "torchaudio", @@ -56,14 +56,17 @@ set_seed, ) from transformers.trainer_utils import get_last_checkpoint, is_main_process -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") -require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") +require_version( + "datasets>=1.18.0", + "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt", +) logger = logging.getLogger(__name__) @@ -91,13 +94,16 @@ class ModelArguments: metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, ) freeze_feature_encoder: bool = field( - default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."} + default=True, + metadata={"help": "Whether to freeze the feature encoder layers of the model."}, ) attention_dropout: float = field( - default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."} + default=0.0, + metadata={"help": "The dropout ratio for the attention probabilities."}, ) activation_dropout: float = field( - default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."} + default=0.0, + metadata={"help": "The dropout ratio for activations inside the fully connected layer."}, ) feat_proj_dropout: float = field(default=0.0, metadata={"help": "The dropout ratio for the projected features."}) hidden_dropout: float = field( @@ -140,7 +146,8 @@ class ModelArguments: ) layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."}) ctc_loss_reduction: Optional[str] = field( - default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."} + default="mean", + metadata={"help": "The way the ctc loss should be reduced. 
Should be one of 'mean' or 'sum'."}, ) ctc_zero_infinity: Optional[bool] = field( default=False, @@ -169,10 +176,13 @@ class DataTrainingArguments: """ dataset_name: str = field( - metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + metadata={"help": "Path or name of the dataset (cf `load_dataset` method of the Datasets library)."} ) dataset_config_name: str = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + default=None, + metadata={ + "help": "The configuration name of the dataset to use (cf `load_dataset` method of the Datasets library)." + }, ) train_split_name: str = field( default="train+validation", @@ -198,7 +208,8 @@ class DataTrainingArguments: metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"}, ) overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."} + default=False, + metadata={"help": "Overwrite the cached preprocessed datasets or not."}, ) preprocessing_num_workers: Optional[int] = field( default=None, @@ -240,7 +251,8 @@ class DataTrainingArguments: }, ) min_duration_in_seconds: float = field( - default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"} + default=0.0, + metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}, ) preprocessing_only: bool = field( default=False, @@ -383,7 +395,8 @@ def extract_all_chars(batch): # take union of all unique characters in each dataset vocab_set = functools.reduce( - lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]), vocabs.values() + lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]), + vocabs.values(), ) vocab_dict = {v: k for k, v in enumerate(sorted(vocab_set))} @@ -416,10 +429,6 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_speech_recognition_ctc", model_args, data_args) - # Detecting last checkpoint. 
last_checkpoint = None if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: @@ -575,7 +584,7 @@ def remove_special_characters(batch): # it is defined by `tokenizer_class` if present in config else by `model_type` tokenizer_kwargs = { "config": config if config.tokenizer_class is not None else None, - "tokenizer_type": config.model_type if config.tokenizer_class is None else None, + "tokenizer_type": (config.model_type if config.tokenizer_class is None else None), "unk_token": unk_token, "pad_token": pad_token, "word_delimiter_token": word_delimiter_token, @@ -643,7 +652,8 @@ def remove_special_characters(batch): dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate if dataset_sampling_rate != feature_extractor.sampling_rate: raw_datasets = raw_datasets.cast_column( - data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate) + data_args.audio_column_name, + datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate), ) # derive max & min input length for sample rate & max duration diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py index 884201d9d993..0d6d2918228e 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py @@ -15,7 +15,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "datasets[audio] >= 1.18.0", # "torch >= 1.5", # "torchaudio", @@ -59,14 +59,17 @@ ) from transformers.models.wav2vec2.modeling_wav2vec2 import WAV2VEC2_ADAPTER_SAFE_FILE from transformers.trainer_utils import get_last_checkpoint, is_main_process -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") -require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") +require_version( + "datasets>=1.18.0", + "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt", +) logger = logging.getLogger(__name__) @@ -127,7 +130,8 @@ class ModelArguments: ) layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."}) ctc_loss_reduction: Optional[str] = field( - default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."} + default="mean", + metadata={"help": "The way the ctc loss should be reduced. 
Should be one of 'mean' or 'sum'."}, ) adapter_attn_dim: int = field( default=16, @@ -148,9 +152,9 @@ class DataTrainingArguments: """ dataset_name: str = field( - metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + metadata={"help": "Path or name of the dataset (cf `load_dataset` method of the Datasets library)."} ) - target_language: Optional[str] = field( + target_language: str = field( metadata={ "help": ( "The target language on which the adapter attention layers" @@ -162,7 +166,10 @@ class DataTrainingArguments: }, ) dataset_config_name: str = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + default=None, + metadata={ + "help": "The configuration name of the dataset to use (cf `load_dataset` method of the Datasets library)." + }, ) train_split_name: str = field( default="train+validation", @@ -188,7 +195,8 @@ class DataTrainingArguments: metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"}, ) overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."} + default=False, + metadata={"help": "Overwrite the cached preprocessed datasets or not."}, ) preprocessing_num_workers: Optional[int] = field( default=None, @@ -230,7 +238,8 @@ class DataTrainingArguments: }, ) min_duration_in_seconds: float = field( - default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"} + default=0.0, + metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}, ) preprocessing_only: bool = field( default=False, @@ -363,7 +372,8 @@ def extract_all_chars(batch): # take union of all unique characters in each dataset vocab_set = functools.reduce( - lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]), vocabs.values() + lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]), + vocabs.values(), ) vocab_dict = {v: k for k, v in enumerate(sorted(vocab_set))} @@ -396,10 +406,6 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_speech_recognition_ctc_adapter", model_args, data_args) - # Detecting last checkpoint. 
last_checkpoint = None if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: @@ -582,7 +588,7 @@ def remove_special_characters(batch): # it is defined by `tokenizer_class` if present in config else by `model_type` tokenizer_kwargs = { "config": config if config.tokenizer_class is not None else None, - "tokenizer_type": config.model_type if config.tokenizer_class is None else None, + "tokenizer_type": (config.model_type if config.tokenizer_class is None else None), "unk_token": unk_token, "pad_token": pad_token, "word_delimiter_token": word_delimiter_token, @@ -654,7 +660,8 @@ def remove_special_characters(batch): dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate if dataset_sampling_rate != feature_extractor.sampling_rate: raw_datasets = raw_datasets.cast_column( - data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate) + data_args.audio_column_name, + datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate), ) # derive max & min input length for sample rate & max duration diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index aee6ae3b8bae..f6744e0ed52a 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -15,7 +15,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "datasets[audio] >= 1.18.0", # "torch >= 1.5", # "torchaudio", @@ -55,14 +55,17 @@ set_seed, ) from transformers.trainer_utils import get_last_checkpoint, is_main_process -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.57.0.dev0") +check_min_version("4.57.0") -require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") +require_version( + "datasets>=1.18.0", + "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt", +) logger = logging.getLogger(__name__) @@ -77,13 +80,16 @@ class ModelArguments: metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} ) config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + default=None, + metadata={"help": "Pretrained config name or path if not the same as model_name"}, ) tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + default=None, + metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}, ) feature_extractor_name: Optional[str] = field( - default=None, metadata={"help": "feature extractor name or path if not the same as model_name"} + default=None, + metadata={"help": "feature extractor name or path if not the same as model_name"}, ) cache_dir: Optional[str] = field( default=None, @@ -117,10 +123,12 @@ class ModelArguments: }, ) freeze_feature_encoder: bool = field( - default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."} + default=True, + metadata={"help": "Whether to freeze the feature encoder layers of the model."}, ) freeze_encoder: bool = field( - default=False, metadata={"help": "Whether to freeze the entire encoder of the seq2seq model."} + default=False, + metadata={"help": "Whether to freeze the entire encoder of the seq2seq model."}, ) forced_decoder_ids: list[list[int]] = field( default=None, @@ -150,13 +158,17 @@ class DataTrainingArguments: """ dataset_name: str = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + metadata={"help": "Path or name of the dataset (cf `load_dataset` method of the Datasets library)."} ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + dataset_config_name: str = field( + default=None, + metadata={ + "help": "The configuration name of the dataset to use (cf `load_dataset` method of the Datasets library)." + }, ) overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + default=False, + metadata={"help": "Overwrite the cached training and evaluation sets"}, ) preprocessing_num_workers: Optional[int] = field( default=None, @@ -198,7 +210,8 @@ class DataTrainingArguments: }, ) min_duration_in_seconds: float = field( - default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"} + default=0.0, + metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}, ) preprocessing_only: bool = field( default=False, @@ -300,10 +313,6 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_speech_recognition_seq2seq", model_args, data_args) - # 2. 
Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -391,7 +400,7 @@ def main(): # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.model_name_or_path, + (model_args.config_name if model_args.config_name else model_args.model_name_or_path), cache_dir=model_args.cache_dir, revision=model_args.model_revision, token=model_args.token, @@ -403,14 +412,14 @@ def main(): config.update({"apply_spec_augment": model_args.apply_spec_augment}) feature_extractor = AutoFeatureExtractor.from_pretrained( - model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path, + (model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path), cache_dir=model_args.cache_dir, revision=model_args.model_revision, token=model_args.token, trust_remote_code=model_args.trust_remote_code, ) tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + (model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path), cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, @@ -469,7 +478,8 @@ def main(): dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate if dataset_sampling_rate != feature_extractor.sampling_rate: raw_datasets = raw_datasets.cast_column( - data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate) + data_args.audio_column_name, + datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate), ) # 7. Preprocessing the datasets. @@ -498,7 +508,9 @@ def prepare_dataset(batch): # process audio sample = batch[audio_column_name] inputs = feature_extractor( - sample["array"], sampling_rate=sample["sampling_rate"], return_attention_mask=forward_attention_mask + sample["array"], + sampling_rate=sample["sampling_rate"], + return_attention_mask=forward_attention_mask, ) # process audio length batch[model_input_name] = inputs.get(model_input_name)[0] @@ -583,7 +595,7 @@ def compute_metrics(pred): eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, processing_class=feature_extractor, data_collator=data_collator, - compute_metrics=compute_metrics if training_args.predict_with_generate else None, + compute_metrics=(compute_metrics if training_args.predict_with_generate else None), ) # 12. Training @@ -625,7 +637,10 @@ def compute_metrics(pred): trainer.save_metrics("eval", metrics) # 14. 
Write Training Stats - kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "automatic-speech-recognition"} + kwargs = { + "finetuned_from": model_args.model_name_or_path, + "tasks": "automatic-speech-recognition", + } if data_args.dataset_name is not None: kwargs["dataset_tags"] = data_args.dataset_name if data_args.dataset_config_name is not None: diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index e3554ec85829..641d6d5bcfad 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -15,7 +15,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "accelerate >= 0.12.0", # "datasets >= 1.8.0", # "sentencepiece != 0.1.92", @@ -62,12 +62,12 @@ set_seed, ) from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, is_offline_mode, send_example_telemetry +from transformers.utils import check_min_version, is_offline_mode from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") @@ -337,10 +337,6 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_summarization", model_args, data_args) - # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index 19366f7b7248..ad86e0c54c8d 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -15,7 +15,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "accelerate >= 0.12.0", # "datasets >= 1.8.0", # "sentencepiece != 0.1.92", @@ -66,12 +66,12 @@ SchedulerType, get_scheduler, ) -from transformers.utils import check_min_version, is_offline_mode, send_example_telemetry +from transformers.utils import check_min_version, is_offline_mode from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") @@ -338,9 +338,6 @@ def parse_args(): def main(): args = parse_args() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_summarization_no_trainer", args) # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. 
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers diff --git a/examples/pytorch/text-classification/run_classification.py b/examples/pytorch/text-classification/run_classification.py index 17eaccd96baf..e6093501a353 100755 --- a/examples/pytorch/text-classification/run_classification.py +++ b/examples/pytorch/text-classification/run_classification.py @@ -15,7 +15,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "accelerate >= 0.12.0", # "datasets >= 1.8.0", # "sentencepiece != 0.1.92", @@ -56,12 +56,12 @@ set_seed, ) from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") @@ -296,10 +296,6 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_classification", model_args, data_args) - # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index 1c8df2d54daf..1df1a7fa65a5 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -15,7 +15,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "accelerate >= 0.12.0", # "datasets >= 1.8.0", # "sentencepiece != 0.1.92", @@ -58,12 +58,12 @@ set_seed, ) from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") @@ -241,10 +241,6 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. 
- send_example_telemetry("run_glue", model_args, data_args) - # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index a706e003f69e..2d47a6fb02e4 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -14,7 +14,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "accelerate >= 0.12.0", # "datasets >= 1.8.0", # "sentencepiece != 0.1.92", @@ -58,12 +58,12 @@ default_data_collator, get_scheduler, ) -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") logger = get_logger(__name__) @@ -234,9 +234,6 @@ def parse_args(): def main(): args = parse_args() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_glue_no_trainer", args) # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py index beb7bb778b1d..513d26e64ce3 100755 --- a/examples/pytorch/text-classification/run_xnli.py +++ b/examples/pytorch/text-classification/run_xnli.py @@ -16,7 +16,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "accelerate >= 0.12.0", # "datasets >= 1.8.0", # "sentencepiece != 0.1.92", @@ -57,12 +57,12 @@ set_seed, ) from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") @@ -199,10 +199,6 @@ def main(): parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. 
- send_example_telemetry("run_xnli", model_args) - # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/examples/pytorch/text-generation/run_generation.py b/examples/pytorch/text-generation/run_generation.py index 7784580e033c..ccbf4182f3e7 100755 --- a/examples/pytorch/text-generation/run_generation.py +++ b/examples/pytorch/text-generation/run_generation.py @@ -16,7 +16,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "accelerate >= 0.21.0", # "sentencepiece != 0.1.92", # "protobuf", diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index d5bdb9ee3662..32e2c21a39df 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -15,7 +15,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "accelerate >= 0.12.0", # "seqeval", # "datasets >= 1.8.0", @@ -55,12 +55,12 @@ set_seed, ) from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") @@ -238,10 +238,6 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_ner", model_args, data_args) - # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index 7d5256f48e81..02ec8469dbba 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -15,7 +15,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "accelerate >= 0.12.0", # "seqeval", # "datasets >= 1.8.0", @@ -62,12 +62,12 @@ default_data_collator, get_scheduler, ) -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") @@ -284,10 +284,6 @@ def parse_args(): def main(): args = parse_args() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. 
- send_example_telemetry("run_ner_no_trainer", args) - # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index dcfe9a6ffe94..e951332c418c 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -15,7 +15,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "accelerate >= 0.12.0", # "datasets >= 1.8.0", # "sentencepiece != 0.1.92", @@ -61,12 +61,12 @@ set_seed, ) from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") @@ -285,10 +285,6 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_translation", model_args, data_args) - # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index 871504bb9877..979c103001af 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -15,7 +15,7 @@ # /// script # dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", +# "transformers==4.57.1", # "accelerate >= 0.12.0", # "datasets >= 1.8.0", # "sentencepiece != 0.1.92", @@ -66,12 +66,12 @@ default_data_collator, get_scheduler, ) -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.57.0.dev0") +check_min_version("4.57.0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") @@ -330,10 +330,6 @@ def main(): # Parse the arguments args = parse_args() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_translation_no_trainer", args) - # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. 
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment diff --git a/examples/quantization/custom_quantization_int8_example.py b/examples/quantization/custom_quantization_int8_example.py index 4bf907b77fe5..884b943f696b 100644 --- a/examples/quantization/custom_quantization_int8_example.py +++ b/examples/quantization/custom_quantization_int8_example.py @@ -159,24 +159,13 @@ def _process_model_before_weight_loading(self, model, **kwargs): pre_quantized=self.pre_quantized, ) - def check_quantized_param( - self, - model, - param_value: "torch.Tensor", - param_name: str, - state_dict: dict[str, Any], - **kwargs, - ): + def param_needs_quantization(self, model, param_name: str, **kwargs) -> bool: module, tensor_name = get_module_from_name(model, param_name) if isinstance(module, Int8SymmetricLinear): if self.pre_quantized or tensor_name == "bias": - if tensor_name == "weight" and param_value.dtype != torch.int8: - raise ValueError("Expect quantized weights but got an unquantized weight") return False else: - if tensor_name == "weight_scale": - raise ValueError("Expect unquantized weights but got a quantized weight_scale") return True return False @@ -186,12 +175,18 @@ def create_quantized_param( param_value: "torch.Tensor", param_name: str, target_device: "torch.device", - state_dict: dict[str, Any], - unexpected_keys: Optional[list[str]] = None, + **kwargs, ): - """ - Quantizes weights to INT8 symmetric format. - """ + # Sanity check + module, tensor_name = get_module_from_name(model, param_name) + if isinstance(module, Int8SymmetricLinear): + if self.pre_quantized or tensor_name == "bias": + if tensor_name == "weight" and param_value.dtype != torch.int8: + raise ValueError("Expect quantized weights but got an unquantized weight") + else: + if tensor_name == "weight_scale": + raise ValueError("Expect unquantized weights but got a quantized weight_scale") + abs_max_per_row = torch.max(torch.abs(param_value), dim=1, keepdim=True)[0].clamp(min=1e-5) weight_scale = abs_max_per_row / 127.0 diff --git a/i18n/README_ar.md b/i18n/README_ar.md index cdf813445d6f..a0c86c770600 100644 --- a/i18n/README_ar.md +++ b/i18n/README_ar.md @@ -47,9 +47,11 @@ limitations under the License. తెలుగు | Français | Deutsch | + Italiano | Tiếng Việt | العربية | اردو | + বাংলা |

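For the `examples/quantization/custom_quantization_int8_example.py` hunk above: the new `param_needs_quantization` hook only decides *whether* a parameter should be quantized, while `create_quantized_param` now carries both the sanity checks and the actual conversion. Below is a minimal standalone sketch of the rowwise symmetric INT8 scheme that hunk computes; the helper name and the rounding/clamping step are illustrative assumptions, since the hunk ends at the scale computation.

```python
import torch


def quantize_rowwise_int8(weight: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    # One scale per output row, taken from the row's absolute maximum (as in the
    # example's create_quantized_param); the clamp keeps all-zero rows from
    # producing a zero scale.
    abs_max_per_row = torch.max(torch.abs(weight), dim=1, keepdim=True)[0].clamp(min=1e-5)
    weight_scale = abs_max_per_row / 127.0
    # Rounding and clamping to the int8 range are assumptions about the rest of the example file.
    weight_int8 = torch.round(weight / weight_scale).clamp(-128, 127).to(torch.int8)
    return weight_int8, weight_scale


weight = torch.randn(16, 32)
weight_int8, weight_scale = quantize_rowwise_int8(weight)
# Dequantizing should recover the original weights up to the rowwise quantization step.
max_err = (weight - weight_int8.float() * weight_scale).abs().max()
print(weight_int8.dtype, weight_scale.shape, float(max_err))
```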
diff --git a/i18n/README_bn.md b/i18n/README_bn.md new file mode 100644 index 000000000000..354521ee7ba3 --- /dev/null +++ b/i18n/README_bn.md @@ -0,0 +1,335 @@ + + +

+ + + + Hugging Face Transformers Library + +
+
+

+ +

+ Checkpoints on Hub + Build + GitHub + Documentation + GitHub release + Contributor Covenant + DOI +

+ +

+

+ English | + 简体中文 | + 繁體中文 | + 한국어 | + Español | + 日本語 | + हिन्दी | + Русский | + Português | + తెలుగు | + Français | + Deutsch | + Italiano | + Tiếng Việt | + العربية | + اردو | + বাংলা | +

+

+ +

+

ইনফারেন্স ও ট্রেনিংয়ের জন্য আধুনিকতম (State-of-the-art) প্রি-ট্রেইন্ড মডেলসমূহ

+

+ +

+ +

+
+
+**Transformers** হলো একটা ফ্রেমওয়ার্ক যেটা দিয়ে টেক্সট, কম্পিউটার ভিশন, অডিও, ভিডিও আর মাল্টিমোডাল—সব ধরনের মডেল তৈরি আর চালানো যায়। এটা ট্রেইনিং আর ইনফারেন্স – দুই কাজেই ব্যবহার করা হয়।
+
+Transformers মডেলের ডেফিনিশন এক জায়গায় রাখে। এর মানে হলো, একবার কোনো মডেল `transformers`-এ সাপোর্ট পেলেই সেটা সহজে বিভিন্ন ট্রেইনিং ফ্রেমওয়ার্ক (Axolotl, Unsloth, DeepSpeed, FSDP, PyTorch-Lightning ইত্যাদি), ইনফারেন্স ইঞ্জিন (vLLM, SGLang, TGI ইত্যাদি) আর অন্যান্য লাইব্রেরি (llama.cpp, mlx ইত্যাদি)-তে ব্যবহার করা যায়।
+
+আমরা চাই নতুন আর আধুনিক মডেলগুলো সবাই ব্যবহার করতে পারে। তাই মডেলের ডেফিনিশন রাখা হয়েছে সহজ, কাস্টমাইজযোগ্য আর পারফরম্যান্স-ফ্রেন্ডলি।
+
+এখন পর্যন্ত [Hugging Face Hub](https://huggingface.com/models)-এ ১০ লাখেরও বেশি Transformers [মডেল চেকপয়েন্ট](https://huggingface.co/models?library=transformers&sort=trending) আছে, যেগুলো যেকোনো সময় ব্যবহার করা যায়।
+
+আজই [Hub](https://huggingface.com/) থেকে একটা মডেল বেছে নিন আর Transformers দিয়ে শুরু করুন।
+
+
+## ইনস্টলেশন
+
+Transformers Python 3.9+ সহ কাজ করে, এবং সমর্থিত ফ্রেমওয়ার্কগুলো হলো [PyTorch](https://pytorch.org/get-started/locally/) 2.1+, [TensorFlow](https://www.tensorflow.org/install/pip) 2.6+, এবং [Flax](https://flax.readthedocs.io/en/latest/) 0.4.1+।
+
+[venv](https://docs.python.org/3/library/venv.html) বা [uv](https://docs.astral.sh/uv/) ব্যবহার করে একটি ভার্চুয়াল এনভায়রনমেন্ট তৈরি এবং সক্রিয় করুন।
+
+```py
+# venv
+python -m venv .my-env
+source .my-env/bin/activate
+# uv
+uv venv .my-env
+source .my-env/bin/activate
+```
+আপনার ভার্চুয়াল পরিবেশে Transformers ইনস্টল করুন।
+
+```py
+# pip
+pip install "transformers[torch]"
+
+# uv
+uv pip install "transformers[torch]"
+```
+যদি আপনি লাইব্রেরির সর্বশেষ পরিবর্তনগুলি চান বা অবদান রাখতে আগ্রহী হন তবে উৎস থেকে Transformers ইনস্টল করুন। তবে, সর্বশেষ সংস্করণটি স্থিতিশীল নাও হতে পারে। যদি আপনি কোনো ত্রুটির সম্মুখীন হন তবে নির্দ্বিধায় একটি [issue](https://github.com/huggingface/transformers/issues) খুলুন।
+
+```Shell
+git clone https://github.com/huggingface/transformers.git
+cd transformers
+
+# pip
+pip install .[torch]
+
+# uv
+uv pip install .[torch]
+```
+
+## কুইকস্টার্ট
+
+Transformers ব্যবহার শুরু করুন এখনই [Pipeline](https://huggingface.co/docs/transformers/pipeline_tutorial) API দিয়ে। `Pipeline` হলো একটি হাই-লেভেল ইনফারেন্স ক্লাস, যা টেক্সট, অডিও, ভিশন এবং মাল্টিমোডাল টাস্ক সাপোর্ট করে। এটি ইনপুট প্রিপ্রসেসিং করে এবং সঠিক আউটপুট রিটার্ন করে।
+
+একটি পাইপলাইন তৈরি করুন এবং টেক্সট জেনারেশনের জন্য কোন মডেল ব্যবহার করবেন তা নির্দিষ্ট করুন। মডেলটি ডাউনলোড হয়ে ক্যাশে রাখা হবে, ফলে পরে সহজেই আবার ব্যবহার করতে পারবেন। সবশেষে, মডেলকে প্রম্পট করার জন্য কিছু টেক্সট দিন।
+
+```py
+from transformers import pipeline
+
+pipeline = pipeline(task="text-generation", model="Qwen/Qwen2.5-1.5B")
+pipeline("the secret to baking a really good cake is ")
+[{'generated_text': 'the secret to baking a really good cake is 1) to use the right ingredients and 2) to follow the recipe exactly. the recipe for the cake is as follows: 1 cup of sugar, 1 cup of flour, 1 cup of milk, 1 cup of butter, 1 cup of eggs, 1 cup of chocolate chips. if you want to make 2 cakes, how much sugar do you need? To make 2 cakes, you will need 2 cups of sugar.'}]
+```
+
+মডেলের সাথে চ্যাট করতে হলেও ব্যবহার প্যাটার্ন একই। শুধু পার্থক্য হলো, আপনাকে একটি চ্যাট হিস্ট্রি তৈরি করতে হবে (যা `Pipeline`-এ ইনপুট হিসেবে যাবে) আপনার আর সিস্টেমের মধ্যে।
+
+> [!TIP]
+> আপনি সরাসরি কমান্ড লাইন থেকেও একটি মডেলের সাথে চ্যাট করতে পারেন।
+> ```Shell
+> transformers chat Qwen/Qwen2.5-0.5B-Instruct
+> ```
+
+```Python
+import torch
+from transformers import pipeline
+
+chat = [
+    {"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."},
+    {"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"}
+]
+
+pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", dtype=torch.bfloat16, device_map="auto")
+response = pipeline(chat, max_new_tokens=512)
+print(response[0]["generated_text"][-1]["content"])
+```
+
+বিভিন্ন মোডালিটি এবং কাজের জন্য Pipeline কিভাবে কাজ করে তা দেখতে নিচের উদাহরণগুলো সম্প্রসারণ করুন।
+
+অটোমেটিক স্পিচ রিকগনিশন (ASR)
+
+```Python
+from transformers import pipeline
+
+pipeline = pipeline(task="automatic-speech-recognition", model="openai/whisper-large-v3")
+pipeline("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
+{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'}
+```
+
+
+ +
+ইমেজ ক্লাসিফিকেশন + +

+ +

+
+```py
+from transformers import pipeline
+
+pipeline = pipeline(task="image-classification", model="facebook/dinov2-small-imagenet1k-1-layer")
+pipeline("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
+[{'label': 'macaw', 'score': 0.997848391532898},
+ {'label': 'sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita',
+  'score': 0.0016551691805943847},
+ {'label': 'lorikeet', 'score': 0.00018523589824326336},
+ {'label': 'African grey, African gray, Psittacus erithacus',
+  'score': 7.85409429227002e-05},
+ {'label': 'quail', 'score': 5.502637941390276e-05}]
+```
+
+ +
+ভিজুয়াল কোয়েশ্চন আনসারিং + +

+ +

+
+```py
+from transformers import pipeline
+
+pipeline = pipeline(task="visual-question-answering", model="Salesforce/blip-vqa-base")
+pipeline(
+    image="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg",
+    question="What is in the image?",
+)
+[{'answer': 'statue of liberty'}]
+```
+
+ +## কেন Transformers ব্যবহার করবেন? + +1. সহজে ব্যবহারযোগ্য সর্বাধুনিক মডেল: + + * ন্যাচারাল ল্যাঙ্গুয়েজ আন্ডারস্ট্যান্ডিং ও জেনারেশন, কম্পিউটার ভিশন, অডিও, ভিডিও এবং মাল্টিমোডাল টাস্কে উচ্চ পারফরম্যান্স। + * গবেষক, ইঞ্জিনিয়ার এবং ডেভেলপারদের জন্য সহজে শুরু করার সুযোগ। + * মাত্র তিনটি ক্লাস শিখলেই ব্যবহার করা যায়। + * সব প্রি-ট্রেইন্ড মডেলের জন্য একটি একীভূত API। + +2. কম কম্পিউট খরচ, ছোট কার্বন ফুটপ্রিন্ট: + + * শূন্য থেকে ট্রেইন না করে ট্রেইন্ড মডেল শেয়ার করুন। + * কম্পিউট টাইম ও প্রোডাকশন খরচ কমান। + * সব ধরনের মোডালিটির জন্য ১০ লক্ষ+ প্রি-ট্রেইন্ড চেকপয়েন্টসহ ডজনখানেক মডেল আর্কিটেকচার। + +3. মডেলের লাইফসাইকেলের প্রতিটি ধাপে সঠিক ফ্রেমওয়ার্ক বেছে নিন: + + * মাত্র ৩ লাইনের কোডে সর্বাধুনিক মডেল ট্রেইন করুন। + * সহজে PyTorch / JAX / TF2.0 এর মধ্যে মডেল স্থানান্তর করুন। + * ট্রেইনিং, ইভ্যালুয়েশন ও প্রোডাকশনের জন্য আলাদা ফ্রেমওয়ার্ক ব্যবহার করুন। + +4. সহজেই মডেল বা উদাহরণ কাস্টমাইজ করুন: + + * প্রতিটি আর্কিটেকচারের জন্য এমন উদাহরণ দেওয়া আছে যা মূল লেখকদের প্রকাশিত ফলাফল পুনরুত্পাদন করতে সক্ষম। + * মডেলের অভ্যন্তরীণ অংশগুলো যতটা সম্ভব একভাবে এক্সপোজ করা হয়েছে। + * দ্রুত এক্সপেরিমেন্টের জন্য লাইব্রেরি ছাড়াও মডেল ফাইল ব্যবহার করা যায়। + + + +Hugging Face Enterprise Hub +
+ +## কেন Transformers ব্যবহার করবেন না? + +* এই লাইব্রেরি নিউরাল নেটওয়ার্কের জন্য ব্লক-মডিউল টুলবক্স নয়। মডেল ফাইলের কোডে অতিরিক্ত অ্যাবস্ট্র্যাকশন intentionally করা হয়নি, যাতে গবেষকরা দ্রুত প্রতিটি মডেলের উপর কাজ করতে পারে কোনো অতিরিক্ত ফাইল বা স্তরে না গিয়ে। +* ট্রেইনিং API মূলত Transformers-এর PyTorch মডেলের সাথে কাজ করার জন্য অপটিমাইজ করা হয়েছে। সাধারণ মেশিন লার্নিং লুপের জন্য, [Accelerate](https://huggingface.co/docs/accelerate) এর মতো অন্য লাইব্রেরি ব্যবহার করা উচিত। +* [উদাহরণ স্ক্রিপ্টগুলো](https://github.com/huggingface/transformers/tree/main/examples) শুধু *উদাহরণ*। এগুলো সরাসরি আপনার ব্যবহারের ক্ষেত্রে কাজ নাও করতে পারে, তাই কোড সামঞ্জস্য করতে হতে পারে। + +## Transformers দিয়ে ১০০টি প্রজেক্ট + +Transformers শুধু প্রি-ট্রেইন্ড মডেল ব্যবহার করার টুলকিট নয়, এটি একটি কমিউনিটি, যা Hugging Face Hub-এর চারপাশে তৈরি। আমরা চাই যে ডেভেলপার, গবেষক, শিক্ষার্থী, অধ্যাপক, ইঞ্জিনিয়ার বা যে কেউ তাদের স্বপ্নের প্রজেক্ট তৈরি করতে পারে। + +Transformers 100,000 স্টার উদযাপন করতে আমরা কমিউনিটিকে তুলে ধরতে [awesome-transformers](./awesome-transformers.md) পেজ তৈরি করেছি, যেখানে Transformers দিয়ে তৈরি ১০০টি অসাধারণ প্রজেক্ট তালিকাভুক্ত আছে। + +আপনার কোনো প্রজেক্ট আছে যা তালিকায় থাকা উচিত মনে করেন? তাহলে PR খুলে যুক্ত করুন। + +## উদাহরণ মডেল + +আপনি আমাদের অধিকাংশ মডেল সরাসরি তাদের [Hub মডেল পেজ](https://huggingface.co/models) থেকে পরীক্ষা করতে পারেন। + +নিচের প্রতিটি মোডালিটি এক্সপ্যান্ড করে বিভিন্ন ব্যবহার কেসের জন্য কয়েকটি উদাহরণ মডেল দেখুন। + + +
+অডিও + +* [Whisper](https://huggingface.co/openai/whisper-large-v3-turbo) দিয়ে অডিও ক্লাসিফিকেশন +* [Moonshine](https://huggingface.co/UsefulSensors/moonshine) দিয়ে অটোমেটিক স্পিচ রিকগনিশন +* [Wav2Vec2](https://huggingface.co/superb/wav2vec2-base-superb-ks) দিয়ে কীওয়ার্ড স্পটিং +* [Moshi](https://huggingface.co/kyutai/moshiko-pytorch-bf16) দিয়ে স্পিচ-টু-স্পিচ জেনারেশন +* [MusicGen](https://huggingface.co/facebook/musicgen-large) দিয়ে টেক্সট-টু-অডিও +* [Bark](https://huggingface.co/suno/bark) দিয়ে টেক্সট-টু-স্পিচ + + +
+ +
+কম্পিউটার ভিশন + +* [SAM](https://huggingface.co/facebook/sam-vit-base) দিয়ে স্বয়ংক্রিয় মাস্ক জেনারেশন +* [DepthPro](https://huggingface.co/apple/DepthPro-hf) দিয়ে গভীরতা অনুমান +* [DINO v2](https://huggingface.co/facebook/dinov2-base) দিয়ে চিত্র শ্রেণীকরণ +* [SuperPoint](https://huggingface.co/magic-leap-community/superpoint) দিয়ে কীপয়েন্ট সনাক্তকরণ +* [SuperGlue](https://huggingface.co/magic-leap-community/superglue_outdoor) দিয়ে কীপয়েন্ট ম্যাচিং +* [RT-DETRv2](https://huggingface.co/PekingU/rtdetr_v2_r50vd) দিয়ে অবজেক্ট সনাক্তকরণ +* [VitPose](https://huggingface.co/usyd-community/vitpose-base-simple) দিয়ে পোস অনুমান +* [OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_swin_large) দিয়ে ইউনিভার্সাল সেগমেন্টেশন +* [VideoMAE](https://huggingface.co/MCG-NJU/videomae-large) দিয়ে ভিডিও শ্রেণীকরণ + + +
+ +
+মাল্টিমোডাল + +* [Qwen2-Audio](https://huggingface.co/Qwen/Qwen2-Audio-7B) দিয়ে অডিও বা টেক্সট থেকে টেক্সট জেনারেশন +* [LayoutLMv3](https://huggingface.co/microsoft/layoutlmv3-base) দিয়ে ডকুমেন্ট প্রশ্নোত্তর +* [Qwen-VL](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct) দিয়ে ইমেজ বা টেক্সট থেকে টেক্সট জেনারেশন +* [BLIP-2](https://huggingface.co/Salesforce/blip2-opt-2.7b) দিয়ে ইমেজ ক্যাপশনিং +* [GOT-OCR2](https://huggingface.co/stepfun-ai/GOT-OCR-2.0-hf) দিয়ে OCR-ভিত্তিক ডকুমেন্ট আন্ডারস্ট্যান্ডিং +* [TAPAS](https://huggingface.co/google/tapas-base) দিয়ে টেবিল প্রশ্নোত্তর +* [Emu3](https://huggingface.co/BAAI/Emu3-Gen) দিয়ে ইউনিফাইড মাল্টিমোডাল আন্ডারস্ট্যান্ডিং এবং জেনারেশন +* [Llava-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf) দিয়ে ভিশন থেকে টেক্সট +* [Llava](https://huggingface.co/llava-hf/llava-1.5-7b-hf) দিয়ে ভিজুয়াল কোয়েশ্চন আনসারিং +* [Kosmos-2](https://huggingface.co/microsoft/kosmos-2-patch14-224) দিয়ে ভিজুয়াল রেফারিং এক্সপ্রেশন সেগমেন্টেশন + + +
+ +
+NLP
+
+* [ModernBERT](https://huggingface.co/answerdotai/ModernBERT-base) দিয়ে মাস্কড ওয়ার্ড কমপ্লিশন
+* [Gemma](https://huggingface.co/google/gemma-2-2b) দিয়ে নেমড এন্টিটি রিকগনিশন
+* [Mixtral](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) দিয়ে প্রশ্নোত্তর
+* [BART](https://huggingface.co/facebook/bart-large-cnn) দিয়ে সারসংক্ষেপ (Summarization)
+* [T5](https://huggingface.co/google-t5/t5-base) দিয়ে অনুবাদ
+* [Llama](https://huggingface.co/meta-llama/Llama-3.2-1B) দিয়ে টেক্সট জেনারেশন
+* [Qwen](https://huggingface.co/Qwen/Qwen2.5-0.5B) দিয়ে টেক্সট ক্লাসিফিকেশন
+
+ +## সাইটেশন +আমাদের [একটি পেপার](https://www.aclweb.org/anthology/2020.emnlp-demos.6/) আছে যা আপনি 🤗 Transformers লাইব্রেরির জন্য রেফারেন্স হিসেবে ব্যবহার করতে পারেন। + +```bibtex +@inproceedings{wolf-etal-2020-transformers, + title = "Transformers: State-of-the-Art Natural Language Processing", + author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush", + booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations", + month = oct, + year = "2020", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6", + pages = "38--45" +} +``` \ No newline at end of file diff --git a/i18n/README_de.md b/i18n/README_de.md index b913df894dc1..2c54965371c1 100644 --- a/i18n/README_de.md +++ b/i18n/README_de.md @@ -47,9 +47,11 @@ limitations under the License. తెలుగు | Français | Deutsch | + Italiano | Tiếng Việt | العربية | اردو | + বাংলা |

diff --git a/i18n/README_es.md b/i18n/README_es.md index d31b7f5f76c3..1a7a2256424a 100644 --- a/i18n/README_es.md +++ b/i18n/README_es.md @@ -47,9 +47,11 @@ limitations under the License. తెలుగు | Français | Deutsch | + Italiano | Tiếng Việt | العربية | اردو | + বাংলা |

diff --git a/i18n/README_fr.md b/i18n/README_fr.md index 6512b4af0700..17e6c0424269 100644 --- a/i18n/README_fr.md +++ b/i18n/README_fr.md @@ -47,9 +47,11 @@ limitations under the License. తెలుగు | Français | Deutsch | + Italiano | Tiếng Việt | العربية | اردو | + বাংলা |

diff --git a/i18n/README_hd.md b/i18n/README_hd.md index 1eb220efadc0..6c441088834c 100644 --- a/i18n/README_hd.md +++ b/i18n/README_hd.md @@ -72,9 +72,11 @@ checkpoint: जाँच बिंदु తెలుగు | Français | Deutsch | + Italiano | Tiếng Việt | العربية | اردو | + বাংলা |

diff --git a/i18n/README_it.md b/i18n/README_it.md new file mode 100644 index 000000000000..3b8d71bdb721 --- /dev/null +++ b/i18n/README_it.md @@ -0,0 +1,337 @@ + + +

+ + + + Hugging Face Transformers Library + +
+
+

+ +

+ Checkpoints on Hub + Build + GitHub + Documentation + GitHub release + Contributor Covenant + DOI +

+ +

+

+ English | + 简体中文 | + 繁體中文 | + 한국어 | + Español | + 日本語 | + हिन्दी | + Русский | + Português | + తెలుగు | + Français | + Deutsch | + Italiano | + Tiếng Việt | + العربية | + اردو | + বাংলা | +

+

+ +

+

Modelli preaddestrati all'avanguardia per l'inferenza e l'addestramento

+

+ +

+ +

+ +Transformers funge da framework di definizione dei modelli per modelli di machine learning all'avanguardia nei +modelli di testo, visione artificiale, audio, video e multimodali, sia per l'inferenza che per l'addestramento. + +Centralizza la definizione del modello in modo che tale definizione sia concordata all'interno dell'ecosistema. +`transformers` è il perno tra i framework: se una definizione di modello è supportata, sarà compatibile con la +maggior parte dei framework di addestramento (Axolotl, Unsloth, DeepSpeed, FSDP, PyTorch-Lightning, ...), motori +di inferenza (vLLM, SGLang, TGI, ...) e librerie di modellazione adiacenti (llama.cpp, mlx, ...) che sfruttano +la definizione del modello da `transformers`. + +Ci impegniamo a sostenere nuovi modelli all'avanguardia e a democratizzarne l'utilizzo rendendo la loro definizione +semplice, personalizzabile ed efficiente. + +Ci sono oltre 1 milione di Transformers [model checkpoint](https://huggingface.co/models?library=transformers&sort=trending) su [Hugging Face Hub](https://huggingface.com/models) che puoi utilizzare. + +Esplora oggi stesso l'[Hub](https://huggingface.com/) per trovare un modello e utilizzare Transformers per aiutarti a iniziare subito. + +## Installazione + +Transformers funziona con Python 3.9+ e [PyTorch](https://pytorch.org/get-started/locally/) 2.1+. + +Crea e attiva un ambiente virtuale con [venv](https://docs.python.org/3/library/venv.html) o [uv](https://docs.astral.sh/uv/), un pacchetto Python veloce basato su Rust e un gestore di progetti. + +```py +# venv +python -m venv .my-env +source .my-env/bin/activate +# uv +uv venv .my-env +source .my-env/bin/activate +``` + +Installa Transformers nel tuo ambiente virtuale. + +```py +# pip +pip install "transformers[torch]" + +# uv +uv pip install "transformers[torch]" +``` + +Installa Transformers dal sorgente se desideri le ultime modifiche nella libreria o sei interessato a contribuire. Tuttavia, la versione *più recente* potrebbe non essere stabile. Non esitare ad aprire una [issue](https://github.com/huggingface/transformers/issues) se riscontri un errore. + +```shell +git clone https://github.com/huggingface/transformers.git +cd transformers + +# pip +pip install .[torch] + +# uv +uv pip install .[torch] +``` + +## Quickstart + +Inizia subito a utilizzare Transformers con l'API [Pipeline](https://huggingface.co/docs/transformers/pipeline_tutorial). Pipeline è una classe di inferenza di alto livello che supporta attività di testo, audio, visione e multimodali. Gestisce la pre-elaborazione dell'input e restituisce l'output appropriato. + +Istanziare una pipeline e specificare il modello da utilizzare per la generazione di testo. Il modello viene scaricato e memorizzato nella cache in modo da poterlo riutilizzare facilmente. Infine, passare del testo per attivare il modello. + +```py +from transformers import pipeline + +pipeline = pipeline(task="text-generation", model="Qwen/Qwen2.5-1.5B") +pipeline("il segreto per preparare una torta davvero buona è ") +[{'generated_text': 'il segreto per preparare una torta davvero buona è 1) usare gli ingredienti giusti e 2) seguire alla lettera la ricetta. la ricetta della torta è la seguente: 1 tazza di zucchero, 1 tazza di farina, 1 tazza di latte, 1 tazza di burro, 1 tazza di uova, 1 tazza di gocce di cioccolato. se vuoi preparare 2 torte, quanto zucchero ti serve? Per preparare 2 torte, avrete bisogno di 2 tazze di zucchero.'}] +``` + +Per chattare con un modello, lo schema di utilizzo è lo stesso. 
L'unica differenza è che è necessario creare una cronologia delle chat (l'input per `Pipeline`) tra l'utente e il sistema. + +> [!TIP] +> È anche possibile chattare con un modello direttamente dalla riga di comando. +> ```shell +> transformers chat Qwen/Qwen2.5-0.5B-Instruct +> ``` + +```py +import torch +from transformers import pipeline + +chat = [ + {"role": "system", "content": "Sei un robot sfacciato e spiritoso, proprio come lo immaginava Hollywood nel 1986."}, + {"role": "user", "content": "Ehi, mi puoi suggerire qualcosa di divertente da fare a New York?"} +] + +pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", dtype=torch.bfloat16, device_map="auto") +response = pipeline(chat, max_new_tokens=512) +print(response[0]["generated_text"][-1]["content"]) +``` + +Espandi gli esempi riportati di seguito per vedere come funziona `Pipeline` per diverse modalità e attività. + +
+Riconoscimento vocale automatico + +```py +from transformers import pipeline + +pipeline = pipeline(task="automatic-speech-recognition", model="openai/whisper-large-v3") +pipeline("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") +{'text': ' Ho un sogno: che un giorno questa nazione si solleverà e vivrà il vero significato del suo credo.'} +``` + +
+ +
+Classificazione delle immagini + +

+ +

+ +```py +from transformers import pipeline + +pipeline = pipeline(task="image-classification", model="facebook/dinov2-small-imagenet1k-1-layer") +pipeline("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png") +[{'label': 'macaw', 'score': 0.997848391532898}, + {'label': 'cacatua dal ciuffo giallo, Kakatoe galerita, Cacatua galerita', + 'score': 0.0016551691805943847}, + {'label': 'lorichetto', 'score': 0.00018523589824326336}, + {'label': 'Pappagallo grigio africano, Psittacus erithacus', + 'score': 7.85409429227002e-05}, + {'label': 'quaglia', 'score': 5.502637941390276e-05}] +``` + +
+ +
+Risposta a domande visive + +

+ +

+ +```py +from transformers import pipeline + +pipeline = pipeline(task="visual-question-answering", model="Salesforce/blip-vqa-base") +pipeline( + image="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg", + question="Cosa c'è nell'immagine?", +) +[{'answer': 'statua della libertà'}] +``` + +
+ +## Perché dovrei usare Transformers? + +1. Modelli all'avanguardia facili da usare: + - Prestazioni elevate nella comprensione e generazione del linguaggio naturale, nella visione artificiale, nell'audio, nel video e nelle attività multimodali. + - Bassa barriera di ingresso per ricercatori, ingegneri e sviluppatori. + - Poche astrazioni rivolte all'utente con solo tre classi da imparare. + - Un'API unificata per l'utilizzo di tutti i nostri modelli preaddestrati. + +1. Riduzione dei costi di calcolo e dell'impronta di carbonio: + - Condivisione dei modelli addestrati invece di addestrarli da zero. + - Riduzione dei tempi di calcolo e dei costi di produzione. + - Decine di architetture di modelli con oltre 1 milione di checkpoint preaddestrati in tutte le modalità. + +1. Scegli il framework giusto per ogni fase del ciclo di vita di un modello: + - Addestra modelli all'avanguardia con sole 3 righe di codice. + - Sposta un singolo modello tra i framework PyTorch/JAX/TF2.0 a tuo piacimento. + - Scegli il framework giusto per l'addestramento, la valutazione e la produzione. + +1. Personalizza facilmente un modello o un esempio in base alle tue esigenze: + - Forniamo esempi per ogni architettura per riprodurre i risultati pubblicati dagli autori originali. + - Gli interni del modello sono esposti nel modo più coerente possibile. + - I file del modello possono essere utilizzati indipendentemente dalla libreria per esperimenti rapidi. + + + Hugging Face Enterprise Hub +
+ +## Perché non dovrei usare Transformers? + +- Questa libreria non è un toolbox modulare di blocchi costitutivi per reti neurali. Il codice nei file dei modelli non è stato rifattorizzato con ulteriori astrazioni di proposito, in modo che i ricercatori possano iterare rapidamente su ciascuno dei modelli senza dover approfondire ulteriori astrazioni/file. +- L'API di addestramento è ottimizzata per funzionare con i modelli PyTorch forniti da Transformers. Per i loop generici di machine learning, è necessario utilizzare un'altra libreria come [Accelerate](https://huggingface.co/docs/accelerate). +- Gli [script di esempio](https://github.com/huggingface/transformers/tree/main/examples) sono solo *esempi*. Potrebbero non funzionare immediatamente nel vostro caso specifico e potrebbe essere necessario adattare il codice affinché funzioni. + +## 100 progetti che usano Transformers + +Transformers è più di un semplice toolkit per l'utilizzo di modelli preaddestrati, è una comunità di progetti costruita attorno ad esso e all' +Hugging Face Hub. Vogliamo che Transformers consenta a sviluppatori, ricercatori, studenti, professori, ingegneri e chiunque altro +di realizzare i propri progetti dei sogni. + +Per celebrare le 100.000 stelle di Transformers, abbiamo voluto puntare i riflettori sulla +comunità con la pagina [awesome-transformers](./awesome-transformers.md), che elenca 100 +incredibili progetti realizzati con Transformers. + +Se possiedi o utilizzi un progetto che ritieni debba essere inserito nell'elenco, apri una PR per aggiungerlo! + +## Modelli di esempio + +È possibile testare la maggior parte dei nostri modelli direttamente sulle loro [pagine dei modelli Hub](https://huggingface.co/models). + +Espandi ciascuna modalità qui sotto per vedere alcuni modelli di esempio per vari casi d'uso. + +
+Audio + +- Classificazione audio con [Whisper](https://huggingface.co/openai/whisper-large-v3-turbo) +- Riconoscimento vocale automatico con [Moonshine](https://huggingface.co/UsefulSensors/moonshine) +- Individuazione delle keyword con [Wav2Vec2](https://huggingface.co/superb/wav2vec2-base-superb-ks) +- Generazione da discorso a discorso con [Moshi](https://huggingface.co/kyutai/moshiko-pytorch-bf16) +- Testo in audio con [MusicGen](https://huggingface.co/facebook/musicgen-large) +- Sintesi vocale con [Bark](https://huggingface.co/suno/bark) + +
+ +
+Visione artificiale + +- Generazione automatica di maschere con [SAM](https://huggingface.co/facebook/sam-vit-base) +- Stima della profondità con [DepthPro](https://huggingface.co/apple/DepthPro-hf) +- Classificazione delle immagini con [DINO v2](https://huggingface.co/facebook/dinov2-base) +- Rilevamento dei punti chiave con [SuperPoint](https://huggingface.co/magic-leap-community/superpoint) +- Corrispondenza dei punti chiave con [SuperGlue](https://huggingface.co/magic-leap-community/superglue_outdoor) +- Rilevamento degli oggetti con [RT-DETRv2](https://huggingface.co/PekingU/rtdetr_v2_r50vd) +- Stima della posa con [VitPose](https://huggingface.co/usyd-community/vitpose-base-simple) +- Segmentazione universale con [OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_swin_large) +- Classificazione dei video con [VideoMAE](https://huggingface.co/MCG-NJU/videomae-large) + +
+ +
+Multimodale
+
+- Da audio o testo a testo con [Qwen2-Audio](https://huggingface.co/Qwen/Qwen2-Audio-7B)
+- Risposta a domande su documenti con [LayoutLMv3](https://huggingface.co/microsoft/layoutlmv3-base)
+- Da immagine o testo a testo con [Qwen-VL](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct)
+- Descrizione delle immagini con [BLIP-2](https://huggingface.co/Salesforce/blip2-opt-2.7b)
+- Comprensione di documenti basata su OCR con [GOT-OCR2](https://huggingface.co/stepfun-ai/GOT-OCR-2.0-hf)
+- Risposta a domande su tabelle con [TAPAS](https://huggingface.co/google/tapas-base)
+- Comprensione e generazione multimodale unificata con [Emu3](https://huggingface.co/BAAI/Emu3-Gen)
+- Da visione a testo con [Llava-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf)
+- Risposta a domande visive con [Llava](https://huggingface.co/llava-hf/llava-1.5-7b-hf)
+- Segmentazione di espressioni referenziali visive con [Kosmos-2](https://huggingface.co/microsoft/kosmos-2-patch14-224)
+
+
+ +
+NLP + +- Completamento parole mascherate con [ModernBERT](https://huggingface.co/answerdotai/ModernBERT-base) +- Riconoscimento delle entità denominate con [Gemma](https://huggingface.co/google/gemma-2-2b) +- Risposte alle domande con [Mixtral](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) +- Sintesi con [BART](https://huggingface.co/facebook/bart-large-cnn) +- Traduzione con [T5](https://huggingface.co/google-t5/t5-base) +- Generazione di testo con [Llama](https://huggingface.co/meta-llama/Llama-3.2-1B) +- Classificazione del testo con [Qwen](https://huggingface.co/Qwen/Qwen2.5-0.5B) + +
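+Ad esempio, uno schizzo minimo e puramente illustrativo (basato sul primo modello dell'elenco qui sopra) per il completamento di parole mascherate con ModernBERT tramite `Pipeline`:
+
+```py
+from transformers import pipeline
+
+fill_mask = pipeline(task="fill-mask", model="answerdotai/ModernBERT-base")
+# Il token di maschera di ModernBERT è [MASK]; la pipeline restituisce i completamenti più probabili.
+fill_mask("Plants create [MASK] through a process known as photosynthesis.")
+```
+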
+ +## Citazione + +Ora abbiamo un [paper](https://www.aclweb.org/anthology/2020.emnlp-demos.6/) che puoi citare per la libreria 🤗 Transformers: +```bibtex +@inproceedings{wolf-etal-2020-transformers, + title = "Transformers: State-of-the-Art Natural Language Processing", + author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush", + booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations", + month = oct, + year = "2020", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6", + pages = "38--45" +} +``` diff --git a/i18n/README_ja.md b/i18n/README_ja.md index 5d5db4993239..98ad2643d23c 100644 --- a/i18n/README_ja.md +++ b/i18n/README_ja.md @@ -82,9 +82,11 @@ user: ユーザ తెలుగు | Français | Deutsch | + Italiano | Tiếng Việt | العربية | اردو | + বাংলা |

diff --git a/i18n/README_ko.md b/i18n/README_ko.md index fded56a37c9b..a3e6b95cecb5 100644 --- a/i18n/README_ko.md +++ b/i18n/README_ko.md @@ -47,9 +47,11 @@ limitations under the License. తెలుగు | Français | Deutsch | + Italiano | Tiếng Việt | العربية | اردو | + বাংলা |

diff --git a/i18n/README_pt-br.md b/i18n/README_pt-br.md index e3c71c6a3f35..bdd464ad0664 100644 --- a/i18n/README_pt-br.md +++ b/i18n/README_pt-br.md @@ -47,9 +47,11 @@ limitations under the License. తెలుగు | Français | Deutsch | + Italiano | Tiếng Việt | العربية | اردو | + বাংলা |

diff --git a/i18n/README_ru.md b/i18n/README_ru.md index c30237fef885..3bcaab10f20b 100644 --- a/i18n/README_ru.md +++ b/i18n/README_ru.md @@ -47,9 +47,11 @@ limitations under the License. తెలుగు | Français | Deutsch | + Italiano | Tiếng Việt | العربية | اردو | + বাংলা |

diff --git a/i18n/README_te.md b/i18n/README_te.md index aee579b52abd..225bd74bb025 100644 --- a/i18n/README_te.md +++ b/i18n/README_te.md @@ -49,9 +49,11 @@ limitations under the License. తెలుగు | Français | Deutsch | + Italiano | Tiếng Việt | العربية | اردو | + বাংলা |

diff --git a/i18n/README_ur.md b/i18n/README_ur.md index bba5988e7717..215191e4cbb2 100644 --- a/i18n/README_ur.md +++ b/i18n/README_ur.md @@ -47,8 +47,10 @@ limitations under the License. తెలుగు | Français | Deutsch | + Italiano | Tiếng Việt | العربية | + বাংলা | اردو |

diff --git a/i18n/README_vi.md b/i18n/README_vi.md index f78e3b6d4e9b..3e0146c1ddb0 100644 --- a/i18n/README_vi.md +++ b/i18n/README_vi.md @@ -47,9 +47,11 @@ limitations under the License. తెలుగు | Français | Deutsch | + Italiano | Tiếng việt | العربية | اردو | + বাংলা |

diff --git a/i18n/README_zh-hans.md b/i18n/README_zh-hans.md index 8220e403b8b2..4c5859592c89 100644 --- a/i18n/README_zh-hans.md +++ b/i18n/README_zh-hans.md @@ -72,9 +72,11 @@ checkpoint: 检查点 తెలుగు | Français | Deutsch | + Italiano | Tiếng Việt | العربية | اردو | + বাংলা |

diff --git a/i18n/README_zh-hant.md b/i18n/README_zh-hant.md index da6ed40910ea..5842e57255c3 100644 --- a/i18n/README_zh-hant.md +++ b/i18n/README_zh-hant.md @@ -84,9 +84,11 @@ user: 使用者 తెలుగు | Français | Deutsch | + Italiano | Tiếng Việt | العربية | اردو | + বাংলা |

diff --git a/notebooks/README.md b/notebooks/README.md index 4d31797104f8..aed435878804 100644 --- a/notebooks/README.md +++ b/notebooks/README.md @@ -22,7 +22,6 @@ Also, we would like to list here interesting content created by the community. If you wrote some notebook(s) leveraging 🤗 Transformers and would like to be listed here, please open a Pull Request so it can be included under the Community notebooks. - ## Hugging Face's notebooks 🤗 ### Documentation notebooks @@ -38,7 +37,6 @@ You can open any page of the documentation as a notebook in Colab (there is a bu | [Summary of the tokenizers](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb) | The differences between the tokenizers algorithm |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb)| | [Multilingual models](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb) | How to use the multilingual models of the library |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)| - ### PyTorch Examples #### Natural Language Processing[[pytorch-nlp]] @@ -88,7 +86,6 @@ You can open any page of the documentation as a notebook in Colab (there is a bu | [How to fine-tune a Nucleotide Transformer model](https://github.com/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling.ipynb) | See how to tokenize DNA and fine-tune a large pre-trained DNA "language" model | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling.ipynb) | | [Fine-tune a Nucleotide Transformer model with LoRA](https://github.com/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling_with_peft.ipynb) | Train even larger DNA models in a memory-efficient way | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling_with_peft.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling_with_peft.ipynb) | - #### Other modalities[[pytorch-other]] | Notebook | Description | | | @@ -101,7 +98,6 @@ You can open any page of the documentation as a notebook in Colab (there is a bu |:----------|:-------------|:-------------|------:| | [How to export model to 
ONNX](https://github.com/huggingface/notebooks/blob/main/examples/onnx-export.ipynb)| Highlight how to export and run inference workloads through ONNX | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/onnx-export.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/onnx-export.ipynb)| - ### Optimum notebooks 🤗 [Optimum](https://github.com/huggingface/optimum) is an extension of 🤗 Transformers, providing a set of performance optimization tools enabling maximum efficiency to train and run models on targeted hardwares. diff --git a/pyproject.toml b/pyproject.toml index 5d3a9436eb3f..80983fd49703 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ exclude_lines = [ ] [tool.ruff] -target-version = "py39" +target-version = "py310" line-length = 119 [tool.ruff.lint] @@ -27,7 +27,10 @@ line-length = 119 # UP031: Use format specifiers instead of percent format # UP004: Class `XXX` inherits from `object` # UP028: Checks for for loops that can be replaced with yield from expressions -ignore = ["C901", "E501", "E741", "F402", "F823", "SIM1", "SIM300", "SIM212", "SIM905", "UP009", "UP015", "UP031", "UP028", "UP004"] +# UP045: Use `X | None` for type annotations +# UP007: Use `X | Y` for type annotations +# UP035: temporarily disabled to minimize upgrade changes +ignore = ["C901", "E501", "E741", "F402", "F823", "SIM1", "SIM300", "SIM212", "SIM905", "UP009", "UP015", "UP031", "UP028", "UP004", "UP045", "UP007", "UP035"] # RUF013: Checks for the use of implicit Optional # in type annotations when the default parameter value is None. select = ["C", "E", "F", "I", "W", "RUF013", "PERF102", "PLC1802", "PLC0208", "SIM", "UP"] diff --git a/setup.py b/setup.py index 9f3bb1750597..86891a483ffb 100644 --- a/setup.py +++ b/setup.py @@ -160,7 +160,7 @@ "rhoknp>=1.1.0,<1.3.1", "rjieba", "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1", - "ruff==0.11.2", + "ruff==0.13.1", # `sacrebleu` not used in `transformers`. However, it is needed in several tests, when a test calls # `evaluate.load("sacrebleu")`. This metric is used in the examples that we use to test the `Trainer` with, in the # `Trainer` tests (see references to `run_translation.py`). @@ -461,7 +461,7 @@ def run(self): setup( name="transformers", - version="4.57.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.57.1", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)", author_email="transformers@huggingface.co", description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow", diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 2cf1d5970b54..9bc547ddcd38 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -18,7 +18,7 @@ # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names # in the namespace without actually importing anything (and especially none of the backends). 
-__version__ = "4.57.0.dev0" +__version__ = "4.57.1" from pathlib import Path from typing import TYPE_CHECKING @@ -928,7 +928,6 @@ from .utils import is_torch_npu_available as is_torch_npu_available from .utils import is_torch_xla_available as is_torch_xla_available from .utils import is_torch_xpu_available as is_torch_xpu_available - from .utils import logging as logging # bitsandbytes config from .utils.quantization_config import AqlmConfig as AqlmConfig diff --git a/src/transformers/activations.py b/src/transformers/activations.py index 8bfd517add9f..7642e8aa238a 100644 --- a/src/transformers/activations.py +++ b/src/transformers/activations.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import functools import math from collections import OrderedDict @@ -26,7 +27,8 @@ logger = logging.get_logger(__name__) -class PytorchGELUTanh(nn.Module): +@use_kernel_forward_from_hub("GeluTanh") +class GELUTanh(nn.Module): """ A fast C implementation of the tanh approximation of the GeLU activation function. See https://huggingface.co/papers/1606.08415. @@ -35,8 +37,18 @@ class PytorchGELUTanh(nn.Module): match due to rounding errors. """ + def __init__(self, use_gelu_tanh_python: bool = False): + super().__init__() + if use_gelu_tanh_python: + self.act = self._gelu_tanh_python + else: + self.act = functools.partial(nn.functional.gelu, approximate="tanh") + + def _gelu_tanh_python(self, input: Tensor) -> Tensor: + return input * 0.5 * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0)))) + def forward(self, input: Tensor) -> Tensor: - return nn.functional.gelu(input, approximate="tanh") + return self.act(input) @use_kernel_forward_from_hub("NewGELU") @@ -50,6 +62,7 @@ def forward(self, input: Tensor) -> Tensor: return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0)))) +@use_kernel_forward_from_hub("GeLU") class GELUActivation(nn.Module): """ Original Implementation of the GELU activation function in Google BERT repo when initially created. For @@ -72,6 +85,20 @@ def forward(self, input: Tensor) -> Tensor: return self.act(input) +@use_kernel_forward_from_hub("SiLU") +class SiLUActivation(nn.Module): + """ + See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear + Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function + Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated + Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with + later. 
+ """ + + def forward(self, input: Tensor) -> Tensor: + return nn.functional.silu(input) + + @use_kernel_forward_from_hub("FastGELU") class FastGELUActivation(nn.Module): """ @@ -290,7 +317,8 @@ def forward(self, input: Tensor) -> Tensor: "gelu_fast": FastGELUActivation, "gelu_new": NewGELUActivation, "gelu_python": (GELUActivation, {"use_gelu_python": True}), - "gelu_pytorch_tanh": PytorchGELUTanh, + "gelu_pytorch_tanh": GELUTanh, + "gelu_python_tanh": (GELUTanh, {"use_gelu_tanh_python": True}), "gelu_accurate": AccurateGELUActivation, "laplace": LaplaceActivation, "leaky_relu": nn.LeakyReLU, @@ -301,7 +329,7 @@ def forward(self, input: Tensor) -> Tensor: "relu2": ReLUSquaredActivation, "relu6": nn.ReLU6, "sigmoid": nn.Sigmoid, - "silu": nn.SiLU, + "silu": SiLUActivation, "swish": nn.SiLU, "tanh": nn.Tanh, "prelu": nn.PReLU, diff --git a/src/transformers/audio_utils.py b/src/transformers/audio_utils.py index e848f558738c..5de56618014e 100644 --- a/src/transformers/audio_utils.py +++ b/src/transformers/audio_utils.py @@ -23,8 +23,11 @@ import warnings from collections.abc import Sequence from io import BytesIO -from typing import Any, Optional, Union +from typing import TYPE_CHECKING, Any, Optional, Union + +if TYPE_CHECKING: + import torch import numpy as np import requests from packaging import version @@ -51,7 +54,7 @@ if is_torchcodec_available(): TORCHCODEC_VERSION = version.parse(importlib.metadata.version("torchcodec")) -AudioInput = Union[np.ndarray, "torch.Tensor", Sequence[np.ndarray], Sequence["torch.Tensor"]] # noqa: F821 +AudioInput = Union[np.ndarray, "torch.Tensor", Sequence[np.ndarray], Sequence["torch.Tensor"]] def load_audio(audio: Union[str, np.ndarray], sampling_rate=16000, timeout=None) -> np.ndarray: @@ -78,9 +81,7 @@ def load_audio(audio: Union[str, np.ndarray], sampling_rate=16000, timeout=None) audio = load_audio_torchcodec(audio, sampling_rate=sampling_rate) else: audio = load_audio_librosa(audio, sampling_rate=sampling_rate, timeout=timeout) - elif isinstance(audio, np.ndarray): - audio = audio - else: + elif not isinstance(audio, np.ndarray): raise TypeError( "Incorrect format used for `audio`. Should be an url linking to an audio, a local path, or numpy array." ) @@ -318,9 +319,7 @@ def mel_to_hertz(mels: Union[float, np.ndarray], mel_scale: str = "htk") -> Unio return freq -def hertz_to_octave( - freq: Union[float, np.ndarray], tuning: Optional[float] = 0.0, bins_per_octave: Optional[int] = 12 -): +def hertz_to_octave(freq: Union[float, np.ndarray], tuning: float = 0.0, bins_per_octave: int = 12): """ Convert frequency from hertz to fractional octave numbers. Adapted from *librosa*. @@ -370,7 +369,7 @@ def chroma_filter_bank( tuning: float = 0.0, power: Optional[float] = 2.0, weighting_parameters: Optional[tuple[float, float]] = (5.0, 2.0), - start_at_c_chroma: Optional[bool] = True, + start_at_c_chroma: bool = True, ): """ Creates a chroma filter bank, i.e a linear transformation to project spectrogram bins onto chroma bins. @@ -391,7 +390,7 @@ def chroma_filter_bank( weighting_parameters (`tuple[float, float]`, *optional*, defaults to `(5., 2.)`): If specified, apply a Gaussian weighting parameterized by the first element of the tuple being the center and the second element being the Gaussian half-width. - start_at_c_chroma (`float`, *optional*, defaults to `True`): + start_at_c_chroma (`bool`, *optional*, defaults to `True`): If True, the filter bank will start at the 'C' pitch class. Otherwise, it will start at 'A'. 
Returns: `np.ndarray` of shape `(num_frequency_bins, num_chroma)` @@ -586,7 +585,7 @@ def window_function( window = np.hamming(length) elif name in ["hann", "hann_window"]: window = np.hanning(length) - elif name in ["povey"]: + elif name == "povey": window = np.power(np.hanning(length), 0.85) else: raise ValueError(f"Unknown window function '{name}'") @@ -627,7 +626,7 @@ def spectrogram( reference: float = 1.0, min_value: float = 1e-10, db_range: Optional[float] = None, - remove_dc_offset: Optional[bool] = None, + remove_dc_offset: bool = False, dtype: np.dtype = np.float32, ) -> np.ndarray: """ @@ -838,7 +837,7 @@ def spectrogram_batch( reference: float = 1.0, min_value: float = 1e-10, db_range: Optional[float] = None, - remove_dc_offset: Optional[bool] = None, + remove_dc_offset: bool = False, dtype: np.dtype = np.float32, ) -> list[np.ndarray]: """ diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py index e6f2645a766e..99beb0b610a1 100644 --- a/src/transformers/cache_utils.py +++ b/src/transformers/cache_utils.py @@ -395,7 +395,12 @@ def update( if not self.is_initialized: self.lazy_initialization(key_states) - cache_position = cache_kwargs.get("cache_position") + # Some old models give None for `cache_position` or even omit passing `cache_kwargs` when used as cross-attention, + # in which case we should copy the whole Layer (key_states.shape[-2] == self.max_cache_len) + cache_position = cache_kwargs.get("cache_position") if cache_kwargs is not None else None + cache_position = ( + cache_position if cache_position is not None else torch.arange(key_states.shape[-2], device=self.device) + ) cumulative_length = self.cumulative_length is_full = cumulative_length >= self.max_cache_len @@ -790,7 +795,7 @@ def early_initialization( for layer in self.layers: layer.lazy_initialization(fake_keys_tensor) - def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + def get_seq_length(self, layer_idx: int = 0) -> int: """Returns the sequence length of the cache for the given layer.""" if layer_idx >= len(self.layers): return 0 @@ -955,17 +960,19 @@ def __init__( layers = [] # If a config is passed, use it to infer the layer types and initialize accordingly if config is not None: - config = config.get_text_config(decoder=True) - sliding_window = getattr(config, "sliding_window", None) or getattr(config, "attention_chunk_size", None) - layer_types = getattr(config, "layer_types", None) + decoder_config = config.get_text_config(decoder=True) + sliding_window = getattr(decoder_config, "sliding_window", None) or getattr( + decoder_config, "attention_chunk_size", None + ) + layer_types = getattr(decoder_config, "layer_types", None) if layer_types is None: layer_types = [ "sliding_attention" if sliding_window is not None else "full_attention" - for _ in range(config.num_hidden_layers) + for _ in range(decoder_config.num_hidden_layers) ] # Some models have shared layers thus no cache is needed for them (e.g. 
Gemma3n) - if hasattr(config, "num_kv_shared_layers"): - layer_types = layer_types[: -config.num_kv_shared_layers] + if hasattr(decoder_config, "num_kv_shared_layers"): + layer_types = layer_types[: -decoder_config.num_kv_shared_layers] for layer_type in layer_types: # From a cache point of view, both sliding and chunked are the same in how they should behave and how many @@ -1286,7 +1293,7 @@ def from_legacy_cache( cache.is_updated[layer_idx] = True return cache - def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + def get_seq_length(self, layer_idx: int = 0) -> int: """Returns the sequence length of the cached states. A layer index can be optionally passed.""" return self.self_attention_cache.get_seq_length(layer_idx) diff --git a/src/transformers/commands/add_new_model_like.py b/src/transformers/commands/add_new_model_like.py index ffff54df93ba..fce524d4a6c0 100644 --- a/src/transformers/commands/add_new_model_like.py +++ b/src/transformers/commands/add_new_model_like.py @@ -755,7 +755,7 @@ def register_subcommand(parser: ArgumentParser): ) add_new_model_like_parser.set_defaults(func=add_new_model_like_command_factory) - def __init__(self, path_to_repo=None, *args): + def __init__(self, path_to_repo=None, **kwargs): ( self.old_model_infos, self.new_lowercase_name, diff --git a/src/transformers/commands/chat.py b/src/transformers/commands/chat.py index 70ee41c0c514..6ddf90164ba7 100644 --- a/src/transformers/commands/chat.py +++ b/src/transformers/commands/chat.py @@ -40,6 +40,12 @@ from transformers.utils import is_rich_available, is_torch_available +try: + import readline # noqa importing this enables GNU readline capabilities +except ImportError: + # some platforms may not support readline: https://docs.python.org/3/library/readline.html + pass + if platform.system() != "Windows": import pwd @@ -53,9 +59,7 @@ from transformers import ( AutoModelForCausalLM, - AutoTokenizer, BitsAndBytesConfig, - GenerationConfig, ) ALLOWED_KEY_CHARS = set(string.ascii_letters + string.whitespace) @@ -437,8 +441,7 @@ def parse_generate_flags(self, generate_flags: list[str]) -> dict: # 2. b. 
strings should be quoted def is_number(s: str) -> bool: # handle negative numbers - if s.startswith("-"): - s = s[1:] + s = s.removeprefix("-") return s.replace(".", "", 1).isdigit() generate_flags_as_dict = {k: f'"{v}"' if not is_number(v) else v for k, v in generate_flags_as_dict.items()} @@ -528,7 +531,7 @@ def parse_eos_tokens( # ----------------------------------------------------------------------------------------------------------------- # Model loading and performance automation methods @staticmethod - def get_quantization_config(model_args: ChatArguments) -> Optional["BitsAndBytesConfig"]: + def get_quantization_config(model_args: ChatArguments) -> Optional[BitsAndBytesConfig]: if model_args.load_in_4bit: quantization_config = BitsAndBytesConfig( load_in_4bit=True, @@ -684,7 +687,6 @@ async def _inner_run(self): model = self.args.model_name_or_path + "@" + self.args.model_revision host = "http://localhost" if self.args.host == "localhost" else self.args.host - client = AsyncInferenceClient(f"{host}:{self.args.port}") args = self.args if args.examples_path is None: @@ -707,48 +709,47 @@ async def _inner_run(self): # Starts the session with a minimal help message at the top, so that a user doesn't get stuck interface.print_help(minimal=True) - while True: - try: - user_input = interface.input() - - # User commands - if user_input.startswith("!"): - # `!exit` is special, it breaks the loop - if user_input == "!exit": - break - else: - chat, valid_command, generation_config, model_kwargs = self.handle_non_exit_user_commands( - user_input=user_input, - args=args, - interface=interface, - examples=examples, - generation_config=generation_config, - model_kwargs=model_kwargs, - chat=chat, - ) - # `!example` sends a user message to the model - if not valid_command or not user_input.startswith("!example"): - continue - else: - chat.append({"role": "user", "content": user_input}) - - stream = client.chat_completion( - chat, - stream=True, - extra_body={ - "generation_config": generation_config.to_json_string(), - "model": model, - }, - ) - model_output = await interface.stream_output(stream) + async with AsyncInferenceClient(f"{host}:{self.args.port}") as client: + while True: + try: + user_input = interface.input() + + # User commands + if user_input.startswith("!"): + # `!exit` is special, it breaks the loop + if user_input == "!exit": + break + else: + chat, valid_command, generation_config, model_kwargs = self.handle_non_exit_user_commands( + user_input=user_input, + args=args, + interface=interface, + examples=examples, + generation_config=generation_config, + model_kwargs=model_kwargs, + chat=chat, + ) + # `!example` sends a user message to the model + if not valid_command or not user_input.startswith("!example"): + continue + else: + chat.append({"role": "user", "content": user_input}) + + stream = client.chat_completion( + chat, + stream=True, + extra_body={ + "generation_config": generation_config.to_json_string(), + "model": model, + }, + ) - chat.append({"role": "assistant", "content": model_output}) + model_output = await interface.stream_output(stream) - except KeyboardInterrupt: - break - finally: - await client.close() + chat.append({"role": "assistant", "content": model_output}) + except KeyboardInterrupt: + break if __name__ == "__main__": diff --git a/src/transformers/commands/env.py b/src/transformers/commands/env.py index 983a858cd952..e15a699e80f6 100644 --- a/src/transformers/commands/env.py +++ b/src/transformers/commands/env.py @@ -14,7 +14,6 @@ import contextlib 
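A minimal sketch of why the chat loop above moves to `async with`: the client's cleanup runs on normal exit, on exceptions, and on an interrupt escaping the loop body, replacing the manual `finally: await client.close()`. `FakeAsyncClient` is a hypothetical stand-in for `AsyncInferenceClient`.

import asyncio

class FakeAsyncClient:
    # Hypothetical stand-in for an async client that supports `async with`.
    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc, tb):
        await self.close()

    async def close(self):
        print("client closed")

    async def chat_completion(self, prompt: str) -> str:
        return f"echo: {prompt}"

async def main():
    # close() is guaranteed to run however the block is left.
    async with FakeAsyncClient() as client:
        print(await client.chat_completion("hello"))

asyncio.run(main())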
-import importlib.util import io import os import platform @@ -27,7 +26,6 @@ from ..utils import ( is_accelerate_available, is_flax_available, - is_safetensors_available, is_tf_available, is_torch_available, is_torch_hpu_available, @@ -61,18 +59,13 @@ def __init__(self, accelerate_config_file, *args) -> None: self._accelerate_config_file = accelerate_config_file def run(self): - safetensors_version = "not installed" - if is_safetensors_available(): - import safetensors + import safetensors - safetensors_version = safetensors.__version__ - elif importlib.util.find_spec("safetensors") is not None: - import safetensors - - safetensors_version = f"{safetensors.__version__} but is ignored because of PyTorch version too old." + safetensors_version = safetensors.__version__ accelerate_version = "not installed" accelerate_config = accelerate_config_str = "not found" + if is_accelerate_available(): import accelerate from accelerate.commands.config import default_config_file, load_config_from_file diff --git a/src/transformers/commands/serving.py b/src/transformers/commands/serving.py index 33a48aed7e64..970d59c96e74 100644 --- a/src/transformers/commands/serving.py +++ b/src/transformers/commands/serving.py @@ -31,7 +31,7 @@ from dataclasses import dataclass, field from io import BytesIO from threading import Thread -from typing import Optional, Union +from typing import Optional, TypedDict, Union from huggingface_hub import model_info from huggingface_hub.constants import HF_HUB_OFFLINE @@ -141,7 +141,7 @@ class TransformersTranscriptionCreateParams(TranscriptionCreateParamsBase, total file: bytes # Overwritten -- pydantic isn't happy with `typing.IO[bytes]`, present in the original type generation_config: str - stream: Optional[bool] = False + stream: bool = False # Contrarily to OpenAI's output types, input types are `TypedDict`, which don't have built-in validation. response_validator = TypeAdapter(TransformersResponseCreateParamsStreaming) @@ -528,7 +528,7 @@ def __init__(self, args: ServeArguments): def _validate_request( self, request: dict, - schema: "_TypedDictMeta", # noqa: F821 + schema: TypedDict, validator: "TypeAdapter", unused_fields: set, ): @@ -538,7 +538,7 @@ def _validate_request( Args: request (`dict`): The request to validate. - schema (`_TypedDictMeta`): + schema (`TypedDict`): The schema of the request to validate. It is a `TypedDict` definition. validator (`TypeAdapter`): The validator to use to validate the request. Built from `schema`. 
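A small sketch of the validation pattern described in that docstring, assuming pydantic's `TypeAdapter`; the `TranscriptionParams` schema below is a simplified stand-in for the real request types.

from typing import TypedDict
from pydantic import TypeAdapter, ValidationError

class TranscriptionParams(TypedDict, total=False):
    # Simplified stand-in for the serve request schema.
    file: bytes
    generation_config: str
    stream: bool

validator = TypeAdapter(TranscriptionParams)
request = {"file": b"...", "stream": [1, 2], "temperature": 0.7}

# Fields the schema does not know about are reported rather than rejected.
unused_fields = set(request) - set(TranscriptionParams.__annotations__)
try:
    validator.validate_python({k: v for k, v in request.items() if k not in unused_fields})
except ValidationError as err:
    print("invalid request:", err.errors()[0]["loc"], err.errors()[0]["msg"])
print("unused fields:", unused_fields)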
@@ -600,7 +600,7 @@ def validate_transcription_request(self, request: dict): def build_chat_completion_chunk( self, - request_id: Optional[str] = "", + request_id: str = "", content: Optional[int] = None, model: Optional[str] = None, role: Optional[str] = None, @@ -1026,7 +1026,9 @@ def generate_chat_completion(self, req: dict) -> Generator[str, None, None]: last_kv_cache = None if self.is_continuation(req) and not must_discard_cache: - last_kv_cache = self.last_kv_cache + seq_len = self.last_kv_cache.get_seq_length() + if inputs["input_ids"].shape[-1] > seq_len: + last_kv_cache = self.last_kv_cache generation_kwargs = { **inputs, @@ -1064,8 +1066,7 @@ def generate_with_cache(**kwargs): for result in streamer: # Temporary hack for GPTOS 3: don't emit the final "<|return|>" if "gptoss" in model.config.architectures[0].lower(): - if result.endswith("<|return|>"): - result = result[: -len("<|return|>")] + result = result.removesuffix("<|return|>") results += result # (related to temporary hack 2) @@ -1213,7 +1214,9 @@ def generate_response(self, req: dict) -> Generator[str, None, None]: last_kv_cache = None if self.is_continuation(req) and not must_discard_cache: - last_kv_cache = self.last_kv_cache + seq_len = self.last_kv_cache.get_seq_length() + if inputs["input_ids"].shape[-1] > seq_len: + last_kv_cache = self.last_kv_cache generation_kwargs = { "inputs": inputs, @@ -1321,8 +1324,7 @@ def generate_with_cache(**kwargs): for result in streamer: # Temporary hack for GPTOS 3: don't emit the final "<|return|>" if "gptoss" in model.config.architectures[0].lower(): - if result.endswith("<|return|>"): - result = result[: -len("<|return|>")] + result = result.removesuffix("<|return|>") results += result # (related to temporary hack 2) diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index a9e7c9bff5bc..aa32734ffb38 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -1454,7 +1454,7 @@ def pre_tokenizer(self, replacement, add_prefix_space): class HeliumConverter(SpmConverter): handle_byte_fallback = True - def __init__(self, vocab_file=None, *args): + def __init__(self, vocab_file=None, **kwargs): requires_backends(self, "protobuf") Converter.__init__(self, vocab_file) @@ -1540,6 +1540,54 @@ def post_processor(self): ) +class ParakeetConverter(SpmConverter): + handle_byte_fallback = True + + def __init__(self, vocab_file=None, *args): + self.vocab_file = vocab_file + + requires_backends(self, "protobuf") + + Converter.__init__(self, vocab_file) + + model_pb2 = import_protobuf() + m = model_pb2.ModelProto() + with open(vocab_file, "rb") as f: + m.ParseFromString(f.read()) + self.proto = m + + def tokenizer(self, proto): + vocab_scores = self.vocab(proto) + + _, merges = self.SpmExtractor(self.vocab_file).extract(vocab_scores) + bpe_vocab = {word: i for i, (word, score) in enumerate(vocab_scores)} + tokenizer = Tokenizer( + BPE( + bpe_vocab, + merges, + unk_token=proto.trainer_spec.unk_piece, + fuse_unk=True, + byte_fallback=self.handle_byte_fallback, + dropout=None, + ) + ) + + # Add user defined symbols and control tokens from sentencepiece model + spm_added_tokens = [ + (id, p.piece, p.type == 3 or p.piece in self.special_tokens) + for id, p in enumerate(proto.pieces) + if p.type in [3, 4] + ] + tokenizer.add_tokens( + [ + AddedToken(token, normalized=False, special=special) + for id, token, special in sorted(spm_added_tokens, key=lambda x: x[0]) + ] + ) + + return tokenizer + + # Copied 
from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode def bytes_to_unicode(): """ @@ -1576,10 +1624,8 @@ def __init__( pattern=r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""", add_prefix_space=False, additional_special_tokens=None, - *args, **kwargs, ): - super().__init__(*args) self.vocab_file = vocab_file self.pattern = pattern self.add_prefix_space = add_prefix_space diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index 10ee10e01950..3fa9cb72de9f 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -18,26 +18,25 @@ from collections.abc import Mapping from dataclasses import dataclass from random import randint -from typing import Any, Callable, NewType, Optional, Union +from typing import Any, Callable, Optional, Union import numpy as np -from ..models.bert import BertTokenizer, BertTokenizerFast from ..tokenization_utils_base import PreTrainedTokenizerBase from ..utils import PaddingStrategy -InputDataClass = NewType("InputDataClass", Any) +InputDataClass = Any """ A DataCollator is a function that takes a list of samples from a Dataset and collate them into a batch, as a dictionary of PyTorch/TensorFlow tensors or NumPy arrays. """ -DataCollator = NewType("DataCollator", Callable[[list[InputDataClass]], dict[str, Any]]) +DataCollator = Callable[[list[InputDataClass]], dict[str, Any]] class DataCollatorMixin: - def __call__(self, features, return_tensors=None): + def __call__(self, features, return_tensors: Optional[str] = None): if return_tensors is None: return_tensors = self.return_tensors if return_tensors == "tf": @@ -773,6 +772,8 @@ class DataCollatorForLanguageModeling(DataCollatorMixin): Whether or not to use masked language modeling. If set to `False`, the labels are the same as the inputs with the padding tokens ignored (by setting them to -100). Otherwise, the labels are -100 for non-masked tokens and the value to predict for the masked token. + whole_word_mask (`bool`, *optional*, defaults to `False`): + Whether or not to mask whole words instead of individual tokens. mlm_probability (`float`, *optional*, defaults to 0.15): The probability with which to (randomly) mask tokens in the input, when `mlm` is set to `True`. mask_replace_prob (`float`, *optional*, defaults to 0.8): @@ -824,6 +825,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin): tokenizer: PreTrainedTokenizerBase mlm: bool = True + whole_word_mask: bool = False mlm_probability: Optional[float] = 0.15 mask_replace_prob: float = 0.8 random_replace_prob: float = 0.1 @@ -842,6 +844,11 @@ def __post_init__(self): if self.mlm_probability is None or self.mlm_probability < 0 or self.mlm_probability > 1: raise ValueError("mlm_probability should be between 0 and 1.") self.mlm_probability = float(self.mlm_probability) + elif self.whole_word_mask: + raise ValueError( + "Whole word masking can only be used with mlm=True." + "If you want to use whole word masking, please set mlm=True." 
+ ) if self.mask_replace_prob + self.random_replace_prob > 1: raise ValueError("The sum of mask_replace_prob and random_replace_prob should not exceed 1") if self.mask_replace_prob < 0 or self.mask_replace_prob > 1: @@ -856,6 +863,20 @@ def __post_init__(self): import tensorflow as tf self.tf_mask_tokens = tf.function(self.tf_mask_tokens, jit_compile=True) + if self.whole_word_mask: + if not self.tokenizer.is_fast: + warnings.warn( + "Whole word masking depends on offset mapping which is only natively available with fast tokenizers.", + UserWarning, + ) + + if self.mask_replace_prob < 1: + warnings.warn( + "Random token replacement is not supported with whole word masking.", + "Setting mask_replace_prob to 1.", + ) + self.mask_replace_prob = 1 + self.random_replace_prob = 0 self.generator = None @@ -869,8 +890,6 @@ def get_generator(self, seed): return tf.random.Generator.from_seed(seed) else: - import numpy as np - return np.random.default_rng(seed) def create_rng(self): @@ -1021,9 +1040,10 @@ def torch_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> d # If special token mask has been preprocessed, pop it from the dict. special_tokens_mask = batch.pop("special_tokens_mask", None) + offset_mapping = batch.pop("offset_mapping", None) if self.mlm: batch["input_ids"], batch["labels"] = self.torch_mask_tokens( - batch["input_ids"], special_tokens_mask=special_tokens_mask + batch["input_ids"], special_tokens_mask=special_tokens_mask, offset_mapping=offset_mapping ) else: labels = batch["input_ids"].clone() @@ -1032,9 +1052,11 @@ def torch_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> d batch["labels"] = labels return batch - def torch_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> tuple[Any, Any]: + def torch_mask_tokens( + self, inputs: Any, special_tokens_mask: Optional[Any] = None, offset_mapping: Optional[Any] = None + ) -> tuple[Any, Any]: """ - Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. + Prepare masked tokens inputs/labels for masked language modeling. """ import torch @@ -1045,12 +1067,24 @@ def torch_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = No special_tokens_mask = [ self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() ] - special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool) + + if self.whole_word_mask: + word_ids, no_mask_mask = self._calc_word_ids_and_prob_mask( + to_numpy(offset_mapping), to_numpy(special_tokens_mask) + ) + no_mask_mask = torch.tensor(no_mask_mask, dtype=torch.bool) else: - special_tokens_mask = special_tokens_mask.bool() + no_mask_mask = ( + special_tokens_mask.bool() + if isinstance(special_tokens_mask, torch.Tensor) + else torch.tensor(special_tokens_mask, dtype=torch.bool) + ) - probability_matrix.masked_fill_(special_tokens_mask, value=0.0) + probability_matrix.masked_fill_(no_mask_mask, value=0.0) masked_indices = torch.bernoulli(probability_matrix, generator=self.generator).bool() + if self.whole_word_mask: + masked_indices = torch.BoolTensor(self._whole_word_mask(word_ids, masked_indices)) + labels[~masked_indices] = -100 # We only compute loss on masked tokens # mask_replace_prob% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) @@ -1100,9 +1134,10 @@ def numpy_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> d # If special token mask has been preprocessed, pop it from the dict. 
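To make the whole-word masking path above concrete, here is a standalone NumPy sketch of the idea: tokens are grouped into words using the offset mapping (a token starts a new word when there is a gap from the previous token's end, or the previous token was special), masking probability is assigned only to word-initial tokens, and a sampled mask is then propagated to every token of the selected words. This is a simplified illustration, not the collator's exact code.

import numpy as np

# One sequence: "[CLS] un believ able ly good [SEP]" with character offsets.
offsets = np.array([(0, 0), (0, 2), (2, 8), (8, 12), (12, 14), (15, 19), (0, 0)])
special = np.array([1, 0, 0, 0, 0, 0, 1], dtype=bool)

starts, ends = offsets[:, 0], offsets[:, 1]
prev_ends = np.roll(ends, 1)
prev_ends[0] = -1
prev_special = np.roll(special, 1)
prev_special[0] = False

# A token starts a new word if it is not special and either does not continue the
# previous token's span or follows a special token.
is_new_word = ~special & ((starts != prev_ends) | prev_special)
word_ids = np.cumsum(is_new_word)
word_ids[special] = -1
print(word_ids)  # [-1  1  1  1  1  2 -1]: one word "unbelievably", one word "good"

# Sample a mask on word-initial tokens only, then expand it to whole words.
rng = np.random.default_rng(0)
initial_mask = is_new_word & (rng.random(len(word_ids)) < 0.5)
masked_words = set(word_ids[initial_mask])
whole_word_mask = np.isin(word_ids, list(masked_words)) & (word_ids != -1)
print(whole_word_mask)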
special_tokens_mask = batch.pop("special_tokens_mask", None) + offset_mapping = batch.pop("offset_mapping", None) if self.mlm: batch["input_ids"], batch["labels"] = self.numpy_mask_tokens( - batch["input_ids"], special_tokens_mask=special_tokens_mask + batch["input_ids"], special_tokens_mask=special_tokens_mask, offset_mapping=offset_mapping ) else: labels = np.copy(batch["input_ids"]) @@ -1111,9 +1146,14 @@ def numpy_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> d batch["labels"] = labels return batch - def numpy_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> tuple[Any, Any]: + def numpy_mask_tokens( + self, + inputs: Any, + special_tokens_mask: Optional[Any] = None, + offset_mapping: Optional[Any] = None, + ) -> tuple[Any, Any]: """ - Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. + Prepare masked tokens inputs/labels for masked language modeling. """ labels = np.copy(inputs) # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`) @@ -1122,16 +1162,28 @@ def numpy_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = No special_tokens_mask = [ self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() ] - special_tokens_mask = np.array(special_tokens_mask, dtype=bool) + + if self.whole_word_mask: + word_ids, no_mask_mask = self._calc_word_ids_and_prob_mask( + to_numpy(offset_mapping), to_numpy(special_tokens_mask) + ) else: - special_tokens_mask = special_tokens_mask.astype(bool) + no_mask_mask = ( + special_tokens_mask.astype(bool) + if isinstance(special_tokens_mask, np.ndarray) + else np.array(special_tokens_mask, dtype=bool) + ) - probability_matrix[special_tokens_mask] = 0 + probability_matrix[no_mask_mask] = 0 # Numpy doesn't have bernoulli, so we use a binomial with 1 trial if self.generator: masked_indices = self.generator.binomial(1, probability_matrix, size=probability_matrix.shape).astype(bool) else: masked_indices = np.random.binomial(1, probability_matrix, size=probability_matrix.shape).astype(bool) + + if self.whole_word_mask: + masked_indices = self._whole_word_mask(word_ids, masked_indices) + labels[~masked_indices] = -100 # We only compute loss on masked tokens # mask_replace_prob% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) @@ -1176,6 +1228,51 @@ def numpy_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = No # The rest of the time (10% of the time) we keep the masked input tokens unchanged return inputs, labels + @staticmethod + def _calc_word_ids_and_prob_mask( + offsets: np.ndarray[np.ndarray[tuple[int, int]]], special_tokens_mask: np.ndarray[np.ndarray[int]] + ) -> tuple[np.ndarray[np.ndarray[int]], np.ndarray[np.ndarray[int]]]: + """ + Map tokens to word ids and create mask of tokens to not mask. + Tokens that are part of the same word will have the same word id and we will only + set a mask probability for the first token of each word. 
+ """ + + token_starts = offsets[:, :, 0] + token_ends = offsets[:, :, 1] + + prev_token_ends = np.roll(token_ends, 1, axis=1) + prev_token_ends[:, 0] = -1 # First token has no previous token + + prev_token_special = np.roll(special_tokens_mask, 1, axis=1) + prev_token_special[:, 0] = 0 + + # Not special token AND (gap from previous or previous token was special) + special_tokens_mask = special_tokens_mask.astype(bool) + is_new_word = (~special_tokens_mask) & ((token_starts != prev_token_ends) | (prev_token_special == 1)) + + word_ids = np.cumsum(is_new_word, axis=1) + word_ids[special_tokens_mask] = -1 + + prob_mask = ~is_new_word + + return word_ids, prob_mask + + @staticmethod + def _whole_word_mask(word_ids: np.ndarray[np.ndarray[int]], mask: Any) -> Any: + """ + Mask whole words based on word ids and mask. + """ + mask = to_numpy(mask) + + valid_ids = word_ids != -1 + + # Create 3D mask where [batch, token_i, token_j] is True if token_i and token_j are the same word + same_word = (word_ids[:, :, None] == word_ids[:, None, :]) & valid_ids[:, :, None] & valid_ids[:, None, :] + + # For each token, set True if any token in the same word is masked + return np.any(same_word & mask[:, None, :], axis=2) + @dataclass class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling): @@ -1322,6 +1419,8 @@ def _whole_word_mask(self, input_tokens: list[str], max_predictions=512): """ Get 0/1 labels for masked tokens with whole word mask proxy """ + from transformers import BertTokenizer, BertTokenizerFast + if not isinstance(self.tokenizer, (BertTokenizer, BertTokenizerFast)): warnings.warn( "DataCollatorForWholeWordMask is only suitable for BertTokenizer-like tokenizers. " @@ -1539,8 +1638,18 @@ def numpy_mask_tokens(self, inputs: Any, mask_labels: Any) -> tuple[Any, Any]: # The rest of the time ((1-mask_replace_prob-random_replace_prob)% of the time) we keep the masked input tokens unchanged return inputs, labels + def __init__(self, *args, **kwargs): + warnings.warn( + "DataCollatorForWholeWordMask is deprecated and will be removed in a future version, you can now use " + "DataCollatorForLanguageModeling with whole_word_mask=True instead.", + FutureWarning, + ) + super().__init__(*args, **kwargs) + self.mlm = True # Force masked language modeling + self.whole_word_mask = True # Force whole word masking + -def tolist(x): +def tolist(x) -> list[Any]: if isinstance(x, list): return x elif hasattr(x, "numpy"): # Checks for TF tensors without needing the import @@ -1548,6 +1657,15 @@ def tolist(x): return x.tolist() +def to_numpy(x) -> np.ndarray[Any]: + if isinstance(x, np.ndarray): + return x + elif hasattr(x, "detach"): + return x.detach().cpu().numpy() + else: + return np.array(x) + + @dataclass class DataCollatorForSOP(DataCollatorForLanguageModeling): """ diff --git a/src/transformers/data/datasets/squad.py b/src/transformers/data/datasets/squad.py index fdee571e249b..d4f76a51f422 100644 --- a/src/transformers/data/datasets/squad.py +++ b/src/transformers/data/datasets/squad.py @@ -122,9 +122,9 @@ def __init__( tokenizer: PreTrainedTokenizer, limit_length: Optional[int] = None, mode: Union[str, Split] = Split.train, - is_language_sensitive: Optional[bool] = False, + is_language_sensitive: bool = False, cache_dir: Optional[str] = None, - dataset_format: Optional[str] = "pt", + dataset_format: str = "pt", ): self.args = args self.is_language_sensitive = is_language_sensitive diff --git a/src/transformers/data/metrics/squad_metrics.py b/src/transformers/data/metrics/squad_metrics.py index 
f83c23bdeecf..0ffc025b65a0 100644 --- a/src/transformers/data/metrics/squad_metrics.py +++ b/src/transformers/data/metrics/squad_metrics.py @@ -148,7 +148,7 @@ def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans): best_score = cur_score best_thresh = 0.0 qid_list = sorted(na_probs, key=lambda k: na_probs[k]) - for i, qid in enumerate(qid_list): + for qid in qid_list: if qid not in scores: continue if qid_to_has_ans[qid]: diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index ab6e747d14db..42bbcbaabfad 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -68,7 +68,7 @@ "rhoknp": "rhoknp>=1.1.0,<1.3.1", "rjieba": "rjieba", "rouge-score": "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1", - "ruff": "ruff==0.11.2", + "ruff": "ruff==0.13.1", "sacrebleu": "sacrebleu>=1.4.12,<2.0.0", "sacremoses": "sacremoses", "safetensors": "safetensors>=0.4.3", diff --git a/src/transformers/dynamic_module_utils.py b/src/transformers/dynamic_module_utils.py index 5b541c076f63..6d4e2bf48921 100644 --- a/src/transformers/dynamic_module_utils.py +++ b/src/transformers/dynamic_module_utils.py @@ -285,8 +285,7 @@ def get_class_in_module( `typing.Type`: The class looked for. """ name = os.path.normpath(module_path) - if name.endswith(".py"): - name = name[:-3] + name = name.removesuffix(".py") name = name.replace(os.path.sep, ".") module_file: Path = Path(HF_MODULES_CACHE) / module_path with _HF_REMOTE_CODE_LOCK: @@ -396,7 +395,7 @@ def get_cached_module_file( if is_local: submodule = _sanitize_module_name(os.path.basename(pretrained_model_name_or_path)) else: - submodule = _sanitize_module_name(pretrained_model_name_or_path.replace("/", os.path.sep)) + submodule = os.path.sep.join(map(_sanitize_module_name, pretrained_model_name_or_path.split("/"))) cached_module = try_to_load_from_cache( pretrained_model_name_or_path, module_file, cache_dir=cache_dir, revision=_commit_hash, repo_type=repo_type ) diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py index a9ff39b0cc19..e007e72d4761 100644 --- a/src/transformers/feature_extraction_utils.py +++ b/src/transformers/feature_extraction_utils.py @@ -48,13 +48,12 @@ if TYPE_CHECKING: - if is_torch_available(): - import torch # noqa + from .feature_extraction_sequence_utils import SequenceFeatureExtractor logger = logging.get_logger(__name__) -PreTrainedFeatureExtractor = Union["SequenceFeatureExtractor"] # noqa: F821 +PreTrainedFeatureExtractor = Union["SequenceFeatureExtractor"] # type hinting: specifying the type of feature extractor class that inherits from FeatureExtractionMixin SpecificFeatureExtractorType = TypeVar("SpecificFeatureExtractorType", bound="FeatureExtractionMixin") @@ -127,7 +126,7 @@ def _get_is_as_tensor_fns(self, tensor_type: Optional[Union[str, TensorType]] = elif tensor_type == TensorType.PYTORCH: if not is_torch_available(): raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") - import torch # noqa + import torch def as_tensor(value): if isinstance(value, (list, tuple)) and len(value) > 0: @@ -216,7 +215,7 @@ def to(self, *args, **kwargs) -> "BatchFeature": [`BatchFeature`]: The same instance after modification. 
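Relating to the dynamic-module path change above (sanitizing each repo-id segment separately rather than the joined path), here is a small sketch; `sanitize_module_name` below is a hypothetical stand-in, since the library's `_sanitize_module_name` is not shown in this diff.

import os
import re

def sanitize_module_name(name: str) -> str:
    # Hypothetical sanitizer: keep only characters valid in a Python module name
    # and avoid a leading digit.
    name = re.sub(r"[^0-9a-zA-Z_]", "_", name)
    return f"_{name}" if name and name[0].isdigit() else name

repo_id = "my-org/1.5B-model"
# Sanitizing per segment preserves the directory split; sanitizing the joined
# path would also rewrite the separator itself.
submodule = os.path.sep.join(map(sanitize_module_name, repo_id.split("/")))
print(submodule)  # my_org/_1_5B_model on POSIX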
""" requires_backends(self, ["torch"]) - import torch # noqa + import torch device = kwargs.get("device") non_blocking = kwargs.get("non_blocking", False) @@ -563,7 +562,9 @@ def get_feature_extractor_dict( return feature_extractor_dict, kwargs @classmethod - def from_dict(cls, feature_extractor_dict: dict[str, Any], **kwargs) -> PreTrainedFeatureExtractor: + def from_dict( + cls, feature_extractor_dict: dict[str, Any], **kwargs + ) -> Union["FeatureExtractionMixin", tuple["FeatureExtractionMixin", dict[str, Any]]]: """ Instantiates a type of [`~feature_extraction_utils.FeatureExtractionMixin`] from a Python dictionary of parameters. @@ -613,7 +614,7 @@ def to_dict(self) -> dict[str, Any]: return output @classmethod - def from_json_file(cls, json_file: Union[str, os.PathLike]) -> PreTrainedFeatureExtractor: + def from_json_file(cls, json_file: Union[str, os.PathLike]) -> "FeatureExtractionMixin": """ Instantiates a feature extractor of type [`~feature_extraction_utils.FeatureExtractionMixin`] from the path to a JSON file of parameters. diff --git a/src/transformers/generation/beam_search.py b/src/transformers/generation/beam_search.py index ba2820cb437a..8510a02c803a 100644 --- a/src/transformers/generation/beam_search.py +++ b/src/transformers/generation/beam_search.py @@ -165,10 +165,10 @@ def __init__( batch_size: int, num_beams: int, device: torch.device, - length_penalty: Optional[float] = 1.0, - do_early_stopping: Optional[Union[bool, str]] = False, - num_beam_hyps_to_keep: Optional[int] = 1, - num_beam_groups: Optional[int] = 1, + length_penalty: float = 1.0, + do_early_stopping: Union[bool, str] = False, + num_beam_hyps_to_keep: int = 1, + num_beam_groups: int = 1, max_length: Optional[int] = None, ): logger.warning_once( @@ -214,7 +214,7 @@ def __init__( @property def is_done(self) -> bool: - return self._done.all() + return self._done.all().item() def process( self, @@ -225,8 +225,8 @@ def process( pad_token_id: Optional[Union[int, torch.Tensor]] = None, eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None, beam_indices: Optional[torch.LongTensor] = None, - group_index: Optional[int] = 0, - decoder_prompt_len: Optional[int] = 0, + group_index: int = 0, + decoder_prompt_len: int = 0, ) -> dict[str, torch.Tensor]: # add up to the length which the next_scores is calculated on (including decoder prompt) cur_len = input_ids.shape[-1] + 1 @@ -331,7 +331,7 @@ def finalize( pad_token_id: Optional[Union[int, torch.Tensor]] = None, eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None, beam_indices: Optional[torch.LongTensor] = None, - decoder_prompt_len: Optional[int] = 0, + decoder_prompt_len: int = 0, ) -> tuple[torch.LongTensor]: batch_size = len(self._beam_hyps) // self.num_beam_groups @@ -460,9 +460,9 @@ def __init__( num_beams: int, constraints: list[Constraint], device: torch.device, - length_penalty: Optional[float] = 1.0, - do_early_stopping: Optional[Union[bool, str]] = False, - num_beam_hyps_to_keep: Optional[int] = 1, + length_penalty: float = 1.0, + do_early_stopping: Union[bool, str] = False, + num_beam_hyps_to_keep: int = 1, max_length: Optional[int] = None, ): logger.warning_once( @@ -495,7 +495,7 @@ def __init__( @property def is_done(self) -> bool: - return self._done.all() + return self._done.all().item() def make_constraint_states(self, n): return [ConstraintListState([constraint.copy() for constraint in self.constraints]) for _ in range(n)] @@ -515,7 +515,7 @@ def process( pad_token_id: Optional[Union[int, torch.Tensor]] = None, 
eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None, beam_indices: Optional[torch.LongTensor] = None, - decoder_prompt_len: Optional[int] = 0, + decoder_prompt_len: int = 0, ) -> tuple[torch.Tensor]: r""" Args: @@ -804,7 +804,7 @@ def finalize( pad_token_id: Optional[Union[int, torch.Tensor]] = None, eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None, beam_indices: Optional[torch.LongTensor] = None, - decoder_prompt_len: Optional[int] = 0, + decoder_prompt_len: int = 0, ) -> tuple[torch.LongTensor]: batch_size = len(self._beam_hyps) @@ -912,7 +912,9 @@ def finalize( class BeamHypotheses: - def __init__(self, num_beams: int, length_penalty: float, early_stopping: bool, max_length: Optional[int] = None): + def __init__( + self, num_beams: int, length_penalty: float, early_stopping: Union[bool, str], max_length: Optional[int] = None + ): """ Initialize n-best list of hypotheses. """ @@ -963,7 +965,7 @@ def add( else: self.worst_score = min(score, self.worst_score) - def is_done(self, best_sum_logprobs: float, cur_len: int, decoder_prompt_len: Optional[int] = 0) -> bool: + def is_done(self, best_sum_logprobs: float, cur_len: int, decoder_prompt_len: int = 0) -> bool: """ If there are enough hypotheses and that none of the hypotheses being generated can become better than the worst one in the heap, then we are done with this sentence. diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index a455e69d03ff..cd42288aebfa 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -524,7 +524,7 @@ def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, self.assistant_kwargs.pop("attention_mask", None) assistant_output = self.assistant_model.generate(**generation_args, **self.assistant_kwargs) - new_target_ids = self._process_assistant_outputs(input_ids, assistant_output.sequences, assistant_input_ids) + new_target_ids = self._process_assistant_outputs(input_ids, assistant_output.sequences) # Update state self.prev_target_ids_len = input_ids.shape[1] @@ -583,7 +583,7 @@ def _prepare_assistant_input_ids(self, input_ids: torch.LongTensor) -> tuple[tor return assistant_input_ids, remove_from_pkv def _process_assistant_outputs( - self, input_ids: torch.LongTensor, assistant_sequences: torch.LongTensor, assistant_input_ids: torch.LongTensor + self, input_ids: torch.LongTensor, assistant_sequences: torch.LongTensor ) -> torch.LongTensor: """Processes assistant outputs to obtain target input IDs.""" num_prev_assistant = self.prev_assistant_ids.shape[1] diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index 05caed152c6e..98a0d14ade1a 100644 --- a/src/transformers/generation/configuration_utils.py +++ b/src/transformers/generation/configuration_utils.py @@ -1282,11 +1282,11 @@ class WatermarkingConfig(BaseWatermarkingConfig): def __init__( self, - greenlist_ratio: Optional[float] = 0.25, - bias: Optional[float] = 2.0, - hashing_key: Optional[int] = 15485863, - seeding_scheme: Optional[str] = "lefthash", - context_width: Optional[int] = 1, + greenlist_ratio: float = 0.25, + bias: float = 2.0, + hashing_key: int = 15485863, + seeding_scheme: str = "lefthash", + context_width: int = 1, ): self.greenlist_ratio = greenlist_ratio self.bias = bias diff --git a/src/transformers/generation/continuous_batching/cache.py 
b/src/transformers/generation/continuous_batching/cache.py index 05de093f661f..8d6e057be84a 100644 --- a/src/transformers/generation/continuous_batching/cache.py +++ b/src/transformers/generation/continuous_batching/cache.py @@ -79,7 +79,7 @@ class PagedAttentionCache: layer group, and the shape of the cache tensor is `[num_blocks * block_size, num_heads, head_size]`. Grouping layers into groups is useful because when we allocate one block to a group N, the block allocated is the - same for all layers in group N, equivalently it is allocated accross all cache tensors. This allows us to + same for all layers in group N, equivalently it is allocated across all cache tensors. This allows us to efficiently allocate and free blocks, and to efficiently read and write key and value states. For instance, imagine we have 8 blocks of cache and a model with two layer groups: a full-attention group with 3 @@ -349,7 +349,7 @@ class PagedAttentionMemoryHandler: The memory footprint consists of three main components: - Cache memory: the space needed to store the cache tensors: 2 * layer_group_size * [num_pages, page_size] * cache_dtype - - Activation memory: the space temporarly taken by the largest activation during the model forward pass: + - Activation memory: the space temporarily taken by the largest activation during the model forward pass: peak_activation_per_token * max_tokens_per_batch * activation_dtype_size - Static tensors: the space taken by the input/output buffers and metadata tensors for batch processing, sum of: - inputs_ids + outputs_ids + position_ids + logits_indices: 4 * max_tokens_per_batch * int32_size diff --git a/src/transformers/generation/continuous_batching/continuous_api.py b/src/transformers/generation/continuous_batching/continuous_api.py index b00c0a4825c3..0d1801fa163e 100644 --- a/src/transformers/generation/continuous_batching/continuous_api.py +++ b/src/transformers/generation/continuous_batching/continuous_api.py @@ -42,7 +42,56 @@ def build_attention_mask( ) -> None: """Builds an attention mask inplace using the cumulative seqlens of the query and key. If given a sliding window, it will also apply a sliding window mask on top. The attention mask is not boolean, it uses zeroes and -inf (or its - equivalent) so it's more of an attention score bias tensor.""" + equivalent) so it's more of an attention score bias tensor. + The attention mask is a block-diagonal matrix, with each block an attention mask for a single query-key pair. + Each of those block is built from a causal mask and, if there is a sliding window, a sliding window mask. 
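In code, that composition can be sketched as follows (a standalone illustration using the same diagonal arithmetic as this hunk; `build_block_mask` is not a library function):

import torch

def build_block_mask(seqlen_q: int, seqlen_k: int, sliding_window: int, dtype=torch.float32):
    # Additive attention bias for one query/key block: 0 where attention is allowed,
    # the dtype's minimum where it is masked.
    min_value = torch.finfo(dtype).min
    minus_inf = torch.full((seqlen_q, seqlen_k), min_value, dtype=dtype)
    # Causal part: key j is visible to query i only if j <= i + (seqlen_k - seqlen_q).
    masked = torch.triu(minus_inf, diagonal=seqlen_k - seqlen_q + 1)
    if sliding_window > 1:
        # Sliding-window part: additionally mask keys further than `sliding_window`
        # behind the query, i.e. where j - i <= seqlen_k - seqlen_q - sliding_window.
        masked += torch.tril(minus_inf, diagonal=seqlen_k - seqlen_q - sliding_window)
    return masked

# Reproduces the first diagram below (seqlen_k=8, seqlen_q=4, sliding_window=6);
# 1 marks an allowed position.
print((build_block_mask(4, 8, 6) == 0).int())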
+ + An example is represented below, with seqlen_k = 8, seqlen_q = 4 and sliding_window = 6: + + CAUSAL MASK: + + █ █ █ █ █ ░ ░ ░ + █ █ █ █ █ █ ░ ░ + █ █ █ █ █ █ █ ░ + █ █ █ █ █ █ █ █ + + SLIDING WINDOW MASK: + ┌──────────────────────── seqlen_k - seqlen_q - sliding_window = 8 - 4 - 6 = -2 offset to the right + <─┴─> + ░ █ | █ █ █ █ █ █ █ █ + ░ ░ | █ █ █ █ █ █ █ █ + ░ ░ | ░ █ █ █ █ █ █ █ + ░ ░ | ░ ░ █ █ █ █ █ █ + + ATTENTION MASK (sum of causal and sliding window masks): + + █ █ █ █ █ ░ ░ ░ + █ █ █ █ █ █ ░ ░ + ░ █ █ █ █ █ █ ░ + ░ ░ █ █ █ █ █ █ + + Another example with seqlen_k = 5, seqlen_q = 3 and sliding_window = 2: + + CAUSAL MASK: + + █ █ █ ░ ░ + █ █ █ █ ░ + █ █ █ █ █ + + SLIDING WINDOW MASK: + ┌──────────────────────── seqlen_k - seqlen_q - sliding_window = 5 - 3 - 2 = 0 offset to the right + <┴> + | ░ █ █ █ █ + | ░ ░ █ █ █ + | ░ ░ ░ █ █ + + ATTENTION MASK (sum of causal and sliding window masks): + + ░ █ █ ░ ░ + ░ ░ █ █ ░ + ░ ░ ░ █ █ + + """ min_value = torch.finfo(attention_mask.dtype).min for i in range(len(cumulative_seqlens_q) - 1): seqlen_q = cumulative_seqlens_q[i + 1] - cumulative_seqlens_q[i] @@ -63,8 +112,8 @@ def build_attention_mask( masked = torch.triu(minus_inf, diagonal=causal_diagonal) # Apply sliding window mask if needed if sliding_window > 1: - sliding_diagonal = seqlen_k - seqlen_q + sliding_window - masked = torch.tril(masked, diagonal=sliding_diagonal) + sliding_diagonal = seqlen_k - seqlen_q - sliding_window + masked += torch.tril(minus_inf, diagonal=sliding_diagonal) # Replace in attention mask attention_mask[..., query_range, key_range] = masked diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index f63d2246c6a9..7d81501a783d 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -369,7 +369,6 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to if scores.dim() == 3: if self.logits_indices is not None and self.cu_seq_lens_q is not None: - batch_size, seq_len, vocab_size = scores.shape last_positions = self.logits_indices last_scores = scores[0, last_positions, :] @@ -2289,7 +2288,7 @@ def __init__( model, unconditional_ids: Optional[torch.LongTensor] = None, unconditional_attention_mask: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = True, + use_cache: bool = True, ): self.guidance_scale = guidance_scale self.model = model diff --git a/src/transformers/generation/stopping_criteria.py b/src/transformers/generation/stopping_criteria.py index 2b9e57aacd8d..5a013a49723d 100644 --- a/src/transformers/generation/stopping_criteria.py +++ b/src/transformers/generation/stopping_criteria.py @@ -76,9 +76,9 @@ def __init__(self, max_length: int, max_position_embeddings: Optional[int] = Non def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.BoolTensor: cur_len = input_ids.shape[1] is_done = cur_len >= self.max_length - if self.max_position_embeddings is not None and not is_done and cur_len >= self.max_position_embeddings: + if self.max_position_embeddings is not None and not is_done and cur_len > self.max_position_embeddings: logger.warning_once( - "This is a friendly reminder - the current text generation call will exceed the model's predefined " + "This is a friendly reminder - the current text generation call has exceeded the model's predefined " f"maximum length ({self.max_position_embeddings}). 
Depending on the model, you may observe " "exceptions, performance degradation, or nothing at all." ) @@ -249,7 +249,7 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase, stop_strings: Union[str, token_list, token_indices, tokenizer ) - self.maximum_token_len = max([len(stop_string) for stop_string in self.stop_strings]) + self.maximum_token_len = max(len(stop_string) for stop_string in self.stop_strings) self.num_stop_strings = len(self.stop_strings) self.target_lens = torch.tensor([len(stop_string) for stop_string in stop_strings], dtype=torch.int32) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 2e312bcb3c79..f9d58dfdf4f6 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -22,7 +22,6 @@ import torch import torch.distributed as dist -from huggingface_hub import file_exists from packaging import version from torch import nn @@ -414,23 +413,20 @@ def load_custom_generate( Returns: A callable that can be used to generate text. """ - # Does `pretrained_model_name_or_path` have a `custom_generate` subdirectory? If not -> OSError - is_local_code = os.path.exists(pretrained_model_name_or_path) - has_custom_generate_folder = True - if is_local_code: - if not os.path.exists(os.path.join(pretrained_model_name_or_path, "custom_generate/generate.py")): - has_custom_generate_folder = False - else: - if not file_exists(pretrained_model_name_or_path, "custom_generate/generate.py"): - has_custom_generate_folder = False - - if not has_custom_generate_folder: + # Fetches the generate.py file from the model repo. If it doesn't exist, a file in `.no_exist` cache directory + # is created (preventing future hub requests), and an OSError is raised. + try: + module = get_cached_module_file( + pretrained_model_name_or_path, module_file="custom_generate/generate.py", **kwargs + ) + except OSError: raise OSError( f"`{pretrained_model_name_or_path}` does not contain a `custom_generate` subdirectory with a " "`generate.py` file, can't load the custom generate function." ) # Handle opt-in `trust_remote_code` and related exceptions + is_local_code = os.path.exists(pretrained_model_name_or_path) error_message = ( f"The repository `{pretrained_model_name_or_path}` contains custom generation code that will override " "the default `generate` method." 
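The same fetch-then-catch pattern in isolation, using `hf_hub_download` as a stand-in for `get_cached_module_file` (the repo id and filename below are placeholders, and the call downloads from the Hub): a single call either returns the cached file or raises, and the exception is converted into a clearer OSError.

from huggingface_hub import hf_hub_download
from huggingface_hub.utils import EntryNotFoundError

def load_optional_repo_file(repo_id: str, filename: str) -> str:
    # One call instead of "check existence, then download": a missing file
    # surfaces as an exception rather than requiring a separate hub request.
    try:
        return hf_hub_download(repo_id=repo_id, filename=filename)
    except EntryNotFoundError as exc:
        raise OSError(f"`{repo_id}` does not contain `{filename}`.") from exc

path = load_optional_repo_file("openai-community/gpt2", "config.json")
print(path)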
@@ -447,9 +443,6 @@ def load_custom_generate( check_python_requirements( pretrained_model_name_or_path, requirements_file="custom_generate/requirements.txt", **kwargs ) - module = get_cached_module_file( - pretrained_model_name_or_path, module_file="custom_generate/generate.py", **kwargs - ) custom_generate_function = get_class_in_module("generate", module) return custom_generate_function @@ -912,7 +905,7 @@ def _prepare_decoder_input_ids_for_generation( self.config.model_type == "vision-encoder-decoder" and "donut" in self.config.encoder.model_type.lower() ): pass - elif self.config.model_type in ["whisper"]: + elif self.config.model_type == "whisper": pass # user input but doesn't start with decoder_start_token_id -> prepend decoder_start_token_id (and adjust # decoder_attention_mask if provided) @@ -1018,7 +1011,7 @@ def _get_candidate_generator( input_ids: torch.LongTensor, inputs_tensor: torch.Tensor, logits_processor: LogitsProcessorList, - model_kwargs: dict, + model_kwargs: dict[str, Any], assistant_model: Optional["PreTrainedModel"] = None, target_tokenizer: Optional["PreTrainedTokenizerBase"] = None, assistant_tokenizer: Optional["PreTrainedTokenizerBase"] = None, @@ -1709,7 +1702,10 @@ def _prepare_generated_length( return generation_config def _prepare_generation_config( - self, generation_config: Optional[GenerationConfig], use_model_defaults: Optional[bool] = None, **kwargs: dict + self, + generation_config: Optional[GenerationConfig], + use_model_defaults: Optional[bool] = None, + **kwargs: Any, ) -> tuple[GenerationConfig, dict]: """ Prepares the base generation config, then applies any generation configuration options from kwargs. This @@ -1903,6 +1899,7 @@ def _supports_default_dynamic_cache(cls) -> bool: "minimax", "xlnet", "lfm2", + "lfm2-vl", ] ) @@ -2136,7 +2133,7 @@ def _tensor_or_none(token, device=None): generation_config._pad_token_tensor = pad_token_tensor generation_config._decoder_start_token_tensor = decoder_start_token_tensor - def _valid_auto_compile_criteria(self, model_kwargs: dict, generation_config: GenerationConfig) -> bool: + def _valid_auto_compile_criteria(self, model_kwargs: dict[str, Any], generation_config: GenerationConfig) -> bool: """ Determines whether to trigger auto-compilation of the model's forward pass at generation time. 
""" @@ -3453,7 +3450,7 @@ def _assisted_decoding( generation_config: GenerationConfig, synced_gpus: bool = False, streamer: Optional["BaseStreamer"] = None, - inputs_tensor: torch.FloatTensor = None, + inputs_tensor: Optional[torch.FloatTensor] = None, assistant_model: Optional["PreTrainedModel"] = None, assistant_tokenizer: Optional["PreTrainedTokenizerBase"] = None, tokenizer: Optional["PreTrainedTokenizerBase"] = None, diff --git a/src/transformers/generation/watermarking.py b/src/transformers/generation/watermarking.py index e62742ef7514..df8a6ef7d483 100644 --- a/src/transformers/generation/watermarking.py +++ b/src/transformers/generation/watermarking.py @@ -24,14 +24,9 @@ from torch.nn import BCELoss from ..modeling_utils import PreTrainedModel -from ..utils import ModelOutput, is_torch_available, logging +from ..utils import ModelOutput, logging from .configuration_utils import PretrainedConfig, WatermarkingConfig - - -if is_torch_available(): - import torch - - from .logits_process import SynthIDTextWatermarkLogitsProcessor, WatermarkLogitsProcessor +from .logits_process import SynthIDTextWatermarkLogitsProcessor, WatermarkLogitsProcessor logger = logging.get_logger(__name__) @@ -43,31 +38,31 @@ class WatermarkDetectorOutput: Outputs of a watermark detector. Args: - num_tokens_scored (np.array of shape (batch_size)): + num_tokens_scored (np.ndarray of shape (batch_size)): Array containing the number of tokens scored for each element in the batch. - num_green_tokens (np.array of shape (batch_size)): + num_green_tokens (np.ndarray of shape (batch_size)): Array containing the number of green tokens for each element in the batch. - green_fraction (np.array of shape (batch_size)): + green_fraction (np.ndarray of shape (batch_size)): Array containing the fraction of green tokens for each element in the batch. - z_score (np.array of shape (batch_size)): + z_score (np.ndarray of shape (batch_size)): Array containing the z-score for each element in the batch. Z-score here shows how many standard deviations away is the green token count in the input text from the expected green token count for machine-generated text. - p_value (np.array of shape (batch_size)): + p_value (np.ndarray of shape (batch_size)): Array containing the p-value for each batch obtained from z-scores. - prediction (np.array of shape (batch_size)), *optional*: + prediction (np.ndarray of shape (batch_size)), *optional*: Array containing boolean predictions whether a text is machine-generated for each element in the batch. - confidence (np.array of shape (batch_size)), *optional*: + confidence (np.ndarray of shape (batch_size)), *optional*: Array containing confidence scores of a text being machine-generated for each element in the batch. 
""" - num_tokens_scored: Optional[np.array] = None - num_green_tokens: Optional[np.array] = None - green_fraction: Optional[np.array] = None - z_score: Optional[np.array] = None - p_value: Optional[np.array] = None - prediction: Optional[np.array] = None - confidence: Optional[np.array] = None + num_tokens_scored: Optional[np.ndarray] = None + num_green_tokens: Optional[np.ndarray] = None + green_fraction: Optional[np.ndarray] = None + z_score: Optional[np.ndarray] = None + p_value: Optional[np.ndarray] = None + prediction: Optional[np.ndarray] = None + confidence: Optional[np.ndarray] = None class WatermarkDetector: @@ -179,7 +174,7 @@ def _score_ngrams_in_passage(self, input_ids: torch.LongTensor): ) return num_tokens_scored_batch, green_token_count_batch - def _compute_z_score(self, green_token_count: np.ndarray, total_num_tokens: np.ndarray) -> np.array: + def _compute_z_score(self, green_token_count: np.ndarray, total_num_tokens: np.ndarray) -> np.ndarray: expected_count = self.greenlist_ratio numer = green_token_count - expected_count * total_num_tokens denom = np.sqrt(total_num_tokens * expected_count * (1 - expected_count)) @@ -195,7 +190,7 @@ def __call__( input_ids: torch.LongTensor, z_threshold: float = 3.0, return_dict: bool = False, - ) -> Union[WatermarkDetectorOutput, np.array]: + ) -> Union[WatermarkDetectorOutput, np.ndarray]: """ Args: input_ids (`torch.LongTensor`): @@ -207,8 +202,8 @@ def __call__( Whether to return `~generation.WatermarkDetectorOutput` or not. If not it will return boolean predictions, ma Return: - [`~generation.WatermarkDetectorOutput`] or `np.array`: A [`~generation.WatermarkDetectorOutput`] - if `return_dict=True` otherwise a `np.array`. + [`~generation.WatermarkDetectorOutput`] or `np.ndarray`: A [`~generation.WatermarkDetectorOutput`] + if `return_dict=True` otherwise a `np.ndarray`. """ diff --git a/src/transformers/hf_argparser.py b/src/transformers/hf_argparser.py index be7f05344faf..503130ea651a 100644 --- a/src/transformers/hf_argparser.py +++ b/src/transformers/hf_argparser.py @@ -262,19 +262,6 @@ def _add_dataclass_arguments(self, dtype: DataClassType): "removing line of `from __future__ import annotations` which opts in Postponed " "Evaluation of Annotations (PEP 563)" ) - except TypeError as ex: - # Remove this block when we drop Python 3.9 support - if sys.version_info[:2] < (3, 10) and "unsupported operand type(s) for |" in str(ex): - python_version = ".".join(map(str, sys.version_info[:3])) - raise RuntimeError( - f"Type resolution failed for {dtype} on Python {python_version}. Try removing " - "line of `from __future__ import annotations` which opts in union types as " - "`X | Y` (PEP 604) via Postponed Evaluation of Annotations (PEP 563). To " - "support Python versions that lower than 3.10, you need to use " - "`typing.Union[X, Y]` instead of `X | Y` and `typing.Optional[X]` instead of " - "`X | None`." 
- ) from ex - raise for field in dataclasses.fields(dtype): if not field.init: diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py index 983fd4e16953..4dfa7f08b0db 100644 --- a/src/transformers/image_processing_utils_fast.py +++ b/src/transformers/image_processing_utils_fast.py @@ -46,7 +46,6 @@ auto_docstring, is_torch_available, is_torchvision_available, - is_torchvision_v2_available, is_vision_available, logging, ) @@ -60,14 +59,13 @@ import torch if is_torchvision_available(): + from torchvision.transforms.v2 import functional as F + from .image_utils import pil_torch_interpolation_mapping + else: pil_torch_interpolation_mapping = None -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -elif is_torchvision_available(): - from torchvision.transforms import functional as F logger = logging.get_logger(__name__) @@ -85,7 +83,7 @@ def validate_fast_preprocess_arguments( size: Optional[SizeDict] = None, interpolation: Optional["F.InterpolationMode"] = None, return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + data_format: ChannelDimension = ChannelDimension.FIRST, ): """ Checks validity of typically used arguments in an `ImageProcessorFast` `preprocess` method. @@ -131,7 +129,7 @@ def max_across_indices(values: Iterable[Any]) -> list[Any]: return [max(values_i) for values_i in zip(*values)] -def get_max_height_width(images: list["torch.Tensor"]) -> tuple[int]: +def get_max_height_width(images: list["torch.Tensor"]) -> tuple[int, ...]: """ Get the maximum height and width across all images in a batch. """ @@ -142,8 +140,8 @@ def get_max_height_width(images: list["torch.Tensor"]) -> tuple[int]: def divide_to_patches( - image: Union[np.array, "torch.Tensor"], patch_size: int -) -> list[Union[np.array, "torch.Tensor"]]: + image: Union[np.ndarray, "torch.Tensor"], patch_size: int +) -> list[Union[np.ndarray, "torch.Tensor"]]: """ Divides an image into patches of a specified size. @@ -248,7 +246,7 @@ def pad( pad_size: SizeDict = None, fill_value: Optional[int] = 0, padding_mode: Optional[str] = "constant", - return_mask: Optional[bool] = False, + return_mask: bool = False, disable_grouping: Optional[bool] = False, **kwargs, ) -> "torch.Tensor": @@ -375,9 +373,13 @@ def compile_friendly_resize( A wrapper around `F.resize` so that it is compatible with torch.compile when the image is a uint8 tensor. 
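A standalone sketch of the uint8 handling in this hunk (the division by 256 and the torch.where clamping shown just below), assuming torchvision's v2 functional API; `compile_friendly_resize_sketch` is illustrative, not the library function.

import torch
from torchvision.transforms.v2 import functional as F

def compile_friendly_resize_sketch(image: torch.Tensor, new_size: tuple[int, int]) -> torch.Tensor:
    if image.dtype == torch.uint8:
        # 256 rather than 255 to sidestep tiny numerical differences, per the hunk below.
        image = image.float() / 256
        image = F.resize(image, list(new_size), antialias=True)
        image = image * 256
        # torch.where instead of torch.clamp, which can misbehave under torch.compile.
        image = torch.where(image > 255, 255, image)
        image = torch.where(image < 0, 0, image)
        return image.round().to(torch.uint8)
    return F.resize(image, list(new_size), antialias=True)

print(compile_friendly_resize_sketch(torch.randint(0, 256, (3, 64, 64), dtype=torch.uint8), (32, 32)).shape)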
""" if image.dtype == torch.uint8: - image = image.float() / 255 + # 256 is used on purpose instead of 255 to avoid numerical differences + # see https://github.com/huggingface/transformers/pull/38540#discussion_r2127165652 + image = image.float() / 256 image = F.resize(image, new_size, interpolation=interpolation, antialias=antialias) - image = image * 255 + image = image * 256 + # torch.where is used on purpose instead of torch.clamp to avoid bug in torch.compile + # see https://github.com/huggingface/transformers/pull/38540#discussion_r2126888471 image = torch.where(image > 255, 255, image) image = torch.where(image < 0, 0, image) image = image.round().to(torch.uint8) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index f0aeae8985b7..c0158b7111b7 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -255,7 +255,7 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, in # Logic adapted from torchvision resizing logic: https://github.com/pytorch/vision/blob/511924c1ced4ce0461197e5caa64ce5b9e558aab/torchvision/transforms/functional.py#L366 def get_resize_output_image_size( input_image: np.ndarray, - size: Union[int, tuple[int, int], list[int], tuple[int]], + size: Union[int, tuple[int, int], list[int], tuple[int, ...]], default_to_square: bool = True, max_size: Optional[int] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, @@ -323,7 +323,7 @@ def get_resize_output_image_size( def resize( image: np.ndarray, size: tuple[int, int], - resample: "PILImageResampling" = None, + resample: Optional["PILImageResampling"] = None, reducing_gap: Optional[int] = None, data_format: Optional[ChannelDimension] = None, return_numpy: bool = True, diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 2079c21f3b0c..c5f4d4a3fa4c 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -30,7 +30,6 @@ is_torch_available, is_torch_tensor, is_torchvision_available, - is_torchvision_v2_available, is_vision_available, logging, requires_backends, @@ -56,9 +55,7 @@ from torchvision.transforms import InterpolationMode pil_torch_interpolation_mapping = { - PILImageResampling.NEAREST: InterpolationMode.NEAREST_EXACT - if is_torchvision_v2_available() - else InterpolationMode.NEAREST, + PILImageResampling.NEAREST: InterpolationMode.NEAREST_EXACT, PILImageResampling.BOX: InterpolationMode.BOX, PILImageResampling.BILINEAR: InterpolationMode.BILINEAR, PILImageResampling.HAMMING: InterpolationMode.HAMMING, @@ -78,7 +75,7 @@ ImageInput = Union[ "PIL.Image.Image", np.ndarray, "torch.Tensor", list["PIL.Image.Image"], list[np.ndarray], list["torch.Tensor"] -] # noqa +] class ChannelDimension(ExplicitEnum): @@ -486,9 +483,7 @@ def load_image(image: Union[str, "PIL.Image.Image"], timeout: Optional[float] = raise ValueError( f"Incorrect image source. Must be a valid URL starting with `http://` or `https://`, a valid path to an image file, or a base64 encoded string. Got {image}. Failed with {e}" ) - elif isinstance(image, PIL.Image.Image): - image = image - else: + elif not isinstance(image, PIL.Image.Image): raise TypeError( "Incorrect format used for image. Should be an url linking to an image, a base64 string, a local path, or a PIL image." 
) @@ -579,7 +574,7 @@ class ImageFeatureExtractionMixin: def _ensure_format_supported(self, image): if not isinstance(image, (PIL.Image.Image, np.ndarray)) and not is_torch_tensor(image): raise ValueError( - f"Got type {type(image)} which is not supported, only `PIL.Image.Image`, `np.array` and " + f"Got type {type(image)} which is not supported, only `PIL.Image.Image`, `np.ndarray` and " "`torch.Tensor` are." ) diff --git a/src/transformers/integrations/deepspeed.py b/src/transformers/integrations/deepspeed.py index 47d7a7ffcb5f..c5f9ecc03b53 100644 --- a/src/transformers/integrations/deepspeed.py +++ b/src/transformers/integrations/deepspeed.py @@ -130,58 +130,11 @@ def fill_match(self, ds_key_long, hf_val, hf_key=None, must_match=True): fill_only = partialmethod(fill_match, must_match=False) - def override_training_args_from_deepspeed(self, args): - """ - Override TrainingArguments based on DeepSpeed config values to ensure compatibility. - - This method ensures that the DeepSpeed config takes precedence over TrainingArguments - defaults when there are conflicts, particularly for mixed precision settings. - - Args: - args: TrainingArguments object to potentially modify - """ - # Check precision settings in DeepSpeed config and override TrainingArguments accordingly - # Only override defaults, not explicit user settings - - # Check if user explicitly set precision options (we assume defaults are False) - user_set_fp16 = args.fp16 is True - user_set_bf16 = args.bf16 is True - - if self.is_true("fp16.enabled"): - # DeepSpeed config explicitly enables fp16 - if not user_set_fp16 and not user_set_bf16: - # User didn't explicitly set either, so apply DeepSpeed config - args.fp16 = True - args.bf16 = False - elif user_set_bf16 and not user_set_fp16: - # User explicitly chose bf16, but DeepSpeed config wants fp16 - # This is a potential conflict - let user choice win but log a warning - pass # Keep user's bf16=True, fp16=False - elif self.is_true("bf16.enabled"): - # DeepSpeed config explicitly enables bf16 - if not user_set_fp16 and not user_set_bf16: - # User didn't explicitly set either, so apply DeepSpeed config - args.bf16 = True - args.fp16 = False - elif user_set_fp16 and not user_set_bf16: - # User explicitly chose fp16, but DeepSpeed config wants bf16 - # This is a potential conflict - let user choice win but log a warning - pass # Keep user's fp16=True, bf16=False - elif self.is_false("fp16.enabled") and self.is_false("bf16.enabled"): - # Both are explicitly disabled in DeepSpeed config - if not user_set_fp16 and not user_set_bf16: - # User didn't explicitly set either, so apply DeepSpeed config (fp32) - args.fp16 = False - args.bf16 = False - def trainer_config_process(self, args, auto_find_batch_size=False): """ Adjust the config with `TrainingArguments` values. This stage is run during `TrainingArguments` object creation. 
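# Illustrative sketch of the effective batch size DeepSpeed derives from the TrainingArguments values handled just below; the numbers here are placeholders, not values from this PR.
world_size = 8                            # number of training processes
per_device_train_batch_size = 4
gradient_accumulation_steps = 2
train_batch_size = world_size * per_device_train_batch_size * gradient_accumulation_steps
assert train_batch_size == 64             # what ends up in the DeepSpeed `train_batch_size` entry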
""" - # First, override TrainingArguments based on DeepSpeed config to ensure compatibility - self.override_training_args_from_deepspeed(args) - # DeepSpeed does: # train_batch_size = world_size * train_micro_batch_size_per_gpu * gradient_accumulation_steps train_batch_size = args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps @@ -268,17 +221,20 @@ def trainer_config_finalize(self, args, model, num_training_steps): hidden_size_auto_keys = [x for x in hidden_size_based_keys if self.is_auto(x)] if len(hidden_size_auto_keys) > 0: - if hasattr(model.config, "hidden_size"): - hidden_size = model.config.hidden_size - elif hasattr(model.config, "hidden_sizes"): - # if there are many hidden sizes pick the largest one - hidden_size = max(model.config.hidden_sizes) - elif hasattr(model.config, "text_config") and hasattr(model.config.text_config, "hidden_size"): - hidden_size = model.config.text_config.hidden_size - elif hasattr(model.config, "text_config") and hasattr(model.config.text_config, "hidden_sizes"): - # if there are many hidden sizes pick the largest one - hidden_size = max(model.config.text_config.hidden_sizes) - else: + hidden_size = None + if hasattr(model, "config"): + if hasattr(model.config, "hidden_size"): + hidden_size = model.config.hidden_size + elif hasattr(model.config, "hidden_sizes"): + # if there are many hidden sizes pick the largest one + hidden_size = max(model.config.hidden_sizes) + elif hasattr(model.config, "text_config") and hasattr(model.config.text_config, "hidden_size"): + hidden_size = model.config.text_config.hidden_size + elif hasattr(model.config, "text_config") and hasattr(model.config.text_config, "hidden_sizes"): + # if there are many hidden sizes pick the largest one + hidden_size = max(model.config.text_config.hidden_sizes) + + if hidden_size is None: raise ValueError( "The model's config file has neither `hidden_size` nor `hidden_sizes` entry, " "therefore it's not possible to automatically fill out the following `auto` entries " @@ -416,7 +372,7 @@ def deepspeed_optim_sched(trainer, hf_deepspeed_config, args, num_training_steps optimizer = None if "optimizer" in config: - if args.adafactor: + if args.optim == "adafactor": raise ValueError( "--adafactor was passed, but also found `optimizer` configured in the DeepSpeed config. " "Only one optimizer can be configured." diff --git a/src/transformers/integrations/flash_paged.py b/src/transformers/integrations/flash_paged.py index 329fab4c9323..1d1db72a7605 100644 --- a/src/transformers/integrations/flash_paged.py +++ b/src/transformers/integrations/flash_paged.py @@ -6,11 +6,21 @@ from ..utils import is_flash_attn_2_available +# For some reason, if we dont assign the function to a variable here, it will be garbage collected try: if is_flash_attn_2_available(): from flash_attn import flash_attn_varlen_func # noqa: F401 -except Exception: - pass + + FLASH_ATTN_VARLEN_FUNC = flash_attn_varlen_func + else: + raise RuntimeError( + "Flash Attention 2 is not installed. 
Please refer to https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install it" + ) +except Exception as e: + msg = repr(e) + + def FLASH_ATTN_VARLEN_FUNC(*args, **kwargs): + raise Exception(f"flash_attn_varlen_func is not available: {msg}") def paged_attention_forward( @@ -58,14 +68,13 @@ def paged_attention_forward( # Retrieve the cumulative sequence lengths for the current layer if isinstance(cu_seq_lens_k, dict): - cu_seq_lens_k = cu_seq_lens_k[layer_type].clone() + cu_seq_lens_k = cu_seq_lens_k[layer_type] max_seqlen_k = max_seqlen_k[layer_type] - else: - cu_seq_lens_k = cu_seq_lens_k.clone() - max_seqlen_k = max_seqlen_k if implementation is not None and hasattr(implementation, "flash_attn_varlen_func"): flash_attn_varlen_func = implementation.flash_attn_varlen_func + else: + flash_attn_varlen_func = FLASH_ATTN_VARLEN_FUNC custom_kwargs = {"s_aux": kwargs.get("s_aux")} if "s_aux" in kwargs else {} diff --git a/src/transformers/integrations/flex_attention.py b/src/transformers/integrations/flex_attention.py index 85ddc433e67a..2701936685dd 100644 --- a/src/transformers/integrations/flex_attention.py +++ b/src/transformers/integrations/flex_attention.py @@ -36,7 +36,7 @@ if is_torch_flex_attn_available(): - from torch.nn.attention.flex_attention import _DEFAULT_SPARSE_BLOCK_SIZE as flex_default_block_size # noqa: N811 + from torch.nn.attention.flex_attention import _DEFAULT_SPARSE_BLOCK_SIZE as flex_default_block_size from torch.nn.attention.flex_attention import BlockMask, create_block_mask, flex_attention @@ -272,12 +272,9 @@ def score_mod(score, batch_idx, head_idx, q_idx, kv_idx): score = score + score_mask[batch_idx][0][q_idx][kv_idx] if head_mask is not None: score = score + head_mask[batch_idx][head_idx][0][0] - if s_aux is not None: - logits_max = torch.max(score, dim=-1, keepdim=True).values - sinks = torch.exp(s_aux - logits_max) - unnormalized_scores = torch.exp(score - logits_max) - normalizer = unnormalized_scores.sum(dim=-1, keepdim=True) + sinks - score = unnormalized_scores / normalizer + # Note: attention sinks cannot be correctly implemented in score_mod + # because it requires operating on the full attention matrix before softmax. + # ==> this is done after flex attention return score enable_gqa = True @@ -293,6 +290,11 @@ def score_mod(score, batch_idx, head_idx, q_idx, kv_idx): # On CPU we must skip returning LSE due to a runtime issue; elsewhere, follow PyTorch API and return it return_lse = query.device.type != "cpu" + if not return_lse and s_aux is not None: + raise ValueError( + "Attention sinks cannot be run on CPU with flex attention. Please switch to a different device, e.g. 
CUDA" + ) + flex_attention_output = compile_friendly_flex_attention( query, key, @@ -311,6 +313,21 @@ def score_mod(score, batch_idx, head_idx, q_idx, kv_idx): if return_lse: attention_output, lse = flex_attention_output # type: ignore[misc] lse = lse.to(value.dtype) + + if s_aux is not None: + # Apply attention sinks by renormalizing using LSE + batch_size, num_heads, seq_len_q, _ = attention_output.shape # batch, num_heads, seq_len, head_dim + sinks = s_aux.view(1, -1, 1, 1).expand(batch_size, num_heads, seq_len_q, 1) + + # We need to compute the normalization that includes the sinks + # since log(sum(exp(scores))) = lse, exp(log(sum(exp(scores)))) = exp(lse) + # NB: log(sum(exp(scores)) + exp(sink)) = log(exp(lse) + exp(sink)) + lse_expanded = lse.unsqueeze(-1) # [batch, num_heads, seq_len, 1] + combined_lse = torch.logsumexp(torch.cat([lse_expanded, sinks], dim=-1), dim=-1, keepdim=True) + + # Use new_norm / old_norm = exp(combined_lse - lse) to compute renorm and apply + renorm_factor = torch.exp(lse_expanded - combined_lse) + attention_output = attention_output * renorm_factor else: attention_output = flex_attention_output # type: ignore[assignment] lse = None diff --git a/src/transformers/integrations/fp_quant.py b/src/transformers/integrations/fp_quant.py index 89ebac7004ee..0ac441e36f93 100644 --- a/src/transformers/integrations/fp_quant.py +++ b/src/transformers/integrations/fp_quant.py @@ -28,6 +28,8 @@ def adapt_fp_quant_config(config: FPQuantConfig): if config.forward_dtype == "mxfp4": forward_dtype = FPQuantDtype.MXFP4 + elif config.forward_dtype == "nvfp4": + forward_dtype = FPQuantDtype.NVFP4 else: raise ValueError(f"Unsupported forward dtype: {config.forward_dtype}") @@ -43,5 +45,6 @@ def adapt_fp_quant_config(config: FPQuantConfig): store_master_weights=config.store_master_weights, hadamard_group_size=config.hadamard_group_size, pseudoquantization=config.pseudoquantization, + transform_init=config.transform_init, modules_to_not_convert=config.modules_to_not_convert, ) diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py index 703fd0156365..d5600050188f 100644 --- a/src/transformers/integrations/ggml.py +++ b/src/transformers/integrations/ggml.py @@ -90,6 +90,19 @@ "expert_count": "num_experts", "expert_used_count": "num_experts_per_tok", }, + "lfm2": { + "context_length": "max_position_embeddings", + "block_count": "num_hidden_layers", + "feed_forward_length": "intermediate_size", + "embedding_length": "hidden_size", + "rope.dimension_count": None, + "rope.freq_base": "rope_theta", + "attention.head_count": "num_attention_heads", + "attention.head_count_kv": "num_key_value_heads", + "attention.layer_norm_rms_epsilon": "rms_norm_eps", + "vocab_size": "vocab_size", + "shortconv.l_cache": "conv_L_cache", + }, "qwen3": { "context_length": "max_position_embeddings", "block_count": "num_hidden_layers", @@ -316,11 +329,11 @@ def _gguf_parse_value(_value, data_type): _value = int(_value[0]) elif data_type in [6, 12]: _value = float(_value[0]) - elif data_type in [7]: + elif data_type == 7: _value = bool(_value[0]) - elif data_type in [8]: + elif data_type == 8: _value = array("B", list(_value)).tobytes().decode() - elif data_type in [9]: + elif data_type == 9: _value = _gguf_parse_value(_value, array_data_type) return _value diff --git a/src/transformers/integrations/hub_kernels.py b/src/transformers/integrations/hub_kernels.py index 5be21e2f9a51..6bf8dbcc0219 100644 --- a/src/transformers/integrations/hub_kernels.py +++ 
b/src/transformers/integrations/hub_kernels.py @@ -111,6 +111,27 @@ ) } }, + "SiLU": { + "cuda": { + Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository( + repo_id="kernels-community/activation", layer_name="Silu", version=">=0.1.0" + ) + } + }, + "GeLU": { + "cuda": { + Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository( + repo_id="kernels-community/activation", layer_name="Gelu", version=">=0.1.0" + ) + } + }, + "GeluTanh": { + "cuda": { + Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository( + repo_id="kernels-community/activation", layer_name="GeluTanh", version=">=0.1.0" + ) + } + }, } register_kernel_mapping(_KERNEL_MAPPING) @@ -152,7 +173,10 @@ def load_and_register_kernel(attn_implementation: str) -> None: if not is_kernel(attn_implementation): return if not _kernels_available: - raise ImportError("`kernels` is not installed. Please install it with `pip install kernels`.") + raise ImportError( + "`kernels` is either not installed or uses an incompatible version. " + "Please install the latest version with `pip install -U kernels`." + ) # Need to be imported here as otherwise we have a circular import in `modeling_utils` from ..masking_utils import ALL_MASK_ATTENTION_FUNCTIONS @@ -188,7 +212,7 @@ def load_and_register_kernel(attn_implementation: str) -> None: if attention_wrapper is None: attention_wrapper = flash_attention_forward kernel_function = partial(attention_wrapper, implementation=kernel) - lazy_import_flash_attention(kernel) + lazy_import_flash_attention(kernel, force_import=True) elif kernel_name is not None: kernel_function = getattr(kernel, kernel_name) # Register the kernel as a valid attention diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index 5ef1123b8fce..b81d47831b6b 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -547,8 +547,6 @@ def run_hp_search_sigopt(trainer, n_trials: int, direction: str, **kwargs) -> Be def run_hp_search_wandb(trainer, n_trials: int, direction: str, **kwargs) -> BestRun: - from ..integrations import is_wandb_available - if not is_wandb_available(): raise ImportError("This function needs wandb installed: `pip install wandb`") import wandb @@ -686,7 +684,7 @@ def __init__(self, tb_writer=None): ) if has_tensorboard: try: - from torch.utils.tensorboard import SummaryWriter # noqa: F401 + from torch.utils.tensorboard import SummaryWriter self._SummaryWriter = SummaryWriter except ImportError: @@ -1092,19 +1090,28 @@ def setup(self, args, state, model, **kwargs): """ Setup the optional Trackio integration. - To customize the setup you can also override the following environment variables: - - Environment: - - **TRACKIO_PROJECT** (`str`, *optional*, defaults to `"huggingface"`): - The name of the project (can be an existing project to continue tracking or a new project to start tracking - from scratch). - - **TRACKIO_SPACE_ID** (`str`, *optional*, defaults to `None`): - If set, the project will be logged to a Hugging Face Space instead of a local directory. Should be a - complete Space name like `"username/reponame"` or `"orgname/reponame"`, or just `"reponame" in which case - the Space will be created in the currently-logged-in Hugging Face user's namespace. If the Space does not - exist, it will be created. If the Space already exists, the project will be logged to it. 
+ To customize the setup you can also set the arguments `project`, `trackio_space_id` and `hub_private_repo` in + [`TrainingArguments`]. Please refer to its docstring for more details. """ if state.is_world_process_zero: + if os.getenv("TRACKIO_PROJECT"): + logger.warning( + "The `TRACKIO_PROJECT` environment variable is deprecated and will be removed in a future " + "version. Use TrainingArguments.project instead." + ) + project = os.getenv("TRACKIO_PROJECT") + else: + project = args.project + + if os.getenv("TRACKIO_SPACE_ID"): + logger.warning( + "The `TRACKIO_SPACE_ID` environment variable is deprecated and will be removed in a future " + "version. Use TrainingArguments.trackio_space_id instead." + ) + space_id = os.getenv("TRACKIO_SPACE_ID") + else: + space_id = args.trackio_space_id + combined_dict = {**args.to_dict()} if hasattr(model, "config") and model.config is not None: @@ -1115,10 +1122,11 @@ def setup(self, args, state, model, **kwargs): combined_dict = {**{"peft_config": peft_config}, **combined_dict} self._trackio.init( - project=os.getenv("TRACKIO_PROJECT", "huggingface"), + project=project, name=args.run_name, - space_id=os.getenv("TRACKIO_SPACE_ID", None), + space_id=space_id, resume="allow", + private=args.hub_private_repo, ) # Add config parameters (run may have been created manually) diff --git a/src/transformers/integrations/mistral.py b/src/transformers/integrations/mistral.py index 78172329277e..cdf237645fc1 100644 --- a/src/transformers/integrations/mistral.py +++ b/src/transformers/integrations/mistral.py @@ -16,10 +16,8 @@ def __init__( pattern=r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""", add_prefix_space=False, additional_special_tokens=None, - *args, **kwargs, ): - super().__init__(*args) self.vocab = vocab self.pattern = pattern self.add_prefix_space = add_prefix_space diff --git a/src/transformers/integrations/mxfp4.py b/src/transformers/integrations/mxfp4.py index c40b202c54e8..6a6ce1db17e7 100644 --- a/src/transformers/integrations/mxfp4.py +++ b/src/transformers/integrations/mxfp4.py @@ -23,6 +23,7 @@ from accelerate import init_empty_weights import re +from contextlib import contextmanager logger = logging.get_logger(__name__) @@ -47,6 +48,28 @@ ] +@contextmanager +def on_device(dev): + if is_torch_available(): + import torch + + if isinstance(dev, torch.Tensor): + dev = dev.device + elif isinstance(dev, str): + dev = torch.device(dev) + dev_type = getattr(dev, "type", None) + if dev_type == "cuda": + with torch.cuda.device(dev): + yield + return + if dev_type == "xpu" and hasattr(torch, "xpu"): + with torch.xpu.device(dev): + yield + return + # other: CPU + yield + + # Copied from GPT_OSS repo and vllm def quantize_to_mxfp4(w, triton_kernels_hub): downcast_to_mxfp_torch = triton_kernels_hub.numerics_details.mxfp.downcast_to_mxfp_torch @@ -173,7 +196,7 @@ def forward(self, hidden_states: torch.Tensor, routing_data, gather_idx, scatter ) swiglu_fn = triton_kernels_hub.swiglu.swiglu_fn - with torch.cuda.device(hidden_states.device): + with on_device(hidden_states.device): act = FusedActivation(FnSpecs("swiglu", swiglu_fn, ("alpha", "limit")), (self.alpha, self.limit), 2) intermediate_cache1 = matmul_ogs( @@ -214,7 +237,7 @@ def routing_torch_dist( triton_kernels_hub.routing.compute_expt_data_torch, ) - with torch.cuda.device(logits.device): + with on_device(logits.device): world_size = torch.distributed.get_world_size() rank = int(os.environ.get("LOCAL_RANK", "0"))
replace_value = -1 @@ -281,7 +304,7 @@ def mlp_forward(self, hidden_states): hidden_states = hidden_states.reshape(-1, self.router.hidden_dim) router_logits = nn.functional.linear(hidden_states, self.router.weight, self.router.bias) - with torch.cuda.device(router_logits.device): + with on_device(router_logits.device): routing_data, gather_idx, scatter_idx = routing(router_logits, self.router.top_k) routed_out = self.experts(hidden_states, routing_data, gather_idx, scatter_idx) @@ -320,7 +343,6 @@ def dequantize(module, param_name, param_value, target_device, dq_param_name, ** to_contiguous, rank, device_mesh, - set_param=False, ) blocks_attr = f"{proj}_blocks" scales_attr = f"{proj}_scales" @@ -376,7 +398,7 @@ def load_and_swizzle_mxfp4(module, param_name, param_value, target_device, trito target_device = "cuda" blocks = blocks.to(target_device).contiguous() scales = scales.to(target_device).contiguous() - with torch.cuda.device(target_device): + with on_device(target_device): triton_weight_tensor, weight_scale = swizzle_mxfp4( blocks.transpose(-2, -1), scales.transpose(-2, -1), triton_kernels_hub ) diff --git a/src/transformers/integrations/peft.py b/src/transformers/integrations/peft.py index 87dd6cffc2fa..22261eecad0b 100644 --- a/src/transformers/integrations/peft.py +++ b/src/transformers/integrations/peft.py @@ -15,7 +15,6 @@ import importlib import inspect import re -import warnings from typing import Any, Optional, Union from packaging import version @@ -70,14 +69,9 @@ class PeftAdapterMixin: more details about adapters and injecting them on a transformer-based model, check out the documentation of PEFT library: https://huggingface.co/docs/peft/index - Currently supported PEFT methods are all non-prefix tuning methods. Below is the list of supported PEFT methods - that anyone can load, train and run with this mixin class: - - Low Rank Adapters (LoRA): https://huggingface.co/docs/peft/conceptual_guides/lora - - IA3: https://huggingface.co/docs/peft/conceptual_guides/ia3 - - AdaLora: https://huggingface.co/papers/2303.10512 - - Other PEFT models such as prompt tuning, prompt learning are out of scope as these adapters are not "injectable" - into a torch module. For using these methods, please refer to the usage guide of PEFT library. + Currently supported PEFT methods are all non-prompt learning methods (LoRA, IA³, etc.). Other PEFT models such as + prompt tuning, prompt learning are out of scope as these adapters are not "injectable" into a torch module. For + using these methods, please refer to the usage guide of PEFT library. With this mixin, if the correct PEFT version is installed, it is possible to: @@ -96,7 +90,7 @@ def load_adapter( adapter_name: Optional[str] = None, revision: Optional[str] = None, token: Optional[str] = None, - device_map: Optional[str] = "auto", + device_map: str = "auto", max_memory: Optional[str] = None, offload_folder: Optional[str] = None, offload_index: Optional[int] = None, @@ -110,24 +104,21 @@ def load_adapter( Load adapter weights from file or remote Hub folder. If you are not familiar with adapters and PEFT methods, we invite you to read more about them on PEFT official documentation: https://huggingface.co/docs/peft - Requires peft as a backend to load the adapter weights. + Requires PEFT to be installed as a backend to load the adapter weights. Args: peft_model_id (`str`, *optional*): The identifier of the model to look for on the Hub, or a local path to the saved adapter config file and adapter weights. 
adapter_name (`str`, *optional*): - The adapter name to use. If not set, will use the default adapter. + The adapter name to use. If not set, will use the name "default". revision (`str`, *optional*, defaults to `"main"`): The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any identifier allowed by git. - - - To test a pull request you made on the Hub, you can pass `revision="refs/pr/"`. - - + > [!TIP] + > To test a pull request you made on the Hub, you can pass `revision="refs/pr/"`. token (`str`, `optional`): Whether to use authentication token to load the remote folder. Useful to load private repositories @@ -151,11 +142,11 @@ def load_adapter( offload_index (`int`, `optional`): `offload_index` argument to be passed to `accelerate.dispatch_model` method. peft_config (`dict[str, Any]`, *optional*): - The configuration of the adapter to add, supported adapters are non-prefix tuning and adaption prompts - methods. This argument is used in case users directly pass PEFT state dicts + The configuration of the adapter to add, supported adapters are all non-prompt learning configs (LoRA, + IA³, etc). This argument is used in case users directly pass PEFT state dicts. adapter_state_dict (`dict[str, torch.Tensor]`, *optional*): The state dict of the adapter to load. This argument is used in case users directly pass PEFT state - dicts + dicts. low_cpu_mem_usage (`bool`, *optional*, defaults to `False`): Reduce memory usage while loading the PEFT adapter. This should also speed up the loading process. Requires PEFT version 0.13.0 or higher. @@ -320,10 +311,12 @@ def add_adapter(self, adapter_config, adapter_name: Optional[str] = None) -> Non name is assigned to the adapter to follow the convention of PEFT library (in PEFT we use "default" as the default adapter name). + Note that the newly added adapter is not automatically activated. To activate it, use `model.set_adapter`. + Args: adapter_config (`~peft.PeftConfig`): - The configuration of the adapter to add, supported adapters are non-prefix tuning and adaption prompts - methods + The configuration of the adapter to add, supported adapters are non-prompt learning methods (LoRA, + IA³, etc.). adapter_name (`str`, *optional*, defaults to `"default"`): The name of the adapter to add. If no name is passed, a default name is assigned to the adapter. """ @@ -470,13 +463,6 @@ def active_adapters(self) -> list[str]: return active_adapters - def active_adapter(self) -> str: - warnings.warn( - "The `active_adapter` method is deprecated and will be removed in a future version.", FutureWarning - ) - - return self.active_adapters()[0] - def get_adapter_state_dict(self, adapter_name: Optional[str] = None, state_dict: Optional[dict] = None) -> dict: """ If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT @@ -564,34 +550,47 @@ def _dispatch_accelerate_model( def delete_adapter(self, adapter_names: Union[list[str], str]) -> None: """ - Delete an adapter's LoRA layers from the underlying model. + Delete a PEFT adapter from the underlying model. Args: adapter_names (`Union[list[str], str]`): The name(s) of the adapter(s) to delete. 
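# Illustrative usage sketch of the PeftAdapterMixin methods documented above; the repo ids are
# placeholders and PEFT must be installed for this to run.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("base-model-id")              # placeholder model id
model.load_adapter("user/lora-adapter-id", adapter_name="my_adapter")      # placeholder adapter id
model.set_adapter("my_adapter")    # newly added adapters are not activated automatically
model.delete_adapter("my_adapter") # removes the adapter again, using the logic shown below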
- - Example: - - ```py - from diffusers import AutoPipelineForText2Image - import torch - - pipeline = AutoPipelineForText2Image.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights( - "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_names="cinematic" - ) - pipeline.delete_adapters("cinematic") - ``` """ check_peft_version(min_version=MIN_PEFT_VERSION) + min_version_delete_adapter = "0.18.0" if not self._hf_peft_config_loaded: raise ValueError("No adapter loaded. Please load an adapter first.") - from peft.tuners.tuners_utils import BaseTunerLayer + # TODO: delete old version once support for PEFT < 0.18.0 is dropped + def old_delete_adapter(model, adapter_name, prefix=None): + from peft.tuners.tuners_utils import BaseTunerLayer + from peft.utils import ModulesToSaveWrapper + + has_modules_to_save = False + for module in model.modules(): + if isinstance(module, ModulesToSaveWrapper): + has_modules_to_save |= True + continue + if isinstance(module, BaseTunerLayer): + if hasattr(module, "delete_adapter"): + module.delete_adapter(adapter_name) + else: + raise ValueError( + "The version of PEFT you are using is not compatible, please use a version that is greater than 0.6.1" + ) + + if has_modules_to_save: + logger.warning( + "The deleted adapter contains modules_to_save, which could not be deleted. For this to work, PEFT version " + f">= {min_version_delete_adapter} is required." + ) + + if version.parse(importlib.metadata.version("peft")) >= version.parse(min_version_delete_adapter): + from peft.functional import delete_adapter + else: + delete_adapter = old_delete_adapter if isinstance(adapter_names, str): adapter_names = [adapter_names] @@ -603,16 +602,9 @@ def delete_adapter(self, adapter_names: Union[list[str], str]) -> None: f"The following adapter(s) are not present and cannot be deleted: {', '.join(missing_adapters)}" ) - for adapter_name in adapter_names: - for module in self.modules(): - if isinstance(module, BaseTunerLayer): - if hasattr(module, "delete_adapter"): - module.delete_adapter(adapter_name) - else: - raise ValueError( - "The version of PEFT you are using is not compatible, please use a version that is greater than 0.6.1" - ) - + prefixes = [f"{self.peft_config[adapter_name].peft_type.value.lower()}_" for adapter_name in adapter_names] + for adapter_name, prefix in zip(adapter_names, prefixes): + delete_adapter(self, adapter_name=adapter_name, prefix=prefix) # For transformers integration - we need to pop the adapter from the config if getattr(self, "_hf_peft_config_loaded", False) and hasattr(self, "peft_config"): self.peft_config.pop(adapter_name, None) diff --git a/src/transformers/integrations/sdpa_attention.py b/src/transformers/integrations/sdpa_attention.py index f6c6f2785c3f..e2eb69b2db8f 100644 --- a/src/transformers/integrations/sdpa_attention.py +++ b/src/transformers/integrations/sdpa_attention.py @@ -2,7 +2,7 @@ import torch -from ..utils import is_torch_xpu_available, logging +from ..utils import is_torch_npu_available, is_torch_xpu_available, logging from ..utils.import_utils import is_torch_greater_or_equal @@ -12,6 +12,7 @@ _is_torch_greater_or_equal_than_2_5 = is_torch_greater_or_equal("2.5", accept_dev=True) _is_torch_greater_or_equal_than_2_8 = is_torch_greater_or_equal("2.8", accept_dev=True) _is_torch_xpu_available = is_torch_xpu_available() +_is_torch_npu_available = is_torch_npu_available() def repeat_kv(hidden_states: torch.Tensor, 
n_rep: int) -> torch.Tensor: @@ -35,8 +36,12 @@ def use_gqa_in_sdpa(attention_mask: Optional[torch.Tensor], key: torch.Tensor) - # 2.xpu # - torch version >= 2.8 # - key is not a torch.fx.Proxy (otherwise it will fail with a tracing error) + # 3.npu + # - GQA is currently not supported on NPU if _is_torch_xpu_available: return _is_torch_greater_or_equal_than_2_8 and not isinstance(key, torch.fx.Proxy) + if _is_torch_npu_available: + return False return _is_torch_greater_or_equal_than_2_5 and attention_mask is None and not isinstance(key, torch.fx.Proxy) @@ -80,6 +85,14 @@ def sdpa_attention_forward( if torch.jit.is_tracing() and isinstance(is_causal, torch.Tensor): is_causal = is_causal.item() + # When `is_causal = False` and the `attention_mask` is not of boolean type, the Ascend NPU's SDPA interface cannot utilize the FlashAttentionScore operator, + # and falls back to small-operator concatenation. To invoke FlashAttentionScore, the attention_mask must be converted to boolean type. + # This adaptation ensures the `attention_mask` meets the requirement for using FlashAttentionScore. + if _is_torch_npu_available: + if attention_mask is not None and attention_mask.dtype != torch.bool: + # Convert to boolean type, forcing sdpa to call FlashAttentionScore to improve performance. + attention_mask = torch.logical_not(attention_mask.bool()).to(query.device) + attn_output = torch.nn.functional.scaled_dot_product_attention( query, key, diff --git a/src/transformers/integrations/tensor_parallel.py b/src/transformers/integrations/tensor_parallel.py index 3f9d40f13388..e746ed60a7e4 100644 --- a/src/transformers/integrations/tensor_parallel.py +++ b/src/transformers/integrations/tensor_parallel.py @@ -1009,7 +1009,7 @@ def add_tensor_parallel_hooks_to_module( def shard_and_distribute_module( - model, param, empty_param, parameter_name, param_casting_dtype, is_contiguous, rank, device_mesh, set_param=True + model, param, empty_param, parameter_name, param_casting_dtype, is_contiguous, rank, device_mesh ): # TODO: rename to shard_and_distribute_param r""" This function is called in `from_pretrained` when loading a model's checkpoints. @@ -1103,8 +1103,6 @@ def distribute_model(model, distributed_config, device_mesh, tp_size): raise ValueError(f"Unsupported tensor parallel style {v}.
Supported styles are {ALL_PARALLEL_STYLES}") for name, module in model.named_modules(): if not getattr(module, "_is_hooked", False): - from transformers.integrations.tensor_parallel import add_tensor_parallel_hooks_to_module - plan = _get_parameter_tp_plan(parameter_name=name, tp_plan=model_plan, is_weight=False) add_tensor_parallel_hooks_to_module( model=model, diff --git a/src/transformers/masking_utils.py b/src/transformers/masking_utils.py index 1899a6de8af8..99306bd94c88 100644 --- a/src/transformers/masking_utils.py +++ b/src/transformers/masking_utils.py @@ -26,7 +26,7 @@ if is_torch_flex_attn_available(): - from torch.nn.attention.flex_attention import _DEFAULT_SPARSE_BLOCK_SIZE as flex_default_block_size # noqa: N811 + from torch.nn.attention.flex_attention import _DEFAULT_SPARSE_BLOCK_SIZE as flex_default_block_size from torch.nn.attention.flex_attention import BlockMask, create_block_mask else: # Register a fake type to avoid crashing for annotations and `isinstance` checks @@ -43,7 +43,7 @@ logger = logging.get_logger(__name__) -def and_masks(*mask_functions: list[Callable]) -> Callable: +def and_masks(*mask_functions: Callable) -> Callable: """Returns a mask function that is the intersection of provided mask functions""" if not all(callable(arg) for arg in mask_functions): raise RuntimeError(f"All inputs should be callable mask_functions: {mask_functions}") @@ -57,7 +57,7 @@ def and_mask(batch_idx, head_idx, q_idx, kv_idx): return and_mask -def or_masks(*mask_functions: list[Callable]) -> Callable: +def or_masks(*mask_functions: Callable) -> Callable: """Returns a mask function that is the union of provided mask functions""" if not all(callable(arg) for arg in mask_functions): raise RuntimeError(f"All inputs should be callable mask_functions: {mask_functions}") @@ -625,6 +625,7 @@ class AttentionMaskInterface(GeneralInterface): "sdpa": sdpa_mask, "eager": eager_mask, "flash_attention_2": flash_attention_mask, + "flash_attention_3": flash_attention_mask, "flex_attention": flex_attention_mask, } diff --git a/src/transformers/model_debugging_utils.py b/src/transformers/model_debugging_utils.py index 9f763c83c66d..2c7b47c04fd5 100644 --- a/src/transformers/model_debugging_utils.py +++ b/src/transformers/model_debugging_utils.py @@ -21,6 +21,7 @@ from io import StringIO from typing import Optional +from .utils import logging from .utils.import_utils import is_torch_available, requires @@ -28,6 +29,7 @@ import torch from safetensors.torch import save_file + _torch_distributed_available = False # Note to code inspectors: this toolbox is intended for people who add models to `transformers`. if torch.distributed.is_available(): import torch.distributed.tensor @@ -35,7 +37,6 @@ _torch_distributed_available = True else: _torch_distributed_available = False -from .utils import logging logger = logging.get_logger(__name__) @@ -224,7 +225,7 @@ def prune_intermediate_layers(node): prune_intermediate_layers(child) -def log_model_debug_trace(debug_path, model): +def log_model_debug_trace(debug_path: Optional[str], model): if debug_path: try: os.makedirs(debug_path, exist_ok=True) @@ -269,8 +270,8 @@ def clean(val): def _attach_debugger_logic( model, - debug_path: Optional[str] = ".", - do_prune_layers: Optional[bool] = True, + debug_path: str = ".", + do_prune_layers: bool = True, use_repr: bool = True, ): """ @@ -283,7 +284,7 @@ def _attach_debugger_logic( debug_path (`str`): Optional directory to dump debug JSON files. 
do_prune_layers (`bool`, *optional*, defaults to `True`): Whether to prune intermediate layers. use_repr (bool, *optional*, defaults to `True`): Whether to save a `repr()`-ized version of the tensors as the - `value` property in the asscoiated FULL_TENSORS.json file, or to store full tensors in separate SafeTensors + `value` property in the associated FULL_TENSORS.json file, or to store full tensors in separate SafeTensors files and store the relative path to that file in the `value` property. """ class_name = model.__class__.__name__ @@ -399,8 +400,8 @@ def top_wrapped_forward(*inps, **kws): def model_addition_debugger_context( model, debug_path: Optional[str] = None, - do_prune_layers: Optional[bool] = True, - use_repr: Optional[bool] = True, + do_prune_layers: bool = True, + use_repr: bool = True, ): """ # Model addition debugger - context manager for model adders diff --git a/src/transformers/modelcard.py b/src/transformers/modelcard.py index 8c68d8b8af10..dd3a0b401733 100644 --- a/src/transformers/modelcard.py +++ b/src/transformers/modelcard.py @@ -794,8 +794,7 @@ def parse_log_history(log_history): if idx > 0: eval_results = {} for key, value in log_history[idx].items(): - if key.startswith("eval_"): - key = key[5:] + key = key.removeprefix("eval_") if key not in ["runtime", "samples_per_second", "steps_per_second", "epoch", "step"]: camel_cased_key = " ".join([part.capitalize() for part in key.split("_")]) eval_results[camel_cased_key] = value diff --git a/src/transformers/modeling_flash_attention_utils.py b/src/transformers/modeling_flash_attention_utils.py index 37554773a85f..5312b0dd9cd0 100644 --- a/src/transformers/modeling_flash_attention_utils.py +++ b/src/transformers/modeling_flash_attention_utils.py @@ -124,7 +124,7 @@ def _lazy_define_process_function(flash_function): return partial(_process_flash_attention_kwargs, supports_mapping=supports_mapping) -def lazy_import_flash_attention(implementation: Optional[str]): +def lazy_import_flash_attention(implementation: Optional[str], force_import: Optional[bool] = False): """ Lazily import flash attention and return the respective functions + flags. @@ -132,11 +132,11 @@ def lazy_import_flash_attention(implementation: Optional[str]): work without preloading. See `load_and_register_kernel` in `integrations.hub_kernels`. 
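# Minimal sketch of the caching pattern used by `lazy_import_flash_attention` below: the imported
# callables are cached in module-level globals and re-resolved only when `force_import=True`
# (e.g. after a hub kernel registers a different implementation). The names here are simplified
# stand-ins rather than the real transformers helpers.
_cached_fn = None

def lazy_get(implementation=None, force_import=False):
    global _cached_fn
    if force_import or _cached_fn is None:
        from math import sqrt as _impl  # stand-in for the actual lazy import
        _cached_fn = _impl
    return _cached_fn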
""" global _flash_fn, _flash_varlen_fn, _pad_fn, _unpad_fn - if any(k is None for k in [_flash_fn, _flash_varlen_fn, _pad_fn, _unpad_fn]): + if force_import or any(k is None for k in [_flash_fn, _flash_varlen_fn, _pad_fn, _unpad_fn]): _flash_fn, _flash_varlen_fn, _pad_fn, _unpad_fn = _lazy_imports(implementation) global _process_flash_kwargs_fn - if _process_flash_kwargs_fn is None: + if force_import or _process_flash_kwargs_fn is None: _process_flash_kwargs_fn = _lazy_define_process_function(_flash_varlen_fn) return (_flash_fn, _flash_varlen_fn, _pad_fn, _unpad_fn), _process_flash_kwargs_fn diff --git a/src/transformers/modeling_gguf_pytorch_utils.py b/src/transformers/modeling_gguf_pytorch_utils.py index 9b90fb82afa2..08aaac3617ff 100644 --- a/src/transformers/modeling_gguf_pytorch_utils.py +++ b/src/transformers/modeling_gguf_pytorch_utils.py @@ -243,6 +243,17 @@ def process(self, weights, name, **kwargs): return GGUFTensor(weights, name, {}) +class Lfm2TensorProcessor(TensorProcessor): + def __init__(self, config=None): + super().__init__(config=config) + + def process(self, weights, name, **kwargs): + if "shortconv.conv.weight" in name: + ## GGUF shape is [hidden_dim, L_cache], HF expects [hidden_dim, 1, L_cache] + weights = np.expand_dims(weights, axis=1) ## equivalent to unsqueeze(1) + return GGUFTensor(weights, name, {}) + + TENSOR_PROCESSORS = { "llama": LlamaTensorProcessor, "qwen2moe": Qwen2MoeTensorProcessor, @@ -255,6 +266,7 @@ def process(self, weights, name, **kwargs): "nemotron": NemotronTensorProcessor, "gemma2": Gemma2TensorProcessor, "gemma3": Gemma2TensorProcessor, + "lfm2": Lfm2TensorProcessor, } @@ -459,6 +471,19 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False, model_to_lo if parsed_parameters["config"]["model_type"] == "gemma3": parsed_parameters["config"]["model_type"] = "gemma3_text" + if parsed_parameters["config"]["model_type"] == "lfm2": + gguf_num_key_value_heads = parsed_parameters["config"]["num_key_value_heads"] + # LFM2 GGUF checkpoint defines num_key_value_heads as a list of integers .e.g [0, 0, 8, 0, 0, 8, 0, 0, 8, 0, 8, 0, 8, 0, 8, 0] but we need to set it to the max value for HF + parsed_parameters["config"]["num_key_value_heads"] = max(gguf_num_key_value_heads) + ## we already read the correct intermediate_size from the GGUF checkpoint so we need to set block_auto_adjust_ff_dim to False + parsed_parameters["config"]["block_auto_adjust_ff_dim"] = False + + ## llama.cpp defines the layers that are full-attention by looking at num_key_value_heads + ## we need to set the full_attn_idxs to the layers that are full-attention + parsed_parameters["config"]["full_attn_idxs"] = [ + i for i, num_kv_heads in enumerate(gguf_num_key_value_heads) if num_kv_heads > 0 + ] + # retrieve config vocab_size from tokenizer # Please refer to https://github.com/huggingface/transformers/issues/32526 for more details if "vocab_size" not in parsed_parameters["config"]: diff --git a/src/transformers/modeling_outputs.py b/src/transformers/modeling_outputs.py index 597e20b28ca8..1747f6fa477b 100755 --- a/src/transformers/modeling_outputs.py +++ b/src/transformers/modeling_outputs.py @@ -1651,7 +1651,7 @@ class Seq2SeqTSPredictionOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - params: Optional[tuple[torch.FloatTensor]] = None + params: Optional[tuple[torch.FloatTensor, ...]] = None past_key_values: Optional[EncoderDecoderCache] = None decoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None decoder_attentions: 
Optional[tuple[torch.FloatTensor, ...]] = None diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py index 34c136980234..c0070df6ee17 100644 --- a/src/transformers/modeling_rope_utils.py +++ b/src/transformers/modeling_rope_utils.py @@ -98,17 +98,30 @@ def _compute_default_rope_parameters( Computes the inverse frequencies according to the original RoPE implementation Args: config ([`~transformers.PretrainedConfig`]): - The model configuration. + The model configuration. This function assumes that the config will provide at least the following + properties: + + * rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived. + * hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly. + * num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly. + + Additionally, this function will make use of the following properties if they are found in the config: + + * head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be + derived as hidden_size // num_attention_heads. + * partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for + the first fraction of the head_dim. Defaults to 1.0. device (`torch.device`): The device to use for initialization of the inverse frequencies. seq_len (`int`, *optional*): The current sequence length. Unused for this type of RoPE. + Returns: Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_theta - partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) @@ -128,11 +141,24 @@ def _compute_linear_scaling_rope_parameters( Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev Args: config ([`~transformers.PretrainedConfig`]): - The model configuration. + The model configuration. This function assumes that the config will provide at least the following + properties: + + * rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived. + * hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly. + * num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly. + + Additionally, this function will make use of the following properties if they are found in the config: + + * head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be + derived as hidden_size // num_attention_heads. + * partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for + the first fraction of the head_dim. Defaults to 1.0. device (`torch.device`): The device to use for initialization of the inverse frequencies. seq_len (`int`, *optional*): The current sequence length. Unused for this type of RoPE. + Returns: Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
@@ -156,20 +182,43 @@ def _compute_dynamic_ntk_parameters( ) -> tuple["torch.Tensor", float]: """ Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla + Args: config ([`~transformers.PretrainedConfig`]): - The model configuration. + The model configuration. This function assumes that the config will provide at least the following + properties: + + * rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived. + * hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly. + * num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly. + * max_position_embeddings (`int`): The default sequence length used to update the dynamic RoPE at + inference time + * rope_scaling (`dict[str, float]`): The standard RoPE scaling parameters, from which `factor` + will be accessed. The value of `factor` is used to determine the new base frequency, along with the + current sequence length (seq_len), the maximum positional embeddings (max_position_embeddings), and the + computed dimensionality (dim) of the rotary embeddings. If seq_len <= max_position_embeddings, this + factor has no effect. If seq_len > max_position_embeddings, this factor effectively stretches the + context window using an exponent derived from `dim`. + + Additionally, this function will make use of the following properties if they are found in the config: + + * head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be + derived as hidden_size // num_attention_heads. + * partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for + the first fraction of the head_dim. Defaults to 1.0. device (`torch.device`): The device to use for initialization of the inverse frequencies. seq_len (`int`, *optional*): - The current sequence length, used to update the dynamic RoPE at inference time. + The current sequence length, used to update the dynamic RoPE at inference time. If `None` or shorter than + max_position_embeddings, this value will be overridden by max_position_embeddings. + + Returns: Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling base = config.rope_theta - partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) dim = int(head_dim * partial_rotary_factor) max_position_embeddings = config.max_position_embeddings @@ -200,20 +249,58 @@ def _compute_yarn_parameters( """ Computes the inverse frequencies with NTK scaling. Please refer to the [original paper](https://huggingface.co/papers/2309.00071) + Args: config ([`~transformers.PretrainedConfig`]): - The model configuration. + The model configuration. This function assumes that the config will provide at least the following + properties: + + * rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived. + * hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly. + * num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
+ * max_position_embeddings (`int`): The maximum length of the positional embeddings. + * rope_scaling (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following + keys will be accessed: + * `attention_factor` (`float`, *optional*): The scaling factor to be applied to the computed cos/sin. + If None, the value is inferred from `factor`, `mscale`, and `mscale_all_dim` as available. + * `beta_fast` (`float`, *optional*, defaults to 32): Parameter to set the boundary for extrapolation + (only) in the linear ramp function. + * `beta_slow` (`float`, *optional*, defaults to 1): Parameter to set the boundary for interpolation + (only) in the linear ramp function. + * `factor` (`float`, *optional*): The scaling factor applied when interpolating the position IDs to + extend the possible context length. Additionally, if `attention_factor` is None, the log of this + value is used to compute a value for `attention_factor`, possibly in conjunction with `mscale` and + `mscale_all_dim`, if provided. + * `mscale` (`float`, *optional*): If `attention_factor` is None and both `mscale` and + `mscale_all_dim` are provided, `mscale` acts as a scalar augmenting `log(factor)` when computing the + numerator for the inferred value of `attention_factor`. If not provided, `attention_factor` will be + calculated based on `factor` only. + * `mscale_all_dim` (`float`, *optional*): If `attention_factor` is None and both `mscale` and + `mscale_all_dim` are provided, `mscale_all_dim` acts as a scalar augmenting `log(factor)` when computing + the denominator for the inferred value of `attention_factor`. If not provided, `attention_factor` + will be calculated based on `factor` only. + * `original_max_position_embeddings` (`int`, *optional*): The original max position embeddings used + during pretraining. If not provided, the function falls back to `max_position_embeddings`. + * `truncate` (`bool`, *optional*): Whether to truncate the correction range. + + Additionally, this function will make use of the following properties if they are found in the config: + + * head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be + derived as hidden_size // num_attention_heads. + * partial_rotary_factor (`float`, *optional*, defaults to 1.0): If less than 1.0, inverse frequencies + will be returned for the first fraction of the head_dim. device (`torch.device`): The device to use for initialization of the inverse frequencies. seq_len (`int`, *optional*): The current sequence length. Unused for this type of RoPE. + Returns: Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the post-processing scaling factor applied to the computed cos/sin.
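# Minimal sketch of how the inverse frequencies described in these docstrings are derived from
# rope_theta, hidden_size, num_attention_heads and partial_rotary_factor; the numbers are
# placeholders, and the scaling variants (dynamic NTK, YaRN, LongRoPE) build on this base computation.
import torch

rope_theta = 10000.0
hidden_size, num_attention_heads = 4096, 32
partial_rotary_factor = 1.0
head_dim = hidden_size // num_attention_heads   # used unless the config provides head_dim directly
dim = int(head_dim * partial_rotary_factor)
inv_freq = 1.0 / (rope_theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))  # shape (dim // 2,)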
""" base = config.rope_theta - partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) dim = int(head_dim * partial_rotary_factor) factor = config.rope_scaling["factor"] @@ -237,7 +324,7 @@ def get_mscale(scale, mscale=1): attention_factor = get_mscale(factor) # Optional config options - # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly) + # beta_fast/beta_slow: as suggested in the paper, default to 32 and 1 respectively beta_fast = config.rope_scaling.get("beta_fast") or 32 beta_slow = config.rope_scaling.get("beta_slow") or 1 @@ -287,20 +374,49 @@ def _compute_longrope_parameters( """ Computes the inverse frequencies with LongRoPE scaling. Please refer to the [original implementation](https://github.com/microsoft/LongRoPE) + Args: config ([`~transformers.PretrainedConfig`]): - The model configuration. + The model configuration. This function assumes that the config will provide at least the following + properties: + + * rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived. + * hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly. + * num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly. + * max_position_embeddings (`int`): The maximum length of the positional embeddings. + * original_max_position_embeddings (`int`, *optional*): The original max position embeddings used during + pretraining. If not provided, defaults to `max_position_embeddings`. + * rope_scaling (`dict[str, float]`): The standard RoPE scaling parameters, from which the following keys + will be accessed: + * `attention_factor` (`float`, *optional*): The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, inferred from + the value of `factor`. + * `factor` (`float`, *optional*): The scaling factor to apply to the RoPE embeddings. If both + `max_position_embeddings` and `original_max_position_embeddings` are provided, this value will be + overridden s the ratio between those values. + * `long_factor` (`float`, *optional*): The scale factor applied when computing the inverse + frequencies if `seq_len` is provided and greater than `original_max_position_embeddings`. + * `short_factor` (`float`, *optional*): The scale factor applied when computing the inverse + frequencies if `seq_len` is None or less-than-or-equal-to `original_max_position_embeddings`. + + Additionally, this function will make use of the following properties if they are found in the config: + + * head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be + derived as hidden_size // num_attention_heads. + * partial_rotary_factor (`float`, *optional*, defaults to 1.0): If less than 1.0, inverse frequencies + will be returned for the first fraction of the head_dim. device (`torch.device`): The device to use for initialization of the inverse frequencies. seq_len (`int`, *optional*): The current sequence length. + Returns: Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the post-processing scaling factor applied to the computed cos/sin. 
""" # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling base = config.rope_theta - partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) dim = int(head_dim * partial_rotary_factor) long_factor = config.rope_scaling["long_factor"] @@ -311,9 +427,8 @@ def _compute_longrope_parameters( # NOTE: Phi3 (and potentially other models) modify `max_position_embeddings` and have a # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two # values to compute the default attention scaling factor, instead of using `factor`. - if hasattr(config, "original_max_position_embeddings"): - original_max_position_embeddings = config.original_max_position_embeddings - factor = config.max_position_embeddings / config.original_max_position_embeddings + if original_max_position_embeddings := getattr(config, "original_max_position_embeddings", None): + factor = config.max_position_embeddings / original_max_position_embeddings else: original_max_position_embeddings = config.max_position_embeddings @@ -343,7 +458,31 @@ def _compute_llama3_parameters( Args: config ([`~transformers.PretrainedConfig`]): - The model configuration. + The model configuration. This function assumes that the config will provide at least the following + properties: + + * rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived. + * hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly. + * num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly. + * rope_scaling (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following + keys will be accessed: + * `factor` (`float`, *optional*): The scaling factor applied to the inverse frequencies when 1) the + wavelength is greater than `low_freq_wavelen` prior to smoothing, and 2) to all inverse frequencies + during smoothing. + * `high_freq_factor` (`float`): The scale factor used to compute `high_freq_wavelen` and + the value for the denominator of the smoothing factor prior to the `low_freq_factor` shift. + * `low_freq_factor` (`float`): The scale factor used to compute `low_freq_wavelen` and + the shift applied to the numerator and denominator of the smoothing factor. + frequencies if `seq_len` is None or less-than-or-equal-to `original_max_position_embeddings`. + * `original_max_position_embeddings` (`int`): The original max position embeddings used + during pretraining. If not provided, the function falls back to `max_position_embeddings`. + + Additionally, this function will make use of the following properties if they are found in the config: + + * head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be + derived as hidden_size // num_attention_heads. + * partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for + the first fraction of the head_dim. Defaults to 1.0. device (`torch.device`): The device to use for initialization of the inverse frequencies. 
seq_len (`int`, *optional*): @@ -527,7 +666,7 @@ def _validate_longrope_parameters(config: PretrainedConfig, ignore_keys: Optiona received_keys = set(rope_scaling.keys()) _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys) - partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 12c3e7cd99ef..a1cf858469a6 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -19,13 +19,10 @@ import gc import importlib.metadata import inspect -import itertools import json import os import re -import shutil import sys -import tempfile import warnings from abc import abstractmethod from collections import defaultdict @@ -40,6 +37,9 @@ import torch from huggingface_hub import split_torch_state_dict_into_shards from packaging import version +from safetensors import safe_open +from safetensors.torch import load_file as safe_load_file +from safetensors.torch import save_file as safe_save_file from torch import Tensor, nn from torch.distributions import constraints from torch.utils.checkpoint import checkpoint @@ -103,14 +103,12 @@ is_optimum_available, is_peft_available, is_remote_url, - is_safetensors_available, is_torch_flex_attn_available, is_torch_greater_or_equal, is_torch_mlu_available, is_torch_npu_available, is_torch_xla_available, is_torch_xpu_available, - is_torchao_available, logging, ) from .utils.generic import _CAN_RECORD_REGISTRY, GeneralInterface, OutputRecorder @@ -125,9 +123,6 @@ from .utils.quantization_config import BitsAndBytesConfig, QuantizationMethod -if is_torchao_available(): - from torchao.quantization import Int4WeightOnlyConfig - if is_accelerate_available(): from accelerate import dispatch_model, infer_auto_device_map from accelerate.hooks import add_hook_to_module @@ -136,7 +131,6 @@ extract_model_from_parallel, get_balanced_memory, get_max_memory, - load_offloaded_weights, offload_weight, save_offload_index, ) @@ -145,11 +139,6 @@ if accelerate_version >= version.parse("0.31"): from accelerate.utils.modeling import get_state_dict_from_offload -if is_safetensors_available(): - from safetensors import safe_open - from safetensors.torch import load_file as safe_load_file - from safetensors.torch import save_file as safe_save_file - if is_peft_available(): from .utils import find_adapter_config_file @@ -414,24 +403,11 @@ def load_sharded_checkpoint(model, folder, strict=True, prefer_safe=True): index_present = os.path.isfile(index_file) safe_index_present = os.path.isfile(safe_index_file) - if not index_present and not (safe_index_present and is_safetensors_available()): - filenames = ( - (WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_INDEX_NAME) if is_safetensors_available() else (WEIGHTS_INDEX_NAME,) - ) + if not index_present and not safe_index_present: + filenames = (WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_INDEX_NAME) raise ValueError(f"Can't find a checkpoint index ({' or '.join(filenames)}) in {folder}.") - load_safe = False - if safe_index_present: - if prefer_safe: - if is_safetensors_available(): - load_safe = True # load safe due to preference - else: - logger.warning( - f"Cannot load sharded checkpoint at {folder} safely since safetensors is not installed!" 
- ) - elif not index_present: - load_safe = True # load safe since we have no other choice - + load_safe = safe_index_present and (prefer_safe or not index_present) load_index = safe_index_file if load_safe else index_file with open(load_index, "r", encoding="utf-8") as f: @@ -504,7 +480,7 @@ def load_state_dict( Reads a `safetensor` or a `.bin` checkpoint file. We load the checkpoint on "cpu" by default. """ # Use safetensors if possible - if checkpoint_file.endswith(".safetensors") and is_safetensors_available(): + if checkpoint_file.endswith(".safetensors"): with safe_open(checkpoint_file, framework="pt") as f: metadata = f.metadata() @@ -575,26 +551,6 @@ def load_state_dict( ) -def set_initialized_submodules(model, state_dict_keys): - """ - Sets the `_is_hf_initialized` flag in all submodules of a given model when all its weights are in the loaded state - dict. - """ - state_dict_keys = set(state_dict_keys) - not_initialized_submodules = {} - for module_name, module in model.named_modules(): - if module_name == "": - # When checking if the root module is loaded there's no need to prepend module_name. - module_keys = set(module.state_dict()) - else: - module_keys = {f"{module_name}.{k}" for k in module.state_dict()} - if module_keys.issubset(state_dict_keys): - module._is_hf_initialized = True - else: - not_initialized_submodules[module_name] = module - return not_initialized_submodules - - def _end_ptr(tensor: torch.Tensor) -> int: # extract the end of the pointer if the tensor is a slice of a bigger tensor if tensor.nelement(): @@ -682,6 +638,7 @@ def _infer_parameter_dtype( QuantizationMethod.HQQ, QuantizationMethod.QUARK, QuantizationMethod.MXFP4, + QuantizationMethod.BITS_AND_BYTES, }: return True, None else: @@ -715,17 +672,12 @@ def _load_state_dict_into_meta_model( model: "PreTrainedModel", state_dict: dict, shard_file: str, - expected_keys: list[str], reverse_renaming_mapping: dict[str, str], device_map: Optional[dict] = None, disk_offload_folder: Optional[str] = None, disk_offload_index: Optional[dict] = None, - cpu_offload_folder: Optional[str] = None, - cpu_offload_index: Optional[dict] = None, hf_quantizer: Optional[HfQuantizer] = None, - is_safetensors: bool = False, keep_in_fp32_regex: Optional[re.Pattern] = None, - unexpected_keys: Optional[list[str]] = None, # passing `unexpected` for cleanup from quantization items device_mesh: Optional["torch.distributed.device_mesh.DeviceMesh"] = None, ) -> tuple[Optional[dict], Optional[dict]]: """Load parameters from `meta_state_dict` into the model. 
The parameters of the `meta_state_dict` are on the meta @@ -741,18 +693,13 @@ def _load_state_dict_into_meta_model( device_map_regex = "|".join([re.escape(k) for k in sorted(device_map.keys(), reverse=True)]) is_quantized = hf_quantizer is not None - is_hqq_or_bnb = is_quantized and hf_quantizer.quantization_config.quant_method in { - QuantizationMethod.HQQ, - QuantizationMethod.BITS_AND_BYTES, - } - is_meta_state_dict = shard_file.endswith(".safetensors") and not is_hqq_or_bnb - file_pointer = None - if is_meta_state_dict: - file_pointer = safe_open(shard_file, framework="pt", device=tensor_device) + is_safetensors = shard_file.endswith(".safetensors") + is_meta_state_dict = is_safetensors + file_pointer = safe_open(shard_file, framework="pt", device=tensor_device) if is_meta_state_dict else None + params_to_load = list(state_dict.keys()) - for param_name, empty_param in state_dict.items(): - if param_name not in expected_keys: # when loading from ckpt, we skip param if doesnt exist in modeling - continue + for param_name in params_to_load: + empty_param = state_dict[param_name] # we need to use serialized_param_name as file pointer is untouched if is_meta_state_dict: # This is the name of the parameter as it appears on disk file @@ -769,19 +716,8 @@ def _load_state_dict_into_meta_model( ) if device_mesh is not None: - if ( - not is_quantized - or (not hf_quantizer.requires_parameters_quantization) - or ( - not hf_quantizer.check_quantized_param( - model, - param, - param_name, - state_dict, - device_map=device_map, - ) - ) - ): # In this case, the param is already on the correct device! + if not is_quantized or not hf_quantizer.param_needs_quantization(model, param_name): + # In this case, the param is already on the correct device! shard_and_distribute_module( model, param, @@ -792,7 +728,8 @@ def _load_state_dict_into_meta_model( device_mesh.get_local_rank(), device_mesh, ) - else: # we have a device mesh but the param needs to be quantized, so we shard inside create_quantized_param: + else: + # we have a device mesh but the param needs to be quantized, so we shard inside create_quantized_param sharding_kwargs = { "empty_param": empty_param, "casting_dtype": casting_dtype, @@ -805,8 +742,6 @@ def _load_state_dict_into_meta_model( param, param_name, device_mesh.get_local_rank(), - state_dict, - unexpected_keys, **sharding_kwargs, ) else: @@ -828,22 +763,7 @@ def _load_state_dict_into_meta_model( if param_device == "disk": if not is_safetensors: disk_offload_index = offload_weight(param, param_name, disk_offload_folder, disk_offload_index) - elif param_device == "cpu" and cpu_offload_index is not None: - cpu_offload_index = offload_weight(param, param_name, cpu_offload_folder, cpu_offload_index) - elif ( - not is_quantized - or (not hf_quantizer.requires_parameters_quantization) - or ( - not hf_quantizer.check_quantized_param( - model, - param, - param_name, - state_dict, - param_device=param_device, - device_map=device_map, - ) - ) - ): + elif not is_quantized or not hf_quantizer.param_needs_quantization(model, param_name): if is_fsdp_enabled(): param_device = "cpu" if is_local_dist_rank_0() else "meta" @@ -851,35 +771,33 @@ def _load_state_dict_into_meta_model( else: # TODO naming is stupid it loads it as well - hf_quantizer.create_quantized_param( - model, param, param_name, param_device, state_dict, unexpected_keys - ) + hf_quantizer.create_quantized_param(model, param, param_name, param_device) # For quantized modules with FSDP/DeepSpeed Stage 3, we need to quantize the parameter on 
the GPU # and then cast it to CPU to avoid excessive memory usage on each GPU # in comparison to the sharded model across GPUs. if is_fsdp_enabled() or is_deepspeed_zero3_enabled(): - param_name = hf_quantizer.update_param_name(param_name) + param_name = hf_quantizer.get_param_name(param_name) module, param_type = get_module_from_name(model, param_name) value = getattr(module, param_type) - # special case for gpt_oss model, we wait for the param to be leave the meta device before casting it to cpu - if model.config.model_type == "gpt_oss" and value.device.type == "meta": + # We need to wait until the quantized value is created + if value.device.type == "meta": continue - param_to = "cpu" - if is_fsdp_enabled() and not is_local_dist_rank_0(): - param_to = "meta" - val_kwargs = {} - if (hasattr(module, "weight") and module.weight.__class__.__name__ == "Int8Params") or ( - value.dtype == torch.uint8 or value.dtype == torch.int8 - ): + val_kwargs = value.__dict__ + if not value.is_floating_point(): val_kwargs["requires_grad"] = False - value = type(value)(value.data.to(param_to), **val_kwargs, **value.__dict__) + device = "meta" if is_fsdp_enabled() and not is_local_dist_rank_0() else "cpu" + value = type(value)(value.data.to(device), **val_kwargs) setattr(module, param_type, value) + # Remove the param from the state dict if it was not loaded on the fly to avoid wasting memory + if not is_meta_state_dict: + del state_dict[param_name] + if file_pointer is not None: file_pointer.__exit__(None, None, None) - return disk_offload_index, cpu_offload_index + return disk_offload_index def load_shard_file(args): @@ -887,46 +805,26 @@ def load_shard_file(args): shard_file, state_dict, disk_only_shard_files, - is_hqq_or_bnb, is_quantized, device_map, hf_quantizer, key_renaming_mapping, weights_only, - model_to_load, - expected_keys, + model, reverse_key_renaming_mapping, disk_offload_folder, disk_offload_index, - cpu_offload_folder, - cpu_offload_index, - is_offloaded_safetensors, keep_in_fp32_regex, - unexpected_keys, device_mesh, ) = args # Skip the load for shards that only contain disk-offloaded weights if shard_file in disk_only_shard_files: - return [], disk_offload_index, cpu_offload_index + return [], disk_offload_index map_location = "cpu" - if ( - shard_file.endswith(".safetensors") - and not is_hqq_or_bnb - and not (is_deepspeed_zero3_enabled() and not is_quantized) - ): + if shard_file.endswith(".safetensors") and not (is_deepspeed_zero3_enabled() and not is_quantized): map_location = "meta" - elif ( - device_map is not None - and hf_quantizer is not None - and hf_quantizer.quantization_config.quant_method == QuantizationMethod.TORCHAO - and ( - hf_quantizer.quantization_config.quant_type in ["int4_weight_only", "autoquant"] - or isinstance(hf_quantizer.quantization_config.quant_type, Int4WeightOnlyConfig) - ) - ): - map_location = torch.device([d for d in device_map.values() if d not in ["disk"]][0]) # If shard_file is "", we use the existing state_dict instead of loading it if shard_file != "": @@ -938,30 +836,24 @@ def load_shard_file(args): state_dict = {key_renaming_mapping[k]: v for k, v in state_dict.items() if k in key_renaming_mapping} error_msgs = [] - if is_deepspeed_zero3_enabled() and not is_quantized: - error_msgs += _load_state_dict_into_zero3_model(model_to_load, state_dict) + error_msgs += _load_state_dict_into_zero3_model(model, state_dict) # Skip it with fsdp on ranks other than 0 elif not (is_fsdp_enabled() and not is_local_dist_rank_0() and not is_quantized): - 
disk_offload_index, cpu_offload_index = _load_state_dict_into_meta_model( - model_to_load, + disk_offload_index = _load_state_dict_into_meta_model( + model, state_dict, shard_file, - expected_keys, reverse_key_renaming_mapping, device_map=device_map, disk_offload_folder=disk_offload_folder, disk_offload_index=disk_offload_index, - cpu_offload_folder=cpu_offload_folder, - cpu_offload_index=cpu_offload_index, hf_quantizer=hf_quantizer, - is_safetensors=is_offloaded_safetensors, keep_in_fp32_regex=keep_in_fp32_regex, - unexpected_keys=unexpected_keys, device_mesh=device_mesh, ) - return error_msgs, disk_offload_index, cpu_offload_index + return error_msgs, disk_offload_index def load_shard_files_with_threadpool(args_list): @@ -978,18 +870,13 @@ def load_shard_files_with_threadpool(args_list): with logging.tqdm(total=len(args_list), desc="Loading checkpoint shards") as pbar: futures = [executor.submit(load_shard_file, arg) for arg in args_list] for future in as_completed(futures): - result = future.result() - ( - _error_msgs, - disk_offload_index, - cpu_offload_index, - ) = result + _error_msgs, disk_offload_index = future.result() error_msgs += _error_msgs pbar.update(1) - return error_msgs, disk_offload_index, cpu_offload_index + return error_msgs, disk_offload_index def _add_variant(weights_name: str, variant: Optional[str] = None) -> str: @@ -1190,7 +1077,12 @@ def _get_resolved_checkpoint_files( is_sharded = True if not local_files_only and not is_offline_mode(): if resolved_archive_file is not None: - if filename in [WEIGHTS_NAME, WEIGHTS_INDEX_NAME]: + # In a CI environment (CircleCI / Github Actions workflow runs) or in a pytest run, + # we set `DISABLE_SAFETENSORS_CONVERSION=true` to prevent the conversion. + if ( + filename in [WEIGHTS_NAME, WEIGHTS_INDEX_NAME] + and os.getenv("DISABLE_SAFETENSORS_CONVERSION", None) != "true" + ): # If the PyTorch file was found, check if there is a safetensors file on the repository # If there is no safetensors file on the repositories, start an auto conversion safe_weights_name = SAFE_WEIGHTS_INDEX_NAME if is_sharded else SAFE_WEIGHTS_NAME @@ -1481,20 +1373,18 @@ def _get_device_map( def _find_missing_and_unexpected_keys( - cls, model: "PreTrainedModel", original_checkpoint_keys: list[str], checkpoint_keys: list[str], loading_base_model_from_task_state_dict: bool, hf_quantizer: Optional[HfQuantizer], - device_map: dict, ) -> tuple[list[str], list[str]]: """Find missing keys (keys that are part of the model parameters but were NOT found in the loaded state dict keys) and unexpected keys (keys found in the loaded state dict keys, but that are NOT part of the model parameters) """ prefix = model.base_model_prefix - # Compute expected keys, i.e. keys that the FULL model (not model_to_load) expects + # Compute expected keys, i.e. keys that the full model expects expected_keys = list(model.state_dict().keys()) if hf_quantizer is not None: expected_keys = hf_quantizer.update_expected_keys(model, expected_keys, checkpoint_keys) @@ -1512,12 +1402,6 @@ def _find_missing_and_unexpected_keys( model_buffers = {n for n, _ in model.named_buffers()} unexpected_keys = sorted(unexpected_keys - model_buffers) - # Old checkpoints may have keys for rotary_emb.inv_freq for each layer, however we moved this buffer to the main model - # (so the buffer name has changed). 
Remove them in such a case - has_inv_freq_buffers = any(buffer.endswith("rotary_emb.inv_freq") for buffer in model_buffers) - if has_inv_freq_buffers: - unexpected_keys = [k for k in unexpected_keys if "rotary_emb.inv_freq" not in k] - tied_params = find_tied_parameters(model) for group in tied_params: missing_in_group = [k for k in missing_keys if k in group] @@ -1526,16 +1410,7 @@ def _find_missing_and_unexpected_keys( if hf_quantizer is not None: missing_keys = hf_quantizer.update_missing_keys(model, missing_keys, prefix) - unexpected_keys = hf_quantizer.update_unexpected_keys(model, unexpected_keys, prefix) - - # Model-specific exceptions for missing and unexpected keys (e.g. if the modeling change over time, or any other reason...) - if cls._keys_to_ignore_on_load_missing is not None: - for pattern in cls._keys_to_ignore_on_load_missing: - missing_keys = [k for k in missing_keys if re.search(pattern, k) is None] - - if cls._keys_to_ignore_on_load_unexpected is not None: - for pattern in cls._keys_to_ignore_on_load_unexpected: - unexpected_keys = [k for k in unexpected_keys if re.search(pattern, k) is None] + unexpected_keys = hf_quantizer.update_unexpected_keys(model, unexpected_keys) return missing_keys, unexpected_keys @@ -1721,7 +1596,7 @@ def create_extended_attention_mask_for_decoder(input_shape, attention_mask, devi def get_extended_attention_mask( self, attention_mask: Tensor, - input_shape: tuple[int], + input_shape: tuple[int, ...], device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, ) -> Tensor: @@ -1959,7 +1834,7 @@ def get_input_embeddings(self) -> nn.Module: ) def set_input_embeddings(self, value: nn.Module): - """Fallback setter that handles **~70 %** of models in the code‑base. + """Fallback setter that handles **~70%** of models in the code-base. Order of attempts: 1. `self.model.embed_tokens` @@ -2305,8 +2180,6 @@ def tp_plan(self, plan: dict[str, str]): if hasattr(self, "named_parameters"): model_param_names = [name for name, _ in self.named_parameters()] if model_param_names: # Only validate if model has parameters - import re - for layer_pattern in plan.keys(): # Convert pattern to regex (replace * with .*) regex_pattern = layer_pattern.replace("*", r"\d+") @@ -2332,8 +2205,6 @@ def tp_plan(self, plan: dict[str, str]): flexible_matched = True break if not flexible_matched: - import warnings - warnings.warn( f"Layer pattern '{layer_pattern}' does not match any parameters in the model. " f"This rule may not be applied during tensor parallelization." @@ -2778,42 +2649,46 @@ def _check_and_adjust_attn_implementation( None to sdpa (to potentially eager). 
""" applicable_attn_implementation = attn_implementation + # If FA not installed, do not fail but use kernels instead if ( - applicable_attn_implementation == "flash_attention_2" + attn_implementation is not None + and attn_implementation.startswith("flash_attention") and self._supports_flash_attn - and not is_flash_attn_2_available() + and not (is_flash_attn_2_available() or is_flash_attn_3_available()) and is_kernels_available() ): - applicable_attn_implementation = "kernels-community/flash-attn" + if attn_implementation.endswith("2"): + applicable_attn_implementation = "kernels-community/flash-attn" + else: + applicable_attn_implementation = "kernels-community/vllm-flash-attn3" + if is_kernel(applicable_attn_implementation): try: load_and_register_kernel(applicable_attn_implementation) # log that we used kernel fallback if successful - if attn_implementation == "flash_attention_2": + if attn_implementation.startswith("flash_attention"): logger.warning_once( - "You do not have `flash_attn` installed, using `kernels-community/flash-attn` from the `kernels` " - "library instead!" + f"You do not have `flash_attn` installed, using `{applicable_attn_implementation}` " + "from the `kernels` library instead!" ) except Exception as e: - if attn_implementation == "flash_attention_2": - self._flash_attn_2_can_dispatch() # will fail as fa2 is not available but raise the proper exception - logger.warning_once( - f"Could not find a kernel matching `{applicable_attn_implementation}` compatible with your device in the " - f"hub:\n{e}.\nUsing default attention implementation instead (sdpa if available, eager otherwise)." - ) - try: - self._sdpa_can_dispatch(is_init_check) - applicable_attn_implementation = "sdpa" - except (ValueError, ImportError) as e: - applicable_attn_implementation = "eager" + # raise the proper exception for requested flash attention + if attn_implementation.startswith("flash_attention"): + if attn_implementation.endswith("2"): + self._flash_attn_2_can_dispatch() + else: + self._flash_attn_3_can_dispatch() + + # error properly out if a kernel was specifically requested + raise e else: applicable_attn_implementation = self.get_correct_attn_implementation( applicable_attn_implementation, is_init_check ) # preload flash attention here to allow compile with fullgraph if applicable_attn_implementation.startswith("flash_attention"): - lazy_import_flash_attention(applicable_attn_implementation) + lazy_import_flash_attention(applicable_attn_implementation, force_import=True) return applicable_attn_implementation @@ -3558,7 +3433,7 @@ def _get_resized_lm_head( self, old_lm_head: nn.Linear, new_num_tokens: Optional[int] = None, - transposed: Optional[bool] = False, + transposed: bool = False, mean_resizing: bool = True, ) -> nn.Linear: """ @@ -3715,7 +3590,7 @@ def _init_added_lm_head_weights_with_mean( old_lm_head_dim, old_num_tokens, added_num_tokens, - transposed=False, + transposed: bool = False, ): if transposed: # Transpose to the desired shape for the function. @@ -3993,8 +3868,6 @@ def save_pretrained( "`save_config` is deprecated and will be removed in v5 of Transformers. Use `is_main_process` instead." 
) is_main_process = kwargs.pop("save_config") - if safe_serialization and not is_safetensors_available(): - raise ImportError("`safe_serialization` requires the `safetensors library: `pip install safetensors`.") # we need to check against tp_size, not tp_plan, as tp_plan is substituted to the class one if self._tp_size is not None and not is_huggingface_hub_greater_or_equal("0.31.4"): @@ -4263,7 +4136,7 @@ def save_pretrained( if _is_dtensor_available and isinstance(state_dict[tensor], DTensor): full_tensor = state_dict[tensor].full_tensor() # to get the correctly ordered tensor we need to repack if packed - if _get_parameter_tp_plan(tensor, self._tp_plan) in ("local_packed_rowwise",): + if _get_parameter_tp_plan(tensor, self._tp_plan) == "local_packed_rowwise": full_tensor = repack_weights(full_tensor, -1, self._tp_size, 2) shard[tensor] = full_tensor.contiguous() # only do contiguous after it's permuted correctly else: @@ -4365,9 +4238,9 @@ def get_memory_footprint(self, return_buffers=True): are tensors that do not require gradients and not registered as parameters. E.g. mean and std in batch norm layers. Please see: https://discuss.pytorch.org/t/what-pytorch-means-by-buffers/120266/2 """ - mem = sum([param.nelement() * param.element_size() for param in self.parameters()]) + mem = sum(param.nelement() * param.element_size() for param in self.parameters()) if return_buffers: - mem_bufs = sum([buf.nelement() * buf.element_size() for buf in self.buffers()]) + mem_bufs = sum(buf.nelement() * buf.element_size() for buf in self.buffers()) mem = mem + mem_bufs return mem @@ -4591,9 +4464,6 @@ def from_pretrained( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download: - Deprecated and ignored. All downloads are now resumed by default when possible. - Will be removed in v5 of Transformers. proxies (`dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. @@ -4683,10 +4553,6 @@ def from_pretrained( If provided, it has to contain dimension named `"tp"` in case it's > 1 dimensional, this dimension will be used for tensor parallelism offload_folder (`str` or `os.PathLike`, *optional*): If the `device_map` contains any value `"disk"`, the folder where we will offload weights. - offload_state_dict (`bool`, *optional*): - If `True`, will temporarily offload the CPU state dict to the hard drive to avoid getting out of CPU - RAM if the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to - `True` when there is some disk offload. offload_buffers (`bool`, *optional*): Whether or not to offload the buffers with the model parameters. 
quantization_config (`Union[QuantizationConfigMixin,Dict]`, *optional*): @@ -4764,7 +4630,6 @@ def from_pretrained( device_map = kwargs.pop("device_map", None) max_memory = kwargs.pop("max_memory", None) offload_folder = kwargs.pop("offload_folder", None) - offload_state_dict = kwargs.pop("offload_state_dict", False) offload_buffers = kwargs.pop("offload_buffers", False) load_in_8bit = kwargs.pop("load_in_8bit", False) load_in_4bit = kwargs.pop("load_in_4bit", False) @@ -4798,6 +4663,7 @@ def from_pretrained( _ = kwargs.pop("mirror", None) _ = kwargs.pop("_fast_init", True) _ = kwargs.pop("low_cpu_mem_usage", None) + _ = kwargs.pop("offload_state_dict", None) # For BC on torch_dtype argument if torch_dtype is not None: @@ -4859,9 +4725,6 @@ def from_pretrained( if token is not None and adapter_kwargs is not None and "token" not in adapter_kwargs: adapter_kwargs["token"] = token - if use_safetensors is None and not is_safetensors_available(): - use_safetensors = False - if gguf_file is not None and not is_accelerate_available(): raise ValueError("accelerate is required when loading a GGUF file `pip install accelerate`.") @@ -5058,12 +4921,7 @@ def from_pretrained( is_quantized = hf_quantizer is not None is_from_file = pretrained_model_name_or_path is not None or gguf_file is not None - if ( - is_safetensors_available() - and is_from_file - and not is_sharded - and checkpoint_files[0].endswith(".safetensors") - ): + if is_from_file and not is_sharded and checkpoint_files[0].endswith(".safetensors"): with safe_open(checkpoint_files[0], framework="pt") as f: metadata = f.metadata() @@ -5159,6 +5017,10 @@ def _assign_original_dtype(module): config._pre_quantization_dtype = original_dtype _assign_original_dtype(model) + # Torchao needs access to all metadata later + if hf_quantizer.quantization_config.quant_method == QuantizationMethod.TORCHAO: + hf_quantizer.set_metadata(checkpoint_files) + if _torch_distributed_available and device_mesh is not None: model = distribute_model(model, distributed_config, device_mesh, tp_size) @@ -5192,7 +5054,6 @@ def _assign_original_dtype(module): sharded_metadata=sharded_metadata, device_map=device_map, disk_offload_folder=offload_folder, - offload_state_dict=offload_state_dict, dtype=dtype, hf_quantizer=hf_quantizer, keep_in_fp32_regex=keep_in_fp32_regex, @@ -5346,6 +5207,14 @@ def _get_key_renaming_mapping( prefix = self.base_model_prefix _prefix = f"{prefix}." + if loading_task_model_from_base_state_dict: + task_specific_expected_keys, base_model_keys = [], [] + for key in self.state_dict(): + if key.startswith(_prefix): + base_model_keys.append(key[len(_prefix) :]) + else: + task_specific_expected_keys.append(key) + renamed_keys = {} key_renaming_mapping = {} for key in checkpoint_keys: @@ -5363,6 +5232,13 @@ def _get_key_renaming_mapping( # In this case, we need to add the prefix to the keys, to match them to the expected keys if loading_task_model_from_base_state_dict: + # small sanity check: if we find a key that is only part of the task-specific keys, we raise + # (if it's also part of the base model, we do not raise and assume it comes from there) + if new_key in task_specific_expected_keys and new_key not in base_model_keys: + raise ValueError( + "The state dictionary of the model you are trying to load is corrupted. Are you sure it was " + "properly saved?" 
+ ) new_key = ".".join([prefix, new_key]) # In this case we need to remove the prefix from the key to match them to the expected keys, and use # only the keys starting with the prefix @@ -5416,7 +5292,6 @@ def _load_pretrained_model( sharded_metadata: Optional[dict] = None, device_map: Optional[dict] = None, disk_offload_folder: Optional[str] = None, - offload_state_dict: Optional[bool] = None, dtype: Optional[torch.dtype] = None, hf_quantizer: Optional[HfQuantizer] = None, keep_in_fp32_regex: Optional[re.Pattern] = None, @@ -5430,10 +5305,6 @@ def _load_pretrained_model( QuantizationMethod.HQQ, QuantizationMethod.QUARK, } - is_hqq_or_bnb = is_quantized and hf_quantizer.quantization_config.quant_method in { - QuantizationMethod.HQQ, - QuantizationMethod.BITS_AND_BYTES, - } # Get all the keys of the state dicts that we have to initialize the model if sharded_metadata is not None: @@ -5447,7 +5318,6 @@ def _load_pretrained_model( # Check if we are in a special state, i.e. loading from a state dict coming from a different architecture prefix = model.base_model_prefix - _prefix = f"{prefix}." has_prefix_module = any(s.startswith(prefix) for s in original_checkpoint_keys) if len(prefix) > 0 else False expects_prefix_module = hasattr(model, prefix) if len(prefix) > 0 else False loading_task_model_from_base_state_dict = not has_prefix_module and expects_prefix_module @@ -5464,13 +5334,7 @@ def _load_pretrained_model( # Find missing and unexpected keys from the state dict missing_keys, unexpected_keys = _find_missing_and_unexpected_keys( - cls, - model, - original_checkpoint_keys, - checkpoint_keys, - loading_base_model_from_task_state_dict, - hf_quantizer, - device_map, + model, original_checkpoint_keys, checkpoint_keys, loading_base_model_from_task_state_dict, hf_quantizer ) # Find all the keys with shape mismatch (if we ignore the mismatch, the weights need to be newly initialized the # same way as missing keys) @@ -5484,16 +5348,18 @@ def _load_pretrained_model( weights_only, ) - # We need to update both the mapping and the list of checkpoint keys to remove the mismatched ones - key_renaming_mapping = {k: v for k, v in key_renaming_mapping.items() if v not in mismatched_keys} + # We need to update both the mapping and the list of checkpoint keys to remove the mismatched and unexpected ones + key_renaming_mapping = { + k: v for k, v in key_renaming_mapping.items() if v not in mismatched_keys and v not in unexpected_keys + } checkpoint_keys = list(key_renaming_mapping.values()) # Move missing (and potentially mismatched) keys back to cpu from meta device (because they won't be moved when # loading the weights as they are not in the loaded state dict) - model._move_missing_keys_from_meta_to_cpu(missing_keys + mismatched_keys, unexpected_keys, dtype, hf_quantizer) + model._move_missing_keys_from_meta_to_cpu(missing_keys + mismatched_keys, dtype, hf_quantizer) # correctly initialize the missing (and potentially mismatched) keys - model._initialize_missing_keys(checkpoint_keys, ignore_mismatched_sizes, is_quantized) + model._initialize_missing_keys(missing_keys + mismatched_keys, is_quantized) # Set some modules to fp32 if needed if keep_in_fp32_regex is not None: @@ -5502,29 +5368,6 @@ def _load_pretrained_model( # param = param.to(torch.float32) does not work here as only in the local scope. 
param.data = param.data.to(torch.float32) - # Make sure we are able to load base models as well as derived models (specific task models, with heads) - model_to_load = model - # In this case, we load a ForTaskModel with keys from a BaseModel -> only load keys to the BaseModel - if loading_task_model_from_base_state_dict: - model_to_load = getattr(model, prefix) - # Here we need to remove the prefix we added to correctly find missing/unexpected keys, as we will load - # in the submodule - key_renaming_mapping = {k: v[len(_prefix) :] for k, v in key_renaming_mapping.items()} - checkpoint_keys = list(key_renaming_mapping.values()) - # We need to update the device map as well - if device_map is not None: - device_map = {k[len(_prefix) :] if k.startswith(_prefix) else k: v for k, v in device_map.items()} - # small sanity check: the base model should not contain task-specific head keys - task_specific_expected_keys = [s for s in model.state_dict() if not s.startswith(_prefix)] - base_model_expected_keys = list(model_to_load.state_dict().keys()) - if any( - key in task_specific_expected_keys and key not in base_model_expected_keys for key in checkpoint_keys - ): - raise ValueError( - "The state dictionary of the model you are trying to load is corrupted. Are you sure it was " - "properly saved?" - ) - # Get reverse key mapping reverse_key_renaming_mapping = {v: k for k, v in key_renaming_mapping.items()} @@ -5534,8 +5377,6 @@ def _load_pretrained_model( disk_only_shard_files = [] # Prepare parameters offloading if needed if device_map is not None and "disk" in device_map.values(): - if offload_state_dict is None: - offload_state_dict = True if disk_offload_folder is not None: os.makedirs(disk_offload_folder, exist_ok=True) is_offloaded_safetensors = checkpoint_files is not None and checkpoint_files[0].endswith(".safetensors") @@ -5573,31 +5414,22 @@ def _load_pretrained_model( else: disk_offload_index = {} - # This offload index if for params that are supposed to be on the "cpu", either with or without a device_map - # It allows to load parameters one-by-one from the state dict, avoiding a memory peak of 2 x state_dict_size, - # i.e. 
1x to load it, and 1x to copy it to model - cpu_offload_folder = None - cpu_offload_index = None - if offload_state_dict: - cpu_offload_folder = tempfile.mkdtemp() - cpu_offload_index = {} - # To be able to iterate, even if we don't use it if the state_dict is already provided elif state_dict is not None: checkpoint_files = [""] # Compute expected model keys - expected_keys = list(model_to_load.state_dict().keys()) + expected_keys = list(model.state_dict().keys()) if hf_quantizer is not None: - expected_keys = hf_quantizer.update_expected_keys(model_to_load, expected_keys, checkpoint_keys) + expected_keys = hf_quantizer.update_expected_keys(model, expected_keys, checkpoint_keys) if logger.level >= logging.WARNING: - verify_tp_plan(expected_keys, getattr(model_to_load, "_tp_plan", None)) + verify_tp_plan(expected_keys, getattr(model, "_tp_plan", None)) # Warmup cuda to load the weights much faster on devices if device_map is not None and not is_hqq_or_quark: expanded_device_map = expand_device_map(device_map, expected_keys) - caching_allocator_warmup(model_to_load, expanded_device_map, hf_quantizer) + caching_allocator_warmup(model, expanded_device_map, hf_quantizer) # Prepare and compatabilize arguments for serial and parallel shard loading args_list = [ @@ -5605,22 +5437,16 @@ def _load_pretrained_model( shard_file, state_dict, disk_only_shard_files, - is_hqq_or_bnb, is_quantized, device_map, hf_quantizer, key_renaming_mapping, weights_only, - model_to_load, - expected_keys, + model, reverse_key_renaming_mapping, disk_offload_folder, disk_offload_index, - cpu_offload_folder, - cpu_offload_index, - is_offloaded_safetensors, keep_in_fp32_regex, - unexpected_keys, device_mesh, ) for shard_file in checkpoint_files @@ -5632,40 +5458,20 @@ def _load_pretrained_model( os.environ.get("HF_ENABLE_PARALLEL_LOADING", "").upper() in ENV_VARS_TRUE_VALUES and not is_deepspeed_zero3_enabled() ): - _error_msgs, disk_offload_index, cpu_offload_index = load_shard_files_with_threadpool(args_list) + _error_msgs, disk_offload_index = load_shard_files_with_threadpool(args_list) error_msgs += _error_msgs else: if len(args_list) > 1: args_list = logging.tqdm(args_list, desc="Loading checkpoint shards") for args in args_list: - _error_msgs, disk_offload_index, cpu_offload_index = load_shard_file(args) + _error_msgs, disk_offload_index = load_shard_file(args) error_msgs += _error_msgs - # Adjust offloaded weights name and save if needed - if disk_offload_index is not None and len(disk_offload_index) > 0: - if loading_task_model_from_base_state_dict: - # We need to add the prefix of the base model - prefix = cls.base_model_prefix - if not is_offloaded_safetensors: - for weight_name in disk_offload_index: - shutil.move( - os.path.join(disk_offload_folder, f"{weight_name}.dat"), - os.path.join(disk_offload_folder, f"{prefix}.{weight_name}.dat"), - ) - disk_offload_index = {f"{prefix}.{key}": value for key, value in disk_offload_index.items()} - if not is_offloaded_safetensors: - save_offload_index(disk_offload_index, disk_offload_folder) - disk_offload_index = None - - # one-at-a-time param loading for the cpu offloaded params - if offload_state_dict: - # Load back temporarily offloaded state dict - load_offloaded_weights(model_to_load, cpu_offload_index, cpu_offload_folder) - shutil.rmtree(cpu_offload_folder) - - if hf_quantizer is not None: - missing_keys = hf_quantizer.update_missing_keys_after_loading(model_to_load, missing_keys, prefix) + # Save offloaded index if needed + if disk_offload_index is not None and 
len(disk_offload_index) > 0 and not is_offloaded_safetensors: + save_offload_index(disk_offload_index, disk_offload_folder) + disk_offload_index = None # Post-processing for tensor parallelism if device_mesh is not None: @@ -5700,6 +5506,11 @@ def _load_pretrained_model( device_mesh, ) + # Remove potential model-specific exceptions from the warnings + missing_keys, unexpected_keys = model._adjust_missing_and_unexpected_keys( + missing_keys, unexpected_keys, loading_task_model_from_base_state_dict + ) + # All potential warnings/infos if len(error_msgs) > 0: error_msg = "\n\t".join(error_msgs) @@ -5720,21 +5531,12 @@ def _load_pretrained_model( f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly identical" " (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)." ) - else: - logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n") if len(missing_keys) > 0: logger.warning( f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably" " TRAIN this model on a down-stream task to be able to use it for predictions and inference." ) - elif len(mismatched_keys) == 0: - logger.info( - f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at" - f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the checkpoint" - f" was trained on, you can already use {model.__class__.__name__} for predictions without further" - " training." - ) if len(mismatched_keys) > 0: mismatched_warning = "\n".join( [ @@ -5803,7 +5605,7 @@ def retrieve_modules_from_names(self, names, add_prefix=False, remove_prefix=Fal for name, module in self.named_modules(): if remove_prefix: _prefix = f"{self.base_model_prefix}." - name = name[len(_prefix) :] if name.startswith(_prefix) else name + name = name.removeprefix(_prefix) elif add_prefix: name = ".".join([self.base_model_prefix, name]) if len(name) > 0 else self.base_model_prefix @@ -6022,12 +5824,8 @@ def is_backend_compatible(cls): return cls._supports_attention_backend def _move_missing_keys_from_meta_to_cpu( - self, - missing_keys: list[str], - unexpected_keys: list[str], - dtype: Optional[torch.dtype], - hf_quantizer: Optional[HfQuantizer], - ) -> "PreTrainedModel": + self, missing_keys: list[str], dtype: torch.dtype, hf_quantizer: Optional[HfQuantizer] + ) -> None: """Move the missing keys (keys that are part of the model parameters, but were NOT found in the loaded state dicts) back from meta device to cpu. 
""" @@ -6047,56 +5845,90 @@ def _move_missing_keys_from_meta_to_cpu( # Buffers are not initialized on the meta device, so we still need this check to avoid overwriting them if param.device == torch.device("meta"): value = torch.empty_like(param, dtype=dtype, device="cpu") - if ( - not is_quantized - or (getattr(hf_quantizer, "requires_parameters_quantization", False)) - or not hf_quantizer.check_quantized_param(self, param_value=value, param_name=key, state_dict={}) - ): + if not is_quantized or not hf_quantizer.param_needs_quantization(self, key): _load_parameter_into_model(self, key, value) else: - hf_quantizer.create_quantized_param(self, value, key, "cpu", model_state_dict, unexpected_keys) + hf_quantizer.create_quantized_param(self, value, key, "cpu") - def _initialize_missing_keys( - self, - loaded_keys: list[str], - ignore_mismatched_sizes: bool, - is_quantized: bool, - ) -> "PreTrainedModel": + def _initialize_missing_keys(self, missing_keys: list[str], is_quantized: bool) -> None: """Initialize the missing keys (keys that are part of the model parameters, but were NOT found in the loaded state dicts), according to `_initialize_weights`. Indeed, since the corresponding weights are missing from the state dict, they will not be replaced and need to be initialized correctly (i.e. weight initialization distribution). Also take care of setting the `_is_hf_initialized` flag for keys that are not missing. """ - if not ignore_mismatched_sizes: - not_initialized_submodules = set_initialized_submodules(self, loaded_keys) - # If we're about to tie the output embeds to the input embeds we don't need to init them + for key in self.state_dict(): + # If it's part of the keys that will be loaded, mark it as already initialized + if key not in missing_keys: + param_or_buffer = self.get_parameter_or_buffer(key) + param_or_buffer._is_hf_initialized = True + + def set_is_initialized_for_modules(module): + # A module is already initialized if and only if all its children are also already initialized, and all + # its immediate `nn.Parameter` and persistent buffers are also already initialized if ( - hasattr(self.config.get_text_config(decoder=True), "tie_word_embeddings") - and self.config.get_text_config(decoder=True).tie_word_embeddings + all(getattr(child, "_is_hf_initialized", False) for child in module.children()) + and all(getattr(param, "_is_hf_initialized", False) for param in module.parameters(recurse=False)) + and all( + getattr(buffer, "_is_hf_initialized", False) + for buffer in module.buffers(recurse=False) + if buffer not in module._non_persistent_buffers_set + ) ): - output_embeddings = self.get_output_embeddings() - if output_embeddings is not None: - # Still need to initialize if there is a bias term since biases are not tied. - if not hasattr(output_embeddings, "bias") or output_embeddings.bias is None: - output_embeddings._is_hf_initialized = True - else: - not_initialized_submodules = dict(self.named_modules()) + module._is_hf_initialized = True + + # Set the flag on the modules as well. We do it recursively (depth-first), as it's more efficient (we do not + # need to check the entire state dict of each module, only the immediate children, so we only iterate once over + # each param) + self.apply(set_is_initialized_for_modules) + # This will only initialize submodules that are not marked as initialized by the line above. 
if is_deepspeed_zero3_enabled() and not is_quantized: import deepspeed not_initialized_parameters = list( - set( - itertools.chain.from_iterable( - submodule.parameters(recurse=False) for submodule in not_initialized_submodules.values() - ) - ) + {v for v in self.state_dict().values() if not getattr(v, "_is_hf_initialized", False)} ) with deepspeed.zero.GatheredParameters(not_initialized_parameters, modifier_rank=0): self.initialize_weights() else: self.initialize_weights() + def _adjust_missing_and_unexpected_keys( + self, missing_keys: list[str], unexpected_keys: list[str], loading_task_model_from_base_state_dict: bool + ) -> tuple[list[str], list[str]]: + """Adjust the `missing_keys` and `unexpected_keys` based on current model's exception rules, to avoid + raising unneeded warnings/errors. + """ + # Old checkpoints may have keys for rotary_emb.inv_freq for each layer, however we moved this buffer to the main model + # (so the buffer name has changed). Remove them in such a case. This is another exception that was not added to + # `_keys_to_ignore_on_load_unexpected` as it touches many models -> we add it manually to the existing patterns + has_inv_freq_buffers = any(buffer.endswith("rotary_emb.inv_freq") for buffer, _ in self.named_buffers()) + additional_unexpected_patterns = [r"rotary_emb\.inv_freq"] if has_inv_freq_buffers else [] + + missing_patterns = self._keys_to_ignore_on_load_missing or [] + unexpected_patterns = (self._keys_to_ignore_on_load_unexpected or []) + additional_unexpected_patterns + ignore_missing_regex, ignore_unexpected_regex = None, None + if len(missing_patterns) > 0: + ignore_missing_regex = re.compile("|".join(rf"({pattern})" for pattern in missing_patterns)) + if len(unexpected_patterns) > 0: + ignore_unexpected_regex = re.compile("|".join(rf"({pattern})" for pattern in unexpected_patterns)) + + # Clean-up missing keys + if ignore_missing_regex is not None: + missing_keys = [key for key in missing_keys if ignore_missing_regex.search(key) is None] + + # Clean-up unexpected keys + if ignore_unexpected_regex is not None: + unexpected_keys = [key for key in unexpected_keys if ignore_unexpected_regex.search(key) is None] + + # Note: only the unexpected keys should remove the added prefix here, to correctly display the original name + # in the warnings. For missing keys, we should show the prefix in the warning as it's part of the final model + if loading_task_model_from_base_state_dict: + _prefix = f"{self.base_model_prefix}." + unexpected_keys = [k.removeprefix(_prefix) for k in unexpected_keys] + + return missing_keys, unexpected_keys + def get_parameter_or_buffer(self, target: str): """ Return the parameter or buffer given by `target` if it exists, otherwise throw an error. 
This combines @@ -6234,7 +6066,7 @@ def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: dict, # For example in the case of MXFP4 quantization, we need to update the param name to the original param name # because the checkpoint contains blocks, and scales, but since we are dequantizing, we need to use the original param name if hf_quantizer is not None: - param_name = hf_quantizer.update_param_name(param_name) + param_name = hf_quantizer.get_param_name(param_name) try: param = model.get_parameter_or_buffer(param_name) diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 5c391e7162f4..c721f24a506d 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -48,6 +48,7 @@ from .blip import * from .blip_2 import * from .bloom import * + from .blt import * from .bridgetower import * from .bros import * from .byt5 import * @@ -107,6 +108,8 @@ from .dots1 import * from .dpr import * from .dpt import * + from .edgetam import * + from .edgetam_video import * from .efficientloftr import * from .efficientnet import * from .electra import * @@ -183,6 +186,7 @@ from .led import * from .levit import * from .lfm2 import * + from .lfm2_vl import * from .lightglue import * from .lilt import * from .llama import * @@ -251,6 +255,7 @@ from .owlv2 import * from .owlvit import * from .paligemma import * + from .parakeet import * from .patchtsmixer import * from .patchtst import * from .pegasus import * @@ -281,6 +286,7 @@ from .qwen3 import * from .qwen3_moe import * from .qwen3_next import * + from .qwen3_omni_moe import * from .qwen3_vl import * from .qwen3_vl_moe import * from .rag import * diff --git a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py deleted file mode 100644 index 824d6b5138f7..000000000000 --- a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py +++ /dev/null @@ -1,269 +0,0 @@ -# coding=utf-8 -# Copyright 2025 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import gc -import os -import re -from typing import Optional - -import torch -from huggingface_hub import snapshot_download -from safetensors import safe_open - -from transformers import ( - Aimv2Config, - Aimv2Model, - Aimv2VisionConfig, - Aimv2VisionModel, - AutoImageProcessor, - AutoProcessor, -) - - -ORIGINAL_TO_CONVERTED_KEY_MAPPING_VISION_MODEL = { - # Embeddings - r"preprocessor.patchifier.proj": r"embeddings.patch_embed", - r"preprocessor.pos_embed": r"embeddings.position_embedding.weight", - r"preprocessor.patchifier.norm.weight": r"embeddings.rms_norm.weight", - # Encoder Layers - r"trunk.blocks.(\d+).attn.qkv": r"encoder.layers.\1.attention.qkv", - r"trunk.blocks.(\d+).attn.proj": r"encoder.layers.\1.attention.out_proj", - r"trunk.blocks.(\d+).mlp.fc1": r"encoder.layers.\1.ffn.gate_proj", - r"trunk.blocks.(\d+).mlp.fc2": r"encoder.layers.\1.ffn.down_proj", - r"trunk.blocks.(\d+).mlp.fc3": r"encoder.layers.\1.ffn.up_proj", - # Normalization Layers - r"trunk.blocks.(\d+).norm_1": r"encoder.layers.\1.rms_norm1", - r"trunk.blocks.(\d+).norm_2": r"encoder.layers.\1.rms_norm2", - # Final Norm - r"trunk.post_trunk_norm": r"rms_norm", -} - -ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - # Vision Embeddings - r"image_encoder.preprocessor.patchifier.proj": r"vision_model.embeddings.patch_embed", - r"image_encoder.preprocessor.pos_embed": r"vision_model.embeddings.position_embedding.weight", - r"image_encoder.preprocessor.patchifier.norm.weight": r"vision_model.embeddings.rms_norm.weight", - # Vision Encoder Layers - r"image_encoder.trunk.blocks.(\d+).attn.qkv": r"vision_model.encoder.layers.\1.attention.qkv", - r"image_encoder.trunk.blocks.(\d+).attn.proj": r"vision_model.encoder.layers.\1.attention.out_proj", - r"image_encoder.trunk.blocks.(\d+).mlp.fc1": r"vision_model.encoder.layers.\1.ffn.gate_proj", - r"image_encoder.trunk.blocks.(\d+).mlp.fc2": r"vision_model.encoder.layers.\1.ffn.down_proj", - r"image_encoder.trunk.blocks.(\d+).mlp.fc3": r"vision_model.encoder.layers.\1.ffn.up_proj", - # Normalization Layers - r"image_encoder.trunk.blocks.(\d+).norm_1": r"vision_model.encoder.layers.\1.rms_norm1", - r"image_encoder.trunk.blocks.(\d+).norm_2": r"vision_model.encoder.layers.\1.rms_norm2", - r"image_encoder.trunk.post_trunk_norm": r"vision_model.rms_norm", - r"image_projector": r"visual_projection", - # Vision Head - r"image_encoder.head.cls_token": r"vision_model.head.cls_token", - r"image_encoder.head.k": r"vision_model.head.k_proj", - r"image_encoder.head.v": r"vision_model.head.v_proj", - r"image_encoder.head.linear": r"vision_model.head.output_proj", - # Text Embeddings - r"text_encoder.preprocessor.text_embedding.weight": r"text_model.embeddings.token_embedding.weight", - r"text_encoder.preprocessor.positional_embedding": r"text_model.embeddings.position_embedding.weight", - # Text Encoder Layers - r"text_encoder.trunk.blocks.(\d+).attn.qkv": r"text_model.encoder.layers.\1.attention.qkv", - r"text_encoder.trunk.blocks.(\d+).attn.proj": r"text_model.encoder.layers.\1.attention.out_proj", - r"text_encoder.trunk.blocks.(\d+).mlp.fc1": r"text_model.encoder.layers.\1.ffn.gate_proj", - r"text_encoder.trunk.blocks.(\d+).mlp.fc2": r"text_model.encoder.layers.\1.ffn.down_proj", - r"text_encoder.trunk.blocks.(\d+).mlp.fc3": r"text_model.encoder.layers.\1.ffn.up_proj", - # Text Normalization Layers - r"text_encoder.trunk.blocks.(\d+).norm_1": r"text_model.encoder.layers.\1.rms_norm1", - r"text_encoder.trunk.blocks.(\d+).norm_2": r"text_model.encoder.layers.\1.rms_norm2", - 
r"text_encoder.trunk.post_trunk_norm": r"text_model.rms_norm", - r"text_projector": r"text_projection", - r"log_logit_scale": r"logit_scale", -} - - -def load_original_state_dict(model_id: str, revision: Optional[str] = None) -> dict[str, torch.Tensor]: - # Download only the model.safetensors file - directory_path = snapshot_download( - repo_id=model_id, - revision=revision, - allow_patterns=["model.safetensors"], - ) - - original_state_dict = {} - safetensor_path = f"{directory_path}/model.safetensors" - - with safe_open(safetensor_path, framework="pt", device="cpu") as f: - for key in f.keys(): - original_state_dict[key] = f.get_tensor(key) - - return original_state_dict - - -def convert_old_keys_to_new_keys(state_dict_keys: dict, ORIGINAL_TO_CONVERTED_KEY_MAPPING: dict): - """Converts state dict keys from the old format to the new format.""" - - output_dict = {} - if state_dict_keys is not None: - old_text = "\n".join(state_dict_keys) - new_text = old_text - for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): - if replacement is None: - new_text = re.sub(pattern, "", new_text) # an empty line - continue - new_text = re.sub(pattern, replacement, new_text) - output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) - return output_dict - - -def split_qkv_tensor(key, tensor): - """Splits a qkv tensor into separate q, k, v tensors and updates the key accordingly.""" - - new_keys = ["q_proj", "k_proj", "v_proj"] - split_size = tensor.shape[0] // 3 - split_tensors = torch.split(tensor, split_size, dim=0) - - return {key.replace("qkv", new_key): split_tensors[i] for i, new_key in enumerate(new_keys)} - - -def get_model_config_mapping(model_id: str): - """Determines the correct model, config, and key mappings based on the checkpoint name.""" - - if model_id == "apple/aimv2-large-patch14-224-lit": - return Aimv2Model, Aimv2Config, ORIGINAL_TO_CONVERTED_KEY_MAPPING - else: - return Aimv2VisionModel, Aimv2VisionConfig, ORIGINAL_TO_CONVERTED_KEY_MAPPING_VISION_MODEL - - -def write_model( - hf_repo_id: str, - output_dir: str, - safe_serialization: bool = True, -): - """ - Converts a model checkpoint to Hugging Face format and saves it. - - Args: - hf_repo_id (str): The Hugging Face repo ID to load from. - output_dir (str): The directory to save the converted model. - safe_serialization (bool): Whether to use safe serialization. - - Returns: - model: The reloaded Hugging Face model. - """ - os.makedirs(output_dir, exist_ok=True) - - # Get the appropriate model, config, and key mapping - model_class, config_class, key_mapping = get_model_config_mapping(hf_repo_id) - - # Load config and original state dict - config = config_class.from_pretrained(hf_repo_id) - - # Checkpoint `apple/aimv2-large-patch14-224-lit` uses AttentionPoolingHead hence set the required attr in config. 
- if hf_repo_id != "apple/aimv2-large-patch14-224-lit": - config.use_head = False - - if hf_repo_id == "apple/aimv2-large-patch14-native": - config.is_native = True - - original_state_dict = load_original_state_dict(hf_repo_id) - - print("Converting model...") - - state_dict = {} - result = convert_old_keys_to_new_keys(original_state_dict, key_mapping) - all_keys = list(original_state_dict.keys()) - - for key in all_keys: - value = original_state_dict[key] - new_key = result.pop(key) - - if "qkv" in new_key: - qkv_state_dict = split_qkv_tensor(new_key, value) - state_dict.update(qkv_state_dict) - else: - state_dict[new_key] = value - - # Check if position embeddings exist before squeezing - if new_key.endswith("position_embedding.weight"): - state_dict[new_key] = value.squeeze(0) - - print(f"Loading the checkpoint in a {model_class.__name__}.") - model = model_class(config) - model.load_state_dict(state_dict, strict=True, assign=True) - print("Checkpoint loaded successfully.") - - print("Saving the model.") - model.save_pretrained(output_dir, safe_serialization=safe_serialization) - del state_dict, model - gc.collect() - - print("Reloading the model to check if it's saved correctly.") - model = model_class.from_pretrained(output_dir, device_map="auto") - print("Model reloaded successfully.") - return model - - -def write_image_processor(hf_repo_id: str, output_dir: str): - if hf_repo_id == "apple/aimv2-large-patch14-224-lit": - image_processor = AutoProcessor.from_pretrained(hf_repo_id, use_fast=True) - else: - image_processor = AutoImageProcessor.from_pretrained(hf_repo_id, use_fast=True) - image_processor.save_pretrained(output_dir) - return image_processor - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--hf_repo_id", - default="apple/aimv2-large-patch14-224", - help="Location of official weights from apple on HF", - ) - parser.add_argument( - "--output_dir", - default="aimv2_model", - help="Location to write the converted model and processor", - ) - parser.add_argument( - "--safe_serialization", default=True, type=bool, help="Whether or not to save using `safetensors`." - ) - parser.add_argument( - "--push_to_hub", - action=argparse.BooleanOptionalAction, - help="Whether or not to push the converted model to the huggingface hub.", - ) - parser.add_argument( - "--hub_repo_id", - default=None, - help="Huggingface hub repo to write the converted model and processor", - ) - args = parser.parse_args() - - model = write_model( - hf_repo_id=args.hf_repo_id, - output_dir=args.output_dir, - safe_serialization=args.safe_serialization, - ) - - image_processor = write_image_processor( - hf_repo_id=args.hf_repo_id, - output_dir=args.output_dir, - ) - - if args.push_to_hub: - print("Pushing to hub...") - model.push_to_hub(args.hub_repo_id) - image_processor.push_to_hub(args.hub_repo_id) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py deleted file mode 100644 index df2a22610187..000000000000 --- a/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,62 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert ALBERT checkpoint.""" - -import argparse - -import torch - -from ...utils import logging -from . import AlbertConfig, AlbertForPreTraining, load_tf_weights_in_albert - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path): - # Initialise PyTorch model - config = AlbertConfig.from_json_file(albert_config_file) - print(f"Building PyTorch model from configuration: {config}") - model = AlbertForPreTraining(config) - - # Load weights from tf checkpoint - load_tf_weights_in_albert(model, config, tf_checkpoint_path) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - torch.save(model.state_dict(), pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--albert_config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained ALBERT model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/align/convert_align_tf_to_hf.py b/src/transformers/models/align/convert_align_tf_to_hf.py deleted file mode 100644 index 74309a0d7076..000000000000 --- a/src/transformers/models/align/convert_align_tf_to_hf.py +++ /dev/null @@ -1,389 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
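Because this diff removes the standalone conversion scripts outright, anyone who still needs one can recover it from the repository history. A minimal sketch, assuming a local clone of transformers and a hypothetical tag that still contains the file:

import subprocess

# Path of one of the scripts deleted in this diff; the revision below is only an example
# of a tag or commit that predates the deletion.
path = "src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py"
revision = "v4.55.0"

source = subprocess.run(
    ["git", "show", f"{revision}:{path}"],
    check=True,
    capture_output=True,
    text=True,
).stdout

with open("convert_albert_original_tf_checkpoint_to_pytorch.py", "w", encoding="utf-8") as f:
    f.write(source)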
-"""Convert ALIGN checkpoints from the original repository.""" - -import argparse -import os - -import align -import numpy as np -import requests -import tensorflow as tf -import torch -from PIL import Image -from tokenizer import Tokenizer - -from transformers import ( - AlignConfig, - AlignModel, - AlignProcessor, - BertConfig, - BertTokenizer, - EfficientNetConfig, - EfficientNetImageProcessor, -) -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def preprocess(image): - image = tf.image.resize(image, (346, 346)) - image = tf.image.crop_to_bounding_box(image, (346 - 289) // 2, (346 - 289) // 2, 289, 289) - return image - - -def get_align_config(): - vision_config = EfficientNetConfig.from_pretrained("google/efficientnet-b7") - vision_config.image_size = 289 - vision_config.hidden_dim = 640 - vision_config.id2label = {"0": "LABEL_0", "1": "LABEL_1"} - vision_config.label2id = {"LABEL_0": 0, "LABEL_1": 1} - vision_config.depthwise_padding = [] - - text_config = BertConfig() - config = AlignConfig.from_text_vision_configs( - text_config=text_config, vision_config=vision_config, projection_dim=640 - ) - return config - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -def get_processor(): - image_processor = EfficientNetImageProcessor( - do_center_crop=True, - rescale_factor=1 / 127.5, - rescale_offset=True, - do_normalize=False, - include_top=False, - resample=Image.BILINEAR, - ) - tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") - tokenizer.model_max_length = 64 - processor = AlignProcessor(image_processor=image_processor, tokenizer=tokenizer) - return processor - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def rename_keys(original_param_names): - # EfficientNet image encoder - block_names = [v.split("_")[0].split("block")[1] for v in original_param_names if v.startswith("block")] - block_names = list(set(block_names)) - block_names = sorted(block_names) - num_blocks = len(block_names) - block_name_mapping = {b: str(i) for b, i in zip(block_names, range(num_blocks))} - - rename_keys = [] - rename_keys.append(("stem_conv/kernel:0", "embeddings.convolution.weight")) - rename_keys.append(("stem_bn/gamma:0", "embeddings.batchnorm.weight")) - rename_keys.append(("stem_bn/beta:0", "embeddings.batchnorm.bias")) - rename_keys.append(("stem_bn/moving_mean:0", "embeddings.batchnorm.running_mean")) - rename_keys.append(("stem_bn/moving_variance:0", "embeddings.batchnorm.running_var")) - - for b in block_names: - hf_b = block_name_mapping[b] - rename_keys.append((f"block{b}_expand_conv/kernel:0", f"encoder.blocks.{hf_b}.expansion.expand_conv.weight")) - rename_keys.append((f"block{b}_expand_bn/gamma:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.weight")) - rename_keys.append((f"block{b}_expand_bn/beta:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.bias")) - rename_keys.append( - (f"block{b}_expand_bn/moving_mean:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.running_mean") - ) - rename_keys.append( - (f"block{b}_expand_bn/moving_variance:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.running_var") - ) - rename_keys.append( - (f"block{b}_dwconv/depthwise_kernel:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_conv.weight") - ) - rename_keys.append((f"block{b}_bn/gamma:0", 
f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.weight")) - rename_keys.append((f"block{b}_bn/beta:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.bias")) - rename_keys.append( - (f"block{b}_bn/moving_mean:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.running_mean") - ) - rename_keys.append( - (f"block{b}_bn/moving_variance:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.running_var") - ) - - rename_keys.append((f"block{b}_se_reduce/kernel:0", f"encoder.blocks.{hf_b}.squeeze_excite.reduce.weight")) - rename_keys.append((f"block{b}_se_reduce/bias:0", f"encoder.blocks.{hf_b}.squeeze_excite.reduce.bias")) - rename_keys.append((f"block{b}_se_expand/kernel:0", f"encoder.blocks.{hf_b}.squeeze_excite.expand.weight")) - rename_keys.append((f"block{b}_se_expand/bias:0", f"encoder.blocks.{hf_b}.squeeze_excite.expand.bias")) - rename_keys.append( - (f"block{b}_project_conv/kernel:0", f"encoder.blocks.{hf_b}.projection.project_conv.weight") - ) - rename_keys.append((f"block{b}_project_bn/gamma:0", f"encoder.blocks.{hf_b}.projection.project_bn.weight")) - rename_keys.append((f"block{b}_project_bn/beta:0", f"encoder.blocks.{hf_b}.projection.project_bn.bias")) - rename_keys.append( - (f"block{b}_project_bn/moving_mean:0", f"encoder.blocks.{hf_b}.projection.project_bn.running_mean") - ) - rename_keys.append( - (f"block{b}_project_bn/moving_variance:0", f"encoder.blocks.{hf_b}.projection.project_bn.running_var") - ) - - key_mapping = {} - for item in rename_keys: - if item[0] in original_param_names: - key_mapping[item[0]] = "vision_model." + item[1] - - # BERT text encoder - rename_keys = [] - old = "tf_bert_model/bert" - new = "text_model" - for i in range(12): - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/self/query/kernel:0", - f"{new}.encoder.layer.{i}.attention.self.query.weight", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/self/query/bias:0", - f"{new}.encoder.layer.{i}.attention.self.query.bias", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/self/key/kernel:0", - f"{new}.encoder.layer.{i}.attention.self.key.weight", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/self/key/bias:0", - f"{new}.encoder.layer.{i}.attention.self.key.bias", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/self/value/kernel:0", - f"{new}.encoder.layer.{i}.attention.self.value.weight", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/self/value/bias:0", - f"{new}.encoder.layer.{i}.attention.self.value.bias", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/output/dense/kernel:0", - f"{new}.encoder.layer.{i}.attention.output.dense.weight", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/output/dense/bias:0", - f"{new}.encoder.layer.{i}.attention.output.dense.bias", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/output/LayerNorm/gamma:0", - f"{new}.encoder.layer.{i}.attention.output.LayerNorm.weight", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/output/LayerNorm/beta:0", - f"{new}.encoder.layer.{i}.attention.output.LayerNorm.bias", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/intermediate/dense/kernel:0", - f"{new}.encoder.layer.{i}.intermediate.dense.weight", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/intermediate/dense/bias:0", - f"{new}.encoder.layer.{i}.intermediate.dense.bias", - ) - ) - 
rename_keys.append( - (f"{old}/encoder/layer_._{i}/output/dense/kernel:0", f"{new}.encoder.layer.{i}.output.dense.weight") - ) - rename_keys.append( - (f"{old}/encoder/layer_._{i}/output/dense/bias:0", f"{new}.encoder.layer.{i}.output.dense.bias") - ) - rename_keys.append( - (f"{old}/encoder/layer_._{i}/output/LayerNorm/gamma:0", f"{new}.encoder.layer.{i}.output.LayerNorm.weight") - ) - rename_keys.append( - (f"{old}/encoder/layer_._{i}/output/LayerNorm/beta:0", f"{new}.encoder.layer.{i}.output.LayerNorm.bias") - ) - - rename_keys.append((f"{old}/embeddings/word_embeddings/weight:0", f"{new}.embeddings.word_embeddings.weight")) - rename_keys.append( - (f"{old}/embeddings/position_embeddings/embeddings:0", f"{new}.embeddings.position_embeddings.weight") - ) - rename_keys.append( - (f"{old}/embeddings/token_type_embeddings/embeddings:0", f"{new}.embeddings.token_type_embeddings.weight") - ) - rename_keys.append((f"{old}/embeddings/LayerNorm/gamma:0", f"{new}.embeddings.LayerNorm.weight")) - rename_keys.append((f"{old}/embeddings/LayerNorm/beta:0", f"{new}.embeddings.LayerNorm.bias")) - - rename_keys.append((f"{old}/pooler/dense/kernel:0", f"{new}.pooler.dense.weight")) - rename_keys.append((f"{old}/pooler/dense/bias:0", f"{new}.pooler.dense.bias")) - rename_keys.append(("dense/kernel:0", "text_projection.weight")) - rename_keys.append(("dense/bias:0", "text_projection.bias")) - rename_keys.append(("dense/bias:0", "text_projection.bias")) - rename_keys.append(("temperature:0", "temperature")) - - for item in rename_keys: - if item[0] in original_param_names: - key_mapping[item[0]] = item[1] - return key_mapping - - -def replace_params(hf_params, tf_params, key_mapping): - list(hf_params.keys()) - - for key, value in tf_params.items(): - if key not in key_mapping: - continue - - hf_key = key_mapping[key] - if "_conv" in key and "kernel" in key: - new_hf_value = torch.from_numpy(value).permute(3, 2, 0, 1) - elif "embeddings" in key: - new_hf_value = torch.from_numpy(value) - elif "depthwise_kernel" in key: - new_hf_value = torch.from_numpy(value).permute(2, 3, 0, 1) - elif "kernel" in key: - new_hf_value = torch.from_numpy(np.transpose(value)) - elif "temperature" in key: - new_hf_value = value - elif "bn/gamma" in key or "bn/beta" in key: - new_hf_value = torch.from_numpy(np.transpose(value)).squeeze() - else: - new_hf_value = torch.from_numpy(value) - - # Replace HF parameters with original TF model parameters - hf_params[hf_key].copy_(new_hf_value) - - -@torch.no_grad() -def convert_align_checkpoint(checkpoint_path, pytorch_dump_folder_path, save_model, push_to_hub): - """ - Copy/paste/tweak model's weights to our ALIGN structure. 
- """ - # Load original model - seq_length = 64 - tok = Tokenizer(seq_length) - original_model = align.Align("efficientnet-b7", "bert-base", 640, seq_length, tok.get_vocab_size()) - original_model.compile() - original_model.load_weights(checkpoint_path) - - tf_params = original_model.trainable_variables - tf_non_train_params = original_model.non_trainable_variables - tf_params = {param.name: param.numpy() for param in tf_params} - for param in tf_non_train_params: - tf_params[param.name] = param.numpy() - tf_param_names = list(tf_params.keys()) - - # Load HuggingFace model - config = get_align_config() - hf_model = AlignModel(config).eval() - hf_params = hf_model.state_dict() - - # Create src-to-dst parameter name mapping dictionary - print("Converting parameters...") - key_mapping = rename_keys(tf_param_names) - replace_params(hf_params, tf_params, key_mapping) - - # Initialize processor - processor = get_processor() - inputs = processor( - images=prepare_img(), text="A picture of a cat", padding="max_length", max_length=64, return_tensors="pt" - ) - - # HF model inference - hf_model.eval() - with torch.no_grad(): - outputs = hf_model(**inputs) - - hf_image_features = outputs.image_embeds.detach().numpy() - hf_text_features = outputs.text_embeds.detach().numpy() - - # Original model inference - original_model.trainable = False - tf_image_processor = EfficientNetImageProcessor( - do_center_crop=True, - do_rescale=False, - do_normalize=False, - include_top=False, - resample=Image.BILINEAR, - ) - image = tf_image_processor(images=prepare_img(), return_tensors="tf", data_format="channels_last")["pixel_values"] - text = tok(tf.constant(["A picture of a cat"])) - - image_features = original_model.image_encoder(image, training=False) - text_features = original_model.text_encoder(text, training=False) - - image_features = tf.nn.l2_normalize(image_features, axis=-1) - text_features = tf.nn.l2_normalize(text_features, axis=-1) - - # Check whether original and HF model outputs match -> np.allclose - if not np.allclose(image_features, hf_image_features, atol=1e-3): - raise ValueError("The predicted image features are not the same.") - if not np.allclose(text_features, hf_text_features, atol=1e-3): - raise ValueError("The predicted text features are not the same.") - print("Model outputs match!") - - if save_model: - # Create folder to save model - if not os.path.isdir(pytorch_dump_folder_path): - os.mkdir(pytorch_dump_folder_path) - # Save converted model and image processor - hf_model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - # Push model and image processor to hub - print("Pushing converted ALIGN to the hub...") - processor.push_to_hub("align-base") - hf_model.push_to_hub("align-base") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--checkpoint_path", - default="./weights/model-weights", - type=str, - help="Path to the pretrained TF ALIGN checkpoint.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default="hf_model", - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument("--save_model", action="store_true", help="Save model to local") - parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub") - - args = parser.parse_args() - convert_align_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.save_model, args.push_to_hub) diff --git 
a/src/transformers/models/altclip/configuration_altclip.py b/src/transformers/models/altclip/configuration_altclip.py index 5e8c0f2a262e..474fc48081b5 100755 --- a/src/transformers/models/altclip/configuration_altclip.py +++ b/src/transformers/models/altclip/configuration_altclip.py @@ -303,7 +303,7 @@ def __init__( # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different. for key, value in _text_config_dict.items(): - if key in text_config and value != text_config[key] and key not in ["transformers_version"]: + if key in text_config and value != text_config[key] and key != "transformers_version": # If specified in `text_config_dict` if key in text_config_dict: message = ( @@ -335,7 +335,7 @@ def __init__( # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different. for key, value in _vision_config_dict.items(): - if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]: + if key in vision_config and value != vision_config[key] and key != "transformers_version": # If specified in `vision_config_dict` if key in vision_config_dict: message = ( diff --git a/src/transformers/models/aria/convert_aria_weights_to_hf.py b/src/transformers/models/aria/convert_aria_weights_to_hf.py deleted file mode 100644 index 4931595f92cf..000000000000 --- a/src/transformers/models/aria/convert_aria_weights_to_hf.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
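The altclip change above only simplifies the membership test: `key not in ["transformers_version"]` becomes `key != "transformers_version"`, with identical behavior for a single excluded key. A toy illustration of the comparison, using made-up dicts rather than the real config objects:

# Values present in both dicts but differing are reported, except `transformers_version`,
# which the code deliberately skips.
_text_config_dict = {"hidden_size": 768, "transformers_version": "4.56.0"}
text_config = {"hidden_size": 512, "transformers_version": "4.55.0"}

for key, value in _text_config_dict.items():
    if key in text_config and value != text_config[key] and key != "transformers_version":
        print(f"`{key}` differs: {value!r} vs {text_config[key]!r}")  # only `hidden_size` is reported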
-import argparse -import glob - -import torch -from huggingface_hub import snapshot_download -from safetensors import safe_open - -from transformers import ( - AddedToken, - AriaForConditionalGeneration, - AriaProcessor, - AutoConfig, - AutoTokenizer, -) - - -EPILOG_TXT = """Example: - python transformers/src/transformers/models/aria/convert_aria_weights_to_hf.py --text_model_id rhymes-ai/Aria --vision_model_id rhymes-ai/Aria --output_hub_path m-ric/Aria_hf_2 --old_state_dict_id rhymes-ai/Aria - -Example for creating the old state dict file with Python: - - import torch - from aria.model.language_model.aria_llama import AriaTextForCausalLM - - # load model - kwargs = {"device_map": "auto", "dtype": torch.float16} - model = AriaTextForCausalLM.from_pretrained("rhymes-ai/Aria", **kwargs) - - # load vision tower - model.get_vision_tower().load_model() - - # Save state dict - torch.save(model.state_dict(), "tmp/hf_models/aria/model_state_dict.bin") -""" - -KEYS_TO_MODIFY_MAPPING = { - "vision_tower.vision_model": "vision_tower", - "ln_ffn": "layer_norm", - "ffn": "feed_forward", - "ln_kv": "layer_norm_kv", -} - - -def load_original_state_dict(model_id): - directory_path = snapshot_download(repo_id=model_id, allow_patterns=["*.safetensors"]) - - original_state_dict = {} - for path in glob.glob(f"{directory_path}/*"): - if path.endswith(".safetensors"): - with safe_open(path, framework="pt", device="cpu") as f: - for key in f.keys(): - original_state_dict[key] = f.get_tensor(key) - - return original_state_dict - - -def convert_state_dict_to_hf(state_dict): - new_state_dict = {} - for key, value in state_dict.items(): - if key.endswith(".inv_freq"): - continue - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - new_state_dict[key] = value - new_state_dict["vision_tower.post_layernorm.weight"] = torch.zeros((1152,)) - new_state_dict["vision_tower.post_layernorm.bias"] = torch.zeros((1152,)) - - return new_state_dict - - -def convert_aria_llama_to_hf(text_model_id, vision_model_id, output_hub_path, old_state_dict_id): - torch.set_default_dtype(torch.float16) - - tokenizer = AutoTokenizer.from_pretrained( - text_model_id, - extra_special_tokens={ - "image_token": "<|img|>", - "pad_token": "", - }, - ) - tokenizer.add_tokens(AddedToken("<|img|>", special=True, normalized=False), special_tokens=True) - tokenizer.add_special_tokens({"pad_token": ""}) - tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}{% elif message['content'] is iterable %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<|img|>{% endif %}{% endfor %}{% endif %}<|im_end|>\n{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}" - - processor = AriaProcessor.from_pretrained( - text_model_id, - tokenizer=tokenizer, - ) - - config = AutoConfig.from_pretrained(text_model_id) - config.vision_config.hidden_size = 1152 - config.vision_config.attention_heads = 16 - config.pad_token_id = 2 - config.image_token_id = 9 - config.intermediate_size = config.moe_intermediate_size - config.auto_map = { - "AutoConfig": "modeling_aria.AriaConfig", - "AutoModelForCausalLM": "modeling_aria.AriaForConditionalGeneration", - } - - with torch.device("meta"): - model = 
AriaForConditionalGeneration(config) - - state_dict = load_original_state_dict(old_state_dict_id) - - state_dict = convert_state_dict_to_hf(state_dict) - model.load_state_dict(state_dict, strict=False, assign=True) - - # print("Saving models") - # model.save_pretrained("local_aria", safe_serialization=False) - # processor.save_pretrained("local_aria") - print("Pushing to hub") - model.push_to_hub(output_hub_path, create_pr=True) - processor.push_to_hub(output_hub_path, create_pr=True) - - -def main(): - parser = argparse.ArgumentParser( - epilog=EPILOG_TXT, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - parser.add_argument( - "--text_model_id", - default="rhymes-ai/Aria", - help="Hub location of the text model", - ) - parser.add_argument( - "--vision_model_id", - default="rhymes-ai/Aria", - help="Hub location of the vision model", - ) - parser.add_argument( - "--output_hub_path", - default="rhymes-ai/Aria", - help="Location on the hub of the converted model", - ) - parser.add_argument( - "--old_state_dict_id", - default="rhymes-ai/Aria", - help="Location on the hub of the raw state dict of the original model. The filename needs to be `model_state_dict.bin`", - ) - args = parser.parse_args() - convert_aria_llama_to_hf(args.text_model_id, args.vision_model_id, args.output_hub_path, args.old_state_dict_id) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/aria/image_processing_aria.py b/src/transformers/models/aria/image_processing_aria.py index 4fc2fcf7ec6b..659ed5f112d8 100644 --- a/src/transformers/models/aria/image_processing_aria.py +++ b/src/transformers/models/aria/image_processing_aria.py @@ -43,12 +43,12 @@ logger = logging.get_logger(__name__) -def divide_to_patches(image: np.ndarray, patch_size: int, input_data_format) -> list[np.array]: +def divide_to_patches(image: np.ndarray, patch_size: int, input_data_format) -> list[np.ndarray]: """ Divides an image into patches of a specified size. Args: - image (`np.array`): + image (`np.ndarray`): The input image. patch_size (`int`): The size of each patch. @@ -56,7 +56,7 @@ def divide_to_patches(image: np.ndarray, patch_size: int, input_data_format) -> The channel dimension format of the input image. Returns: - list: A list of np.array representing the patches. + list: A list of np.ndarray representing the patches. """ patches = [] height, width = get_image_size(image, channel_dim=input_data_format) @@ -342,12 +342,12 @@ def preprocess( def _resize_for_patching( self, image: np.ndarray, target_resolution: tuple, resample, input_data_format: ChannelDimension - ) -> np.array: + ) -> np.ndarray: """ Resizes an image to a target resolution while maintaining aspect ratio. Args: - image (np.array): + image (np.ndarray): The input image. target_resolution (tuple): The target resolution (height, width) of the image. @@ -357,7 +357,7 @@ def _resize_for_patching( The channel dimension format of the input image. Returns: - np.array: The resized and padded image. + np.ndarray: The resized and padded image. """ new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format) @@ -375,7 +375,7 @@ def _get_padding_size(self, original_resolution: tuple, target_resolution: tuple def _pad_for_patching( self, image: np.ndarray, target_resolution: tuple, input_data_format: ChannelDimension - ) -> np.array: + ) -> np.ndarray: """ Pad an image to a target resolution while maintaining aspect ratio. 
""" @@ -460,12 +460,12 @@ def get_image_patches( resample: PILImageResampling, data_format: ChannelDimension, input_data_format: ChannelDimension, - ) -> list[np.array]: + ) -> list[np.ndarray]: """ Process an image with variable resolutions by dividing it into patches. Args: - image (`np.array`): + image (`np.ndarray`): The input image to be processed. grid_pinpoints (list[tuple[int, int]]): A list of possible resolutions as tuples. @@ -479,7 +479,7 @@ def get_image_patches( The channel dimension format of the input image. Returns: - `list[np.array]`: A list of NumPy arrays containing the processed image patches. + `list[np.ndarray]`: A list of NumPy arrays containing the processed image patches. """ if not isinstance(grid_pinpoints, list): raise TypeError("grid_pinpoints must be a list of possible resolutions.") diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index a626d2cd4b82..02f2f884dadf 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -725,12 +725,12 @@ def preprocess( def _resize_for_patching( self, image: np.ndarray, target_resolution: tuple, resample, input_data_format: ChannelDimension - ) -> np.array: + ) -> np.ndarray: """ Resizes an image to a target resolution while maintaining aspect ratio. Args: - image (np.array): + image (np.ndarray): The input image. target_resolution (tuple): The target resolution (height, width) of the image. @@ -740,7 +740,7 @@ def _resize_for_patching( The channel dimension format of the input image. Returns: - np.array: The resized and padded image. + np.ndarray: The resized and padded image. """ new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format) @@ -758,7 +758,7 @@ def _get_padding_size(self, original_resolution: tuple, target_resolution: tuple def _pad_for_patching( self, image: np.ndarray, target_resolution: tuple, input_data_format: ChannelDimension - ) -> np.array: + ) -> np.ndarray: """ Pad an image to a target resolution while maintaining aspect ratio. """ @@ -843,12 +843,12 @@ def get_image_patches( resample: PILImageResampling, data_format: ChannelDimension, input_data_format: ChannelDimension, - ) -> list[np.array]: + ) -> list[np.ndarray]: """ Process an image with variable resolutions by dividing it into patches. Args: - image (`np.array`): + image (`np.ndarray`): The input image to be processed. grid_pinpoints (list[tuple[int, int]]): A list of possible resolutions as tuples. @@ -862,7 +862,7 @@ def get_image_patches( The channel dimension format of the input image. Returns: - `list[np.array]`: A list of NumPy arrays containing the processed image patches. + `list[np.ndarray]`: A list of NumPy arrays containing the processed image patches. """ if not isinstance(grid_pinpoints, list): raise TypeError("grid_pinpoints must be a list of possible resolutions.") diff --git a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py deleted file mode 100644 index 325e0f65b47c..000000000000 --- a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py +++ /dev/null @@ -1,279 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Audio Spectrogram Transformer checkpoints from the original repository. URL: https://github.com/YuanGongND/ast""" - -import argparse -import json -from pathlib import Path - -import torch -import torchaudio -from datasets import load_dataset -from huggingface_hub import hf_hub_download - -from transformers import ASTConfig, ASTFeatureExtractor, ASTForAudioClassification -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_audio_spectrogram_transformer_config(model_name): - config = ASTConfig() - - if "10-10" in model_name: - pass - elif "speech-commands" in model_name: - config.max_length = 128 - elif "12-12" in model_name: - config.time_stride = 12 - config.frequency_stride = 12 - elif "14-14" in model_name: - config.time_stride = 14 - config.frequency_stride = 14 - elif "16-16" in model_name: - config.time_stride = 16 - config.frequency_stride = 16 - else: - raise ValueError("Model not supported") - - repo_id = "huggingface/label-files" - if "speech-commands" in model_name: - config.num_labels = 35 - filename = "speech-commands-v2-id2label.json" - else: - config.num_labels = 527 - filename = "audioset-id2label.json" - - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - return config - - -def rename_key(name): - if "module.v" in name: - name = name.replace("module.v", "audio_spectrogram_transformer") - if "cls_token" in name: - name = name.replace("cls_token", "embeddings.cls_token") - if "dist_token" in name: - name = name.replace("dist_token", "embeddings.distillation_token") - if "pos_embed" in name: - name = name.replace("pos_embed", "embeddings.position_embeddings") - if "patch_embed.proj" in name: - name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection") - # transformer blocks - if "blocks" in name: - name = name.replace("blocks", "encoder.layer") - if "attn.proj" in name: - name = name.replace("attn.proj", "attention.output.dense") - if "attn" in name: - name = name.replace("attn", "attention.self") - if "norm1" in name: - name = name.replace("norm1", "layernorm_before") - if "norm2" in name: - name = name.replace("norm2", "layernorm_after") - if "mlp.fc1" in name: - name = name.replace("mlp.fc1", "intermediate.dense") - if "mlp.fc2" in name: - name = name.replace("mlp.fc2", "output.dense") - # final layernorm - if "audio_spectrogram_transformer.norm" in name: - name = name.replace("audio_spectrogram_transformer.norm", "audio_spectrogram_transformer.layernorm") - # classifier head - if "module.mlp_head.0" in name: - name = name.replace("module.mlp_head.0", "classifier.layernorm") - if "module.mlp_head.1" in name: - name = name.replace("module.mlp_head.1", "classifier.dense") - - return name - - -def convert_state_dict(orig_state_dict, config): - for 
key in orig_state_dict.copy(): - val = orig_state_dict.pop(key) - - if "qkv" in key: - key_split = key.split(".") - layer_num = int(key_split[3]) - dim = config.hidden_size - if "weight" in key: - orig_state_dict[ - f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.query.weight" - ] = val[:dim, :] - orig_state_dict[ - f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.key.weight" - ] = val[dim : dim * 2, :] - orig_state_dict[ - f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.value.weight" - ] = val[-dim:, :] - else: - orig_state_dict[ - f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.query.bias" - ] = val[:dim] - orig_state_dict[ - f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.key.bias" - ] = val[dim : dim * 2] - orig_state_dict[ - f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.value.bias" - ] = val[-dim:] - else: - orig_state_dict[rename_key(key)] = val - - return orig_state_dict - - -def remove_keys(state_dict): - ignore_keys = [ - "module.v.head.weight", - "module.v.head.bias", - "module.v.head_dist.weight", - "module.v.head_dist.bias", - ] - for k in ignore_keys: - state_dict.pop(k, None) - - -@torch.no_grad() -def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our Audio Spectrogram Transformer structure. - """ - config = get_audio_spectrogram_transformer_config(model_name) - - model_name_to_url = { - "ast-finetuned-audioset-10-10-0.4593": ( - "https://www.dropbox.com/s/ca0b1v2nlxzyeb4/audioset_10_10_0.4593.pth?dl=1" - ), - "ast-finetuned-audioset-10-10-0.450": ( - "https://www.dropbox.com/s/1tv0hovue1bxupk/audioset_10_10_0.4495.pth?dl=1" - ), - "ast-finetuned-audioset-10-10-0.448": ( - "https://www.dropbox.com/s/6u5sikl4b9wo4u5/audioset_10_10_0.4483.pth?dl=1" - ), - "ast-finetuned-audioset-10-10-0.448-v2": ( - "https://www.dropbox.com/s/kt6i0v9fvfm1mbq/audioset_10_10_0.4475.pth?dl=1" - ), - "ast-finetuned-audioset-12-12-0.447": ( - "https://www.dropbox.com/s/snfhx3tizr4nuc8/audioset_12_12_0.4467.pth?dl=1" - ), - "ast-finetuned-audioset-14-14-0.443": ( - "https://www.dropbox.com/s/z18s6pemtnxm4k7/audioset_14_14_0.4431.pth?dl=1" - ), - "ast-finetuned-audioset-16-16-0.442": ( - "https://www.dropbox.com/s/mdsa4t1xmcimia6/audioset_16_16_0.4422.pth?dl=1" - ), - "ast-finetuned-speech-commands-v2": ( - "https://www.dropbox.com/s/q0tbqpwv44pquwy/speechcommands_10_10_0.9812.pth?dl=1" - ), - } - - # load original state_dict - checkpoint_url = model_name_to_url[model_name] - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu") - # remove some keys - remove_keys(state_dict) - # rename some keys - new_state_dict = convert_state_dict(state_dict, config) - - # load 🤗 model - model = ASTForAudioClassification(config) - model.eval() - - model.load_state_dict(new_state_dict) - - # verify outputs on dummy input - # source: https://github.com/YuanGongND/ast/blob/79e873b8a54d0a3b330dd522584ff2b9926cd581/src/run.py#L62 - mean = -4.2677393 if "speech-commands" not in model_name else -6.845978 - std = 4.5689974 if "speech-commands" not in model_name else 5.5654526 - max_length = 1024 if "speech-commands" not in model_name else 128 - feature_extractor = ASTFeatureExtractor(mean=mean, std=std, max_length=max_length) - - if "speech-commands" in model_name: - # TODO: Convert dataset to Parquet - dataset = 
load_dataset("google/speech_commands", "v0.02", split="validation") - waveform = dataset[0]["audio"]["array"] - else: - filepath = hf_hub_download( - repo_id="nielsr/audio-spectogram-transformer-checkpoint", - filename="sample_audio.flac", - repo_type="dataset", - ) - - waveform, _ = torchaudio.load(filepath) - waveform = waveform.squeeze().numpy() - - inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt") - - # forward pass - outputs = model(**inputs) - logits = outputs.logits - - if model_name == "ast-finetuned-audioset-10-10-0.4593": - expected_slice = torch.tensor([-0.8760, -7.0042, -8.6602]) - elif model_name == "ast-finetuned-audioset-10-10-0.450": - expected_slice = torch.tensor([-1.1986, -7.0903, -8.2718]) - elif model_name == "ast-finetuned-audioset-10-10-0.448": - expected_slice = torch.tensor([-2.6128, -8.0080, -9.4344]) - elif model_name == "ast-finetuned-audioset-10-10-0.448-v2": - expected_slice = torch.tensor([-1.5080, -7.4534, -8.8917]) - elif model_name == "ast-finetuned-audioset-12-12-0.447": - expected_slice = torch.tensor([-0.5050, -6.5833, -8.0843]) - elif model_name == "ast-finetuned-audioset-14-14-0.443": - expected_slice = torch.tensor([-0.3826, -7.0336, -8.2413]) - elif model_name == "ast-finetuned-audioset-16-16-0.442": - expected_slice = torch.tensor([-1.2113, -6.9101, -8.3470]) - elif model_name == "ast-finetuned-speech-commands-v2": - expected_slice = torch.tensor([6.1589, -8.0566, -8.7984]) - else: - raise ValueError("Unknown model name") - if not torch.allclose(logits[0, :3], expected_slice, atol=1e-4): - raise ValueError("Logits don't match") - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving feature extractor to {pytorch_dump_folder_path}") - feature_extractor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model and feature extractor to the hub...") - model.push_to_hub(f"MIT/{model_name}") - feature_extractor.push_to_hub(f"MIT/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="ast-finetuned-audioset-10-10-0.4593", - type=str, - help="Name of the Audio Spectrogram Transformer model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." 
- ) - - args = parser.parse_args() - convert_audio_spectrogram_transformer_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 38f38cd31b40..f6a12e7cef98 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -65,6 +65,7 @@ ("blip-2", "Blip2Config"), ("blip_2_qformer", "Blip2QFormerConfig"), ("bloom", "BloomConfig"), + ("blt", "BltConfig"), ("bridgetower", "BridgeTowerConfig"), ("bros", "BrosConfig"), ("camembert", "CamembertConfig"), @@ -126,6 +127,9 @@ ("dots1", "Dots1Config"), ("dpr", "DPRConfig"), ("dpt", "DPTConfig"), + ("edgetam", "EdgeTamConfig"), + ("edgetam_video", "EdgeTamVideoConfig"), + ("edgetam_vision_model", "EdgeTamVisionConfig"), ("efficientformer", "EfficientFormerConfig"), ("efficientloftr", "EfficientLoFTRConfig"), ("efficientnet", "EfficientNetConfig"), @@ -222,6 +226,7 @@ ("led", "LEDConfig"), ("levit", "LevitConfig"), ("lfm2", "Lfm2Config"), + ("lfm2_vl", "Lfm2VlConfig"), ("lightglue", "LightGlueConfig"), ("lilt", "LiltConfig"), ("llama", "LlamaConfig"), @@ -294,6 +299,8 @@ ("owlv2", "Owlv2Config"), ("owlvit", "OwlViTConfig"), ("paligemma", "PaliGemmaConfig"), + ("parakeet_ctc", "ParakeetCTCConfig"), + ("parakeet_encoder", "ParakeetEncoderConfig"), ("patchtsmixer", "PatchTSMixerConfig"), ("patchtst", "PatchTSTConfig"), ("pegasus", "PegasusConfig"), @@ -328,6 +335,7 @@ ("qwen3", "Qwen3Config"), ("qwen3_moe", "Qwen3MoeConfig"), ("qwen3_next", "Qwen3NextConfig"), + ("qwen3_omni_moe", "Qwen3OmniMoeConfig"), ("qwen3_vl", "Qwen3VLConfig"), ("qwen3_vl_moe", "Qwen3VLMoeConfig"), ("qwen3_vl_moe_text", "Qwen3VLMoeTextConfig"), @@ -366,6 +374,7 @@ ("shieldgemma2", "ShieldGemma2Config"), ("siglip", "SiglipConfig"), ("siglip2", "Siglip2Config"), + ("siglip2_vision_model", "Siglip2VisionConfig"), ("siglip_vision_model", "SiglipVisionConfig"), ("smollm3", "SmolLM3Config"), ("smolvlm", "SmolVLMConfig"), @@ -488,6 +497,7 @@ ("blip-2", "BLIP-2"), ("blip_2_qformer", "BLIP-2 QFormer"), ("bloom", "BLOOM"), + ("blt", "Blt"), ("bort", "BORT"), ("bridgetower", "BridgeTower"), ("bros", "BROS"), @@ -556,6 +566,9 @@ ("dots1", "dots1"), ("dpr", "DPR"), ("dpt", "DPT"), + ("edgetam", "EdgeTAM"), + ("edgetam_video", "EdgeTamVideo"), + ("edgetam_vision_model", "EdgeTamVisionModel"), ("efficientformer", "EfficientFormer"), ("efficientloftr", "EfficientLoFTR"), ("efficientnet", "EfficientNet"), @@ -657,6 +670,7 @@ ("led", "LED"), ("levit", "LeViT"), ("lfm2", "Lfm2"), + ("lfm2_vl", "Lfm2Vl"), ("lightglue", "LightGlue"), ("lilt", "LiLT"), ("llama", "LLaMA"), @@ -739,6 +753,9 @@ ("owlv2", "OWLv2"), ("owlvit", "OWL-ViT"), ("paligemma", "PaliGemma"), + ("parakeet", "Parakeet"), + ("parakeet_ctc", "Parakeet"), + ("parakeet_encoder", "ParakeetEncoder"), ("patchtsmixer", "PatchTSMixer"), ("patchtst", "PatchTST"), ("pegasus", "Pegasus"), @@ -774,6 +791,7 @@ ("qwen3", "Qwen3"), ("qwen3_moe", "Qwen3MoE"), ("qwen3_next", "Qwen3Next"), + ("qwen3_omni_moe", "Qwen3OmniMoE"), ("qwen3_vl", "Qwen3VL"), ("qwen3_vl_moe", "Qwen3VLMoe"), ("qwen3_vl_moe_text", "Qwen3VLMoe"), @@ -958,6 +976,7 @@ ("glm4v_moe_text", "glm4v_moe"), ("idefics3_vision", "idefics3"), ("siglip_vision_model", "siglip"), + ("siglip2_vision_model", "siglip2"), ("aimv2_vision_model", "aimv2"), ("smolvlm_vision", "smolvlm"), ("chinese_clip_vision_model", "chinese_clip"), @@ -970,12 +989,15 @@ ("qwen3_vl_moe_text", "qwen3_vl_moe"), 
("sam_vision_model", "sam"), ("sam2_vision_model", "sam2"), + ("edgetam_vision_model", "edgetam"), ("sam2_hiera_det_model", "sam2"), ("sam_hq_vision_model", "sam_hq"), ("llama4_text", "llama4"), ("blip_2_qformer", "blip_2"), ("fastspeech2_conformer_with_hifigan", "fastspeech2_conformer"), ("perception_encoder", "perception_lm"), + ("parakeet_encoder", "parakeet"), + ("parakeet_ctc", "parakeet"), ] ) diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 0307aeba077f..6d4c4f554d9d 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -81,6 +81,8 @@ ("moshi", "EncodecFeatureExtractor"), ("nat", "ViTFeatureExtractor"), ("owlvit", "OwlViTFeatureExtractor"), + ("parakeet_ctc", "ParakeetFeatureExtractor"), + ("parakeet_encoder", "ParakeetFeatureExtractor"), ("perceiver", "PerceiverFeatureExtractor"), ("phi4_multimodal", "Phi4MultimodalFeatureExtractor"), ("poolformer", "PoolFormerFeatureExtractor"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index ebaa4a30849d..4b71712dfc7b 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -91,6 +91,7 @@ ("dinov3_vit", (None, "DINOv3ViTImageProcessorFast")), ("donut-swin", ("DonutImageProcessor", "DonutImageProcessorFast")), ("dpt", ("DPTImageProcessor", "DPTImageProcessorFast")), + ("edgetam", (None, "Sam2ImageProcessorFast")), ("efficientformer", ("EfficientFormerImageProcessor", None)), ("efficientloftr", ("EfficientLoFTRImageProcessor", "EfficientLoFTRImageProcessorFast")), ("efficientnet", ("EfficientNetImageProcessor", "EfficientNetImageProcessorFast")), @@ -120,6 +121,7 @@ ("layoutlmv2", ("LayoutLMv2ImageProcessor", "LayoutLMv2ImageProcessorFast")), ("layoutlmv3", ("LayoutLMv3ImageProcessor", "LayoutLMv3ImageProcessorFast")), ("levit", ("LevitImageProcessor", "LevitImageProcessorFast")), + ("lfm2_vl", (None, "Lfm2VlImageProcessorFast")), ("lightglue", ("LightGlueImageProcessor", None)), ("llama4", ("Llama4ImageProcessor", "Llama4ImageProcessorFast")), ("llava", ("LlavaImageProcessor", "LlavaImageProcessorFast")), @@ -564,9 +566,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): ) image_processor_class = get_image_processor_class_from_name(image_processor_type) else: - image_processor_type_slow = ( - image_processor_type[:-4] if image_processor_type.endswith("Fast") else image_processor_type - ) + image_processor_type_slow = image_processor_type.removesuffix("Fast") image_processor_class = get_image_processor_class_from_name(image_processor_type_slow) if image_processor_class is None and image_processor_type.endswith("Fast"): raise ValueError( diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 93420820fb9e..298834bebe93 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -72,6 +72,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): ("blip-2", "Blip2Model"), ("blip_2_qformer", "Blip2QFormerModel"), ("bloom", "BloomModel"), + ("blt", "BltModel"), ("bridgetower", "BridgeTowerModel"), ("bros", "BrosModel"), ("camembert", "CamembertModel"), @@ -130,6 +131,9 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): ("dots1", "Dots1Model"), ("dpr", "DPRQuestionEncoder"), 
("dpt", "DPTModel"), + ("edgetam", "EdgeTamModel"), + ("edgetam_video", "EdgeTamVideoModel"), + ("edgetam_vision_model", "EdgeTamVisionModel"), ("efficientformer", "EfficientFormerModel"), ("efficientloftr", "EfficientLoFTRModel"), ("efficientnet", "EfficientNetModel"), @@ -222,6 +226,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): ("led", "LEDModel"), ("levit", "LevitModel"), ("lfm2", "Lfm2Model"), + ("lfm2_vl", "Lfm2VlModel"), ("lightglue", "LightGlueForKeypointMatching"), ("lilt", "LiltModel"), ("llama", "LlamaModel"), @@ -293,6 +298,8 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): ("owlv2", "Owlv2Model"), ("owlvit", "OwlViTModel"), ("paligemma", "PaliGemmaModel"), + ("parakeet_ctc", "ParakeetForCTC"), + ("parakeet_encoder", "ParakeetEncoder"), ("patchtsmixer", "PatchTSMixerModel"), ("patchtst", "PatchTSTModel"), ("pegasus", "PegasusModel"), @@ -356,6 +363,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): ("sew-d", "SEWDModel"), ("siglip", "SiglipModel"), ("siglip2", "Siglip2Model"), + ("siglip2_vision_model", "Siglip2VisionModel"), ("siglip_vision_model", "SiglipVisionModel"), ("smollm3", "SmolLM3Model"), ("smolvlm", "SmolVLMModel"), @@ -631,6 +639,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): ("blenderbot", "BlenderbotForCausalLM"), ("blenderbot-small", "BlenderbotSmallForCausalLM"), ("bloom", "BloomForCausalLM"), + ("blt", "BltForCausalLM"), ("camembert", "CamembertForCausalLM"), ("code_llama", "LlamaForCausalLM"), ("codegen", "CodeGenForCausalLM"), @@ -1026,6 +1035,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): ("janus", "JanusForConditionalGeneration"), ("kosmos-2", "Kosmos2ForConditionalGeneration"), ("kosmos-2.5", "Kosmos2_5ForConditionalGeneration"), + ("lfm2_vl", "Lfm2VlForConditionalGeneration"), ("llama4", "Llama4ForConditionalGeneration"), ("llava", "LlavaForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"), @@ -1596,6 +1606,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): ("data2vec-audio", "Data2VecAudioForCTC"), ("hubert", "HubertForCTC"), ("mctct", "MCTCTForCTC"), + ("parakeet_ctc", "ParakeetForCTC"), ("sew", "SEWForCTC"), ("sew-d", "SEWDForCTC"), ("unispeech", "UniSpeechForCTC"), @@ -1649,6 +1660,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): ("musicgen", "MusicgenForConditionalGeneration"), ("musicgen_melody", "MusicgenMelodyForConditionalGeneration"), ("qwen2_5_omni", "Qwen2_5OmniForConditionalGeneration"), + ("qwen3_omni_moe", "Qwen3OmniMoeForConditionalGeneration"), ("seamless_m4t", "SeamlessM4TForTextToSpeech"), ("seamless_m4t_v2", "SeamlessM4Tv2ForTextToSpeech"), ("vits", "VitsModel"), @@ -1700,6 +1712,8 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = OrderedDict( [ + ("edgetam", "EdgeTamModel"), + ("edgetam_video", "EdgeTamModel"), ("sam", "SamModel"), ("sam2", "Sam2Model"), ("sam2_video", "Sam2Model"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 13583c55002f..11862a5896b9 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -66,6 +66,7 @@ ("deepseek_vl", "DeepseekVLProcessor"), ("deepseek_vl_hybrid", "DeepseekVLHybridProcessor"), ("dia", "DiaProcessor"), + ("edgetam", "Sam2Processor"), ("emu3", "Emu3Processor"), ("evolla", "EvollaProcessor"), ("flava", "FlavaProcessor"), @@ -93,6 +94,7 @@ 
("kyutai_speech_to_text", "KyutaiSpeechToTextProcessor"), ("layoutlmv2", "LayoutLMv2Processor"), ("layoutlmv3", "LayoutLMv3Processor"), + ("lfm2_vl", "Lfm2VlProcessor"), ("llama4", "Llama4Processor"), ("llava", "LlavaProcessor"), ("llava_next", "LlavaNextProcessor"), @@ -120,6 +122,7 @@ ("qwen2_5_vl", "Qwen2_5_VLProcessor"), ("qwen2_audio", "Qwen2AudioProcessor"), ("qwen2_vl", "Qwen2VLProcessor"), + ("qwen3_omni_moe", "Qwen3OmniMoeProcessor"), ("qwen3_vl", "Qwen3VLProcessor"), ("qwen3_vl_moe", "Qwen3VLProcessor"), ("sam", "SamProcessor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 7858ae587946..d0c3af490d71 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -105,6 +105,7 @@ ("blip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("blip-2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("bloom", (None, "BloomTokenizerFast" if is_tokenizers_available() else None)), + ("blt", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), ("bridgetower", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), ("bros", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("byt5", ("ByT5Tokenizer", None)), @@ -501,6 +502,7 @@ ("owlv2", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)), ("owlvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)), ("paligemma", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("parakeet", ("ParakeetCTCTokenizer", None)), ( "pegasus", ( @@ -585,6 +587,7 @@ "Qwen2TokenizerFast" if is_tokenizers_available() else None, ), ), + ("qwen3_omni_moe", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), ("qwen3_vl", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), ("qwen3_vl_moe", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), ("rag", ("RagTokenizer", None)), @@ -1139,7 +1142,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): # Otherwise we have to be creative. # if model is an encoder decoder, the encoder tokenizer class is used by default if isinstance(config, EncoderDecoderConfig): - if type(config.decoder) is not type(config.encoder): # noqa: E721 + if type(config.decoder) is not type(config.encoder): logger.warning( f"The encoder model config class: {config.encoder.__class__} is different from the decoder model " f"config class: {config.decoder.__class__}. 
It is not recommended to use the " diff --git a/src/transformers/models/auto/video_processing_auto.py b/src/transformers/models/auto/video_processing_auto.py index 551de914626e..84bbc8e6fdb1 100644 --- a/src/transformers/models/auto/video_processing_auto.py +++ b/src/transformers/models/auto/video_processing_auto.py @@ -56,6 +56,7 @@ ("qwen2_5_omni", "Qwen2VLVideoProcessor"), ("qwen2_5_vl", "Qwen2VLVideoProcessor"), ("qwen2_vl", "Qwen2VLVideoProcessor"), + ("qwen3_omni_moe", "Qwen2VLVideoProcessor"), ("qwen3_vl", "Qwen3VLVideoProcessor"), ("qwen3_vl_moe", "Qwen3VLVideoProcessor"), ("sam2_video", "Sam2VideoVideoProcessor"), diff --git a/src/transformers/models/bamba/convert_mamba_ssm_checkpoint.py b/src/transformers/models/bamba/convert_mamba_ssm_checkpoint.py deleted file mode 100644 index eaf387a89271..000000000000 --- a/src/transformers/models/bamba/convert_mamba_ssm_checkpoint.py +++ /dev/null @@ -1,273 +0,0 @@ -# coding=utf-8 -# Copyright 2024 IBM and the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""This script can be used to convert checkpoints provided in the `mamba_ssm` library into the format provided in HuggingFace `transformers`. It depends on the `mamba2_ssm` package to be installed.""" - -import argparse -import json -import os -import re -from os import path -from typing import Optional, Union - -import torch -from huggingface_hub import split_torch_state_dict_into_shards -from safetensors.torch import save_file - -from transformers import AutoTokenizer -from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME - -from .configuration_bamba import BambaConfig - - -def convert_state_dict_from_mamba_ssm(original_sd: dict) -> dict[str, torch.Tensor]: - state_dict = {} - - for orig_k, param in original_sd.items(): - k = orig_k.replace("backbone", "model") - - # for embeddings - k = k.replace("embedding", "embed_tokens") - - # for mixer - k = k.replace("mixer", "mamba") - - # for final layernorm - k = k.replace("norm_f", "final_layernorm") - - # for block layernorm - k = re.sub(r"(\d+)\.norm\.", r"\1.input_layernorm.", k) - k = re.sub(r"(\d+)\.norm2\.", r"\1.pre_ff_layernorm.", k) - - # for mlp - k = k.replace("mlp.fc2", "feed_forward.down_proj") - - if "mlp.fc1" in k: - param, param2 = torch.chunk(param, 2, dim=0) - k2 = k.replace("mlp.fc1", "feed_forward.gate_proj") - state_dict[k2] = param2 - k = k.replace("mlp.fc1", "feed_forward.up_proj") - - if ("in_proj" in k and orig_k.replace("in_proj", "conv1d") in original_sd) or ( - "out_proj" in k and orig_k.replace("out_proj", "conv1d") in original_sd - ): - # then this must be a mamba - pass - else: - # for attn - # - because mixer was replaced to mamba above - k = k.replace("mamba.out_proj", "self_attn.o_proj") - if "mamba.in_proj" in k: - m, n = param.shape - d = (m - n) // 2 - param, param2, param3 = torch.split(param, [n, d, d], dim=0) - k2 = k.replace("mamba.in_proj", "self_attn.k_proj") - state_dict[k2] = param2 - k2 = k.replace("mamba.in_proj", "self_attn.v_proj") - 
state_dict[k2] = param3 - k = k.replace("mamba.in_proj", "self_attn.q_proj") - - state_dict[k] = param - - return state_dict - - -# Adapted from transformers.models.mamba.convert_mamba_ssm_checkpoint_to_pytorch.py -def convert_ssm_config_to_hf_config( - config_ssm: dict, - **kwargs, -) -> BambaConfig: - """Convert a config from mamba_ssm to a BambaConfig from here.""" - hf_config: BambaConfig = BambaConfig(**kwargs) - - hf_config.architectures = ["BambaForCausalLM"] - - # Set important values from config and recalculate other resulting entries - hf_config.hidden_size = config_ssm["d_model"] - hf_config.intermediate_size = config_ssm["d_intermediate"] - hf_config.mamba_n_heads = (hf_config.hidden_size * hf_config.mamba_expand) // hf_config.mamba_d_head - hf_config.num_hidden_layers = config_ssm["n_layer"] - hf_config.tie_word_embeddings = config_ssm["tie_embeddings"] - - # currently this script assumes config_ssm belongs to v2 - if config_ssm["ssm_cfg"].get("layer") != "Mamba2": - raise ValueError("Conversion script only supports Mamba2") - - # Set attention values - attn_cfg = config_ssm.get("attn_cfg") - if attn_cfg: - assert attn_cfg["causal"], "Only support non-causal attention." - assert not attn_cfg["qkv_proj_bias"], "Only support no qkv bias." - assert not attn_cfg["out_proj_bias"], "Only support no out bias." - hf_config.attn_rotary_emb = attn_cfg["rotary_emb_dim"] - hf_config.num_attention_heads = attn_cfg["num_heads"] - hf_config.num_key_value_heads = attn_cfg["num_heads_kv"] - - attention_layer_indices = config_ssm.get("attn_layer_idx") - if attention_layer_indices: - hf_config.attn_layer_indices = attention_layer_indices - - # Padded vocab size, mostly of 16 but 32 is also very common in different models - vocab_size = config_ssm["vocab_size"] - pad_vocab_size_multiple = config_ssm["pad_vocab_size_multiple"] - if (vocab_size % pad_vocab_size_multiple) != 0: - vocab_size += pad_vocab_size_multiple - (vocab_size % pad_vocab_size_multiple) - hf_config.vocab_size = vocab_size - - return hf_config - - -def save_single_safetensor( - state_dict: dict, - save_directory: str, - metadata: dict, -): - save_file( - state_dict, - os.path.join(save_directory, SAFE_WEIGHTS_NAME), - metadata, - ) - - -def save_sharded_safetensors( - state_dict: dict, - save_directory: str, - metadata: dict, - max_shard_size: Union[int, str] = "5GB", -): - filename_pattern = SAFE_WEIGHTS_NAME.replace(".bin", "{suffix}.bin").replace( - ".safetensors", "{suffix}.safetensors" - ) - state_dict_split = split_torch_state_dict_into_shards( - state_dict, filename_pattern=filename_pattern, max_shard_size=max_shard_size - ) - index = { - "metadata": state_dict_split.metadata, - "weight_map": state_dict_split.tensor_to_filename, - } - # Save the index - with open(os.path.join(save_directory, SAFE_WEIGHTS_INDEX_NAME), "w", encoding="utf-8") as f: - content = json.dumps(index, indent=2, sort_keys=True) + "\n" - f.write(content) - - filename_to_tensors = state_dict_split.filename_to_tensors.items() - for shard_file, tensors in filename_to_tensors: - shard = {tensor: state_dict[tensor].contiguous() for tensor in tensors} - save_file(shard, os.path.join(save_directory, shard_file), metadata=metadata) - - -# Adapted from transformers.models.mamba.convert_mamba_ssm_checkpoint_to_pytorch.py -def convert_mamba_ssm_checkpoint_file_to_huggingface_model_file( - mamba_ssm_checkpoint_path: str, - precision: str, - output_dir: str, - tokenizer_path: Optional[str] = None, - save_model: Union[bool, str] = True, -) -> None: - # load 
tokenizer if provided, this will be used to set the - # token_ids in the config file - token_ids = {} - if tokenizer_path: - tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) - for key in [ - "bos_token_id", - "eos_token_id", - "pad_token_id", - ]: - id = getattr(tokenizer, key, None) - if id: - token_ids[key] = id - - # there are some configs unsettable by the mamba_ssm config, so - # if there are changes from the defaults, have to pass them into - # the function - unsettables = { - "mamba_d_head": 64, - "mamba_d_state": 128, - "mamba_n_groups": 1, - "rms_norm_eps": 1e-5, - } - - # Load and save config based on name - config_path = path.join(mamba_ssm_checkpoint_path, "config.json") - with open(config_path, "r", encoding="utf-8") as json_file: - config = json.load(json_file) - - # convert the config - hf_config = convert_ssm_config_to_hf_config( - config_ssm=config, - **token_ids, - **unsettables, - ) - hf_config.save_pretrained(output_dir) - - # Load state dict of the original model and transfer to hf model - state_dict = torch.load( - path.join(mamba_ssm_checkpoint_path, "pytorch_model.bin"), - map_location="cpu", - weights_only=True, - ) - # FIXME: allow other parameters to pass in - state_dict = convert_state_dict_from_mamba_ssm(state_dict) - - # Save new model to pytorch_dump_path - dtype = torch.float32 if precision == "fp32" else (torch.bfloat16 if precision == "bf16" else torch.float16) - - save_file_fn = None - if isinstance(save_model, bool) and save_model: - save_file_fn = save_single_safetensor - elif isinstance(save_model, str) and save_model == "sharded": - save_file_fn = save_sharded_safetensors - - if save_file_fn: - save_file_fn({k: v.to(dtype) for k, v in state_dict.items()}, output_dir, metadata={"format": "pt"}) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-i", - "--mamba_ssm_checkpoint_directory", - type=str, - required=True, - help="Path to a directory containing the `pytorch_model.bin` mamba_ssm checkpoint file to be converted.", - ) - parser.add_argument( - "-p", - "--precision", - type=str, - default="fp16", - required=True, - choices=("fp32", "fp16", "bf16"), - help="The precision the model will be saved in. Select from fp32, fp16 or bf16.", - ) - parser.add_argument( - "-o", "--output_dir", type=str, required=True, help="Path to directory to save the converted output model to." - ) - parser.add_argument( - "-t", - "--tokenizer_model_path", - type=str, - default=None, - required=False, - help="Path to the tokenizer file.", - ) - args = parser.parse_args() - - convert_mamba_ssm_checkpoint_file_to_huggingface_model_file( - args.mamba_ssm_checkpoint_directory, - args.precision, - args.output_dir, - save_model="sharded", - ) diff --git a/src/transformers/models/bamba/modeling_bamba.py b/src/transformers/models/bamba/modeling_bamba.py index 09f00845524d..60bf385bf494 100644 --- a/src/transformers/models/bamba/modeling_bamba.py +++ b/src/transformers/models/bamba/modeling_bamba.py @@ -531,7 +531,7 @@ def __init__(self, config: BambaConfig, layer_idx: int): if not is_fast_path_available: logger.warning_once( - "The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`" + "The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`" " is None. Falling back to the naive implementation.
To install follow https://github.com/state-spaces/mamba/#installation and" " https://github.com/Dao-AILab/causal-conv1d" ) diff --git a/src/transformers/models/bamba/modular_bamba.py b/src/transformers/models/bamba/modular_bamba.py index f2495b446aa5..5ae5313d21b8 100644 --- a/src/transformers/models/bamba/modular_bamba.py +++ b/src/transformers/models/bamba/modular_bamba.py @@ -288,7 +288,7 @@ def __init__(self, config: BambaConfig, layer_idx: int): if not is_fast_path_available: logger.warning_once( - "The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`" + "The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`" " is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and" " https://github.com/Dao-AILab/causal-conv1d" ) diff --git a/src/transformers/models/bark/convert_suno_to_hf.py b/src/transformers/models/bark/convert_suno_to_hf.py deleted file mode 100644 index af2c4f3e8d73..000000000000 --- a/src/transformers/models/bark/convert_suno_to_hf.py +++ /dev/null @@ -1,263 +0,0 @@ -"""Convert Bark checkpoint.""" - -import argparse -import os -from pathlib import Path - -import torch -from bark.generation import _load_model as _bark_load_model -from huggingface_hub import hf_hub_download - -from transformers import EncodecConfig, EncodecModel, set_seed -from transformers.models.bark.configuration_bark import ( - BarkCoarseConfig, - BarkConfig, - BarkFineConfig, - BarkSemanticConfig, -) -from transformers.models.bark.generation_configuration_bark import ( - BarkCoarseGenerationConfig, - BarkFineGenerationConfig, - BarkGenerationConfig, - BarkSemanticGenerationConfig, -) -from transformers.models.bark.modeling_bark import BarkCoarseModel, BarkFineModel, BarkModel, BarkSemanticModel -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -set_seed(770) - - -new_layer_name_dict = { - "c_attn": "att_proj", - "c_proj": "out_proj", - "c_fc": "in_proj", - "transformer.": "", - "h.": "layers.", - "ln_1": "layernorm_1", - "ln_2": "layernorm_2", - "ln_f": "layernorm_final", - "wpe": "position_embeds_layer", - "wte": "input_embeds_layer", -} - - -REMOTE_MODEL_PATHS = { - "text_small": { - "repo_id": "suno/bark", - "file_name": "text.pt", - }, - "coarse_small": { - "repo_id": "suno/bark", - "file_name": "coarse.pt", - }, - "fine_small": { - "repo_id": "suno/bark", - "file_name": "fine.pt", - }, - "text": { - "repo_id": "suno/bark", - "file_name": "text_2.pt", - }, - "coarse": { - "repo_id": "suno/bark", - "file_name": "coarse_2.pt", - }, - "fine": { - "repo_id": "suno/bark", - "file_name": "fine_2.pt", - }, -} - -CUR_PATH = os.path.dirname(os.path.abspath(__file__)) -default_cache_dir = os.path.join(os.path.expanduser("~"), ".cache") -CACHE_DIR = os.path.join(os.getenv("XDG_CACHE_HOME", default_cache_dir), "suno", "bark_v0") - - -def _get_ckpt_path(model_type, use_small=False): - key = model_type - if use_small: - key += "_small" - return os.path.join(CACHE_DIR, REMOTE_MODEL_PATHS[key]["file_name"]) - - -def _download(from_hf_path, file_name): - os.makedirs(CACHE_DIR, exist_ok=True) - hf_hub_download(repo_id=from_hf_path, filename=file_name, local_dir=CACHE_DIR) - - -def _load_model(ckpt_path, device, use_small=False, model_type="text"): - if model_type == "text": - ModelClass = BarkSemanticModel - ConfigClass = BarkSemanticConfig - GenerationConfigClass = 
BarkSemanticGenerationConfig - elif model_type == "coarse": - ModelClass = BarkCoarseModel - ConfigClass = BarkCoarseConfig - GenerationConfigClass = BarkCoarseGenerationConfig - elif model_type == "fine": - ModelClass = BarkFineModel - ConfigClass = BarkFineConfig - GenerationConfigClass = BarkFineGenerationConfig - else: - raise NotImplementedError() - model_key = f"{model_type}_small" if use_small else model_type - model_info = REMOTE_MODEL_PATHS[model_key] - if not os.path.exists(ckpt_path): - logger.info(f"{model_type} model not found, downloading into `{CACHE_DIR}`.") - _download(model_info["repo_id"], model_info["file_name"]) - checkpoint = torch.load(ckpt_path, map_location=device, weights_only=True) - # this is a hack - model_args = checkpoint["model_args"] - if "input_vocab_size" not in model_args: - model_args["input_vocab_size"] = model_args["vocab_size"] - model_args["output_vocab_size"] = model_args["vocab_size"] - del model_args["vocab_size"] - - # convert Bark model arguments to HF Bark model arguments - model_args["num_heads"] = model_args.pop("n_head") - model_args["hidden_size"] = model_args.pop("n_embd") - model_args["num_layers"] = model_args.pop("n_layer") - - model_config = ConfigClass(**checkpoint["model_args"]) - model = ModelClass(config=model_config) - model_generation_config = GenerationConfigClass() - - model.generation_config = model_generation_config - state_dict = checkpoint["model"] - # fixup checkpoint - unwanted_prefix = "_orig_mod." - for k in state_dict: - if k.startswith(unwanted_prefix): - # replace part of the key with corresponding layer name in HF implementation - new_k = k[len(unwanted_prefix) :] - for old_layer_name, new_layer_name in new_layer_name_dict.items(): - new_k = new_k.replace(old_layer_name, new_layer_name) - - state_dict[new_k] = state_dict.pop(k) - - extra_keys = set(state_dict.keys()) - set(model.state_dict().keys()) - extra_keys = {k for k in extra_keys if not k.endswith(".attn.bias")} - missing_keys = set(model.state_dict().keys()) - set(state_dict.keys()) - missing_keys = {k for k in missing_keys if not k.endswith(".attn.bias")} - if len(extra_keys) != 0: - raise ValueError(f"extra keys found: {extra_keys}") - if len(missing_keys) != 0: - raise ValueError(f"missing keys: {missing_keys}") - model.load_state_dict(state_dict, strict=False) - n_params = model.num_parameters(exclude_embeddings=True) - val_loss = checkpoint["best_val_loss"].item() - logger.info(f"model loaded: {round(n_params / 1e6, 1)}M params, {round(val_loss, 3)} loss") - model.eval() - model.to(device) - del checkpoint, state_dict - - return model - - -def load_model(pytorch_dump_folder_path, use_small=False, model_type="text"): - if model_type not in ("text", "coarse", "fine"): - raise NotImplementedError() - - device = "cpu" # do conversion on cpu - - ckpt_path = _get_ckpt_path(model_type, use_small=use_small) - model = _load_model(ckpt_path, device, model_type=model_type, use_small=use_small) - - # load bark initial model - bark_model = _bark_load_model(ckpt_path, "cpu", model_type=model_type, use_small=use_small) - - if model_type == "text": - bark_model = bark_model["model"] - - if model.num_parameters(exclude_embeddings=True) != bark_model.get_num_params(): - raise ValueError("initial and new models don't have the same number of parameters") - - # check if same output as the bark model - batch_size = 5 - sequence_length = 10 - - if model_type in ["text", "coarse"]: - vec = torch.randint(256, (batch_size, sequence_length), dtype=torch.int) - output_old_model = 
bark_model(vec)[0] - - output_new_model_total = model(vec) - - # take last logits - output_new_model = output_new_model_total.logits[:, [-1], :] - - else: - prediction_codebook_channel = 3 - n_codes_total = 8 - vec = torch.randint(256, (batch_size, sequence_length, n_codes_total), dtype=torch.int) - - output_new_model_total = model(prediction_codebook_channel, vec) - output_old_model = bark_model(prediction_codebook_channel, vec) - - output_new_model = output_new_model_total.logits - - # output difference should come from the difference of self-attention implementation design - if output_new_model.shape != output_old_model.shape: - raise ValueError("initial and new outputs don't have the same shape") - if (output_new_model - output_old_model).abs().max().item() > 1e-3: - raise ValueError("initial and new outputs are not equal") - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - - -def load_whole_bark_model( - semantic_path, - coarse_path, - fine_path, - append_text, - hub_path, - folder_path, -): - pytorch_dump_folder_path = os.path.join(folder_path, append_text) - - semanticConfig = BarkSemanticConfig.from_pretrained(os.path.join(semantic_path, "config.json")) - coarseAcousticConfig = BarkCoarseConfig.from_pretrained(os.path.join(coarse_path, "config.json")) - fineAcousticConfig = BarkFineConfig.from_pretrained(os.path.join(fine_path, "config.json")) - codecConfig = EncodecConfig.from_pretrained("facebook/encodec_24khz") - - semantic = BarkSemanticModel.from_pretrained(semantic_path) - coarseAcoustic = BarkCoarseModel.from_pretrained(coarse_path) - fineAcoustic = BarkFineModel.from_pretrained(fine_path) - codec = EncodecModel.from_pretrained("facebook/encodec_24khz") - - bark_config = BarkConfig.from_sub_model_configs( - semanticConfig, coarseAcousticConfig, fineAcousticConfig, codecConfig - ) - - bark_generation_config = BarkGenerationConfig.from_sub_model_configs( - semantic.generation_config, coarseAcoustic.generation_config, fineAcoustic.generation_config - ) - - bark = BarkModel(bark_config) - - bark.semantic = semantic - bark.coarse_acoustics = coarseAcoustic - bark.fine_acoustics = fineAcoustic - bark.codec_model = codec - - bark.generation_config = bark_generation_config - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - bark.save_pretrained(pytorch_dump_folder_path, repo_id=hub_path, push_to_hub=True) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - - parser.add_argument("model_type", type=str, help="text, coarse or fine.") - parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--is_small", action="store_true", help="convert the small version instead of the large.") - - args = parser.parse_args() - - load_model(args.pytorch_dump_folder_path, model_type=args.model_type, use_small=args.is_small) diff --git a/src/transformers/models/bark/modeling_bark.py b/src/transformers/models/bark/modeling_bark.py index 8770e3e0691b..af57f7826734 100644 --- a/src/transformers/models/bark/modeling_bark.py +++ b/src/transformers/models/bark/modeling_bark.py @@ -595,7 +595,7 @@ class BarkSemanticModel(BarkCausalModel): def generate( self, input_ids: torch.Tensor, - semantic_generation_config: BarkSemanticGenerationConfig = None, + semantic_generation_config: Optional[BarkSemanticGenerationConfig] = None, history_prompt: Optional[dict[str, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, 
**kwargs, @@ -780,8 +780,8 @@ def preprocess_histories( def generate( self, semantic_output: torch.Tensor, - semantic_generation_config: BarkSemanticGenerationConfig = None, - coarse_generation_config: BarkCoarseGenerationConfig = None, + semantic_generation_config: Optional[BarkSemanticGenerationConfig] = None, + coarse_generation_config: Optional[BarkCoarseGenerationConfig] = None, codebook_size: int = 1024, history_prompt: Optional[dict[str, torch.Tensor]] = None, return_output_lengths: Optional[bool] = None, @@ -1192,8 +1192,8 @@ def forward( def generate( self, coarse_output: torch.Tensor, - semantic_generation_config: BarkSemanticGenerationConfig = None, - coarse_generation_config: BarkCoarseGenerationConfig = None, + semantic_generation_config: Optional[BarkSemanticGenerationConfig] = None, + coarse_generation_config: Optional[BarkCoarseGenerationConfig] = None, fine_generation_config: BarkFineGenerationConfig = None, codebook_size: int = 1024, history_prompt: Optional[dict[str, torch.Tensor]] = None, diff --git a/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 84dc415443f0..000000000000 --- a/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,156 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert BART checkpoint.""" - -import argparse -import os -from pathlib import Path - -import fairseq -import torch -from packaging import version -from torch import nn - -from transformers import ( - BartConfig, - BartForConditionalGeneration, - BartForSequenceClassification, - BartModel, - BartTokenizer, -) -from transformers.utils import logging - - -FAIRSEQ_MODELS = ["bart.large", "bart.large.mnli", "bart.large.cnn", "bart_xsum/model.pt"] -extra_arch = {"bart.large": BartModel, "bart.large.mnli": BartForSequenceClassification} -if version.parse(fairseq.__version__) < version.parse("0.9.0"): - raise Exception("requires fairseq >= 0.9.0") - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -SAMPLE_TEXT = " Hello world! 
cécé herlolip" - -mnli_rename_keys = [ - ("model.classification_heads.mnli.dense.weight", "classification_head.dense.weight"), - ("model.classification_heads.mnli.dense.bias", "classification_head.dense.bias"), - ("model.classification_heads.mnli.out_proj.weight", "classification_head.out_proj.weight"), - ("model.classification_heads.mnli.out_proj.bias", "classification_head.out_proj.bias"), -] - - -def remove_ignore_keys_(state_dict): - ignore_keys = [ - "encoder.version", - "decoder.version", - "model.encoder.version", - "model.decoder.version", - "_float_tensor", - ] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -def load_xsum_checkpoint(checkpoint_path): - """Checkpoint path should end in model.pt""" - sd = torch.load(checkpoint_path, map_location="cpu", weights_only=True) - hub_interface = torch.hub.load("pytorch/fairseq", "bart.large.cnn").eval() - hub_interface.model.load_state_dict(sd["model"]) - return hub_interface - - -def make_linear_from_emb(emb): - vocab_size, emb_size = emb.weight.shape - lin_layer = nn.Linear(vocab_size, emb_size, bias=False) - lin_layer.weight.data = emb.weight.data - return lin_layer - - -@torch.no_grad() -def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path, hf_checkpoint_name=None): - """ - Copy/paste/tweak model's weights to our BERT structure. - """ - if not os.path.exists(checkpoint_path): - bart = torch.hub.load("pytorch/fairseq", checkpoint_path).eval() - else: - bart = load_xsum_checkpoint(checkpoint_path) - - bart.model.upgrade_state_dict(bart.model.state_dict()) - if hf_checkpoint_name is None: - hf_checkpoint_name = checkpoint_path.replace(".", "-") - config = BartConfig.from_pretrained(hf_checkpoint_name) - tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0) - tokens2 = BartTokenizer.from_pretrained(hf_checkpoint_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0) - if not torch.eq(tokens, tokens2).all(): - raise ValueError( - f"converted tokenizer and pretrained tokenizer returned different output: {tokens} != {tokens2}" - ) - - if checkpoint_path == "bart.large.mnli": - state_dict = bart.state_dict() - remove_ignore_keys_(state_dict) - state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"] - for src, dest in mnli_rename_keys: - rename_key(state_dict, src, dest) - model = BartForSequenceClassification(config).eval() - model.load_state_dict(state_dict) - fairseq_output = bart.predict("mnli", tokens, return_logits=True) - new_model_outputs = model(tokens)[0] # logits - else: # no classification heads to worry about - state_dict = bart.model.state_dict() - remove_ignore_keys_(state_dict) - state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"] - fairseq_output = bart.extract_features(tokens) - if hf_checkpoint_name == "facebook/bart-large": - model = BartModel(config).eval() - model.load_state_dict(state_dict) - new_model_outputs = model(tokens).model[0] - else: - model = BartForConditionalGeneration(config).eval() # an existing summarization ckpt - model.model.load_state_dict(state_dict) - if hasattr(model, "lm_head"): - model.lm_head = make_linear_from_emb(model.model.shared) - new_model_outputs = model.model(tokens)[0] - - # Check results - if fairseq_output.shape != new_model_outputs.shape: - raise ValueError( - f"`fairseq_output` shape and `new_model_output` shape are different: {fairseq_output.shape=}, {new_model_outputs.shape}" - ) - if (fairseq_output != new_model_outputs).any().item(): - 
raise ValueError("Some values in `fairseq_output` are different from `new_model_outputs`") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "fairseq_path", type=str, help="bart.large, bart.large.cnn or a path to a model.pt on local filesystem." - ) - parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument( - "--hf_config", default=None, type=str, help="Which huggingface architecture to use: bart-large-xsum" - ) - args = parser.parse_args() - convert_bart_checkpoint(args.fairseq_path, args.pytorch_dump_folder_path, hf_checkpoint_name=args.hf_config) diff --git a/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py b/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py deleted file mode 100644 index c2e366d7dd02..000000000000 --- a/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py +++ /dev/null @@ -1,373 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert BEiT checkpoints from the unilm repository.""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from datasets import load_dataset -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import ( - BeitConfig, - BeitForImageClassification, - BeitForMaskedImageModeling, - BeitForSemanticSegmentation, - BeitImageProcessor, -) -from transformers.image_utils import PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config, has_lm_head=False, is_semantic=False): - prefix = "backbone." 
if is_semantic else "" - - rename_keys = [] - for i in range(config.num_hidden_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append((f"{prefix}blocks.{i}.norm1.weight", f"beit.encoder.layer.{i}.layernorm_before.weight")) - rename_keys.append((f"{prefix}blocks.{i}.norm1.bias", f"beit.encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append( - (f"{prefix}blocks.{i}.attn.proj.weight", f"beit.encoder.layer.{i}.attention.output.dense.weight") - ) - rename_keys.append( - (f"{prefix}blocks.{i}.attn.proj.bias", f"beit.encoder.layer.{i}.attention.output.dense.bias") - ) - rename_keys.append((f"{prefix}blocks.{i}.norm2.weight", f"beit.encoder.layer.{i}.layernorm_after.weight")) - rename_keys.append((f"{prefix}blocks.{i}.norm2.bias", f"beit.encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc1.weight", f"beit.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc1.bias", f"beit.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.weight", f"beit.encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.bias", f"beit.encoder.layer.{i}.output.dense.bias")) - - # projection layer + position embeddings - rename_keys.extend( - [ - (f"{prefix}cls_token", "beit.embeddings.cls_token"), - (f"{prefix}patch_embed.proj.weight", "beit.embeddings.patch_embeddings.projection.weight"), - (f"{prefix}patch_embed.proj.bias", "beit.embeddings.patch_embeddings.projection.bias"), - ] - ) - - if has_lm_head: - # mask token + shared relative position bias + layernorm - rename_keys.extend( - [ - ("mask_token", "beit.embeddings.mask_token"), - ( - "rel_pos_bias.relative_position_bias_table", - "beit.encoder.relative_position_bias.relative_position_bias_table", - ), - ( - "rel_pos_bias.relative_position_index", - "beit.encoder.relative_position_bias.relative_position_index", - ), - ("norm.weight", "layernorm.weight"), - ("norm.bias", "layernorm.bias"), - ] - ) - elif is_semantic: - # semantic segmentation classification heads - rename_keys.extend( - [ - ("decode_head.conv_seg.weight", "decode_head.classifier.weight"), - ("decode_head.conv_seg.bias", "decode_head.classifier.bias"), - ("auxiliary_head.conv_seg.weight", "auxiliary_head.classifier.weight"), - ("auxiliary_head.conv_seg.bias", "auxiliary_head.classifier.bias"), - ] - ) - else: - # layernorm + classification head - rename_keys.extend( - [ - ("fc_norm.weight", "beit.pooler.layernorm.weight"), - ("fc_norm.bias", "beit.pooler.layernorm.bias"), - ("head.weight", "classifier.weight"), - ("head.bias", "classifier.bias"), - ] - ) - - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config, has_lm_head=False, is_semantic=False): - for i in range(config.num_hidden_layers): - prefix = "backbone." 
if is_semantic else "" - # queries, keys and values - in_proj_weight = state_dict.pop(f"{prefix}blocks.{i}.attn.qkv.weight") - q_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.v_bias") - - state_dict[f"beit.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ - : config.hidden_size, : - ] - state_dict[f"beit.encoder.layer.{i}.attention.attention.query.bias"] = q_bias - state_dict[f"beit.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"beit.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"beit.encoder.layer.{i}.attention.attention.value.bias"] = v_bias - - # gamma_1 and gamma_2 - # we call them lambda because otherwise they are renamed when using .from_pretrained - gamma_1 = state_dict.pop(f"{prefix}blocks.{i}.gamma_1") - gamma_2 = state_dict.pop(f"{prefix}blocks.{i}.gamma_2") - - state_dict[f"beit.encoder.layer.{i}.lambda_1"] = gamma_1 - state_dict[f"beit.encoder.layer.{i}.lambda_2"] = gamma_2 - - # relative_position bias table + index - if not has_lm_head: - # each layer has its own relative position bias - table = state_dict.pop(f"{prefix}blocks.{i}.attn.relative_position_bias_table") - index = state_dict.pop(f"{prefix}blocks.{i}.attn.relative_position_index") - - state_dict[ - f"beit.encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_bias_table" - ] = table - state_dict[ - f"beit.encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_index" - ] = index - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_beit_checkpoint(checkpoint_url, pytorch_dump_folder_path): - """ - Copy/paste/tweak model's weights to our BEiT structure. 
- """ - - # define default BEiT configuration - config = BeitConfig() - has_lm_head = False - is_semantic = False - repo_id = "huggingface/label-files" - # set config parameters based on URL - if checkpoint_url[-9:-4] == "pt22k": - # masked image modeling - config.use_shared_relative_position_bias = True - config.use_mask_token = True - has_lm_head = True - elif checkpoint_url[-9:-4] == "ft22k": - # intermediate fine-tuning on ImageNet-22k - config.use_relative_position_bias = True - config.num_labels = 21841 - filename = "imagenet-22k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - # this dataset contains 21843 labels but the model only has 21841 - # we delete the classes as mentioned in https://github.com/google-research/big_transfer/issues/18 - del id2label[9205] - del id2label[15027] - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - elif checkpoint_url[-8:-4] == "to1k": - # fine-tuning on ImageNet-1k - config.use_relative_position_bias = True - config.num_labels = 1000 - filename = "imagenet-1k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - if "384" in checkpoint_url: - config.image_size = 384 - if "512" in checkpoint_url: - config.image_size = 512 - elif "ade20k" in checkpoint_url: - # fine-tuning - config.use_relative_position_bias = True - config.num_labels = 150 - filename = "ade20k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - config.image_size = 640 - is_semantic = True - else: - raise ValueError("Checkpoint not supported, URL should either end with 'pt22k', 'ft22k', 'to1k' or 'ade20k'") - - # size of the architecture - if "base" in checkpoint_url: - pass - elif "large" in checkpoint_url: - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - if "ade20k" in checkpoint_url: - config.image_size = 640 - config.out_indices = [7, 11, 15, 23] - else: - raise ValueError("Should either find 'base' or 'large' in checkpoint URL") - - # load state_dict of original model, remove and rename some keys - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", check_hash=True) - state_dict = state_dict["model"] if "ade20k" not in checkpoint_url else state_dict["state_dict"] - - rename_keys = create_rename_keys(config, has_lm_head=has_lm_head, is_semantic=is_semantic) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config, has_lm_head=has_lm_head, is_semantic=is_semantic) - if is_semantic: - # add prefix to decoder keys - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if key.startswith("backbone.fpn"): - key = key.replace("backbone.fpn", "fpn") - state_dict[key] = val - - # load HuggingFace model - if checkpoint_url[-9:-4] == "pt22k": - model = BeitForMaskedImageModeling(config) - elif "ade20k" in checkpoint_url: - model = BeitForSemanticSegmentation(config) - else: - model = BeitForImageClassification(config) - model.eval() - model.load_state_dict(state_dict) - - # Check outputs 
on an image - if is_semantic: - image_processor = BeitImageProcessor(size=config.image_size, do_center_crop=False) - ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test") - image = Image.open(ds[0]["file"]) - else: - image_processor = BeitImageProcessor( - size=config.image_size, resample=PILImageResampling.BILINEAR, do_center_crop=False - ) - image = prepare_img() - - encoding = image_processor(images=image, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - outputs = model(pixel_values) - logits = outputs.logits - - # verify logits - expected_shape = torch.Size([1, 1000]) - if checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k"): - expected_shape = torch.Size([1, 196, 8192]) - elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k"): - expected_shape = torch.Size([1, 196, 8192]) - elif checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k_ft22k"): - expected_shape = torch.Size([1, 21841]) - expected_logits = torch.tensor([2.2288, 2.4671, 0.7395]) - expected_class_idx = 2397 - elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k_ft22k"): - expected_shape = torch.Size([1, 21841]) - expected_logits = torch.tensor([1.6881, -0.2787, 0.5901]) - expected_class_idx = 2396 - elif checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k_ft1k"): - expected_logits = torch.tensor([0.1241, 0.0798, -0.6569]) - expected_class_idx = 285 - elif checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k_ft22kto1k"): - expected_logits = torch.tensor([-1.2385, -1.0987, -1.0108]) - expected_class_idx = 281 - elif checkpoint_url[:-4].endswith("beit_base_patch16_384_pt22k_ft22kto1k"): - expected_logits = torch.tensor([-1.5303, -0.9484, -0.3147]) - expected_class_idx = 761 - elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k_ft1k"): - expected_logits = torch.tensor([0.4610, -0.0928, 0.2086]) - expected_class_idx = 761 - elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k_ft22kto1k"): - expected_logits = torch.tensor([-0.4804, 0.6257, -0.1837]) - expected_class_idx = 761 - elif checkpoint_url[:-4].endswith("beit_large_patch16_384_pt22k_ft22kto1k"): - expected_logits = torch.tensor([[-0.5122, 0.5117, -0.2113]]) - expected_class_idx = 761 - elif checkpoint_url[:-4].endswith("beit_large_patch16_512_pt22k_ft22kto1k"): - expected_logits = torch.tensor([-0.3062, 0.7261, 0.4852]) - expected_class_idx = 761 - elif checkpoint_url[:-4].endswith("beit_base_patch16_640_pt22k_ft22ktoade20k"): - expected_shape = (1, 150, 160, 160) - expected_logits = torch.tensor( - [ - [[-4.9225, -2.3954, -3.0522], [-2.8822, -1.0046, -1.7561], [-2.9549, -1.3228, -2.1347]], - [[-5.8168, -3.4129, -4.0778], [-3.8651, -2.2214, -3.0277], [-3.8356, -2.4643, -3.3535]], - [[-0.0078, 3.9952, 4.0754], [2.9856, 4.6944, 5.0035], [3.2413, 4.7813, 4.9969]], - ] - ) - elif checkpoint_url[:-4].endswith("beit_large_patch16_640_pt22k_ft22ktoade20k"): - expected_shape = (1, 150, 160, 160) - expected_logits = torch.tensor( - [ - [[-4.3305, -2.3049, -3.0161], [-2.9591, -1.5305, -2.2251], [-3.4198, -1.8004, -2.9062]], - [[-5.8922, -3.7435, -4.3978], [-4.2063, -2.7872, -3.4755], [-4.2791, -3.1874, -4.1681]], - [[0.9895, 4.3467, 4.7663], [4.2476, 5.6830, 6.1518], [4.5550, 6.2495, 6.5154]], - ] - ) - else: - raise ValueError("Can't verify logits as model is not supported") - - if logits.shape != expected_shape: - raise ValueError(f"Shape of logits not as expected. 
{logits.shape=}, {expected_shape=}") - if not has_lm_head: - if is_semantic: - if not torch.allclose(logits[0, :3, :3, :3], expected_logits, atol=1e-3): - raise ValueError("First elements of logits not as expected") - else: - print("Predicted class idx:", logits.argmax(-1).item()) - - if not torch.allclose(logits[0, :3], expected_logits, atol=1e-3): - raise ValueError("First elements of logits not as expected") - if logits.argmax(-1).item() != expected_class_idx: - raise ValueError("Predicted class index not as expected") - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_url", - default="https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22kto1k.pth", - type=str, - help="URL to the original PyTorch checkpoint (.pth file).", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." - ) - args = parser.parse_args() - convert_beit_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/beit/image_processing_beit_fast.py b/src/transformers/models/beit/image_processing_beit_fast.py index e10dc552cf37..4518043e6841 100644 --- a/src/transformers/models/beit/image_processing_beit_fast.py +++ b/src/transformers/models/beit/image_processing_beit_fast.py @@ -17,6 +17,7 @@ from typing import Optional, Union import torch +from torchvision.transforms.v2 import functional as F from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( @@ -38,16 +39,9 @@ from ...utils import ( TensorType, auto_docstring, - is_torchvision_v2_available, ) -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - - class BeitFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): r""" do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): diff --git a/src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py b/src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py deleted file mode 100644 index 9dfd8da474e3..000000000000 --- a/src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py +++ /dev/null @@ -1,246 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" -This script can be used to convert a head-less TF2.x Bert model to PyTorch, as published on the official (now -deprecated) GitHub: https://github.com/tensorflow/models/tree/v2.3.0/official/nlp/bert - -TF2.x uses different variable names from the original BERT (TF 1.4) implementation. The script re-maps the TF2.x Bert -weight names to the original names, so the model can be imported with Huggingface/transformers. - -You may adapt this script to include classification/MLM/NSP/etc. heads. - -Note: This script only works with an older version of the TensorFlow models repository (<= v2.3.0). - Models trained with newer versions are not compatible with this script. -""" - -import argparse -import os -import re - -import tensorflow as tf -import torch - -from transformers import BertConfig, BertModel -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def load_tf2_weights_in_bert(model, tf_checkpoint_path, config): - tf_path = os.path.abspath(tf_checkpoint_path) - logger.info(f"Converting TensorFlow checkpoint from {tf_path}") - # Load weights from TF model - init_vars = tf.train.list_variables(tf_path) - names = [] - arrays = [] - layer_depth = [] - for full_name, shape in init_vars: - # logger.info(f"Loading TF weight {name} with shape {shape}") - name = full_name.split("/") - if full_name == "_CHECKPOINTABLE_OBJECT_GRAPH" or name[0] in ["global_step", "save_counter"]: - logger.info(f"Skipping non-model layer {full_name}") - continue - if "optimizer" in full_name: - logger.info(f"Skipping optimization layer {full_name}") - continue - if name[0] == "model": - # ignore initial 'model' - name = name[1:] - # figure out how many levels deep the name is - depth = 0 - for _name in name: - if _name.startswith("layer_with_weights"): - depth += 1 - else: - break - layer_depth.append(depth) - # read data - array = tf.train.load_variable(tf_path, full_name) - names.append("/".join(name)) - arrays.append(array) - logger.info(f"Read a total of {len(arrays):,} layers") - - # Sanity check - if len(set(layer_depth)) != 1: - raise ValueError(f"Found layer names with different depths (layer depth {list(set(layer_depth))})") - layer_depth = list(set(layer_depth))[0] - if layer_depth != 1: - raise ValueError( - "The model contains more than just the embedding/encoder layers. This script does not handle MLM/NSP" - " heads."
- ) - - # convert layers - logger.info("Converting weights...") - for full_name, array in zip(names, arrays): - name = full_name.split("/") - pointer = model - trace = [] - for i, m_name in enumerate(name): - if m_name == ".ATTRIBUTES": - # variable names end with .ATTRIBUTES/VARIABLE_VALUE - break - if m_name.startswith("layer_with_weights"): - layer_num = int(m_name.split("-")[-1]) - if layer_num <= 2: - # embedding layers - # layer_num 0: word_embeddings - # layer_num 1: position_embeddings - # layer_num 2: token_type_embeddings - continue - elif layer_num == 3: - # embedding LayerNorm - trace.extend(["embeddings", "LayerNorm"]) - pointer = getattr(pointer, "embeddings") - pointer = getattr(pointer, "LayerNorm") - elif layer_num > 3 and layer_num < config.num_hidden_layers + 4: - # encoder layers - trace.extend(["encoder", "layer", str(layer_num - 4)]) - pointer = getattr(pointer, "encoder") - pointer = getattr(pointer, "layer") - pointer = pointer[layer_num - 4] - elif layer_num == config.num_hidden_layers + 4: - # pooler layer - trace.extend(["pooler", "dense"]) - pointer = getattr(pointer, "pooler") - pointer = getattr(pointer, "dense") - elif m_name == "embeddings": - trace.append("embeddings") - pointer = getattr(pointer, "embeddings") - if layer_num == 0: - trace.append("word_embeddings") - pointer = getattr(pointer, "word_embeddings") - elif layer_num == 1: - trace.append("position_embeddings") - pointer = getattr(pointer, "position_embeddings") - elif layer_num == 2: - trace.append("token_type_embeddings") - pointer = getattr(pointer, "token_type_embeddings") - else: - raise ValueError(f"Unknown embedding layer with name {full_name}") - trace.append("weight") - pointer = getattr(pointer, "weight") - elif m_name == "_attention_layer": - # self-attention layer - trace.extend(["attention", "self"]) - pointer = getattr(pointer, "attention") - pointer = getattr(pointer, "self") - elif m_name == "_attention_layer_norm": - # output attention norm - trace.extend(["attention", "output", "LayerNorm"]) - pointer = getattr(pointer, "attention") - pointer = getattr(pointer, "output") - pointer = getattr(pointer, "LayerNorm") - elif m_name == "_attention_output_dense": - # output attention dense - trace.extend(["attention", "output", "dense"]) - pointer = getattr(pointer, "attention") - pointer = getattr(pointer, "output") - pointer = getattr(pointer, "dense") - elif m_name == "_output_dense": - # output dense - trace.extend(["output", "dense"]) - pointer = getattr(pointer, "output") - pointer = getattr(pointer, "dense") - elif m_name == "_output_layer_norm": - # output dense - trace.extend(["output", "LayerNorm"]) - pointer = getattr(pointer, "output") - pointer = getattr(pointer, "LayerNorm") - elif m_name == "_key_dense": - # attention key - trace.append("key") - pointer = getattr(pointer, "key") - elif m_name == "_query_dense": - # attention query - trace.append("query") - pointer = getattr(pointer, "query") - elif m_name == "_value_dense": - # attention value - trace.append("value") - pointer = getattr(pointer, "value") - elif m_name == "_intermediate_dense": - # attention intermediate dense - trace.extend(["intermediate", "dense"]) - pointer = getattr(pointer, "intermediate") - pointer = getattr(pointer, "dense") - elif m_name == "_output_layer_norm": - # output layer norm - trace.append("output") - pointer = getattr(pointer, "output") - # weights & biases - elif m_name in ["bias", "beta"]: - trace.append("bias") - pointer = getattr(pointer, "bias") - elif m_name in ["kernel", "gamma"]: 
- trace.append("weight") - pointer = getattr(pointer, "weight") - else: - logger.warning(f"Ignored {m_name}") - # for certain layers reshape is necessary - trace = ".".join(trace) - if re.match(r"(\S+)\.attention\.self\.(key|value|query)\.(bias|weight)", trace) or re.match( - r"(\S+)\.attention\.output\.dense\.weight", trace - ): - array = array.reshape(pointer.data.shape) - if "kernel" in full_name: - array = array.transpose() - if pointer.shape == array.shape: - pointer.data = torch.from_numpy(array) - else: - raise ValueError( - f"Shape mismatch in layer {full_name}: Model expects shape {pointer.shape} but layer contains shape:" - f" {array.shape}" - ) - logger.info(f"Successfully set variable {full_name} to PyTorch layer {trace}") - return model - - -def convert_tf2_checkpoint_to_pytorch(tf_checkpoint_path, config_path, pytorch_dump_path): - # Instantiate model - logger.info(f"Loading model based on config from {config_path}...") - config = BertConfig.from_json_file(config_path) - model = BertModel(config) - - # Load weights from checkpoint - logger.info(f"Loading weights from checkpoint {tf_checkpoint_path}...") - load_tf2_weights_in_bert(model, tf_checkpoint_path, config) - - # Save pytorch-model - logger.info(f"Saving PyTorch model to {pytorch_dump_path}...") - torch.save(model.state_dict(), pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--tf_checkpoint_path", type=str, required=True, help="Path to the TensorFlow 2.x checkpoint path." - ) - parser.add_argument( - "--bert_config_file", - type=str, - required=True, - help="The config json file corresponding to the BERT model. This specifies the model architecture.", - ) - parser.add_argument( - "--pytorch_dump_path", - type=str, - required=True, - help="Path to the output PyTorch model (must include filename).", - ) - args = parser.parse_args() - convert_tf2_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py deleted file mode 100755 index be904ddd7e6c..000000000000 --- a/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,62 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Convert BERT checkpoint.""" - -import argparse - -import torch - -from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): - # Initialise PyTorch model - config = BertConfig.from_json_file(bert_config_file) - print(f"Building PyTorch model from configuration: {config}") - model = BertForPreTraining(config) - - # Load weights from tf checkpoint - load_tf_weights_in_bert(model, config, tf_checkpoint_path) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - torch.save(model.state_dict(), pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--bert_config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained BERT model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py b/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py deleted file mode 100644 index 8e1e85d5c04e..000000000000 --- a/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py +++ /dev/null @@ -1,112 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint.""" - -import argparse -import os - -import numpy as np -import tensorflow as tf -import torch - -from transformers import BertModel - - -def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str): - """ - Args: - model: BertModel Pytorch model instance to be converted - ckpt_dir: Tensorflow model directory - model_name: model name - - Currently supported HF models: - - - Y BertModel - - N BertForMaskedLM - - N BertForPreTraining - - N BertForMultipleChoice - - N BertForNextSentencePrediction - - N BertForSequenceClassification - - N BertForQuestionAnswering - """ - - tensors_to_transpose = ("dense.weight", "attention.self.query", "attention.self.key", "attention.self.value") - - var_map = ( - ("layer.", "layer_"), - ("word_embeddings.weight", "word_embeddings"), - ("position_embeddings.weight", "position_embeddings"), - ("token_type_embeddings.weight", "token_type_embeddings"), - (".", "/"), - ("LayerNorm/weight", "LayerNorm/gamma"), - ("LayerNorm/bias", "LayerNorm/beta"), - ("weight", "kernel"), - ) - - if not os.path.isdir(ckpt_dir): - os.makedirs(ckpt_dir) - - state_dict = model.state_dict() - - def to_tf_var_name(name: str): - for patt, repl in iter(var_map): - name = name.replace(patt, repl) - return f"bert/{name}" - - def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session): - tf_dtype = tf.dtypes.as_dtype(tensor.dtype) - tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer()) - session.run(tf.variables_initializer([tf_var])) - session.run(tf_var) - return tf_var - - tf.reset_default_graph() - with tf.Session() as session: - for var_name in state_dict: - tf_name = to_tf_var_name(var_name) - torch_tensor = state_dict[var_name].numpy() - if any(x in var_name for x in tensors_to_transpose): - torch_tensor = torch_tensor.T - tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) - tf_var.assign(tf.cast(torch_tensor, tf_var.dtype)) - tf_weight = session.run(tf_var) - print(f"Successfully created {tf_name}: {np.allclose(tf_weight, torch_tensor)}") - - saver = tf.train.Saver(tf.trainable_variables()) - saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt")) - - -def main(raw_args=None): - parser = argparse.ArgumentParser() - parser.add_argument("--model_name", type=str, required=True, help="model name e.g. 
google-bert/bert-base-uncased") - parser.add_argument( - "--cache_dir", type=str, default=None, required=False, help="Directory containing pytorch model" - ) - parser.add_argument("--pytorch_model_path", type=str, required=True, help="/path/to/.bin") - parser.add_argument("--tf_cache_dir", type=str, required=True, help="Directory in which to save tensorflow model") - args = parser.parse_args(raw_args) - - model = BertModel.from_pretrained( - pretrained_model_name_or_path=args.model_name, - state_dict=torch.load(args.pytorch_model_path, weights_only=True), - cache_dir=args.cache_dir, - ) - - convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir=args.tf_cache_dir, model_name=args.model_name) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py b/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py deleted file mode 100644 index a7832a53d55d..000000000000 --- a/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script converts a lm-head checkpoint from the "Token Dropping" implementation into a PyTorch-compatible BERT -model. 
The official implementation of "Token Dropping" can be found in the TensorFlow Models repository: - -https://github.com/tensorflow/models/tree/master/official/projects/token_dropping -""" - -import argparse - -import tensorflow as tf -import torch - -from transformers import BertConfig, BertForMaskedLM -from transformers.models.bert.modeling_bert import ( - BertIntermediate, - BertLayer, - BertOutput, - BertPooler, - BertSelfAttention, - BertSelfOutput, -) -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_checkpoint_to_pytorch(tf_checkpoint_path: str, config_path: str, pytorch_dump_path: str): - def get_masked_lm_array(name: str): - full_name = f"masked_lm/{name}/.ATTRIBUTES/VARIABLE_VALUE" - array = tf.train.load_variable(tf_checkpoint_path, full_name) - - if "kernel" in name: - array = array.transpose() - - return torch.from_numpy(array) - - def get_encoder_array(name: str): - full_name = f"encoder/{name}/.ATTRIBUTES/VARIABLE_VALUE" - array = tf.train.load_variable(tf_checkpoint_path, full_name) - - if "kernel" in name: - array = array.transpose() - - return torch.from_numpy(array) - - def get_encoder_layer_array(layer_index: int, name: str): - full_name = f"encoder/_transformer_layers/{layer_index}/{name}/.ATTRIBUTES/VARIABLE_VALUE" - array = tf.train.load_variable(tf_checkpoint_path, full_name) - - if "kernel" in name: - array = array.transpose() - - return torch.from_numpy(array) - - def get_encoder_attention_layer_array(layer_index: int, name: str, original_shape): - full_name = f"encoder/_transformer_layers/{layer_index}/_attention_layer/{name}/.ATTRIBUTES/VARIABLE_VALUE" - array = tf.train.load_variable(tf_checkpoint_path, full_name) - array = array.reshape(original_shape) - - if "kernel" in name: - array = array.transpose() - - return torch.from_numpy(array) - - print(f"Loading model based on config from {config_path}...") - config = BertConfig.from_json_file(config_path) - model = BertForMaskedLM(config) - - # Layers - for layer_index in range(0, config.num_hidden_layers): - layer: BertLayer = model.bert.encoder.layer[layer_index] - - # Self-attention - self_attn: BertSelfAttention = layer.attention.self - - self_attn.query.weight.data = get_encoder_attention_layer_array( - layer_index, "_query_dense/kernel", self_attn.query.weight.data.shape - ) - self_attn.query.bias.data = get_encoder_attention_layer_array( - layer_index, "_query_dense/bias", self_attn.query.bias.data.shape - ) - self_attn.key.weight.data = get_encoder_attention_layer_array( - layer_index, "_key_dense/kernel", self_attn.key.weight.data.shape - ) - self_attn.key.bias.data = get_encoder_attention_layer_array( - layer_index, "_key_dense/bias", self_attn.key.bias.data.shape - ) - self_attn.value.weight.data = get_encoder_attention_layer_array( - layer_index, "_value_dense/kernel", self_attn.value.weight.data.shape - ) - self_attn.value.bias.data = get_encoder_attention_layer_array( - layer_index, "_value_dense/bias", self_attn.value.bias.data.shape - ) - - # Self-attention Output - self_output: BertSelfOutput = layer.attention.output - - self_output.dense.weight.data = get_encoder_attention_layer_array( - layer_index, "_output_dense/kernel", self_output.dense.weight.data.shape - ) - self_output.dense.bias.data = get_encoder_attention_layer_array( - layer_index, "_output_dense/bias", self_output.dense.bias.data.shape - ) - - self_output.LayerNorm.weight.data = get_encoder_layer_array(layer_index, "_attention_layer_norm/gamma") - self_output.LayerNorm.bias.data = 
get_encoder_layer_array(layer_index, "_attention_layer_norm/beta") - - # Intermediate - intermediate: BertIntermediate = layer.intermediate - - intermediate.dense.weight.data = get_encoder_layer_array(layer_index, "_intermediate_dense/kernel") - intermediate.dense.bias.data = get_encoder_layer_array(layer_index, "_intermediate_dense/bias") - - # Output - bert_output: BertOutput = layer.output - - bert_output.dense.weight.data = get_encoder_layer_array(layer_index, "_output_dense/kernel") - bert_output.dense.bias.data = get_encoder_layer_array(layer_index, "_output_dense/bias") - - bert_output.LayerNorm.weight.data = get_encoder_layer_array(layer_index, "_output_layer_norm/gamma") - bert_output.LayerNorm.bias.data = get_encoder_layer_array(layer_index, "_output_layer_norm/beta") - - # Embeddings - model.bert.embeddings.position_embeddings.weight.data = get_encoder_array("_position_embedding_layer/embeddings") - model.bert.embeddings.token_type_embeddings.weight.data = get_encoder_array("_type_embedding_layer/embeddings") - model.bert.embeddings.LayerNorm.weight.data = get_encoder_array("_embedding_norm_layer/gamma") - model.bert.embeddings.LayerNorm.bias.data = get_encoder_array("_embedding_norm_layer/beta") - - # LM Head - lm_head = model.cls.predictions.transform - - lm_head.dense.weight.data = get_masked_lm_array("dense/kernel") - lm_head.dense.bias.data = get_masked_lm_array("dense/bias") - - lm_head.LayerNorm.weight.data = get_masked_lm_array("layer_norm/gamma") - lm_head.LayerNorm.bias.data = get_masked_lm_array("layer_norm/beta") - - model.bert.embeddings.word_embeddings.weight.data = get_masked_lm_array("embedding_table") - - # Pooling - model.bert.pooler = BertPooler(config=config) - model.bert.pooler.dense.weight.data: BertPooler = get_encoder_array("_pooler_layer/kernel") - model.bert.pooler.dense.bias.data: BertPooler = get_encoder_array("_pooler_layer/bias") - - # Export final model - model.save_pretrained(pytorch_dump_path) - - # Integration test - should load without any errors ;) - new_model = BertForMaskedLM.from_pretrained(pytorch_dump_path) - print(new_model.eval()) - - print("Model conversion was done successfully!") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--tf_checkpoint_path", type=str, required=True, help="Path to the TensorFlow Token Dropping checkpoint path." - ) - parser.add_argument( - "--bert_config_file", - type=str, - required=True, - help="The config json file corresponding to the BERT model. This specifies the model architecture.", - ) - parser.add_argument( - "--pytorch_dump_path", - type=str, - required=True, - help="Path to the output PyTorch model.", - ) - args = parser.parse_args() - convert_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py deleted file mode 100644 index 0b8e6590f937..000000000000 --- a/src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,69 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert BigBird checkpoint.""" - -import argparse - -from transformers import BigBirdConfig, BigBirdForPreTraining, BigBirdForQuestionAnswering, load_tf_weights_in_big_bird -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, big_bird_config_file, pytorch_dump_path, is_trivia_qa): - # Initialise PyTorch model - config = BigBirdConfig.from_json_file(big_bird_config_file) - print(f"Building PyTorch model from configuration: {config}") - - if is_trivia_qa: - model = BigBirdForQuestionAnswering(config) - else: - model = BigBirdForPreTraining(config) - - # Load weights from tf checkpoint - load_tf_weights_in_big_bird(model, tf_checkpoint_path, is_trivia_qa=is_trivia_qa) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - model.save_pretrained(pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--big_bird_config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained BERT model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--is_trivia_qa", action="store_true", help="Whether to convert a model with a trivia_qa head." 
- ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch( - args.tf_checkpoint_path, args.big_bird_config_file, args.pytorch_dump_path, args.is_trivia_qa - ) diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py index f42b1eeaeeb1..eb89d9872be8 100755 --- a/src/transformers/models/big_bird/modeling_big_bird.py +++ b/src/transformers/models/big_bird/modeling_big_bird.py @@ -1272,14 +1272,14 @@ def _get_single_block_row_attention( if block_id == to_end_block_id - 2: illegal_blocks.append(1) - selected_random_blokcs = [] + selected_random_blocks = [] for i in range(to_end_block_id - to_start_block_id): if perm_block[i] not in illegal_blocks: - selected_random_blokcs.append(perm_block[i]) - if len(selected_random_blokcs) == num_rand_blocks: + selected_random_blocks.append(perm_block[i]) + if len(selected_random_blocks) == num_rand_blocks: break - return np.array(selected_random_blokcs, dtype=np.int32) + return np.array(selected_random_blocks, dtype=np.int32) # Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->BigBird @@ -2877,7 +2877,6 @@ def forward( logits_mask = self.prepare_question_mask(question_lengths, seqlen) if token_type_ids is None: token_type_ids = torch.ones(logits_mask.size(), dtype=int, device=logits_mask.device) - logits_mask - logits_mask = logits_mask logits_mask[:, 0] = False logits_mask.unsqueeze_(2) diff --git a/src/transformers/models/bigbird_pegasus/convert_bigbird_pegasus_tf_to_pytorch.py b/src/transformers/models/bigbird_pegasus/convert_bigbird_pegasus_tf_to_pytorch.py deleted file mode 100644 index d0a312ebc11f..000000000000 --- a/src/transformers/models/bigbird_pegasus/convert_bigbird_pegasus_tf_to_pytorch.py +++ /dev/null @@ -1,169 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse - -import tensorflow as tf -import torch -from tqdm import tqdm - -from transformers import BigBirdPegasusConfig, BigBirdPegasusForConditionalGeneration - - -INIT_COMMON = [ - # tf -> hf - ("/", "."), - ("layer_", "layers."), - ("kernel", "weight"), - ("beta", "bias"), - ("gamma", "weight"), - ("pegasus", "model"), -] -END_COMMON = [ - (".output.dense", ".fc2"), - ("intermediate.LayerNorm", "final_layer_norm"), - ("intermediate.dense", "fc1"), -] - -DECODER_PATTERNS = ( - INIT_COMMON - + [ - ("attention.self.LayerNorm", "self_attn_layer_norm"), - ("attention.output.dense", "self_attn.out_proj"), - ("attention.self", "self_attn"), - ("attention.encdec.LayerNorm", "encoder_attn_layer_norm"), - ("attention.encdec_output.dense", "encoder_attn.out_proj"), - ("attention.encdec", "encoder_attn"), - ("key", "k_proj"), - ("value", "v_proj"), - ("query", "q_proj"), - ("decoder.LayerNorm", "decoder.layernorm_embedding"), - ] - + END_COMMON -) - -REMAINING_PATTERNS = ( - INIT_COMMON - + [ - ("embeddings.word_embeddings", "shared.weight"), - ("embeddings.position_embeddings", "embed_positions.weight"), - ("attention.self.LayerNorm", "self_attn_layer_norm"), - ("attention.output.dense", "self_attn.output"), - ("attention.self", "self_attn.self"), - ("encoder.LayerNorm", "encoder.layernorm_embedding"), - ] - + END_COMMON -) - -KEYS_TO_IGNORE = [ - "encdec/key/bias", - "encdec/query/bias", - "encdec/value/bias", - "self/key/bias", - "self/query/bias", - "self/value/bias", - "encdec_output/dense/bias", - "attention/output/dense/bias", -] - - -def rename_state_dict_key(k, patterns): - for tf_name, hf_name in patterns: - k = k.replace(tf_name, hf_name) - return k - - -def convert_bigbird_pegasus(tf_weights: dict, config_update: dict) -> BigBirdPegasusForConditionalGeneration: - cfg = BigBirdPegasusConfig(**config_update) - torch_model = BigBirdPegasusForConditionalGeneration(cfg) - state_dict = torch_model.state_dict() - mapping = {} - - # separating decoder weights - decoder_weights = {k: tf_weights[k] for k in tf_weights if k.startswith("pegasus/decoder")} - remaining_weights = {k: tf_weights[k] for k in tf_weights if not k.startswith("pegasus/decoder")} - - for k, v in tqdm(decoder_weights.items(), "tf -> hf conversion"): - conditions = [k.endswith(ending) for ending in KEYS_TO_IGNORE] - if any(conditions): - continue - patterns = DECODER_PATTERNS - new_k = rename_state_dict_key(k, patterns) - if new_k not in state_dict: - raise ValueError(f"could not find new key {new_k} in state dict. (converted from {k})") - if any(i in k for i in ["dense", "query", "key", "value"]): - v = v.T - mapping[new_k] = torch.from_numpy(v) - assert v.shape == state_dict[new_k].shape, f"{new_k}, {k}, {v.shape}, {state_dict[new_k].shape}" - - for k, v in tqdm(remaining_weights.items(), "tf -> hf conversion"): - conditions = [k.endswith(ending) for ending in KEYS_TO_IGNORE] - if any(conditions): - continue - patterns = REMAINING_PATTERNS - new_k = rename_state_dict_key(k, patterns) - if new_k not in state_dict and k != "pegasus/embeddings/position_embeddings": - raise ValueError(f"could not find new key {new_k} in state dict. 
(converted from {k})") - if any(i in k for i in ["dense", "query", "key", "value"]): - v = v.T - mapping[new_k] = torch.from_numpy(v) - if k != "pegasus/embeddings/position_embeddings": - assert v.shape == state_dict[new_k].shape, f"{new_k}, {k}, {v.shape}, {state_dict[new_k].shape}" - - mapping["model.encoder.embed_positions.weight"] = mapping["model.embed_positions.weight"] - mapping["model.decoder.embed_positions.weight"] = mapping.pop("model.embed_positions.weight") - missing, extra = torch_model.load_state_dict(mapping, strict=False) - unexpected_missing = [ - k - for k in missing - if k - not in [ - "final_logits_bias", - "model.encoder.embed_tokens.weight", - "model.decoder.embed_tokens.weight", - "lm_head.weight", - ] - ] - assert unexpected_missing == [], f"no matches found for the following torch keys {unexpected_missing}" - assert extra == [], f"no matches found for the following tf keys {extra}" - return torch_model - - -def get_tf_weights_as_numpy(path) -> dict: - init_vars = tf.train.list_variables(path) - tf_weights = {} - ignore_name = ["global_step"] - for name, shape in tqdm(init_vars, desc="converting tf checkpoint to dict"): - skip_key = any(pat in name for pat in ignore_name) - if skip_key: - continue - array = tf.train.load_variable(path, name) - tf_weights[name] = array - return tf_weights - - -def convert_bigbird_pegasus_ckpt_to_pytorch(ckpt_path: str, save_dir: str, config_update: dict): - tf_weights = get_tf_weights_as_numpy(ckpt_path) - torch_model = convert_bigbird_pegasus(tf_weights, config_update) - torch_model.save_pretrained(save_dir) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--tf_ckpt_path", type=str, help="passed to tf.train.list_variables") - parser.add_argument("--save_dir", default=None, type=str, help="Path to the output PyTorch model.") - args = parser.parse_args() - config_update = {} - convert_bigbird_pegasus_ckpt_to_pytorch(args.tf_ckpt_path, args.save_dir, config_update=config_update) diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index 90f3c886ad93..e419af75da38 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -1088,14 +1088,14 @@ def _get_single_block_row_attention( if block_id == to_end_block_id - 2: illegal_blocks.append(1) - selected_random_blokcs = [] + selected_random_blocks = [] for i in range(to_end_block_id - to_start_block_id): if perm_block[i] not in illegal_blocks: - selected_random_blokcs.append(perm_block[i]) - if len(selected_random_blokcs) == num_rand_blocks: + selected_random_blocks.append(perm_block[i]) + if len(selected_random_blocks) == num_rand_blocks: break - return np.array(selected_random_blokcs, dtype=np.int32) + return np.array(selected_random_blocks, dtype=np.int32) class BigBirdPegasusEncoderAttention(nn.Module): diff --git a/src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100755 index 616e9ed6653b..000000000000 --- a/src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,292 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. 
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import argparse
-import json
-import os
-import re
-import shutil
-
-import torch
-
-from transformers import BioGptConfig, BioGptForCausalLM
-from transformers.models.biogpt.tokenization_biogpt import VOCAB_FILES_NAMES
-from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
-from transformers.utils import WEIGHTS_NAME, logging
-
-
-logging.set_verbosity_warning()
-
-json_indent = 2
-
-
-# modified from https://github.com/facebookresearch/fairseq/blob/dd74992d0d143155998e9ed4076826bcea80fb06/fairseq/data/dictionary.py#L18
-class Dictionary:
-    """A mapping from symbols to consecutive integers"""
-
-    def __init__(
-        self,
-        *,  # begin keyword-only arguments
-        bos="<s>",
-        pad="<pad>",
-        eos="</s>",
-        unk="<unk>",
-        extra_special_symbols=None,
-    ):
-        self.bos_word, self.unk_word, self.pad_word, self.eos_word = bos, unk, pad, eos
-        self.symbols = []
-        self.count = []
-        self.indices = {}
-        self.bos_index = self.add_symbol(bos)
-        self.pad_index = self.add_symbol(pad)
-        self.eos_index = self.add_symbol(eos)
-        self.unk_index = self.add_symbol(unk)
-        if extra_special_symbols:
-            for s in extra_special_symbols:
-                self.add_symbol(s)
-        self.nspecial = len(self.symbols)
-
-    def __eq__(self, other):
-        return self.indices == other.indices
-
-    def __getitem__(self, idx):
-        if idx < len(self.symbols):
-            return self.symbols[idx]
-        return self.unk_word
-
-    def __len__(self):
-        """Returns the number of symbols in the dictionary"""
-        return len(self.symbols)
-
-    def __contains__(self, sym):
-        return sym in self.indices
-
-    @classmethod
-    def load(cls, f):
-        """Loads the dictionary from a text file with the format:
-
-        ```
-        <symbol0> <count0>
-        <symbol1> <count1>
-        ...
-        ```
-        """
-        d = cls()
-        d.add_from_file(f)
-        return d
-
-    def add_symbol(self, word, n=1, overwrite=False):
-        """Adds a word to the dictionary"""
-        if word in self.indices and not overwrite:
-            idx = self.indices[word]
-            self.count[idx] = self.count[idx] + n
-            return idx
-        else:
-            idx = len(self.symbols)
-            self.indices[word] = idx
-            self.symbols.append(word)
-            self.count.append(n)
-            return idx
-
-    def _load_meta(self, lines):
-        return 0
-
-    def add_from_file(self, f):
-        """
-        Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
-        """
-        if isinstance(f, str):
-            try:
-                with open(f, "r", encoding="utf-8") as fd:
-                    self.add_from_file(fd)
-            except FileNotFoundError as fnfe:
-                raise fnfe
-            except UnicodeError:
-                raise Exception(f"Incorrect encoding detected in {f}, please rebuild the dataset")
-            return
-
-        lines = f.readlines()
-        indices_start_line = self._load_meta(lines)
-
-        for line in lines[indices_start_line:]:
-            try:
-                line, field = line.rstrip().rsplit(" ", 1)
-                if field == "#fairseq:overwrite":
-                    overwrite = True
-                    line, field = line.rsplit(" ", 1)
-                else:
-                    overwrite = False
-                count = int(field)
-                word = line
-                if word in self and not overwrite:
-                    raise RuntimeError(
-                        f"Duplicate word found when loading Dictionary: '{word}'. "
-                        "Duplicate words can overwrite earlier ones by adding the "
-                        "#fairseq:overwrite flag at the end of the corresponding row "
-                        "in the dictionary file. If using the Camembert model, please "
-                        "download an updated copy of the model file."
-                    )
-                self.add_symbol(word, n=count, overwrite=overwrite)
-            except ValueError:
-                raise ValueError("Incorrect dictionary format, expected '<token> <cnt> [flags]'")
-
-
-def rewrite_dict_keys(d):
-    # (1) remove word breaking symbol, (2) add word ending symbol where the word is not broken up,
-    # e.g.: d = {'le@@': 5, 'tt@@': 6, 'er': 7} => {'le': 5, 'tt': 6, 'er</w>': 7}
-    d2 = dict((re.sub(r"@@$", "", k), v) if k.endswith("@@") else (re.sub(r"$", "</w>", k), v) for k, v in d.items())
-    keep_keys = ["<s>", "<pad>", "</s>", "<unk>"]
-    # restore the special tokens
-    for k in keep_keys:
-        del d2[f"{k}</w>"]
-        d2[k] = d[k]  # restore
-    return d2
-
-
-def convert_biogpt_checkpoint_to_pytorch(biogpt_checkpoint_path, pytorch_dump_folder_path):
-    # prep
-    if not os.path.exists(biogpt_checkpoint_path):
-        raise ValueError(f"path {biogpt_checkpoint_path} does not exist!")
-    os.makedirs(pytorch_dump_folder_path, exist_ok=True)
-    print(f"Writing results to {pytorch_dump_folder_path}")
-
-    # handle various types of models
-
-    checkpoint_file = os.path.join(biogpt_checkpoint_path, "checkpoint.pt")
-    if not os.path.isfile(checkpoint_file):
-        raise ValueError(f"path to the file {checkpoint_file} does not exist!")
-    chkpt = torch.load(checkpoint_file, map_location="cpu", weights_only=True)
-
-    args = chkpt["cfg"]["model"]
-
-    # dicts
-    dict_file = os.path.join(biogpt_checkpoint_path, "dict.txt")
-    if not os.path.isfile(dict_file):
-        raise ValueError(f"path to the file {dict_file} does not exist!")
-    src_dict = Dictionary.load(dict_file)
-    src_vocab = rewrite_dict_keys(src_dict.indices)
-    src_vocab_size = len(src_vocab)
-    src_vocab_file = os.path.join(pytorch_dump_folder_path, VOCAB_FILES_NAMES["vocab_file"])
-    print(f"Generating {src_vocab_file} of {src_vocab_size} records")
-    with open(src_vocab_file, "w", encoding="utf-8") as f:
-        f.write(json.dumps(src_vocab, ensure_ascii=False, indent=json_indent))
-
-    # merges_file (bpecodes)
-    bpecodes_file = os.path.join(biogpt_checkpoint_path, "bpecodes")
-    if not os.path.isfile(bpecodes_file):
-        raise ValueError(f"path to the file {bpecodes_file} does not exist!")
-
-    merges_file = os.path.join(pytorch_dump_folder_path, VOCAB_FILES_NAMES["merges_file"])
-    shutil.copyfile(bpecodes_file, merges_file)
-
-    # model config
-    biogpt_model_config_file = os.path.join(pytorch_dump_folder_path, "config.json")
-
-    model_conf = {
-        "activation_dropout": args["activation_dropout"],
-        "architectures": ["BioGptForCausalLM"],
-        "attention_probs_dropout_prob": args["attention_dropout"],
-        "bos_token_id": 0,
-        "eos_token_id": 2,
-        "hidden_act": args["activation_fn"],
-        "hidden_dropout_prob": args["dropout"],
-        "hidden_size": args["decoder_embed_dim"],
-        "initializer_range": 0.02,
-        "intermediate_size": args["decoder_ffn_embed_dim"],
-        "layer_norm_eps": 1e-12,
-        "layerdrop": args["decoder_layerdrop"],
-        "max_position_embeddings": args["max_target_positions"],
-        "model_type": "biogpt",
-        "num_attention_heads": args["decoder_attention_heads"],
-        "num_hidden_layers": args["decoder_layers"],
-        "pad_token_id": 1,
-        "scale_embedding": not args["no_scale_embedding"],
-        "tie_word_embeddings": args["share_decoder_input_output_embed"],
-        "vocab_size": src_vocab_size,
-    }
-
-    # good hparam defaults to start with
-
-    print(f"Generating {biogpt_model_config_file}")
-    with open(biogpt_model_config_file, "w", encoding="utf-8") as f:
-        f.write(json.dumps(model_conf, ensure_ascii=False, indent=json_indent))
-
-    # tokenizer config
-    biogpt_tokenizer_config_file = os.path.join(pytorch_dump_folder_path, TOKENIZER_CONFIG_FILE)
-
-    tokenizer_conf = {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "model_max_length": 1024,
-        "pad_token": "<pad>",
-        "special_tokens_map_file": None,
-        "tokenizer_class": "BioGptTokenizer",
-        "unk_token": "<unk>",
-    }
-
-    print(f"Generating {biogpt_tokenizer_config_file}")
-    with open(biogpt_tokenizer_config_file, "w", encoding="utf-8") as f:
-        f.write(json.dumps(tokenizer_conf, ensure_ascii=False, indent=json_indent))
-
-    # model
-    model_state_dict = chkpt["model"]
-
-    # remove unneeded keys
-    ignore_keys = [
-        "decoder.version",
-    ]
-    for k in ignore_keys:
-        model_state_dict.pop(k, None)
-
-    layer_names = list(model_state_dict.keys())
-    for layer_name in layer_names:
-        if layer_name.endswith("output_projection.weight"):
-            model_state_dict[layer_name.replace("decoder.", "")] = model_state_dict.pop(layer_name)
-        else:
-            model_state_dict[layer_name.replace("decoder", "biogpt")] = model_state_dict.pop(layer_name)
-
-    config = BioGptConfig.from_pretrained(pytorch_dump_folder_path)
-    model_new = BioGptForCausalLM(config)
-
-    # check that it loads ok
-    model_new.load_state_dict(model_state_dict)
-
-    # save
-    pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
-    print(f"Generating {pytorch_weights_dump_path}")
-    torch.save(model_state_dict, pytorch_weights_dump_path)
-
-    print("Conversion is done!")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    # Required parameters
-    parser.add_argument(
-        "--biogpt_checkpoint_path",
-        default=None,
-        type=str,
-        required=True,
-        help=(
-            "Path to the official PyTorch checkpoint file which is expected to reside in the dump dir with dicts,"
-            " bpecodes, etc."
-        ),
-    )
-    parser.add_argument(
-        "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
-    )
-    args = parser.parse_args()
-    convert_biogpt_checkpoint_to_pytorch(args.biogpt_checkpoint_path, args.pytorch_dump_folder_path)
diff --git a/src/transformers/models/biogpt/modeling_biogpt.py b/src/transformers/models/biogpt/modeling_biogpt.py
index 8690082625a7..7b9937420025 100755
--- a/src/transformers/models/biogpt/modeling_biogpt.py
+++ b/src/transformers/models/biogpt/modeling_biogpt.py
@@ -871,6 +871,7 @@ def forward(
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
     ) -> Union[tuple, SequenceClassifierOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -894,7 +895,8 @@ def forward(
             cache_position=cache_position,
         )
         hidden_states = transformer_outputs[0]
-        logits = self.score(hidden_states)
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.score(hidden_states[:, slice_indices, :])
 
         if input_ids is not None:
             batch_size, sequence_length = input_ids.shape[:2]
diff --git a/src/transformers/models/biogpt/modular_biogpt.py b/src/transformers/models/biogpt/modular_biogpt.py
index 001c1de65756..8d95b2a2d051 100644
--- a/src/transformers/models/biogpt/modular_biogpt.py
+++ b/src/transformers/models/biogpt/modular_biogpt.py
@@ -693,6 +693,7 @@ def forward(
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
     ) -> Union[tuple, SequenceClassifierOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -716,7 +717,8 @@ def forward(
             cache_position=cache_position,
         )
         hidden_states = transformer_outputs[0]
-        logits = self.score(hidden_states)
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.score(hidden_states[:, slice_indices, :])
 
         if input_ids is not None:
             batch_size, sequence_length = input_ids.shape[:2]
diff --git a/src/transformers/models/bit/convert_bit_to_pytorch.py b/src/transformers/models/bit/convert_bit_to_pytorch.py
deleted file mode 100644
index 814db3ca4faa..000000000000
--- a/src/transformers/models/bit/convert_bit_to_pytorch.py
+++ /dev/null
@@ -1,177 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Convert BiT checkpoints from the timm library.""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from timm import create_model -from timm.data import resolve_data_config -from timm.data.transforms_factory import create_transform - -from transformers import BitConfig, BitForImageClassification, BitImageProcessor -from transformers.image_utils import PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_config(model_name): - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - label2id = {v: k for k, v in id2label.items()} - - conv_layer = "std_conv" if "bit" in model_name else False - - # note that when using BiT as backbone for ViT-hybrid checkpoints, - # one needs to additionally set config.layer_type = "bottleneck", config.stem_type = "same", - # config.conv_layer = "std_conv_same" - config = BitConfig( - conv_layer=conv_layer, - num_labels=1000, - id2label=id2label, - label2id=label2id, - ) - - return config - - -def rename_key(name): - if "stem.conv" in name: - name = name.replace("stem.conv", "bit.embedder.convolution") - if "blocks" in name: - name = name.replace("blocks", "layers") - if "head.fc" in name: - name = name.replace("head.fc", "classifier.1") - if name.startswith("norm"): - name = "bit." + name - if "bit" not in name and "classifier" not in name: - name = "bit.encoder." + name - - return name - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_bit_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our BiT structure. 
- """ - - # define default BiT configuration - config = get_config(model_name) - - # load original model from timm - timm_model = create_model(model_name, pretrained=True) - timm_model.eval() - - # load state_dict of original model - state_dict = timm_model.state_dict() - for key in state_dict.copy(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val.squeeze() if "head" in key else val - - # load HuggingFace model - model = BitForImageClassification(config) - model.eval() - model.load_state_dict(state_dict) - - # create image processor - transform = create_transform(**resolve_data_config({}, model=timm_model)) - timm_transforms = transform.transforms - - pillow_resamplings = { - "bilinear": PILImageResampling.BILINEAR, - "bicubic": PILImageResampling.BICUBIC, - "nearest": PILImageResampling.NEAREST, - } - - processor = BitImageProcessor( - do_resize=True, - size={"shortest_edge": timm_transforms[0].size}, - resample=pillow_resamplings[timm_transforms[0].interpolation.value], - do_center_crop=True, - crop_size={"height": timm_transforms[1].size[0], "width": timm_transforms[1].size[1]}, - do_normalize=True, - image_mean=timm_transforms[-1].mean.tolist(), - image_std=timm_transforms[-1].std.tolist(), - ) - - image = prepare_img() - timm_pixel_values = transform(image).unsqueeze(0) - pixel_values = processor(image, return_tensors="pt").pixel_values - - # verify pixel values - assert torch.allclose(timm_pixel_values, pixel_values) - - # verify logits - with torch.no_grad(): - outputs = model(pixel_values) - logits = outputs.logits - - print("Logits:", logits[0, :3]) - print("Predicted class:", model.config.id2label[logits.argmax(-1).item()]) - timm_logits = timm_model(pixel_values) - assert timm_logits.shape == outputs.logits.shape - assert torch.allclose(timm_logits, outputs.logits, atol=1e-3) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {model_name} and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print(f"Pushing model {model_name} and processor to the hub") - model.push_to_hub(f"ybelkada/{model_name}") - processor.push_to_hub(f"ybelkada/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="resnetv2_50x1_bitm", - type=str, - help="Name of the BiT timm model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model to the hub.", - ) - - args = parser.parse_args() - convert_bit_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/blenderbot/convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/blenderbot/convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index d8ce9b056c3d..000000000000 --- a/src/transformers/models/blenderbot/convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,114 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Blenderbot checkpoint.""" - -import argparse - -import torch - -from transformers import BlenderbotConfig, BlenderbotForConditionalGeneration -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -PATTERNS = [ - ["attention", "attn"], - ["encoder_attention", "encoder_attn"], - ["q_lin", "q_proj"], - ["k_lin", "k_proj"], - ["v_lin", "v_proj"], - ["out_lin", "out_proj"], - ["norm_embeddings", "layernorm_embedding"], - ["position_embeddings", "embed_positions"], - ["embeddings", "embed_tokens"], - ["ffn.lin", "fc"], -] - - -def rename_state_dict_key(k): - if k == "embeddings.weight": - return "shared.weight" - - for parlai_name, hf_name in PATTERNS: - k = k.replace(parlai_name, hf_name) - - if k.startswith("encoder"): - k = k.replace(".attn", ".self_attn") - k = k.replace("norm1", "self_attn_layer_norm") - k = k.replace("norm2", "final_layer_norm") - elif k.startswith("decoder"): - k = k.replace("norm1", "self_attn_layer_norm") - k = k.replace("norm2", "encoder_attn_layer_norm") - k = k.replace("norm3", "final_layer_norm") - return k - - -def rename_layernorm_keys(sd): - keys = [ - "model.encoder.layernorm_embedding.weight", - "model.encoder.layernorm_embedding.bias", - "model.decoder.layernorm_embedding.weight", - "model.decoder.layernorm_embedding.bias", - ] - for k in keys: - v = sd.pop(k) - new_k = k.replace("layernorm_embedding", "layer_norm") - assert new_k not in sd - sd[new_k] = v - - -IGNORE_KEYS = ["START"] - - -@torch.no_grad() -def convert_parlai_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_json_path): - """ - Copy/paste/tweak model's weights to our BERT structure. - """ - model = torch.load(checkpoint_path, map_location="cpu", weights_only=True) - sd = model["model"] - cfg = BlenderbotConfig.from_json_file(config_json_path) - m = BlenderbotForConditionalGeneration(cfg) - valid_keys = m.model.state_dict().keys() - failures = [] - mapping = {} - for k, v in sd.items(): - if k in IGNORE_KEYS: - continue - - new_k = rename_state_dict_key(k) - if new_k not in valid_keys: - failures.append([k, new_k]) - else: - mapping[new_k] = v - if cfg.normalize_before: # Blenderbot-3B checkpoints. 
Rename layernorm_embedding -> layer_norm - rename_layernorm_keys(sd) - m.model.load_state_dict(mapping, strict=True) - m.half() - m.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument("--src_path", type=str, help="like blenderbot-model.bin") - parser.add_argument("--save_dir", default="hf_blenderbot", type=str, help="Where to save converted model.") - parser.add_argument( - "--hf_config_json", default="blenderbot-3b-config.json", type=str, help="Path to config to use" - ) - args = parser.parse_args() - convert_parlai_checkpoint(args.src_path, args.save_dir, args.hf_config_json) diff --git a/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py b/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py deleted file mode 100644 index 3de18c294ae8..000000000000 --- a/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py +++ /dev/null @@ -1,191 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import re - -import requests -import torch - -# git clone https://github.com/salesforce/BLIP.git -from models.blip import blip_decoder -from models.blip_itm import blip_itm -from models.blip_vqa import blip_vqa -from PIL import Image -from torchvision import transforms -from torchvision.transforms.functional import InterpolationMode - -from transformers import ( - BertTokenizer, - BlipConfig, - BlipForConditionalGeneration, - BlipForImageTextRetrieval, - BlipForQuestionAnswering, -) - - -def load_demo_image(image_size, device): - img_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg" - raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB") - - transform = transforms.Compose( - [ - transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC), - transforms.ToTensor(), - transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), - ] - ) - image = transform(raw_image).unsqueeze(0).to(device) - return image - - -def rename_key(key): - if "visual_encoder" in key: - key = re.sub("visual_encoder*", "vision_model.encoder", key) - if "blocks" in key: - key = re.sub(r"blocks", "layers", key) - if "attn" in key: - key = re.sub(r"attn", "self_attn", key) - if "norm1" in key: - key = re.sub(r"norm1", "layer_norm1", key) - if "norm2" in key: - key = re.sub(r"norm2", "layer_norm2", key) - if "encoder.norm" in key: - key = re.sub(r"encoder.norm", "post_layernorm", key) - if "encoder.patch_embed.proj" in key: - key = re.sub(r"encoder.patch_embed.proj", "embeddings.patch_embedding", key) - - if "encoder.pos_embed" in key: - key = re.sub(r"encoder.pos_embed", "embeddings.position_embedding", key) - if "encoder.cls_token" in key: - key = re.sub(r"encoder.cls_token", "embeddings.class_embedding", key) - - if "self_attn" in key: - key = re.sub(r"self_attn.proj", 
"self_attn.projection", key) - - return key - - -@torch.no_grad() -def convert_blip_checkpoint(pytorch_dump_folder_path, config_path=None): - """ - Copy/paste/tweak model's weights to transformers design. - """ - if config_path is not None: - config = BlipConfig.from_pretrained(config_path) - else: - config = BlipConfig(projection_dim=512, text_config={}, vision_config={}) - - hf_model = BlipForConditionalGeneration(config).eval() - - model_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" - - pt_model = blip_decoder(pretrained=model_url, image_size=384, vit="base") - pt_model = pt_model.eval() - - modified_state_dict = pt_model.state_dict() - for key in modified_state_dict.copy(): - value = modified_state_dict.pop(key) - renamed_key = rename_key(key) - modified_state_dict[renamed_key] = value - - hf_model.load_state_dict(modified_state_dict) - - image_size = 384 - image = load_demo_image(image_size=image_size, device="cpu") - tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") - input_ids = tokenizer(["a picture of"]).input_ids - - out = hf_model.generate(image, input_ids) - - assert out[0].tolist() == [30522, 1037, 3861, 1997, 1037, 2450, 3564, 2006, 1996, 3509, 2007, 2014, 3899, 102] - - out = hf_model.generate(image) - - assert out[0].tolist() == [30522, 1037, 2450, 3564, 2006, 1996, 3509, 2007, 2014, 3899, 102] - - if pytorch_dump_folder_path is not None: - hf_model.save_pretrained(pytorch_dump_folder_path) - - # model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_vqa.pth' - model_url = ( - "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" - ) - - vqa_model = blip_vqa(pretrained=model_url, image_size=image_size, vit="base") - vqa_model.eval() - - modified_state_dict = vqa_model.state_dict() - for key in modified_state_dict.copy(): - value = modified_state_dict.pop(key) - renamed_key = rename_key(key) - modified_state_dict[renamed_key] = value - - hf_vqa_model = BlipForQuestionAnswering(config) - - hf_vqa_model.load_state_dict(modified_state_dict) - - question = ["How many dogs are in this image?"] - question_input_ids = tokenizer(question, return_tensors="pt").input_ids - - answer = hf_vqa_model.generate(question_input_ids, image) - print(tokenizer.decode(answer[0])) - - assert tokenizer.decode(answer[0]) == "[UNK] 1 [SEP]" - if pytorch_dump_folder_path is not None: - hf_vqa_model.save_pretrained(pytorch_dump_folder_path + "_vqa") - - model_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth" - - itm_model = blip_itm(pretrained=model_url, image_size=image_size, vit="base") - itm_model.eval() - - modified_state_dict = itm_model.state_dict() - for key in modified_state_dict.copy(): - value = modified_state_dict.pop(key) - renamed_key = rename_key(key) - modified_state_dict[renamed_key] = value - - hf_itm_model = BlipForImageTextRetrieval(config) - - question = ["A picture of a woman with a dog sitting in a beach"] - question_input_ids = tokenizer( - question, - return_tensors="pt", - padding="max_length", - truncation=True, - max_length=35, - ).input_ids - - hf_itm_model.load_state_dict(modified_state_dict) - hf_itm_model.eval() - - out_itm = hf_itm_model(question_input_ids, image, use_itm_head=True) - out = hf_itm_model(question_input_ids, image, use_itm_head=False) - - assert out[0].item() == 0.2110687494277954 - assert 
torch.nn.functional.softmax(out_itm[0], dim=1)[:, 1].item() == 0.45698845386505127 - - if pytorch_dump_folder_path is not None: - hf_itm_model.save_pretrained(pytorch_dump_folder_path + "_itm") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - args = parser.parse_args() - - convert_blip_checkpoint(args.pytorch_dump_folder_path, args.config_path) diff --git a/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py b/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py deleted file mode 100644 index d6640045b80c..000000000000 --- a/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py +++ /dev/null @@ -1,390 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Convert BLIP-2 checkpoints from the original repository. - -URL: https://github.com/salesforce/LAVIS/tree/main/projects/blip2 -""" - -import argparse - -import requests -import torch - -# pip3 install salesforce-lavis -# I'm actually installing a slightly modified version: pip3 install -U git+https://github.com/nielsrogge/LAVIS.git@blip2_float32 -# to make sure we can compare both original and HF implementation in float32 -from lavis.models import load_model_and_preprocess -from PIL import Image - -from transformers import ( - AutoTokenizer, - BertTokenizer, - Blip2Config, - Blip2ForConditionalGeneration, - Blip2ForImageTextRetrieval, - Blip2Processor, - Blip2QFormerConfig, - Blip2VisionConfig, - BlipImageProcessor, - OPTConfig, - T5Config, - set_seed, -) -from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD - - -def load_demo_image(): - url = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - - return image - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config, model_name): - rename_keys = [] - # fmt: off - - # vision encoder - rename_keys.append(("visual_encoder.cls_token", "vision_model.embeddings.class_embedding")) - rename_keys.append(("visual_encoder.pos_embed", "vision_model.embeddings.position_embedding")) - rename_keys.append(("visual_encoder.patch_embed.proj.weight", "vision_model.embeddings.patch_embedding.weight")) - rename_keys.append(("visual_encoder.patch_embed.proj.bias", "vision_model.embeddings.patch_embedding.bias")) - rename_keys.append(("ln_vision.weight", "vision_model.post_layernorm.weight")) - rename_keys.append(("ln_vision.bias", "vision_model.post_layernorm.bias")) - - for i in range(config.vision_config.num_hidden_layers): - rename_keys.append((f"visual_encoder.blocks.{i}.norm1.weight", 
f"vision_model.encoder.layers.{i}.layer_norm1.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm1.bias", f"vision_model.encoder.layers.{i}.layer_norm1.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm2.weight", f"vision_model.encoder.layers.{i}.layer_norm2.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm2.bias", f"vision_model.encoder.layers.{i}.layer_norm2.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.qkv.weight", f"vision_model.encoder.layers.{i}.self_attn.qkv.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.weight", f"vision_model.encoder.layers.{i}.self_attn.projection.weight",)) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.bias", f"vision_model.encoder.layers.{i}.self_attn.projection.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.weight", f"vision_model.encoder.layers.{i}.mlp.fc1.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.bias", f"vision_model.encoder.layers.{i}.mlp.fc1.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.weight", f"vision_model.encoder.layers.{i}.mlp.fc2.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.bias", f"vision_model.encoder.layers.{i}.mlp.fc2.bias")) - - # QFormer - rename_keys.append(("Qformer.bert.embeddings.LayerNorm.weight", "qformer.layernorm.weight")) - rename_keys.append(("Qformer.bert.embeddings.LayerNorm.bias", "qformer.layernorm.bias")) - if "itm" in model_name: - rename_keys.append(("Qformer.bert.embeddings.word_embeddings.weight", "embeddings.word_embeddings.weight")) - rename_keys.append(("Qformer.bert.embeddings.position_embeddings.weight", "embeddings.position_embeddings.weight")) - rename_keys.append(("vision_proj.weight", "vision_projection.weight")) - rename_keys.append(("vision_proj.bias", "vision_projection.bias")) - rename_keys.append(("text_proj.weight", "text_projection.weight")) - rename_keys.append(("text_proj.bias", "text_projection.bias")) - - # fmt: on - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -def read_in_q_v_bias(state_dict, config): - for i in range(config.vision_config.num_hidden_layers): - # read in original q and v biases - q_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.v_bias") - - # next, set bias in the state dict - qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias)) - state_dict[f"vision_model.encoder.layers.{i}.self_attn.qkv.bias"] = qkv_bias - - -def get_blip2_config(model_name, eos_token_id): - image_size = 364 if "coco" in model_name else 224 - vision_config = Blip2VisionConfig(image_size=image_size).to_dict() - - # make sure the models have proper bos_token_id and eos_token_id set (important for generation) - # seems like flan-T5 models don't have bos_token_id properly set? 
- if "opt-2.7b" in model_name: - text_config = OPTConfig.from_pretrained("facebook/opt-2.7b", eos_token_id=eos_token_id).to_dict() - elif "opt-6.7b" in model_name: - text_config = OPTConfig.from_pretrained("facebook/opt-6.7b", eos_token_id=eos_token_id).to_dict() - elif "t5-xl" in model_name: - text_config = T5Config.from_pretrained("google/flan-t5-xl", dense_act_fn="gelu", bos_token_id=1).to_dict() - elif "t5-xxl" in model_name: - text_config = T5Config.from_pretrained("google/flan-t5-xxl", dense_act_fn="gelu", bos_token_id=1).to_dict() - elif "itm" in model_name: - text_config = {} - else: - raise ValueError("Model name not supported") - - if "itm" in model_name: - config = Blip2Config( - vision_config=vision_config, - qformer_config=Blip2QFormerConfig(vocab_size=30523, use_qformer_text_input=True).to_dict(), - ) - else: - config = Blip2Config(vision_config=vision_config, text_config=text_config) - - return config, image_size - - -@torch.no_grad() -def convert_blip2_checkpoint( - model_name, pytorch_dump_folder_path=None, push_to_hub=False, lavis_device="cpu", hf_model_device="cpu" -): - """ - Copy/paste/tweak model's weights to Transformers design. - """ - if "opt" in model_name: - tokenizer = AutoTokenizer.from_pretrained("facebook/opt-2.7b") - elif "itm" in model_name: - tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", truncation_side="right") - tokenizer.add_special_tokens({"bos_token": "[DEC]"}) - else: - tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl") - - if "itm" in model_name: - eos_token_id = None - else: - eos_token_id = tokenizer("\n", add_special_tokens=False).input_ids[0] - config, image_size = get_blip2_config(model_name, eos_token_id=eos_token_id) - - if "itm" in model_name: - hf_model = Blip2ForImageTextRetrieval(config).eval() - else: - hf_model = Blip2ForConditionalGeneration(config).eval() - - model_name_to_original = { - "blip2-opt-2.7b": ("blip2_opt", "pretrain_opt2.7b"), - "blip2-opt-6.7b": ("blip2_opt", "pretrain_opt6.7b"), - "blip2-opt-2.7b-coco": ("blip2_opt", "caption_coco_opt2.7b"), - "blip2-opt-6.7b-coco": ("blip2_opt", "caption_coco_opt6.7b"), - "blip2-flan-t5-xl": ("blip2_t5", "pretrain_flant5xl"), - "blip2-flan-t5-xl-coco": ("blip2_t5", "caption_coco_flant5xl"), - "blip2-flan-t5-xxl": ("blip2_t5", "pretrain_flant5xxl"), - "blip2-itm-vit-g": ("blip2_image_text_matching", "pretrain"), - "blip2-itm-vit-g-coco": ("blip2_image_text_matching", "coco"), - } - - name, type = model_name_to_original[model_name] - - # load original model - print("Loading original model...") - original_model, vis_processors, _ = load_model_and_preprocess( - name=name, model_type=type, is_eval=True, device=lavis_device - ) - original_model.eval() - print("Done!") - - # update state dict keys - state_dict = original_model.state_dict() - rename_keys = create_rename_keys(config, model_name) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - - # some keys can be renamed efficiently - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if key.startswith("Qformer.bert"): - key = key.replace("Qformer.bert", "qformer") - if "attention.self" in key: - key = key.replace("self", "attention") - if "opt_proj" in key: - key = key.replace("opt_proj", "language_projection") - if "t5_proj" in key: - key = key.replace("t5_proj", "language_projection") - if key.startswith("opt"): - key = key.replace("opt", "language") - if key.startswith("t5"): - key = key.replace("t5", "language") - state_dict[key] = val - - # read in qv biases - 
read_in_q_v_bias(state_dict, config) - - missing_keys, unexpected_keys = hf_model.load_state_dict(state_dict, strict=False) - assert len(missing_keys) == 0 - - if "itm" in model_name: - unexpected_keys = list(filter(lambda x: not x.startswith("Qformer.cls"), unexpected_keys)) - assert unexpected_keys == ["temp", "qformer.embeddings.position_ids"] - else: - assert unexpected_keys == ["qformer.embeddings.position_ids"] - - image = load_demo_image() - original_pixel_values = vis_processors["eval"](image).unsqueeze(0).to(lavis_device) - - # create processor - image_processor = BlipImageProcessor( - size={"height": image_size, "width": image_size}, image_mean=OPENAI_CLIP_MEAN, image_std=OPENAI_CLIP_STD - ) - processor = Blip2Processor(image_processor=image_processor, tokenizer=tokenizer) - pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(hf_model_device) - - # make sure processor creates exact same pixel values - assert torch.allclose(pixel_values, original_pixel_values.to(pixel_values.device)) - - original_model.to(lavis_device) - hf_model.to(hf_model_device) - - if "itm" in model_name: - caption = "a large fountain spewing water into the air" - input_ids = tokenizer([caption], return_tensors="pt").input_ids.to(hf_model_device) - attention_mask = processor(text=caption, return_tensors="pt").attention_mask.to(hf_model_device) - - with torch.no_grad(): - original_logits = original_model( - {"image": original_pixel_values, "text_input": [caption]}, match_head="itm" - ) - logits = hf_model( - pixel_values=pixel_values, - input_ids=input_ids, - attention_mask=attention_mask, - use_image_text_matching_head=True, - ) - - assert original_logits.shape == logits.logits_per_image.shape - print("First values of original logits:", original_logits[0, :3]) - print("First values of HF logits:", logits.logits_per_image[0, :3]) - - # assert values - # cast to same type - target_dtype = logits.logits_per_image.dtype - assert torch.allclose(original_logits.to(target_dtype), logits.logits_per_image, atol=1e-4) - - original_itm_scores = torch.nn.functional.softmax(original_logits, dim=1) - itm_scores = torch.nn.functional.softmax(logits.logits_per_image, dim=1) - assert torch.allclose(original_itm_scores.to(target_dtype), itm_scores, atol=1e-4) - print("Looks ok!") - - with torch.no_grad(): - original_logits = original_model( - {"image": original_pixel_values, "text_input": [caption]}, match_head="itc" - ) - logits = hf_model( - pixel_values=pixel_values, - input_ids=input_ids, - attention_mask=attention_mask, - use_image_text_matching_head=False, - ) - - assert original_logits.shape == logits.logits_per_image.shape - print("First values of original logits:", original_logits[0, :3]) - print("First values of HF logits:", logits.logits_per_image[0, :3]) - - # assert values - # cast to same type - target_dtype = logits.logits_per_image.dtype - assert torch.allclose(original_logits.to(target_dtype), logits.logits_per_image, atol=1e-4) - print("Looks ok!") - - else: - input_ids = tokenizer(["\n"], return_tensors="pt").input_ids.to(hf_model_device) - - with torch.no_grad(): - if "opt" in model_name: - original_logits = original_model({"image": original_pixel_values, "text_input": [""]}).logits - logits = hf_model(pixel_values, input_ids).logits - else: - original_logits = original_model( - {"image": original_pixel_values, "text_input": ["\n"], "text_output": ["\n"]} - ).logits - labels = input_ids.masked_fill(input_ids == tokenizer.pad_token_id, -100) - logits = hf_model(pixel_values, 
input_ids, labels=labels).logits - - assert original_logits.shape == logits.shape - print("First values of original logits:", original_logits[0, :3, :3]) - print("First values of HF logits:", logits[0, :3, :3]) - - # assert values - assert torch.allclose(original_logits.to(logits.device), logits, atol=1e-4) - print("Looks ok!") - - print("Generating a caption...") - prompt = "Question: what object is in this image? Answer:" - input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(hf_model_device) - - set_seed(42) - - original_outputs = original_model.generate( - {"image": original_pixel_values, "prompt": prompt}, use_nucleus_sampling=True, max_length=50 - ) - outputs = hf_model.generate( - pixel_values, - input_ids, - do_sample=True, - num_beams=5, - max_length=30, - min_length=1, - top_p=0.9, - repetition_penalty=1.0, - length_penalty=1.0, - temperature=1, - ) - output_text = processor.batch_decode(outputs, skip_special_tokens=True) - output_text = [text.strip() for text in output_text] - print("Original generation:", original_outputs) - print("HF generation:", output_text) - - if pytorch_dump_folder_path is not None: - processor.save_pretrained(pytorch_dump_folder_path) - hf_model.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - processor.push_to_hub(f"nielsr/{model_name}") - hf_model.push_to_hub(f"nielsr/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - choices = [ - "blip2-opt-2.7b", - "blip2-opt-6.7b", - "blip2-opt-2.7b-coco", - "blip2-opt-6.7b-coco", - "blip2-flan-t5-xl", - "blip2-flan-t5-xl-coco", - "blip2-flan-t5-xxl", - "blip2-itm-vit-g", - "blip2-itm-vit-g-coco", - ] - parser.add_argument( - "--model_name", - default="blip2-opt-2.7b", - choices=choices, - type=str, - help="Path to hf config.json of model to convert", - ) - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model and processor to the hub after converting", - ) - # note: this script is tested on 2 GPUs, as models are compared in float32, - # which requires quite some memory. Hence loading both on a - # separate device is the easiest to compare - parser.add_argument( - "--lavis_device", default="cpu", type=str, help="Torch device to run the conversion, either cpu or cuda." - ) - parser.add_argument( - "--hf_model_device", default="cpu", type=str, help="Torch device to run the conversion, either cpu or cuda." - ) - - args = parser.parse_args() - - convert_blip2_checkpoint( - args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.lavis_device, args.hf_model_device - ) diff --git a/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py b/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py deleted file mode 100644 index 148706176b12..000000000000 --- a/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py +++ /dev/null @@ -1,254 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert BigScience BLOOM checkpoint.""" - -import argparse -import json -import os -import re - -import torch - -from transformers import BloomConfig, BloomModel -from transformers.file_utils import CONFIG_NAME, WEIGHTS_NAME -from transformers.utils import logging - - -logging.set_verbosity_info() - -WEIGHTS_TO_AVERAGE_ENDSWITH = [ - "word_embeddings_layernorm.weight", - "word_embeddings_layernorm.bias", - "input_layernorm.weight", - "input_layernorm.bias", - "post_attention_layernorm.weight", - "post_attention_layernorm.bias", - "self_attention.dense.bias", - "mlp.dense_4h_to_h.bias", - "ln_f.weight", - "ln_f.bias", -] - -WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN = [ - "mlp.dense_4h_to_h.weight", - "self_attention.dense.weight", -] - - -def layer_name_mapping(key, file): - """Convert Megatron-DeepSpeed TP/PP weights mapping in transformers PP only""" - # Handle first and last layers - layer_rename_map = { - "word_embeddings.weight": "word_embeddings.weight", - "word_embeddings.norm.weight": "word_embeddings_layernorm.weight", - "word_embeddings.norm.bias": "word_embeddings_layernorm.bias", - "weight": "ln_f.weight", - "bias": "ln_f.bias", - } - - if key in layer_rename_map: - return layer_rename_map[key] - - # Handle transformer blocks - layer_number = int(re.match(r".*layer_(\d*).*", file)[1]) - layer_number -= 3 - return f"h.{layer_number}." 
+ key - - -def get_dtype_size(dtype): - if dtype == torch.bool: - return 1 / 8 - bit_search = re.search(r"[^\d](\d+)$", str(dtype)) - if bit_search is None: - raise ValueError(f"`dtype` is not a valid dtype: {dtype}.") - bit_size = int(bit_search.groups()[0]) - return bit_size // 8 - - -def convert_bloom_checkpoint_to_pytorch( - bloom_checkpoint_path, bloom_config_file, pytorch_dump_folder_path, shard_model, pretraining_tp -): - # Construct model - if bloom_config_file == "": - config = BloomConfig() - else: - config = BloomConfig.from_json_file(bloom_config_file) - - if shard_model: - file_names = os.listdir(bloom_checkpoint_path) - file_names = sorted(filter(lambda s: s.startswith("layer") and "model_00" in s, file_names)) - - index_dict = {"weight_map": {}, "metadata": {}} - total_size = 0 - - missing_keys = None - - config = BloomConfig() - - for j, file in enumerate(file_names): - print(f"Processing file: {file}") - tensors = None - - for i in range(pretraining_tp): - # load all TP files - f_name = file.replace("model_00", f"model_0{i}") - temp = torch.load(os.path.join(bloom_checkpoint_path, f_name), map_location="cpu", weights_only=True) - - # Rename keys in the transformers names - keys = list(temp.keys()) - for key in keys: - temp[layer_name_mapping(key, file)] = temp.pop(key) - - if tensors is None: - tensors = temp - else: - for key in tensors: - if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH): - # We average (sum and then divide) some weights across TP ranks (see https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/olruwase/sync_layer_norms/megatron/training.py#L425) - tensors[key] += temp[key] - else: - # Some weights are RowParallelLinear in Megatron-Deepspeed, others are ColumnParallel - cat_dim = 1 if any(text in key for text in WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN) else 0 - # We concatenate these weights across TP ranks - tensors[key] = torch.cat([tensors[key], temp[key]], dim=cat_dim) - - # Divide by the number of TP the weights we want to average - for key in tensors: - if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH): - tensors[key] = tensors[key] / pretraining_tp - torch.save( - tensors, - os.path.join( - pytorch_dump_folder_path, - f"pytorch_model_{str(j + 1).zfill(5)}-of-{str(len(file_names)).zfill(5)}.bin", - ), - ) - - for key in tensors: - value = tensors[key] - total_size += value.numel() * get_dtype_size(value.dtype) - if key not in index_dict["weight_map"]: - index_dict["weight_map"][key] = ( - f"pytorch_model_{str(j + 1).zfill(5)}-of-{str(len(file_names)).zfill(5)}.bin" - ) - - config = BloomConfig() - pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME - index_dict["metadata"]["total_size"] = total_size - with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: - f.write(config.to_json_string()) - with open(os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME + ".index.json"), "w", encoding="utf-8") as f: - json_config = json.dumps(index_dict, indent=2, sort_keys=True) + "\n" - f.write(json_config) - else: - model = BloomModel(config) - - file_names = os.listdir(bloom_checkpoint_path) - file_names = sorted(filter(lambda s: s.startswith("layer") and "model_00" in s, file_names)) - - missing_keys = None - for i, file in enumerate(file_names): - tensors = None - for i in range(pretraining_tp): - # load all TP files - f_name = file.replace("model_00", f"model_0{i}") - temp = torch.load(os.path.join(bloom_checkpoint_path, f_name), map_location="cpu", weights_only=True) - - # Rename keys in the 
transformers names - keys = list(temp.keys()) - for key in keys: - temp[layer_name_mapping(key, file)] = temp.pop(key) - - if tensors is None: - tensors = temp - else: - for key in tensors: - # We average (sum and then divide) some weights across TP ranks (see https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/olruwase/sync_layer_norms/megatron/training.py#L425) - if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH): - tensors[key] += temp[key] - else: - # Some weights are RowParallelLinear in Megatron-Deepspeed, others are ColumnParallel - cat_dim = 1 if any(text in key for text in WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN) else 0 - # We concatenate these weights across TP ranks - tensors[key] = torch.cat([tensors[key], temp[key]], dim=cat_dim) - - # Divide by the number of TP the weights we want to average - for key in tensors: - if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH): - tensors[key] = tensors[key] / pretraining_tp - - other_keys = model.load_state_dict(tensors, strict=False) - assert not other_keys.unexpected_keys, f"The keys {other_keys.unexpected_keys} are unexpected" - if missing_keys is None: - missing_keys = set(other_keys.missing_keys) - else: - missing_keys = missing_keys.intersection(set(other_keys.missing_keys)) - - assert not missing_keys, f"The keys {missing_keys} are missing" - - # Save pytorch-model - os.makedirs(pytorch_dump_folder_path, exist_ok=True) - pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME - pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME - print(f"Save PyTorch model to {pytorch_weights_dump_path} with dtype {config.dtype}") - if config.dtype is not None: - model = model.to(config.dtype) - torch.save(model.state_dict(), pytorch_weights_dump_path) - print(f"Save configuration file to {pytorch_config_dump_path}") - with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: - f.write(config.to_json_string()) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--bloom_checkpoint_path", - default=None, - type=str, - required=True, - help="Path to the Megatron-LM checkpoint path.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--bloom_config_file", - default="", - type=str, - help=( - "An optional config json file corresponding to the pre-trained model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument( - "--shard_model", - action="store_true", - help="An optional setting to shard the output model \nThis enables sharding the converted checkpoint", - ) - parser.add_argument( - "--pretraining_tp", - default=4, - type=int, - help="Pretraining TP rank that has been used when training the model in Megatron-LM \n", - ) - args = parser.parse_args() - convert_bloom_checkpoint_to_pytorch( - args.bloom_checkpoint_path, - args.bloom_config_file, - args.pytorch_dump_folder_path, - args.shard_model, - args.pretraining_tp, - ) diff --git a/src/transformers/models/blt/__init__.py b/src/transformers/models/blt/__init__.py new file mode 100644 index 000000000000..703b81ecdd09 --- /dev/null +++ b/src/transformers/models/blt/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_blt import * + from .modeling_blt import * + from .tokenization_blt import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/blt/configuration_blt.py b/src/transformers/models/blt/configuration_blt.py new file mode 100644 index 000000000000..0bc6718e5bd1 --- /dev/null +++ b/src/transformers/models/blt/configuration_blt.py @@ -0,0 +1,423 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Blt model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class BltLocalEncoderConfig(PretrainedConfig): + """ + Configuration class for the Blt Local Encoder component. + """ + + model_type = "blt_local_encoder" + + def __init__( + self, + vocab_size=260, + cross_attn_all_layers=False, + cross_attn_k=2, + hidden_size_global=2048, + hidden_size=1024, + num_attention_heads=16, + num_key_value_heads=None, + num_hidden_layers=1, + rms_norm_eps=1e-5, + dropout=0.0, + max_position_embeddings=24576, + rope_theta=500000.0, + rope_scaling=None, + hidden_act="silu", + intermediate_size=2816, + initializer_range=0.02, + **kwargs, + ): + self.vocab_size = vocab_size + self.cross_attn_all_layers = cross_attn_all_layers + self.cross_attn_k = cross_attn_k + self.hidden_size_global = hidden_size_global + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads or num_attention_heads + self.head_dim = hidden_size // num_attention_heads + self.intermediate_size = intermediate_size or int(8 * hidden_size / 3) + self.num_hidden_layers = num_hidden_layers + self.rms_norm_eps = rms_norm_eps + self.dropout = dropout + self.max_position_embeddings = max_position_embeddings + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.hidden_act = hidden_act + self.initializer_range = initializer_range + + # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error + kwargs.pop("tie_word_embeddings", None) + super().__init__(**kwargs, tie_word_embeddings=False) + + +class BltLocalDecoderConfig(PretrainedConfig): + """ + Configuration class for the Blt Local Decoder component. 
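The local encoder config above (and the decoder and global configs that follow) derive several fields rather than requiring them: `num_key_value_heads` falls back to `num_attention_heads`, `head_dim` is `hidden_size // num_attention_heads`, and `intermediate_size` falls back to `int(8 * hidden_size / 3)` only when it is passed as `None`. A minimal sketch of those fallbacks, assuming a transformers build that already contains this module:

```python
from transformers.models.blt.configuration_blt import BltLocalEncoderConfig

# Signature defaults above: hidden_size=1024, num_attention_heads=16, intermediate_size=2816.
cfg = BltLocalEncoderConfig()
print(cfg.head_dim)             # 1024 // 16 == 64
print(cfg.num_key_value_heads)  # falls back to num_attention_heads == 16
print(cfg.intermediate_size)    # 2816 (the explicit default wins over the 8/3 rule)

# Passing intermediate_size=None triggers the fallback: int(8 * 1024 / 3) == 2730.
print(BltLocalEncoderConfig(intermediate_size=None).intermediate_size)
```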
+ """ + + model_type = "blt_local_decoder" + + def __init__( + self, + vocab_size=260, + cross_attn_all_layers=True, + cross_attn_k=2, + hidden_size_global=2048, + hidden_size=1024, + num_attention_heads=16, + num_key_value_heads=None, + num_hidden_layers=9, + rms_norm_eps=1e-5, + dropout=0.0, + max_position_embeddings=24576, + rope_theta=500000.0, + rope_scaling=None, + hidden_act="silu", + intermediate_size=2816, + initializer_range=0.02, + **kwargs, + ): + self.vocab_size = vocab_size + self.cross_attn_all_layers = cross_attn_all_layers + self.cross_attn_k = cross_attn_k + self.hidden_size_global = hidden_size_global + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads or num_attention_heads + self.head_dim = hidden_size // num_attention_heads + self.intermediate_size = intermediate_size or int(8 * hidden_size / 3) + self.num_hidden_layers = num_hidden_layers + self.rms_norm_eps = rms_norm_eps + self.dropout = dropout + self.max_position_embeddings = max_position_embeddings + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.hidden_act = hidden_act + self.initializer_range = initializer_range + + # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error + kwargs.pop("tie_word_embeddings", None) + super().__init__(**kwargs, tie_word_embeddings=False) + + +class BltGlobalTransformerConfig(PretrainedConfig): + """ + Configuration class for the Blt Global Transformer component. + """ + + model_type = "blt_global_transformer" + + def __init__( + self, + hidden_size=2048, + num_attention_heads=16, + num_key_value_heads=None, + num_hidden_layers=25, + rms_norm_eps=1e-5, + dropout=0.0, + max_position_embeddings=4096, + rope_theta=500000.0, + rope_scaling=None, + hidden_act="silu", + intermediate_size=5632, + initializer_range=0.02, + **kwargs, + ): + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads or num_attention_heads + self.head_dim = hidden_size // num_attention_heads + self.intermediate_size = intermediate_size or int(8 * hidden_size / 3) + self.num_hidden_layers = num_hidden_layers + self.rms_norm_eps = rms_norm_eps + self.dropout = dropout + self.max_position_embeddings = max_position_embeddings + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.hidden_act = hidden_act + self.initializer_range = initializer_range + + # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error + kwargs.pop("tie_word_embeddings", None) + super().__init__(**kwargs, tie_word_embeddings=False) + + +class BltPatcherConfig(PretrainedConfig): + r""" + Configuration class for the Blt Patcher/Entropy model component. + + Args: + vocab_size (`int`, *optional*, defaults to 260): + Vocabulary size of the Blt patcher model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling the patcher model. + hidden_size (`int`, *optional*, defaults to 768): + Dimension of the hidden representations. + num_hidden_layers (`int`, *optional*, defaults to 14): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. 
If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to + `num_attention_heads`. + max_position_embeddings (`int`, *optional*, defaults to 8192): + The maximum sequence length that this model might ever be used with. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + intermediate_size (`int`, *optional*, defaults to 2048): + Dimension of the MLP representations. + rope_scaling (`dict`, *optional*): + Dictionary containing the RoPE scaling configuration. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + """ + + model_type = "blt_patcher" + + def __init__( + self, + vocab_size=260, + hidden_size=768, + num_hidden_layers=14, + num_attention_heads=12, + num_key_value_heads=None, + max_position_embeddings=8192, + rms_norm_eps=1e-5, + dropout=0.0, + rope_theta=10000.0, + intermediate_size=2048, + rope_scaling=None, + initializer_range=0.02, + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.head_dim = hidden_size // num_attention_heads + self.num_key_value_heads = num_key_value_heads if num_key_value_heads is not None else num_attention_heads + self.max_position_embeddings = max_position_embeddings + self.rms_norm_eps = rms_norm_eps + self.dropout = dropout + self.rope_theta = rope_theta + self.hidden_act = "silu" # Blt uses silu activation + self.intermediate_size = intermediate_size or int(8 * self.hidden_size / 3) + self.rope_scaling = rope_scaling + self.initializer_range = initializer_range + + # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error + kwargs.pop("tie_word_embeddings", None) + super().__init__(**kwargs, tie_word_embeddings=False) + + +class BltConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BltModel`]. It is used to instantiate a + Blt model according to the specified arguments, defining the model architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 260): + Vocabulary size of the Blt model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BltModel`]. + max_position_embeddings (`int`, *optional*, defaults to 4096): + The maximum sequence length that this model might ever be used with. + patch_in_forward (`bool`, *optional*, defaults to `True`): + Whether to perform patching during the forward pass. + patch_size (`int`, *optional*, defaults to 4): + Size of the patches used in the patching mechanism. 
+ patching_mode (`str`, *optional*, defaults to `"entropy"`): + The mode used for patching, such as entropy-based patching. + patching_threshold (`float`, *optional*, defaults to 1.34): + Threshold value used for determining when to apply patches. + patching_batch_size (`int`, *optional*, defaults to 1): + Batch size used during the patching process. + max_patch_length (`int`, *optional*): + Maximum length of patches that can be generated. + cross_attn_k (`int`, *optional*, defaults to 2): + Number of cross-attention heads used in the model. + encoder_hash_byte_group_size (`list`, *optional*): + List of byte group sizes used in the encoder hash function. + encoder_hash_byte_group_vocab (`int`, *optional*, defaults to 500002): + Vocabulary size for the encoder hash byte groups. + encoder_hash_byte_group_nb_functions (`int`, *optional*, defaults to 1): + Number of hash functions used in the encoder byte grouping. + patcher_config (`BltPatcherConfig`, *optional*): + Configuration for the patcher component of the model. + encoder_config (`BltLocalEncoderConfig`, *optional*): + Configuration for the local encoder component of the model. + decoder_config (`BltLocalDecoderConfig`, *optional*): + Configuration for the local decoder component of the model. + global_config (`BltGlobalTransformerConfig`, *optional*): + Configuration for the global transformer component of the model. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rope_theta (`float`, *optional*, defaults to 500000.0): + The base period of the RoPE embeddings. + rope_scaling (`dict`, *optional*): + Dictionary containing the RoPE scaling configuration. 
+ + ```python + >>> from transformers import BltModel, BltConfig + + >>> # Initializing a Blt configuration + >>> configuration = BltConfig() + + >>> # Initializing a model from the configuration + >>> model = BltModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + + Checkpoint: [facebook/blt](https://huggingface.co/facebook/blt) + """ + + model_type = "blt" + keys_to_ignore_at_inference = ["past_key_values"] + sub_configs = { + "patcher_config": BltPatcherConfig, + "encoder_config": BltLocalEncoderConfig, + "decoder_config": BltLocalDecoderConfig, + "global_config": BltGlobalTransformerConfig, + } + + def __init__( + self, + vocab_size=260, + max_position_embeddings=4096, + patch_in_forward=True, + patch_size=4, + patching_mode="entropy", + patching_threshold=1.335442066192627, + patching_batch_size=1, + max_patch_length=None, + cross_attn_k=2, + encoder_hash_byte_group_size=None, + encoder_hash_byte_group_vocab=500002, + encoder_hash_byte_group_nb_functions=1, + patcher_config=None, + encoder_config=None, + decoder_config=None, + global_config=None, + tie_word_embeddings=False, + initializer_range=0.02, + rope_theta=500000.0, + rope_scaling=None, + **kwargs, + ): + # Basic model configuration + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + + # Patching configuration + self.patch_in_forward = patch_in_forward + self.patch_size = patch_size + self.patching_mode = patching_mode + self.patching_threshold = patching_threshold + self.patching_batch_size = patching_batch_size + self.max_patch_length = max_patch_length + self.patching_device = kwargs.get("patching_device", "cuda") + self.realtime_patching = kwargs.get("realtime_patching", True) + self.patching_threshold_add = kwargs.get("patching_threshold_add") + self.monotonicity = kwargs.get("monotonicity", False) + + # Cross attention configurations + self.cross_attn_k = cross_attn_k + + # Encoder configurations + self.encoder_hash_byte_group_size = encoder_hash_byte_group_size or [3, 4, 5, 6, 7, 8] + self.encoder_hash_byte_group_vocab = encoder_hash_byte_group_vocab + self.encoder_hash_byte_group_nb_functions = encoder_hash_byte_group_nb_functions + + # Initialize component configurations + if patcher_config is None: + self.patcher_config = BltPatcherConfig(initializer_range=initializer_range) + logger.info("patcher_config is None, using default Blt patcher config") + elif isinstance(patcher_config, dict): + patcher_config.setdefault("initializer_range", initializer_range) + self.patcher_config = BltPatcherConfig(**patcher_config) + elif isinstance(patcher_config, BltPatcherConfig): + self.patcher_config = patcher_config + + if encoder_config is None: + self.encoder_config = BltLocalEncoderConfig(initializer_range=initializer_range) + logger.info("encoder_config is None, using default Blt encoder config") + elif isinstance(encoder_config, dict): + encoder_config.setdefault("initializer_range", initializer_range) + self.encoder_config = BltLocalEncoderConfig(**encoder_config) + elif isinstance(encoder_config, BltLocalEncoderConfig): + self.encoder_config = encoder_config + + if decoder_config is None: + self.decoder_config = BltLocalDecoderConfig(initializer_range=initializer_range) + logger.info("decoder_config is None, using default Blt decoder config") + elif isinstance(decoder_config, dict): + 
decoder_config.setdefault("initializer_range", initializer_range) + self.decoder_config = BltLocalDecoderConfig(**decoder_config) + elif isinstance(decoder_config, BltLocalDecoderConfig): + self.decoder_config = decoder_config + + if global_config is None: + self.global_config = BltGlobalTransformerConfig(initializer_range=initializer_range) + logger.info("global_config is None, using default Blt global config") + elif isinstance(global_config, dict): + global_config.setdefault("initializer_range", initializer_range) + self.global_config = BltGlobalTransformerConfig(**global_config) + elif isinstance(global_config, BltGlobalTransformerConfig): + self.global_config = global_config + + # Determine if token embedding projection is needed based on dimension mismatch (7b) + encoder_cross_output_size = self.encoder_config.hidden_size * self.cross_attn_k + self.global_config.encoder_cross_output_size = ( + encoder_cross_output_size if encoder_cross_output_size != self.global_config.hidden_size else None + ) + + # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error + kwargs.pop("tie_word_embeddings", None) + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + + +__all__ = [ + "BltConfig", + "BltPatcherConfig", + "BltLocalEncoderConfig", + "BltLocalDecoderConfig", + "BltGlobalTransformerConfig", +] diff --git a/src/transformers/models/blt/modeling_blt.py b/src/transformers/models/blt/modeling_blt.py new file mode 100644 index 000000000000..1e677dda4a98 --- /dev/null +++ b/src/transformers/models/blt/modeling_blt.py @@ -0,0 +1,1311 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/blt/modular_blt.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_blt.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
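`BltConfig` accepts each sub-config either as a config instance or as a plain dict; dicts are upgraded to the matching config class with `initializer_range` propagated into them, and `encoder_cross_output_size` is set on the global config only when `encoder_config.hidden_size * cross_attn_k` differs from the global hidden size. A minimal sketch, assuming a transformers build that contains this module:

```python
from transformers.models.blt.configuration_blt import BltConfig

config = BltConfig(
    cross_attn_k=2,
    encoder_config={"hidden_size": 1024, "num_hidden_layers": 1},
    global_config={"hidden_size": 2048, "num_hidden_layers": 25},
)
print(type(config.encoder_config).__name__)            # BltLocalEncoderConfig, built from the dict
print(config.global_config.encoder_cross_output_size)  # None: 1024 * 2 already matches 2048
```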
+ +from typing import Callable, Optional, Union + +import torch +import torch.distributions +import torch.nn as nn +import torch.nn.functional as F + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...masking_utils import create_causal_mask +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple +from ...utils.deprecation import deprecate_kwarg +from ...utils.generic import OutputRecorder, check_model_inputs +from .configuration_blt import ( + BltConfig, + BltGlobalTransformerConfig, + BltLocalDecoderConfig, + BltLocalEncoderConfig, + BltPatcherConfig, +) + + +class BltMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + # Ignore copy + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +class BltRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + BltRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class BltRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: BltConfig, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.repeat_interleave(freqs, 2, dim=-1) # diff from Llama: we interleave() instead of cat() + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +# Modified from transformers.models.llama.modeling_llama.LlamaDecoderLayer +class BltTransformerLayer(GradientCheckpointingLayer): + def __init__(self, config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = BltSelfAttention(config=config, layer_idx=layer_idx) + self.mlp = BltMLP(config) + self.input_layernorm = BltRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = BltRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.layer_idx = layer_idx + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + cross_attention_states: Optional[torch.Tensor] = None, + cross_attention_mask: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + full_text_row_masked_out_mask: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_values (`Cache`, *optional*): cached past key and value projection states + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence + position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*): + Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, + with `head_dim` being the embedding dimension of each attention head. 
+ kwargs (`dict`, *optional*): + Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code + into the model + """ + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs: Unpack[TransformersKwargs], +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +def rotate_half(x): + # Split and rotate. Note that this function is different from e.g. Llama. + x1 = x[..., ::2] + x2 = x[..., 1::2] + rot_x = torch.stack([-x2, x1], dim=-1).flatten(-2) + return rot_x + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. 
Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class BltSelfAttention(nn.Module): + def __init__(self, config: BltConfig, layer_idx: int): + super().__init__() + self.config = config + self.num_heads = config.num_attention_heads + self.dropout = config.dropout + self.hidden_size = config.hidden_size + self.num_key_value_heads = config.num_key_value_heads + self.head_dim = config.hidden_size // self.num_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.scaling = self.head_dim**-0.5 + self.rope_theta = config.rope_theta + self.layer_idx = layer_idx + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + self.is_causal = True + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_embeddings: torch.Tensor, + use_cache: bool = False, + past_key_values=None, + cache_position=None, + **kwargs, + ): + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() + attn_output = self.o_proj(attn_output) + + return attn_output, attn_weights + + +class BltCrossAttention(nn.Module): + """Cross-attention module for Blt, following transformers style""" + + def __init__(self, config: BltConfig, layer_idx: int, hidden_size: Optional[int] = None): + super().__init__() + self.config = config + self.num_heads = self.config.num_attention_heads + self.num_key_value_heads = self.config.num_key_value_heads + self.dropout = config.dropout + self.hidden_size = 
config.hidden_size + self.head_dim = config.hidden_size // self.num_heads + self.layer_idx = layer_idx + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.scaling = self.head_dim**-0.5 + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + self.q_norm = BltRMSNorm(self.hidden_size, eps=config.rms_norm_eps) + self.k_norm = BltRMSNorm(self.hidden_size, eps=config.rms_norm_eps) + self.is_causal = False + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + cross_attention_states: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.Tensor] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + bsz, q_len, _ = hidden_states.size() + query_states = self.q_norm(hidden_states) + query_states = self.q_proj(query_states) + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + if cross_attention_states is not None: + cross_attention_states = self.k_norm(cross_attention_states) + key_states = self.k_proj(cross_attention_states) + value_states = self.v_proj(cross_attention_states) + key_states = key_states.view(bsz, -1, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, -1, self.num_key_value_heads, self.head_dim).transpose(1, 2) + if past_key_values is not None: + key_states, value_states = past_key_values.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + elif cache_position[0] != 0: + key_states, value_states = ( + past_key_values.layers[self.layer_idx].keys, + past_key_values.layers[self.layer_idx].values, + ) + else: + raise ValueError( + "Cross attention layer can't find neither `cross_attn_states` nor cached values for key/values!" 
+ ) + attention_interface: Callable = eager_attention_forward + + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.dropout, + scaling=self.scaling, + **kwargs, + ) + attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() + attn_output = self.o_proj(attn_output) + attn_output = attn_output + hidden_states + return attn_output, attn_weights + + +@auto_docstring +class BltPreTrainedModel(PreTrainedModel): + config: BltConfig + base_model_prefix = "" + supports_gradient_checkpointing = True + _no_split_modules = ["BltTransformerLayer"] + _can_compile_fullgraph = False # static cache cannot have different shapes for each layer + _supports_sdpa = True + _supports_flash_attn = False + _supports_flex_attn = False + _supports_attention_backend = False + _can_record_outputs = { + "hidden_states": OutputRecorder(BltTransformerLayer, index=0, layer_name="local_decoder"), + "attentions": OutputRecorder(BltSelfAttention, index=1, layer_name="local_decoder"), + } + + +class BltLocalEncoder(BltPreTrainedModel): + config: BltLocalEncoderConfig + _can_record_outputs = { + "encoder_attentions": OutputRecorder(BltSelfAttention, index=1, layer_name="local_encoder"), + } + + def __init__(self, config: BltLocalEncoderConfig): + super().__init__(config) + self.gradient_checkpointing = False + self.config = config + self.layers = nn.ModuleList( + [BltTransformerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.rotary_emb = BltRotaryEmbedding(config=config) + self.patch_embedding_projection = nn.Linear( + in_features=config.hidden_size, + out_features=config.hidden_size * config.cross_attn_k, + bias=False, + ) + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size) + self.cross_attn_layers = nn.ModuleList() + layers_to_add = config.num_hidden_layers if config.cross_attn_all_layers else 1 + for layer_idx in range(layers_to_add): + self.cross_attn_layers.append( + BltCrossAttention(config=config, layer_idx=layer_idx, hidden_size=config.hidden_size) + ) + + self.post_init() + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + patch_embeds: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + num_patches: Optional[int] = None, + patch_ids: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ): + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + batch_size = inputs_embeds.shape[0] + hidden_states = F.dropout(inputs_embeds, p=self.config.dropout, training=self.training) + + if position_ids is None: + position_ids = ( + torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device).unsqueeze(0).expand(batch_size, -1) + ) + + position_embeddings = self.rotary_emb(hidden_states, position_ids) + hidden_states = F.dropout(hidden_states, p=self.config.dropout, training=self.training) + + for idx, layer in enumerate(self.layers): + hidden_states = layer( + hidden_states, + position_embeddings=position_embeddings, + attention_mask=attention_mask, + 
past_key_values=past_key_values, + cache_position=cache_position, + **kwargs, + ) + if idx == len(self.layers) - 1 or self.config.cross_attn_all_layers: + patch_embeds = self.patch_reduce(hidden_states, num_patches, patch_ids) + patch_embeds = self.patch_embedding_projection(patch_embeds) + patch_embeds = patch_embeds.reshape( + batch_size, patch_embeds.shape[1] * self.config.cross_attn_k, self.config.hidden_size + ) + layer_idx = idx if self.config.cross_attn_all_layers else 0 + cross_attention_output, _ = self.cross_attn_layers[layer_idx]( + hidden_states=patch_embeds, + cross_attention_states=hidden_states, + attention_mask=encoder_attention_mask, + **kwargs, + ) + patch_embeds = patch_embeds + cross_attention_output + encoder_cross_states = patch_embeds + return hidden_states, encoder_cross_states + + def patch_reduce(self, hidden_states, max_num_patches, patch_ids): + """ + Reduce variable length patches to single embedding per patch + Note: this works with variable number of patches for different sequences in the batch + It handles variable length patches by assuming that patch_lengths will be 0 for any + extra patches on the *right*. Since there can be a variable number of patches + this function also return the number of patches for each sequence in the batch. + Any embeddings on the right that are not allocated to a patch + (i.e. if the sum(patch_lengths[i]) < seq_len for any i) + will be sent to a dummy patch, which is trimmed before returning. + """ + batch_size = hidden_states.shape[0] + embedding_dim = hidden_states.shape[-1] + + patch_ids = patch_ids.unsqueeze(-1).expand(-1, -1, hidden_states.shape[-1]) + + reduced_embeddings = torch.zeros( + (batch_size, max_num_patches, embedding_dim), dtype=hidden_states.dtype, device=hidden_states.device + ) + + reduced_embeddings = reduced_embeddings.scatter_reduce( + src=hidden_states, + dim=1, + index=patch_ids, + reduce="amax", + include_self=False, + ) + reduced_embeddings = reduced_embeddings[:, :max_num_patches, :] + + return reduced_embeddings + + +class BltLocalDecoder(BltPreTrainedModel): + config: BltLocalDecoderConfig + + def __init__(self, config: BltLocalDecoderConfig): + super().__init__(config) + self.gradient_checkpointing = False + self.config = config + self.cross_attn_decoder = True + self.layers = nn.ModuleList( + [BltTransformerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.rotary_emb = BltRotaryEmbedding(config=config) + self.patch_embedding_projection = nn.Linear( + in_features=config.hidden_size_global, + out_features=config.hidden_size * config.cross_attn_k, + bias=False, + ) + self.norm = BltRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.cross_attn_layers = nn.ModuleList() + layers_to_add = config.num_hidden_layers if config.cross_attn_all_layers else 1 + for layer_idx in range(layers_to_add): + self.cross_attn_layers.append( + BltCrossAttention(config=config, layer_idx=layer_idx, hidden_size=config.hidden_size) + ) + + self.post_init() + + @check_model_inputs + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + patch_embeds: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ): + batch_size = inputs_embeds.shape[0] + 
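`patch_reduce` above pools a variable number of byte positions into a single embedding per patch: each position is routed to its patch id and `scatter_reduce(..., reduce="amax", include_self=False)` keeps the element-wise maximum within every patch. A standalone sketch of that pooling step, with made-up toy values:

```python
import torch

hidden_states = torch.arange(12, dtype=torch.float32).reshape(1, 6, 2)  # [batch, seq_len, dim]
patch_ids = torch.tensor([[0, 0, 1, 1, 1, 2]])                          # 3 patches over 6 byte positions
max_num_patches = 3

index = patch_ids.unsqueeze(-1).expand(-1, -1, hidden_states.shape[-1])
reduced = torch.zeros((1, max_num_patches, 2)).scatter_reduce(
    src=hidden_states, dim=1, index=index, reduce="amax", include_self=False
)
print(reduced)  # [[[2., 3.], [8., 9.], [10., 11.]]]: max over positions {0, 1}, {2, 3, 4}, {5}
```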
hidden_states = inputs_embeds + patch_embeds = self.patch_embedding_projection(patch_embeds) + patch_embeds = patch_embeds.reshape( + batch_size, patch_embeds.shape[1] * self.config.cross_attn_k, self.config.hidden_size + ) + + if patch_embeds is not None and not self.cross_attn_decoder: + hidden_states = hidden_states + patch_embeds + + if position_ids is None: + position_ids = ( + torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device).unsqueeze(0).expand(batch_size, -1) + ) + + position_embeddings = self.rotary_emb(hidden_states, position_ids) + hidden_states = F.dropout(hidden_states, p=self.config.dropout, training=self.training) + + for i, layer in enumerate(self.layers): + if i == 0 or self.config.cross_attn_all_layers: + cross_attention_output, _ = self.cross_attn_layers[i]( + hidden_states=hidden_states, + cross_attention_states=patch_embeds, + attention_mask=encoder_attention_mask, + **kwargs, + ) + hidden_states = hidden_states + cross_attention_output + hidden_states = layer( + hidden_states, + position_embeddings=position_embeddings, + attention_mask=attention_mask, + past_key_values=past_key_values, + cache_position=cache_position, + **kwargs, + ) + logits = self.norm(hidden_states) + return logits + + +class BltGlobalTransformer(BltPreTrainedModel): + config: BltGlobalTransformerConfig + _can_record_outputs = { + "global_attentions": OutputRecorder(BltSelfAttention, index=1, layer_name="global_transformer"), + } + + def __init__(self, config: BltGlobalTransformerConfig): + super().__init__(config) + self.config = config + self.layers = nn.ModuleList() + for layer_idx in range(config.num_hidden_layers): + self.layers.append(BltTransformerLayer(config, layer_idx)) + self.rotary_emb = BltRotaryEmbedding(config=config) + + # Create token embedding projection (use nn.Identity() when no projection needed) + if getattr(config, "encoder_cross_output_size", None) is not None: + self.token_embedding_projection = nn.Linear( + config.encoder_cross_output_size, config.hidden_size, bias=False + ) + else: + self.token_embedding_projection = nn.Identity() + + self.post_init() + + def forward( + self, + input_embeds: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ): + batch_size, seq_len, _ = input_embeds.shape + hidden_states = self.token_embedding_projection(input_embeds) + hidden_states = F.dropout(hidden_states, p=self.config.dropout, training=self.training) + if position_ids is None: + position_ids = ( + torch.arange(input_embeds.shape[1], device=input_embeds.device).unsqueeze(0).expand(batch_size, -1) + ) + position_embeddings = self.rotary_emb(hidden_states, position_ids) + for i, layer in enumerate(self.layers): + hidden_states = layer( + hidden_states, + position_embeddings=position_embeddings, + attention_mask=attention_mask, + past_key_values=past_key_values, + cache_position=cache_position, + **kwargs, + ) + return hidden_states + + +def process_patch_lengths(patch_lengths: torch.Tensor, max_patch_length: Optional[int]) -> torch.Tensor: + """ + Splits patch lengths into smaller segments if they exceed `max_patch_length`. + Pads the result to uniform length across the batch. + + Args: + patch_lengths (torch.Tensor): [batch_size, num_patches] tensor of patch lengths. + max_patch_length (int, optional): Maximum allowed length per patch. 
+ + Returns: + torch.Tensor: [batch_size, max_len] tensor of split and padded patch lengths. + """ + if max_patch_length is None: + return patch_lengths + + batch_size = patch_lengths.size(0) + processed = [] + + for seq in patch_lengths: + splits = [] + for length in seq[seq > 0]: + length = length.item() + full_chunks, remainder = divmod(length, max_patch_length) + splits.extend([max_patch_length] * full_chunks) + if remainder: + splits.append(remainder) + processed.append(splits) + + # Find max length to pad to + max_len = max(len(splits) for splits in processed) + padded = torch.zeros((batch_size, max_len), dtype=patch_lengths.dtype, device=patch_lengths.device) + + for i, splits in enumerate(processed): + if splits: + padded[i, : len(splits)] = torch.tensor(splits, dtype=patch_lengths.dtype, device=patch_lengths.device) + + # Trim zero columns + if (padded != 0).any(dim=0).sum() < padded.shape[1]: + last_nonzero = (padded != 0).any(dim=0).nonzero().max().item() + 1 + padded = padded[:, :last_nonzero] + + return padded + + +class BltPatcher(BltPreTrainedModel): + config: BltPatcherConfig + + def __init__(self, config: BltPatcherConfig): + super().__init__(config) + self.rotary_emb = BltRotaryEmbedding(config=self.config) + self.layers = nn.ModuleList() + for layer_idx in range(self.config.num_hidden_layers): + self.layers.append(BltTransformerLayer(self.config, layer_idx)) + self.embed_tokens = nn.Embedding(self.config.vocab_size, self.config.hidden_size) + self.norm = BltRMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps) + self.lm_head = nn.Linear( + self.config.hidden_size, + self.config.vocab_size, + bias=False, + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + patch_size: Optional[int] = None, + threshold: Optional[float] = None, + max_patch_length: Optional[int] = None, + **kwargs: Unpack[TransformersKwargs], + ): + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache() + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = create_causal_mask( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + hidden_states = inputs_embeds + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + for layer in self.layers: + hidden_states = layer(hidden_states, position_embeddings=position_embeddings, attention_mask=causal_mask) + + logits = self.lm_head(self.norm(hidden_states)) + prediction_entropies = torch.distributions.Categorical(logits=logits).entropy() + + batch_size, sequence_length = inputs_embeds.shape[:2] + if patch_size is not None: + patch_lengths = self.patch_lengths_from_entropies( + 
entropies=prediction_entropies, + sequence_length=sequence_length, + patch_size=patch_size, + threshold=threshold, + ) + else: + patch_lengths = torch.ones( + (batch_size, sequence_length), dtype=inputs_embeds.dtype, device=inputs_embeds.device + ) + patch_lengths = process_patch_lengths(patch_lengths, max_patch_length) + return prediction_entropies, patch_lengths, logits + + @staticmethod + def patch_lengths_from_entropies( + entropies, + sequence_length, + patch_size=None, + threshold=None, + ): + """ + Computes patch lengths from token entropies. + + Depending on whether a threshold is provided, the function uses either: + - Thresholding the entropy values (when `threshold` is set). + """ + + batch_size = entropies.shape[0] + + # Always include token 0 and 1 as starting tokens + init_tokens = ( + torch.tensor([0, 1], dtype=torch.long, device=entropies.device).unsqueeze(0).repeat(batch_size, 1) + ) + offset = init_tokens.shape[1] + + # Ignore first token entropy (BOS) + entropies = entropies[:, 1:] + + # Threshold the entropy values to define patch start points + patch_mask = entropies > threshold + + seq_len = patch_mask.shape[1] + + # Create patch IDs (token indices), and add a sentinel to ensure alignment + token_indices = torch.arange(seq_len, device=entropies.device).unsqueeze(0).expand(batch_size, -1) + sentinel = torch.full_like(token_indices, seq_len) + padded_indices = torch.cat([token_indices, sentinel], dim=1) + + # Pad mask with inverse to align sentinel correctly + padded_mask = torch.cat([patch_mask, ~patch_mask], dim=1) + + # Select indices where mask is True + patch_starts = padded_indices[padded_mask].reshape(batch_size, seq_len) + max_valid_patches = patch_mask.sum(dim=1).max() + patch_starts = patch_starts[:, :max_valid_patches] + + # Offset patch starts to account for the two initial tokens + patch_start_ids = torch.cat((init_tokens, patch_starts + offset), dim=1) + + # Compute patch end positions by shifting start positions + last_token = torch.full_like(patch_start_ids[:, :1], sequence_length - 1) + patch_ends = torch.cat((patch_start_ids[:, 1:] - 1, last_token), dim=1) + + patch_lengths = patch_ends - patch_start_ids + 1 + + return patch_lengths + + +def rolling_polynomial_hash(token_tensor, prime: int = 1000000007): + """ + A polynomial rolling hash algorithm that converts sequences + of tokens into hash values. The hash is computed as: + hash = (token_0 * prime^0 + token_1 * prime^1 + ... + token_n * prime^n) + + The rolling hash allows the model to efficiently + identify and encode recurring byte-level patterns in the input text. + + Args: + token_tensor (torch.Tensor): [batch_size, seq_len, group_size] containing token IDs to hash + prime (int): Prime number used as the base for the polynomial hash. 
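+ Defaults to 1000000007; compute_hash_embeddings cycles through a list of large primes so that each hash function uses a different base.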
+ + Returns: + torch.Tensor: Hash values of shape [batch_size, seq_len] where each value + represents the hash of the corresponding token group + + Example: + >>> tokens = torch.tensor([[1, 2, 3], [4, 5, 6]]) + >>> hashes = rolling_polynomial_hash(tokens, prime=31) + >>> # hash[0] = 1*31^0 + 2*31^1 + 3*31^2 + >>> # hash[1] = 4*31^0 + 5*31^1 + 6*31^2 + """ + prime_tensor = torch.tensor(prime, dtype=torch.int64, device=token_tensor.device) + powers = torch.arange(token_tensor.shape[-1], device=token_tensor.device) + prime_powers = prime_tensor**powers + return torch.sum(token_tensor * prime_powers, dim=-1) + + +def byte_group_hash_function( + token_ids: torch.Tensor, group_size: int = 2, prime: int = 1000000007, max_hash: int = 30000 +): + """Hash token groups and map to range [0, max_hash].""" + with torch.no_grad(): + batch_size, seq_len = token_ids.shape + # Add padding for sliding window + padding = torch.zeros(batch_size, group_size - 1, dtype=torch.int64, device=token_ids.device) + padded_tokens = torch.cat([padding, token_ids], dim=1) + + # Create sliding windows and compute hashes + windows = padded_tokens.unfold(1, group_size, 1) + hashes = rolling_polynomial_hash(windows, prime) + hash_values = hashes % max_hash + + return hash_values + + +def compute_hash_embeddings( + local_encoder_tokens: torch.Tensor, + local_encoder, + encoder_hash_tok_embedding: nn.Embedding, + encoder_hash_byte_group_nb_functions: int, + encoder_hash_byte_group_size: list, + encoder_hash_byte_group_vocab: int, +) -> torch.Tensor: + """Compute token embeddings enhanced with hash-based embeddings.""" + # Available primes for hash functions + primes = [ + 1000000007, + 5915587277, + 1500450271, + 3267000013, + 5754853343, + 4093082899, + 9576890767, + 3628273133, + 2860486313, + 5463458053, + 3367900313, + ] + + embeddings = local_encoder.embed_tokens(local_encoder_tokens) + embedding_idx = 0 + for func_nb in range(encoder_hash_byte_group_nb_functions): + prime = primes[func_nb % len(primes)] # Cycle through primes if more functions than primes + for group_size in encoder_hash_byte_group_size: + hash_ids = byte_group_hash_function(local_encoder_tokens, group_size, prime, encoder_hash_byte_group_vocab) + # Apply offset to get the correct slice of the fused embedding + offset_hash_ids = hash_ids + embedding_idx * encoder_hash_byte_group_vocab + embeddings += encoder_hash_tok_embedding(offset_hash_ids) + embedding_idx += 1 + + return embeddings + + +def _prepare_patch_cross_attention_mask( + patch_ids: torch.Tensor, + num_patches: int, + sequence_length: int, + patches_as_queries: bool = False, + cross_attn_k: int = 1, + dtype: torch.dtype = torch.float32, +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Prepare cross-attention mask for patch-based attention, following mllama's robust approach. + + This function creates masks that control which patches can attend to which other patches, + with support for query/key role swapping and cross-attention multipliers. + + Args: + patch_ids (torch.Tensor): Tensor of shape [batch_size, seq_len] containing patch ids. + num_patches (int): Total number of patches. + sequence_length (int): Length of the sequence. + patches_as_queries (bool): If True, patches are used as queries, otherwise as keys. + cross_attn_k (int): Cross-attention multiplier for repeating patches. + dtype (torch.dtype): Data type for the output mask. 
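+ The returned mask is additive: entries are 0.0 where the query and key positions share a patch id and torch.finfo(dtype).min everywhere else.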
+ + Returns: + Tuple[torch.Tensor, torch.Tensor]: + - cross_attention_mask: 4D tensor [batch_size, 1, q_len, kv_len] + """ + batch_size, seq_len = patch_ids.shape + device = patch_ids.device + + # Determine query and key lengths based on configuration + if patches_as_queries: + q_len = num_patches * cross_attn_k + kv_len = sequence_length + # Create patch-to-sequence mapping + q_patch_ids = ( + torch.arange(num_patches, device=device) + .unsqueeze(0) + .unsqueeze(-1) + .expand(batch_size, num_patches, seq_len) + ) + kv_patch_ids = patch_ids.unsqueeze(1).expand(batch_size, num_patches, seq_len) + else: + q_len = sequence_length + kv_len = num_patches * cross_attn_k + # Create sequence-to-patch mapping + q_patch_ids = patch_ids.unsqueeze(-1).expand(batch_size, seq_len, num_patches) + kv_patch_ids = ( + torch.arange(num_patches, device=device).unsqueeze(0).unsqueeze(0).expand(batch_size, seq_len, num_patches) + ) + + # Create base attention mask - boolean mask where True means "should attend" + # Exact patch matching + cross_attention_mask = q_patch_ids == kv_patch_ids + + # Handle cross_attn_k multiplier by repeating along appropriate dimension + repeat_dim = 1 if patches_as_queries else -1 + cross_attention_mask = cross_attention_mask.repeat_interleave(cross_attn_k, dim=repeat_dim) + + # Validate dimensions + expected_shape = (batch_size, q_len, kv_len) + if cross_attention_mask.shape != expected_shape: + raise ValueError( + f"Cross attention mask shape {cross_attention_mask.shape} doesn't match expected {expected_shape}" + ) + + # Reshape so it can be used by attn module - add head dimension + cross_attention_mask = cross_attention_mask.unsqueeze(1) # [batch_size, 1, q_len, kv_len] + + # Invert the mask (following mllama pattern exactly) + # True -> 0.0 (attend), False -> 1.0 (will become -inf) + inverted_cross_attn_mask = 1.0 - cross_attention_mask.to(dtype) + cross_attention_mask = inverted_cross_attn_mask.masked_fill( + inverted_cross_attn_mask.to(torch.bool), torch.finfo(dtype).min + ) + + return cross_attention_mask + + +class BltModel(BltPreTrainedModel): + def __init__(self, config: BltConfig): + super().__init__(config) + self.gradient_checkpointing = False + + self.config = config + self.local_encoder = BltLocalEncoder(config.encoder_config) + self.global_transformer = BltGlobalTransformer(config.global_config) + self.local_decoder = BltLocalDecoder(config.decoder_config) + num_embeddings = config.encoder_hash_byte_group_nb_functions * len(config.encoder_hash_byte_group_size) + total_vocab_size = config.encoder_hash_byte_group_vocab * num_embeddings + self.encoder_hash_tok_embedding = nn.Embedding(total_vocab_size, config.encoder_config.hidden_size) + if self.config.patch_in_forward: + self.patcher = BltPatcher(config.patcher_config) + self.patcher.eval() + for param in self.patcher.parameters(): + param.requires_grad = False + else: + self.patcher = None + self.post_init() + + @check_model_inputs + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + patch_lengths: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPast: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids 
or inputs_embeds") + + # Extract input embeddings as early as possible + if inputs_embeds is not None: + encoder_embeds = inputs_embeds + batch_size, sequence_length, _ = inputs_embeds.shape + else: + batch_size, sequence_length = input_ids.shape + encoder_embeds = compute_hash_embeddings( + input_ids, + self.local_encoder, + self.encoder_hash_tok_embedding, + self.config.encoder_hash_byte_group_nb_functions, + self.config.encoder_hash_byte_group_size, + self.config.encoder_hash_byte_group_vocab, + ) + + if patch_lengths is None: + if self.config.patching_mode == "entropy" and self.patcher is not None: + if input_ids is None: + raise ValueError("input_ids is required for entropy-based patching") + _, patch_lengths, _ = self.patcher( + input_ids, + patch_size=self.config.patch_size, + threshold=self.config.patching_threshold, + max_patch_length=self.config.max_patch_length, + patching_batch_size=self.config.patching_batch_size, + device=input_ids.device, + ) + else: + device = input_ids.device if input_ids is not None else inputs_embeds.device + dtype = input_ids.dtype if input_ids is not None else inputs_embeds.dtype + patch_lengths = process_patch_lengths( + torch.ones((batch_size, sequence_length + 1), dtype=dtype, device=device), + self.config.max_patch_length, + ) + patch_ids = self._patch_ids_from_lengths(patch_lengths, sequence_length) + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + encoder_embeds.shape[1], device=encoder_embeds.device + ) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = create_causal_mask( + config=self.config, + input_embeds=encoder_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + cross_attn_mask_enc = _prepare_patch_cross_attention_mask( + patch_ids=patch_ids, + num_patches=patch_lengths.shape[1], + sequence_length=sequence_length, + patches_as_queries=True, + cross_attn_k=self.config.cross_attn_k, + dtype=encoder_embeds.dtype, + ) + encoder_hidden_states, encoder_cross_states = self.local_encoder( + input_ids=input_ids, + inputs_embeds=encoder_embeds, + attention_mask=causal_mask, + position_ids=position_ids, + encoder_attention_mask=cross_attn_mask_enc, + num_patches=patch_lengths.shape[1], + patch_ids=patch_ids, + **kwargs, + ) + encoder_cross_states = encoder_cross_states.view(batch_size, patch_lengths.shape[1], -1) + global_cache_position = torch.arange(0, encoder_cross_states.shape[1], device=encoder_cross_states.device) + global_position_ids = global_cache_position.unsqueeze(0) + global_causal_mask = create_causal_mask( + config=self.config, + input_embeds=encoder_cross_states, + attention_mask=None, + cache_position=global_cache_position, + past_key_values=None, + position_ids=None, + ) + + global_hidden_states = self.global_transformer( + input_embeds=encoder_cross_states, + attention_mask=global_causal_mask, + position_ids=global_position_ids, + **kwargs, + ) + decoder_patch_ids = self._patch_ids_from_lengths(patch_lengths[:, 1:], sequence_length) + cross_attn_mask_dec = _prepare_patch_cross_attention_mask( + patch_ids=decoder_patch_ids, + num_patches=patch_lengths.shape[1], + sequence_length=sequence_length, + patches_as_queries=False, + cross_attn_k=self.config.cross_attn_k, + dtype=encoder_embeds.dtype, + ) + output = self.local_decoder( + input_ids=input_ids, + 
inputs_embeds=encoder_hidden_states, + patch_embeds=global_hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + cache_position=cache_position, + encoder_attention_mask=cross_attn_mask_dec, + **kwargs, + ) + return BaseModelOutputWithPast( + last_hidden_state=output, + past_key_values=past_key_values, + ) + + def get_input_embeddings(self): + return self.local_encoder.embed_tokens + + def set_input_embeddings(self, value): + self.local_encoder.embed_tokens = value + + def _patch_ids_from_lengths(self, patch_lengths: torch.Tensor, seq_len: int) -> torch.Tensor: + batch_size = patch_lengths.shape[0] + patch_starts = torch.cat( + [ + torch.zeros(batch_size, 1, dtype=patch_lengths.dtype, device=patch_lengths.device), + patch_lengths.cumsum(dim=-1)[:, :-1], + ], + dim=-1, + ) + token_positions = torch.arange(seq_len, device=patch_lengths.device) + return (patch_starts.unsqueeze(1) <= token_positions.unsqueeze(0).unsqueeze(-1)).sum(dim=-1) - 1 + + +@auto_docstring( + custom_intro=""" + The Blt Text Model with a language modeling head on top. + """ +) +class BltForCausalLM(BltPreTrainedModel, GenerationMixin): + config: BltConfig + _can_compile_fullgraph = False + base_model_prefix = "model" + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: BltConfig): + super().__init__(config.get_text_config()) + self.text_config = config.get_text_config() + self.vocab_size = config.vocab_size + self.model = BltModel(config) + self.lm_head = nn.Linear(config.decoder_config.hidden_size, config.vocab_size, bias=False) + + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + cross_attention_states: Optional[torch.LongTensor] = None, # Keep for compatibility + cross_attention_mask: Optional[torch.LongTensor] = None, + full_text_row_masked_out_mask: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, CausalLMOutputWithPast]: + r""" + cross_attention_states (`torch.FloatTensor`, *optional*): + Output of the vision model, used for cross-attention. This tensor contains the processed image features that + the language model will attend to. + cross_attention_mask (`torch.Tensor` of shape `(batch_size, seq_length, max_num_images, max_num_tiles)`, *optional*): + Cross-attention mask to control the interaction between text tokens and image tiles. + This 4D tensor defines which image tiles each text token should attend to. + + For each text token (in seq_length): + - 1 indicates the token **should attend** to the corresponding image tile + - 0 indicates the token **should not attend** to the corresponding image tile + full_text_row_masked_out_mask (`tuple[torch.Tensor, torch.Tensor]`, *optional*): + A tuple containing two tensors that mask out rows in the cross-attention mechanism: + - The first tensor has shape `(batch_size, 1, seq_length, 1)` and contains values of 0 or 1. 
+ A value of 0 indicates that the corresponding text token's entire row in the cross-attention + matrix should be masked out (all image tokens ignored). + - The second tensor has the same shape and is used internally to apply the masking during + the forward pass of cross-attention layers. + This mask is derived from the cross_attention_mask and is used to handle cases where a text token + should not attend to any image token. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> from transformers import AutoTokenizer, BltForCausalLM + + >>> model = BltForCausalLM.from_pretrained("Llama-3.2-11B-Vision") + >>> tokenizer = AutoTokenizer.from_pretrained("Llama-3.2-11B-Vision") + + >>> prompt = "If I had to write a haiku, it would be:" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=40, do_sample=True, temperature=0.6) + >>> result = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + >>> print(result) + If I had to write a haiku, it would be: "Snowflakes gently fall" - simple, yet peaceful. + I love the idea of snowflakes gently falling, each one + ``` + """ + # Call parent forward but exclude cross_attention_states from model call + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + cross_attention_mask=cross_attention_mask, + full_text_row_masked_out_mask=full_text_row_masked_out_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]).float() + + loss = None + if labels is not None: + loss = self.loss_function(logits, labels, self.vocab_size, **kwargs) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = ["BltPreTrainedModel", "BltModel", "BltPatcher", "BltForCausalLM"] diff --git a/src/transformers/models/blt/modular_blt.py b/src/transformers/models/blt/modular_blt.py new file mode 100644 index 000000000000..00b1211fdb08 --- /dev/null +++ b/src/transformers/models/blt/modular_blt.py @@ -0,0 +1,1015 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Blt modular model, inheriting from Mllama where appropriate.""" + +from typing import Callable, Optional, Union + +import torch +import torch.distributions +import torch.nn as nn +import torch.nn.functional as F + +from ...cache_utils import Cache, DynamicCache +from ...masking_utils import create_causal_mask +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, logging +from ...utils.generic import OutputRecorder, check_model_inputs +from ..cohere2.modeling_cohere2 import ( + Cohere2RotaryEmbedding, + rotate_half, # noqa: F401 +) +from ..mllama.modeling_mllama import ( + MllamaForCausalLM, + MllamaPreTrainedModel, + MllamaSelfAttentionDecoderLayer, + MllamaTextCrossAttention, + MllamaTextMLP, + MllamaTextRMSNorm, + MllamaTextSelfAttention, + eager_attention_forward, +) +from .configuration_blt import ( + BltConfig, + BltGlobalTransformerConfig, + BltLocalDecoderConfig, + BltLocalEncoderConfig, + BltPatcherConfig, +) + + +logger = logging.get_logger(__name__) + + +def rolling_polynomial_hash(token_tensor, prime: int = 1000000007): + """ + A polynomial rolling hash algorithm that converts sequences + of tokens into hash values. The hash is computed as: + hash = (token_0 * prime^0 + token_1 * prime^1 + ... + token_n * prime^n) + + The rolling hash allows the model to efficiently + identify and encode recurring byte-level patterns in the input text. + + Args: + token_tensor (torch.Tensor): [batch_size, seq_len, group_size] containing token IDs to hash + prime (int): Prime number used as the base for the polynomial hash. + + Returns: + torch.Tensor: Hash values of shape [batch_size, seq_len] where each value + represents the hash of the corresponding token group + + Example: + >>> tokens = torch.tensor([[1, 2, 3], [4, 5, 6]]) + >>> hashes = rolling_polynomial_hash(tokens, prime=31) + >>> # hash[0] = 1*31^0 + 2*31^1 + 3*31^2 + >>> # hash[1] = 4*31^0 + 5*31^1 + 6*31^2 + """ + prime_tensor = torch.tensor(prime, dtype=torch.int64, device=token_tensor.device) + powers = torch.arange(token_tensor.shape[-1], device=token_tensor.device) + prime_powers = prime_tensor**powers + return torch.sum(token_tensor * prime_powers, dim=-1) + + +def byte_group_hash_function( + token_ids: torch.Tensor, group_size: int = 2, prime: int = 1000000007, max_hash: int = 30000 +): + """Hash token groups and map to range [0, max_hash].""" + with torch.no_grad(): + batch_size, seq_len = token_ids.shape + # Add padding for sliding window + padding = torch.zeros(batch_size, group_size - 1, dtype=torch.int64, device=token_ids.device) + padded_tokens = torch.cat([padding, token_ids], dim=1) + + # Create sliding windows and compute hashes + windows = padded_tokens.unfold(1, group_size, 1) + hashes = rolling_polynomial_hash(windows, prime) + hash_values = hashes % max_hash + + return hash_values + + +def compute_hash_embeddings( + local_encoder_tokens: torch.Tensor, + local_encoder, + encoder_hash_tok_embedding: nn.Embedding, + encoder_hash_byte_group_nb_functions: int, + encoder_hash_byte_group_size: list, + encoder_hash_byte_group_vocab: int, +) -> torch.Tensor: + """Compute token embeddings enhanced with hash-based embeddings.""" + # Available primes for hash functions + primes = [ + 1000000007, + 5915587277, + 1500450271, + 3267000013, + 5754853343, + 4093082899, + 9576890767, + 3628273133, + 2860486313, + 5463458053, + 3367900313, + ] + + 
embeddings = local_encoder.embed_tokens(local_encoder_tokens) + embedding_idx = 0 + for func_nb in range(encoder_hash_byte_group_nb_functions): + prime = primes[func_nb % len(primes)] # Cycle through primes if more functions than primes + for group_size in encoder_hash_byte_group_size: + hash_ids = byte_group_hash_function(local_encoder_tokens, group_size, prime, encoder_hash_byte_group_vocab) + # Apply offset to get the correct slice of the fused embedding + offset_hash_ids = hash_ids + embedding_idx * encoder_hash_byte_group_vocab + embeddings += encoder_hash_tok_embedding(offset_hash_ids) + embedding_idx += 1 + + return embeddings + + +def _prepare_patch_cross_attention_mask( + patch_ids: torch.Tensor, + num_patches: int, + sequence_length: int, + patches_as_queries: bool = False, + cross_attn_k: int = 1, + dtype: torch.dtype = torch.float32, +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Prepare cross-attention mask for patch-based attention, following mllama's robust approach. + + This function creates masks that control which patches can attend to which other patches, + with support for query/key role swapping and cross-attention multipliers. + + Args: + patch_ids (torch.Tensor): Tensor of shape [batch_size, seq_len] containing patch ids. + num_patches (int): Total number of patches. + sequence_length (int): Length of the sequence. + patches_as_queries (bool): If True, patches are used as queries, otherwise as keys. + cross_attn_k (int): Cross-attention multiplier for repeating patches. + dtype (torch.dtype): Data type for the output mask. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: + - cross_attention_mask: 4D tensor [batch_size, 1, q_len, kv_len] + """ + batch_size, seq_len = patch_ids.shape + device = patch_ids.device + + # Determine query and key lengths based on configuration + if patches_as_queries: + q_len = num_patches * cross_attn_k + kv_len = sequence_length + # Create patch-to-sequence mapping + q_patch_ids = ( + torch.arange(num_patches, device=device) + .unsqueeze(0) + .unsqueeze(-1) + .expand(batch_size, num_patches, seq_len) + ) + kv_patch_ids = patch_ids.unsqueeze(1).expand(batch_size, num_patches, seq_len) + else: + q_len = sequence_length + kv_len = num_patches * cross_attn_k + # Create sequence-to-patch mapping + q_patch_ids = patch_ids.unsqueeze(-1).expand(batch_size, seq_len, num_patches) + kv_patch_ids = ( + torch.arange(num_patches, device=device).unsqueeze(0).unsqueeze(0).expand(batch_size, seq_len, num_patches) + ) + + # Create base attention mask - boolean mask where True means "should attend" + # Exact patch matching + cross_attention_mask = q_patch_ids == kv_patch_ids + + # Handle cross_attn_k multiplier by repeating along appropriate dimension + repeat_dim = 1 if patches_as_queries else -1 + cross_attention_mask = cross_attention_mask.repeat_interleave(cross_attn_k, dim=repeat_dim) + + # Validate dimensions + expected_shape = (batch_size, q_len, kv_len) + if cross_attention_mask.shape != expected_shape: + raise ValueError( + f"Cross attention mask shape {cross_attention_mask.shape} doesn't match expected {expected_shape}" + ) + + # Reshape so it can be used by attn module - add head dimension + cross_attention_mask = cross_attention_mask.unsqueeze(1) # [batch_size, 1, q_len, kv_len] + + # Invert the mask (following mllama pattern exactly) + # True -> 0.0 (attend), False -> 1.0 (will become -inf) + inverted_cross_attn_mask = 1.0 - cross_attention_mask.to(dtype) + cross_attention_mask = inverted_cross_attn_mask.masked_fill( + 
inverted_cross_attn_mask.to(torch.bool), torch.finfo(dtype).min + ) + + return cross_attention_mask + + +def process_patch_lengths(patch_lengths: torch.Tensor, max_patch_length: Optional[int]) -> torch.Tensor: + """ + Splits patch lengths into smaller segments if they exceed `max_patch_length`. + Pads the result to uniform length across the batch. + + Args: + patch_lengths (torch.Tensor): [batch_size, num_patches] tensor of patch lengths. + max_patch_length (int, optional): Maximum allowed length per patch. + + Returns: + torch.Tensor: [batch_size, max_len] tensor of split and padded patch lengths. + """ + if max_patch_length is None: + return patch_lengths + + batch_size = patch_lengths.size(0) + processed = [] + + for seq in patch_lengths: + splits = [] + for length in seq[seq > 0]: + length = length.item() + full_chunks, remainder = divmod(length, max_patch_length) + splits.extend([max_patch_length] * full_chunks) + if remainder: + splits.append(remainder) + processed.append(splits) + + # Find max length to pad to + max_len = max(len(splits) for splits in processed) + padded = torch.zeros((batch_size, max_len), dtype=patch_lengths.dtype, device=patch_lengths.device) + + for i, splits in enumerate(processed): + if splits: + padded[i, : len(splits)] = torch.tensor(splits, dtype=patch_lengths.dtype, device=patch_lengths.device) + + # Trim zero columns + if (padded != 0).any(dim=0).sum() < padded.shape[1]: + last_nonzero = (padded != 0).any(dim=0).nonzero().max().item() + 1 + padded = padded[:, :last_nonzero] + + return padded + + +class BltMLP(MllamaTextMLP): + pass + + +class BltRMSNorm(MllamaTextRMSNorm): + pass + + +class BltRotaryEmbedding(Cohere2RotaryEmbedding): + pass + + +class BltTransformerLayer(MllamaSelfAttentionDecoderLayer): + def __init__(self, config, layer_idx: int): + super().__init__() + + self.self_attn = BltSelfAttention(config=config, layer_idx=layer_idx) + self.mlp = BltMLP(config) + self.input_layernorm = BltRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = BltRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + +class BltSelfAttention(MllamaTextSelfAttention): + def __init__(self, config: BltConfig, layer_idx: int): + super().__init__(config, layer_idx) + self.is_causal = True + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_embeddings: torch.Tensor, + use_cache: bool = False, + past_key_values=None, + cache_position=None, + **kwargs, + ): + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_embeddings=position_embeddings, + use_cache=use_cache, + past_key_values=past_key_values, + cache_position=cache_position, + **kwargs, + ) + + +class BltCrossAttention(MllamaTextCrossAttention): + """Cross-attention module for Blt, following transformers style""" + + def __init__(self, config: BltConfig, layer_idx: int, hidden_size: Optional[int] = None): + super().__init__() + self.is_causal = False + self.q_norm = BltRMSNorm(self.hidden_size, eps=config.rms_norm_eps) + self.k_norm = BltRMSNorm(self.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + cross_attention_states: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.Tensor] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ): + bsz, q_len, _ = hidden_states.size() + query_states = self.q_norm(hidden_states) + query_states = 
self.q_proj(query_states) + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + if cross_attention_states is not None: + cross_attention_states = self.k_norm(cross_attention_states) + key_states = self.k_proj(cross_attention_states) + value_states = self.v_proj(cross_attention_states) + key_states = key_states.view(bsz, -1, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, -1, self.num_key_value_heads, self.head_dim).transpose(1, 2) + if past_key_values is not None: + key_states, value_states = past_key_values.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + elif cache_position[0] != 0: + key_states, value_states = ( + past_key_values.layers[self.layer_idx].keys, + past_key_values.layers[self.layer_idx].values, + ) + else: + raise ValueError( + "Cross attention layer can't find neither `cross_attn_states` nor cached values for key/values!" + ) + attention_interface: Callable = eager_attention_forward + + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.dropout, + scaling=self.scaling, + **kwargs, + ) + attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() + attn_output = self.o_proj(attn_output) + attn_output = attn_output + hidden_states + return attn_output, attn_weights + + +@auto_docstring +class BltPreTrainedModel(MllamaPreTrainedModel): + config: BltConfig + _supports_attention_backend = False + _supports_flash_attn = False + _supports_flex_attn = False + _no_split_modules = ["BltTransformerLayer"] + _can_record_outputs = { + "hidden_states": OutputRecorder(BltTransformerLayer, index=0, layer_name="local_decoder"), + "attentions": OutputRecorder(BltSelfAttention, index=1, layer_name="local_decoder"), + } + + def _init_weights(self, module): + raise AttributeError("No need to inherit it!") + + def _update_causal_mask(self, module): + raise AttributeError("No need to inherit it!") + + def _prepare_4d_causal_attention_mask_with_cache_position(self, module): + raise AttributeError("No need to inherit it!") + + +class BltLocalEncoder(BltPreTrainedModel): + config: BltLocalEncoderConfig + _can_record_outputs = { + "encoder_attentions": OutputRecorder(BltSelfAttention, index=1, layer_name="local_encoder"), + } + + def __init__(self, config: BltLocalEncoderConfig): + super().__init__(config) + self.gradient_checkpointing = False + self.config = config + self.layers = nn.ModuleList( + [BltTransformerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.rotary_emb = BltRotaryEmbedding(config=config) + self.patch_embedding_projection = nn.Linear( + in_features=config.hidden_size, + out_features=config.hidden_size * config.cross_attn_k, + bias=False, + ) + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size) + self.cross_attn_layers = nn.ModuleList() + layers_to_add = config.num_hidden_layers if config.cross_attn_all_layers else 1 + for layer_idx in range(layers_to_add): + self.cross_attn_layers.append( + BltCrossAttention(config=config, layer_idx=layer_idx, hidden_size=config.hidden_size) + ) + + self.post_init() + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + patch_embeds: 
Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + num_patches: Optional[int] = None, + patch_ids: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ): + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + batch_size = inputs_embeds.shape[0] + hidden_states = F.dropout(inputs_embeds, p=self.config.dropout, training=self.training) + + if position_ids is None: + position_ids = ( + torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device).unsqueeze(0).expand(batch_size, -1) + ) + + position_embeddings = self.rotary_emb(hidden_states, position_ids) + hidden_states = F.dropout(hidden_states, p=self.config.dropout, training=self.training) + + for idx, layer in enumerate(self.layers): + hidden_states = layer( + hidden_states, + position_embeddings=position_embeddings, + attention_mask=attention_mask, + past_key_values=past_key_values, + cache_position=cache_position, + **kwargs, + ) + if idx == len(self.layers) - 1 or self.config.cross_attn_all_layers: + patch_embeds = self.patch_reduce(hidden_states, num_patches, patch_ids) + patch_embeds = self.patch_embedding_projection(patch_embeds) + patch_embeds = patch_embeds.reshape( + batch_size, patch_embeds.shape[1] * self.config.cross_attn_k, self.config.hidden_size + ) + layer_idx = idx if self.config.cross_attn_all_layers else 0 + cross_attention_output, _ = self.cross_attn_layers[layer_idx]( + hidden_states=patch_embeds, + cross_attention_states=hidden_states, + attention_mask=encoder_attention_mask, + **kwargs, + ) + patch_embeds = patch_embeds + cross_attention_output + encoder_cross_states = patch_embeds + return hidden_states, encoder_cross_states + + def patch_reduce(self, hidden_states, max_num_patches, patch_ids): + """ + Reduce variable length patches to single embedding per patch + Note: this works with variable number of patches for different sequences in the batch + It handles variable length patches by assuming that patch_lengths will be 0 for any + extra patches on the *right*. Since there can be a variable number of patches + this function also return the number of patches for each sequence in the batch. + Any embeddings on the right that are not allocated to a patch + (i.e. if the sum(patch_lengths[i]) < seq_len for any i) + will be sent to a dummy patch, which is trimmed before returning. 
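+ The reduction is a per-patch max pool: hidden states of all positions sharing a patch id are combined via scatter_reduce with reduce="amax".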
+ """ + batch_size = hidden_states.shape[0] + embedding_dim = hidden_states.shape[-1] + + patch_ids = patch_ids.unsqueeze(-1).expand(-1, -1, hidden_states.shape[-1]) + + reduced_embeddings = torch.zeros( + (batch_size, max_num_patches, embedding_dim), dtype=hidden_states.dtype, device=hidden_states.device + ) + + reduced_embeddings = reduced_embeddings.scatter_reduce( + src=hidden_states, + dim=1, + index=patch_ids, + reduce="amax", + include_self=False, + ) + reduced_embeddings = reduced_embeddings[:, :max_num_patches, :] + + return reduced_embeddings + + +class BltLocalDecoder(BltPreTrainedModel): + config: BltLocalDecoderConfig + + def __init__(self, config: BltLocalDecoderConfig): + super().__init__(config) + self.gradient_checkpointing = False + self.config = config + self.cross_attn_decoder = True + self.layers = nn.ModuleList( + [BltTransformerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.rotary_emb = BltRotaryEmbedding(config=config) + self.patch_embedding_projection = nn.Linear( + in_features=config.hidden_size_global, + out_features=config.hidden_size * config.cross_attn_k, + bias=False, + ) + self.norm = BltRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.cross_attn_layers = nn.ModuleList() + layers_to_add = config.num_hidden_layers if config.cross_attn_all_layers else 1 + for layer_idx in range(layers_to_add): + self.cross_attn_layers.append( + BltCrossAttention(config=config, layer_idx=layer_idx, hidden_size=config.hidden_size) + ) + + self.post_init() + + @check_model_inputs + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + patch_embeds: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ): + batch_size = inputs_embeds.shape[0] + hidden_states = inputs_embeds + patch_embeds = self.patch_embedding_projection(patch_embeds) + patch_embeds = patch_embeds.reshape( + batch_size, patch_embeds.shape[1] * self.config.cross_attn_k, self.config.hidden_size + ) + + if patch_embeds is not None and not self.cross_attn_decoder: + hidden_states = hidden_states + patch_embeds + + if position_ids is None: + position_ids = ( + torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device).unsqueeze(0).expand(batch_size, -1) + ) + + position_embeddings = self.rotary_emb(hidden_states, position_ids) + hidden_states = F.dropout(hidden_states, p=self.config.dropout, training=self.training) + + for i, layer in enumerate(self.layers): + if i == 0 or self.config.cross_attn_all_layers: + cross_attention_output, _ = self.cross_attn_layers[i]( + hidden_states=hidden_states, + cross_attention_states=patch_embeds, + attention_mask=encoder_attention_mask, + **kwargs, + ) + hidden_states = hidden_states + cross_attention_output + hidden_states = layer( + hidden_states, + position_embeddings=position_embeddings, + attention_mask=attention_mask, + past_key_values=past_key_values, + cache_position=cache_position, + **kwargs, + ) + logits = self.norm(hidden_states) + return logits + + +class BltGlobalTransformer(BltPreTrainedModel): + config: BltGlobalTransformerConfig + _can_record_outputs = { + "global_attentions": OutputRecorder(BltSelfAttention, index=1, layer_name="global_transformer"), + } + + def __init__(self, 
config: BltGlobalTransformerConfig): + super().__init__(config) + self.config = config + self.layers = nn.ModuleList() + for layer_idx in range(config.num_hidden_layers): + self.layers.append(BltTransformerLayer(config, layer_idx)) + self.rotary_emb = BltRotaryEmbedding(config=config) + + # Create token embedding projection (use nn.Identity() when no projection needed) + if getattr(config, "encoder_cross_output_size", None) is not None: + self.token_embedding_projection = nn.Linear( + config.encoder_cross_output_size, config.hidden_size, bias=False + ) + else: + self.token_embedding_projection = nn.Identity() + + self.post_init() + + def forward( + self, + input_embeds: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ): + batch_size, seq_len, _ = input_embeds.shape + hidden_states = self.token_embedding_projection(input_embeds) + hidden_states = F.dropout(hidden_states, p=self.config.dropout, training=self.training) + if position_ids is None: + position_ids = ( + torch.arange(input_embeds.shape[1], device=input_embeds.device).unsqueeze(0).expand(batch_size, -1) + ) + position_embeddings = self.rotary_emb(hidden_states, position_ids) + for i, layer in enumerate(self.layers): + hidden_states = layer( + hidden_states, + position_embeddings=position_embeddings, + attention_mask=attention_mask, + past_key_values=past_key_values, + cache_position=cache_position, + **kwargs, + ) + return hidden_states + + +class BltPatcher(BltPreTrainedModel): + config: BltPatcherConfig + + def __init__(self, config: BltPatcherConfig): + super().__init__(config) + self.rotary_emb = BltRotaryEmbedding(config=self.config) + self.layers = nn.ModuleList() + for layer_idx in range(self.config.num_hidden_layers): + self.layers.append(BltTransformerLayer(self.config, layer_idx)) + self.embed_tokens = nn.Embedding(self.config.vocab_size, self.config.hidden_size) + self.norm = BltRMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps) + self.lm_head = nn.Linear( + self.config.hidden_size, + self.config.vocab_size, + bias=False, + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + patch_size: Optional[int] = None, + threshold: Optional[float] = None, + max_patch_length: Optional[int] = None, + **kwargs: Unpack[TransformersKwargs], + ): + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache() + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = create_causal_mask( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + 
past_key_values=past_key_values, + position_ids=position_ids, + ) + + hidden_states = inputs_embeds + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + for layer in self.layers: + hidden_states = layer(hidden_states, position_embeddings=position_embeddings, attention_mask=causal_mask) + + logits = self.lm_head(self.norm(hidden_states)) + prediction_entropies = torch.distributions.Categorical(logits=logits).entropy() + + batch_size, sequence_length = inputs_embeds.shape[:2] + if patch_size is not None: + patch_lengths = self.patch_lengths_from_entropies( + entropies=prediction_entropies, + sequence_length=sequence_length, + patch_size=patch_size, + threshold=threshold, + ) + else: + patch_lengths = torch.ones( + (batch_size, sequence_length), dtype=inputs_embeds.dtype, device=inputs_embeds.device + ) + patch_lengths = process_patch_lengths(patch_lengths, max_patch_length) + return prediction_entropies, patch_lengths, logits + + @staticmethod + def patch_lengths_from_entropies( + entropies, + sequence_length, + patch_size=None, + threshold=None, + ): + """ + Computes patch lengths from token entropies. + + Depending on whether a threshold is provided, the function uses either: + - Thresholding the entropy values (when `threshold` is set). + """ + + batch_size = entropies.shape[0] + + # Always include token 0 and 1 as starting tokens + init_tokens = ( + torch.tensor([0, 1], dtype=torch.long, device=entropies.device).unsqueeze(0).repeat(batch_size, 1) + ) + offset = init_tokens.shape[1] + + # Ignore first token entropy (BOS) + entropies = entropies[:, 1:] + + # Threshold the entropy values to define patch start points + patch_mask = entropies > threshold + + seq_len = patch_mask.shape[1] + + # Create patch IDs (token indices), and add a sentinel to ensure alignment + token_indices = torch.arange(seq_len, device=entropies.device).unsqueeze(0).expand(batch_size, -1) + sentinel = torch.full_like(token_indices, seq_len) + padded_indices = torch.cat([token_indices, sentinel], dim=1) + + # Pad mask with inverse to align sentinel correctly + padded_mask = torch.cat([patch_mask, ~patch_mask], dim=1) + + # Select indices where mask is True + patch_starts = padded_indices[padded_mask].reshape(batch_size, seq_len) + max_valid_patches = patch_mask.sum(dim=1).max() + patch_starts = patch_starts[:, :max_valid_patches] + + # Offset patch starts to account for the two initial tokens + patch_start_ids = torch.cat((init_tokens, patch_starts + offset), dim=1) + + # Compute patch end positions by shifting start positions + last_token = torch.full_like(patch_start_ids[:, :1], sequence_length - 1) + patch_ends = torch.cat((patch_start_ids[:, 1:] - 1, last_token), dim=1) + + patch_lengths = patch_ends - patch_start_ids + 1 + + return patch_lengths + + +class BltModel(BltPreTrainedModel): + def __init__(self, config: BltConfig): + super().__init__(config) + self.gradient_checkpointing = False + + self.config = config + self.local_encoder = BltLocalEncoder(config.encoder_config) + self.global_transformer = BltGlobalTransformer(config.global_config) + self.local_decoder = BltLocalDecoder(config.decoder_config) + num_embeddings = config.encoder_hash_byte_group_nb_functions * len(config.encoder_hash_byte_group_size) + total_vocab_size = config.encoder_hash_byte_group_vocab * num_embeddings + self.encoder_hash_tok_embedding = nn.Embedding(total_vocab_size, config.encoder_config.hidden_size) + if self.config.patch_in_forward: + self.patcher = BltPatcher(config.patcher_config) + self.patcher.eval() + 
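# The entropy patcher is used only to compute patch boundaries and is kept frozen; it is never trained together with the rest of the model. +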
for param in self.patcher.parameters(): + param.requires_grad = False + else: + self.patcher = None + self.post_init() + + @check_model_inputs + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + patch_lengths: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPast: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + # Extract input embeddings as early as possible + if inputs_embeds is not None: + encoder_embeds = inputs_embeds + batch_size, sequence_length, _ = inputs_embeds.shape + else: + batch_size, sequence_length = input_ids.shape + encoder_embeds = compute_hash_embeddings( + input_ids, + self.local_encoder, + self.encoder_hash_tok_embedding, + self.config.encoder_hash_byte_group_nb_functions, + self.config.encoder_hash_byte_group_size, + self.config.encoder_hash_byte_group_vocab, + ) + + if patch_lengths is None: + if self.config.patching_mode == "entropy" and self.patcher is not None: + if input_ids is None: + raise ValueError("input_ids is required for entropy-based patching") + _, patch_lengths, _ = self.patcher( + input_ids, + patch_size=self.config.patch_size, + threshold=self.config.patching_threshold, + max_patch_length=self.config.max_patch_length, + patching_batch_size=self.config.patching_batch_size, + device=input_ids.device, + ) + else: + device = input_ids.device if input_ids is not None else inputs_embeds.device + dtype = input_ids.dtype if input_ids is not None else inputs_embeds.dtype + patch_lengths = process_patch_lengths( + torch.ones((batch_size, sequence_length + 1), dtype=dtype, device=device), + self.config.max_patch_length, + ) + patch_ids = self._patch_ids_from_lengths(patch_lengths, sequence_length) + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + encoder_embeds.shape[1], device=encoder_embeds.device + ) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = create_causal_mask( + config=self.config, + input_embeds=encoder_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + cross_attn_mask_enc = _prepare_patch_cross_attention_mask( + patch_ids=patch_ids, + num_patches=patch_lengths.shape[1], + sequence_length=sequence_length, + patches_as_queries=True, + cross_attn_k=self.config.cross_attn_k, + dtype=encoder_embeds.dtype, + ) + encoder_hidden_states, encoder_cross_states = self.local_encoder( + input_ids=input_ids, + inputs_embeds=encoder_embeds, + attention_mask=causal_mask, + position_ids=position_ids, + encoder_attention_mask=cross_attn_mask_enc, + num_patches=patch_lengths.shape[1], + patch_ids=patch_ids, + **kwargs, + ) + encoder_cross_states = encoder_cross_states.view(batch_size, patch_lengths.shape[1], -1) + global_cache_position = torch.arange(0, encoder_cross_states.shape[1], device=encoder_cross_states.device) + global_position_ids = global_cache_position.unsqueeze(0) + global_causal_mask = create_causal_mask( + config=self.config, + 
input_embeds=encoder_cross_states, + attention_mask=None, + cache_position=global_cache_position, + past_key_values=None, + position_ids=None, + ) + + global_hidden_states = self.global_transformer( + input_embeds=encoder_cross_states, + attention_mask=global_causal_mask, + position_ids=global_position_ids, + **kwargs, + ) + decoder_patch_ids = self._patch_ids_from_lengths(patch_lengths[:, 1:], sequence_length) + cross_attn_mask_dec = _prepare_patch_cross_attention_mask( + patch_ids=decoder_patch_ids, + num_patches=patch_lengths.shape[1], + sequence_length=sequence_length, + patches_as_queries=False, + cross_attn_k=self.config.cross_attn_k, + dtype=encoder_embeds.dtype, + ) + output = self.local_decoder( + input_ids=input_ids, + inputs_embeds=encoder_hidden_states, + patch_embeds=global_hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + cache_position=cache_position, + encoder_attention_mask=cross_attn_mask_dec, + **kwargs, + ) + return BaseModelOutputWithPast( + last_hidden_state=output, + past_key_values=past_key_values, + ) + + def get_input_embeddings(self): + return self.local_encoder.embed_tokens + + def set_input_embeddings(self, value): + self.local_encoder.embed_tokens = value + + def _patch_ids_from_lengths(self, patch_lengths: torch.Tensor, seq_len: int) -> torch.Tensor: + batch_size = patch_lengths.shape[0] + patch_starts = torch.cat( + [ + torch.zeros(batch_size, 1, dtype=patch_lengths.dtype, device=patch_lengths.device), + patch_lengths.cumsum(dim=-1)[:, :-1], + ], + dim=-1, + ) + token_positions = torch.arange(seq_len, device=patch_lengths.device) + return (patch_starts.unsqueeze(1) <= token_positions.unsqueeze(0).unsqueeze(-1)).sum(dim=-1) - 1 + + +class BltForCausalLM(MllamaForCausalLM): + config: BltConfig + _can_compile_fullgraph = False + base_model_prefix = "model" + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: BltConfig): + super().__init__(config) + self.vocab_size = config.vocab_size + self.model = BltModel(config) + self.lm_head = nn.Linear(config.decoder_config.hidden_size, config.vocab_size, bias=False) + self.post_init() + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + cross_attention_states: Optional[torch.LongTensor] = None, # Keep for compatibility + cross_attention_mask: Optional[torch.LongTensor] = None, + full_text_row_masked_out_mask: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, CausalLMOutputWithPast]: + # Call parent forward but exclude cross_attention_states from model call + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + cross_attention_mask=cross_attention_mask, + full_text_row_masked_out_mask=full_text_row_masked_out_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + 
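# With the default logits_to_keep=0, slice(-0, None) == slice(0, None), so logits are computed for every position. +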
logits = self.lm_head(hidden_states[:, slice_indices, :]).float() + + loss = None + if labels is not None: + loss = self.loss_function(logits, labels, self.vocab_size, **kwargs) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "BltPreTrainedModel", + "BltModel", + "BltPatcher", + "BltForCausalLM", +] diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower_fast.py b/src/transformers/models/bridgetower/image_processing_bridgetower_fast.py index 44da5d4486e7..5be6f9f6c54b 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower_fast.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower_fast.py @@ -18,6 +18,7 @@ from typing import Optional, Union import torch +from torchvision.transforms.v2 import functional as F from ...image_processing_utils_fast import ( BaseImageProcessorFast, @@ -31,13 +32,7 @@ reorder_images, ) from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling -from ...utils import auto_docstring, is_torchvision_v2_available - - -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F +from ...utils import auto_docstring def make_pixel_mask( diff --git a/src/transformers/models/bros/convert_bros_to_pytorch.py b/src/transformers/models/bros/convert_bros_to_pytorch.py deleted file mode 100644 index 35c89a88da69..000000000000 --- a/src/transformers/models/bros/convert_bros_to_pytorch.py +++ /dev/null @@ -1,145 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Convert Bros checkpoints.""" - -import argparse - -import bros # original repo -import torch - -from transformers import BrosConfig, BrosModel, BrosProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_configs(model_name): - bros_config = BrosConfig.from_pretrained(model_name) - return bros_config - - -def remove_ignore_keys_(state_dict): - ignore_keys = [ - "embeddings.bbox_sinusoid_emb.inv_freq", - ] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(name): - if name == "embeddings.bbox_projection.weight": - name = "bbox_embeddings.bbox_projection.weight" - - if name == "embeddings.bbox_sinusoid_emb.x_pos_emb.inv_freq": - name = "bbox_embeddings.bbox_sinusoid_emb.x_pos_emb.inv_freq" - - if name == "embeddings.bbox_sinusoid_emb.y_pos_emb.inv_freq": - name = "bbox_embeddings.bbox_sinusoid_emb.y_pos_emb.inv_freq" - - return name - - -def convert_state_dict(orig_state_dict, model): - # rename keys - for key in orig_state_dict.copy(): - val = orig_state_dict.pop(key) - orig_state_dict[rename_key(key)] = val - - # remove ignore keys - remove_ignore_keys_(orig_state_dict) - - return orig_state_dict - - -def convert_bros_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): - # load original model - original_model = bros.BrosModel.from_pretrained(model_name).eval() - - # load HuggingFace Model - bros_config = get_configs(model_name) - model = BrosModel.from_pretrained(model_name, config=bros_config) - model.eval() - - state_dict = original_model.state_dict() - new_state_dict = convert_state_dict(state_dict, model) - model.load_state_dict(new_state_dict) - - # verify results - - # original BROS model require 4 points (8 float values) for each bbox, prepare bbox with [batch_size, seq_len, 8] shape - bbox = torch.tensor( - [ - [ - [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.4396, 0.6720, 0.4659, 0.6720, 0.4659, 0.6850, 0.4396, 0.6850], - [0.4698, 0.6720, 0.4843, 0.6720, 0.4843, 0.6850, 0.4698, 0.6850], - [0.4698, 0.6720, 0.4843, 0.6720, 0.4843, 0.6850, 0.4698, 0.6850], - [0.2047, 0.6870, 0.2730, 0.6870, 0.2730, 0.7000, 0.2047, 0.7000], - [0.2047, 0.6870, 0.2730, 0.6870, 0.2730, 0.7000, 0.2047, 0.7000], - [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000], - ] - ] - ) - - processor = BrosProcessor.from_pretrained(model_name) - - encoding = processor("His name is Rocco.", return_tensors="pt") - encoding["bbox"] = bbox - - original_hidden_states = original_model(**encoding).last_hidden_state - # pixel_values = processor(image, return_tensors="pt").pixel_values - - last_hidden_states = model(**encoding).last_hidden_state - - assert torch.allclose(original_hidden_states, last_hidden_states, atol=1e-4) - - if pytorch_dump_folder_path is not None: - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model.push_to_hub("jinho8345/" + model_name.split("/")[-1], commit_message="Update model") - processor.push_to_hub("jinho8345/" + model_name.split("/")[-1], commit_message="Update model") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - # Required parameters - parser.add_argument( - "--model_name", - default="jinho8345/bros-base-uncased", - required=False, - type=str, - help="Name of the original model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", 
- default=None, - required=False, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether or not to push the converted model and processor to the 🤗 hub.", - ) - - args = parser.parse_args() - convert_bros_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/byt5/convert_byt5_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/byt5/convert_byt5_original_tf_checkpoint_to_pytorch.py deleted file mode 100755 index 9b1b15857cea..000000000000 --- a/src/transformers/models/byt5/convert_byt5_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,59 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The T5 authors and HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert T5 checkpoint.""" - -import argparse - -from transformers import T5Config, T5ForConditionalGeneration, load_tf_weights_in_t5 -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): - # Initialise PyTorch model - config = T5Config.from_json_file(config_file) - print(f"Building PyTorch model from configuration: {config}") - model = T5ForConditionalGeneration(config) - - # Load weights from tf checkpoint - load_tf_weights_in_t5(model, config, tf_checkpoint_path) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - model.save_pretrained(pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained T5 model. \nThis specifies the model architecture." - ), - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py deleted file mode 100644 index 45dcdb290333..000000000000 --- a/src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,65 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert CANINE checkpoint.""" - -import argparse - -from transformers import CanineConfig, CanineModel, CanineTokenizer, load_tf_weights_in_canine -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, pytorch_dump_path): - # Initialize PyTorch model - config = CanineConfig() - model = CanineModel(config) - model.eval() - - print(f"Building PyTorch model from configuration: {config}") - - # Load weights from tf checkpoint - load_tf_weights_in_canine(model, config, tf_checkpoint_path) - - # Save pytorch-model (weights and configuration) - print(f"Save PyTorch model to {pytorch_dump_path}") - model.save_pretrained(pytorch_dump_path) - - # Save tokenizer files - tokenizer = CanineTokenizer() - print(f"Save tokenizer files to {pytorch_dump_path}") - tokenizer.save_pretrained(pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", - default=None, - type=str, - required=True, - help="Path to the TensorFlow checkpoint. Should end with model.ckpt", - ) - parser.add_argument( - "--pytorch_dump_path", - default=None, - type=str, - required=True, - help="Path to a folder where the PyTorch model will be placed.", - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.pytorch_dump_path) diff --git a/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py b/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py deleted file mode 100644 index 27661ec2bac4..000000000000 --- a/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py +++ /dev/null @@ -1,478 +0,0 @@ -# Copyright 2024 Meta Inc. and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import gc -import json -import os - -import requests -import torch -import yaml -from accelerate import init_empty_weights -from PIL import Image - -from transformers import ( - ChameleonConfig, - ChameleonForConditionalGeneration, - ChameleonImageProcessor, - ChameleonProcessor, -) - - -try: - from transformers import LlamaTokenizerFast -except ImportError: - raise ValueError( - "Chameleon conversion supports only FastTokenizer and LlamaTokenizerFast can't be imported! " - "Update your `tokenizers` library and re-run the tokenizer conversion." 
- ) - -""" -Sample usage: - -``` -python src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py \ - --input_dir /path/to/downloaded/chameleon/weights --model_size 7B --output_dir /output/path -``` - -Thereafter, models can be loaded via: - -```py -from transformers import ChameleonForConditionalGeneration, LlamaTokenizerFast - -model = ChameleonForConditionalGeneration.from_pretrained("/output/path") -tokenizer = LlamaTokenizerFast.from_pretrained("/output/path") -``` - -Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions -come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). -""" - -NUM_SHARDS = { - "7B": 1, - "30B": 4, -} - -VOCAB_SIZE = 65536 - - -def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256): - return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of) - - -def read_json(path): - with open(path, "r") as f: - return json.load(f) - - -def write_json(text, path): - with open(path, "w") as f: - json.dump(text, f) - - -def write_model(model_path, input_base_path, model_size, chameleon_version=1): - os.makedirs(model_path, exist_ok=True) - input_model_path = os.path.join(input_base_path, "models", model_size.lower()) - params_path = os.path.join(input_model_path, "params.json") - consolidate_params_path = os.path.join(input_model_path, "consolidate_params.json") - - params = read_json(params_path) - if os.path.isfile(consolidate_params_path): - params = {**params, **read_json(consolidate_params_path)} - num_shards = NUM_SHARDS[model_size] - model_parallel_size = params["model_parallel_size"] - params = params.get("model", params) - n_layers = params["n_layers"] - n_heads = params["n_heads"] - n_heads_per_shard = n_heads // num_shards - dim = params["dim"] - dims_per_head = dim // n_heads - base = params.get("rope_theta", 10000.0) - swin_norm = params["swin_norm"] - if base > 10000.0: - max_position_embeddings = 16384 - else: - # Depending on the Chameleon version, the default max_position_embeddings has different values. - if chameleon_version == 1: - max_position_embeddings = 4096 - else: - raise NotImplementedError( - f"Version {chameleon_version} of chameleon is not supported yet. " - "Current supported versions of chameleon are [1]." - ) - - if params.get("n_kv_heads", None) is not None: - num_key_value_heads = params["n_kv_heads"] # for GQA / MQA - num_local_key_value_heads = n_heads_per_shard // num_key_value_heads - key_value_dim = dim // num_key_value_heads - else: # compatibility with other checkpoints - num_key_value_heads = n_heads - num_local_key_value_heads = n_heads_per_shard - key_value_dim = dim - - print(f"Fetching all parameters from the checkpoint at {input_model_path}.") - # Load weights - if num_shards == 1: - # Not sharded - # (The sharded implementation would also work, but this is simpler.) 
- loaded = None - for possible_name in ["consolidated.pth", "consolidated.00.pth"]: - possible_path = os.path.join(input_model_path, possible_name) - if os.path.exists(possible_path): - loaded = torch.load(possible_path, map_location="cpu", weights_only=True) - break - assert loaded is not None - else: - # Sharded - loaded = [ - torch.load( - os.path.join(input_model_path, f"consolidated.{i:02d}.pth"), map_location="cpu", weights_only=True - ) - for i in range(num_shards) - ] - - # permute for sliced rotary - def permute(w, n_heads, dim1=dim, dim2=dim): - return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) - - # Load weights to the state dict - state_dict = {} - for layer_i in range(n_layers): - if num_shards == 1: - # Unsharded - state_dict.update( - { - f"model.layers.{layer_i}.self_attn.q_proj.weight": permute( - loaded[f"layers.{layer_i}.attention.wq.weight"], n_heads=n_heads - ), - f"model.layers.{layer_i}.self_attn.k_proj.weight": permute( - loaded[f"layers.{layer_i}.attention.wk.weight"], - n_heads=num_key_value_heads, - dim1=key_value_dim, - ), - f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"], - f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"], - f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"], - f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"], - f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"], - f"model.layers.{layer_i}.input_layernorm.weight": loaded[ - f"layers.{layer_i}.attention_norm.weight" - ], - f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[ - f"layers.{layer_i}.ffn_norm.weight" - ], - } - ) - # qk_layernorm (see https://github.com/huggingface/transformers/pull/31534#issuecomment-2207354677) - state_dict[f"model.layers.{layer_i}.self_attn.q_norm.weight"] = ( - loaded[f"layers.{layer_i}.attention.q_normalization.weight"] - .view(dims_per_head // 2, 2) - .t() - .reshape(1, -1) - .repeat_interleave(n_heads, 0) - ) - state_dict[f"model.layers.{layer_i}.self_attn.q_norm.bias"] = ( - loaded[f"layers.{layer_i}.attention.q_normalization.bias"] - .view(dims_per_head // 2, 2) - .t() - .reshape(1, -1) - .repeat_interleave(n_heads, 0) - ) - state_dict[f"model.layers.{layer_i}.self_attn.k_norm.weight"] = ( - loaded[f"layers.{layer_i}.attention.k_normalization.weight"] - .view(dims_per_head // 2, 2) - .t() - .reshape(1, -1) - .repeat_interleave(num_key_value_heads, 0) - ) - state_dict[f"model.layers.{layer_i}.self_attn.k_norm.bias"] = ( - loaded[f"layers.{layer_i}.attention.k_normalization.bias"] - .view(dims_per_head // 2, 2) - .t() - .reshape(1, -1) - .repeat_interleave(num_key_value_heads, 0) - ) - - else: - # Sharded - state_dict.update( - { - f"model.layers.{layer_i}.input_layernorm.weight": torch.stack( - [l[f"layers.{layer_i}.attention_norm.weight"] for l in loaded] - ).mean(dim=0), - f"model.layers.{layer_i}.post_attention_layernorm.weight": torch.stack( - [l[f"layers.{layer_i}.ffn_norm.weight"] for l in loaded] - ).mean(dim=0), - } - ) - state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute( - torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim) - for i in range(num_shards) - ], - dim=0, - ).reshape(dim, dim), - n_heads=n_heads, - ) - - state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] 
= permute( - torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wk.weight"].view( - num_local_key_value_heads, dims_per_head, dim - ) - for i in range(num_shards) - ], - dim=0, - ).reshape(key_value_dim, dim), - n_heads=num_key_value_heads, - dim1=key_value_dim, - ) - - # qk_layernorm (see https://github.com/huggingface/transformers/pull/31534#issuecomment-2207354677) - state_dict[f"model.layers.{layer_i}.self_attn.q_norm.weight"] = ( - torch.cat([l[f"layers.{layer_i}.attention.q_normalization.weight"].unsqueeze(0) for l in loaded]) - .view(num_shards, dims_per_head // 2, 2) - .transpose(1, 2) - .reshape(num_shards, -1) - .repeat_interleave(n_heads // num_shards, 0) - ) - state_dict[f"model.layers.{layer_i}.self_attn.q_norm.bias"] = ( - torch.cat([l[f"layers.{layer_i}.attention.q_normalization.bias"].unsqueeze(0) for l in loaded]) - .view(num_shards, dims_per_head // 2, 2) - .transpose(1, 2) - .reshape(num_shards, -1) - .repeat_interleave(n_heads // num_shards, 0) - ) - state_dict[f"model.layers.{layer_i}.self_attn.k_norm.weight"] = ( - torch.cat([l[f"layers.{layer_i}.attention.k_normalization.weight"].unsqueeze(0) for l in loaded]) - .view(num_shards, dims_per_head // 2, 2) - .transpose(1, 2) - .reshape(num_shards, -1) - .repeat_interleave(num_key_value_heads // num_shards, 0) - ) - state_dict[f"model.layers.{layer_i}.self_attn.k_norm.bias"] = ( - torch.cat([l[f"layers.{layer_i}.attention.k_normalization.bias"].unsqueeze(0) for l in loaded]) - .view(num_shards, dims_per_head // 2, 2) - .transpose(1, 2) - .reshape(num_shards, -1) - .repeat_interleave(num_key_value_heads // num_shards, 0) - ) - - state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wv.weight"].view( - num_local_key_value_heads, dims_per_head, dim - ) - for i in range(num_shards) - ], - dim=0, - ).reshape(key_value_dim, dim) - - state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1 - ) - state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0 - ) - state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1 - ) - state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0 - ) - - if num_shards == 1: - # Unsharded - state_dict.update( - { - "model.embed_tokens.weight": loaded["tok_embeddings.weight"], - "model.norm.weight": loaded["norm.weight"], - "lm_head.weight": loaded["output.weight"], - } - ) - else: - state_dict.update( - { - "model.embed_tokens.weight": torch.cat( - [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1 - ), - "model.norm.weight": torch.stack([loaded[i]["norm.weight"] for i in range(num_shards)]).mean(dim=0), - "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0), - } - ) - - # Load VQGAN weights - vqgan_path = os.path.join(input_base_path, "tokenizer/vqgan.ckpt") - vqgan_state_dict = torch.load(vqgan_path, map_location="cpu", weights_only=True)["state_dict"] - for k, v in vqgan_state_dict.items(): - if "decoder" in k: - continue # we dont do image generation yet - state_dict[f"model.vqmodel.{k}"] = v - - # Write configs - ffn_dim_multiplier = 
params.get("ffn_dim_multiplier", 1) - multiple_of = params.get("multiple_of", 256) - - with open(os.path.join(input_base_path, "tokenizer/text_tokenizer.json")) as tokenizer_file: - tokenizer_config = json.load(tokenizer_file) - vocabulary_map = tokenizer_config["model"]["vocab"] - vocabulary_map[""] = vocabulary_map[ - "" - ] # use a reserved token instead of adding a new one - del vocabulary_map[""] - - for token in tokenizer_config["added_tokens"]: - if token["content"] == "": - token["content"] = "" - - with open(os.path.join(input_base_path, "tokenizer/text_tokenizer_modified.json"), "w") as f: - json.dump(tokenizer_config, f) # save the new file to init tokenizer later - - vq_keys_to_replace = [ - ("ch", "base_channels"), - ("out_ch", "out_channels"), - ("n_embed", "num_embeddings"), - ("ch_mult", "channel_multiplier"), - ("double_z", "double_latent"), - ("z_channels", "latent_channels"), - ] - with open(os.path.join(input_base_path, "tokenizer/vqgan.yaml")) as vqgan_cfg_file: - vq_config = yaml.safe_load(vqgan_cfg_file)["model"]["params"] - vq_config.update(**vq_config["ddconfig"]) - for old, new in vq_keys_to_replace: - vq_config[new] = vq_config[old] - del vq_config["ddconfig"] - del vq_config["ckpt_path"] - del vq_config["lossconfig"] - - config = ChameleonConfig( - hidden_size=dim, - intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of), - num_attention_heads=params["n_heads"], - num_hidden_layers=params["n_layers"], - rms_norm_eps=params["norm_eps"], - num_key_value_heads=num_key_value_heads, - vocab_size=VOCAB_SIZE, - rope_theta=base, - max_position_embeddings=max_position_embeddings, - model_parallel_size=model_parallel_size, - swin_norm=swin_norm, - vq_config=vq_config, - vocabulary_map=vocabulary_map, - ) - with init_empty_weights(): - model = ChameleonForConditionalGeneration(config) - - model.load_state_dict(state_dict, assign=True, strict=False) - model.save_pretrained(model_path, safe_serialization=True) - - # Load and save the processor - tokenizer = LlamaTokenizerFast( - tokenizer_file=os.path.join(input_base_path, "tokenizer/text_tokenizer_modified.json"), legacy=False - ) - tokenizer.sep_token_id = 8710 # assign to sep so that we can append it after input text - tokenizer.pad_token_id = 1 # assign to special pad_token - image_processor = ChameleonImageProcessor() - processor = ChameleonProcessor(image_processor=image_processor, tokenizer=tokenizer) - processor.save_pretrained(model_path) - - # Make space so we can load the model properly now. - del state_dict - del loaded - del vqgan_state_dict - gc.collect() - - # Short inference on a few examples to check if generation makes sense - # taken from https://github.com/facebookresearch/chameleon/blob/7a72f40aa5f462965c8374f25257f55b65b25ff4/data/prompts_for_human_evaluations.jsonl - print("Loading the checkpoint in a Chameleon model...") - print("*" * 100) - model = ChameleonForConditionalGeneration.from_pretrained( - model_path, attn_implementation="eager", dtype=torch.bfloat16, device_map="auto" - ) - processor = ChameleonProcessor.from_pretrained(model_path) - - prompt = "I'm very intrigued by this work of art:Please tell me about the artist." 
- image = Image.open( - requests.get( - "https://uploads4.wikiart.org/images/paul-klee/death-for-the-idea-1915.jpg!Large.jpg", stream=True - ).raw - ) - inputs = processor(prompt, images=image, return_tensors="pt").to(model.device, torch.bfloat16) - length = inputs.input_ids.shape[1] - - out = model.generate(**inputs, max_new_tokens=40, do_sample=False) - generated_text = processor.batch_decode(out[:, length:], skip_special_tokens=True)[0] - - print(f"Generation for single-image: {generated_text}") - print("*" * 100) - - # Multi-image example - prompt = "I used to know a lot about constellations when I was younger, but as I grew older, I forgot most of what I knew. These are the only two constellations that I really remember now.I would like for you to tell me about 3 more constellations and give me a little bit of history about the constellation." - image = Image.open( - requests.get("https://nineplanets.org/wp-content/uploads/2020/12/the-big-dipper-1.jpg", stream=True).raw - ) - image_2 = Image.open( - requests.get("https://www.kxan.com/wp-content/uploads/sites/40/2020/10/ORION.jpg", stream=True).raw - ) - - inputs = processor(prompt, images=[image, image_2], return_tensors="pt").to(model.device, dtype=torch.bfloat16) - length = inputs.input_ids.shape[1] - out = model.generate(**inputs, max_new_tokens=50, do_sample=False) - generated_text = processor.batch_decode(out[:, length:], skip_special_tokens=True)[0] - - print(f"Generation for multi-image: {generated_text}") - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_dir", - help="Location of Chameleon weights", - ) - parser.add_argument( - "--model_size", - choices=["7B", "30B"], - help="" - " models correspond to the finetuned versions, and are specific to the Chameleon official release. For more details on Chameleon, check out the original repo: https://github.com/facebookresearch/chameleon", - ) - parser.add_argument( - "--output_dir", - help="Location to write HF model", - ) - parser.add_argument( - "--test_inference", - action="store_true", - help="Whether to load the model for generation to test it's converted correctly.", - ) - # Different Chameleon versions used different default values for max_position_embeddings, hence the need to be able to specify which version is being used. 
- parser.add_argument( - "--chameleon_version", - choices=[1], - default=1, - type=int, - help="Version of the Chameleon model to convert", - ) - args = parser.parse_args() - write_model( - model_path=args.output_dir, - input_base_path=args.input_dir, - model_size=args.model_size, - chameleon_version=args.chameleon_version, - ) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/chameleon/image_processing_chameleon_fast.py b/src/transformers/models/chameleon/image_processing_chameleon_fast.py index 39aa4ec87b00..1d102614f7df 100644 --- a/src/transformers/models/chameleon/image_processing_chameleon_fast.py +++ b/src/transformers/models/chameleon/image_processing_chameleon_fast.py @@ -19,17 +19,13 @@ import numpy as np import PIL import torch +from torchvision.transforms.v2 import functional as F from ...image_processing_utils_fast import BaseImageProcessorFast from ...image_utils import ImageInput, PILImageResampling, SizeDict -from ...utils import auto_docstring, is_torchvision_v2_available, logging +from ...utils import auto_docstring, logging -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - logger = logging.get_logger(__name__) diff --git a/src/transformers/models/chinese_clip/configuration_chinese_clip.py b/src/transformers/models/chinese_clip/configuration_chinese_clip.py index e7c98d0d2d9f..c628107048b9 100644 --- a/src/transformers/models/chinese_clip/configuration_chinese_clip.py +++ b/src/transformers/models/chinese_clip/configuration_chinese_clip.py @@ -307,7 +307,7 @@ def __init__( # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different. for key, value in _text_config_dict.items(): - if key in text_config and value != text_config[key] and key not in ["transformers_version"]: + if key in text_config and value != text_config[key] and key != "transformers_version": # If specified in `text_config_dict` if key in text_config_dict: message = ( @@ -339,7 +339,7 @@ def __init__( # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different. for key, value in _vision_config_dict.items(): - if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]: + if key in vision_config and value != vision_config[key] and key != "transformers_version": # If specified in `vision_config_dict` if key in vision_config_dict: message = ( diff --git a/src/transformers/models/chinese_clip/convert_chinese_clip_original_pytorch_to_hf.py b/src/transformers/models/chinese_clip/convert_chinese_clip_original_pytorch_to_hf.py deleted file mode 100644 index adc9300ef512..000000000000 --- a/src/transformers/models/chinese_clip/convert_chinese_clip_original_pytorch_to_hf.py +++ /dev/null @@ -1,134 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The OFA-Sys Team Authors and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
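For reference on the removed Chameleon converter above: `compute_intermediate_size` scales `8n/3` by `ffn_dim_multiplier` and rounds it up to the next multiple of `multiple_of`, which is how the config's `intermediate_size` is recovered from the hidden size. A quick self-contained check of that arithmetic (the hidden size 4096 is chosen purely for illustration):

```python
def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256):
    # Same formula as in the removed conversion script: scale 8n/3 by the
    # multiplier, then round up to the next multiple of `multiple_of`.
    return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of)


# int(8 * 4096 / 3) == 10922, rounded up to the next multiple of 256:
assert compute_intermediate_size(4096) == 11008
```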
- -import argparse - -import torch - -from transformers import ChineseCLIPConfig, ChineseCLIPModel - - -def copy_attn_layer(hf_attn_layer, pt_weights, prefix): - q_proj, k_proj, v_proj = pt_weights[f"{prefix}.in_proj_weight"].chunk(3, dim=0) - q_proj_bias, k_proj_bias, v_proj_bias = pt_weights[f"{prefix}.in_proj_bias"].chunk(3, dim=0) - - out_proj_weights = pt_weights[f"{prefix}.out_proj.weight"] - out_proj_bias = pt_weights[f"{prefix}.out_proj.bias"] - - hf_attn_layer.q_proj.weight.data = q_proj - hf_attn_layer.q_proj.bias.data = q_proj_bias - - hf_attn_layer.k_proj.weight.data = k_proj - hf_attn_layer.k_proj.bias.data = k_proj_bias - - hf_attn_layer.v_proj.weight.data = v_proj - hf_attn_layer.v_proj.bias.data = v_proj_bias - - hf_attn_layer.out_proj.weight.data = out_proj_weights - hf_attn_layer.out_proj.bias.data = out_proj_bias - - -def copy_mlp(hf_mlp, pt_weights, prefix): - copy_linear(hf_mlp.fc1, pt_weights, f"{prefix}.c_fc") - copy_linear(hf_mlp.fc2, pt_weights, f"{prefix}.c_proj") - - -def copy_linear(hf_linear, pt_weights, prefix): - hf_linear.weight.data = pt_weights[f"{prefix}.weight"].data - hf_linear.bias.data = pt_weights[f"{prefix}.bias"].data - - -def copy_layer(hf_layer, pt_weights, prefix): - # copy layer norms - copy_linear(hf_layer.layer_norm1, pt_weights, f"{prefix}.ln_1") - copy_linear(hf_layer.layer_norm2, pt_weights, f"{prefix}.ln_2") - - # copy MLP - copy_mlp(hf_layer.mlp, pt_weights, f"{prefix}.mlp") - - # copy attn - copy_attn_layer(hf_layer.self_attn, pt_weights, f"{prefix}.attn") - - -def copy_layers(hf_layers, pt_weights, prefix): - for layer_id, hf_layer in enumerate(hf_layers): - copy_layer(hf_layer, pt_weights, f"{prefix}.{layer_id}") - - -def copy_text_model_and_projection(hf_model, pt_weights): - # copy projection - hf_model.text_projection.weight.data = pt_weights["text_projection"].data.T - - # copy text encoder - for name, param in hf_model.text_model.named_parameters(): - param.data = pt_weights[f"bert.{name}"].data - - -def copy_vision_model_and_projection(hf_model, pt_weights): - # copy projection - hf_model.visual_projection.weight.data = pt_weights["visual.proj"].data.T - - # copy layer norms - copy_linear(hf_model.vision_model.pre_layrnorm, pt_weights, "visual.ln_pre") - copy_linear(hf_model.vision_model.post_layernorm, pt_weights, "visual.ln_post") - - # copy embeddings - hf_model.vision_model.embeddings.patch_embedding.weight.data = pt_weights["visual.conv1.weight"].data - hf_model.vision_model.embeddings.class_embedding.data = pt_weights["visual.class_embedding"].data - hf_model.vision_model.embeddings.position_embedding.weight.data = pt_weights["visual.positional_embedding"].data - - # copy encoder - copy_layers(hf_model.vision_model.encoder.layers, pt_weights, "visual.transformer.resblocks") - - -@torch.no_grad() -def convert_chinese_clip_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None): - """ - Copy/paste/tweak model's weights to transformers design. - """ - - assert config_path is not None, "Please specify the ChineseCLIP model config of the corresponding model size." 
- config = ChineseCLIPConfig.from_pretrained(config_path) - - hf_model = ChineseCLIPModel(config).eval() - - pt_weights = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["state_dict"] - pt_weights = {(name[7:] if name.startswith("module.") else name): value for name, value in pt_weights.items()} - - copy_text_model_and_projection(hf_model, pt_weights) - copy_vision_model_and_projection(hf_model, pt_weights) - hf_model.logit_scale.data = pt_weights["logit_scale"].data - - hf_model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output folder storing converted hf PyTorch model.", - ) - parser.add_argument( - "--checkpoint_path", default=None, type=str, help="Path to original github format ChineseCLIP checkpoint." - ) - parser.add_argument( - "--config_path", default=None, required=True, type=str, help="Path to hf config.json of model to convert." - ) - args = parser.parse_args() - - convert_chinese_clip_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path) - print("The conversion is finished!") diff --git a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py b/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py deleted file mode 100644 index 66488e401a1a..000000000000 --- a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py +++ /dev/null @@ -1,133 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
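The removed Chinese-CLIP converter above copies attention weights by chunking the fused `in_proj_weight` and `in_proj_bias` along dim 0 into query, key and value. A minimal sketch of that split, using toy tensor sizes rather than any real checkpoint:

```python
import torch

# Illustrative shapes only: the fused attention projection stacks the Q, K and
# V projections along dim 0, which is what the chunk into 3 in the removed
# converter relies on when filling q_proj / k_proj / v_proj.
hidden_size = 8  # hypothetical toy size
in_proj_weight = torch.randn(3 * hidden_size, hidden_size)
in_proj_bias = torch.randn(3 * hidden_size)

q_w, k_w, v_w = in_proj_weight.chunk(3, dim=0)
q_b, k_b, v_b = in_proj_bias.chunk(3, dim=0)

assert q_w.shape == k_w.shape == v_w.shape == (hidden_size, hidden_size)
assert q_b.shape == (hidden_size,)
```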
- -import argparse -import re - -from laion_clap import CLAP_Module - -from transformers import AutoFeatureExtractor, ClapConfig, ClapModel - - -KEYS_TO_MODIFY_MAPPING = { - "text_branch": "text_model", - "audio_branch": "audio_model.audio_encoder", - "attn": "attention.self", - "self.proj": "output.dense", - "attention.self_mask": "attn_mask", - "mlp.fc1": "intermediate.dense", - "mlp.fc2": "output.dense", - "norm1": "layernorm_before", - "norm2": "layernorm_after", - "bn0": "batch_norm", -} - -processor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused", truncation="rand_trunc") - - -def init_clap(checkpoint_path, model_type, enable_fusion=False): - model = CLAP_Module( - amodel=model_type, - enable_fusion=enable_fusion, - ) - model.load_ckpt(checkpoint_path) - return model - - -def get_config_from_original(clap_model): - audio_config = { - "patch_embeds_hidden_size": clap_model.model.audio_branch.embed_dim, - "depths": clap_model.model.audio_branch.depths, - "hidden_size": clap_model.model.audio_projection[0].in_features, - } - - text_config = {"hidden_size": clap_model.model.text_branch.pooler.dense.in_features} - - return ClapConfig(audio_config=audio_config, text_config=text_config) - - -def rename_state_dict(state_dict): - model_state_dict = {} - - sequential_layers_pattern = r".*sequential.(\d+).*" - text_projection_pattern = r".*_projection.(\d+).*" - - for key, value in state_dict.items(): - # check if any key needs to be modified - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - if re.match(sequential_layers_pattern, key): - # replace sequential layers with list - sequential_layer = re.match(sequential_layers_pattern, key).group(1) - - key = key.replace(f"sequential.{sequential_layer}.", f"layers.{int(sequential_layer) // 3}.linear.") - elif re.match(text_projection_pattern, key): - projecton_layer = int(re.match(text_projection_pattern, key).group(1)) - - # Because in CLAP they use `nn.Sequential`... 
- transformers_projection_layer = 1 if projecton_layer == 0 else 2 - - key = key.replace(f"_projection.{projecton_layer}.", f"_projection.linear{transformers_projection_layer}.") - - if "audio" and "qkv" in key: - # split qkv into query key and value - mixed_qkv = value - qkv_dim = mixed_qkv.size(0) // 3 - - query_layer = mixed_qkv[:qkv_dim] - key_layer = mixed_qkv[qkv_dim : qkv_dim * 2] - value_layer = mixed_qkv[qkv_dim * 2 :] - - model_state_dict[key.replace("qkv", "query")] = query_layer - model_state_dict[key.replace("qkv", "key")] = key_layer - model_state_dict[key.replace("qkv", "value")] = value_layer - else: - model_state_dict[key] = value - - return model_state_dict - - -def convert_clap_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path, model_type, enable_fusion=False): - clap_model = init_clap(checkpoint_path, model_type, enable_fusion=enable_fusion) - - clap_model.eval() - state_dict = clap_model.model.state_dict() - state_dict = rename_state_dict(state_dict) - - transformers_config = get_config_from_original(clap_model) - transformers_config.audio_config.enable_fusion = enable_fusion - model = ClapModel(transformers_config) - - # ignore the spectrogram embedding layer - model.load_state_dict(state_dict, strict=False) - - model.save_pretrained(pytorch_dump_folder_path) - transformers_config.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - parser.add_argument("--enable_fusion", action="store_true", help="Whether to enable fusion or not") - parser.add_argument("--model_type", default="HTSAT-tiny", type=str, help="Whether to enable fusion or not") - args = parser.parse_args() - - convert_clap_checkpoint( - args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.model_type, args.enable_fusion - ) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index e333248c18ed..33daac615c07 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -152,7 +152,7 @@ def to_dict(self) -> dict[str, Any]: del output["mel_filters_slaney"] return output - def _np_extract_fbank_features(self, waveform: np.ndarray, mel_filters: Optional[np.array] = None) -> np.ndarray: + def _np_extract_fbank_features(self, waveform: np.ndarray, mel_filters: Optional[np.ndarray] = None) -> np.ndarray: """ Compute the log-mel spectrogram of the provided `waveform` using the Hann window. In CLAP, two different filter banks are used depending on the truncation pattern: @@ -199,7 +199,7 @@ def _random_mel_fusion(self, mel, total_frames, chunk_frames): mel_fusion = np.stack([mel_shrink, mel_chunk_front, mel_chunk_middle, mel_chunk_back], axis=0) return mel_fusion - def _get_input_mel(self, waveform: np.ndarray, max_length, truncation, padding) -> np.array: + def _get_input_mel(self, waveform: np.ndarray, max_length, truncation, padding) -> np.ndarray: """ Extracts the mel spectrogram and prepares it for the mode based on the `truncation` and `padding` arguments. 
Four different path are possible: diff --git a/src/transformers/models/clip/configuration_clip.py b/src/transformers/models/clip/configuration_clip.py index 0b4fe6ba37f6..e343715e29ee 100644 --- a/src/transformers/models/clip/configuration_clip.py +++ b/src/transformers/models/clip/configuration_clip.py @@ -296,7 +296,7 @@ def __init__( # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different. for key, value in _text_config_dict.items(): - if key in text_config and value != text_config[key] and key not in ["transformers_version"]: + if key in text_config and value != text_config[key] and key != "transformers_version": # If specified in `text_config_dict` if key in text_config_dict: message = ( @@ -328,7 +328,7 @@ def __init__( # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different. for key, value in _vision_config_dict.items(): - if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]: + if key in vision_config and value != vision_config[key] and key != "transformers_version": # If specified in `vision_config_dict` if key in vision_config_dict: message = ( diff --git a/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py b/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py deleted file mode 100644 index 3d88fc1929c3..000000000000 --- a/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py +++ /dev/null @@ -1,156 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
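The removed CLAP converter above renames checkpoint keys in two passes: substring replacements from `KEYS_TO_MODIFY_MAPPING`, then a regex pass that maps `sequential.{i}.` entries onto `layers.{i // 3}.linear.`. The sketch below isolates just that regex step; the example key name is hypothetical and only meant to show the index remapping.

```python
import re

# Pattern copied from the removed converter; the dots after "sequential" and
# inside ".*" are regex wildcards, so the literal dots in the key also match.
sequential_layers_pattern = r".*sequential.(\d+).*"


def remap_sequential_key(key: str) -> str:
    # Map an nn.Sequential entry index i onto block i // 3 in the
    # transformers layout, as done in the removed conversion script.
    match = re.match(sequential_layers_pattern, key)
    if match is None:
        return key
    index = match.group(1)
    return key.replace(f"sequential.{index}.", f"layers.{int(index) // 3}.linear.")


# Hypothetical key name, for illustration only:
assert remap_sequential_key("audio_projection.sequential.3.weight") == "audio_projection.layers.1.linear.weight"
```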
- -import argparse - -import torch -from clip import load - -from transformers import CLIPConfig, CLIPModel - - -def copy_attn_layer(hf_attn_layer, pt_attn_layer): - q_proj, k_proj, v_proj = pt_attn_layer.in_proj_weight.chunk(3, dim=0) - q_proj_bias, k_proj_bias, v_proj_bias = pt_attn_layer.in_proj_bias.chunk(3, dim=0) - - out_proj_weights = pt_attn_layer.out_proj.weight - out_proj_bias = pt_attn_layer.out_proj.bias - - hf_attn_layer.q_proj.weight.data = q_proj - hf_attn_layer.q_proj.bias.data = q_proj_bias - - hf_attn_layer.k_proj.weight.data = k_proj - hf_attn_layer.k_proj.bias.data = k_proj_bias - - hf_attn_layer.v_proj.weight.data = v_proj - hf_attn_layer.v_proj.bias.data = v_proj_bias - - hf_attn_layer.out_proj.weight = out_proj_weights - hf_attn_layer.out_proj.bias = out_proj_bias - - -def copy_mlp(hf_mlp, pt_mlp): - copy_linear(hf_mlp.fc1, pt_mlp.c_fc) - copy_linear(hf_mlp.fc2, pt_mlp.c_proj) - - -def copy_linear(hf_linear, pt_linear): - hf_linear.weight = pt_linear.weight - hf_linear.bias = pt_linear.bias - - -def copy_layer(hf_layer, pt_layer): - # copy layer norms - copy_linear(hf_layer.layer_norm1, pt_layer.ln_1) - copy_linear(hf_layer.layer_norm2, pt_layer.ln_2) - - # copy MLP - copy_mlp(hf_layer.mlp, pt_layer.mlp) - - # copy attn - copy_attn_layer(hf_layer.self_attn, pt_layer.attn) - - -def copy_layers(hf_layers, pt_layers): - for hf_layer, pt_layer in zip(hf_layers, pt_layers): - copy_layer(hf_layer, pt_layer) - - -def copy_encoder(hf_encoder, pt_model): - # copy embeds - hf_encoder.embeddings.token_embedding.weight = pt_model.token_embedding.weight - hf_encoder.embeddings.position_embedding.weight.data = pt_model.positional_embedding - - # copy layer norm - copy_linear(hf_encoder.final_layer_norm, pt_model.ln_final) - - # copy hidden layers - copy_layers(hf_encoder.encoder.layers, pt_model.transformer.resblocks) - - -def copy_text_model_and_projection(hf_model, pt_model): - # copy projection - hf_model.text_projection.weight.data = pt_model.text_projection.data.T.contiguous() - - # copy text encoder - copy_encoder(hf_model.text_model, pt_model) - - -def copy_vison_model_and_projection(hf_model, pt_model): - # copy projection - hf_model.visual_projection.weight.data = pt_model.visual.proj.data.T.contiguous() - - # copy layer norms - copy_linear(hf_model.vision_model.pre_layrnorm, pt_model.visual.ln_pre) - copy_linear(hf_model.vision_model.post_layernorm, pt_model.visual.ln_post) - - # copy embeds - hf_model.vision_model.embeddings.patch_embedding.weight.data = pt_model.visual.conv1.weight.data - hf_model.vision_model.embeddings.class_embedding = pt_model.visual.class_embedding - hf_model.vision_model.embeddings.position_embedding.weight.data = pt_model.visual.positional_embedding.data - - # copy encoder - copy_layers(hf_model.vision_model.encoder.layers, pt_model.visual.transformer.resblocks) - - -@torch.no_grad() -def convert_clip_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None): - """ - Copy/paste/tweak model's weights to transformers design. 
- """ - if config_path is not None: - config = CLIPConfig.from_pretrained(config_path) - else: - config = CLIPConfig(projection_dim=512, text_config={}, vision_config={}) - - hf_model = CLIPModel(config).eval() - - pt_model, _ = load(checkpoint_path, device="cpu", jit=False) - pt_model = pt_model.eval() - - copy_text_model_and_projection(hf_model, pt_model) - copy_vison_model_and_projection(hf_model, pt_model) - hf_model.logit_scale = pt_model.logit_scale - - # Use `eos_token` so the example is more meaningful - input_ids = torch.tensor( - [ - [config.text_config.bos_token_id] - + list(range(3, 77)) - + [config.text_config.eos_token_id] - + [config.text_config.pad_token_id] - ] - ) - pixel_values = torch.randn(1, 3, 224, 224) - - hf_outputs = hf_model(input_ids=input_ids, pixel_values=pixel_values, return_dict=True) - hf_logits_per_image = hf_outputs.logits_per_image - hf_logits_per_text = hf_outputs.logits_per_text - pt_logits_per_image, pt_logits_per_text = pt_model(pixel_values, input_ids) - - assert torch.allclose(hf_logits_per_image, pt_logits_per_image, atol=1e-3) - assert torch.allclose(hf_logits_per_text, pt_logits_per_text, atol=1e-3) - - hf_model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to OpenAI checkpoint") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - args = parser.parse_args() - - convert_clip_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path) diff --git a/src/transformers/models/clipseg/configuration_clipseg.py b/src/transformers/models/clipseg/configuration_clipseg.py index 60b14eb7efbb..e338d278577a 100644 --- a/src/transformers/models/clipseg/configuration_clipseg.py +++ b/src/transformers/models/clipseg/configuration_clipseg.py @@ -307,7 +307,7 @@ def __init__( # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different. for key, value in _text_config_dict.items(): - if key in text_config and value != text_config[key] and key not in ["transformers_version"]: + if key in text_config and value != text_config[key] and key != "transformers_version": # If specified in `text_config_dict` if key in text_config_dict: message = ( @@ -339,7 +339,7 @@ def __init__( # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different. for key, value in _vision_config_dict.items(): - if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]: + if key in vision_config and value != vision_config[key] and key != "transformers_version": # If specified in `vision_config_dict` if key in vision_config_dict: message = ( diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py deleted file mode 100644 index 7ea82bce515c..000000000000 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ /dev/null @@ -1,264 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Convert CLIPSeg checkpoints from the original repository. URL: https://github.com/timojl/clipseg.""" - -import argparse - -import requests -import torch -from PIL import Image - -from transformers import ( - CLIPSegConfig, - CLIPSegForImageSegmentation, - CLIPSegProcessor, - CLIPSegTextConfig, - CLIPSegVisionConfig, - CLIPTokenizer, - ViTImageProcessor, -) - - -def get_clipseg_config(model_name): - text_config = CLIPSegTextConfig() - vision_config = CLIPSegVisionConfig(patch_size=16) - - use_complex_transposed_convolution = "refined" in model_name - reduce_dim = 16 if "rd16" in model_name else 64 - - config = CLIPSegConfig.from_text_vision_configs( - text_config, - vision_config, - use_complex_transposed_convolution=use_complex_transposed_convolution, - reduce_dim=reduce_dim, - ) - return config - - -def rename_key(name): - # update prefixes - if "clip_model" in name: - name = name.replace("clip_model", "clip") - if "transformer" in name: - if "visual" in name: - name = name.replace("visual.transformer", "vision_model") - else: - name = name.replace("transformer", "text_model") - if "resblocks" in name: - name = name.replace("resblocks", "encoder.layers") - if "ln_1" in name: - name = name.replace("ln_1", "layer_norm1") - if "ln_2" in name: - name = name.replace("ln_2", "layer_norm2") - if "c_fc" in name: - name = name.replace("c_fc", "fc1") - if "c_proj" in name: - name = name.replace("c_proj", "fc2") - if "attn" in name and "self" not in name: - name = name.replace("attn", "self_attn") - # text encoder - if "token_embedding" in name: - name = name.replace("token_embedding", "text_model.embeddings.token_embedding") - if "positional_embedding" in name and "visual" not in name: - name = name.replace("positional_embedding", "text_model.embeddings.position_embedding.weight") - if "ln_final" in name: - name = name.replace("ln_final", "text_model.final_layer_norm") - # vision encoder - if "visual.class_embedding" in name: - name = name.replace("visual.class_embedding", "vision_model.embeddings.class_embedding") - if "visual.conv1" in name: - name = name.replace("visual.conv1", "vision_model.embeddings.patch_embedding") - if "visual.positional_embedding" in name: - name = name.replace("visual.positional_embedding", "vision_model.embeddings.position_embedding.weight") - if "visual.ln_pre" in name: - name = name.replace("visual.ln_pre", "vision_model.pre_layrnorm") - if "visual.ln_post" in name: - name = name.replace("visual.ln_post", "vision_model.post_layernorm") - # projection layers - if "visual.proj" in name: - name = name.replace("visual.proj", "visual_projection.weight") - if "text_projection" in name: - name = name.replace("text_projection", "text_projection.weight") - # decoder - if "trans_conv" in name: - name = name.replace("trans_conv", "transposed_convolution") - if "film_mul" in name or "film_add" in name or "reduce" in name or "transposed_convolution" in name: - name = "decoder." 
+ name - if "blocks" in name: - name = name.replace("blocks", "decoder.layers") - if "linear1" in name: - name = name.replace("linear1", "mlp.fc1") - if "linear2" in name: - name = name.replace("linear2", "mlp.fc2") - if "norm1" in name and "layer_" not in name: - name = name.replace("norm1", "layer_norm1") - if "norm2" in name and "layer_" not in name: - name = name.replace("norm2", "layer_norm2") - - return name - - -def convert_state_dict(orig_state_dict, config): - for key in orig_state_dict.copy(): - val = orig_state_dict.pop(key) - - if key.startswith("clip_model") and "attn.in_proj" in key: - key_split = key.split(".") - if "visual" in key: - layer_num = int(key_split[4]) - dim = config.vision_config.hidden_size - prefix = "vision_model" - else: - layer_num = int(key_split[3]) - dim = config.text_config.hidden_size - prefix = "text_model" - - if "weight" in key: - orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.q_proj.weight"] = val[:dim, :] - orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.k_proj.weight"] = val[ - dim : dim * 2, : - ] - orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.v_proj.weight"] = val[-dim:, :] - else: - orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.q_proj.bias"] = val[:dim] - orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.k_proj.bias"] = val[dim : dim * 2] - orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] - elif "self_attn" in key and "out_proj" not in key: - key_split = key.split(".") - layer_num = int(key_split[1]) - dim = config.reduce_dim - if "weight" in key: - orig_state_dict[f"decoder.layers.{layer_num}.self_attn.q_proj.weight"] = val[:dim, :] - orig_state_dict[f"decoder.layers.{layer_num}.self_attn.k_proj.weight"] = val[dim : dim * 2, :] - orig_state_dict[f"decoder.layers.{layer_num}.self_attn.v_proj.weight"] = val[-dim:, :] - else: - orig_state_dict[f"decoder.layers.{layer_num}.self_attn.q_proj.bias"] = val[:dim] - orig_state_dict[f"decoder.layers.{layer_num}.self_attn.k_proj.bias"] = val[dim : dim * 2] - orig_state_dict[f"decoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] - else: - new_name = rename_key(key) - if "visual_projection" in new_name or "text_projection" in new_name: - val = val.T - orig_state_dict[new_name] = val - - return orig_state_dict - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - return image - - -def convert_clipseg_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_path, push_to_hub): - config = get_clipseg_config(model_name) - model = CLIPSegForImageSegmentation(config) - model.eval() - - state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True) - - # remove some keys - for key in state_dict.copy(): - if key.startswith("model"): - state_dict.pop(key, None) - - # rename some keys - state_dict = convert_state_dict(state_dict, config) - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - - if missing_keys != ["clip.text_model.embeddings.position_ids", "clip.vision_model.embeddings.position_ids"]: - raise ValueError(f"Missing keys that are not expected: {missing_keys}") - if unexpected_keys != ["decoder.reduce.weight", "decoder.reduce.bias"]: - raise ValueError(f"Unexpected keys: {unexpected_keys}") - - image_processor = ViTImageProcessor(size=352) - 
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32") - processor = CLIPSegProcessor(image_processor=image_processor, tokenizer=tokenizer) - - image = prepare_img() - text = ["a glass", "something to fill", "wood", "a jar"] - - inputs = processor(text=text, images=[image] * len(text), padding="max_length", return_tensors="pt") - - with torch.no_grad(): - outputs = model(**inputs) - - # verify values - expected_conditional = torch.tensor([0.1110, -0.1882, 0.1645]) - expected_pooled_output = torch.tensor([0.2692, -0.7197, -0.1328]) - if model_name == "clipseg-rd64-refined": - expected_masks_slice = torch.tensor( - [[-10.0407, -9.9431, -10.2646], [-9.9751, -9.7064, -9.9586], [-9.6891, -9.5645, -9.9618]] - ) - elif model_name == "clipseg-rd64": - expected_masks_slice = torch.tensor( - [[-7.2877, -7.2711, -7.2463], [-7.2652, -7.2780, -7.2520], [-7.2239, -7.2204, -7.2001]] - ) - elif model_name == "clipseg-rd16": - expected_masks_slice = torch.tensor( - [[-6.3955, -6.4055, -6.4151], [-6.3911, -6.4033, -6.4100], [-6.3474, -6.3702, -6.3762]] - ) - else: - raise ValueError(f"Model name {model_name} not supported.") - - assert torch.allclose(outputs.logits[0, :3, :3], expected_masks_slice, atol=1e-3) - assert torch.allclose(outputs.conditional_embeddings[0, :3], expected_conditional, atol=1e-3) - assert torch.allclose(outputs.pooled_output[0, :3], expected_pooled_output, atol=1e-3) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print(f"Pushing model and processor for {model_name} to the hub") - model.push_to_hub(f"CIDAS/{model_name}") - processor.push_to_hub(f"CIDAS/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="clipseg-rd64", - type=str, - choices=["clipseg-rd16", "clipseg-rd64", "clipseg-rd64-refined"], - help=( - "Name of the model. Supported models are: clipseg-rd64, clipseg-rd16 and clipseg-rd64-refined (rd meaning" - " reduce dimension)" - ), - ) - parser.add_argument( - "--checkpoint_path", - default="/Users/nielsrogge/Documents/CLIPSeg/clip_plus_rd64-uni.pth", - type=str, - help=( - "Path to the original checkpoint. Note that the script assumes that the checkpoint includes both CLIP and" - " the decoder weights." - ), - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." - ) - - args = parser.parse_args() - convert_clipseg_checkpoint(args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/clvp/convert_clvp_to_hf.py b/src/transformers/models/clvp/convert_clvp_to_hf.py deleted file mode 100644 index 89babb3c4caf..000000000000 --- a/src/transformers/models/clvp/convert_clvp_to_hf.py +++ /dev/null @@ -1,234 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Weights conversion script for CLVP -""" - -import argparse -import os - -import torch -from huggingface_hub import hf_hub_download - -from transformers import ClvpConfig, ClvpModelForConditionalGeneration - - -_MODELS = { - "clvp": "https://huggingface.co/jbetker/tortoise-tts-v2/blob/main/.models/clvp2.pth", - "decoder": "https://huggingface.co/jbetker/tortoise-tts-v2/blob/main/.models/autoregressive.pth", -} - -dim = 1024 -sub_dim = dim // 16 - -CLVP_ENCODERS_MAPPING = { - "text_transformer.transformer.attn_layers": "text_encoder_model", - "speech_transformer.transformer.attn_layers": "speech_encoder_model", - "text_transformer.transformer.norm": "text_encoder_model.final_layer_norm", - "speech_transformer.transformer.norm": "speech_encoder_model.final_layer_norm", - "to_text_latent": "text_encoder_model.projection", - "to_speech_latent": "speech_encoder_model.projection", - "text_emb": "text_encoder_model.token_embedding", - "speech_emb": "speech_encoder_model.token_embedding", - "1.wrap.net.0": "mlp.fc1", - "1.wrap.net.3": "mlp.fc2", - "1.wrap": "self_attn", - "to_out": "out_proj", - "to_q": "q_proj", - "to_k": "k_proj", - "to_v": "v_proj", - "temperature": "logit_scale", -} - -CLVP_DECODER_MAPPING = { - "conditioning_encoder.init": "conditioning_encoder.mel_conv", - "conditioning_encoder.attn": "conditioning_encoder.mel_attn_blocks", - "mel_attn_blocks": "group_norms", - ".norm.weight": ".weight", - ".norm.bias": ".bias", - "text_embedding": "conditioning_encoder.text_token_embedding", - "text_pos_embedding.emb": "conditioning_encoder.text_position_embedding", - "final_norm": "speech_decoder_model.final_norm", - "mel_head": "speech_decoder_model.lm_head", - "gpt.ln_f": "speech_decoder_model.model.decoder.layer_norm", - "mel_embedding": "speech_decoder_model.model.decoder.input_embeds_layer", - "mel_pos_embedding.emb": "speech_decoder_model.model.decoder.position_embeds_layer", - "gpt.h": "speech_decoder_model.model.decoder.layers", - "ln_1": "input_layernorm", - "ln_2": "post_attention_layernorm", -} - - -def update_index(present_index): - if present_index % 2 == 0: - return int(present_index / 2) - else: - return int((present_index - 1) / 2) - - -def convert_encoder_weights(original_weights): - converted_weights = {} - original_weights_keys = sorted(original_weights.keys()) - for original_key in original_weights_keys: - updated_key = original_key - # for input_rmsnorm.weight and post_attention_rmsnorm.weight - if "0.0.g" in updated_key: - present_index = updated_key.split(".")[4] - if int(present_index) % 2 == 0: - updated_key = updated_key.replace("0.0.g", "input_rmsnorm.weight") - else: - updated_key = updated_key.replace("0.0.g", "post_attention_rmsnorm.weight") - - if "transformer.attn_layers.layers" in updated_key: - present_index = updated_key.split(".")[4] - updated_index = update_index(int(present_index)) - updated_key = updated_key.replace( - f"transformer.attn_layers.layers.{present_index}", f"transformer.attn_layers.layers.{updated_index}" - ) - - for k, v in CLVP_ENCODERS_MAPPING.items(): - if k in updated_key: - updated_key = updated_key.replace(k, v) - - 
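A note on the `update_index` helper deleted above: both branches compute integer floor division by two, so the original-to-HF layer-index mapping could be written in one line. Purely illustrative:

```python
def update_index(present_index: int) -> int:
    # 0, 1 -> 0; 2, 3 -> 1; 4, 5 -> 2; ...
    return present_index // 2

assert [update_index(i) for i in range(6)] == [0, 0, 1, 1, 2, 2]
```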
converted_weights[updated_key] = original_weights.pop(original_key) - - return converted_weights - - -def convert_decoder_weights(original_weights): - converted_weights = {} - original_weights_keys = sorted(original_weights.keys()) - for original_key in original_weights_keys: - updated_key = original_key - if len(updated_key.split(".")) > 3: - index, attr = updated_key.split(".")[2], updated_key.split(".")[-1] - - # for decoder attention - if "attn.c_attn" in updated_key: - if attr == "weight": - slice1, slice2, slice3 = original_weights[updated_key].squeeze(-1).T.split(split_size=dim, dim=0) - else: - slice1, slice2, slice3 = original_weights[updated_key].split(split_size=dim, dim=0) - converted_weights[f"speech_decoder_model.model.decoder.layers.{index}.attn.q_proj.{attr}"] = slice1 - converted_weights[f"speech_decoder_model.model.decoder.layers.{index}.attn.k_proj.{attr}"] = slice2 - converted_weights[f"speech_decoder_model.model.decoder.layers.{index}.attn.v_proj.{attr}"] = slice3 - continue - - if "attn.c_proj" in updated_key: - converted_weights[f"speech_decoder_model.model.decoder.layers.{index}.attn.out_proj.{attr}"] = ( - original_weights[updated_key].squeeze(-1).T - ) - continue - - if "attn.bias" in updated_key or "attn.masked_bias" in updated_key or "text_head" in updated_key: - original_weights.pop(updated_key) - continue - - # conditional encoder attention - if "qkv" in updated_key: - if attr == "weight": - slice1, slice2, slice3 = original_weights[updated_key].squeeze(-1).split(split_size=dim, dim=0) - else: - slice1, slice2, slice3 = original_weights[updated_key].split(split_size=dim, dim=0) - - indices = torch.arange(dim) - index1, index2, index3 = ( - indices.unfold(0, sub_dim, sub_dim * 3).flatten(), - indices[sub_dim:].unfold(0, sub_dim, sub_dim * 3).flatten(), - indices[2 * sub_dim :].unfold(0, sub_dim, sub_dim * 3).flatten(), - ) - - converted_weights[f"conditioning_encoder.mel_attn_blocks.{index}.q_proj.{attr}"] = torch.concatenate( - [slice1[index1], slice2[index3], slice3[index2]], - axis=0, - ) - converted_weights[f"conditioning_encoder.mel_attn_blocks.{index}.k_proj.{attr}"] = torch.concatenate( - [slice1[index2], slice2[index1], slice3[index3]], - axis=0, - ) - converted_weights[f"conditioning_encoder.mel_attn_blocks.{index}.v_proj.{attr}"] = torch.concatenate( - [slice1[index3], slice2[index2], slice3[index1]], - axis=0, - ) - continue - - if "proj_out" in updated_key: - converted_weights[f"conditioning_encoder.mel_attn_blocks.{index}.out_proj.{attr}"] = original_weights[ - updated_key - ].squeeze(-1) - continue - - for k, v in CLVP_DECODER_MAPPING.items(): - if k in updated_key: - updated_key = updated_key.replace(k, v) - - converted_weights[updated_key] = original_weights.pop(original_key) - - return converted_weights - - -def _download(url: str, root: str): - repo_id = f"{url.split('/')[3]}/{url.split('/')[4]}" - filename = f"{url.split('/')[-2]}/{url.split('/')[-1]}" - hf_hub_download( - repo_id=repo_id, - filename=filename, - force_filename=root, - local_dir_use_symlinks=False, - ) - - -def convert_clvp_weights(checkpoint_path, pytorch_dump_folder_path): - converted_checkpoint = {} - - for each_model_name, each_model_url in _MODELS.items(): - each_model_path = os.path.join(checkpoint_path, each_model_url.split("/")[-1]) - if not os.path.exists(each_model_path): - print(f"\n{each_model_name} was not found! 
Downloading it to {each_model_path}") - _download(url=each_model_url, root=each_model_path) - - if each_model_name == "clvp": - clvp_checkpoint = torch.load(each_model_path, map_location="cpu", weights_only=True) - else: - decoder_checkpoint = torch.load(each_model_path, map_location="cpu", weights_only=True) - - # Converting the weights - converted_checkpoint.update(**convert_encoder_weights(clvp_checkpoint)) - converted_checkpoint.update(**convert_decoder_weights(decoder_checkpoint)) - - config = ClvpConfig.from_pretrained("susnato/clvp_dev") - model = ClvpModelForConditionalGeneration(config) - - model.load_state_dict(converted_checkpoint, strict=True) - model.save_pretrained(pytorch_dump_folder_path) - print(f"Model saved at {pytorch_dump_folder_path}!") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # # Required parameters - parser.add_argument( - "--checkpoint_path", type=str, help="Path to the folder of downloaded checkpoints. (Please enter full path)" - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output PyTorch model. (Please enter full path)", - ) - args = parser.parse_args() - - convert_clvp_weights(args.checkpoint_path, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py b/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py index afe76134bc8d..322e98dbd0f5 100644 --- a/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +++ b/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py @@ -24,6 +24,7 @@ import numpy as np import torch +from torchvision.transforms.v2 import functional as F from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( @@ -34,13 +35,7 @@ ) from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict from ...processing_utils import Unpack -from ...utils import TensorType, auto_docstring, is_torchvision_v2_available - - -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F +from ...utils import TensorType, auto_docstring class Cohere2VisionFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): diff --git a/src/transformers/models/colpali/configuration_colpali.py b/src/transformers/models/colpali/configuration_colpali.py index 84be59aef09b..be7eaf47b428 100644 --- a/src/transformers/models/colpali/configuration_colpali.py +++ b/src/transformers/models/colpali/configuration_colpali.py @@ -83,9 +83,7 @@ def __init__( f"The model type `{vlm_config['model_type']}` is not supported. Please provide a valid model type." ) vlm_config = CONFIG_MAPPING[vlm_config["model_type"]](**vlm_config) - elif isinstance(vlm_config, PretrainedConfig): - vlm_config = vlm_config - else: + elif not isinstance(vlm_config, PretrainedConfig): raise TypeError( f"Invalid type for `vlm_config`. Expected `PretrainedConfig`, `dict`, or `None`, but got {type(vlm_config)}." ) diff --git a/src/transformers/models/colpali/convert_colpali_weights_to_hf.py b/src/transformers/models/colpali/convert_colpali_weights_to_hf.py deleted file mode 100644 index 55de46730074..000000000000 --- a/src/transformers/models/colpali/convert_colpali_weights_to_hf.py +++ /dev/null @@ -1,214 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Convert ColPali weights from the original repository to the HF model format. - -Original repository: https://github.com/illuin-tech/colpali. - -NOTE: This script was originally run using `torch==2.5.1` and with: - -```bash -python src/transformers/models/colpali/convert_colpali_weights_to_hf.py \ - --model_id vidore/colpali-v1.2-merged \ - --revision 89fd9736194236a1ecb7a9ec9b04f537f6f896af \ - --original_vlm_name_or_path google/paligemma-3b-mix-448 \ - --output_dir vidore/colpali-v1.2-hf-internal \ - --push_to_hub - -python src/transformers/models/colpali/convert_colpali_weights_to_hf.py \ - --model_id vidore/colpali-v1.3-merged \ - --revision 5b955e3415a7c5468ab33119d98d6d45c3a5b2c3 \ - --original_vlm_name_or_path google/paligemma-3b-mix-448 \ - --output_dir vidore/colpali-v1.3-hf \ - --push_to_hub -``` -""" - -import argparse -import glob -from pathlib import Path -from typing import Any, Optional - -import torch -from huggingface_hub import snapshot_download -from safetensors import safe_open - -from transformers import AutoConfig -from transformers.models.colpali import ColPaliForRetrieval -from transformers.models.colpali.configuration_colpali import ColPaliConfig -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -ORIGINAL_DTYPE = torch.bfloat16 - - -def rename_state_dict_keys(state_dict: dict[str, Any]) -> dict[str, Any]: - new_state_dict = {} - for key, value in state_dict.items(): - new_key = key - if key.startswith("custom_text_proj"): - new_key = key.replace("custom_text_proj", "embedding_proj_layer") - if key.startswith("model."): - new_key = key.replace("model.", "vlm.", 1) - new_state_dict[new_key] = value - return new_state_dict - - -def load_original_state_dict(model_id: str, revision: Optional[str] = None) -> dict[str, torch.Tensor]: - directory_path = snapshot_download( - repo_id=model_id, - revision=revision, - allow_patterns=["*.safetensors"], - ) - - original_state_dict = {} - for path in glob.glob(f"{directory_path}/*"): - if path.endswith(".safetensors"): - with safe_open(path, framework="pt", device="cpu") as f: - for key in f.keys(): - original_state_dict[key] = f.get_tensor(key) - - # Some weights are tied, so `lm.head`` is not saved. Let's clone to load state dict. 
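Aside: the removed loader above gathers every `*.safetensors` shard from a Hub snapshot into one state dict and, because the LM head is tied to the input embeddings and therefore not serialized, clones the embedding matrix back in before `load_state_dict`. A condensed sketch of that pattern with generic key names standing in for the ColPali-prefixed ones used by the deleted script:

```python
import glob
from typing import Optional

import torch
from huggingface_hub import snapshot_download
from safetensors import safe_open


def load_sharded_state_dict(model_id: str, revision: Optional[str] = None) -> dict[str, torch.Tensor]:
    directory = snapshot_download(repo_id=model_id, revision=revision, allow_patterns=["*.safetensors"])
    state_dict: dict[str, torch.Tensor] = {}
    for path in glob.glob(f"{directory}/*.safetensors"):
        with safe_open(path, framework="pt", device="cpu") as f:
            for key in f.keys():
                state_dict[key] = f.get_tensor(key)
    # Tied weights are not saved: recreate `lm_head.weight` from the embeddings.
    # (Key names here are illustrative; the deleted script used its own prefixes.)
    if "lm_head.weight" not in state_dict:
        state_dict["lm_head.weight"] = state_dict["model.embed_tokens.weight"].clone()
    return state_dict
```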
- if "lm_head.weight" not in original_state_dict: - original_state_dict["vlm.language_model.lm_head.weight"] = original_state_dict[ - "model.language_model.model.embed_tokens.weight" - ].clone() - - return original_state_dict - - -@torch.no_grad() -def convert_colpali_weights_to_hf( - model_id: str, - output_dir: str, - push_to_hub: bool, - revision: Optional[str] = None, - original_vlm_name_or_path: Optional[str] = None, -): - # Load the original model data - original_config = AutoConfig.from_pretrained( - model_id, - revision=revision, - ) - if original_vlm_name_or_path is not None: - original_config._name_or_path = original_vlm_name_or_path - if hasattr(original_config, "architectures"): - delattr(original_config, "architectures") - - original_state_dict = load_original_state_dict(model_id, revision=revision) - - # Format the state_dict keys - original_state_dict = rename_state_dict_keys(original_state_dict) - - # Create the new config - config = ColPaliConfig( - vlm_config=original_config, - embedding_dim=128, # hardcoded in the original model - ) - config.model_type = "colpali" - config.is_composition = False - - # Load the untrained model - model = ColPaliForRetrieval(config=config).to("cpu").eval() - print("Created model with new config and randomly initialized weights") - - # NOTE: The model was initialized with float32 weights. We need to convert it to the desired precision. - # There are two ways to set the model's dtype: - # - Using `model.from_pretrained(..., dtype=dtype_precision)` doesn't convert the hyperparameters to the desired precision. - # - Using `model.to(dtype_precision)` converts all values - including the hyperparameters - to the desired precision. - # The following snippet allows a fine-grained control over the model's dtype, making sure that all - # the new weights' dtypes match the original model. - for param in model.parameters(): - param.data = param.data.to(ORIGINAL_DTYPE) - print(f"Converted the new model weights to `{ORIGINAL_DTYPE}`") - - # Load the original weights - model.load_state_dict(original_state_dict) - print("Loaded original model weights") - - # Tie the weights (following ColPali's `__init__`` step) - if model.vlm.language_model._tied_weights_keys is not None: - model._tied_weights_keys = [f"vlm.language_model.{k}" for k in model.vlm.language_model._tied_weights_keys] - - # Sanity check: ensure all keys are the same - state_dict_keys_old = set(original_state_dict.keys()) - state_dict_keys_new = set(model.state_dict().keys()) - disjoint_keys = state_dict_keys_old.symmetric_difference(state_dict_keys_new) - if disjoint_keys: - raise ValueError(f"Incompatible keys: {disjoint_keys}") - - # Save the model - if push_to_hub: - model.push_to_hub(output_dir, private=True) - print(f"Model pushed to the hub at `{output_dir}`") - else: - Path(output_dir).mkdir(exist_ok=True, parents=True) - model.save_pretrained(output_dir) - print(f"Model saved to `{output_dir}`") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description=""" - This script converts the original ColPali model to the HF model format. 
- - Example usage: - ```bash - python src/transformers/models/colpali/convert_colpali_weights_to_hf.py \ - --model_id vidore/colpali-v1.2-merged \ - --revision 89fd9736194236a1ecb7a9ec9b04f537f6f896af \ - --original_vlm_name_or_path google/paligemma-3b-mix-448 \ - --output_dir vidore/colpali-v1.2-hf \ - --push_to_hub - ``` - """ - ) - parser.add_argument( - "--model_id", - help="Model ID of the original model to convert", - ) - parser.add_argument( - "--output_dir", - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--push_to_hub", - help="Whether or not to push the model to the hub at `output_dir` instead of saving it locally", - action="store_true", - default=False, - ) - parser.add_argument( - "--revision", - help="Revision of the model to download", - default=None, - ) - parser.add_argument( - "--original_vlm_name_or_path", - help="Name or path of the original VLM backbone model", - default=None, - ) - args = parser.parse_args() - - convert_colpali_weights_to_hf( - model_id=args.model_id, - output_dir=args.output_dir, - push_to_hub=args.push_to_hub, - revision=args.revision, - original_vlm_name_or_path=args.original_vlm_name_or_path, - ) diff --git a/src/transformers/models/colqwen2/configuration_colqwen2.py b/src/transformers/models/colqwen2/configuration_colqwen2.py index d9a42df4c97e..21f6e46f1f00 100644 --- a/src/transformers/models/colqwen2/configuration_colqwen2.py +++ b/src/transformers/models/colqwen2/configuration_colqwen2.py @@ -75,9 +75,7 @@ def __init__( "The `model_type` key is missing in the `vlm_config` dictionary. Please provide the model type." ) vlm_config = CONFIG_MAPPING[vlm_config["model_type"]](**vlm_config) - elif isinstance(vlm_config, PretrainedConfig): - vlm_config = vlm_config - else: + elif not isinstance(vlm_config, PretrainedConfig): raise TypeError( f"Invalid type for `vlm_config`. Expected `PretrainedConfig`, `dict`, or `None`, but got {type(vlm_config)}." ) diff --git a/src/transformers/models/colqwen2/convert_colqwen2_weights_to_hf.py b/src/transformers/models/colqwen2/convert_colqwen2_weights_to_hf.py deleted file mode 100644 index ca990a6d42d4..000000000000 --- a/src/transformers/models/colqwen2/convert_colqwen2_weights_to_hf.py +++ /dev/null @@ -1,212 +0,0 @@ -# coding=utf-8 -# Copyright 2025 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Convert ColQwen2 weights from the original repository to the HF model format. - -Don't forget to manually upload the processor-related files to the HF model repository -after running this script. - -Original repository: https://github.com/illuin-tech/colqwen2. 
- -NOTE: This script was originally run using `torch==2.5.1` and with: - -```bash -python src/transformers/models/colqwen2/convert_colqwen2_weights_to_hf.py \ - --model_id vidore/colqwen2-v1.0-merged \ - --revision eeccbae1d44bdcb0c83b1788127a2b2cad7d718e \ - --original_vlm_name_or_path Qwen/Qwen2-VL-2B-Instruct \ - --output_dir vidore/colqwen2-v1.0-hf-internal \ - --push_to_hub -``` -""" - -import argparse -import glob -from pathlib import Path -from typing import Any, Optional - -import torch -from huggingface_hub import snapshot_download -from safetensors import safe_open - -from transformers import AutoConfig -from transformers.models.colqwen2 import ColQwen2ForRetrieval -from transformers.models.colqwen2.configuration_colqwen2 import ColQwen2Config -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -ORIGINAL_DTYPE = torch.bfloat16 - - -def load_original_state_dict(model_id: str, revision: Optional[str] = None) -> dict[str, torch.Tensor]: - directory_path = snapshot_download( - repo_id=model_id, - revision=revision, - allow_patterns=["*.safetensors"], - ) - - original_state_dict = {} - for path in glob.glob(f"{directory_path}/*"): - if path.endswith(".safetensors"): - with safe_open(path, framework="pt", device="cpu") as f: - for key in f.keys(): - original_state_dict[key] = f.get_tensor(key) - - # Some weights are tied, so `lm.head`` is not saved. Let's clone to load state dict. - if "lm_head.weight" not in original_state_dict: - original_state_dict["lm_head.weight"] = original_state_dict["model.embed_tokens.weight"].clone() - - return original_state_dict - - -def rename_state_dict_keys(state_dict: dict[str, Any]) -> dict[str, Any]: - new_state_dict: dict[str, Any] = {} - for key, value in state_dict.items(): - if key.startswith("custom_text_proj"): - new_key = key.replace("custom_text_proj", "embedding_proj_layer") - else: - # The original ColQwen2 inherits from Qwen2VL, so we simply need to add the `vlm.` prefix - # to all remaining keys. - if key.startswith("model."): - key = key.replace("model.", "model.language_model.") - if key.startswith("visual."): - key = key.replace("visual.", "model.visual.") - new_key = "vlm." + key - new_state_dict[new_key] = value - return new_state_dict - - -@torch.no_grad() -def convert_colqwen2_weights_to_hf( - model_id: str, - output_dir: str, - push_to_hub: bool, - revision: Optional[str] = None, - original_vlm_name_or_path: Optional[str] = None, -): - # Load the original model data - original_config = AutoConfig.from_pretrained( - model_id, - revision=revision, - ) - if original_vlm_name_or_path is not None: - original_config._name_or_path = original_vlm_name_or_path - if hasattr(original_config, "architectures"): - delattr(original_config, "architectures") - - original_state_dict = load_original_state_dict(model_id, revision=revision) - - # Format the state_dict keys - original_state_dict = rename_state_dict_keys(original_state_dict) - - # Create the new config - config = ColQwen2Config( - vlm_config=original_config, - embedding_dim=128, # hardcoded in the original model - ) - config.model_type = "colqwen2" - config.is_composition = False - - # Load the untrained model - model = ColQwen2ForRetrieval(config=config).to("cpu").eval() - print("Created model with new config and randomly initialized weights") - - # NOTE: The new model was initialized with float32 weights. We need to convert it to the desired precision. 
- # There are two ways to set the model's dtype: - # - Using `model.from_pretrained(..., dtype=dtype_precision)` doesn't convert the hyperparameters to the desired precision. - # - Using `model.to(dtype_precision)` converts all values - including the hyperparameters - to the desired precision. - # The following snippet allows a fine-grained control over the model's dtype, making sure that all - # the new weights' dtypes match the original model. - for param in model.parameters(): - param.data = param.data.to(ORIGINAL_DTYPE) - print(f"Converted the new model weights to `{ORIGINAL_DTYPE}`") - - # Load the original weights - model.load_state_dict(original_state_dict) - print("Loaded original model weights") - - # # Sanity check: ensure all keys are the same - state_dict_keys_old = set(original_state_dict.keys()) - state_dict_keys_new = set(model.state_dict().keys()) - disjoint_keys = state_dict_keys_old.symmetric_difference(state_dict_keys_new) - if disjoint_keys: - raise ValueError(f"Incompatible keys: {disjoint_keys}") - - # Save the model - if push_to_hub: - model.push_to_hub(output_dir, private=True) - print(f"Model pushed to the hub at `{output_dir}`") - else: - Path(output_dir).mkdir(exist_ok=True, parents=True) - model.save_pretrained(output_dir) - print(f"Model saved to `{output_dir}`") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description=""" - This script converts the original ColQwen2 model to the HF model format. - - Don't forget to manually upload the processor-related files to the HF model repository - after running this script. - - Example usage: - ```bash - python src/transformers/models/colqwen2/convert_colqwen2_weights_to_hf.py \ - --model_id vidore/colqwen2-v1.0-merged \ - --revision eeccbae1d44bdcb0c83b1788127a2b2cad7d718e \ - --original_vlm_name_or_path Qwen/Qwen2-VL-2B-Instruct \ - --output_dir vidore/colqwen2-v1.0-hf-internal \ - --push_to_hub - ``` - """ - ) - parser.add_argument( - "--model_id", - help="Model ID of the original model to convert", - ) - parser.add_argument( - "--output_dir", - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--push_to_hub", - help="Whether or not to push the model to the hub at `output_dir` instead of saving it locally", - action="store_true", - default=False, - ) - parser.add_argument( - "--revision", - help="Revision of the model to download", - default=None, - ) - parser.add_argument( - "--original_vlm_name_or_path", - help="Name or path of the original VLM backbone model", - default=None, - ) - args = parser.parse_args() - - convert_colqwen2_weights_to_hf( - model_id=args.model_id, - output_dir=args.output_dir, - push_to_hub=args.push_to_hub, - revision=args.revision, - original_vlm_name_or_path=args.original_vlm_name_or_path, - ) diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 22658419eb74..000000000000 --- a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,324 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Conditional DETR checkpoints.""" - -import argparse -import json -from collections import OrderedDict -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import ( - ConditionalDetrConfig, - ConditionalDetrForObjectDetection, - ConditionalDetrForSegmentation, - ConditionalDetrImageProcessor, -) -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -# here we list all keys to be renamed (original name on the left, our name on the right) -rename_keys = [] -for i in range(6): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append( - (f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", f"encoder.layers.{i}.self_attn.out_proj.weight") - ) - rename_keys.append( - (f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias") - ) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias")) - rename_keys.append( - (f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight") - ) - rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias")) - # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms - rename_keys.append( - (f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"decoder.layers.{i}.self_attn.out_proj.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias") - ) - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.cross_attn.out_proj.weight", - f"decoder.layers.{i}.encoder_attn.out_proj.weight", - ) - ) - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.cross_attn.out_proj.bias", - f"decoder.layers.{i}.encoder_attn.out_proj.bias", - ) - ) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight") - ) - 
rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")) - - # q, k, v projections in self/cross-attention in decoder for conditional DETR - rename_keys.append( - (f"transformer.decoder.layers.{i}.sa_qcontent_proj.weight", f"decoder.layers.{i}.sa_qcontent_proj.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.sa_kcontent_proj.weight", f"decoder.layers.{i}.sa_kcontent_proj.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.sa_qpos_proj.weight", f"decoder.layers.{i}.sa_qpos_proj.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.sa_kpos_proj.weight", f"decoder.layers.{i}.sa_kpos_proj.weight") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.weight", f"decoder.layers.{i}.sa_v_proj.weight")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_qcontent_proj.weight", f"decoder.layers.{i}.ca_qcontent_proj.weight") - ) - # rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_proj.weight", f"decoder.layers.{i}.ca_qpos_proj.weight")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_kcontent_proj.weight", f"decoder.layers.{i}.ca_kcontent_proj.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_kpos_proj.weight", f"decoder.layers.{i}.ca_kpos_proj.weight") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.weight", f"decoder.layers.{i}.ca_v_proj.weight")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.weight", f"decoder.layers.{i}.ca_qpos_sine_proj.weight") - ) - - rename_keys.append( - (f"transformer.decoder.layers.{i}.sa_qcontent_proj.bias", f"decoder.layers.{i}.sa_qcontent_proj.bias") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.sa_kcontent_proj.bias", f"decoder.layers.{i}.sa_kcontent_proj.bias") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_qpos_proj.bias", f"decoder.layers.{i}.sa_qpos_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_kpos_proj.bias", f"decoder.layers.{i}.sa_kpos_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.bias", f"decoder.layers.{i}.sa_v_proj.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_qcontent_proj.bias", f"decoder.layers.{i}.ca_qcontent_proj.bias") - ) - # rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_proj.bias", f"decoder.layers.{i}.ca_qpos_proj.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_kcontent_proj.bias", f"decoder.layers.{i}.ca_kcontent_proj.bias") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_kpos_proj.bias", f"decoder.layers.{i}.ca_kpos_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.bias", f"decoder.layers.{i}.ca_v_proj.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.bias", f"decoder.layers.{i}.ca_qpos_sine_proj.bias") - ) - -# convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads 
-# for conditional DETR, also convert reference point head and query scale MLP -rename_keys.extend( - [ - ("input_proj.weight", "input_projection.weight"), - ("input_proj.bias", "input_projection.bias"), - ("query_embed.weight", "query_position_embeddings.weight"), - ("transformer.decoder.norm.weight", "decoder.layernorm.weight"), - ("transformer.decoder.norm.bias", "decoder.layernorm.bias"), - ("class_embed.weight", "class_labels_classifier.weight"), - ("class_embed.bias", "class_labels_classifier.bias"), - ("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"), - ("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"), - ("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"), - ("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"), - ("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"), - ("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"), - ("transformer.decoder.ref_point_head.layers.0.weight", "decoder.ref_point_head.layers.0.weight"), - ("transformer.decoder.ref_point_head.layers.0.bias", "decoder.ref_point_head.layers.0.bias"), - ("transformer.decoder.ref_point_head.layers.1.weight", "decoder.ref_point_head.layers.1.weight"), - ("transformer.decoder.ref_point_head.layers.1.bias", "decoder.ref_point_head.layers.1.bias"), - ("transformer.decoder.query_scale.layers.0.weight", "decoder.query_scale.layers.0.weight"), - ("transformer.decoder.query_scale.layers.0.bias", "decoder.query_scale.layers.0.bias"), - ("transformer.decoder.query_scale.layers.1.weight", "decoder.query_scale.layers.1.weight"), - ("transformer.decoder.query_scale.layers.1.bias", "decoder.query_scale.layers.1.bias"), - ("transformer.decoder.layers.0.ca_qpos_proj.weight", "decoder.layers.0.ca_qpos_proj.weight"), - ("transformer.decoder.layers.0.ca_qpos_proj.bias", "decoder.layers.0.ca_qpos_proj.bias"), - ] -) - - -def rename_key(state_dict, old, new): - val = state_dict.pop(old) - state_dict[new] = val - - -def rename_backbone_keys(state_dict): - new_state_dict = OrderedDict() - for key, value in state_dict.items(): - if "backbone.0.body" in key: - new_key = key.replace("backbone.0.body", "backbone.conv_encoder.model") - new_state_dict[new_key] = value - else: - new_state_dict[key] = value - - return new_state_dict - - -def read_in_q_k_v(state_dict, is_panoptic=False): - prefix = "" - if is_panoptic: - prefix = "conditional_detr." 
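Aside: the rename machinery deleted above boils down to two steps, applying a list of exact `(old, new)` key pairs and then rewriting the backbone prefix. A small illustrative sketch using two pairs taken from the table above (the full table is much longer):

```python
from collections import OrderedDict

# A tiny subset of the (old, new) pairs built above, for illustration only.
RENAME_PAIRS = [
    ("input_proj.weight", "input_projection.weight"),
    ("query_embed.weight", "query_position_embeddings.weight"),
]


def apply_renames(state_dict):
    renamed = OrderedDict(state_dict)
    for old, new in RENAME_PAIRS:
        if old in renamed:
            renamed[new] = renamed.pop(old)
    # Backbone keys keep their suffix; only the prefix changes.
    return OrderedDict(
        (key.replace("backbone.0.body", "backbone.conv_encoder.model"), value)
        for key, value in renamed.items()
    )


sd = {"input_proj.weight": 0, "backbone.0.body.conv1.weight": 1, "other": 2}
print(list(apply_renames(sd)))
# ['backbone.conv_encoder.model.conv1.weight', 'other', 'input_projection.weight']
```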
- - # first: transformer encoder - for i in range(6): - # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path): - """ - Copy/paste/tweak model's weights to our CONDITIONAL_DETR structure. - """ - - # load default config - config = ConditionalDetrConfig() - # set backbone and dilation attributes - if "resnet101" in model_name: - config.backbone = "resnet101" - if "dc5" in model_name: - config.dilation = True - is_panoptic = "panoptic" in model_name - if is_panoptic: - config.num_labels = 250 - else: - config.num_labels = 91 - repo_id = "huggingface/label-files" - filename = "coco-detection-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - # load image processor - format = "coco_panoptic" if is_panoptic else "coco_detection" - image_processor = ConditionalDetrImageProcessor(format=format) - - # prepare image - img = prepare_img() - encoding = image_processor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - logger.info(f"Converting model {model_name}...") - - # load original model from torch hub - conditional_detr = torch.hub.load("DeppMeng/ConditionalDETR", model_name, pretrained=True).eval() - state_dict = conditional_detr.state_dict() - # rename keys - for src, dest in rename_keys: - if is_panoptic: - src = "conditional_detr." + src - rename_key(state_dict, src, dest) - state_dict = rename_backbone_keys(state_dict) - # query, key and value matrices need special treatment - read_in_q_k_v(state_dict, is_panoptic=is_panoptic) - # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them - prefix = "conditional_detr.model." if is_panoptic else "model." - for key in state_dict.copy(): - if is_panoptic: - if ( - key.startswith("conditional_detr") - and not key.startswith("class_labels_classifier") - and not key.startswith("bbox_predictor") - ): - val = state_dict.pop(key) - state_dict["conditional_detr.model" + key[4:]] = val - elif "class_labels_classifier" in key or "bbox_predictor" in key: - val = state_dict.pop(key) - state_dict["conditional_detr." 
+ key] = val - elif key.startswith("bbox_attention") or key.startswith("mask_head"): - continue - else: - val = state_dict.pop(key) - state_dict[prefix + key] = val - else: - if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"): - val = state_dict.pop(key) - state_dict[prefix + key] = val - # finally, create HuggingFace model and load state dict - model = ConditionalDetrForSegmentation(config) if is_panoptic else ConditionalDetrForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - model.push_to_hub(repo_id=model_name, organization="DepuMeng", commit_message="Add model") - # verify our conversion - original_outputs = conditional_detr(pixel_values) - outputs = model(pixel_values) - assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-4) - assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-4) - if is_panoptic: - assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4) - - # Save model and image processor - logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - image_processor.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name", - default="conditional_detr_resnet50", - type=str, - help="Name of the CONDITIONAL_DETR model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." - ) - args = parser.parse_args() - convert_conditional_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr_fast.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr_fast.py index 5b9fe6325517..351d4fa1470f 100644 --- a/src/transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr_fast.py @@ -10,6 +10,7 @@ import torch from torch import nn from torchvision.io import read_image +from torchvision.transforms.v2 import functional as F from ...image_processing_utils import BatchFeature, get_size_dict from ...image_processing_utils_fast import ( @@ -33,7 +34,7 @@ validate_annotations, ) from ...processing_utils import Unpack -from ...utils import TensorType, auto_docstring, is_torchvision_v2_available, logging +from ...utils import TensorType, auto_docstring, logging from ...utils.import_utils import requires from .image_processing_conditional_detr import ( compute_segments, @@ -43,12 +44,6 @@ ) -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - - logger = logging.get_logger(__name__) @@ -433,13 +428,7 @@ def resize_annotation( resample (`InterpolationMode`, defaults to `F.InterpolationMode.NEAREST_EXACT`): The resampling filter to use when resizing the masks. 
""" - interpolation = ( - interpolation - if interpolation is not None - else F.InterpolationMode.NEAREST_EXACT - if is_torchvision_v2_available() - else F.InterpolationMode.NEAREST - ) + interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST_EXACT ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)] new_annotation = {} diff --git a/src/transformers/models/convbert/convert_convbert_original_tf1_checkpoint_to_pytorch_and_tf2.py b/src/transformers/models/convbert/convert_convbert_original_tf1_checkpoint_to_pytorch_and_tf2.py deleted file mode 100644 index 3d4ff779874b..000000000000 --- a/src/transformers/models/convbert/convert_convbert_original_tf1_checkpoint_to_pytorch_and_tf2.py +++ /dev/null @@ -1,57 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert ConvBERT checkpoint.""" - -import argparse - -from transformers import ConvBertConfig, ConvBertModel, TFConvBertModel, load_tf_weights_in_convbert -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_orig_tf1_checkpoint_to_pytorch(tf_checkpoint_path, convbert_config_file, pytorch_dump_path): - conf = ConvBertConfig.from_json_file(convbert_config_file) - model = ConvBertModel(conf) - - model = load_tf_weights_in_convbert(model, conf, tf_checkpoint_path) - model.save_pretrained(pytorch_dump_path) - - tf_model = TFConvBertModel.from_pretrained(pytorch_dump_path, from_pt=True) - tf_model.save_pretrained(pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--convbert_config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained ConvBERT model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_orig_tf1_checkpoint_to_pytorch(args.tf_checkpoint_path, args.convbert_config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/convnext/convert_convnext_to_pytorch.py b/src/transformers/models/convnext/convert_convnext_to_pytorch.py deleted file mode 100644 index 426ed98b883b..000000000000 --- a/src/transformers/models/convnext/convert_convnext_to_pytorch.py +++ /dev/null @@ -1,242 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert ConvNext checkpoints from the original repository. - -URL: https://github.com/facebookresearch/ConvNeXt""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import ConvNextConfig, ConvNextForImageClassification, ConvNextImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_convnext_config(checkpoint_url): - config = ConvNextConfig() - - if "tiny" in checkpoint_url: - depths = [3, 3, 9, 3] - hidden_sizes = [96, 192, 384, 768] - if "small" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [96, 192, 384, 768] - if "base" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [128, 256, 512, 1024] - if "large" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [192, 384, 768, 1536] - if "xlarge" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [256, 512, 1024, 2048] - - if "1k" in checkpoint_url: - num_labels = 1000 - filename = "imagenet-1k-id2label.json" - expected_shape = (1, 1000) - else: - num_labels = 21841 - filename = "imagenet-22k-id2label.json" - expected_shape = (1, 21841) - - repo_id = "huggingface/label-files" - config.num_labels = num_labels - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - if "1k" not in checkpoint_url: - # this dataset contains 21843 labels but the model only has 21841 - # we delete the classes as mentioned in https://github.com/google-research/big_transfer/issues/18 - del id2label[9205] - del id2label[15027] - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - config.hidden_sizes = hidden_sizes - config.depths = depths - - return config, expected_shape - - -def rename_key(name): - if "downsample_layers.0.0" in name: - name = name.replace("downsample_layers.0.0", "embeddings.patch_embeddings") - if "downsample_layers.0.1" in name: - name = name.replace("downsample_layers.0.1", "embeddings.norm") # we rename to layernorm later on - if "downsample_layers.1.0" in name: - name = name.replace("downsample_layers.1.0", "stages.1.downsampling_layer.0") - if "downsample_layers.1.1" in name: - name = name.replace("downsample_layers.1.1", "stages.1.downsampling_layer.1") - if "downsample_layers.2.0" in name: - name = name.replace("downsample_layers.2.0", "stages.2.downsampling_layer.0") - if "downsample_layers.2.1" in name: - name = name.replace("downsample_layers.2.1", "stages.2.downsampling_layer.1") - if "downsample_layers.3.0" in name: - name = name.replace("downsample_layers.3.0", "stages.3.downsampling_layer.0") - if "downsample_layers.3.1" in name: - name = name.replace("downsample_layers.3.1", "stages.3.downsampling_layer.1") - if "stages" in name and "downsampling_layer" not in name: - # stages.0.0. for instance should be renamed to stages.0.layers.0. 
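Aside: the comment above describes renaming `stages.N.M.` to `stages.N.layers.M.`; the deleted code does this with string slicing that assumes a single-digit stage index (fine here, since ConvNeXt has four stages). A hypothetical regex-based equivalent, shown only for clarity:

```python
import re

def insert_layers(name: str) -> str:
    # "stages.0.0.dwconv.weight" -> "stages.0.layers.0.dwconv.weight"
    return re.sub(r"^(stages\.\d+)\.(\d+\.)", r"\1.layers.\2", name)

assert insert_layers("stages.0.0.dwconv.weight") == "stages.0.layers.0.dwconv.weight"
# Downsampling layers carry no bare block index, so they are left untouched.
assert insert_layers("stages.2.downsampling_layer.0.weight") == "stages.2.downsampling_layer.0.weight"
```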
- name = name[: len("stages.0")] + ".layers" + name[len("stages.0") :] - if "stages" in name: - name = name.replace("stages", "encoder.stages") - if "norm" in name: - name = name.replace("norm", "layernorm") - if "gamma" in name: - name = name.replace("gamma", "layer_scale_parameter") - if "head" in name: - name = name.replace("head", "classifier") - - return name - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_convnext_checkpoint(checkpoint_url, pytorch_dump_folder_path): - """ - Copy/paste/tweak model's weights to our ConvNext structure. - """ - - # define ConvNext configuration based on URL - config, expected_shape = get_convnext_config(checkpoint_url) - # load original state_dict from URL - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url)["model"] - # rename keys - for key in state_dict.copy(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - # add prefix to all keys expect classifier head - for key in state_dict.copy(): - val = state_dict.pop(key) - if not key.startswith("classifier"): - key = "convnext." + key - state_dict[key] = val - - # load HuggingFace model - model = ConvNextForImageClassification(config) - model.load_state_dict(state_dict) - model.eval() - - # Check outputs on an image, prepared by ConvNextImageProcessor - size = 224 if "224" in checkpoint_url else 384 - image_processor = ConvNextImageProcessor(size=size) - pixel_values = image_processor(images=prepare_img(), return_tensors="pt").pixel_values - - logits = model(pixel_values).logits - - # note: the logits below were obtained without center cropping - if checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth": - expected_logits = torch.tensor([-0.1210, -0.6605, 0.1918]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth": - expected_logits = torch.tensor([-0.4473, -0.1847, -0.6365]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth": - expected_logits = torch.tensor([0.4525, 0.7539, 0.0308]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_384.pth": - expected_logits = torch.tensor([0.3561, 0.6350, -0.0384]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth": - expected_logits = torch.tensor([0.4174, -0.0989, 0.1489]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_384.pth": - expected_logits = torch.tensor([0.2513, -0.1349, -0.1613]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth": - expected_logits = torch.tensor([1.2980, 0.3631, -0.1198]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth": - expected_logits = torch.tensor([1.2963, 0.1227, 0.1723]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth": - expected_logits = torch.tensor([1.7956, 0.8390, 0.2820]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_224.pth": - expected_logits = torch.tensor([-0.2822, -0.0502, -0.0878]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_384.pth": - expected_logits = torch.tensor([-0.5672, -0.0730, -0.4348]) - elif checkpoint_url == 
"https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_224.pth": - expected_logits = torch.tensor([0.2681, 0.2365, 0.6246]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_384.pth": - expected_logits = torch.tensor([-0.2642, 0.3931, 0.5116]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_224_ema.pth": - expected_logits = torch.tensor([-0.6677, -0.1873, -0.8379]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_384_ema.pth": - expected_logits = torch.tensor([-0.7749, -0.2967, -0.6444]) - else: - raise ValueError(f"Unknown URL: {checkpoint_url}") - - assert torch.allclose(logits[0, :3], expected_logits, atol=1e-3) - assert logits.shape == expected_shape - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) - - print("Pushing model to the hub...") - model_name = "convnext" - if "tiny" in checkpoint_url: - model_name += "-tiny" - elif "small" in checkpoint_url: - model_name += "-small" - elif "base" in checkpoint_url: - model_name += "-base" - elif "xlarge" in checkpoint_url: - model_name += "-xlarge" - elif "large" in checkpoint_url: - model_name += "-large" - if "224" in checkpoint_url: - model_name += "-224" - elif "384" in checkpoint_url: - model_name += "-384" - if "22k" in checkpoint_url and "1k" not in checkpoint_url: - model_name += "-22k" - if "22k" in checkpoint_url and "1k" in checkpoint_url: - model_name += "-22k-1k" - - model.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add model", - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--checkpoint_url", - default="https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth", - type=str, - help="URL of the original ConvNeXT checkpoint you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=True, - help="Path to the output PyTorch model directory.", - ) - - args = parser.parse_args() - convert_convnext_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/convnext/image_processing_convnext_fast.py b/src/transformers/models/convnext/image_processing_convnext_fast.py index a1002d950399..3ab00c0fd091 100644 --- a/src/transformers/models/convnext/image_processing_convnext_fast.py +++ b/src/transformers/models/convnext/image_processing_convnext_fast.py @@ -17,6 +17,7 @@ from typing import Optional, Union import torch +from torchvision.transforms.v2 import functional as F from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( @@ -37,16 +38,9 @@ from ...utils import ( TensorType, auto_docstring, - is_torchvision_v2_available, ) -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - - class ConvNextFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): """ crop_pct (`float`, *optional*): diff --git a/src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py b/src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py deleted file mode 100644 index 
d23f248816e2..000000000000 --- a/src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py +++ /dev/null @@ -1,286 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert ConvNeXTV2 checkpoints from the original repository. - -URL: https://github.com/facebookresearch/ConvNeXt""" - -import argparse -import json -import os - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import ConvNextImageProcessor, ConvNextV2Config, ConvNextV2ForImageClassification -from transformers.image_utils import PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_convnextv2_config(checkpoint_url): - config = ConvNextV2Config() - - if "atto" in checkpoint_url: - depths = [2, 2, 6, 2] - hidden_sizes = [40, 80, 160, 320] - if "femto" in checkpoint_url: - depths = [2, 2, 6, 2] - hidden_sizes = [48, 96, 192, 384] - if "pico" in checkpoint_url: - depths = [2, 2, 6, 2] - hidden_sizes = [64, 128, 256, 512] - if "nano" in checkpoint_url: - depths = [2, 2, 8, 2] - hidden_sizes = [80, 160, 320, 640] - if "tiny" in checkpoint_url: - depths = [3, 3, 9, 3] - hidden_sizes = [96, 192, 384, 768] - if "base" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [128, 256, 512, 1024] - if "large" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [192, 384, 768, 1536] - if "huge" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [352, 704, 1408, 2816] - - num_labels = 1000 - filename = "imagenet-1k-id2label.json" - expected_shape = (1, 1000) - - repo_id = "huggingface/label-files" - config.num_labels = num_labels - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - config.hidden_sizes = hidden_sizes - config.depths = depths - - return config, expected_shape - - -def rename_key(name): - if "downsample_layers.0.0" in name: - name = name.replace("downsample_layers.0.0", "embeddings.patch_embeddings") - if "downsample_layers.0.1" in name: - name = name.replace("downsample_layers.0.1", "embeddings.norm") # we rename to layernorm later on - if "downsample_layers.1.0" in name: - name = name.replace("downsample_layers.1.0", "stages.1.downsampling_layer.0") - if "downsample_layers.1.1" in name: - name = name.replace("downsample_layers.1.1", "stages.1.downsampling_layer.1") - if "downsample_layers.2.0" in name: - name = name.replace("downsample_layers.2.0", "stages.2.downsampling_layer.0") - if "downsample_layers.2.1" in name: - name = name.replace("downsample_layers.2.1", "stages.2.downsampling_layer.1") - if "downsample_layers.3.0" in name: - name = name.replace("downsample_layers.3.0", "stages.3.downsampling_layer.0") - if "downsample_layers.3.1" in name: - name = 
name.replace("downsample_layers.3.1", "stages.3.downsampling_layer.1") - if "stages" in name and "downsampling_layer" not in name: - # stages.0.0. for instance should be renamed to stages.0.layers.0. - name = name[: len("stages.0")] + ".layers" + name[len("stages.0") :] - if "gamma" in name: - name = name.replace("gamma", "weight") - if "beta" in name: - name = name.replace("beta", "bias") - if "stages" in name: - name = name.replace("stages", "encoder.stages") - if "norm" in name: - name = name.replace("norm", "layernorm") - if "head" in name: - name = name.replace("head", "classifier") - - return name - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -def convert_preprocessor(checkpoint_url): - if "224" in checkpoint_url: - size = 224 - crop_pct = 224 / 256 - elif "384" in checkpoint_url: - size = 384 - crop_pct = None - else: - size = 512 - crop_pct = None - - return ConvNextImageProcessor( - size=size, - crop_pct=crop_pct, - image_mean=[0.485, 0.456, 0.406], - image_std=[0.229, 0.224, 0.225], - resample=PILImageResampling.BICUBIC, - ) - - -@torch.no_grad() -def convert_convnextv2_checkpoint(checkpoint_url, pytorch_dump_folder_path, save_model, push_to_hub): - """ - Copy/paste/tweak model's weights to our ConvNeXTV2 structure. - """ - print("Downloading original model from checkpoint...") - # define ConvNeXTV2 configuration based on URL - config, expected_shape = get_convnextv2_config(checkpoint_url) - # load original state_dict from URL - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url)["model"] - - print("Converting model parameters...") - # rename keys - for key in state_dict.copy(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - # add prefix to all keys expect classifier head - for key in state_dict.copy(): - val = state_dict.pop(key) - if not key.startswith("classifier"): - key = "convnextv2." 
+ key - state_dict[key] = val - - # load HuggingFace model - model = ConvNextV2ForImageClassification(config) - model.load_state_dict(state_dict) - model.eval() - - # Check outputs on an image, prepared by ConvNextImageProcessor - preprocessor = convert_preprocessor(checkpoint_url) - inputs = preprocessor(images=prepare_img(), return_tensors="pt") - logits = model(**inputs).logits - - # note: the logits below were obtained without center cropping - if checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_atto_1k_224_ema.pt": - expected_logits = torch.tensor([-0.3930, 0.1747, -0.5246, 0.4177, 0.4295]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_femto_1k_224_ema.pt": - expected_logits = torch.tensor([-0.1727, -0.5341, -0.7818, -0.4745, -0.6566]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_pico_1k_224_ema.pt": - expected_logits = torch.tensor([-0.0333, 0.1563, -0.9137, 0.1054, 0.0381]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_nano_1k_224_ema.pt": - expected_logits = torch.tensor([-0.1744, -0.1555, -0.0713, 0.0950, -0.1431]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_tiny_1k_224_ema.pt": - expected_logits = torch.tensor([0.9996, 0.1966, -0.4386, -0.3472, 0.6661]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_base_1k_224_ema.pt": - expected_logits = torch.tensor([-0.2553, -0.6708, -0.1359, 0.2518, -0.2488]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_large_1k_224_ema.pt": - expected_logits = torch.tensor([-0.0673, -0.5627, -0.3753, -0.2722, 0.0178]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_huge_1k_224_ema.pt": - expected_logits = torch.tensor([-0.6377, -0.7458, -0.2150, 0.1184, -0.0597]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_nano_22k_224_ema.pt": - expected_logits = torch.tensor([1.0799, 0.2322, -0.8860, 1.0219, 0.6231]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_nano_22k_384_ema.pt": - expected_logits = torch.tensor([0.3766, 0.4917, -1.1426, 0.9942, 0.6024]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_tiny_22k_224_ema.pt": - expected_logits = torch.tensor([0.4220, -0.6919, -0.4317, -0.2881, -0.6609]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_tiny_22k_384_ema.pt": - expected_logits = torch.tensor([0.1082, -0.8286, -0.5095, 0.4681, -0.8085]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_base_22k_224_ema.pt": - expected_logits = torch.tensor([-0.2419, -0.6221, 0.2176, -0.0980, -0.7527]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_base_22k_384_ema.pt": - expected_logits = torch.tensor([0.0391, -0.4371, 0.3786, 0.1251, -0.2784]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_large_22k_224_ema.pt": - expected_logits = torch.tensor([-0.0504, 0.5636, -0.1729, -0.6507, -0.3949]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_large_22k_384_ema.pt": - expected_logits = torch.tensor([0.3560, 0.9486, 0.3149, -0.2667, -0.5138]) - elif 
checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_huge_22k_384_ema.pt": - expected_logits = torch.tensor([-0.2469, -0.4550, -0.5853, -0.0810, 0.0309]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_huge_22k_512_ema.pt": - expected_logits = torch.tensor([-0.3090, 0.0802, -0.0682, -0.1979, -0.2826]) - else: - raise ValueError(f"Unknown URL: {checkpoint_url}") - - assert torch.allclose(logits[0, :5], expected_logits, atol=1e-3) - assert logits.shape == expected_shape - print("Model outputs match the original results!") - - if save_model: - print("Saving model to local...") - # Create folder to save model - if not os.path.isdir(pytorch_dump_folder_path): - os.mkdir(pytorch_dump_folder_path) - - model.save_pretrained(pytorch_dump_folder_path) - preprocessor.save_pretrained(pytorch_dump_folder_path) - - model_name = "convnextv2" - if "atto" in checkpoint_url: - model_name += "-atto" - if "femto" in checkpoint_url: - model_name += "-femto" - if "pico" in checkpoint_url: - model_name += "-pico" - if "nano" in checkpoint_url: - model_name += "-nano" - elif "tiny" in checkpoint_url: - model_name += "-tiny" - elif "base" in checkpoint_url: - model_name += "-base" - elif "large" in checkpoint_url: - model_name += "-large" - elif "huge" in checkpoint_url: - model_name += "-huge" - if "22k" in checkpoint_url and "1k" not in checkpoint_url: - model_name += "-22k" - elif "22k" in checkpoint_url and "1k" in checkpoint_url: - model_name += "-22k-1k" - elif "1k" in checkpoint_url: - model_name += "-1k" - if "224" in checkpoint_url: - model_name += "-224" - elif "384" in checkpoint_url: - model_name += "-384" - elif "512" in checkpoint_url: - model_name += "-512" - - if push_to_hub: - print(f"Pushing {model_name} to the hub...") - model.push_to_hub(model_name) - preprocessor.push_to_hub(model_name) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--checkpoint_url", - default="https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_atto_1k_224_ema.pt", - type=str, - help="URL of the original ConvNeXTV2 checkpoint you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default="model", - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument("--save_model", action="store_true", help="Save model to local") - parser.add_argument("--push_to_hub", action="store_true", help="Push model and image preprocessor to the hub") - - args = parser.parse_args() - convert_convnextv2_checkpoint( - args.checkpoint_url, args.pytorch_dump_folder_path, args.save_model, args.push_to_hub - ) diff --git a/src/transformers/models/cpmant/modeling_cpmant.py b/src/transformers/models/cpmant/modeling_cpmant.py index 1930cc0e8793..15881a64eb37 100755 --- a/src/transformers/models/cpmant/modeling_cpmant.py +++ b/src/transformers/models/cpmant/modeling_cpmant.py @@ -351,7 +351,7 @@ def forward( output_hidden_states: Optional[bool] = None, past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = None, - cache_postion: Optional[torch.Tensor] = None, + cache_position: Optional[torch.Tensor] = None, ): """ Args: @@ -492,16 +492,16 @@ def _position_bucket(self, relative_position, num_buckets=32, max_distance=128): relative_position = torch.abs(relative_position) max_exact = num_buckets // 2 is_small = relative_position < max_exact - relative_postion_if_large = max_exact + ( + relative_position_if_large = 
max_exact + ( torch.log(relative_position.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact) ).to(torch.int32) - relative_postion_if_large = torch.min( - relative_postion_if_large, - torch.full_like(relative_postion_if_large, num_buckets - 1), + relative_position_if_large = torch.min( + relative_position_if_large, + torch.full_like(relative_position_if_large, num_buckets - 1), ) - relative_buckets += torch.where(is_small, relative_position.to(torch.int32), relative_postion_if_large) + relative_buckets += torch.where(is_small, relative_position.to(torch.int32), relative_position_if_large) return relative_buckets diff --git a/src/transformers/models/csm/convert_csm.py b/src/transformers/models/csm/convert_csm.py deleted file mode 100644 index 28fbc9fe490d..000000000000 --- a/src/transformers/models/csm/convert_csm.py +++ /dev/null @@ -1,339 +0,0 @@ -# coding=utf-8 -# Copyright 2025 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import gc -import os -import re - -import torch -from tokenizers.processors import TemplateProcessing - -from transformers import ( - AutoFeatureExtractor, - AutoTokenizer, - CsmConfig, - CsmDepthDecoderConfig, - CsmForConditionalGeneration, - CsmProcessor, - MimiModel, -) -from transformers.utils.hub import cached_file - - -# fmt: off -ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - r"backbone\.layers\.(\d+)": r"backbone_model.layers.\1", - r"decoder\.layers\.(\d+)": r"depth_decoder.model.layers.\1", - - r"attn": r"self_attn", - r"output_proj": r"o_proj", - r"w1": r"gate_proj", - r"w2": r"down_proj", - r"w3": r"up_proj", - - r"text_embeddings": r"embed_text_tokens", - r"audio_embeddings": r"backbone_model.embed_tokens.embed_audio_tokens", - - r"codebook0_head": r"lm_head", - r"audio_head": r"depth_decoder.codebooks_head.weight", - r"projection": r"depth_decoder.model.inputs_embeds_projector", - - r"sa_norm.scale": r"input_layernorm.weight", - r"mlp_norm.scale": r"post_attention_layernorm.weight", - r"decoder.norm.scale": r"depth_decoder.model.norm.weight", - r"backbone.norm.scale": r"backbone_model.norm.weight", -} -# fmt: on - - -def permute_for_rope(input_tensor, n_heads, dim1, dim2): - """ - When you go from the complex ROPE formulation to sin and cos one, you need - to permute the query and key weights (to avoid doing it on the fly) - """ - input_tensor = input_tensor.reshape(dim1, dim2) - input_tensor = input_tensor.view(n_heads, dim1 // n_heads // 2, 2, dim2) - input_tensor = input_tensor.transpose(1, 2).reshape(dim1, dim2) - return input_tensor - - -def convert_key(key, mapping): - for pattern, replacement in mapping.items(): - key = re.sub(pattern, replacement, key) - return key - - -def write_model( - input_path_or_repo, - model_name, - codec_model_path_or_repo, - output_dir, - safe_serialization=True, -): - print("Converting the model.") - os.makedirs(output_dir, exist_ok=True) - - codec_model = MimiModel.from_pretrained(codec_model_path_or_repo) - 
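
The key-renaming pattern used throughout these conversion scripts is simply repeated `re.sub` over an ordered mapping, as in `convert_key` above; a minimal self-contained sketch of that pattern, using a hypothetical two-entry mapping rather than the full table, is:

import re

# Hypothetical two-entry mapping in the spirit of ORIGINAL_TO_CONVERTED_KEY_MAPPING above.
EXAMPLE_MAPPING = {
    r"backbone\.layers\.(\d+)": r"backbone_model.layers.\1",
    r"attn": r"self_attn",
}

def convert_key(key: str, mapping: dict[str, str]) -> str:
    # Apply every pattern in order; later patterns operate on the already-rewritten key.
    for pattern, replacement in mapping.items():
        key = re.sub(pattern, replacement, key)
    return key

print(convert_key("backbone.layers.3.attn.out_proj.weight", EXAMPLE_MAPPING))
# backbone_model.layers.3.self_attn.out_proj.weight
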
codec_model.config._attn_implementation_autoset = False - - # prepare rope scaling args: the model uses originally - # 1 - for the depth decoder - # rope_theta=500000, - # rope_scaling={ - # "factor": 32.0, - # "high_freq_factor": 4.0, - # "low_freq_factor": 1.0, - # "original_max_position_embeddings": 8192, - # "rope_type": "llama3", - # }, - # 2 - for the backbone - # rope_theta=500000, - # rope_scaling={ - # "factor": 32.0, - # "high_freq_factor": 4.0, - # "low_freq_factor": 1.0, - # "original_max_position_embeddings": 8192, - # "rope_type": "llama3", - # }, - # - # Yet we want to use max_position_embeddings=32, resp. 2048 - # This will throw warning as we would have original_max_position_embeddings >= max_position_embeddings - # Therefore, we convert values to equivalent ones - - depth_decoder_config = CsmDepthDecoderConfig( - rope_scaling={ - "factor": 32.0, - "high_freq_factor": 0.0078125, - "low_freq_factor": 0.001953125, - "original_max_position_embeddings": 16, - "rope_type": "llama3", - }, - ) - - config = CsmConfig( - codec_config=codec_model.config, - depth_decoder_config=depth_decoder_config, - rope_scaling={ - "factor": 32.0, - "high_freq_factor": 0.5, - "low_freq_factor": 0.125, - "original_max_position_embeddings": 1024, - "rope_type": "llama3", - }, - ) - - params = { - "backbone": { - "num_attention_heads": config.num_attention_heads, - "num_key_value_heads": config.num_key_value_heads, - "dim_per_head": config.head_dim, - "key_value_dim": config.head_dim * config.num_key_value_heads, - "dim": config.hidden_size, - }, - "depth_decoder": { - "num_attention_heads": config.depth_decoder_config.num_attention_heads, - "num_key_value_heads": config.depth_decoder_config.num_key_value_heads, - "dim_per_head": config.depth_decoder_config.head_dim, - "key_value_dim": config.depth_decoder_config.head_dim * config.depth_decoder_config.num_key_value_heads, - "dim": config.depth_decoder_config.hidden_size, - }, - } - - model_path = cached_file( - input_path_or_repo, - model_name, - ) - print(f"Fetching all parameters from the checkpoint at {model_path}...") - loaded = torch.load(model_path, map_location="cpu") - - print("Converting model...") - state_dict = {} - - # ----------------------- - # convert parameter names - # ----------------------- - - # Add codec_model. prefix to every key in the codec model state dict - codec_state_dict = {f"codec_model.{k}": v for k, v in codec_model.state_dict().items()} - state_dict.update(codec_state_dict) - - for key, value in loaded.items(): - new_key = convert_key(key, ORIGINAL_TO_CONVERTED_KEY_MAPPING) - current_parameter = value - - # Post-process the current_parameter. 
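
The "convert values to equivalent ones" comment above can be checked numerically: assuming llama3-style rope scaling is governed by the wavelength cut-offs `original_max_position_embeddings / low_freq_factor` and `original_max_position_embeddings / high_freq_factor`, the rewritten parameters preserve exactly the same cut-offs as the original (8192, low_freq_factor=1.0, high_freq_factor=4.0) settings:

# Sanity check of the rope-scaling equivalence (assumed cut-off formula, toy script only).
original = {"original_max_position_embeddings": 8192, "low_freq_factor": 1.0, "high_freq_factor": 4.0}
depth_decoder = {"original_max_position_embeddings": 16, "low_freq_factor": 0.001953125, "high_freq_factor": 0.0078125}
backbone = {"original_max_position_embeddings": 1024, "low_freq_factor": 0.125, "high_freq_factor": 0.5}

def cutoffs(params):
    n = params["original_max_position_embeddings"]
    return n / params["low_freq_factor"], n / params["high_freq_factor"]

# All three parametrizations keep the same (low, high) frequency cut-offs.
assert cutoffs(original) == cutoffs(depth_decoder) == cutoffs(backbone) == (8192.0, 2048.0)
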
- if re.search("(k|q)_proj.weight", new_key): - params_keys = "backbone" if "backbone" in new_key else "depth_decoder" - if "q_proj" in new_key: - num_heads = params[params_keys]["num_attention_heads"] - dim_per_head = params[params_keys]["dim_per_head"] - param_dim = params[params_keys]["dim"] - dim = params[params_keys]["dim"] - else: - num_heads = params[params_keys]["num_key_value_heads"] - dim_per_head = params[params_keys]["dim_per_head"] - param_dim = params[params_keys]["key_value_dim"] - dim = params[params_keys]["dim"] - - current_parameter = permute_for_rope(value, num_heads, param_dim, dim) - state_dict[new_key] = current_parameter.reshape(num_heads * dim_per_head, dim) - - state_dict[new_key] = current_parameter - - # add the depth decoder embed audio tokens weights, latter tied to the backbone embed audio tokens weights - state_dict["depth_decoder.model.embed_tokens.weight"] = state_dict[ - "backbone_model.embed_tokens.embed_audio_tokens.weight" - ].clone() - del loaded - gc.collect() - - # ------------------------- - # load the weights and save - # ------------------------- - - print("Loading the checkpoint in a Csm model.") - with torch.device("meta"): - model = CsmForConditionalGeneration(config) - model.load_state_dict(state_dict, strict=True, assign=True) - print("Checkpoint loaded successfully.") - del model.config._name_or_path - - # default generation config - model.generation_config._from_model_config = False - model.generation_config.max_new_tokens = 125 - model.generation_config.do_sample = True - model.generation_config.top_k = 50 - model.generation_config.temperature = 0.9 - model.generation_config.depth_decoder_do_sample = True - model.generation_config.depth_decoder_top_k = 50 - model.generation_config.depth_decoder_temperature = 0.9 - - print("Saving the model.") - model.save_pretrained(output_dir, safe_serialization=safe_serialization) - del state_dict, model - - # Safety check: reload the converted model - gc.collect() - print("Reloading the model to check if it's saved correctly.") - CsmForConditionalGeneration.from_pretrained(output_dir, dtype=torch.bfloat16, device_map="auto") - print("Model reloaded successfully.") - - -def write_tokenizer(output_dir): - # from https://github.com/SesameAILabs/csm/blob/2d720827843b653c4d67bb4445b1c0a4f59e646f/generator.py#L22-L36 - def load_llama3_tokenizer(): - """ - https://github.com/huggingface/transformers/issues/22794#issuecomment-2092623992 - """ - tokenizer_name = "meta-llama/Llama-3.2-1B" - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - bos = tokenizer.bos_token - eos = tokenizer.eos_token - tokenizer._tokenizer.post_processor = TemplateProcessing( - single=f"{bos}:0 $A:0 {eos}:0", - pair=f"{bos}:0 $A:0 {eos}:0 {bos}:1 $B:1 {eos}:1", - special_tokens=[(f"{bos}", tokenizer.bos_token_id), (f"{eos}", tokenizer.eos_token_id)], - ) - - return tokenizer - - tokenizer = load_llama3_tokenizer() - tokenizer.pad_token = tokenizer.eos_token - tokenizer.save_pretrained(output_dir) - - # manually modify in tokenizer_config.json - # "128002": { - # "content": "<|AUDIO|>", - # ... - # } - # "128003": { - # "content": "<|audio_eos|>", - # ... - # } - print( - "Tokenizer saved successfully. Please manually modify in tokenizer_config.json AND tokenizer.json as follows: " - ) - print(""" - # "128002": { - # "content": "<|AUDIO|>", - # ... - # } - # "128003": { - # "content": "<|audio_eos|>", - # ... 
- # } - """) - - -def write_processor(output_dir, codec_model_path_or_repo): - chat_template = "\n{%- for message in messages %}\n {#-- Validate role is a stringified integer --#}\n {%- if not message['role'] is string or not message['role'].isdigit() %}\n {{- raise_exception(\"The role must be an integer or a stringified integer (e.g. '0') designating the speaker id\") }}\n {%- endif %}\n\n {#-- Validate content is a list --#}\n {%- set content = message['content'] %}\n {%- if content is not iterable or content is string %}\n {{- raise_exception(\"The content must be a list\") }}\n {%- endif %}\n\n {#-- Collect content types --#}\n {%- set content_types = content | map(attribute='type') | list %}\n {%- set is_last = loop.last %}\n\n {#-- Last message validation --#}\n {%- if is_last %}\n {%- if 'text' not in content_types %}\n {{- raise_exception(\"The last message must include one item of type 'text'\") }}\n {%- elif (content_types | select('equalto', 'text') | list | length > 1) or (content_types | select('equalto', 'audio') | list | length > 1) %}\n {{- raise_exception(\"At most two items are allowed in the last message: one 'text' and one 'audio'\") }}\n {%- endif %}\n\n {#-- All other messages validation --#}\n {%- else %}\n {%- if content_types | select('equalto', 'text') | list | length != 1\n or content_types | select('equalto', 'audio') | list | length != 1 %}\n {{- raise_exception(\"Each message (except the last) must contain exactly one 'text' and one 'audio' item\") }}\n {%- elif content_types | reject('in', ['text', 'audio']) | list | length > 0 %}\n {{- raise_exception(\"Only 'text' and 'audio' types are allowed in content\") }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n\n{%- for message in messages %}\n {{- bos_token }}\n {{- '[' + message['role'] + ']' }}\n {{- message['content'][0]['text'] }}\n {{- eos_token }}\n {%- if message['content']|length > 1 %}\n {{- '<|AUDIO|><|audio_eos|>' }}\n {%- endif %}\n{%- endfor %}\n" - tokenizer = AutoTokenizer.from_pretrained(output_dir) - feature_extractor = AutoFeatureExtractor.from_pretrained(codec_model_path_or_repo) - - processor = CsmProcessor( - tokenizer=tokenizer, - feature_extractor=feature_extractor, - chat_template=chat_template, - ) - - processor.save_pretrained(output_dir) - print("Processor saved successfully.") - - -def main(): - parser = argparse.ArgumentParser(description="Convert Csm weights to HuggingFace format") - parser.add_argument( - "--input_path_or_repo", - type=str, - required=True, - help="Path or repo containing Csm weights", - ) - parser.add_argument( - "--model_name", - type=str, - required=True, - help="Name of the model in input_path_or_repo", - ) - parser.add_argument( - "--codec_model_path_or_repo", - type=str, - required=True, - help="Path or repo containing the codec model", - ) - parser.add_argument( - "--output_dir", - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--safe_serialization", action="store_true", default=True, help="Whether or not to save using `safetensors`." 
- ) - args = parser.parse_args() - - write_model( - args.input_path_or_repo, - args.model_name, - args.codec_model_path_or_repo, - output_dir=args.output_dir, - safe_serialization=args.safe_serialization, - ) - - write_tokenizer(args.output_dir) - - write_processor(args.output_dir, args.codec_model_path_or_repo) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/csm/generation_csm.py b/src/transformers/models/csm/generation_csm.py index 400c023e0284..cf8bc141f5d1 100644 --- a/src/transformers/models/csm/generation_csm.py +++ b/src/transformers/models/csm/generation_csm.py @@ -15,7 +15,7 @@ import os from dataclasses import dataclass -from typing import TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, Any, Optional, Union import torch import torch.nn as nn @@ -90,7 +90,7 @@ def _get_stopping_criteria( return kept_criteria def _prepare_generation_config( - self, generation_config: Optional[GenerationConfig], use_model_defaults: Optional[bool] = None, **kwargs: dict + self, generation_config: Optional[GenerationConfig], use_model_defaults: Optional[bool] = None, **kwargs: Any ) -> tuple[GenerationConfig, dict]: """ This method overrides [~generation.utils.GenerationMixin._prepare_generation_config]. diff --git a/src/transformers/models/csm/processing_csm.py b/src/transformers/models/csm/processing_csm.py index 0f929f6a2a0c..95596f4a3a9e 100644 --- a/src/transformers/models/csm/processing_csm.py +++ b/src/transformers/models/csm/processing_csm.py @@ -152,7 +152,6 @@ def _get_encoded_length(audio_length, kernel_sizes=None, strides=None, dilations padding_left = padding_total padding_right = extra_padding else: - padding_left = padding_left padding_right = padding_right + extra_padding cur_length = cur_length + padding_left + padding_right diff --git a/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index f65389d1d18a..000000000000 --- a/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,362 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert CvT checkpoints from the original repository. - -URL: https://github.com/microsoft/CvT""" - -import argparse -import json -from collections import OrderedDict -from pathlib import Path - -import torch -from huggingface_hub import hf_hub_download - -from transformers import AutoImageProcessor, CvtConfig, CvtForImageClassification - - -def embeddings(idx): - """ - The function helps in renaming embedding layer weights. 
- - Args: - idx: stage number in original model - """ - embed = [] - embed.append( - ( - f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.projection.weight", - f"stage{idx}.patch_embed.proj.weight", - ) - ) - embed.append( - ( - f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.projection.bias", - f"stage{idx}.patch_embed.proj.bias", - ) - ) - embed.append( - ( - f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.normalization.weight", - f"stage{idx}.patch_embed.norm.weight", - ) - ) - embed.append( - ( - f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.normalization.bias", - f"stage{idx}.patch_embed.norm.bias", - ) - ) - return embed - - -def attention(idx, cnt): - """ - The function helps in renaming attention block layers weights. - - Args: - idx: stage number in original model - cnt: count of blocks in each stage - """ - attention_weights = [] - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.convolution.weight", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.conv.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.weight", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.bias", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.bias", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.running_mean", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.running_mean", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.running_var", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.running_var", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.num_batches_tracked", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.num_batches_tracked", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.convolution.weight", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.conv.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.weight", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.bias", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.bias", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.running_mean", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.running_mean", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.running_var", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.running_var", - ) - ) - attention_weights.append( 
- ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.num_batches_tracked", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.num_batches_tracked", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.convolution.weight", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.conv.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.weight", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.bias", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.bias", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.running_mean", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.running_mean", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.running_var", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.running_var", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.num_batches_tracked", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.num_batches_tracked", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_query.weight", - f"stage{idx}.blocks.{cnt}.attn.proj_q.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_query.bias", - f"stage{idx}.blocks.{cnt}.attn.proj_q.bias", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_key.weight", - f"stage{idx}.blocks.{cnt}.attn.proj_k.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_key.bias", - f"stage{idx}.blocks.{cnt}.attn.proj_k.bias", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_value.weight", - f"stage{idx}.blocks.{cnt}.attn.proj_v.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_value.bias", - f"stage{idx}.blocks.{cnt}.attn.proj_v.bias", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.output.dense.weight", - f"stage{idx}.blocks.{cnt}.attn.proj.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.output.dense.bias", - f"stage{idx}.blocks.{cnt}.attn.proj.bias", - ) - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.intermediate.dense.weight", f"stage{idx}.blocks.{cnt}.mlp.fc1.weight") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.intermediate.dense.bias", f"stage{idx}.blocks.{cnt}.mlp.fc1.bias") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.output.dense.weight", f"stage{idx}.blocks.{cnt}.mlp.fc2.weight") - ) - attention_weights.append( - 
(f"cvt.encoder.stages.{idx}.layers.{cnt}.output.dense.bias", f"stage{idx}.blocks.{cnt}.mlp.fc2.bias") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_before.weight", f"stage{idx}.blocks.{cnt}.norm1.weight") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_before.bias", f"stage{idx}.blocks.{cnt}.norm1.bias") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_after.weight", f"stage{idx}.blocks.{cnt}.norm2.weight") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_after.bias", f"stage{idx}.blocks.{cnt}.norm2.bias") - ) - return attention_weights - - -def cls_token(idx): - """ - Function helps in renaming cls_token weights - """ - token = [] - token.append((f"cvt.encoder.stages.{idx}.cls_token", "stage2.cls_token")) - return token - - -def final(): - """ - Function helps in renaming final classification layer - """ - head = [] - head.append(("layernorm.weight", "norm.weight")) - head.append(("layernorm.bias", "norm.bias")) - head.append(("classifier.weight", "head.weight")) - head.append(("classifier.bias", "head.bias")) - return head - - -def convert_cvt_checkpoint(cvt_model, image_size, cvt_file_name, pytorch_dump_folder): - """ - Function to convert the microsoft cvt checkpoint to huggingface checkpoint - """ - img_labels_file = "imagenet-1k-id2label.json" - num_labels = 1000 - - repo_id = "huggingface/label-files" - num_labels = num_labels - id2label = json.loads(Path(hf_hub_download(repo_id, img_labels_file, repo_type="dataset")).read_text()) - id2label = {int(k): v for k, v in id2label.items()} - - id2label = id2label - label2id = {v: k for k, v in id2label.items()} - - config = CvtConfig(num_labels=num_labels, id2label=id2label, label2id=label2id) - - # For depth size 13 (13 = 1+2+10) - if cvt_model.rsplit("/", 1)[-1][4:6] == "13": - config.depth = [1, 2, 10] - - # For depth size 21 (21 = 1+4+16) - elif cvt_model.rsplit("/", 1)[-1][4:6] == "21": - config.depth = [1, 4, 16] - - # For wide cvt (similar to wide-resnet) depth size 24 (w24 = 2 + 2 20) - else: - config.depth = [2, 2, 20] - config.num_heads = [3, 12, 16] - config.embed_dim = [192, 768, 1024] - - model = CvtForImageClassification(config) - image_processor = AutoImageProcessor.from_pretrained("facebook/convnext-base-224-22k-1k") - image_processor.size["shortest_edge"] = image_size - original_weights = torch.load(cvt_file_name, map_location=torch.device("cpu"), weights_only=True) - - huggingface_weights = OrderedDict() - list_of_state_dict = [] - - for idx in range(len(config.depth)): - if config.cls_token[idx]: - list_of_state_dict = list_of_state_dict + cls_token(idx) - list_of_state_dict = list_of_state_dict + embeddings(idx) - for cnt in range(config.depth[idx]): - list_of_state_dict = list_of_state_dict + attention(idx, cnt) - - list_of_state_dict = list_of_state_dict + final() - for gg in list_of_state_dict: - print(gg) - for i in range(len(list_of_state_dict)): - huggingface_weights[list_of_state_dict[i][0]] = original_weights[list_of_state_dict[i][1]] - - model.load_state_dict(huggingface_weights) - model.save_pretrained(pytorch_dump_folder) - image_processor.save_pretrained(pytorch_dump_folder) - - -# Download the weights from zoo: https://1drv.ms/u/s!AhIXJn_J-blW9RzF3rMW7SsLHa8h?e=blQ0Al - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--cvt_model", - default="cvt-w24", - type=str, - help="Name of the cvt model you'd like to convert.", - 
) - parser.add_argument( - "--image_size", - default=384, - type=int, - help="Input Image Size", - ) - parser.add_argument( - "--cvt_file_name", - default=r"cvtmodels\CvT-w24-384x384-IN-22k.pth", - type=str, - help="Input Image Size", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - - args = parser.parse_args() - convert_cvt_checkpoint(args.cvt_model, args.image_size, args.cvt_file_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/d_fine/convert_d_fine_original_pytorch_checkpoint_to_hf.py b/src/transformers/models/d_fine/convert_d_fine_original_pytorch_checkpoint_to_hf.py deleted file mode 100644 index a2d23b3165bf..000000000000 --- a/src/transformers/models/d_fine/convert_d_fine_original_pytorch_checkpoint_to_hf.py +++ /dev/null @@ -1,689 +0,0 @@ -# coding=utf-8 -# Copyright 2025 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json -import re -from pathlib import Path -from typing import Optional - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from torchvision import transforms - -from transformers import DFineConfig, DFineForObjectDetection, RTDetrImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_d_fine_config(model_name: str) -> DFineConfig: - config = DFineConfig() - - config.num_labels = 80 - repo_id = "huggingface/label-files" - filename = "object365-id2label.json" if "obj365" in model_name else "coco-detection-mmdet-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - config.backbone_config.hidden_sizes = [64, 128, 256, 512] - config.backbone_config.layer_type = "basic" - config.backbone_config.embedding_size = 32 - config.hidden_expansion = 1.0 - config.decoder_layers = 6 - - if model_name in ["dfine_x_coco", "dfine_x_obj2coco", "dfine_x_obj365"]: - config.backbone_config.hidden_sizes = [256, 512, 1024, 2048] - config.backbone_config.stage_in_channels = [64, 128, 512, 1024] - config.backbone_config.stage_mid_channels = [64, 128, 256, 512] - config.backbone_config.stage_out_channels = [128, 512, 1024, 2048] - config.backbone_config.stage_num_blocks = [1, 2, 5, 2] - config.backbone_config.stage_downsample = [False, True, True, True] - config.backbone_config.stage_light_block = [False, False, True, True] - config.backbone_config.stage_kernel_size = [3, 3, 5, 5] - config.backbone_config.stage_numb_of_layers = [6, 6, 6, 6] - config.backbone_config.stem_channels = [3, 32, 64] - config.encoder_in_channels = [512, 1024, 2048] - config.encoder_hidden_dim = 384 - config.encoder_ffn_dim = 2048 - config.decoder_n_points = [3, 6, 3] - config.decoder_in_channels = [384, 384, 384] - if model_name == 
"dfine_x_obj365": - config.num_labels = 366 - elif model_name in ["dfine_m_coco", "dfine_m_obj2coco", "dfine_m_obj365"]: - config.backbone_config.hidden_sizes = [192, 384, 768, 1536] - config.backbone_config.stem_channels = [3, 24, 32] - config.backbone_config.stage_in_channels = [32, 96, 384, 768] - config.backbone_config.stage_mid_channels = [32, 64, 128, 256] - config.backbone_config.stage_out_channels = [96, 384, 768, 1536] - config.backbone_config.stage_num_blocks = [1, 1, 3, 1] - config.backbone_config.stage_downsample = [False, True, True, True] - config.backbone_config.stage_light_block = [False, False, True, True] - config.backbone_config.stage_kernel_size = [3, 3, 5, 5] - config.backbone_config.stage_numb_of_layers = [4, 4, 4, 4] - config.decoder_layers = 4 - config.decoder_n_points = [3, 6, 3] - config.encoder_in_channels = [384, 768, 1536] - config.backbone_config.use_learnable_affine_block = True - config.depth_mult = 0.67 - if model_name == "dfine_m_obj365": - config.num_labels = 366 - elif model_name in ["dfine_l_coco", "dfine_l_obj2coco_e25", "dfine_l_obj365"]: - config.backbone_config.hidden_sizes = [256, 512, 1024, 2048] - config.backbone_config.stem_channels = [3, 32, 48] - config.backbone_config.stage_in_channels = [48, 128, 512, 1024] - config.backbone_config.stage_mid_channels = [48, 96, 192, 384] - config.backbone_config.stage_out_channels = [128, 512, 1024, 2048] - config.backbone_config.stage_num_blocks = [1, 1, 3, 1] - config.backbone_config.stage_downsample = [False, True, True, True] - config.backbone_config.stage_light_block = [False, False, True, True] - config.backbone_config.stage_kernel_size = [3, 3, 5, 5] - config.backbone_config.stage_numb_of_layers = [6, 6, 6, 6] - config.encoder_ffn_dim = 1024 - config.encoder_in_channels = [512, 1024, 2048] - config.decoder_n_points = [3, 6, 3] - if model_name == "dfine_l_obj365": - config.num_labels = 366 - elif model_name in ["dfine_n_coco", "dfine_n_obj2coco_e25", "dfine_n_obj365"]: - config.backbone_config.hidden_sizes = [128, 256, 512, 1024] - config.backbone_config.stem_channels = [3, 16, 16] - config.backbone_config.stage_in_channels = [16, 64, 256, 512] - config.backbone_config.stage_mid_channels = [16, 32, 64, 128] - config.backbone_config.stage_out_channels = [64, 256, 512, 1024] - config.backbone_config.stage_num_blocks = [1, 1, 2, 1] - config.backbone_config.stage_downsample = [False, True, True, True] - config.backbone_config.stage_light_block = [False, False, True, True] - config.backbone_config.stage_kernel_size = [3, 3, 5, 5] - config.backbone_config.stage_numb_of_layers = [3, 3, 3, 3] - config.backbone_config.out_indices = [3, 4] - config.backbone_config.use_learnable_affine_block = True - config.num_feature_levels = 2 - config.encoder_ffn_dim = 512 - config.encode_proj_layers = [1] - config.d_model = 128 - config.encoder_hidden_dim = 128 - config.decoder_ffn_dim = 512 - config.encoder_in_channels = [512, 1024] - config.decoder_n_points = [6, 6] - config.decoder_in_channels = [128, 128] - config.feat_strides = [16, 32] - config.depth_mult = 0.5 - config.decoder_layers = 3 - config.hidden_expansion = 0.34 - if model_name == "dfine_n_obj365": - config.num_labels = 366 - else: - config.backbone_config.hidden_sizes = [128, 256, 512, 1024] - config.backbone_config.stem_channels = [3, 16, 16] - config.backbone_config.stage_in_channels = [16, 64, 256, 512] - config.backbone_config.stage_mid_channels = [16, 32, 64, 128] - config.backbone_config.stage_out_channels = [64, 256, 512, 1024] - 
config.backbone_config.stage_num_blocks = [1, 1, 2, 1] - config.backbone_config.stage_downsample = [False, True, True, True] - config.backbone_config.stage_light_block = [False, False, True, True] - config.backbone_config.stage_kernel_size = [3, 3, 5, 5] - config.backbone_config.stage_numb_of_layers = [3, 3, 3, 3] - config.decoder_layers = 3 - config.hidden_expansion = 0.5 - config.depth_mult = 0.34 - config.decoder_n_points = [3, 6, 3] - config.encoder_in_channels = [256, 512, 1024] - config.backbone_config.use_learnable_affine_block = True - if model_name == "dfine_s_obj365": - config.num_labels = 366 - - return config - - -def load_original_state_dict(repo_id, model_name): - directory_path = hf_hub_download(repo_id=repo_id, filename=f"{model_name}.pth") - - original_state_dict = {} - model = torch.load(directory_path, map_location="cpu")["model"] - for key in model: - original_state_dict[key] = model[key] - - return original_state_dict - - -ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - # Decoder base mappings - r"decoder.valid_mask": r"model.decoder.valid_mask", - r"decoder.anchors": r"model.decoder.anchors", - r"decoder.up": r"model.decoder.up", - r"decoder.reg_scale": r"model.decoder.reg_scale", - # Backbone stem mappings - including stem2a and stem2b - r"backbone.stem.stem1.conv.weight": r"model.backbone.model.embedder.stem1.convolution.weight", - r"backbone.stem.stem2a.conv.weight": r"model.backbone.model.embedder.stem2a.convolution.weight", - r"backbone.stem.stem2b.conv.weight": r"model.backbone.model.embedder.stem2b.convolution.weight", - r"backbone.stem.stem3.conv.weight": r"model.backbone.model.embedder.stem3.convolution.weight", - r"backbone.stem.stem4.conv.weight": r"model.backbone.model.embedder.stem4.convolution.weight", - # Stem normalization - r"backbone.stem.stem1.bn.(weight|bias|running_mean|running_var)": r"model.backbone.model.embedder.stem1.normalization.\1", - r"backbone.stem.stem2a.bn.(weight|bias|running_mean|running_var)": r"model.backbone.model.embedder.stem2a.normalization.\1", - r"backbone.stem.stem2b.bn.(weight|bias|running_mean|running_var)": r"model.backbone.model.embedder.stem2b.normalization.\1", - r"backbone.stem.stem3.bn.(weight|bias|running_mean|running_var)": r"model.backbone.model.embedder.stem3.normalization.\1", - r"backbone.stem.stem4.bn.(weight|bias|running_mean|running_var)": r"model.backbone.model.embedder.stem4.normalization.\1", - # Stem lab parameters - fixed with .lab in the path - r"backbone.stem.stem1.lab.(scale|bias)": r"model.backbone.model.embedder.stem1.lab.\1", - r"backbone.stem.stem2a.lab.(scale|bias)": r"model.backbone.model.embedder.stem2a.lab.\1", - r"backbone.stem.stem2b.lab.(scale|bias)": r"model.backbone.model.embedder.stem2b.lab.\1", - r"backbone.stem.stem3.lab.(scale|bias)": r"model.backbone.model.embedder.stem3.lab.\1", - r"backbone.stem.stem4.lab.(scale|bias)": r"model.backbone.model.embedder.stem4.lab.\1", - # Backbone stages mappings - r"backbone.stages.(\d+).blocks.(\d+).layers.(\d+).conv.weight": r"model.backbone.model.encoder.stages.\1.blocks.\2.layers.\3.convolution.weight", - r"backbone.stages.(\d+).blocks.(\d+).layers.(\d+).bn.(weight|bias|running_mean|running_var)": r"model.backbone.model.encoder.stages.\1.blocks.\2.layers.\3.normalization.\4", - r"backbone.stages.(\d+).blocks.(\d+).layers.(\d+).conv1.conv.weight": r"model.backbone.model.encoder.stages.\1.blocks.\2.layers.\3.conv1.convolution.weight", - r"backbone.stages.(\d+).blocks.(\d+).layers.(\d+).conv2.conv.weight": 
r"model.backbone.model.encoder.stages.\1.blocks.\2.layers.\3.conv2.convolution.weight", - r"backbone.stages.(\d+).blocks.(\d+).layers.(\d+).conv1.bn.(weight|bias|running_mean|running_var)": r"model.backbone.model.encoder.stages.\1.blocks.\2.layers.\3.conv1.normalization.\4", - r"backbone.stages.(\d+).blocks.(\d+).layers.(\d+).conv2.bn.(weight|bias|running_mean|running_var)": r"model.backbone.model.encoder.stages.\1.blocks.\2.layers.\3.conv2.normalization.\4", - # Backbone stages aggregation - r"backbone.stages.(\d+).blocks.(\d+).aggregation.0.conv.weight": r"model.backbone.model.encoder.stages.\1.blocks.\2.aggregation.0.convolution.weight", - r"backbone.stages.(\d+).blocks.(\d+).aggregation.1.conv.weight": r"model.backbone.model.encoder.stages.\1.blocks.\2.aggregation.1.convolution.weight", - r"backbone.stages.(\d+).blocks.(\d+).aggregation.0.bn.(weight|bias|running_mean|running_var)": r"model.backbone.model.encoder.stages.\1.blocks.\2.aggregation.0.normalization.\3", - r"backbone.stages.(\d+).blocks.(\d+).aggregation.1.bn.(weight|bias|running_mean|running_var)": r"model.backbone.model.encoder.stages.\1.blocks.\2.aggregation.1.normalization.\3", - # Backbone stages lab parameters for aggregation - r"backbone.stages.(\d+).blocks.(\d+).aggregation.0.lab.(scale|bias)": r"model.backbone.model.encoder.stages.\1.blocks.\2.aggregation.0.lab.\3", - r"backbone.stages.(\d+).blocks.(\d+).aggregation.1.lab.(scale|bias)": r"model.backbone.model.encoder.stages.\1.blocks.\2.aggregation.1.lab.\3", - r"backbone.stages.(\d+).blocks.(\d+).layers.(\d+).lab.(scale|bias)": r"model.backbone.model.encoder.stages.\1.blocks.\2.layers.\3.lab.\4", - # Conv1/Conv2 layers with lab - r"backbone.stages.(\d+).blocks.(\d+).layers.(\d+).conv1.lab.(scale|bias)": r"model.backbone.model.encoder.stages.\1.blocks.\2.layers.\3.conv1.lab.\4", - r"backbone.stages.(\d+).blocks.(\d+).layers.(\d+).conv2.lab.(scale|bias)": r"model.backbone.model.encoder.stages.\1.blocks.\2.layers.\3.conv2.lab.\4", - # Downsample with lab - r"backbone.stages.(\d+).downsample.lab.(scale|bias)": r"model.backbone.model.encoder.stages.\1.downsample.lab.\2", - # Backbone downsample - r"backbone.stages.(\d+).downsample.conv.weight": r"model.backbone.model.encoder.stages.\1.downsample.convolution.weight", - r"backbone.stages.(\d+).downsample.bn.(weight|bias|running_mean|running_var)": r"model.backbone.model.encoder.stages.\1.downsample.normalization.\2", - # Encoder mappings - r"encoder.encoder.(\d+).layers.0.self_attn.out_proj.(weight|bias)": r"model.encoder.encoder.\1.layers.0.self_attn.out_proj.\2", - r"encoder.encoder.(\d+).layers.0.linear1.(weight|bias)": r"model.encoder.encoder.\1.layers.0.fc1.\2", - r"encoder.encoder.(\d+).layers.0.linear2.(weight|bias)": r"model.encoder.encoder.\1.layers.0.fc2.\2", - r"encoder.encoder.(\d+).layers.0.norm1.(weight|bias)": r"model.encoder.encoder.\1.layers.0.self_attn_layer_norm.\2", - r"encoder.encoder.(\d+).layers.0.norm2.(weight|bias)": r"model.encoder.encoder.\1.layers.0.final_layer_norm.\2", - # Encoder projections and convolutions - r"encoder.input_proj.(\d+).conv.weight": r"model.encoder_input_proj.\1.0.weight", - r"encoder.input_proj.(\d+).norm.(weight|bias|running_mean|running_var)": r"model.encoder_input_proj.\1.1.\2", - r"encoder.lateral_convs.(\d+).conv.weight": r"model.encoder.lateral_convs.\1.conv.weight", - r"encoder.lateral_convs.(\d+).norm.(weight|bias|running_mean|running_var)": r"model.encoder.lateral_convs.\1.norm.\2", - # FPN blocks - complete structure - # Basic convolutions - 
r"encoder.fpn_blocks.(\d+).cv1.conv.weight": r"model.encoder.fpn_blocks.\1.conv1.conv.weight", - r"encoder.fpn_blocks.(\d+).cv1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.fpn_blocks.\1.conv1.norm.\2", - # CSP Rep1 path - r"encoder.fpn_blocks.(\d+).cv2.0.conv1.conv.weight": r"model.encoder.fpn_blocks.\1.csp_rep1.conv1.conv.weight", - r"encoder.fpn_blocks.(\d+).cv2.0.conv1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.fpn_blocks.\1.csp_rep1.conv1.norm.\2", - r"encoder.fpn_blocks.(\d+).cv2.0.conv2.conv.weight": r"model.encoder.fpn_blocks.\1.csp_rep1.conv2.conv.weight", - r"encoder.fpn_blocks.(\d+).cv2.0.conv2.norm.(weight|bias|running_mean|running_var)": r"model.encoder.fpn_blocks.\1.csp_rep1.conv2.norm.\2", - r"encoder.fpn_blocks.(\d+).cv2.1.conv.weight": r"model.encoder.fpn_blocks.\1.conv2.conv.weight", - r"encoder.fpn_blocks.(\d+).cv2.1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.fpn_blocks.\1.conv2.norm.\2", - # CSP Rep2 path - r"encoder.fpn_blocks.(\d+).cv3.0.conv1.conv.weight": r"model.encoder.fpn_blocks.\1.csp_rep2.conv1.conv.weight", - r"encoder.fpn_blocks.(\d+).cv3.0.conv1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.fpn_blocks.\1.csp_rep2.conv1.norm.\2", - r"encoder.fpn_blocks.(\d+).cv3.0.conv2.conv.weight": r"model.encoder.fpn_blocks.\1.csp_rep2.conv2.conv.weight", - r"encoder.fpn_blocks.(\d+).cv3.0.conv2.norm.(weight|bias|running_mean|running_var)": r"model.encoder.fpn_blocks.\1.csp_rep2.conv2.norm.\2", - r"encoder.fpn_blocks.(\d+).cv3.1.conv.weight": r"model.encoder.fpn_blocks.\1.conv3.conv.weight", - r"encoder.fpn_blocks.(\d+).cv3.1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.fpn_blocks.\1.conv3.norm.\2", - # Final conv - r"encoder.fpn_blocks.(\d+).cv4.conv.weight": r"model.encoder.fpn_blocks.\1.conv4.conv.weight", - r"encoder.fpn_blocks.(\d+).cv4.norm.(weight|bias|running_mean|running_var)": r"model.encoder.fpn_blocks.\1.conv4.norm.\2", - # Bottlenecks for CSP Rep1 - r"encoder.fpn_blocks.(\d+).cv2.0.bottlenecks.(\d+).conv1.conv.weight": r"model.encoder.fpn_blocks.\1.csp_rep1.bottlenecks.\2.conv1.conv.weight", - r"encoder.fpn_blocks.(\d+).cv2.0.bottlenecks.(\d+).conv1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.fpn_blocks.\1.csp_rep1.bottlenecks.\2.conv1.norm.\3", - r"encoder.fpn_blocks.(\d+).cv2.0.bottlenecks.(\d+).conv2.conv.weight": r"model.encoder.fpn_blocks.\1.csp_rep1.bottlenecks.\2.conv2.conv.weight", - r"encoder.fpn_blocks.(\d+).cv2.0.bottlenecks.(\d+).conv2.norm.(weight|bias|running_mean|running_var)": r"model.encoder.fpn_blocks.\1.csp_rep1.bottlenecks.\2.conv2.norm.\3", - # Bottlenecks for CSP Rep2 - r"encoder.fpn_blocks.(\d+).cv3.0.bottlenecks.(\d+).conv1.conv.weight": r"model.encoder.fpn_blocks.\1.csp_rep2.bottlenecks.\2.conv1.conv.weight", - r"encoder.fpn_blocks.(\d+).cv3.0.bottlenecks.(\d+).conv1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.fpn_blocks.\1.csp_rep2.bottlenecks.\2.conv1.norm.\3", - r"encoder.fpn_blocks.(\d+).cv3.0.bottlenecks.(\d+).conv2.conv.weight": r"model.encoder.fpn_blocks.\1.csp_rep2.bottlenecks.\2.conv2.conv.weight", - r"encoder.fpn_blocks.(\d+).cv3.0.bottlenecks.(\d+).conv2.norm.(weight|bias|running_mean|running_var)": r"model.encoder.fpn_blocks.\1.csp_rep2.bottlenecks.\2.conv2.norm.\3", - # PAN blocks - complete structure - # Basic convolutions - r"encoder.pan_blocks.(\d+).cv1.conv.weight": r"model.encoder.pan_blocks.\1.conv1.conv.weight", - r"encoder.pan_blocks.(\d+).cv1.norm.(weight|bias|running_mean|running_var)": 
r"model.encoder.pan_blocks.\1.conv1.norm.\2", - # CSP Rep1 path - r"encoder.pan_blocks.(\d+).cv2.0.conv1.conv.weight": r"model.encoder.pan_blocks.\1.csp_rep1.conv1.conv.weight", - r"encoder.pan_blocks.(\d+).cv2.0.conv1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.pan_blocks.\1.csp_rep1.conv1.norm.\2", - r"encoder.pan_blocks.(\d+).cv2.0.conv2.conv.weight": r"model.encoder.pan_blocks.\1.csp_rep1.conv2.conv.weight", - r"encoder.pan_blocks.(\d+).cv2.0.conv2.norm.(weight|bias|running_mean|running_var)": r"model.encoder.pan_blocks.\1.csp_rep1.conv2.norm.\2", - r"encoder.pan_blocks.(\d+).cv2.1.conv.weight": r"model.encoder.pan_blocks.\1.conv2.conv.weight", - r"encoder.pan_blocks.(\d+).cv2.1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.pan_blocks.\1.conv2.norm.\2", - # CSP Rep2 path - r"encoder.pan_blocks.(\d+).cv3.0.conv1.conv.weight": r"model.encoder.pan_blocks.\1.csp_rep2.conv1.conv.weight", - r"encoder.pan_blocks.(\d+).cv3.0.conv1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.pan_blocks.\1.csp_rep2.conv1.norm.\2", - r"encoder.pan_blocks.(\d+).cv3.0.conv2.conv.weight": r"model.encoder.pan_blocks.\1.csp_rep2.conv2.conv.weight", - r"encoder.pan_blocks.(\d+).cv3.0.conv2.norm.(weight|bias|running_mean|running_var)": r"model.encoder.pan_blocks.\1.csp_rep2.conv2.norm.\2", - r"encoder.pan_blocks.(\d+).cv3.1.conv.weight": r"model.encoder.pan_blocks.\1.conv3.conv.weight", - r"encoder.pan_blocks.(\d+).cv3.1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.pan_blocks.\1.conv3.norm.\2", - # Final conv - r"encoder.pan_blocks.(\d+).cv4.conv.weight": r"model.encoder.pan_blocks.\1.conv4.conv.weight", - r"encoder.pan_blocks.(\d+).cv4.norm.(weight|bias|running_mean|running_var)": r"model.encoder.pan_blocks.\1.conv4.norm.\2", - # Bottlenecks for CSP Rep1 - r"encoder.pan_blocks.(\d+).cv2.0.bottlenecks.(\d+).conv1.conv.weight": r"model.encoder.pan_blocks.\1.csp_rep1.bottlenecks.\2.conv1.conv.weight", - r"encoder.pan_blocks.(\d+).cv2.0.bottlenecks.(\d+).conv1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.pan_blocks.\1.csp_rep1.bottlenecks.\2.conv1.norm.\3", - r"encoder.pan_blocks.(\d+).cv2.0.bottlenecks.(\d+).conv2.conv.weight": r"model.encoder.pan_blocks.\1.csp_rep1.bottlenecks.\2.conv2.conv.weight", - r"encoder.pan_blocks.(\d+).cv2.0.bottlenecks.(\d+).conv2.norm.(weight|bias|running_mean|running_var)": r"model.encoder.pan_blocks.\1.csp_rep1.bottlenecks.\2.conv2.norm.\3", - # Bottlenecks for CSP Rep2 - r"encoder.pan_blocks.(\d+).cv3.0.bottlenecks.(\d+).conv1.conv.weight": r"model.encoder.pan_blocks.\1.csp_rep2.bottlenecks.\2.conv1.conv.weight", - r"encoder.pan_blocks.(\d+).cv3.0.bottlenecks.(\d+).conv1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.pan_blocks.\1.csp_rep2.bottlenecks.\2.conv1.norm.\3", - r"encoder.pan_blocks.(\d+).cv3.0.bottlenecks.(\d+).conv2.conv.weight": r"model.encoder.pan_blocks.\1.csp_rep2.bottlenecks.\2.conv2.conv.weight", - r"encoder.pan_blocks.(\d+).cv3.0.bottlenecks.(\d+).conv2.norm.(weight|bias|running_mean|running_var)": r"model.encoder.pan_blocks.\1.csp_rep2.bottlenecks.\2.conv2.norm.\3", - # Downsample convolutions - r"encoder.downsample_convs.(\d+).0.cv(\d+).conv.weight": r"model.encoder.downsample_convs.\1.conv\2.conv.weight", - r"encoder.downsample_convs.(\d+).0.cv(\d+).norm.(weight|bias|running_mean|running_var)": r"model.encoder.downsample_convs.\1.conv\2.norm.\3", - # Decoder layers - r"decoder.decoder.layers.(\d+).self_attn.out_proj.(weight|bias)": 
r"model.decoder.layers.\1.self_attn.out_proj.\2", - r"decoder.decoder.layers.(\d+).cross_attn.sampling_offsets.(weight|bias)": r"model.decoder.layers.\1.encoder_attn.sampling_offsets.\2", - r"decoder.decoder.layers.(\d+).cross_attn.attention_weights.(weight|bias)": r"model.decoder.layers.\1.encoder_attn.attention_weights.\2", - r"decoder.decoder.layers.(\d+).cross_attn.value_proj.(weight|bias)": r"model.decoder.layers.\1.encoder_attn.value_proj.\2", - r"decoder.decoder.layers.(\d+).cross_attn.output_proj.(weight|bias)": r"model.decoder.layers.\1.encoder_attn.output_proj.\2", - r"decoder.decoder.layers.(\d+).cross_attn.num_points_scale": r"model.decoder.layers.\1.encoder_attn.num_points_scale", - r"decoder.decoder.layers.(\d+).gateway.gate.(weight|bias)": r"model.decoder.layers.\1.gateway.gate.\2", - r"decoder.decoder.layers.(\d+).gateway.norm.(weight|bias)": r"model.decoder.layers.\1.gateway.norm.\2", - r"decoder.decoder.layers.(\d+).norm1.(weight|bias)": r"model.decoder.layers.\1.self_attn_layer_norm.\2", - r"decoder.decoder.layers.(\d+).norm2.(weight|bias)": r"model.decoder.layers.\1.encoder_attn_layer_norm.\2", - r"decoder.decoder.layers.(\d+).norm3.(weight|bias)": r"model.decoder.layers.\1.final_layer_norm.\2", - r"decoder.decoder.layers.(\d+).linear1.(weight|bias)": r"model.decoder.layers.\1.fc1.\2", - r"decoder.decoder.layers.(\d+).linear2.(weight|bias)": r"model.decoder.layers.\1.fc2.\2", - # LQE layers - r"decoder.decoder.lqe_layers.(\d+).reg_conf.layers.(\d+).(weight|bias)": r"model.decoder.lqe_layers.\1.reg_conf.layers.\2.\3", - # Decoder heads and projections - r"decoder.dec_score_head.(\d+).(weight|bias)": r"model.decoder.class_embed.\1.\2", - r"decoder.dec_bbox_head.(\d+).layers.(\d+).(weight|bias)": r"model.decoder.bbox_embed.\1.layers.\2.\3", - r"decoder.pre_bbox_head.layers.(\d+).(weight|bias)": r"model.decoder.pre_bbox_head.layers.\1.\2", - r"decoder.input_proj.(\d+).conv.weight": r"model.decoder_input_proj.\1.0.weight", - r"decoder.input_proj.(\d+).norm.(weight|bias|running_mean|running_var)": r"model.decoder_input_proj.\1.1.\2", - # Other decoder components - r"decoder.denoising_class_embed.weight": r"model.denoising_class_embed.weight", - r"decoder.query_pos_head.layers.(\d+).(weight|bias)": r"model.decoder.query_pos_head.layers.\1.\2", - r"decoder.enc_output.proj.(weight|bias)": r"model.enc_output.0.\1", - r"decoder.enc_output.norm.(weight|bias)": r"model.enc_output.1.\1", - r"decoder.enc_score_head.(weight|bias)": r"model.enc_score_head.\1", - r"decoder.enc_bbox_head.layers.(\d+).(weight|bias)": r"model.enc_bbox_head.layers.\1.\2", -} - - -def convert_old_keys_to_new_keys(state_dict_keys: Optional[dict] = None): - # Use the mapping to rename keys - for original_key, converted_key in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): - for key in list(state_dict_keys.keys()): - new_key = re.sub(original_key, converted_key, key) - if new_key != key: - state_dict_keys[new_key] = state_dict_keys.pop(key) - - return state_dict_keys - - -def read_in_q_k_v(state_dict, config, model_name): - prefix = "" - encoder_hidden_dim = config.encoder_hidden_dim - - # first: transformer encoder - for i in range(config.encoder_layers): - # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"{prefix}encoder.encoder.{i}.layers.0.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"{prefix}encoder.encoder.{i}.layers.0.self_attn.in_proj_bias") - # next, add query, keys and values (in that 
order) to the state dict - state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.q_proj.weight"] = in_proj_weight[ - :encoder_hidden_dim, : - ] - state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.q_proj.bias"] = in_proj_bias[:encoder_hidden_dim] - state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.k_proj.weight"] = in_proj_weight[ - encoder_hidden_dim : 2 * encoder_hidden_dim, : - ] - state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.k_proj.bias"] = in_proj_bias[ - encoder_hidden_dim : 2 * encoder_hidden_dim - ] - state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.v_proj.weight"] = in_proj_weight[ - -encoder_hidden_dim:, : - ] - state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.v_proj.bias"] = in_proj_bias[-encoder_hidden_dim:] - # next: transformer decoder (which is a bit more complex because it also includes cross-attention) - for i in range(config.decoder_layers): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = state_dict.pop(f"{prefix}decoder.decoder.layers.{i}.self_attn.in_proj_weight", None) - in_proj_bias = state_dict.pop(f"{prefix}decoder.decoder.layers.{i}.self_attn.in_proj_bias", None) - # next, add query, keys and values (in that order) to the state dict - if model_name in ["dfine_n_coco", "dfine_n_obj2coco_e25", "dfine_n_obj365"]: - state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:128, :] - state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:128] - state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[128:256, :] - state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[128:256] - state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-128:, :] - state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-128:] - else: - state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_d_fine_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, repo_id): - """ - Copy/paste/tweak model's weights to our D-FINE structure. 
- """ - - # load default config - config = get_d_fine_config(model_name) - state_dict = load_original_state_dict(repo_id, model_name) - state_dict.pop("decoder.valid_mask", None) - state_dict.pop("decoder.anchors", None) - model = DFineForObjectDetection(config) - logger.info(f"Converting model {model_name}...") - - state_dict = convert_old_keys_to_new_keys(state_dict) - state_dict.pop("decoder.model.decoder.up", None) - state_dict.pop("decoder.model.decoder.reg_scale", None) - - # query, key and value matrices need special treatment - read_in_q_k_v(state_dict, config, model_name) - # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them - for key in state_dict.copy(): - if key.endswith("num_batches_tracked"): - del state_dict[key] - # for two_stage - if "bbox_embed" in key or ("class_embed" in key and "denoising_" not in key): - state_dict[key.split("model.decoder.")[-1]] = state_dict[key] - - # finally, create HuggingFace model and load state dict - model.load_state_dict(state_dict) - model.eval() - - # load image processor - image_processor = RTDetrImageProcessor() - - # prepare image - img = prepare_img() - - # preprocess image - transformations = transforms.Compose( - [ - transforms.Resize([640, 640], interpolation=transforms.InterpolationMode.BILINEAR), - transforms.ToTensor(), - ] - ) - original_pixel_values = transformations(img).unsqueeze(0) # insert batch dimension - - encoding = image_processor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - assert torch.allclose(original_pixel_values, pixel_values) - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - model.to(device) - pixel_values = pixel_values.to(device) - - outputs = model(pixel_values) - - if model_name == "dfine_x_coco": - expected_slice_logits = torch.tensor( - [ - [-4.844723, -4.7293096, -4.5971327], - [-4.554266, -4.61723, -4.627926], - [-4.3934402, -4.6064143, -4.139952], - ] - ) - expected_slice_boxes = torch.tensor( - [ - [0.2565248, 0.5477609, 0.47644863], - [0.7690029, 0.41423926, 0.46148556], - [0.1688096, 0.19923759, 0.21118002], - ] - ) - elif model_name == "dfine_x_obj2coco": - expected_slice_logits = torch.tensor( - [ - [-4.230433, -6.6295037, -4.8339615], - [-4.085411, -6.3280816, -4.695468], - [-3.8968022, -6.336813, -4.67051], - ] - ) - expected_slice_boxes = torch.tensor( - [ - [0.25707328, 0.54842496, 0.47624254], - [0.76967394, 0.41272867, 0.45970756], - [0.16882066, 0.19918433, 0.2112098], - ] - ) - elif model_name == "dfine_x_obj365": - expected_slice_logits = torch.tensor( - [ - [-6.3844957, -3.7549126, -4.6873264], - [-5.8433194, -3.4490552, -3.3228905], - [-6.5314736, -3.7856622, -4.895984], - ] - ) - expected_slice_boxes = torch.tensor( - [ - [0.7703046, 0.41329497, 0.45932162], - [0.16898105, 0.19876392, 0.21050783], - [0.25134972, 0.5517619, 0.4864124], - ] - ) - elif model_name == "dfine_m_coco": - expected_slice_logits = torch.tensor( - [ - [-4.5187078, -4.71708, -4.117749], - [-4.513984, -4.937715, -3.829125], - [-4.830042, -6.931682, -3.1740026], - ] - ) - expected_slice_boxes = torch.tensor( - [ - [0.25851426, 0.5489963, 0.4757598], - [0.769683, 0.41411665, 0.45988125], - [0.16866133, 0.19921188, 0.21207744], - ] - ) - elif model_name == "dfine_m_obj2coco": - expected_slice_logits = torch.tensor( - [ - [-4.520666, -7.6678333, -5.739887], - [-4.5053635, -7.510611, -5.452532], - [-4.70348, -5.6098466, -5.0199957], - ] - ) - expected_slice_boxes = torch.tensor( - [ - 
[0.2567608, 0.5485795, 0.4767465], - [0.77035284, 0.41236404, 0.4580645], - [0.5498525, 0.27548885, 0.05886984], - ] - ) - elif model_name == "dfine_m_obj365": - expected_slice_logits = torch.tensor( - [ - [-5.770525, -3.1610885, -5.2807794], - [-5.7809954, -3.768266, -5.1146393], - [-6.180705, -3.7357295, -3.1651964], - ] - ) - expected_slice_boxes = torch.tensor( - [ - [0.2529114, 0.5526663, 0.48270613], - [0.7712474, 0.41294736, 0.457174], - [0.5497157, 0.27588123, 0.05813372], - ] - ) - elif model_name == "dfine_l_coco": - expected_slice_logits = torch.tensor( - [ - [-4.068779, -5.169955, -4.339212], - [-3.9461594, -5.0279613, -4.0161457], - [-4.218292, -6.196324, -5.175245], - ] - ) - expected_slice_boxes = torch.tensor( - [ - [0.2564867, 0.5489948, 0.4748876], - [0.7693534, 0.4138953, 0.4598034], - [0.16875696, 0.19875404, 0.21196914], - ] - ) - elif model_name == "dfine_l_obj365": - expected_slice_logits = torch.tensor( - [ - [-5.7953215, -3.4901116, -5.4394145], - [-5.7032104, -3.671125, -5.76121], - [-6.09466, -3.1512096, -4.285499], - ] - ) - expected_slice_boxes = torch.tensor( - [ - [0.7693825, 0.41265628, 0.4606362], - [0.25306237, 0.55187637, 0.4832178], - [0.16892478, 0.19880727, 0.21115331], - ] - ) - elif model_name == "dfine_l_obj2coco_e25": - expected_slice_logits = torch.tensor( - [ - [-3.6098495, -6.633563, -5.1227236], - [-3.682696, -6.9178205, -5.414557], - [-4.491674, -6.0823426, -4.5718226], - ] - ) - expected_slice_boxes = torch.tensor( - [ - [0.7697078, 0.41368833, 0.45879585], - [0.2573691, 0.54856044, 0.47715297], - [0.16895264, 0.19871138, 0.2115552], - ] - ) - elif model_name == "dfine_n_coco": - expected_slice_logits = torch.tensor( - [ - [-3.7827945, -5.0889463, -4.8341026], - [-5.3046904, -6.2801714, -2.9276395], - [-4.497901, -5.2670407, -6.2380104], - ] - ) - expected_slice_boxes = torch.tensor( - [ - [0.73334837, 0.4270624, 0.39424777], - [0.1680235, 0.1988639, 0.21031213], - [0.25370035, 0.5534435, 0.48496848], - ] - ) - elif model_name == "dfine_s_coco": - expected_slice_logits = torch.tensor( - [ - [-3.8097816, -4.7724586, -5.994499], - [-5.2974715, -9.499067, -6.1653666], - [-5.3502765, -3.9530406, -6.3630295], - ] - ) - expected_slice_boxes = torch.tensor( - [ - [0.7677696, 0.41479152, 0.46441072], - [0.16912134, 0.19869131, 0.2123824], - [0.2581653, 0.54818195, 0.47512347], - ] - ) - elif model_name == "dfine_s_obj2coco": - expected_slice_logits = torch.tensor( - [ - [-6.0208125, -7.532673, -5.0572147], - [-3.3595953, -9.057545, -6.376975], - [-4.3203554, -9.546032, -6.075504], - ] - ) - expected_slice_boxes = torch.tensor( - [ - [0.16901012, 0.19883151, 0.21121952], - [0.76784194, 0.41266578, 0.46402973], - [00.2563128, 0.54797643, 0.47937632], - ] - ) - elif model_name == "dfine_s_obj365": - expected_slice_logits = torch.tensor( - [ - [-6.3807316, -4.320986, -6.4775343], - [-6.5818424, -3.5009093, -5.75824], - [-5.748005, -4.3228016, -4.003726], - ] - ) - expected_slice_boxes = torch.tensor( - [ - [0.2532072, 0.5491191, 0.48222217], - [0.76586807, 0.41175705, 0.46789962], - [0.169111, 0.19844547, 0.21069047], - ] - ) - else: - raise ValueError(f"Unknown d_fine_name: {model_name}") - - assert torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits.to(outputs.logits.device), atol=1e-3) - assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes.to(outputs.pred_boxes.device), atol=1e-4) - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {model_name} to 
{pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - # Upload model, image processor and config to the hub - logger.info("Uploading PyTorch model and image processor to the hub...") - config.push_to_hub( - repo_id=repo_id, - commit_message="Add config from convert_d_fine_original_pytorch_checkpoint_to_hf.py", - ) - model.push_to_hub( - repo_id=repo_id, - commit_message="Add model from convert_d_fine_original_pytorch_checkpoint_to_hf.py", - ) - image_processor.push_to_hub( - repo_id=repo_id, - commit_message="Add image processor from convert_d_fine_original_pytorch_checkpoint_to_hf.py", - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name", - default="dfine_s_coco", - type=str, - help="model_name of the checkpoint you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to the hub or not.") - parser.add_argument( - "--repo_id", - type=str, - help="repo_id where the model will be pushed to.", - ) - args = parser.parse_args() - convert_d_fine_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.repo_id) diff --git a/src/transformers/models/d_fine/modeling_d_fine.py b/src/transformers/models/d_fine/modeling_d_fine.py index 5cc2f5e221d1..cdc008e3c7bb 100644 --- a/src/transformers/models/d_fine/modeling_d_fine.py +++ b/src/transformers/models/d_fine/modeling_d_fine.py @@ -459,6 +459,12 @@ def _init_weights(self, module): nn.init.constant_(layer.layers[-1].weight, 0) nn.init.constant_(layer.layers[-1].bias, 0) + if hasattr(module, "reg_scale"): + module.reg_scale.fill_(self.config.reg_scale) + + if hasattr(module, "up"): + module.up.fill_(self.config.up) + if isinstance(module, DFineMultiscaleDeformableAttention): nn.init.constant_(module.sampling_offsets.weight.data, 0.0) default_dtype = torch.get_default_dtype() @@ -496,6 +502,10 @@ def _init_weights(self, module): init.constant_(module.reg_conf.layers[-1].bias, 0) init.constant_(module.reg_conf.layers[-1].weight, 0) + if isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + if hasattr(module, "weight_embedding") and self.config.learn_initial_query: nn.init.xavier_uniform_(module.weight_embedding.weight) if hasattr(module, "denoising_class_embed") and self.config.num_denoising > 0: @@ -1833,8 +1843,6 @@ def __init__( self, config: DFineConfig, in_channels: int, out_channels: int, num_blocks: int, expansion: float = 1.0 ): super().__init__() - in_channels = in_channels - out_channels = out_channels activation = config.activation_function hidden_channels = int(out_channels * expansion) diff --git a/src/transformers/models/d_fine/modular_d_fine.py b/src/transformers/models/d_fine/modular_d_fine.py index 52ac7fef7b0d..9a41fb23308e 100644 --- a/src/transformers/models/d_fine/modular_d_fine.py +++ b/src/transformers/models/d_fine/modular_d_fine.py @@ -635,6 +635,12 @@ def _init_weights(self, module): nn.init.constant_(layer.layers[-1].weight, 0) nn.init.constant_(layer.layers[-1].bias, 0) + if hasattr(module, "reg_scale"): + module.reg_scale.fill_(self.config.reg_scale) + + if hasattr(module, "up"): + module.up.fill_(self.config.up) + if isinstance(module, 
DFineMultiscaleDeformableAttention): nn.init.constant_(module.sampling_offsets.weight.data, 0.0) default_dtype = torch.get_default_dtype() @@ -672,6 +678,10 @@ def _init_weights(self, module): init.constant_(module.reg_conf.layers[-1].bias, 0) init.constant_(module.reg_conf.layers[-1].weight, 0) + if isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + if hasattr(module, "weight_embedding") and self.config.learn_initial_query: nn.init.xavier_uniform_(module.weight_embedding.weight) if hasattr(module, "denoising_class_embed") and self.config.num_denoising > 0: @@ -1100,8 +1110,6 @@ def __init__( self, config: DFineConfig, in_channels: int, out_channels: int, num_blocks: int, expansion: float = 1.0 ): super().__init__() - in_channels = in_channels - out_channels = out_channels activation = config.activation_function hidden_channels = int(out_channels * expansion) diff --git a/src/transformers/models/dab_detr/convert_dab_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/dab_detr/convert_dab_detr_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index efaac368f64b..000000000000 --- a/src/transformers/models/dab_detr/convert_dab_detr_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,234 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DAB-DETR checkpoints.""" - -import argparse -import gc -import json -import re -from pathlib import Path -from typing import Optional - -import torch -from huggingface_hub import hf_hub_download - -from transformers import ConditionalDetrImageProcessor, DabDetrConfig, DabDetrForObjectDetection -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - # convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads - # for dab-DETR, also convert reference point head and query scale MLP - r"input_proj\.(bias|weight)": r"input_projection.\1", - r"refpoint_embed\.weight": r"query_refpoint_embeddings.weight", - r"class_embed\.(bias|weight)": r"class_embed.\1", - # negative lookbehind because of the overlap - r"(?= 0.9.0") - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -SAMPLE_TEXT = "Hello world! cécé herlolip" - - -def convert_data2vec_checkpoint_to_pytorch( - data2vec_checkpoint_path: str, pytorch_dump_folder_path: str, classification_head: bool -): - """ - Copy/paste/tweak data2vec's weights to our BERT structure. 
- """ - data2vec_checkpoint_dir, data2vec_checkpoint_file_name = os.path.split(data2vec_checkpoint_path) - data2vec = Data2VecTextModel.from_pretrained( - data2vec_checkpoint_dir, checkpoint_file=data2vec_checkpoint_file_name - ) - data2vec.eval() # disable dropout - data2vec_model = data2vec.models[0] - data2vec_sent_encoder = data2vec_model.encoder.sentence_encoder - config = Data2VecTextConfig( - vocab_size=data2vec_sent_encoder.embed_tokens.num_embeddings, - hidden_size=data2vec_model.args.encoder_embed_dim, - num_hidden_layers=data2vec_model.args.encoder_layers, - num_attention_heads=data2vec_model.args.encoder_attention_heads, - intermediate_size=data2vec_model.args.encoder_ffn_embed_dim, - max_position_embeddings=514, - type_vocab_size=1, - layer_norm_eps=1e-5, # PyTorch default used in fairseq - ) - if classification_head: - config.num_labels = data2vec.model.classification_heads["mnli"].out_proj.weight.shape[0] - print("Our BERT config:", config) - - model = Data2VecTextForSequenceClassification(config) if classification_head else Data2VecTextForMaskedLM(config) - model.eval() - - # Now let's copy all the weights. - # Embeddings - model.data2vec_text.embeddings.word_embeddings.weight = data2vec_sent_encoder.embed_tokens.weight - model.data2vec_text.embeddings.position_embeddings.weight = data2vec_sent_encoder.embed_positions.weight - model.data2vec_text.embeddings.token_type_embeddings.weight.data = torch.zeros_like( - model.data2vec_text.embeddings.token_type_embeddings.weight - ) # just zero them out b/c data2vec doesn't use them. - model.data2vec_text.embeddings.LayerNorm.weight = data2vec_sent_encoder.layernorm_embedding.weight - model.data2vec_text.embeddings.LayerNorm.bias = data2vec_sent_encoder.layernorm_embedding.bias - - for i in range(config.num_hidden_layers): - # Encoder: start of layer - layer: BertLayer = model.data2vec_text.encoder.layer[i] - data2vec_layer: TransformerSentenceEncoderLayer = data2vec_sent_encoder.layers[i] - - # self attention - self_attn: BertSelfAttention = layer.attention.self - assert data2vec_layer.self_attn.k_proj.weight.data.shape == torch.Size( - (config.hidden_size, config.hidden_size) - ), ( - "Shape for data2vec_layer.self_attn.k_proj.weight.data should be" - f" {torch.Size((config.hidden_size, config.hidden_size))}" - ) - assert data2vec_layer.self_attn.q_proj.weight.data.shape == torch.Size( - (config.hidden_size, config.hidden_size) - ), ( - "Shape for data2vec_layer.self_attn.q_proj.weight.data should be" - f" {torch.Size((config.hidden_size, config.hidden_size))}" - ) - assert data2vec_layer.self_attn.v_proj.weight.data.shape == torch.Size( - (config.hidden_size, config.hidden_size) - ), ( - "Shape for data2vec_layer.self_attn.v_proj.weight.data should be" - f" {torch.Size((config.hidden_size, config.hidden_size))}" - ) - - self_attn.query.weight.data = data2vec_layer.self_attn.q_proj.weight - self_attn.query.bias.data = data2vec_layer.self_attn.q_proj.bias - self_attn.key.weight.data = data2vec_layer.self_attn.k_proj.weight - self_attn.key.bias.data = data2vec_layer.self_attn.k_proj.bias - self_attn.value.weight.data = data2vec_layer.self_attn.v_proj.weight - self_attn.value.bias.data = data2vec_layer.self_attn.v_proj.bias - - # self-attention output - self_output: BertSelfOutput = layer.attention.output - assert self_output.dense.weight.shape == data2vec_layer.self_attn.out_proj.weight.shape, ( - f"Shape for self_output.dense.weight should be {data2vec_layer.self_attn.out_proj.weight.shape}" - ) - self_output.dense.weight = 
data2vec_layer.self_attn.out_proj.weight - self_output.dense.bias = data2vec_layer.self_attn.out_proj.bias - self_output.LayerNorm.weight = data2vec_layer.self_attn_layer_norm.weight - self_output.LayerNorm.bias = data2vec_layer.self_attn_layer_norm.bias - - # intermediate - intermediate: BertIntermediate = layer.intermediate - assert intermediate.dense.weight.shape == data2vec_layer.fc1.weight.shape, ( - f"Shape for intermediate.dense.weight should be {data2vec_layer.fc1.weight.shape}" - ) - intermediate.dense.weight = data2vec_layer.fc1.weight - intermediate.dense.bias = data2vec_layer.fc1.bias - - # output - bert_output: BertOutput = layer.output - assert bert_output.dense.weight.shape == data2vec_layer.fc2.weight.shape, ( - f"Shape for bert_output.dense.weight should be {data2vec_layer.fc2.weight.shape}" - ) - bert_output.dense.weight = data2vec_layer.fc2.weight - bert_output.dense.bias = data2vec_layer.fc2.bias - bert_output.LayerNorm.weight = data2vec_layer.final_layer_norm.weight - bert_output.LayerNorm.bias = data2vec_layer.final_layer_norm.bias - # end of layer - - if classification_head: - model.classifier.dense.weight = data2vec.model.classification_heads["mnli"].dense.weight - model.classifier.dense.bias = data2vec.model.classification_heads["mnli"].dense.bias - model.classifier.out_proj.weight = data2vec.model.classification_heads["mnli"].out_proj.weight - model.classifier.out_proj.bias = data2vec.model.classification_heads["mnli"].out_proj.bias - else: - # LM Head - model.lm_head.dense.weight = data2vec_model.encoder.lm_head.dense.weight - model.lm_head.dense.bias = data2vec_model.encoder.lm_head.dense.bias - model.lm_head.layer_norm.weight = data2vec_model.encoder.lm_head.layer_norm.weight - model.lm_head.layer_norm.bias = data2vec_model.encoder.lm_head.layer_norm.bias - model.lm_head.decoder.weight = data2vec_model.encoder.lm_head.weight - model.lm_head.decoder.bias = data2vec_model.encoder.lm_head.bias - - # Let's check that we get the same results. - input_ids: torch.Tensor = data2vec.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1 - - our_output = model(input_ids)[0] - if classification_head: - their_output = data2vec.model.classification_heads["mnli"](data2vec.extract_features(input_ids)) - else: - their_output = data2vec_model(input_ids)[0] - print(our_output.shape, their_output.shape) - max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() - print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7 - success = torch.allclose(our_output, their_output, atol=1e-3) - print("Do both models output the same tensors?", "🔥" if success else "💩") - if not success: - raise Exception("Something went wRoNg") - - pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--classification_head", action="store_true", help="Whether to convert a final classification head." 
- ) - args = parser.parse_args() - convert_data2vec_checkpoint_to_pytorch( - args.checkpoint_path, args.pytorch_dump_folder_path, args.classification_head - ) diff --git a/src/transformers/models/data2vec/convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100755 index 910e1fc8e240..000000000000 --- a/src/transformers/models/data2vec/convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,368 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import json - -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from timm.models import create_model - -from transformers import ( - BeitImageProcessor, - Data2VecVisionConfig, - Data2VecVisionForImageClassification, - Data2VecVisionModel, -) - - -def create_rename_keys(config, has_lm_head=False, is_semantic=False, hf_prefix="data2vec."): - prefix = "backbone." if is_semantic else "" - - rename_keys = [] - for i in range(config.num_hidden_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append( - (f"{prefix}blocks.{i}.norm1.weight", f"{hf_prefix}encoder.layer.{i}.layernorm_before.weight") - ) - rename_keys.append((f"{prefix}blocks.{i}.norm1.bias", f"{hf_prefix}encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append( - (f"{prefix}blocks.{i}.attn.proj.weight", f"{hf_prefix}encoder.layer.{i}.attention.output.dense.weight") - ) - rename_keys.append( - (f"{prefix}blocks.{i}.attn.proj.bias", f"{hf_prefix}encoder.layer.{i}.attention.output.dense.bias") - ) - rename_keys.append( - (f"{prefix}blocks.{i}.norm2.weight", f"{hf_prefix}encoder.layer.{i}.layernorm_after.weight") - ) - rename_keys.append((f"{prefix}blocks.{i}.norm2.bias", f"{hf_prefix}encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append( - (f"{prefix}blocks.{i}.mlp.fc1.weight", f"{hf_prefix}encoder.layer.{i}.intermediate.dense.weight") - ) - rename_keys.append( - (f"{prefix}blocks.{i}.mlp.fc1.bias", f"{hf_prefix}encoder.layer.{i}.intermediate.dense.bias") - ) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.weight", f"{hf_prefix}encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.bias", f"{hf_prefix}encoder.layer.{i}.output.dense.bias")) - - # projection layer + position embeddings - rename_keys.extend( - [ - (f"{prefix}cls_token", f"{hf_prefix}embeddings.cls_token"), - (f"{prefix}patch_embed.proj.weight", f"{hf_prefix}embeddings.patch_embeddings.projection.weight"), - (f"{prefix}patch_embed.proj.bias", f"{hf_prefix}embeddings.patch_embeddings.projection.bias"), - ] - ) - - if has_lm_head: - # mask token + shared relative position bias + layernorm - rename_keys.extend( - [ - ("mask_token", f"{hf_prefix}embeddings.mask_token"), - ( - "rel_pos_bias.relative_position_bias_table", - f"{hf_prefix}encoder.relative_position_bias.relative_position_bias_table", - ), - ( - "rel_pos_bias.relative_position_index", - f"{hf_prefix}encoder.relative_position_bias.relative_position_index", - ), - ("norm.weight", "layernorm.weight"), - ("norm.bias", "layernorm.bias"), - ] - ) - elif is_semantic: - # semantic segmentation classification heads - rename_keys.extend( - [ - ("decode_head.conv_seg.weight", "decode_head.classifier.weight"), - ("decode_head.conv_seg.bias", "decode_head.classifier.bias"), - ("auxiliary_head.conv_seg.weight", "auxiliary_head.classifier.weight"), - ("auxiliary_head.conv_seg.bias", 
"auxiliary_head.classifier.bias"), - ] - ) - else: - # layernorm + classification head - rename_keys.extend( - [ - ("fc_norm.weight", f"{hf_prefix}pooler.layernorm.weight"), - ("fc_norm.bias", f"{hf_prefix}pooler.layernorm.bias"), - ("head.weight", "classifier.weight"), - ("head.bias", "classifier.bias"), - ] - ) - - return rename_keys - - -def read_in_q_k_v(state_dict, config, has_lm_head=False, is_semantic=False, hf_prefix="data2vec_vision."): - for i in range(config.num_hidden_layers): - prefix = "backbone." if is_semantic else "" - # queries, keys and values - in_proj_weight = state_dict.pop(f"{prefix}blocks.{i}.attn.qkv.weight") - q_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.v_bias") - - state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ - : config.hidden_size, : - ] - state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.query.bias"] = q_bias - state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.value.bias"] = v_bias - - # gamma_1 and gamma_2 - # we call them lambda because otherwise they are renamed when using .from_pretrained - gamma_1 = state_dict.pop(f"{prefix}blocks.{i}.gamma_1") - gamma_2 = state_dict.pop(f"{prefix}blocks.{i}.gamma_2") - - state_dict[f"{hf_prefix}encoder.layer.{i}.lambda_1"] = gamma_1 - state_dict[f"{hf_prefix}encoder.layer.{i}.lambda_2"] = gamma_2 - - # relative_position bias table + index - if not has_lm_head: - # each layer has its own relative position bias - table = state_dict.pop(f"{prefix}blocks.{i}.attn.relative_position_bias_table") - index = state_dict.pop(f"{prefix}blocks.{i}.attn.relative_position_index") - - state_dict[ - f"{hf_prefix}encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_bias_table" - ] = table - state_dict[ - f"{hf_prefix}encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_index" - ] = index - - -def get_args(): - parser = argparse.ArgumentParser( - "Convert Data2VecVision to HF for image classification and pretraining", add_help=False - ) - parser.add_argument("--hf_checkpoint_name", type=str) - parser.add_argument("--input_size", default=224, type=int, help="images input size") - parser.add_argument("--beit_checkpoint", default="", help="beit checkpoint") - - return parser.parse_args() - - -def load_beit_model(args, is_finetuned, is_large): - def load_state_dict(model, state_dict, prefix="", ignore_missing="relative_position_index"): - missing_keys = [] - unexpected_keys = [] - error_msgs = [] - # copy state_dict so _load_from_state_dict can modify it - metadata = getattr(state_dict, "_metadata", None) - state_dict = state_dict.copy() - if metadata is not None: - state_dict._metadata = metadata - - def load(module, prefix=""): - local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) - module._load_from_state_dict( - state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs - ) - for name, child in module._modules.items(): - if child is not None: - load(child, prefix + name + ".") - - load(model, prefix=prefix) - - warn_missing_keys = [] - ignore_missing_keys = [] - for key in missing_keys: - keep_flag = True - for ignore_key in 
ignore_missing.split("|"): - if ignore_key in key: - keep_flag = False - break - if keep_flag: - warn_missing_keys.append(key) - else: - ignore_missing_keys.append(key) - - missing_keys = warn_missing_keys - - if len(missing_keys) > 0: - print(f"Weights of {model.__class__.__name__} not initialized from pretrained model: {missing_keys}") - if len(unexpected_keys) > 0: - print(f"Weights from pretrained model not used in {model.__class__.__name__}: {unexpected_keys}") - if len(ignore_missing_keys) > 0: - print( - f"Ignored weights of {model.__class__.__name__} not initialized from pretrained model: {ignore_missing_keys}" - ) - if len(error_msgs) > 0: - print("\n".join(error_msgs)) - - model_kwargs = { - "pretrained": False, - "use_shared_rel_pos_bias": True, - "use_abs_pos_emb": False, - "init_values": 0.1, - } - - if is_finetuned: - model_kwargs.update( - { - "num_classes": 1000, - "use_mean_pooling": True, - "init_scale": 0.001, - "use_rel_pos_bias": True, - } - ) - - model = create_model( - "beit_large_patch16_224" if is_large else "beit_base_patch16_224", - **model_kwargs, - ) - patch_size = model.patch_embed.patch_size - args.window_size = (args.input_size // patch_size[0], args.input_size // patch_size[1]) - checkpoint = torch.load(args.beit_checkpoint, map_location="cpu", weights_only=True) - - print(f"Load ckpt from {args.beit_checkpoint}") - checkpoint_model = None - for model_key in ("model", "module"): - if model_key in checkpoint: - checkpoint_model = checkpoint[model_key] - print(f"Load state_dict by model_key = {model_key}") - break - - all_keys = list(checkpoint_model.keys()) - for key in all_keys: - if "relative_position_index" in key: - checkpoint_model.pop(key) - - if "relative_position_bias_table" in key: - rel_pos_bias = checkpoint_model[key] - src_num_pos, num_attn_heads = rel_pos_bias.size() - dst_num_pos, _ = model.state_dict()[key].size() - dst_patch_shape = model.patch_embed.patch_shape - if dst_patch_shape[0] != dst_patch_shape[1]: - raise NotImplementedError() - - load_state_dict(model, checkpoint_model, prefix="") - - return model - - -def main(): - args = get_args() - - is_finetuned = "ft1k" in args.hf_checkpoint_name - is_large = "large" in args.hf_checkpoint_name - - if is_finetuned: - # To convert Beit's data2vec_vision to HF you need to copy - # https://github.com/facebookresearch/data2vec_vision/blob/main/beit/modeling_finetune.py - # into this folder. - import modeling_finetune # noqa: F401 - else: - # To convert Beit's data2vec_vision to HF you need to copy - # https://github.com/facebookresearch/data2vec_vision/blob/main/beit/modeling_cyclical.py - # into this folder - # IMPORTANT: Note that for now we've only converted the down-stream - # model and not the full pretrained model. This means for the integration - # test you need to add a `return x` after the following line: - # https://github.com/facebookresearch/data2vec_vision/blob/af9a36349aaed59ae66e69b5dabeef2d62fdc5da/beit/modeling_cyclical.py#L197 - # to make the integration test pass. - import modeling_cyclical # noqa: F401 - - # 1. 
Create model config - config = Data2VecVisionConfig() - if is_finetuned: - config.use_relative_position_bias = True - config.use_shared_relative_position_bias = False - config.use_mean_pooling = True - config.num_labels = 1000 - - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - else: - config.use_relative_position_bias = False - config.use_shared_relative_position_bias = True - config.use_mean_pooling = False - - if is_large: - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - - # 2. Load Beit model - orig_model = load_beit_model(args, is_finetuned, is_large) - orig_model.eval() - - # 3. Forward Beit model - image_processor = BeitImageProcessor(size=config.image_size, do_center_crop=False) - image = Image.open("../../../../tests/fixtures/tests_samples/COCO/000000039769.png") - encoding = image_processor(images=image, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - orig_args = (pixel_values,) if is_finetuned else (pixel_values, None) - with torch.no_grad(): - orig_model_output = orig_model(*orig_args) - - # 4. Load HF Data2VecVision model - if is_finetuned: - hf_model = Data2VecVisionForImageClassification(config) - hf_model.eval() - has_lm_head = False - hf_prefix = "data2vec_vision." - else: - hf_model = Data2VecVisionModel(config) - hf_model.eval() - has_lm_head = True - hf_prefix = "" - - rename_keys = create_rename_keys(config, hf_prefix=hf_prefix, has_lm_head=has_lm_head) - state_dict = orig_model.state_dict() - for src, dest in rename_keys: - val = state_dict.pop(src) - state_dict[dest] = val - - read_in_q_k_v(state_dict, config, hf_prefix=hf_prefix, has_lm_head=has_lm_head) - missing_keys, unexpected_keys = hf_model.load_state_dict(state_dict, strict=False) - print("HF missing", missing_keys) - print("HF unexpected_keys", unexpected_keys) - - # 5. Forward HF Data2VecVision model - with torch.no_grad(): - hf_model_output = hf_model(pixel_values) - - hf_output = hf_model_output.logits if is_finetuned else hf_model_output.last_hidden_state - - # 6. Compare - max_absolute_diff = torch.max(torch.abs(hf_output - orig_model_output)).item() - - print(f"max_absolute_diff = {max_absolute_diff}") - success = torch.allclose(hf_output, orig_model_output, atol=1e-3) - print("Do both models output the same tensors?", "🔥" if success else "💩") - if not success: - raise Exception("Something went wRoNg") - - # 7. 
Save - print(f"Saving to {args.hf_checkpoint_name}") - hf_model.save_pretrained(args.hf_checkpoint_name) - image_processor.save_pretrained(args.hf_checkpoint_name) - - -if __name__ == "__main__": - main() - # Run the following to convert checkpoints - # python ./convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py \ - # --beit_checkpoint ./pretrained_base.pt \ - # --hf_checkpoint_name "./data2vec-vision-base" - # python ./convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py \ - # --beit_checkpoint ./finetuned_base.pt \ - # --hf_checkpoint_name "./data2vec-vision-base-ft1k" - # python ./convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py \ - # --beit_checkpoint ./pretrained_large.pt \ - # --hf_checkpoint_name "./data2vec-vision-large" - # python ./convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py \ - # --beit_checkpoint ./finetuned_large.pt \ - # --hf_checkpoint_name "./data2vec-vision-large-ft1k" diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index 9d06f00c0ce6..dd04dd947738 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -253,7 +253,6 @@ def forward( if rel_att is not None: attention_scores = attention_scores + rel_att - attention_scores = attention_scores attention_scores = attention_scores.view( -1, self.num_attention_heads, attention_scores.size(-2), attention_scores.size(-1) ) @@ -914,7 +913,7 @@ def forward(self, sequence_output, word_embeddings): @auto_docstring class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel): _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] - _keys_to_ignore_on_load_unexpected = r"mask_predictions.*" + _keys_to_ignore_on_load_unexpected = [r"mask_predictions.*"] def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/deepseek_vl/convert_deepseek_vl_weights_to_hf.py b/src/transformers/models/deepseek_vl/convert_deepseek_vl_weights_to_hf.py deleted file mode 100644 index 3e9b6a37fe09..000000000000 --- a/src/transformers/models/deepseek_vl/convert_deepseek_vl_weights_to_hf.py +++ /dev/null @@ -1,356 +0,0 @@ -# coding=utf-8 -# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import argparse -import gc -import json -import os -from typing import Optional - -import regex as re -import torch -from accelerate import init_empty_weights -from huggingface_hub import snapshot_download -from huggingface_hub.errors import HFValidationError -from safetensors.torch import load_file - -from transformers import ( - AutoTokenizer, - DeepseekVLConfig, - DeepseekVLForConditionalGeneration, - DeepseekVLImageProcessor, - DeepseekVLProcessor, -) -from transformers.image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD - - -# fmt: off -ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - # Siglip (Low Resolution) - r"vision_model.vision_tower.pos_embed": r"model.vision_model.vision_model.embeddings.position_embedding.weight", - r"vision_model.vision_tower.patch_embed.proj.(weight|bias)": r"model.vision_model.vision_model.embeddings.patch_embedding.\1", - r"vision_model.vision_tower.blocks.(\d+).attn.qkv.(weight|bias)": r"model.vision_model.vision_model.encoder.layers.\1.self_attn.(q|k|v)_proj.\2", - r"vision_model.vision_tower.blocks.(\d+).attn.proj.(weight|bias)": r"model.vision_model.vision_model.encoder.layers.\1.self_attn.out_proj.\2", - r"vision_model.vision_tower.blocks.(\d+).norm(\d+).(weight|bias)": r"model.vision_model.vision_model.encoder.layers.\1.layer_norm\2.\3", - r"vision_model.vision_tower.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"model.vision_model.vision_model.encoder.layers.\1.mlp.fc\2.\3", - r"vision_model.vision_tower.norm.(weight|bias)": r"model.vision_model.vision_model.post_layernorm.\1", - r"vision_model.vision_tower.attn_pool.latent": r"model.vision_model.vision_model.head.probe", - r"vision_model.vision_tower.attn_pool.proj.(weight|bias)": r"model.vision_model.vision_model.head.attention.out_proj.\1", - r"vision_model.vision_tower.attn_pool.norm.(weight|bias)": r"model.vision_model.vision_model.head.layernorm.\1", - r"vision_model.vision_tower.attn_pool.mlp.fc(\d+).(weight|bias)": r"model.vision_model.vision_model.head.mlp.fc\1.\2", - - # Aligner - r"aligner.layers.0.(weight|bias)": r"model.aligner.linear1.\1", - r"aligner.layers.2.(weight|bias)": r"model.aligner.linear2.\1", - - # Llama (Text Model) - r"language_model.model.(\w+)": r"model.language_model.\1", - r"language_model.lm_head.(weight|bias)": r"lm_head.\1", -} -# fmt: on - -# Adopted from https://github.com/deepseek-ai/DeepSeek-VL/blob/main/deepseek_vl/utils/conversation.py#L80-L91 -CHAT_TEMPLATE = ( - # Define separators and initialize counter - "{% set seps = ['\n\n', '<\uff5cend\u2581of\u2581sentence\uff5c>'] %}" - "{% set i = 0 %}" - # Start with default system prompt - "You are a helpful language and vision assistant. 
" - "You are able to understand the visual content that the user provides, " - "and assist the user with a variety of tasks using natural language.\n\n" - # Iterate through messages - "{% for message in messages %}" - # Identify user or assistant role - "{% if message['role']|lower == 'user' %}" - "User: " - "{% elif message['role']|lower == 'assistant' %}" - "Assistant:{% if not (loop.last and not add_generation_prompt and message['content'][0]['type']=='text' and message['content'][0]['text']=='') %} {% endif %}" - "{% else %}" - "{{ message['role'].capitalize() }}: " - "{% endif %}" - # Iterate through message content (text/images) - "{% for content in message['content'] %}" - # If content is an image, replace with placeholder - "{% if content['type'] == 'image' %}" - "" - # If content is text, handle formatting - "{% elif content['type'] == 'text' %}" - "{% set text = content['text'] %}" - # Strip whitespace for first and last text blocks - "{% if loop.first %}{% set text = text.lstrip() %}{% endif %}" - "{% if loop.last %}{% set text = text.rstrip() %}{% endif %}" - # If previous content was text, add space - "{% if not loop.first and message['content'][loop.index0-1]['type'] == 'text' %}" - "{{ ' ' + text }}" - "{% else %}" - "{{ text }}" - "{% endif %}" - "{% endif %}" - "{% endfor %}" # End message content loop - # Add separators between messages - "{% if not loop.last or add_generation_prompt %}" - "{% if message['role']|lower == 'user' %}" - "{{ seps[0] }}" - "{% else %}" - "{{ seps[1] }}" - "{% endif %}" - "{% endif %}" - "{% endfor %}" # End messages loop - # Add final Assistant prompt if required - "{% if add_generation_prompt %}Assistant:{% endif %}" -) - - -def convert_old_keys_to_new_keys(state_dict_keys: dict): - output_dict = {} - - old_text = "\n".join(state_dict_keys) - new_text = old_text - for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): - if replacement is None: - new_text = re.sub(pattern, "", new_text) # an empty line - continue - new_text = re.sub(pattern, replacement, new_text) - output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) - - return output_dict - - -def get_qkv_state_dict(key, parameter): - """ - new key which looks like this - xxxx.(q|k|v).xxx (m, n) - - is converted to - xxxx.q.xxxx (m//3, n) - xxxx.k.xxxx (m//3, n) - xxxx.v.xxxx (m//3, n) - """ - qkv_state_dict = {} - placeholder = re.search(r"(\(.*?\))", key).group(1) # finds "(query|key|value)" - replacements_keys = placeholder[1:-1].split("|") # creates ['query', 'key', 'value'] - replacements_vals = torch.split( - parameter, split_size_or_sections=parameter.size(0) // len(replacements_keys), dim=0 - ) - for replacement_key, replacement_val in zip(replacements_keys, replacements_vals): - qkv_state_dict[key.replace(placeholder, replacement_key)] = replacement_val - return qkv_state_dict - - -def update_state_dict(old_state_dict): - all_keys = list(old_state_dict.keys()) - new_keys = convert_old_keys_to_new_keys(all_keys) - - state_dict = {} - for key in all_keys: - new_key = new_keys[key] - current_parameter = old_state_dict.pop(key) - - if "qkv" in key and "vision_tower_high" not in key: - qkv_state_dict = get_qkv_state_dict(new_key, current_parameter) - state_dict.update(qkv_state_dict) - elif "pos_embed" in key: - if "vision_tower_high" not in key: - # timm implementation of siglip creates this param of size [1, 576, 1024] - # transformers implementation of siglip creates this param of size [576, 1024] - state_dict[new_key] = current_parameter.squeeze(0) - 
else: - state_dict[new_key] = current_parameter - else: - state_dict[new_key] = current_parameter - - return state_dict - - -def load_model_state_dict(input_path: str) -> dict: - """ - Load model state dict, handling both single and sharded files. - """ - index_path = os.path.join(input_path, "model.safetensors.index.json") - single_file_path = os.path.join(input_path, "model.safetensors") - - # Check if we have a sharded model - if os.path.exists(index_path): - print("Loading sharded model...") - state_dict = {} - with open(index_path, "r") as f: - index = json.load(f) - - # Get unique shard files and load each one only once - unique_shard_files = sorted(set(index["weight_map"].values())) - for shard_file in unique_shard_files: - print(f"Loading shard {shard_file}...") - shard_path = os.path.join(input_path, shard_file) - shard_dict = load_file(shard_path) - state_dict.update(shard_dict) - - return state_dict - - # Single file model - elif os.path.exists(single_file_path): - print("Loading single file model...") - return load_file(single_file_path, device="cpu") - - else: - raise ValueError(f"No model files found in {input_path}") - - -def convert_model( - hf_repo_id: str, - output_dir: Optional[str] = None, - output_hub_path: Optional[str] = None, - safe_serialization: bool = True, -): - if output_dir: - os.makedirs(output_dir, exist_ok=True) - - try: - input_path = snapshot_download(hf_repo_id) - except HFValidationError: - # If the input path is not a HF repo ID, assume it's a local path - input_path = hf_repo_id - - # ------------------------------------------------------------ - # Create and save config - # ------------------------------------------------------------ - - config = DeepseekVLConfig( - text_config={ - "hidden_size": 2048, - "intermediate_size": 5632, - "max_position_embeddings": 16384, - "num_attention_heads": 16, - "num_hidden_layers": 24, - "vocab_size": 102400, - }, - vision_config={ - "hidden_size": 1024, - "intermediate_size": 4096, - "image_size": 384, - "patch_size": 16, - "hidden_act": "gelu", - "vision_use_head": False, - "num_attention_heads": 16, - "num_hidden_layers": 24, - }, - ) - - # save config - if output_dir: - config.save_pretrained(output_dir) - print("Model config saved successfully...") - - # ------------------------------------------------------------ - # Convert processor - # ------------------------------------------------------------ - - image_processor = DeepseekVLImageProcessor( - image_mean=IMAGENET_STANDARD_MEAN, - image_std=IMAGENET_STANDARD_STD, - ) - - tokenizer = AutoTokenizer.from_pretrained( - input_path, - extra_special_tokens={ - "pad_token": "<|end▁of▁sentence|>", - "image_token": "", - }, - ) - - processor = DeepseekVLProcessor( - image_processor=image_processor, - tokenizer=tokenizer, - chat_template=CHAT_TEMPLATE, - ) - - if output_dir: - print(f"Saving processor to {output_dir}...") - processor.save_pretrained(output_dir) - if output_hub_path: - print(f"Pushing processor to hub at {output_hub_path}...") - processor.push_to_hub(output_hub_path) - - # ------------------------------------------------------------ - # Convert weights - # ------------------------------------------------------------ - - print("Creating empty model...") - with init_empty_weights(): - model = DeepseekVLForConditionalGeneration(config) - - # Load and convert state dict - print("Loading state dict...") - state_dict = load_model_state_dict(input_path) - state_dict = update_state_dict(state_dict) - - # Load converted state dict - print("Loading converted 
weights into model...") - info = model.load_state_dict(state_dict, strict=False, assign=True) - if len(info.missing_keys) > 0: - raise ValueError(f"Missing keys: {info.missing_keys}") - - # Tie weights before any device mapping - print("Tying weights...") - model.tie_weights() - - # Save the model - if output_dir: - print(f"Saving model to {output_dir}...") - model.save_pretrained(output_dir, safe_serialization=safe_serialization) - if output_hub_path: - print(f"Pushing model to hub at {output_hub_path}...") - model.push_to_hub(output_hub_path, safe_serialization=safe_serialization) - - del state_dict, model - gc.collect() - - # Validate the saved model if saved locally - if output_dir: - print("Reloading the local model to check if it's saved correctly...") - DeepseekVLForConditionalGeneration.from_pretrained(output_dir, device_map="auto") - print("Local model reloaded successfully.") - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--hf_repo_id", - default="deepseek-ai/deepseek-vl-1.3b-chat", - help="Location of official weights from DeepseekAI on HF", - ) - parser.add_argument( - "--output_dir", - default=None, - help="Location to write the converted model and processor", - ) - parser.add_argument( - "--output_hub_path", - default=None, - help="Repository ID to push model to hub (e.g. 'username/model-name')", - ) - parser.add_argument( - "--safe_serialization", default=True, type=bool, help="Whether or not to save using `safetensors`." - ) - args = parser.parse_args() - - convert_model( - hf_repo_id=args.hf_repo_id, - output_dir=args.output_dir, - output_hub_path=args.output_hub_path, - safe_serialization=args.safe_serialization, - ) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py index 7ab4e98012ac..12aa7caf892e 100644 --- a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py @@ -38,12 +38,7 @@ valid_images, validate_preprocess_arguments, ) -from ...utils import ( - TensorType, - filter_out_non_signature_kwargs, - is_vision_available, - logging, -) +from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging if is_vision_available(): @@ -358,7 +353,7 @@ def pad_to_square( background_color: Union[int, tuple[int, int, int]] = 0, data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> np.array: + ) -> np.ndarray: """ Pads an image to a square based on the longest edge. 
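As a reference for the `pad_to_square` docstring in the hunk above ("Pads an image to a square based on the longest edge."), here is a minimal, self-contained numpy sketch of that idea. It is not the `DeepseekVLImageProcessor` implementation: the function name is hypothetical, and the HWC layout, scalar background color, and centered placement are assumptions made purely for illustration.

import numpy as np


def pad_to_square_sketch(image: np.ndarray, background_color: int = 0) -> np.ndarray:
    """Pad an HWC image with `background_color` so that height == width == max(h, w)."""
    height, width = image.shape[:2]
    size = max(height, width)
    # Square canvas filled with the background color (assumption: single scalar fill value).
    canvas = np.full((size, size, image.shape[2]), background_color, dtype=image.dtype)
    # Center the original image on the canvas (assumption: the library may place it differently).
    top = (size - height) // 2
    left = (size - width) // 2
    canvas[top : top + height, left : left + width] = image
    return canvas


# Example: a 480x640 RGB image becomes a 640x640 square.
padded = pad_to_square_sketch(np.zeros((480, 640, 3), dtype=np.uint8), background_color=127)
assert padded.shape == (640, 640, 3)

Padding to the longest edge keeps the aspect ratio intact before any subsequent resize, at the cost of introducing background pixels.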
diff --git a/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py b/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py index 22d8e0928a6e..ce884da8d08b 100644 --- a/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py @@ -29,11 +29,7 @@ from ...modeling_outputs import ModelOutput from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack -from ...utils import ( - TransformersKwargs, - auto_docstring, - can_return_tuple, -) +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple from ..auto import AutoModel from .configuration_deepseek_vl import DeepseekVLConfig diff --git a/src/transformers/models/deepseek_vl_hybrid/convert_deepseek_vl_hybrid_weights_to_hf.py b/src/transformers/models/deepseek_vl_hybrid/convert_deepseek_vl_hybrid_weights_to_hf.py deleted file mode 100644 index 9f377a53c8f3..000000000000 --- a/src/transformers/models/deepseek_vl_hybrid/convert_deepseek_vl_hybrid_weights_to_hf.py +++ /dev/null @@ -1,394 +0,0 @@ -# coding=utf-8 -# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import gc -import json -import os -from typing import Optional - -import regex as re -import torch -from accelerate import init_empty_weights -from huggingface_hub import snapshot_download -from huggingface_hub.errors import HFValidationError -from safetensors.torch import load_file - -from transformers import ( - AutoTokenizer, - DeepseekVLHybridConfig, - DeepseekVLHybridForConditionalGeneration, - DeepseekVLHybridImageProcessor, - DeepseekVLHybridProcessor, -) -from transformers.image_utils import ( - IMAGENET_STANDARD_MEAN, - IMAGENET_STANDARD_STD, - OPENAI_CLIP_MEAN, - OPENAI_CLIP_STD, - PILImageResampling, -) - - -# fmt: off -ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - # # Sam (High Resolution) - r"vision_model.vision_tower_high.vision_tower.pos_embed": r"model.high_res_vision_model.vision_encoder.pos_embed", - r"vision_model.vision_tower_high.vision_tower.patch_embed.proj.(weight|bias)": r"model.high_res_vision_model.vision_encoder.patch_embed.projection.\1", - r"vision_model.vision_tower_high.vision_tower.blocks.(\d+).norm(\d+).(weight|bias)": r"model.high_res_vision_model.vision_encoder.layers.\1.layer_norm\2.\3", - r"vision_model.vision_tower_high.vision_tower.blocks.(\d+).attn.rel_pos_(h|w)": r"model.high_res_vision_model.vision_encoder.layers.\1.attn.rel_pos_\2", - r"vision_model.vision_tower_high.vision_tower.blocks.(\d+).attn.qkv.(weight|bias)": r"model.high_res_vision_model.vision_encoder.layers.\1.attn.qkv.\2", - r"vision_model.vision_tower_high.vision_tower.blocks.(\d+).attn.proj.(weight|bias)": r"model.high_res_vision_model.vision_encoder.layers.\1.attn.proj.\2", - r"vision_model.vision_tower_high.vision_tower.blocks.(\d+).mlp.lin(\d+).(weight|bias)": r"model.high_res_vision_model.vision_encoder.layers.\1.mlp.lin\2.\3", - 
r"vision_model.vision_tower_high.vision_tower.neck.0.weight": r"model.high_res_vision_model.vision_encoder.neck.conv1.weight", - r"vision_model.vision_tower_high.vision_tower.neck.1.(weight|bias)": r"model.high_res_vision_model.vision_encoder.neck.layer_norm1.\1", - r"vision_model.vision_tower_high.vision_tower.neck.2.weight": r"model.high_res_vision_model.vision_encoder.neck.conv2.weight", - r"vision_model.vision_tower_high.vision_tower.neck.3.(weight|bias)": r"model.high_res_vision_model.vision_encoder.neck.layer_norm2.\1", - r"vision_model.vision_tower_high.vision_tower.neck_hd.0.weight": r"model.high_res_vision_neck.conv1.weight", - r"vision_model.vision_tower_high.vision_tower.neck_hd.1.(weight|bias)": r"model.high_res_vision_neck.layer_norm1.\1", - r"vision_model.vision_tower_high.vision_tower.neck_hd.2.weight": r"model.high_res_vision_neck.conv2.weight", - r"vision_model.vision_tower_high.vision_tower.neck_hd.3.(weight|bias)": r"model.high_res_vision_neck.layer_norm2.\1", - r"vision_model.vision_tower_high.vision_tower.downsamples.0.weight": r"model.high_res_vision_proj.conv1.weight", - r"vision_model.vision_tower_high.vision_tower.downsamples.1.weight": r"model.high_res_vision_proj.conv2.weight", - r"vision_model.vision_tower_high.vision_tower.hd_alpha_downsamples": r"model.high_res_vision_alpha", - - # Siglip (Low Resolution) - r"vision_model.vision_tower_low.vision_tower.pos_embed": r"model.vision_model.vision_model.embeddings.position_embedding.weight", - r"vision_model.vision_tower_low.vision_tower.patch_embed.proj.(weight|bias)": r"model.vision_model.vision_model.embeddings.patch_embedding.\1", - r"vision_model.vision_tower_low.vision_tower.blocks.(\d+).attn.qkv.(weight|bias)": r"model.vision_model.vision_model.encoder.layers.\1.self_attn.(q|k|v)_proj.\2", - r"vision_model.vision_tower_low.vision_tower.blocks.(\d+).attn.proj.(weight|bias)": r"model.vision_model.vision_model.encoder.layers.\1.self_attn.out_proj.\2", - r"vision_model.vision_tower_low.vision_tower.blocks.(\d+).norm(\d+).(weight|bias)": r"model.vision_model.vision_model.encoder.layers.\1.layer_norm\2.\3", - r"vision_model.vision_tower_low.vision_tower.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"model.vision_model.vision_model.encoder.layers.\1.mlp.fc\2.\3", - r"vision_model.vision_tower_low.vision_tower.norm.(weight|bias)": r"model.vision_model.vision_model.post_layernorm.\1", - r"vision_model.vision_tower_low.vision_tower.attn_pool.latent": r"model.vision_model.vision_model.head.probe", - r"vision_model.vision_tower_low.vision_tower.attn_pool.proj.(weight|bias)": r"model.vision_model.vision_model.head.attention.out_proj.\1", - r"vision_model.vision_tower_low.vision_tower.attn_pool.norm.(weight|bias)": r"model.vision_model.vision_model.head.layernorm.\1", - r"vision_model.vision_tower_low.vision_tower.attn_pool.mlp.fc(\d+).(weight|bias)": r"model.vision_model.vision_model.head.mlp.fc\1.\2", - - # Vision Projection - r"aligner.layers.1.(weight|bias)": r"model.aligner.proj.\1", - r"aligner.low_up_proj.(weight|bias)": r"model.aligner.vision_proj.\1", - r"aligner.high_up_proj.(weight|bias)": r"model.aligner.high_res_vision_proj.\1", - - # Llama (Text Model) - r"language_model.model.(\w+)": r"model.language_model.\1", - r"language_model.lm_head.(weight|bias)": r"lm_head.\1", -} -# fmt: on - -# Adopted from https://github.com/deepseek-ai/DeepSeek-VL/blob/main/deepseek_vl/utils/conversation.py#L80-L91 -CHAT_TEMPLATE = ( - # Define separators and initialize counter - "{% set seps = ['\n\n', 
'<\uff5cend\u2581of\u2581sentence\uff5c>'] %}" - "{% set i = 0 %}" - # Start with default system prompt - "You are a helpful language and vision assistant. " - "You are able to understand the visual content that the user provides, " - "and assist the user with a variety of tasks using natural language.\n\n" - # Iterate through messages - "{% for message in messages %}" - # Identify user or assistant role - "{% if message['role']|lower == 'user' %}" - "User: " - "{% elif message['role']|lower == 'assistant' %}" - "Assistant:{% if not (loop.last and not add_generation_prompt and message['content'][0]['type']=='text' and message['content'][0]['text']=='') %} {% endif %}" - "{% else %}" - "{{ message['role'].capitalize() }}: " - "{% endif %}" - # Iterate through message content (text/images) - "{% for content in message['content'] %}" - # If content is an image, replace with placeholder - "{% if content['type'] == 'image' %}" - "" - # If content is text, handle formatting - "{% elif content['type'] == 'text' %}" - "{% set text = content['text'] %}" - # Strip whitespace for first and last text blocks - "{% if loop.first %}{% set text = text.lstrip() %}{% endif %}" - "{% if loop.last %}{% set text = text.rstrip() %}{% endif %}" - # If previous content was text, add space - "{% if not loop.first and message['content'][loop.index0-1]['type'] == 'text' %}" - "{{ ' ' + text }}" - "{% else %}" - "{{ text }}" - "{% endif %}" - "{% endif %}" - "{% endfor %}" # End message content loop - # Add separators between messages - "{% if not loop.last or add_generation_prompt %}" - "{% if message['role']|lower == 'user' %}" - "{{ seps[0] }}" - "{% else %}" - "{{ seps[1] }}" - "{% endif %}" - "{% endif %}" - "{% endfor %}" # End messages loop - # Add final Assistant prompt if required - "{% if add_generation_prompt %}Assistant:{% endif %}" -) - - -def convert_old_keys_to_new_keys(state_dict_keys: dict): - output_dict = {} - - old_text = "\n".join(state_dict_keys) - new_text = old_text - for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): - if replacement is None: - new_text = re.sub(pattern, "", new_text) # an empty line - continue - new_text = re.sub(pattern, replacement, new_text) - output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) - - return output_dict - - -def get_qkv_state_dict(key, parameter): - """ - new key which looks like this - xxxx.(q|k|v).xxx (m, n) - - is converted to - xxxx.q.xxxx (m//3, n) - xxxx.k.xxxx (m//3, n) - xxxx.v.xxxx (m//3, n) - """ - qkv_state_dict = {} - placeholder = re.search(r"(\(.*?\))", key).group(1) # finds "(query|key|value)" - replacements_keys = placeholder[1:-1].split("|") # creates ['query', 'key', 'value'] - replacements_vals = torch.split( - parameter, split_size_or_sections=parameter.size(0) // len(replacements_keys), dim=0 - ) - for replacement_key, replacement_val in zip(replacements_keys, replacements_vals): - qkv_state_dict[key.replace(placeholder, replacement_key)] = replacement_val - return qkv_state_dict - - -def update_state_dict(old_state_dict): - all_keys = list(old_state_dict.keys()) - new_keys = convert_old_keys_to_new_keys(all_keys) - - state_dict = {} - for key in all_keys: - new_key = new_keys[key] - current_parameter = old_state_dict.pop(key) - - if "qkv" in key and "vision_tower_high" not in key: - qkv_state_dict = get_qkv_state_dict(new_key, current_parameter) - state_dict.update(qkv_state_dict) - elif "pos_embed" in key: - if "vision_tower_high" not in key: - # timm implementation of siglip creates this param of 
size [1, 576, 1024] - # transformers implementation of siglip creates this param of size [576, 1024] - state_dict[new_key] = current_parameter.squeeze(0) - else: - state_dict[new_key] = current_parameter - else: - state_dict[new_key] = current_parameter - - return state_dict - - -def load_model_state_dict(input_path: str) -> dict: - """ - Load model state dict, handling both single and sharded files. - """ - index_path = os.path.join(input_path, "model.safetensors.index.json") - single_file_path = os.path.join(input_path, "model.safetensors") - - # Check if we have a sharded model - if os.path.exists(index_path): - print("Loading sharded model...") - state_dict = {} - with open(index_path, "r") as f: - index = json.load(f) - - # Get unique shard files and load each one only once - unique_shard_files = sorted(set(index["weight_map"].values())) - for shard_file in unique_shard_files: - print(f"Loading shard {shard_file}...") - shard_path = os.path.join(input_path, shard_file) - shard_dict = load_file(shard_path) - state_dict.update(shard_dict) - - return state_dict - - # Single file model - elif os.path.exists(single_file_path): - print("Loading single file model...") - return load_file(single_file_path, device="cpu") - - else: - raise ValueError(f"No model files found in {input_path}") - - -def convert_model( - hf_repo_id: str, - output_dir: Optional[str] = None, - output_hub_path: Optional[str] = None, - safe_serialization: bool = True, -): - if output_dir: - os.makedirs(output_dir, exist_ok=True) - - try: - input_path = snapshot_download(hf_repo_id) - except HFValidationError: - # If the input path is not a HF repo ID, assume it's a local path - input_path = hf_repo_id - - # ------------------------------------------------------------ - # Create and save config - # ------------------------------------------------------------ - - config = DeepseekVLHybridConfig( - text_config={ - "hidden_size": 4096, - "intermediate_size": 11008, - "max_position_embeddings": 16384, - "num_attention_heads": 32, - "num_hidden_layers": 30, - "vocab_size": 102400, - }, - vision_config={ - "hidden_size": 1024, - "intermediate_size": 4096, - "image_size": 384, - "patch_size": 16, - "hidden_act": "gelu", - "vision_use_head": False, - "num_attention_heads": 16, - "num_hidden_layers": 24, - }, - high_res_vision_config={ - "hidden_size": 768, - "intermediate_size": 3072, - "image_size": 1024, - "patch_size": 16, - "num_attention_heads": 12, - "num_hidden_layers": 12, - }, - ) - - # save config - if output_dir: - config.save_pretrained(output_dir) - print("Model config saved successfully...") - - # ------------------------------------------------------------ - # Convert processor - # ------------------------------------------------------------ - - image_processor = DeepseekVLHybridImageProcessor( - image_mean=IMAGENET_STANDARD_MEAN, - image_std=IMAGENET_STANDARD_STD, - high_res_image_mean=OPENAI_CLIP_MEAN, - high_res_image_std=OPENAI_CLIP_STD, - resample=PILImageResampling.BILINEAR, - ) - - tokenizer = AutoTokenizer.from_pretrained( - input_path, - extra_special_tokens={ - "pad_token": "<|end▁of▁sentence|>", - "image_token": "", - }, - ) - - processor = DeepseekVLHybridProcessor( - image_processor=image_processor, - tokenizer=tokenizer, - chat_template=CHAT_TEMPLATE, - ) - - if output_dir: - print(f"Saving processor to {output_dir}...") - processor.save_pretrained(output_dir) - if output_hub_path: - print(f"Pushing processor to hub at {output_hub_path}...") - processor.push_to_hub(output_hub_path) - - # 
------------------------------------------------------------ - # Convert weights - # ------------------------------------------------------------ - - print("Creating empty model...") - with init_empty_weights(): - model = DeepseekVLHybridForConditionalGeneration(config) - - # Load and convert state dict - print("Loading state dict...") - state_dict = load_model_state_dict(input_path) - state_dict = update_state_dict(state_dict) - - # Load converted state dict - print("Loading converted weights into model...") - info = model.load_state_dict(state_dict, strict=False, assign=True) - if len(info.missing_keys) > 0: - raise ValueError(f"Missing keys: {info.missing_keys}") - - # Tie weights before any device mapping - print("Tying weights...") - model.tie_weights() - - # Save the model - if output_dir: - print(f"Saving model to {output_dir}...") - model.save_pretrained(output_dir, safe_serialization=safe_serialization) - if output_hub_path: - print(f"Pushing model to hub at {output_hub_path}...") - model.push_to_hub(output_hub_path, safe_serialization=safe_serialization) - - del state_dict, model - gc.collect() - - # Validate the saved model if saved locally - if output_dir: - print("Reloading the local model to check if it's saved correctly...") - DeepseekVLHybridForConditionalGeneration.from_pretrained(output_dir, device_map="auto") - print("Local model reloaded successfully.") - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--hf_repo_id", - default="deepseek-ai/deepseek-vl-7b-chat", - help="Location of official weights from DeepseekAI on HF", - ) - parser.add_argument( - "--output_dir", - default=None, - help="Location to write the converted model and processor", - ) - parser.add_argument( - "--output_hub_path", - default=None, - help="Repository ID to push model to hub (e.g. 'username/model-name')", - ) - parser.add_argument( - "--safe_serialization", default=True, type=bool, help="Whether or not to save using `safetensors`." - ) - args = parser.parse_args() - - convert_model( - hf_repo_id=args.hf_repo_id, - output_dir=args.output_dir, - output_hub_path=args.output_hub_path, - safe_serialization=args.safe_serialization, - ) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py index 7c7d6df82424..865e13fa964f 100644 --- a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py @@ -39,12 +39,7 @@ valid_images, validate_preprocess_arguments, ) -from ...utils import ( - TensorType, - filter_out_non_signature_kwargs, - is_vision_available, - logging, -) +from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging if is_vision_available(): @@ -431,7 +426,7 @@ def pad_to_square( background_color: Union[int, tuple[int, int, int]] = 0, data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> np.array: + ) -> np.ndarray: """ Pads an image to a square based on the longest edge. 
diff --git a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py index db9c9ad987c1..c04e006e358d 100644 --- a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +++ b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py @@ -21,6 +21,7 @@ from typing import Optional, Union import torch +from torchvision.transforms.v2 import functional as F from ...image_processing_utils_fast import ( BaseImageProcessorFast, @@ -39,13 +40,7 @@ pil_torch_interpolation_mapping, ) from ...processing_utils import Unpack -from ...utils import TensorType, auto_docstring, is_torchvision_v2_available - - -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F +from ...utils import TensorType, auto_docstring class DeepseekVLHybridFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): diff --git a/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py index cae509e14d64..d9a85654e901 100644 --- a/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py @@ -29,11 +29,7 @@ from ...modeling_outputs import ModelOutput from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack -from ...utils import ( - TransformersKwargs, - auto_docstring, - can_return_tuple, -) +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple from ..auto import AutoModel from .configuration_deepseek_vl_hybrid import DeepseekVLHybridConfig diff --git a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py index d97b00f7fbd2..0da40603c2e9 100644 --- a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py @@ -16,6 +16,7 @@ import torch import torch.nn as nn +from torchvision.transforms.v2 import functional as F from ...cache_utils import Cache from ...image_processing_utils_fast import ( @@ -53,7 +54,6 @@ auto_docstring, can_return_tuple, filter_out_non_signature_kwargs, - is_torchvision_v2_available, logging, ) from ..auto import CONFIG_MAPPING, AutoConfig, AutoModel @@ -70,12 +70,6 @@ from ..sam.modeling_sam import SamLayerNorm, SamVisionNeck -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - - logger = logging.get_logger(__name__) diff --git a/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py b/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py deleted file mode 100644 index dbd7fa3f4d23..000000000000 --- a/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py +++ /dev/null @@ -1,236 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Deformable DETR checkpoints.""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DeformableDetrConfig, DeformableDetrForObjectDetection, DeformableDetrImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def rename_key(orig_key): - if "backbone.0.body" in orig_key: - orig_key = orig_key.replace("backbone.0.body", "backbone.conv_encoder.model") - if "transformer" in orig_key: - orig_key = orig_key.replace("transformer.", "") - if "norm1" in orig_key: - if "encoder" in orig_key: - orig_key = orig_key.replace("norm1", "self_attn_layer_norm") - else: - orig_key = orig_key.replace("norm1", "encoder_attn_layer_norm") - if "norm2" in orig_key: - if "encoder" in orig_key: - orig_key = orig_key.replace("norm2", "final_layer_norm") - else: - orig_key = orig_key.replace("norm2", "self_attn_layer_norm") - if "norm3" in orig_key: - orig_key = orig_key.replace("norm3", "final_layer_norm") - if "linear1" in orig_key: - orig_key = orig_key.replace("linear1", "fc1") - if "linear2" in orig_key: - orig_key = orig_key.replace("linear2", "fc2") - if "query_embed" in orig_key: - orig_key = orig_key.replace("query_embed", "query_position_embeddings") - if "cross_attn" in orig_key: - orig_key = orig_key.replace("cross_attn", "encoder_attn") - - return orig_key - - -def read_in_q_k_v(state_dict): - # transformer decoder self-attention layers - for i in range(6): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_deformable_detr_checkpoint( - checkpoint_path, - single_scale, - dilation, - with_box_refine, - two_stage, - pytorch_dump_folder_path, - push_to_hub, -): - """ - Copy/paste/tweak model's weights to our Deformable DETR structure. 
- """ - - # load default config - config = DeformableDetrConfig() - # set config attributes - if single_scale: - config.num_feature_levels = 1 - config.dilation = dilation - config.with_box_refine = with_box_refine - config.two_stage = two_stage - # set labels - config.num_labels = 91 - repo_id = "huggingface/label-files" - filename = "coco-detection-id2label.json" - id2label = json.loads(Path(hf_hub_download(repo_id, filename, repo_type="dataset")).read_text()) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - # load image processor - image_processor = DeformableDetrImageProcessor(format="coco_detection") - - # prepare image - img = prepare_img() - encoding = image_processor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - logger.info("Converting model...") - - # load original state dict - state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["model"] - # rename keys - for key in state_dict.copy(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - # query, key and value matrices need special treatment - read_in_q_k_v(state_dict) - # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them - prefix = "model." - for key in state_dict.copy(): - if not key.startswith("class_embed") and not key.startswith("bbox_embed"): - val = state_dict.pop(key) - state_dict[prefix + key] = val - # finally, create HuggingFace model and load state dict - model = DeformableDetrForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - - device = "cuda" if torch.cuda.is_available() else "cpu" - model.to(device) - # verify our conversion - outputs = model(pixel_values.to(device)) - - expected_logits = torch.tensor( - [[-9.6645, -4.3449, -5.8705], [-9.7035, -3.8504, -5.0724], [-10.5634, -5.3379, -7.5116]] - ) - expected_boxes = torch.tensor([[0.8693, 0.2289, 0.2492], [0.3150, 0.5489, 0.5845], [0.5563, 0.7580, 0.8518]]) - - if single_scale: - expected_logits = torch.tensor( - [[-9.9051, -4.2541, -6.4852], [-9.6947, -4.0854, -6.8033], [-10.0665, -5.8470, -7.7003]] - ) - expected_boxes = torch.tensor([[0.7292, 0.4991, 0.5532], [0.7959, 0.2426, 0.4236], [0.7582, 0.3518, 0.4451]]) - - if single_scale and dilation: - expected_logits = torch.tensor( - [[-8.9652, -4.1074, -5.6635], [-9.0596, -4.9447, -6.6075], [-10.1178, -4.5275, -6.2671]] - ) - expected_boxes = torch.tensor([[0.7665, 0.4130, 0.4769], [0.8364, 0.1841, 0.3391], [0.6261, 0.3895, 0.7978]]) - - if with_box_refine: - expected_logits = torch.tensor( - [[-8.8895, -5.4187, -6.8153], [-8.4706, -6.1668, -7.6184], [-9.0042, -5.5359, -6.9141]] - ) - expected_boxes = torch.tensor([[0.7828, 0.2208, 0.4323], [0.0892, 0.5996, 0.1319], [0.5524, 0.6389, 0.8914]]) - - if with_box_refine and two_stage: - expected_logits = torch.tensor( - [[-6.7108, -4.3213, -6.3777], [-8.9014, -6.1799, -6.7240], [-6.9315, -4.4735, -6.2298]] - ) - expected_boxes = torch.tensor([[0.2583, 0.5499, 0.4683], [0.7652, 0.9068, 0.4882], [0.5490, 0.2763, 0.0564]]) - - print("Logits:", outputs.logits[0, :3, :3]) - - assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4) - assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4) - - print("Everything ok!") - - # Save model and image processor - logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...") - 
Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - image_processor.save_pretrained(pytorch_dump_folder_path) - - # Push to hub - if push_to_hub: - model_name = "deformable-detr" - model_name += "-single-scale" if single_scale else "" - model_name += "-dc5" if dilation else "" - model_name += "-with-box-refine" if with_box_refine else "" - model_name += "-two-stage" if two_stage else "" - print("Pushing model to hub...") - model.push_to_hub(repo_path_or_name=model_name, organization="nielsr", commit_message="Add model") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_path", - type=str, - default="/home/niels/checkpoints/deformable_detr/r50_deformable_detr-checkpoint.pth", - help="Path to Pytorch checkpoint (.pth file) you'd like to convert.", - ) - parser.add_argument("--single_scale", action="store_true", help="Whether to set config.num_features_levels = 1.") - parser.add_argument("--dilation", action="store_true", help="Whether to set config.dilation=True.") - parser.add_argument("--with_box_refine", action="store_true", help="Whether to set config.with_box_refine=True.") - parser.add_argument("--two_stage", action="store_true", help="Whether to set config.two_stage=True.") - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=True, - help="Path to the folder to output PyTorch model.", - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." - ) - args = parser.parse_args() - convert_deformable_detr_checkpoint( - args.checkpoint_path, - args.single_scale, - args.dilation, - args.with_box_refine, - args.two_stage, - args.pytorch_dump_folder_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py index cd07f8db350b..8458d02d58a5 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py @@ -9,6 +9,7 @@ import torch from torchvision.io import read_image +from torchvision.transforms.v2 import functional as F from ...image_processing_utils import BatchFeature, get_size_dict from ...image_processing_utils_fast import ( @@ -32,17 +33,11 @@ validate_annotations, ) from ...processing_utils import Unpack -from ...utils import TensorType, auto_docstring, is_torchvision_v2_available, logging +from ...utils import TensorType, auto_docstring, logging from ...utils.import_utils import requires from .image_processing_deformable_detr import get_size_with_aspect_ratio -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - - logger = logging.get_logger(__name__) @@ -427,13 +422,7 @@ def resize_annotation( resample (`InterpolationMode`, defaults to `F.InterpolationMode.NEAREST_EXACT`): The resampling filter to use when resizing the masks. 
""" - interpolation = ( - interpolation - if interpolation is not None - else F.InterpolationMode.NEAREST_EXACT - if is_torchvision_v2_available() - else F.InterpolationMode.NEAREST - ) + interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST_EXACT ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)] new_annotation = {} diff --git a/src/transformers/models/deit/convert_deit_timm_to_pytorch.py b/src/transformers/models/deit/convert_deit_timm_to_pytorch.py deleted file mode 100644 index e7bf3e7a12e8..000000000000 --- a/src/transformers/models/deit/convert_deit_timm_to_pytorch.py +++ /dev/null @@ -1,218 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DeiT distilled checkpoints from the timm library.""" - -import argparse -import json -from pathlib import Path - -import requests -import timm -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DeiTConfig, DeiTForImageClassificationWithTeacher, DeiTImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config, base_model=False): - rename_keys = [] - for i in range(config.num_hidden_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"deit.encoder.layer.{i}.layernorm_before.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"deit.encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"deit.encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"blocks.{i}.attn.proj.bias", f"deit.encoder.layer.{i}.attention.output.dense.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"deit.encoder.layer.{i}.layernorm_after.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"deit.encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"deit.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"deit.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"deit.encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"deit.encoder.layer.{i}.output.dense.bias")) - - # projection layer + position embeddings - rename_keys.extend( - [ - ("cls_token", "deit.embeddings.cls_token"), - ("dist_token", "deit.embeddings.distillation_token"), - ("patch_embed.proj.weight", "deit.embeddings.patch_embeddings.projection.weight"), - ("patch_embed.proj.bias", "deit.embeddings.patch_embeddings.projection.bias"), - ("pos_embed", "deit.embeddings.position_embeddings"), - ] - ) - - if base_model: - # layernorm + pooler - rename_keys.extend( - [ - 
("norm.weight", "layernorm.weight"), - ("norm.bias", "layernorm.bias"), - ("pre_logits.fc.weight", "pooler.dense.weight"), - ("pre_logits.fc.bias", "pooler.dense.bias"), - ] - ) - - # if just the base model, we should remove "deit" from all keys that start with "deit" - rename_keys = [(pair[0], pair[1][4:]) if pair[1].startswith("deit") else pair for pair in rename_keys] - else: - # layernorm + classification heads - rename_keys.extend( - [ - ("norm.weight", "deit.layernorm.weight"), - ("norm.bias", "deit.layernorm.bias"), - ("head.weight", "cls_classifier.weight"), - ("head.bias", "cls_classifier.bias"), - ("head_dist.weight", "distillation_classifier.weight"), - ("head_dist.bias", "distillation_classifier.bias"), - ] - ) - - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config, base_model=False): - for i in range(config.num_hidden_layers): - if base_model: - prefix = "" - else: - prefix = "deit." - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ - : config.hidden_size, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_deit_checkpoint(deit_name, pytorch_dump_folder_path): - """ - Copy/paste/tweak model's weights to our DeiT structure. 
- """ - - # define default DeiT configuration - config = DeiTConfig() - # all deit models have fine-tuned heads - base_model = False - # dataset (fine-tuned on ImageNet 2012), patch_size and image_size - config.num_labels = 1000 - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - config.patch_size = int(deit_name[-6:-4]) - config.image_size = int(deit_name[-3:]) - # size of the architecture - if deit_name[9:].startswith("tiny"): - config.hidden_size = 192 - config.intermediate_size = 768 - config.num_hidden_layers = 12 - config.num_attention_heads = 3 - elif deit_name[9:].startswith("small"): - config.hidden_size = 384 - config.intermediate_size = 1536 - config.num_hidden_layers = 12 - config.num_attention_heads = 6 - if deit_name[9:].startswith("base"): - pass - elif deit_name[4:].startswith("large"): - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - - # load original model from timm - timm_model = timm.create_model(deit_name, pretrained=True) - timm_model.eval() - - # load state_dict of original model, remove and rename some keys - state_dict = timm_model.state_dict() - rename_keys = create_rename_keys(config, base_model) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config, base_model) - - # load HuggingFace model - model = DeiTForImageClassificationWithTeacher(config).eval() - model.load_state_dict(state_dict) - - # Check outputs on an image, prepared by DeiTImageProcessor - size = int( - (256 / 224) * config.image_size - ) # to maintain same ratio w.r.t. 224 images, see https://github.com/facebookresearch/deit/blob/ab5715372db8c6cad5740714b2216d55aeae052e/datasets.py#L103 - image_processor = DeiTImageProcessor(size=size, crop_size=config.image_size) - encoding = image_processor(images=prepare_img(), return_tensors="pt") - pixel_values = encoding["pixel_values"] - outputs = model(pixel_values) - - timm_logits = timm_model(pixel_values) - assert timm_logits.shape == outputs.logits.shape - assert torch.allclose(timm_logits, outputs.logits, atol=1e-3) - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {deit_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--deit_name", - default="vit_deit_base_distilled_patch16_224", - type=str, - help="Name of the DeiT timm model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." 
- ) - - args = parser.parse_args() - convert_deit_checkpoint(args.deit_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/deprecated/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py deleted file mode 100644 index 1f3d675e091d..000000000000 --- a/src/transformers/models/deprecated/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py +++ /dev/null @@ -1,318 +0,0 @@ -# coding=utf-8 -# Copyright 2020, The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Bort checkpoint.""" - -import argparse -import os - -import gluonnlp as nlp -import mxnet as mx -import numpy as np -import torch -from gluonnlp.base import get_home_dir -from gluonnlp.model.bert import BERTEncoder -from gluonnlp.model.utils import _load_vocab -from gluonnlp.vocab import Vocab -from packaging import version -from torch import nn - -from transformers import BertConfig, BertForMaskedLM, BertModel, RobertaTokenizer -from transformers.models.bert.modeling_bert import ( - BertIntermediate, - BertLayer, - BertOutput, - BertSelfAttention, - BertSelfOutput, -) -from transformers.utils import logging - - -if version.parse(nlp.__version__) != version.parse("0.8.3"): - raise Exception("requires gluonnlp == 0.8.3") - -if version.parse(mx.__version__) != version.parse("1.5.0"): - raise Exception("requires mxnet == 1.5.0") - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -SAMPLE_TEXT = "The Nymphenburg Palace is a beautiful palace in Munich!" 
- - -def convert_bort_checkpoint_to_pytorch(bort_checkpoint_path: str, pytorch_dump_folder_path: str): - """ - Convert the original Bort checkpoint (based on MXNET and Gluonnlp) to our BERT structure- - """ - - # Original Bort configuration - bort_4_8_768_1024_hparams = { - "attention_cell": "multi_head", - "num_layers": 4, - "units": 1024, - "hidden_size": 768, - "max_length": 512, - "num_heads": 8, - "scaled": True, - "dropout": 0.1, - "use_residual": True, - "embed_size": 1024, - "embed_dropout": 0.1, - "word_embed": None, - "layer_norm_eps": 1e-5, - "token_type_vocab_size": 2, - } - - predefined_args = bort_4_8_768_1024_hparams - - # Let's construct the original Bort model here - # Taken from official BERT implementation, see: - # https://github.com/alexa/bort/blob/master/bort/bort.py - encoder = BERTEncoder( - attention_cell=predefined_args["attention_cell"], - num_layers=predefined_args["num_layers"], - units=predefined_args["units"], - hidden_size=predefined_args["hidden_size"], - max_length=predefined_args["max_length"], - num_heads=predefined_args["num_heads"], - scaled=predefined_args["scaled"], - dropout=predefined_args["dropout"], - output_attention=False, - output_all_encodings=False, - use_residual=predefined_args["use_residual"], - activation=predefined_args.get("activation", "gelu"), - layer_norm_eps=predefined_args.get("layer_norm_eps", None), - ) - - # Vocab information needs to be fetched first - # It's the same as RoBERTa, so RobertaTokenizer can be used later - vocab_name = "openwebtext_ccnews_stories_books_cased" - - # Specify download folder to Gluonnlp's vocab - gluon_cache_dir = os.path.join(get_home_dir(), "models") - bort_vocab = _load_vocab(vocab_name, None, gluon_cache_dir, cls=Vocab) - - original_bort = nlp.model.BERTModel( - encoder, - len(bort_vocab), - units=predefined_args["units"], - embed_size=predefined_args["embed_size"], - embed_dropout=predefined_args["embed_dropout"], - word_embed=predefined_args["word_embed"], - use_pooler=False, - use_token_type_embed=False, - token_type_vocab_size=predefined_args["token_type_vocab_size"], - use_classifier=False, - use_decoder=False, - ) - - original_bort.load_parameters(bort_checkpoint_path, cast_dtype=True, ignore_extra=True) - params = original_bort._collect_params_with_prefix() - - # Build our config 🤗 - hf_bort_config_json = { - "architectures": ["BertForMaskedLM"], - "attention_probs_dropout_prob": predefined_args["dropout"], - "hidden_act": "gelu", - "hidden_dropout_prob": predefined_args["dropout"], - "hidden_size": predefined_args["embed_size"], - "initializer_range": 0.02, - "intermediate_size": predefined_args["hidden_size"], - "layer_norm_eps": predefined_args["layer_norm_eps"], - "max_position_embeddings": predefined_args["max_length"], - "model_type": "bort", - "num_attention_heads": predefined_args["num_heads"], - "num_hidden_layers": predefined_args["num_layers"], - "pad_token_id": 1, # 2 = BERT, 1 = RoBERTa - "type_vocab_size": 1, # 2 = BERT, 1 = RoBERTa - "vocab_size": len(bort_vocab), - } - - hf_bort_config = BertConfig.from_dict(hf_bort_config_json) - hf_bort_model = BertForMaskedLM(hf_bort_config) - hf_bort_model.eval() - - # Parameter mapping table (Gluonnlp to Transformers) - # * denotes layer index - # - # | Gluon Parameter | Transformers Parameter - # | -------------------------------------------------------------- | ---------------------- - # | `encoder.layer_norm.beta` | `bert.embeddings.LayerNorm.bias` - # | `encoder.layer_norm.gamma` | `bert.embeddings.LayerNorm.weight` - # | 
`encoder.position_weight` | `bert.embeddings.position_embeddings.weight` - # | `word_embed.0.weight` | `bert.embeddings.word_embeddings.weight` - # | `encoder.transformer_cells.*.attention_cell.proj_key.bias` | `bert.encoder.layer.*.attention.self.key.bias` - # | `encoder.transformer_cells.*.attention_cell.proj_key.weight` | `bert.encoder.layer.*.attention.self.key.weight` - # | `encoder.transformer_cells.*.attention_cell.proj_query.bias` | `bert.encoder.layer.*.attention.self.query.bias` - # | `encoder.transformer_cells.*.attention_cell.proj_query.weight` | `bert.encoder.layer.*.attention.self.query.weight` - # | `encoder.transformer_cells.*.attention_cell.proj_value.bias` | `bert.encoder.layer.*.attention.self.value.bias` - # | `encoder.transformer_cells.*.attention_cell.proj_value.weight` | `bert.encoder.layer.*.attention.self.value.weight` - # | `encoder.transformer_cells.*.ffn.ffn_2.bias` | `bert.encoder.layer.*.attention.output.dense.bias` - # | `encoder.transformer_cells.*.ffn.ffn_2.weight` | `bert.encoder.layer.*.attention.output.dense.weight` - # | `encoder.transformer_cells.*.layer_norm.beta` | `bert.encoder.layer.*.attention.output.LayerNorm.bias` - # | `encoder.transformer_cells.*.layer_norm.gamma` | `bert.encoder.layer.*.attention.output.LayerNorm.weight` - # | `encoder.transformer_cells.*.ffn.ffn_1.bias` | `bert.encoder.layer.*.intermediate.dense.bias` - # | `encoder.transformer_cells.*.ffn.ffn_1.weight` | `bert.encoder.layer.*.intermediate.dense.weight` - # | `encoder.transformer_cells.*.ffn.layer_norm.beta` | `bert.encoder.layer.*.output.LayerNorm.bias` - # | `encoder.transformer_cells.*.ffn.layer_norm.gamma` | `bert.encoder.layer.*.output.LayerNorm.weight` - # | `encoder.transformer_cells.*.proj.bias` | `bert.encoder.layer.*.output.dense.bias` - # | `encoder.transformer_cells.*.proj.weight` | `bert.encoder.layer.*.output.dense.weight` - - # Helper function to convert MXNET Arrays to PyTorch - def to_torch(mx_array) -> nn.Parameter: - return nn.Parameter(torch.FloatTensor(mx_array.data().asnumpy())) - - # Check param shapes and map new HF param back - def check_and_map_params(hf_param, gluon_param): - shape_hf = hf_param.shape - - gluon_param = to_torch(params[gluon_param]) - shape_gluon = gluon_param.shape - - assert shape_hf == shape_gluon, ( - f"The gluon parameter {gluon_param} has shape {shape_gluon}, but expects shape {shape_hf} for Transformers" - ) - - return gluon_param - - hf_bort_model.bert.embeddings.word_embeddings.weight = check_and_map_params( - hf_bort_model.bert.embeddings.word_embeddings.weight, "word_embed.0.weight" - ) - hf_bort_model.bert.embeddings.position_embeddings.weight = check_and_map_params( - hf_bort_model.bert.embeddings.position_embeddings.weight, "encoder.position_weight" - ) - hf_bort_model.bert.embeddings.LayerNorm.bias = check_and_map_params( - hf_bort_model.bert.embeddings.LayerNorm.bias, "encoder.layer_norm.beta" - ) - hf_bort_model.bert.embeddings.LayerNorm.weight = check_and_map_params( - hf_bort_model.bert.embeddings.LayerNorm.weight, "encoder.layer_norm.gamma" - ) - - # Inspired by RoBERTa conversion script, we just zero them out (Bort does not use them) - hf_bort_model.bert.embeddings.token_type_embeddings.weight.data = torch.zeros_like( - hf_bort_model.bert.embeddings.token_type_embeddings.weight.data - ) - - for i in range(hf_bort_config.num_hidden_layers): - layer: BertLayer = hf_bort_model.bert.encoder.layer[i] - - # self attention - self_attn: BertSelfAttention = layer.attention.self - - self_attn.key.bias.data = 
check_and_map_params( - self_attn.key.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_key.bias" - ) - - self_attn.key.weight.data = check_and_map_params( - self_attn.key.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_key.weight" - ) - self_attn.query.bias.data = check_and_map_params( - self_attn.query.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_query.bias" - ) - self_attn.query.weight.data = check_and_map_params( - self_attn.query.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_query.weight" - ) - self_attn.value.bias.data = check_and_map_params( - self_attn.value.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_value.bias" - ) - self_attn.value.weight.data = check_and_map_params( - self_attn.value.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_value.weight" - ) - - # self attention output - self_output: BertSelfOutput = layer.attention.output - - self_output.dense.bias = check_and_map_params( - self_output.dense.bias, f"encoder.transformer_cells.{i}.proj.bias" - ) - self_output.dense.weight = check_and_map_params( - self_output.dense.weight, f"encoder.transformer_cells.{i}.proj.weight" - ) - self_output.LayerNorm.bias = check_and_map_params( - self_output.LayerNorm.bias, f"encoder.transformer_cells.{i}.layer_norm.beta" - ) - self_output.LayerNorm.weight = check_and_map_params( - self_output.LayerNorm.weight, f"encoder.transformer_cells.{i}.layer_norm.gamma" - ) - - # intermediate - intermediate: BertIntermediate = layer.intermediate - - intermediate.dense.bias = check_and_map_params( - intermediate.dense.bias, f"encoder.transformer_cells.{i}.ffn.ffn_1.bias" - ) - intermediate.dense.weight = check_and_map_params( - intermediate.dense.weight, f"encoder.transformer_cells.{i}.ffn.ffn_1.weight" - ) - - # output - bert_output: BertOutput = layer.output - - bert_output.dense.bias = check_and_map_params( - bert_output.dense.bias, f"encoder.transformer_cells.{i}.ffn.ffn_2.bias" - ) - bert_output.dense.weight = check_and_map_params( - bert_output.dense.weight, f"encoder.transformer_cells.{i}.ffn.ffn_2.weight" - ) - bert_output.LayerNorm.bias = check_and_map_params( - bert_output.LayerNorm.bias, f"encoder.transformer_cells.{i}.ffn.layer_norm.beta" - ) - bert_output.LayerNorm.weight = check_and_map_params( - bert_output.LayerNorm.weight, f"encoder.transformer_cells.{i}.ffn.layer_norm.gamma" - ) - - # Save space and energy 🎄 - hf_bort_model.half() - - # Compare output of both models - tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base") - - input_ids = tokenizer.encode_plus(SAMPLE_TEXT)["input_ids"] - - # Get gluon output - gluon_input_ids = mx.nd.array([input_ids]) - output_gluon = original_bort(inputs=gluon_input_ids, token_types=[]) - - # Get Transformer output (save and reload model again) - hf_bort_model.save_pretrained(pytorch_dump_folder_path) - hf_bort_model = BertModel.from_pretrained(pytorch_dump_folder_path) - hf_bort_model.eval() - - input_ids = tokenizer.encode_plus(SAMPLE_TEXT, return_tensors="pt") - output_hf = hf_bort_model(**input_ids)[0] - - gluon_layer = output_gluon[0].asnumpy() - hf_layer = output_hf[0].detach().numpy() - - max_absolute_diff = np.max(np.abs(hf_layer - gluon_layer)).item() - success = np.allclose(gluon_layer, hf_layer, atol=1e-3) - - if success: - print("✔️ Both model do output the same tensors") - else: - print("❌ Both model do **NOT** output the same tensors") - print("Absolute difference is:", max_absolute_diff) - - -if __name__ == "__main__": 
- parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--bort_checkpoint_path", default=None, type=str, required=True, help="Path the official Bort params file." - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_bort_checkpoint_to_pytorch(args.bort_checkpoint_path, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/deprecated/deta/convert_deta_resnet_to_pytorch.py b/src/transformers/models/deprecated/deta/convert_deta_resnet_to_pytorch.py deleted file mode 100644 index 2a38bc05ccac..000000000000 --- a/src/transformers/models/deprecated/deta/convert_deta_resnet_to_pytorch.py +++ /dev/null @@ -1,319 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DETA checkpoints from the original repository. - -URL: https://github.com/jozhang97/DETA/tree/master""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DetaConfig, DetaForObjectDetection, DetaImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_deta_config(): - config = DetaConfig( - num_queries=900, - encoder_ffn_dim=2048, - decoder_ffn_dim=2048, - num_feature_levels=5, - assign_first_stage=True, - with_box_refine=True, - two_stage=True, - ) - - # set labels - config.num_labels = 91 - repo_id = "huggingface/label-files" - filename = "coco-detection-id2label.json" - id2label = json.loads(Path(hf_hub_download(repo_id, filename, repo_type="dataset")).read_text()) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - return config - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config): - rename_keys = [] - - # stem - # fmt: off - rename_keys.append(("backbone.0.body.conv1.weight", "model.backbone.model.embedder.embedder.convolution.weight")) - rename_keys.append(("backbone.0.body.bn1.weight", "model.backbone.model.embedder.embedder.normalization.weight")) - rename_keys.append(("backbone.0.body.bn1.bias", "model.backbone.model.embedder.embedder.normalization.bias")) - rename_keys.append(("backbone.0.body.bn1.running_mean", "model.backbone.model.embedder.embedder.normalization.running_mean")) - rename_keys.append(("backbone.0.body.bn1.running_var", "model.backbone.model.embedder.embedder.normalization.running_var")) - # stages - for stage_idx in range(len(config.backbone_config.depths)): - for layer_idx in range(config.backbone_config.depths[stage_idx]): - # shortcut - if layer_idx == 0: - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.0.weight", - 
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.convolution.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.weight", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.bias", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.bias", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_mean", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_mean", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_var", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_var", - ) - ) - # 3 convs - for i in range(3): - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.conv{i+1}.weight", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.convolution.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.weight", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.bias", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.bias", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_mean", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_mean", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_var", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_var", - ) - ) - # transformer encoder - for i in range(config.encoder_layers): - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.weight", f"model.encoder.layers.{i}.self_attn.sampling_offsets.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.bias", f"model.encoder.layers.{i}.self_attn.sampling_offsets.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.weight", f"model.encoder.layers.{i}.self_attn.attention_weights.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.bias", f"model.encoder.layers.{i}.self_attn.attention_weights.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.weight", f"model.encoder.layers.{i}.self_attn.value_proj.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.bias", f"model.encoder.layers.{i}.self_attn.value_proj.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.weight", f"model.encoder.layers.{i}.self_attn.output_proj.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.bias", f"model.encoder.layers.{i}.self_attn.output_proj.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm1.weight", f"model.encoder.layers.{i}.self_attn_layer_norm.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", 
f"model.encoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"model.encoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"model.encoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"model.encoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"model.encoder.layers.{i}.fc2.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"model.encoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"model.encoder.layers.{i}.final_layer_norm.bias")) - - # transformer decoder - for i in range(config.decoder_layers): - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.weight", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.bias", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.weight", f"model.decoder.layers.{i}.encoder_attn.attention_weights.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.bias", f"model.decoder.layers.{i}.encoder_attn.attention_weights.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.weight", f"model.decoder.layers.{i}.encoder_attn.value_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.bias", f"model.decoder.layers.{i}.encoder_attn.value_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.weight", f"model.decoder.layers.{i}.encoder_attn.output_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.bias", f"model.decoder.layers.{i}.encoder_attn.output_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm1.weight", f"model.decoder.layers.{i}.encoder_attn_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"model.decoder.layers.{i}.encoder_attn_layer_norm.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"model.decoder.layers.{i}.self_attn.out_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"model.decoder.layers.{i}.self_attn.out_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm2.weight", f"model.decoder.layers.{i}.self_attn_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm2.bias", f"model.decoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"model.decoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"model.decoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"model.decoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"model.decoder.layers.{i}.fc2.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"model.decoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"model.decoder.layers.{i}.final_layer_norm.bias")) - - # fmt: on - - return rename_keys - - -def 
rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -def read_in_decoder_q_k_v(state_dict, config): - # transformer decoder self-attention layers - hidden_size = config.d_model - for i in range(config.decoder_layers): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:hidden_size] - state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[hidden_size : hidden_size * 2] - state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-hidden_size:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_deta_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub): - """ - Copy/paste/tweak model's weights to our DETA structure. - """ - - # load config - config = get_deta_config() - - # load original state dict - if model_name == "deta-resnet-50": - filename = "adet_checkpoint0011.pth" - elif model_name == "deta-resnet-50-24-epochs": - filename = "adet_2x_checkpoint0023.pth" - else: - raise ValueError(f"Model name {model_name} not supported") - checkpoint_path = hf_hub_download(repo_id="nielsr/deta-checkpoints", filename=filename) - state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["model"] - - # rename keys - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_decoder_q_k_v(state_dict, config) - - # fix some prefixes - for key in state_dict.copy(): - if "transformer.decoder.class_embed" in key or "transformer.decoder.bbox_embed" in key: - val = state_dict.pop(key) - state_dict[key.replace("transformer.decoder", "model.decoder")] = val - if "input_proj" in key: - val = state_dict.pop(key) - state_dict["model." 
+ key] = val - if "level_embed" in key or "pos_trans" in key or "pix_trans" in key or "enc_output" in key: - val = state_dict.pop(key) - state_dict[key.replace("transformer", "model")] = val - - # finally, create HuggingFace model and load state dict - model = DetaForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - - device = "cuda" if torch.cuda.is_available() else "cpu" - model.to(device) - - # load image processor - processor = DetaImageProcessor(format="coco_detection") - - # verify our conversion on image - img = prepare_img() - encoding = processor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - outputs = model(pixel_values.to(device)) - - # verify logits - if model_name == "deta-resnet-50": - expected_logits = torch.tensor( - [[-7.3978, -2.5406, -4.1668], [-8.2684, -3.9933, -3.8096], [-7.0515, -3.7973, -5.8516]] - ) - expected_boxes = torch.tensor([[0.5043, 0.4973, 0.9998], [0.2542, 0.5489, 0.4748], [0.5490, 0.2765, 0.0570]]) - elif model_name == "deta-resnet-50-24-epochs": - expected_logits = torch.tensor( - [[-7.1688, -2.4857, -4.8669], [-7.8630, -3.8154, -4.2674], [-7.2730, -4.1865, -5.5323]] - ) - expected_boxes = torch.tensor([[0.5021, 0.4971, 0.9994], [0.2546, 0.5486, 0.4731], [0.1686, 0.1986, 0.2142]]) - - assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4) - assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4) - print("Everything ok!") - - if pytorch_dump_folder_path: - # Save model and processor - logger.info(f"Saving PyTorch model and processor to {pytorch_dump_folder_path}...") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - # Push to hub - if push_to_hub: - print("Pushing model and processor to hub...") - model.push_to_hub(f"jozhang97/{model_name}") - processor.push_to_hub(f"jozhang97/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name", - type=str, - default="deta-resnet-50", - choices=["deta-resnet-50", "deta-resnet-50-24-epochs"], - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the folder to output PyTorch model.", - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." - ) - args = parser.parse_args() - convert_deta_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/deprecated/deta/convert_deta_swin_to_pytorch.py b/src/transformers/models/deprecated/deta/convert_deta_swin_to_pytorch.py deleted file mode 100644 index a72c8c54221c..000000000000 --- a/src/transformers/models/deprecated/deta/convert_deta_swin_to_pytorch.py +++ /dev/null @@ -1,326 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DETA checkpoints from the original repository. - -URL: https://github.com/jozhang97/DETA/tree/master""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DetaConfig, DetaForObjectDetection, DetaImageProcessor, SwinConfig -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_deta_config(model_name): - backbone_config = SwinConfig( - embed_dim=192, - depths=(2, 2, 18, 2), - num_heads=(6, 12, 24, 48), - window_size=12, - out_features=["stage2", "stage3", "stage4"], - ) - - config = DetaConfig( - backbone_config=backbone_config, - num_queries=900, - encoder_ffn_dim=2048, - decoder_ffn_dim=2048, - num_feature_levels=5, - assign_first_stage=True, - with_box_refine=True, - two_stage=True, - ) - - # set labels - repo_id = "huggingface/label-files" - if "o365" in model_name: - num_labels = 366 - filename = "object365-id2label.json" - else: - num_labels = 91 - filename = "coco-detection-id2label.json" - - config.num_labels = num_labels - id2label = json.loads(Path(hf_hub_download(repo_id, filename, repo_type="dataset")).read_text()) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - return config - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config): - rename_keys = [] - - # stem - # fmt: off - rename_keys.append(("backbone.0.body.patch_embed.proj.weight", "model.backbone.model.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("backbone.0.body.patch_embed.proj.bias", "model.backbone.model.embeddings.patch_embeddings.projection.bias")) - rename_keys.append(("backbone.0.body.patch_embed.norm.weight", "model.backbone.model.embeddings.norm.weight")) - rename_keys.append(("backbone.0.body.patch_embed.norm.bias", "model.backbone.model.embeddings.norm.bias")) - # stages - for i in range(len(config.backbone_config.depths)): - for j in range(config.backbone_config.depths[i]): - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm1.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_before.weight")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm1.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_before.bias")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.relative_position_bias_table", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.relative_position_bias_table")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.relative_position_index", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.relative_position_index")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.proj.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.output.dense.weight")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.proj.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.output.dense.bias")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm2.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_after.weight")) - 
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm2.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_after.bias")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc1.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.intermediate.dense.weight")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc1.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.intermediate.dense.bias")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc2.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.output.dense.weight")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc2.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.output.dense.bias")) - - if i < 3: - rename_keys.append((f"backbone.0.body.layers.{i}.downsample.reduction.weight", f"model.backbone.model.encoder.layers.{i}.downsample.reduction.weight")) - rename_keys.append((f"backbone.0.body.layers.{i}.downsample.norm.weight", f"model.backbone.model.encoder.layers.{i}.downsample.norm.weight")) - rename_keys.append((f"backbone.0.body.layers.{i}.downsample.norm.bias", f"model.backbone.model.encoder.layers.{i}.downsample.norm.bias")) - - rename_keys.append(("backbone.0.body.norm1.weight", "model.backbone.model.hidden_states_norms.stage2.weight")) - rename_keys.append(("backbone.0.body.norm1.bias", "model.backbone.model.hidden_states_norms.stage2.bias")) - rename_keys.append(("backbone.0.body.norm2.weight", "model.backbone.model.hidden_states_norms.stage3.weight")) - rename_keys.append(("backbone.0.body.norm2.bias", "model.backbone.model.hidden_states_norms.stage3.bias")) - rename_keys.append(("backbone.0.body.norm3.weight", "model.backbone.model.hidden_states_norms.stage4.weight")) - rename_keys.append(("backbone.0.body.norm3.bias", "model.backbone.model.hidden_states_norms.stage4.bias")) - - # transformer encoder - for i in range(config.encoder_layers): - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.weight", f"model.encoder.layers.{i}.self_attn.sampling_offsets.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.bias", f"model.encoder.layers.{i}.self_attn.sampling_offsets.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.weight", f"model.encoder.layers.{i}.self_attn.attention_weights.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.bias", f"model.encoder.layers.{i}.self_attn.attention_weights.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.weight", f"model.encoder.layers.{i}.self_attn.value_proj.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.bias", f"model.encoder.layers.{i}.self_attn.value_proj.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.weight", f"model.encoder.layers.{i}.self_attn.output_proj.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.bias", f"model.encoder.layers.{i}.self_attn.output_proj.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm1.weight", f"model.encoder.layers.{i}.self_attn_layer_norm.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"model.encoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"model.encoder.layers.{i}.fc1.weight")) - 
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"model.encoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"model.encoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"model.encoder.layers.{i}.fc2.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"model.encoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"model.encoder.layers.{i}.final_layer_norm.bias")) - - # transformer decoder - for i in range(config.decoder_layers): - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.weight", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.bias", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.weight", f"model.decoder.layers.{i}.encoder_attn.attention_weights.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.bias", f"model.decoder.layers.{i}.encoder_attn.attention_weights.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.weight", f"model.decoder.layers.{i}.encoder_attn.value_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.bias", f"model.decoder.layers.{i}.encoder_attn.value_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.weight", f"model.decoder.layers.{i}.encoder_attn.output_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.bias", f"model.decoder.layers.{i}.encoder_attn.output_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm1.weight", f"model.decoder.layers.{i}.encoder_attn_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"model.decoder.layers.{i}.encoder_attn_layer_norm.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"model.decoder.layers.{i}.self_attn.out_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"model.decoder.layers.{i}.self_attn.out_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm2.weight", f"model.decoder.layers.{i}.self_attn_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm2.bias", f"model.decoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"model.decoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"model.decoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"model.decoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"model.decoder.layers.{i}.fc2.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"model.decoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"model.decoder.layers.{i}.final_layer_norm.bias")) - - # fmt: on - - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# we split up the matrix of each encoder layer into queries, keys and values -def 
read_in_swin_q_k_v(state_dict, backbone_config): - num_features = [int(backbone_config.embed_dim * 2**i) for i in range(len(backbone_config.depths))] - for i in range(len(backbone_config.depths)): - dim = num_features[i] - for j in range(backbone_config.depths[i]): - # fmt: off - # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"backbone.0.body.layers.{i}.blocks.{j}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"backbone.0.body.layers.{i}.blocks.{j}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.query.weight"] = in_proj_weight[:dim, :] - state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.query.bias"] = in_proj_bias[: dim] - state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.key.weight"] = in_proj_weight[ - dim : dim * 2, : - ] - state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.key.bias"] = in_proj_bias[ - dim : dim * 2 - ] - state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.value.weight"] = in_proj_weight[ - -dim :, : - ] - state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.value.bias"] = in_proj_bias[-dim :] - # fmt: on - - -def read_in_decoder_q_k_v(state_dict, config): - # transformer decoder self-attention layers - hidden_size = config.d_model - for i in range(config.decoder_layers): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:hidden_size] - state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[hidden_size : hidden_size * 2] - state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-hidden_size:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_deta_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub): - """ - Copy/paste/tweak model's weights to our DETA structure. 
- """ - - # load config - config = get_deta_config(model_name) - - # load original state dict - if model_name == "deta-swin-large": - checkpoint_path = hf_hub_download(repo_id="nielsr/deta-checkpoints", filename="adet_swin_ft.pth") - elif model_name == "deta-swin-large-o365": - checkpoint_path = hf_hub_download(repo_id="jozhang97/deta-swin-l-o365", filename="deta_swin_pt_o365.pth") - else: - raise ValueError(f"Model name {model_name} not supported") - - state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["model"] - - # original state dict - for name, param in state_dict.items(): - print(name, param.shape) - - # rename keys - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_swin_q_k_v(state_dict, config.backbone_config) - read_in_decoder_q_k_v(state_dict, config) - - # fix some prefixes - for key in state_dict.copy(): - if "transformer.decoder.class_embed" in key or "transformer.decoder.bbox_embed" in key: - val = state_dict.pop(key) - state_dict[key.replace("transformer.decoder", "model.decoder")] = val - if "input_proj" in key: - val = state_dict.pop(key) - state_dict["model." + key] = val - if "level_embed" in key or "pos_trans" in key or "pix_trans" in key or "enc_output" in key: - val = state_dict.pop(key) - state_dict[key.replace("transformer", "model")] = val - - # finally, create HuggingFace model and load state dict - model = DetaForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - - device = "cuda" if torch.cuda.is_available() else "cpu" - model.to(device) - - # load image processor - processor = DetaImageProcessor(format="coco_detection") - - # verify our conversion on image - img = prepare_img() - encoding = processor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - outputs = model(pixel_values.to(device)) - - # verify logits - print("Logits:", outputs.logits[0, :3, :3]) - print("Boxes:", outputs.pred_boxes[0, :3, :3]) - if model_name == "deta-swin-large": - expected_logits = torch.tensor( - [[-7.6308, -2.8485, -5.3737], [-7.2037, -4.5505, -4.8027], [-7.2943, -4.2611, -4.6617]] - ) - expected_boxes = torch.tensor([[0.4987, 0.4969, 0.9999], [0.2549, 0.5498, 0.4805], [0.5498, 0.2757, 0.0569]]) - elif model_name == "deta-swin-large-o365": - expected_logits = torch.tensor( - [[-8.0122, -3.5720, -4.9717], [-8.1547, -3.6886, -4.6389], [-7.6610, -3.6194, -5.0134]] - ) - expected_boxes = torch.tensor([[0.2523, 0.5549, 0.4881], [0.7715, 0.4149, 0.4601], [0.5503, 0.2753, 0.0575]]) - assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4) - assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4) - print("Everything ok!") - - if pytorch_dump_folder_path: - # Save model and processor - logger.info(f"Saving PyTorch model and processor to {pytorch_dump_folder_path}...") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - # Push to hub - if push_to_hub: - print("Pushing model and processor to hub...") - model.push_to_hub(f"jozhang97/{model_name}") - processor.push_to_hub(f"jozhang97/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name", - type=str, - default="deta-swin-large", - choices=["deta-swin-large", "deta-swin-large-o365"], - help="Name of the model you'd like to convert.", - ) - 
parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the folder to output PyTorch model.", - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." - ) - args = parser.parse_args() - convert_deta_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 7b1a4aa5f207..000000000000 --- a/src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,252 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Convert EfficientFormer checkpoints from the original repository. - -URL: https://github.com/snap-research/EfficientFormer -""" - -import argparse -import re -from pathlib import Path - -import requests -import torch -from PIL import Image -from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor - -from transformers import ( - EfficientFormerConfig, - EfficientFormerForImageClassificationWithTeacher, - EfficientFormerImageProcessor, -) -from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling - - -def rename_key(old_name, num_meta4D_last_stage): - new_name = old_name - - if "patch_embed" in old_name: - _, layer, param = old_name.split(".") - - if layer == "0": - new_name = old_name.replace("0", "convolution1") - elif layer == "1": - new_name = old_name.replace("1", "batchnorm_before") - elif layer == "3": - new_name = old_name.replace("3", "convolution2") - else: - new_name = old_name.replace("4", "batchnorm_after") - - if "network" in old_name and re.search(r"\d\.\d", old_name): - two_digit_num = r"\b\d{2}\b" - if bool(re.search(two_digit_num, old_name)): - match = re.search(r"\d\.\d\d.", old_name).group() - else: - match = re.search(r"\d\.\d.", old_name).group() - if int(match[0]) < 6: - trimmed_name = old_name.replace(match, "") - trimmed_name = trimmed_name.replace("network", match[0] + ".meta4D_layers.blocks." + match[2:-1]) - new_name = "intermediate_stages." + trimmed_name - else: - trimmed_name = old_name.replace(match, "") - if int(match[2]) < num_meta4D_last_stage: - trimmed_name = trimmed_name.replace("network", "meta4D_layers.blocks." + match[2]) - else: - layer_index = str(int(match[2]) - num_meta4D_last_stage) - trimmed_name = trimmed_name.replace("network", "meta3D_layers.blocks." 
+ layer_index) - if "norm1" in old_name: - trimmed_name = trimmed_name.replace("norm1", "layernorm1") - elif "norm2" in old_name: - trimmed_name = trimmed_name.replace("norm2", "layernorm2") - elif "fc1" in old_name: - trimmed_name = trimmed_name.replace("fc1", "linear_in") - elif "fc2" in old_name: - trimmed_name = trimmed_name.replace("fc2", "linear_out") - - new_name = "last_stage." + trimmed_name - - elif "network" in old_name and re.search(r".\d.", old_name): - new_name = old_name.replace("network", "intermediate_stages") - - if "fc" in new_name: - new_name = new_name.replace("fc", "convolution") - elif ("norm1" in new_name) and ("layernorm1" not in new_name): - new_name = new_name.replace("norm1", "batchnorm_before") - elif ("norm2" in new_name) and ("layernorm2" not in new_name): - new_name = new_name.replace("norm2", "batchnorm_after") - if "proj" in new_name: - new_name = new_name.replace("proj", "projection") - if "dist_head" in new_name: - new_name = new_name.replace("dist_head", "distillation_classifier") - elif "head" in new_name: - new_name = new_name.replace("head", "classifier") - elif "patch_embed" in new_name: - new_name = "efficientformer." + new_name - elif new_name == "norm.weight" or new_name == "norm.bias": - new_name = new_name.replace("norm", "layernorm") - new_name = "efficientformer." + new_name - else: - new_name = "efficientformer.encoder." + new_name - - return new_name - - -def convert_torch_checkpoint(checkpoint, num_meta4D_last_stage): - for key in checkpoint.copy(): - val = checkpoint.pop(key) - checkpoint[rename_key(key, num_meta4D_last_stage)] = val - - return checkpoint - - -# We will verify our results on a COCO image -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - return image - - -def convert_efficientformer_checkpoint( - checkpoint_path: Path, efficientformer_config_file: Path, pytorch_dump_path: Path, push_to_hub: bool -): - orig_state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["model"] - config = EfficientFormerConfig.from_json_file(efficientformer_config_file) - model = EfficientFormerForImageClassificationWithTeacher(config) - model_name = "_".join(checkpoint_path.split("/")[-1].split(".")[0].split("_")[:-1]) - - num_meta4D_last_stage = config.depths[-1] - config.num_meta3d_blocks + 1 - new_state_dict = convert_torch_checkpoint(orig_state_dict, num_meta4D_last_stage) - - model.load_state_dict(new_state_dict) - model.eval() - - pillow_resamplings = { - "bilinear": PILImageResampling.BILINEAR, - "bicubic": PILImageResampling.BICUBIC, - "nearest": PILImageResampling.NEAREST, - } - - # prepare image - image = prepare_img() - image_size = 256 - crop_size = 224 - processor = EfficientFormerImageProcessor( - size={"shortest_edge": image_size}, - crop_size={"height": crop_size, "width": crop_size}, - resample=pillow_resamplings["bicubic"], - ) - pixel_values = processor(images=image, return_tensors="pt").pixel_values - - # original processing pipeline - image_transforms = Compose( - [ - Resize(image_size, interpolation=pillow_resamplings["bicubic"]), - CenterCrop(crop_size), - ToTensor(), - Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), - ] - ) - original_pixel_values = image_transforms(image).unsqueeze(0) - - assert torch.allclose(original_pixel_values, pixel_values) - - outputs = model(pixel_values) - logits = outputs.logits - - expected_shape = (1, 1000) - - if "l1" in model_name: - expected_logits = torch.Tensor( 
- [-0.1312, 0.4353, -1.0499, -0.5124, 0.4183, -0.6793, -1.3777, -0.0893, -0.7358, -2.4328] - ) - assert torch.allclose(logits[0, :10], expected_logits, atol=1e-3) - assert logits.shape == expected_shape - elif "l3" in model_name: - expected_logits = torch.Tensor( - [-1.3150, -1.5456, -1.2556, -0.8496, -0.7127, -0.7897, -0.9728, -0.3052, 0.3751, -0.3127] - ) - assert torch.allclose(logits[0, :10], expected_logits, atol=1e-3) - assert logits.shape == expected_shape - elif "l7" in model_name: - expected_logits = torch.Tensor( - [-1.0283, -1.4131, -0.5644, -1.3115, -0.5785, -1.2049, -0.7528, 0.1992, -0.3822, -0.0878] - ) - assert logits.shape == expected_shape - else: - raise ValueError( - f"Unknown model checkpoint: {checkpoint_path}. Supported version of efficientformer are l1, l3 and l7" - ) - - # Save Checkpoints - Path(pytorch_dump_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_path) - print(f"Checkpoint successfully converted. Model saved at {pytorch_dump_path}") - processor.save_pretrained(pytorch_dump_path) - print(f"Processor successfully saved at {pytorch_dump_path}") - - if push_to_hub: - print("Pushing model to the hub...") - - model.push_to_hub( - repo_id=f"Bearnardd/{pytorch_dump_path}", - commit_message="Add model", - use_temp_dir=True, - ) - processor.push_to_hub( - repo_id=f"Bearnardd/{pytorch_dump_path}", - commit_message="Add image processor", - use_temp_dir=True, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--pytorch_model_path", - default=None, - type=str, - required=True, - help="Path to EfficientFormer pytorch checkpoint.", - ) - parser.add_argument( - "--config_file", - default=None, - type=str, - required=True, - help="The json file for EfficientFormer model config.", - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - - parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub") - parser.add_argument( - "--no-push_to_hub", - dest="push_to_hub", - action="store_false", - help="Do not push model and image processor to the hub", - ) - parser.set_defaults(push_to_hub=True) - - args = parser.parse_args() - convert_efficientformer_checkpoint( - checkpoint_path=args.pytorch_model_path, - efficientformer_config_file=args.config_file, - pytorch_dump_path=args.pytorch_dump_path, - push_to_hub=args.push_to_hub, - ) diff --git a/src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py deleted file mode 100644 index 76b9c9cf328c..000000000000 --- a/src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,181 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
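# Conversion strategy used below (summary drawn from the code itself): read the TF checkpoint with
# tf.train.load_checkpoint, cast each tensor to float16, skip the optimizer slots ("/adam_m",
# "/adam_v"), map every variable name onto the corresponding HF module name, transpose 2-D kernels
# with vnp.transpose([1, 0]), and save the resulting torch state dict with torch.save.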
- -"""Convert GPTSANJapanese checkpoints from the original repository to pytorch model.""" - -import argparse -import json -import os -from collections import OrderedDict - -import numpy as np -import tensorflow as tf -import torch - - -def convert_tf_gptsan_to_pt(args): - parameter_file = os.path.join(args.tf_model_dir, "parameters.json") - params = json.loads(open(parameter_file).read()) - if not params: - raise ValueError( - f"It seems that the json file at {parameter_file} is empty. Make sure you have a correct json file." - ) - if not args.output.endswith(".pt"): - args.output = args.output + ".pt" - new_state = OrderedDict() - with tf.device("/CPU:0"): - reader = tf.train.load_checkpoint(args.tf_model_dir) - shapes = reader.get_variable_to_shape_map() - for key_name in shapes: - vnp = reader.get_tensor(key_name).astype(np.float16) - if key_name.endswith("/adam_m") or key_name.endswith("/adam_v"): - continue - if key_name.startswith("pasts/"): - if key_name.startswith("pasts/mlp"): - player = int(key_name[9]) - elif key_name.startswith("pasts/out"): - player = 8 - name = "model.sqout.%d.weight" % (player * 2) # enter to nn.Sequential with Tanh, so 2 at a time - state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name.startswith("model/moe"): - player = int(key_name[9:].split("/")[0]) - if key_name.endswith("/switch_gating/kernel"): - name = "model.blocks.%d.feed_forward.mlp.router.classifier.weight" % player - state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name.endswith("/softmlp/kernel"): - name = "model.blocks.%d.feed_forward.soft_bypass_mlp.weight" % player - state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name.endswith("/wo/kernel") or key_name.endswith("/wi/kernel"): - nlayer = key_name[-9:-7] - for i in range(16): - name = "model.blocks.%d.feed_forward.mlp.experts.expert_%d.%s.weight" % (player, i, nlayer) - state = ( - vnp[i].transpose([1, 0]).copy() - ) # In Mesh-Tensorflow, it is one array, so it is divided - new_state[name] = torch.tensor(state) - elif key_name.startswith("model/mlp"): - player = int(key_name[9:].split("/")[0]) - if key_name.endswith("/p1/kernel"): - name = "model.blocks.%d.feed_forward.mlp.wi.weight" % player - state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name.endswith("/p1/bias"): - name = "model.blocks.%d.feed_forward.mlp.wi.bias" % player - state = vnp.copy() # same because it is one dimensional - new_state[name] = torch.tensor(state) - elif key_name.endswith("/p2/kernel"): - name = "model.blocks.%d.feed_forward.mlp.wo.weight" % player - state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name.endswith("/p2/bias"): - name = "model.blocks.%d.feed_forward.mlp.wo.bias" % player - state = vnp.copy() # same because it is one dimensional - new_state[name] = torch.tensor(state) - elif key_name.startswith("model/ln"): - player = int(key_name[8:].split("/")[0]) - if key_name.endswith("/b"): - name = "model.blocks.%d.feed_forward.norm.bias" % player - state = vnp.copy() # same because it is one dimensional - new_state[name] = torch.tensor(state) - elif key_name.endswith("/g"): - name = "model.blocks.%d.feed_forward.norm.weight" % player - state = vnp.copy() # same because it is 
one dimensional - new_state[name] = torch.tensor(state) - elif key_name.startswith("model/att"): - player = int(key_name[9:].split("/")[0]) - if key_name.endswith("/qkv/kernel"): - state = vnp.copy() # Compute same dimension as Mesh-tensorflow using einsum - state_q = state[:, 0, :, :] - state_k = state[:, 1, :, :] - state_v = state[:, 2, :, :] - state_q = ( - state_q.reshape([state_q.shape[0], state_q.shape[1] * state_q.shape[2]]) - .transpose([1, 0]) - .copy() - ) # Mesh-Tensorflow is a diagonal matrix - state_k = ( - state_k.reshape([state_k.shape[0], state_k.shape[1] * state_k.shape[2]]) - .transpose([1, 0]) - .copy() - ) # Mesh-Tensorflow is a diagonal matrix - state_v = ( - state_v.reshape([state_v.shape[0], state_v.shape[1] * state_v.shape[2]]) - .transpose([1, 0]) - .copy() - ) # Mesh-Tensorflow is a diagonal matrix - name = "model.blocks.%d.self_attn.self_attn.q_proj.weight" % player - new_state[name] = torch.tensor(state_q) - name = "model.blocks.%d.self_attn.self_attn.k_proj.weight" % player - new_state[name] = torch.tensor(state_k) - name = "model.blocks.%d.self_attn.self_attn.v_proj.weight" % player - new_state[name] = torch.tensor(state_v) - elif key_name.endswith("/o/kernel"): - name = "model.blocks.%d.self_attn.self_attn.out_proj.weight" % player - state = ( - vnp.reshape([vnp.shape[0] * vnp.shape[1], vnp.shape[2]]).transpose([1, 0]).copy() - ) # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name.startswith("model/an"): - player = int(key_name[8:].split("/")[0]) - if key_name.endswith("/b"): - name = "model.blocks.%d.self_attn.norm.bias" % player - state = vnp.copy() # same because it is one dimensional - new_state[name] = torch.tensor(state) - elif key_name.endswith("/g"): - name = "model.blocks.%d.self_attn.norm.weight" % player - state = vnp.copy() # same because it is one dimensional - new_state[name] = torch.tensor(state) - elif ( - key_name.startswith("model/wte") - or key_name.startswith("model/wpe") - or key_name.startswith("model/ete") - ): - nlayer = {"wte": "embed_tokens", "wpe": "position_embeddings", "ete": "extra_position_embeddings"}[ - key_name[-3:] - ] - name = "model.%s.weight" % nlayer - state = vnp.copy() # same in embedded - new_state[name] = torch.tensor(state) - if key_name.startswith("model/wte"): - name = "lm_head.weight" - state = vnp.copy() # same in embedded - new_state[name] = torch.tensor(state) - elif key_name.startswith("model/wob"): - name = "final_logits_bias" - state = vnp.copy() # same in embedded - state = state.reshape((1, -1)) - new_state[name] = torch.tensor(state) - elif key_name == "model/dense/kernel": - name = "model.last_project.weight" - state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name == "model/dense_1/bias": - name = "model.last_project.bias" - state = vnp.copy() # same because it is one dimensional - new_state[name] = torch.tensor(state) - torch.save(new_state, args.output) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="model converter.", formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument("--tf_model_dir", metavar="PATH", type=str, required=True, help="import model") - parser.add_argument("--output", metavar="PATH", type=str, required=True, help="output model") - args = parser.parse_args() - convert_tf_gptsan_to_pt(args) diff --git a/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py 
b/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py index c67b27f64fa1..1025fdf75fb4 100644 --- a/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py +++ b/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py @@ -495,7 +495,7 @@ def checku2e(x): candidates.append((self.vocab[wd], wd, e)) if len(candidates) > 0: # the smallest token_id is adopted - _, wd, e = sorted(candidates, key=lambda x: x[0])[0] + _, wd, e = min(candidates, key=lambda x: x[0]) result.append(wd) pos = e else: diff --git a/src/transformers/models/deprecated/graphormer/collating_graphormer.py b/src/transformers/models/deprecated/graphormer/collating_graphormer.py index 19bcaac3f572..88657bab435d 100644 --- a/src/transformers/models/deprecated/graphormer/collating_graphormer.py +++ b/src/transformers/models/deprecated/graphormer/collating_graphormer.py @@ -14,7 +14,7 @@ import pyximport pyximport.install(setup_args={"include_dirs": np.get_include()}) - from . import algos_graphormer # noqa E402 + from . import algos_graphormer def convert_to_single_emb(x, offset: int = 512): diff --git a/src/transformers/models/deprecated/jukebox/convert_jukebox.py b/src/transformers/models/deprecated/jukebox/convert_jukebox.py deleted file mode 100644 index 29763daaa30a..000000000000 --- a/src/transformers/models/deprecated/jukebox/convert_jukebox.py +++ /dev/null @@ -1,279 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Jukebox checkpoints""" - -import argparse -import json -import os -from pathlib import Path - -import requests -import torch - -from transformers import JukeboxConfig, JukeboxModel -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -PREFIX = "https://openaipublic.azureedge.net/jukebox/models/" -MODEL_MAPPING = { - "jukebox-1b-lyrics": [ - "5b/vqvae.pth.tar", - "5b/prior_level_0.pth.tar", - "5b/prior_level_1.pth.tar", - "1b_lyrics/prior_level_2.pth.tar", - ], - "jukebox-5b-lyrics": [ - "5b/vqvae.pth.tar", - "5b/prior_level_0.pth.tar", - "5b/prior_level_1.pth.tar", - "5b_lyrics/prior_level_2.pth.tar", - ], -} - - -def replace_key(key): - if key.endswith(".model.1.bias") and len(key.split(".")) > 10: - key = key.replace(".model.1.bias", ".conv1d_1.bias") - elif key.endswith(".model.1.weight") and len(key.split(".")) > 10: - key = key.replace(".model.1.weight", ".conv1d_1.weight") - elif key.endswith(".model.3.bias") and len(key.split(".")) > 10: - key = key.replace(".model.3.bias", ".conv1d_2.bias") - elif key.endswith(".model.3.weight") and len(key.split(".")) > 10: - key = key.replace(".model.3.weight", ".conv1d_2.weight") - - if "conditioner_blocks.0." in key: - key = key.replace("conditioner_blocks.0", "conditioner_blocks") - - if "prime_prior" in key: - key = key.replace("prime_prior", "encoder") - - if ".emb." 
in key and "total" not in key and "absolute" not in key and "relative" not in key: - key = key.replace(".emb.", ".") - - if key.endswith("k"): # replace vqvae.X.k with vqvae.X.codebook - return key.replace(".k", ".codebook") - if "y_emb." in key: - return key.replace("y_emb.", "metadata_embedding.") - - if "x_emb.emb." in key: - key = key.replace("0.x_emb.emb", "embed_tokens") - - if "prime_state_ln" in key: - return key.replace("prime_state_ln", "encoder.final_layer_norm") - if ".ln" in key: - return key.replace(".ln", ".layer_norm") - if "_ln" in key: - return key.replace("_ln", "_layer_norm") - - if "prime_state_proj" in key: - return key.replace("prime_state_proj", "encoder.proj_in") - if "prime_x_out" in key: - return key.replace("prime_x_out", "encoder.lm_head") - if "prior.x_out" in key: - return key.replace("x_out", "fc_proj_out") - if "x_emb" in key: - return key.replace("x_emb", "embed_tokens") - - return key - - -def fix_jukebox_keys(state_dict, model_state_dict, key_prefix, mapping): - new_dict = {} - import re - - re_encoder_block_conv_in = re.compile(r"encoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).(bias|weight)") - re_encoder_block_resnet = re.compile( - r"encoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).model.(\d*).model.(\d*).(bias|weight)" - ) - re_encoder_block_proj_out = re.compile(r"encoders.(\d*).level_blocks.(\d*).model.(\d*).(bias|weight)") - - re_decoder_block_conv_out = re.compile(r"decoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).(bias|weight)") - re_decoder_block_resnet = re.compile( - r"decoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).model.(\d*).model.(\d*).(bias|weight)" - ) - re_decoder_block_proj_in = re.compile(r"decoders.(\d*).level_blocks.(\d*).model.(\d*).(bias|weight)") - - re_prior_cond_conv_out = re.compile(r"conditioner_blocks.(\d*).cond.model.(\d*).(\d).(bias|weight)") - re_prior_cond_resnet = re.compile( - r"conditioner_blocks.(\d*).cond.model.(\d*).(\d).model.(\d*).model.(\d*).(bias|weight)" - ) - re_prior_cond_proj_in = re.compile(r"conditioner_blocks.(\d*).cond.model.(\d*).(bias|weight)") - - for original_key, value in state_dict.items(): - # rename vqvae.encoder keys - if re_encoder_block_conv_in.fullmatch(original_key): - regex_match = re_encoder_block_conv_in.match(original_key) - groups = regex_match.groups() - block_index = int(groups[2]) * 2 + int(groups[3]) - re_new_key = f"encoders.{groups[0]}.level_blocks.{groups[1]}.downsample_block.{block_index}.{groups[-1]}" - key = re_encoder_block_conv_in.sub(re_new_key, original_key) - - elif re_encoder_block_resnet.fullmatch(original_key): - regex_match = re_encoder_block_resnet.match(original_key) - groups = regex_match.groups() - block_index = int(groups[2]) * 2 + int(groups[3]) - conv_index = {"1": 1, "3": 2}[groups[-2]] - prefix = f"encoders.{groups[0]}.level_blocks.{groups[1]}.downsample_block.{block_index}." 
- resnet_block = f"resnet_block.{groups[-3]}.conv1d_{conv_index}.{groups[-1]}" - re_new_key = prefix + resnet_block - key = re_encoder_block_resnet.sub(re_new_key, original_key) - - elif re_encoder_block_proj_out.fullmatch(original_key): - regex_match = re_encoder_block_proj_out.match(original_key) - groups = regex_match.groups() - re_new_key = f"encoders.{groups[0]}.level_blocks.{groups[1]}.proj_out.{groups[-1]}" - key = re_encoder_block_proj_out.sub(re_new_key, original_key) - - # rename vqvae.decoder keys - elif re_decoder_block_conv_out.fullmatch(original_key): - regex_match = re_decoder_block_conv_out.match(original_key) - groups = regex_match.groups() - block_index = int(groups[2]) * 2 + int(groups[3]) - 2 - re_new_key = f"decoders.{groups[0]}.level_blocks.{groups[1]}.upsample_block.{block_index}.{groups[-1]}" - key = re_decoder_block_conv_out.sub(re_new_key, original_key) - - elif re_decoder_block_resnet.fullmatch(original_key): - regex_match = re_decoder_block_resnet.match(original_key) - groups = regex_match.groups() - block_index = int(groups[2]) * 2 + int(groups[3]) - 2 - conv_index = {"1": 1, "3": 2}[groups[-2]] - prefix = f"decoders.{groups[0]}.level_blocks.{groups[1]}.upsample_block.{block_index}." - resnet_block = f"resnet_block.{groups[-3]}.conv1d_{conv_index}.{groups[-1]}" - re_new_key = prefix + resnet_block - key = re_decoder_block_resnet.sub(re_new_key, original_key) - - elif re_decoder_block_proj_in.fullmatch(original_key): - regex_match = re_decoder_block_proj_in.match(original_key) - groups = regex_match.groups() - re_new_key = f"decoders.{groups[0]}.level_blocks.{groups[1]}.proj_in.{groups[-1]}" - key = re_decoder_block_proj_in.sub(re_new_key, original_key) - - # rename prior cond.model to upsampler.upsample_block and resnet - elif re_prior_cond_conv_out.fullmatch(original_key): - regex_match = re_prior_cond_conv_out.match(original_key) - groups = regex_match.groups() - block_index = int(groups[1]) * 2 + int(groups[2]) - 2 - re_new_key = f"conditioner_blocks.upsampler.upsample_block.{block_index}.{groups[-1]}" - key = re_prior_cond_conv_out.sub(re_new_key, original_key) - - elif re_prior_cond_resnet.fullmatch(original_key): - regex_match = re_prior_cond_resnet.match(original_key) - groups = regex_match.groups() - block_index = int(groups[1]) * 2 + int(groups[2]) - 2 - conv_index = {"1": 1, "3": 2}[groups[-2]] - prefix = f"conditioner_blocks.upsampler.upsample_block.{block_index}." 
- resnet_block = f"resnet_block.{groups[-3]}.conv1d_{conv_index}.{groups[-1]}" - re_new_key = prefix + resnet_block - key = re_prior_cond_resnet.sub(re_new_key, original_key) - - elif re_prior_cond_proj_in.fullmatch(original_key): - regex_match = re_prior_cond_proj_in.match(original_key) - groups = regex_match.groups() - re_new_key = f"conditioner_blocks.upsampler.proj_in.{groups[-1]}" - key = re_prior_cond_proj_in.sub(re_new_key, original_key) - - # keep original key - else: - key = original_key - - key = replace_key(key) - - if f"{key_prefix}.{key}" not in model_state_dict or key is None: - print(f"failed converting {original_key} to {key}, does not match") - - # handle mismatched shape - elif value.shape != model_state_dict[f"{key_prefix}.{key}"].shape: - val = model_state_dict[f"{key_prefix}.{key}"] - print(f"{original_key}-> {key} : \nshape {val.shape} and {value.shape}, do not match") - key = original_key - - mapping[key] = original_key - new_dict[key] = value - - return new_dict - - -@torch.no_grad() -def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None): - """ - Copy/paste/tweak model's weights to our Jukebox structure. - """ - for file in MODEL_MAPPING[model_name]: - if not os.path.isfile(f"{pytorch_dump_folder_path}/{file.split('/')[-1]}"): - r = requests.get(f"{PREFIX}{file}", allow_redirects=True) - os.makedirs(f"{pytorch_dump_folder_path}/", exist_ok=True) - open(f"{pytorch_dump_folder_path}/{file.split('/')[-1]}", "wb").write(r.content) - - model_to_convert = MODEL_MAPPING[model_name.split("/")[-1]] - - config = JukeboxConfig.from_pretrained(model_name) - model = JukeboxModel(config) - - weight_dict = [] - mapping = {} - for i, dict_name in enumerate(model_to_convert): - old_dic = torch.load(f"{pytorch_dump_folder_path}/{dict_name.split('/')[-1]}", weights_only=True)["model"] - - new_dic = {} - for k in old_dic: - if k.endswith(".b"): - new_dic[k.replace("b", "bias")] = old_dic[k] - elif k.endswith(".w"): - new_dic[k.replace("w", "weight")] = old_dic[k] - elif "level_2" not in dict_name and "cond.model." 
in k: - new_dic[k.replace(".blocks.", ".model.")] = old_dic[k] - else: - new_dic[k] = old_dic[k] - - key_prefix = "vqvae" if i == 0 else f"priors.{3 - i}" - new_dic = fix_jukebox_keys(new_dic, model.state_dict(), key_prefix, mapping) - weight_dict.append(new_dic) - - vqvae_state_dict = weight_dict.pop(0) - model.vqvae.load_state_dict(vqvae_state_dict) - for i in range(len(weight_dict)): - model.priors[i].load_state_dict(weight_dict[2 - i]) - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - with open(f"{pytorch_dump_folder_path}/mapping.json", "w") as txtfile: - json.dump(mapping, txtfile) - - print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - - return weight_dict - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="jukebox-5b-lyrics", - type=str, - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default="jukebox-5b-lyrics-converted", - type=str, - help="Path to the output PyTorch model directory.", - ) - args = parser.parse_args() - convert_openai_checkpoint(args.model_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/deprecated/mctct/modeling_mctct.py b/src/transformers/models/deprecated/mctct/modeling_mctct.py index 253b09c1c43c..16f59d3d1dfa 100755 --- a/src/transformers/models/deprecated/mctct/modeling_mctct.py +++ b/src/transformers/models/deprecated/mctct/modeling_mctct.py @@ -96,7 +96,7 @@ def __init__(self, config): def forward(self, input_features): # NOTE: in reference to the NOTE in __init__, right now it just calculates padding as if # there will be just one conv layer. - padding = sum([size // 2 for size in self.kernel_size]) # (7, 7) -> (3, 3) + padding = sum(size // 2 for size in self.kernel_size) # (7, 7) -> (3, 3) input_features = torch.nn.functional.pad(input_features, (0, 0, padding, padding), "constant", 0) hidden_states = input_features.transpose(1, 2).contiguous() # -> Batch x Frame x Time diff --git a/src/transformers/models/deprecated/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 3a0f7cead0ee..000000000000 --- a/src/transformers/models/deprecated/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,298 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Convert Mega pretrained checkpoint. Built to convert the Masked LM checkpoint located at -https://huggingface.co/mnaylor/mega-wikitext-103 - -Requirements: - - clone the Mega repo and install fairseq from there - 1. git clone https://github.com/facebookresearch/mega.git - 2. 
cd mega && pip install -e - - clone the pretrained weights for the original implementation from the hugging face repo - * use this location as the path for pretrained weights -""" - -import argparse - -# utilities to import the model weights and config file -import os -import pickle as pkl - -# PyTorch + new model classes -import torch -from torch import nn - -from transformers import AutoTokenizer, MegaConfig, MegaForMaskedLM - - -# import the EncoderLayer class used to pretrain -# !! NOTE !! this requires the version of fairseq that is built when you install the Mega source -try: - from fairseq.modules.mega_layer import MegaEncoderLayer -except ImportError: - raise ImportError("You need to install the version of fairseq from the Mega repo!") - - -# define the wrapper classes used to train the MLM (see colab notebook below) -# https://colab.research.google.com/drive/1qfUO6o5HRdxBblWlw058HVyvaEPhPpH8?usp=sharing -# MegaLM outputs hidden states -class MegaLM(nn.Module): - "The base class for our Mega encoder - given input IDs, embed text and return encoder output" - - def __init__(self, mega_args, depth, vocab_size): - super().__init__() - self.mega_args = mega_args - self.embedding_layer = nn.Embedding(vocab_size, self.mega_args.encoder_embed_dim) - self.encoders = nn.ModuleList([MegaEncoderLayer(self.mega_args) for _ in range(depth)]) - self.depth = depth - - def forward(self, input_ids, attention_mask, batch_first=True, ignore_mask_value=0): - """ - Code for a forward pass - expects input_ids and attention_mask to come from a Hugging Face tokenizer as PyTorch - tensors, and returns a tensor of size (batch, n_classes) containing classification logits - - Other options: - - batch_first: boolean indicating whether the batch dimension is first in input_ids (default: True, which - aligns with the HF tokenizer behavior) - - ignore_mask_value: the value in attention_mask that identifies tokens that should be ignored (default: 0, - which aligns with HF tokenizer) - """ - - # Mega expects embeddings to be (time, batch, embedding size), but - # Hugging Face returns tokens as (batch, time) - if batch_first: - input_ids = input_ids.T - - # to make things more confusing, Mega expects the attention mask to - # be (batch, time), but with values of 0 (normal token) and 1 (ignore token) - # which is the opposite of what HF returns - if ignore_mask_value == 0: - attention_mask = 1 - attention_mask - - # get token embeddings from IDs - embeds = self.embedding_layer(input_ids) - - # pass through the Mega layers - # input is (time, batch, encoder dim) and output is the same - for encoder in self.encoders: - embeds = encoder(embeds, attention_mask) - - # return according to the shape specified - if batch_first: - # (T, B, H) --> (B, T, H) - return torch.transpose(embeds, 0, 1) - else: - return embeds - - -# renamed from MegaForMaskedLM to avoid confusion with new module -class OriginalMegaForMaskedLM(nn.Module): - "A wrapper class for doing masked language modeling with Mega" - - def __init__(self, mega_args, depth, vocab_size): - super().__init__() - self.mega = MegaLM(mega_args, depth, vocab_size) - self.mlm_head = nn.Linear(mega_args.encoder_embed_dim, vocab_size) - self.dropout = nn.Dropout(p=0.1) - - def forward(self, input_ids, attention_mask, batch_first=True, ignore_mask_value=0): - """ - Perform a forward pass through the Mega encoder and the masked LM head. Returns logits for each vocabulary - entry. 
- - If `batch_first` (default to align with Hugging Face tokenizer behavior), output will have the shape (Batch - size, Sequence length, Vocab size); otherwise (S, B, V) - """ - encoder_output = self.mega(input_ids, attention_mask, batch_first, ignore_mask_value) - return self.mlm_head(self.dropout(encoder_output)) - - -# code to convert the checkpoint located in the user-specified location -def convert_checkpoint_to_huggingface(pretrained_checkpoint_path, output_path, includes_tokenizer): - with open(os.path.join(pretrained_checkpoint_path, "model_args.pkl"), "rb") as f: - mega_original_args = pkl.load(f) - - # load the original encoder - original_mlm = OriginalMegaForMaskedLM(**mega_original_args).eval() - - # load its weights - print( - "Original Mega encoder:", - original_mlm.mega.load_state_dict( - torch.load( - os.path.join(pretrained_checkpoint_path, "encoder_weights.pt"), map_location="cpu", weights_only=True - ) - ), - ) - print( - "Original Mega MLM layer:", - original_mlm.mlm_head.load_state_dict( - torch.load( - os.path.join(pretrained_checkpoint_path, "mlm_head_weights.pt"), map_location="cpu", weights_only=True - ) - ), - ) - - # create a new config from the old one - hf_config = MegaConfig( - num_hidden_layers=mega_original_args["depth"], - vocab_size=mega_original_args["vocab_size"], - hidden_size=mega_original_args["mega_args"].encoder_embed_dim, - shared_representation_size=mega_original_args["mega_args"].encoder_z_dim, - intermediate_size=mega_original_args["mega_args"].encoder_hidden_dim, - ema_projection_size=mega_original_args["mega_args"].encoder_n_dim, - dropout_prob=mega_original_args["mega_args"].dropout, - attention_probs_dropout_prob=mega_original_args["mega_args"].attention_dropout, - hidden_dropout_prob=mega_original_args["mega_args"].hidden_dropout, - activation=mega_original_args["mega_args"].activation_fn, - attention_activation=mega_original_args["mega_args"].attention_activation_fn, - bidirectional=mega_original_args["mega_args"].bidirectional, - use_chunking=mega_original_args["mega_args"].encoder_chunk_size > 0, - chunk_size=mega_original_args["mega_args"].encoder_chunk_size, - truncation=mega_original_args["mega_args"].truncation_length, - normalization_type=mega_original_args["mega_args"].normalization_type, - normalize_before_mega=True, - norm_affine=True, - use_feature_dropout=mega_original_args["mega_args"].feature_dropout, - relative_positional_bias=mega_original_args["mega_args"].rel_pos_bias, - max_positions=mega_original_args["mega_args"].max_source_positions, - nffn_hidden_size=mega_original_args["mega_args"].encoder_ffn_embed_dim, - normalize_before_ffn=mega_original_args["mega_args"].normalize_before, - # new arguments added for HF implementation - nffn_activation_dropout_prob=0.0, - add_token_type_embeddings=False, - add_lm_hidden_dense_layer=False, - ) - - hf_mlm = MegaForMaskedLM(hf_config).eval() - - # the original checkpoint just uses nn.Embedding for the word embeddings - # we use a wrapper module for embeddings to add support for positional embeddings - hf_mlm.mega.embedding_layer.word_embeddings.weight = original_mlm.mega.embedding_layer.weight - - # modify the state dictionary of the original checkpoint to account for naming issues in the Hugging Face - # ecosystem -- any names containing "beta" or "gamma" aren't safe to use and are renamed upon _load_pretrained, - # also renaming previously confusing parameter names - original_state_dict = original_mlm.mega.encoders.state_dict() - updated_keys = {} - for module_name in 
original_state_dict: - new_module_name = None - # have to handle gamma, beta, and alpha differently due to their use - # in multiple modules within the original repository; - # beta is used in EMA, MovingAverageGatedAttention, and RotaryRelativePositionalBias, and must be renamed due to flax/tf weights - # the EMA sublayer was renamed from "move" to "ema_gate" for readability, so that is also done here - if "beta" in module_name: - # EMA sub-layers were always called "move" in the original repo - if "move.beta" in module_name: - new_module_name = module_name.replace("move.beta", "ema_gate.ema_expansion_matrix") - elif "mega_layer.beta" in module_name: - new_module_name = module_name.replace("beta", "qk_bias") - else: - new_module_name = module_name.replace("beta", "b_param") - # beta is used in EMA and MovingAverageGatedAttention, and must be renamed due to flax/tf weights - elif "gamma" in module_name: - if "move.gamma" in module_name: - new_module_name = module_name.replace("move.gamma", "ema_gate.kernel_projection_matrix") - elif "mega_layer.gamma" in module_name: - new_module_name = module_name.replace("gamma", "qk_weight") - else: - new_module_name = module_name.replace("gamma", "g_param") - # alpha is used in EMA and positional bias; renaming to improve readability - elif "move.alpha" in module_name: - new_module_name = module_name.replace("move.alpha", "ema_gate.decay_factor") - # delta is only used in EMA; renaming to improve readability - elif "move.delta" in module_name: - new_module_name = module_name.replace("move.delta", "ema_gate.damping_factor") - # omega is only used in EMA; renaming to improve readability - elif "omega" in module_name: - new_module_name = module_name.replace("move.omega", "ema_gate.residual_weight") - - if new_module_name: - updated_keys[module_name] = new_module_name - - if len(updated_keys) != 0: - print(f"Renaming these keys: {updated_keys.keys()}") - else: - print("No need to rename state dict entries") - for old, new in updated_keys.items(): - original_state_dict[new] = original_state_dict.pop(old) - - # now attempt to load the state dictionary with updated names - # note that we now call it `mega.layers` instead of `mega.encoders` due to hugging face style - print("HF Mega encoder:", hf_mlm.mega.layers.load_state_dict(original_state_dict)) - - # load the MLM head weights directly - print( - "HF Mega MLM layer:", - hf_mlm.mlm_head.load_state_dict( - torch.load( - os.path.join(pretrained_checkpoint_path, "mlm_head_weights.pt"), map_location="cpu", weights_only=True - ) - ), - ) - - # test on a randomly generated input sequence - input_ids = torch.randint(0, hf_config.vocab_size, size=(4, 256)) - input_mask = torch.ones_like(input_ids) - # mask a few tokens to make sure masking is applied appropriately :) - input_mask[:, -10:] = 0 - - # run forward passes - original_output = original_mlm(input_ids, input_mask, batch_first=True, ignore_mask_value=0) - hf_output = hf_mlm(input_ids, input_mask)[0] - - # print shapes and diff - print(f"original output {original_output.shape}") - print(f"hf output {hf_output.shape}") - print(f"max diff: {(original_output - hf_output).max()}") # 0.0 - success = torch.allclose(original_output, hf_output, atol=1e-3) - - if success: - print("Yay!") - hf_mlm.save_pretrained(output_path) - else: - raise RuntimeError(f"Something's broken :(\nOriginal:\n{original_output}\n\nHF\n{hf_output}\n{hf_mlm}") - - if includes_tokenizer: - print("Transferring tokenizer") - tokenizer = AutoTokenizer.from_pretrained(pretrained_checkpoint_path) 
- tokenizer.save_pretrained(output_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--pretrained_checkpoint_path", - default=None, - type=str, - required=True, - help="Point to the directory containing your model weights using the official Mega repo", - ) - - parser.add_argument( - "--output_path", default=None, type=str, required=True, help="Location to save the Hugging Face version" - ) - - parser.add_argument( - "--includes_tokenizer", - action="store_true", - help="Use this flag if there is a Hugging Face tokenizer in the original checkpoint repo", - ) - - args = parser.parse_args() - - convert_checkpoint_to_huggingface(args.pretrained_checkpoint_path, args.output_path, args.includes_tokenizer) diff --git a/src/transformers/models/deprecated/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index da7f7806671d..000000000000 --- a/src/transformers/models/deprecated/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,70 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The Trajectory Transformers paper authors and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""TrajectoryTransformer pytorch checkpoint conversion""" - -import torch -import trajectory.utils as utils - -from transformers import TrajectoryTransformerModel - - -class Parser(utils.Parser): - dataset: str = "halfcheetah-medium-expert-v2" - config: str = "config.offline" - - -def convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch(logbase, dataset, loadpath, epoch, device): - """Converting Sequential blocks to ModuleList""" - - gpt, gpt_epoch = utils.load_model(logbase, dataset, loadpath, epoch=epoch, device=device) - trajectory_transformer = TrajectoryTransformerModel(gpt.config) - - trajectory_transformer.tok_emb.load_state_dict(gpt.tok_emb.state_dict()) - trajectory_transformer.pos_emb = gpt.pos_emb - trajectory_transformer.drop.load_state_dict(gpt.drop.state_dict()) - trajectory_transformer.ln_f.load_state_dict(gpt.ln_f.state_dict()) - trajectory_transformer.head.load_state_dict(gpt.head.state_dict()) - - for i, block in enumerate(gpt.blocks): - trajectory_transformer.blocks[i].ln1.load_state_dict(gpt.blocks[i].ln1.state_dict()) - trajectory_transformer.blocks[i].ln2.load_state_dict(gpt.blocks[i].ln2.state_dict()) - trajectory_transformer.blocks[i].attn.load_state_dict(gpt.blocks[i].attn.state_dict()) - - trajectory_transformer.blocks[i].l1.load_state_dict(gpt.blocks[i].mlp[0].state_dict()) - trajectory_transformer.blocks[i].act.load_state_dict(gpt.blocks[i].mlp[1].state_dict()) - trajectory_transformer.blocks[i].l2.load_state_dict(gpt.blocks[i].mlp[2].state_dict()) - trajectory_transformer.blocks[i].drop.load_state_dict(gpt.blocks[i].mlp[3].state_dict()) - - torch.save(trajectory_transformer.state_dict(), "pytorch_model.bin") - - -if __name__ == "__main__": - """ - To run this script you will need to install the original repository to run the original model. You can find it - here: https://github.com/jannerm/trajectory-transformer From this repository code you can also download the - original pytorch checkpoints. - - Run with the command: - - ```sh - >>> python convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py --dataset - ... --gpt_loadpath - ``` - """ - - args = Parser().parse_args("plan") - convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch( - args.logbase, args.dataset, args.gpt_loadpath, args.gpt_epoch, args.device - ) diff --git a/src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py deleted file mode 100644 index 2c7b687c4d98..000000000000 --- a/src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,121 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Convert Transformer XL checkpoint and datasets.""" - -import argparse -import os -import pickle -import sys - -import torch - -from transformers import TransfoXLConfig, TransfoXLLMHeadModel, load_tf_weights_in_transfo_xl -from transformers.models.deprecated.transfo_xl import tokenization_transfo_xl as data_utils -from transformers.models.deprecated.transfo_xl.tokenization_transfo_xl import CORPUS_NAME, VOCAB_FILES_NAMES -from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging - - -logging.set_verbosity_info() - -# We do this to be able to load python 2 datasets pickles -# See e.g. https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918 -data_utils.Vocab = data_utils.TransfoXLTokenizer -data_utils.Corpus = data_utils.TransfoXLCorpus -sys.modules["data_utils"] = data_utils -sys.modules["vocabulary"] = data_utils - - -def convert_transfo_xl_checkpoint_to_pytorch( - tf_checkpoint_path, transfo_xl_config_file, pytorch_dump_folder_path, transfo_xl_dataset_file -): - if transfo_xl_dataset_file: - # Convert a pre-processed corpus (see original TensorFlow repo) - with open(transfo_xl_dataset_file, "rb") as fp: - corpus = pickle.load(fp, encoding="latin1") - # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term) - pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["pretrained_vocab_file"] - print(f"Save vocabulary to {pytorch_vocab_dump_path}") - corpus_vocab_dict = corpus.vocab.__dict__ - torch.save(corpus_vocab_dict, pytorch_vocab_dump_path) - - corpus_dict_no_vocab = corpus.__dict__ - corpus_dict_no_vocab.pop("vocab", None) - pytorch_dataset_dump_path = pytorch_dump_folder_path + "/" + CORPUS_NAME - print(f"Save dataset to {pytorch_dataset_dump_path}") - torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path) - - if tf_checkpoint_path: - # Convert a pre-trained TensorFlow model - config_path = os.path.abspath(transfo_xl_config_file) - tf_path = os.path.abspath(tf_checkpoint_path) - - print(f"Converting Transformer XL checkpoint from {tf_path} with config at {config_path}.") - # Initialise PyTorch model - if transfo_xl_config_file == "": - config = TransfoXLConfig() - else: - config = TransfoXLConfig.from_json_file(transfo_xl_config_file) - print(f"Building PyTorch model from configuration: {config}") - model = TransfoXLLMHeadModel(config) - - model = load_tf_weights_in_transfo_xl(model, config, tf_path) - # Save pytorch-model - pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) - pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) - print(f"Save PyTorch model to {os.path.abspath(pytorch_weights_dump_path)}") - torch.save(model.state_dict(), pytorch_weights_dump_path) - print(f"Save configuration file to {os.path.abspath(pytorch_config_dump_path)}") - with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: - f.write(config.to_json_string()) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=True, - help="Path to the folder to store the PyTorch model or dataset/vocab.", - ) - parser.add_argument( - "--tf_checkpoint_path", - default="", - type=str, - help="An optional path to a TensorFlow checkpoint path to be converted.", - ) - parser.add_argument( - "--transfo_xl_config_file", - default="", - type=str, - help=( - "An optional config json file corresponding to the pre-trained BERT 
model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument( - "--transfo_xl_dataset_file", - default="", - type=str, - help="An optional dataset file to be converted in a vocabulary.\n" - "Given the files are in the pickle format, please be wary of passing it files you trust.", - ) - args = parser.parse_args() - convert_transfo_xl_checkpoint_to_pytorch( - args.tf_checkpoint_path, - args.transfo_xl_config_file, - args.pytorch_dump_folder_path, - args.transfo_xl_dataset_file, - ) diff --git a/src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py b/src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py index 19c3fb0bd485..49d07391320d 100644 --- a/src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py +++ b/src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py @@ -558,8 +558,8 @@ def _get_new_num_tokens_layer(self, new_num_tokens, layer): new_num_tokens_layer = ( new_num_tokens - - sum([emb.weight.shape[0] for emb in embeddings.emb_layers[:layer]]) - - sum([emb.weight.shape[0] for emb in embeddings.emb_layers[layer + 1 :]]) + - sum(emb.weight.shape[0] for emb in embeddings.emb_layers[:layer]) + - sum(emb.weight.shape[0] for emb in embeddings.emb_layers[layer + 1 :]) ) return new_num_tokens_layer, layer diff --git a/src/transformers/models/deprecated/tvlt/feature_extraction_tvlt.py b/src/transformers/models/deprecated/tvlt/feature_extraction_tvlt.py index 3c65f4314616..b9350d31a019 100644 --- a/src/transformers/models/deprecated/tvlt/feature_extraction_tvlt.py +++ b/src/transformers/models/deprecated/tvlt/feature_extraction_tvlt.py @@ -202,7 +202,7 @@ def __call__( # Create audio attention mask max_patch_len = max( - [ceil(feature.shape[0] / self.patch_size[0]) * self.freq_len for feature in audio_features] + ceil(feature.shape[0] / self.patch_size[0]) * self.freq_len for feature in audio_features ) # The maximum number of audio patches in a batch if return_attention_mask: audio_mask = [ diff --git a/src/transformers/models/deprecated/tvlt/image_processing_tvlt.py b/src/transformers/models/deprecated/tvlt/image_processing_tvlt.py index c0e1a33f091b..01fb42429a96 100644 --- a/src/transformers/models/deprecated/tvlt/image_processing_tvlt.py +++ b/src/transformers/models/deprecated/tvlt/image_processing_tvlt.py @@ -395,7 +395,7 @@ def preprocess( f"number of frames must not be greater than the maximum frames of the model {self.num_frames}." ) - max_num_frames = max([len(video) for video in videos]) + max_num_frames = max(len(video) for video in videos) num_patches_per_image = (size["shortest_edge"] // patch_size[0]) ** 2 video_masks = np.array( [ diff --git a/src/transformers/models/deprecated/van/convert_van_to_pytorch.py b/src/transformers/models/deprecated/van/convert_van_to_pytorch.py deleted file mode 100644 index ec43af68d76c..000000000000 --- a/src/transformers/models/deprecated/van/convert_van_to_pytorch.py +++ /dev/null @@ -1,290 +0,0 @@ -# coding=utf-8 -# Copyright 2022 BNRist (Tsinghua University), TKLNDST (Nankai University) and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert VAN checkpoints from the original repository. - -URL: https://github.com/Visual-Attention-Network/VAN-Classification""" - -import argparse -import json -import sys -from dataclasses import dataclass, field -from functools import partial -from pathlib import Path -from typing import Optional - -import torch -import torch.nn as nn -from huggingface_hub import cached_download, hf_hub_download -from torch import Tensor - -from transformers import AutoImageProcessor, VanConfig, VanForImageClassification -from transformers.models.deprecated.van.modeling_van import VanLayerScaling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -@dataclass -class Tracker: - module: nn.Module - traced: list[nn.Module] = field(default_factory=list) - handles: list = field(default_factory=list) - - def _forward_hook(self, m, inputs: Tensor, outputs: Tensor): - has_not_submodules = len(list(m.modules())) == 1 or isinstance(m, (nn.Conv2d, nn.BatchNorm2d)) - if has_not_submodules: - if not isinstance(m, VanLayerScaling): - self.traced.append(m) - - def __call__(self, x: Tensor): - for m in self.module.modules(): - self.handles.append(m.register_forward_hook(self._forward_hook)) - self.module(x) - [x.remove() for x in self.handles] - return self - - @property - def parametrized(self): - # check the len of the state_dict keys to see if we have learnable params - return list(filter(lambda x: len(list(x.state_dict().keys())) > 0, self.traced)) - - -@dataclass -class ModuleTransfer: - src: nn.Module - dest: nn.Module - verbose: int = 0 - src_skip: list = field(default_factory=list) - dest_skip: list = field(default_factory=list) - - def __call__(self, x: Tensor): - """ - Transfer the weights of `self.src` to `self.dest` by performing a forward pass using `x` as input. Under the - hood we tracked all the operations in both modules. - """ - dest_traced = Tracker(self.dest)(x).parametrized - src_traced = Tracker(self.src)(x).parametrized - - src_traced = list(filter(lambda x: type(x) not in self.src_skip, src_traced)) - dest_traced = list(filter(lambda x: type(x) not in self.dest_skip, dest_traced)) - - if len(dest_traced) != len(src_traced): - raise Exception( - f"Numbers of operations are different. Source module has {len(src_traced)} operations while" - f" destination module has {len(dest_traced)}." 
- ) - - for dest_m, src_m in zip(dest_traced, src_traced): - dest_m.load_state_dict(src_m.state_dict()) - if self.verbose == 1: - print(f"Transferred from={src_m} to={dest_m}") - - -def copy_parameters(from_model: nn.Module, our_model: nn.Module) -> nn.Module: - # nn.Parameter cannot be tracked by the Tracker, thus we need to manually convert them - from_state_dict = from_model.state_dict() - our_state_dict = our_model.state_dict() - config = our_model.config - all_keys = [] - for stage_idx in range(len(config.hidden_sizes)): - for block_id in range(config.depths[stage_idx]): - from_key = f"block{stage_idx + 1}.{block_id}.layer_scale_1" - to_key = f"van.encoder.stages.{stage_idx}.layers.{block_id}.attention_scaling.weight" - - all_keys.append((from_key, to_key)) - from_key = f"block{stage_idx + 1}.{block_id}.layer_scale_2" - to_key = f"van.encoder.stages.{stage_idx}.layers.{block_id}.mlp_scaling.weight" - - all_keys.append((from_key, to_key)) - - for from_key, to_key in all_keys: - our_state_dict[to_key] = from_state_dict.pop(from_key) - - our_model.load_state_dict(our_state_dict) - return our_model - - -def convert_weight_and_push( - name: str, - config: VanConfig, - checkpoint: str, - from_model: nn.Module, - save_directory: Path, - push_to_hub: bool = True, -): - print(f"Downloading weights for {name}...") - checkpoint_path = cached_download(checkpoint) - print(f"Converting {name}...") - from_state_dict = torch.load(checkpoint_path, weights_only=True)["state_dict"] - from_model.load_state_dict(from_state_dict) - from_model.eval() - with torch.no_grad(): - our_model = VanForImageClassification(config).eval() - module_transfer = ModuleTransfer(src=from_model, dest=our_model) - x = torch.randn((1, 3, 224, 224)) - module_transfer(x) - our_model = copy_parameters(from_model, our_model) - - if not torch.allclose(from_model(x), our_model(x).logits): - raise ValueError("The model logits don't match the original one.") - - checkpoint_name = name - print(checkpoint_name) - - if push_to_hub: - our_model.push_to_hub( - repo_path_or_name=save_directory / checkpoint_name, - commit_message="Add model", - use_temp_dir=True, - ) - - # we can use the convnext one - image_processor = AutoImageProcessor.from_pretrained("facebook/convnext-base-224-22k-1k") - image_processor.push_to_hub( - repo_path_or_name=save_directory / checkpoint_name, - commit_message="Add image processor", - use_temp_dir=True, - ) - - print(f"Pushed {checkpoint_name}") - - -def convert_weights_and_push(save_directory: Path, model_name: Optional[str] = None, push_to_hub: bool = True): - filename = "imagenet-1k-id2label.json" - num_labels = 1000 - - repo_id = "huggingface/label-files" - num_labels = num_labels - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - - id2label = id2label - label2id = {v: k for k, v in id2label.items()} - - ImageNetPreTrainedConfig = partial(VanConfig, num_labels=num_labels, id2label=id2label, label2id=label2id) - - names_to_config = { - "van-tiny": ImageNetPreTrainedConfig( - hidden_sizes=[32, 64, 160, 256], - depths=[3, 3, 5, 2], - mlp_ratios=[8, 8, 4, 4], - ), - "van-small": ImageNetPreTrainedConfig( - hidden_sizes=[64, 128, 320, 512], - depths=[2, 2, 4, 2], - mlp_ratios=[8, 8, 4, 4], - ), - "van-base": ImageNetPreTrainedConfig( - hidden_sizes=[64, 128, 320, 512], - depths=[3, 3, 12, 3], - mlp_ratios=[8, 8, 4, 4], - ), - "van-large": ImageNetPreTrainedConfig( - hidden_sizes=[64, 128, 320, 512], - depths=[3, 5, 
27, 3], - mlp_ratios=[8, 8, 4, 4], - ), - } - - names_to_original_models = { - "van-tiny": van_tiny, - "van-small": van_small, - "van-base": van_base, - "van-large": van_large, - } - - names_to_original_checkpoints = { - "van-tiny": ( - "https://huggingface.co/Visual-Attention-Network/VAN-Tiny-original/resolve/main/van_tiny_754.pth.tar" - ), - "van-small": ( - "https://huggingface.co/Visual-Attention-Network/VAN-Small-original/resolve/main/van_small_811.pth.tar" - ), - "van-base": ( - "https://huggingface.co/Visual-Attention-Network/VAN-Base-original/resolve/main/van_base_828.pth.tar" - ), - "van-large": ( - "https://huggingface.co/Visual-Attention-Network/VAN-Large-original/resolve/main/van_large_839.pth.tar" - ), - } - - if model_name: - convert_weight_and_push( - model_name, - names_to_config[model_name], - checkpoint=names_to_original_checkpoints[model_name], - from_model=names_to_original_models[model_name](), - save_directory=save_directory, - push_to_hub=push_to_hub, - ) - else: - for model_name, config in names_to_config.items(): - convert_weight_and_push( - model_name, - config, - checkpoint=names_to_original_checkpoints[model_name], - from_model=names_to_original_models[model_name](), - save_directory=save_directory, - push_to_hub=push_to_hub, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model-name", - default=None, - type=str, - help=( - "The name of the model you wish to convert, it must be one of the supported resnet* architecture," - " currently: van-tiny/small/base/large. If `None`, all of them will the converted." - ), - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=Path, - required=True, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--van_dir", - required=True, - type=Path, - help=( - "A path to VAN's original implementation directory. You can download from here:" - " https://github.com/Visual-Attention-Network/VAN-Classification" - ), - ) - parser.add_argument( - "--push_to_hub", - default=True, - type=bool, - required=False, - help="If True, push model and image processor to the hub.", - ) - - args = parser.parse_args() - pytorch_dump_folder_path: Path = args.pytorch_dump_folder_path - pytorch_dump_folder_path.mkdir(exist_ok=True, parents=True) - van_dir = args.van_dir - # append the path to the parents to maskformer dir - sys.path.append(str(van_dir.parent)) - from van.models.van import van_base, van_large, van_small, van_tiny - - convert_weights_and_push(pytorch_dump_folder_path, args.model_name, args.push_to_hub) diff --git a/src/transformers/models/deprecated/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py b/src/transformers/models/deprecated/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py deleted file mode 100644 index 1d717d74c961..000000000000 --- a/src/transformers/models/deprecated/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py +++ /dev/null @@ -1,282 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert ViT hybrid checkpoints from the timm library.""" - -import argparse -import json -from pathlib import Path - -import requests -import timm -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from timm.data import resolve_data_config -from timm.data.transforms_factory import create_transform - -from transformers import ( - BitConfig, - ViTHybridConfig, - ViTHybridForImageClassification, - ViTHybridImageProcessor, - ViTHybridModel, -) -from transformers.image_utils import PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config, base_model=False): - rename_keys = [] - - # fmt: off - # stem: - rename_keys.append(("cls_token", "vit.embeddings.cls_token")) - rename_keys.append(("pos_embed", "vit.embeddings.position_embeddings")) - - rename_keys.append(("patch_embed.proj.weight", "vit.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("patch_embed.proj.bias", "vit.embeddings.patch_embeddings.projection.bias")) - - # backbone - rename_keys.append(("patch_embed.backbone.stem.conv.weight", "vit.embeddings.patch_embeddings.backbone.bit.embedder.convolution.weight")) - rename_keys.append(("patch_embed.backbone.stem.norm.weight", "vit.embeddings.patch_embeddings.backbone.bit.embedder.norm.weight")) - rename_keys.append(("patch_embed.backbone.stem.norm.bias", "vit.embeddings.patch_embeddings.backbone.bit.embedder.norm.bias")) - - for stage_idx in range(len(config.backbone_config.depths)): - for layer_idx in range(config.backbone_config.depths[stage_idx]): - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.conv1.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.conv1.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm1.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm1.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm1.bias", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm1.bias")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.conv2.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.conv2.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm2.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm2.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm2.bias", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm2.bias")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.conv3.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.conv3.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm3.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm3.weight")) - 
rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm3.bias", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm3.bias")) - - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.0.downsample.conv.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.0.downsample.conv.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.0.downsample.norm.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.0.downsample.norm.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.0.downsample.norm.bias", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.0.downsample.norm.bias")) - - # transformer encoder - for i in range(config.num_hidden_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"vit.encoder.layer.{i}.layernorm_before.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"vit.encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"vit.encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"blocks.{i}.attn.proj.bias", f"vit.encoder.layer.{i}.attention.output.dense.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"vit.encoder.layer.{i}.layernorm_after.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"vit.encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"vit.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"vit.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"vit.encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"vit.encoder.layer.{i}.output.dense.bias")) - - if base_model: - # layernorm + pooler - rename_keys.extend( - [ - ("norm.weight", "layernorm.weight"), - ("norm.bias", "layernorm.bias"), - ("pre_logits.fc.weight", "pooler.dense.weight"), - ("pre_logits.fc.bias", "pooler.dense.bias"), - ] - ) - - # if just the base model, we should remove "vit" from all keys that start with "vit" - rename_keys = [(pair[0], pair[1][4:]) if pair[1].startswith("vit") else pair for pair in rename_keys] - else: - # layernorm + classification head - rename_keys.extend( - [ - ("norm.weight", "vit.layernorm.weight"), - ("norm.bias", "vit.layernorm.bias"), - ("head.weight", "classifier.weight"), - ("head.bias", "classifier.bias"), - ] - ) - # fmt: on - - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config, base_model=False): - for i in range(config.num_hidden_layers): - if base_model: - prefix = "" - else: - prefix = "vit." 
- # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ - : config.hidden_size, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -def remove_classification_head_(state_dict): - ignore_keys = ["head.weight", "head.bias"] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our ViT structure. - """ - - # define default ViT hybrid configuration - backbone_config = BitConfig( - global_padding="same", - layer_type="bottleneck", - depths=(3, 4, 9), - out_features=["stage3"], - embedding_dynamic_padding=True, - ) - config = ViTHybridConfig(backbone_config=backbone_config, image_size=384, num_labels=1000) - base_model = False - - # load original model from timm - timm_model = timm.create_model(vit_name, pretrained=True) - timm_model.eval() - - # load state_dict of original model, remove and rename some keys - state_dict = timm_model.state_dict() - if base_model: - remove_classification_head_(state_dict) - rename_keys = create_rename_keys(config, base_model) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config, base_model) - - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - # load HuggingFace model - if vit_name[-5:] == "in21k": - model = ViTHybridModel(config).eval() - else: - model = ViTHybridForImageClassification(config).eval() - model.load_state_dict(state_dict) - - # create image processor - transform = create_transform(**resolve_data_config({}, model=timm_model)) - timm_transforms = transform.transforms - - pillow_resamplings = { - "bilinear": PILImageResampling.BILINEAR, - "bicubic": PILImageResampling.BICUBIC, - "nearest": PILImageResampling.NEAREST, - } - - processor = ViTHybridImageProcessor( - do_resize=True, - size={"shortest_edge": timm_transforms[0].size}, - resample=pillow_resamplings[timm_transforms[0].interpolation.value], - do_center_crop=True, - crop_size={"height": timm_transforms[1].size[0], "width": timm_transforms[1].size[1]}, - 
do_normalize=True, - image_mean=timm_transforms[-1].mean.tolist(), - image_std=timm_transforms[-1].std.tolist(), - ) - - image = prepare_img() - timm_pixel_values = transform(image).unsqueeze(0) - pixel_values = processor(image, return_tensors="pt").pixel_values - - # verify pixel values - assert torch.allclose(timm_pixel_values, pixel_values) - - # verify logits - with torch.no_grad(): - outputs = model(pixel_values) - logits = outputs.logits - - print("Predicted class:", logits.argmax(-1).item()) - if base_model: - timm_pooled_output = timm_model.forward_features(pixel_values) - assert timm_pooled_output.shape == outputs.pooler_output.shape - assert torch.allclose(timm_pooled_output, outputs.pooler_output, atol=1e-3) - else: - timm_logits = timm_model(pixel_values) - assert timm_logits.shape == outputs.logits.shape - assert torch.allclose(timm_logits, outputs.logits, atol=1e-3) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {vit_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving processor to {pytorch_dump_folder_path}") - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print(f"Pushing model and processor to the hub {vit_name}") - model.push_to_hub(f"ybelkada/{vit_name}") - processor.push_to_hub(f"ybelkada/{vit_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--vit_name", - default="vit_base_r50_s16_384", - type=str, - help="Name of the hybrid ViT timm model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether to upload the model to the HuggingFace hub." - ) - - args = parser.parse_args() - convert_vit_checkpoint(args.vit_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/deprecated/xlm_prophetnet/modeling_xlm_prophetnet.py b/src/transformers/models/deprecated/xlm_prophetnet/modeling_xlm_prophetnet.py index 3c4dc3de8393..36f6e6097bc3 100644 --- a/src/transformers/models/deprecated/xlm_prophetnet/modeling_xlm_prophetnet.py +++ b/src/transformers/models/deprecated/xlm_prophetnet/modeling_xlm_prophetnet.py @@ -1233,7 +1233,7 @@ class XLMProphetNetEncoder(XLMProphetNetPreTrainedModel): embeddings instead of randomly initialized word embeddings. """ - def __init__(self, config: XLMProphetNetConfig, word_embeddings: nn.Embedding = None): + def __init__(self, config: XLMProphetNetConfig, word_embeddings: Optional[nn.Embedding] = None): super().__init__(config) self.word_embeddings = ( diff --git a/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py b/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py deleted file mode 100644 index f07a76b2b235..000000000000 --- a/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py +++ /dev/null @@ -1,368 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Depth Anything checkpoints from the original repository. URL: -https://github.com/LiheYoung/Depth-Anything""" - -import argparse -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DepthAnythingConfig, DepthAnythingForDepthEstimation, Dinov2Config, DPTImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dpt_config(model_name): - if "small" in model_name: - out_indices = [3, 6, 9, 12] if "v2" in model_name else [9, 10, 11, 12] - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-small", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False - ) - fusion_hidden_size = 64 - neck_hidden_sizes = [48, 96, 192, 384] - elif "base" in model_name: - out_indices = [3, 6, 9, 12] if "v2" in model_name else [9, 10, 11, 12] - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-base", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False - ) - fusion_hidden_size = 128 - neck_hidden_sizes = [96, 192, 384, 768] - elif "large" in model_name: - out_indices = [5, 12, 18, 24] if "v2" in model_name else [21, 22, 23, 24] - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-large", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False - ) - fusion_hidden_size = 256 - neck_hidden_sizes = [256, 512, 1024, 1024] - else: - raise NotImplementedError(f"Model not supported: {model_name}") - - if "metric" in model_name: - depth_estimation_type = "metric" - max_depth = 20 if "indoor" in model_name else 80 - else: - depth_estimation_type = "relative" - max_depth = None - - config = DepthAnythingConfig( - reassemble_hidden_size=backbone_config.hidden_size, - patch_size=backbone_config.patch_size, - backbone_config=backbone_config, - fusion_hidden_size=fusion_hidden_size, - neck_hidden_sizes=neck_hidden_sizes, - depth_estimation_type=depth_estimation_type, - max_depth=max_depth, - ) - - return config - - -def create_rename_keys(config): - rename_keys = [] - - # fmt: off - # stem - rename_keys.append(("pretrained.cls_token", "backbone.embeddings.cls_token")) - rename_keys.append(("pretrained.mask_token", "backbone.embeddings.mask_token")) - rename_keys.append(("pretrained.pos_embed", "backbone.embeddings.position_embeddings")) - rename_keys.append(("pretrained.patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("pretrained.patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias")) - - # Transformer encoder - for i in range(config.backbone_config.num_hidden_layers): - rename_keys.append((f"pretrained.blocks.{i}.ls1.gamma", f"backbone.encoder.layer.{i}.layer_scale1.lambda1")) - rename_keys.append((f"pretrained.blocks.{i}.ls2.gamma", f"backbone.encoder.layer.{i}.layer_scale2.lambda1")) - rename_keys.append((f"pretrained.blocks.{i}.norm1.weight", f"backbone.encoder.layer.{i}.norm1.weight")) - 
rename_keys.append((f"pretrained.blocks.{i}.norm1.bias", f"backbone.encoder.layer.{i}.norm1.bias")) - rename_keys.append((f"pretrained.blocks.{i}.norm2.weight", f"backbone.encoder.layer.{i}.norm2.weight")) - rename_keys.append((f"pretrained.blocks.{i}.norm2.bias", f"backbone.encoder.layer.{i}.norm2.bias")) - rename_keys.append((f"pretrained.blocks.{i}.mlp.fc1.weight", f"backbone.encoder.layer.{i}.mlp.fc1.weight")) - rename_keys.append((f"pretrained.blocks.{i}.mlp.fc1.bias", f"backbone.encoder.layer.{i}.mlp.fc1.bias")) - rename_keys.append((f"pretrained.blocks.{i}.mlp.fc2.weight", f"backbone.encoder.layer.{i}.mlp.fc2.weight")) - rename_keys.append((f"pretrained.blocks.{i}.mlp.fc2.bias", f"backbone.encoder.layer.{i}.mlp.fc2.bias")) - rename_keys.append((f"pretrained.blocks.{i}.attn.proj.weight", f"backbone.encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"pretrained.blocks.{i}.attn.proj.bias", f"backbone.encoder.layer.{i}.attention.output.dense.bias")) - - # Head - rename_keys.append(("pretrained.norm.weight", "backbone.layernorm.weight")) - rename_keys.append(("pretrained.norm.bias", "backbone.layernorm.bias")) - - # activation postprocessing (readout projections + resize blocks) - # Depth Anything does not use CLS token => readout_projects not required - - for i in range(4): - rename_keys.append((f"depth_head.projects.{i}.weight", f"neck.reassemble_stage.layers.{i}.projection.weight")) - rename_keys.append((f"depth_head.projects.{i}.bias", f"neck.reassemble_stage.layers.{i}.projection.bias")) - - if i != 2: - rename_keys.append((f"depth_head.resize_layers.{i}.weight", f"neck.reassemble_stage.layers.{i}.resize.weight")) - rename_keys.append((f"depth_head.resize_layers.{i}.bias", f"neck.reassemble_stage.layers.{i}.resize.bias")) - - # refinenet (tricky here) - mapping = {1:3, 2:2, 3:1, 4:0} - - for i in range(1, 5): - j = mapping[i] - rename_keys.append((f"depth_head.scratch.refinenet{i}.out_conv.weight", f"neck.fusion_stage.layers.{j}.projection.weight")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.out_conv.bias", f"neck.fusion_stage.layers.{j}.projection.bias")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.weight")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.bias")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.weight")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.bias")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.weight")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.bias")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.weight")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.bias")) - - # scratch convolutions - for i in range(4): - rename_keys.append((f"depth_head.scratch.layer{i+1}_rn.weight", f"neck.convs.{i}.weight")) - - # head - 
rename_keys.append(("depth_head.scratch.output_conv1.weight", "head.conv1.weight")) - rename_keys.append(("depth_head.scratch.output_conv1.bias", "head.conv1.bias")) - rename_keys.append(("depth_head.scratch.output_conv2.0.weight", "head.conv2.weight")) - rename_keys.append(("depth_head.scratch.output_conv2.0.bias", "head.conv2.bias")) - rename_keys.append(("depth_head.scratch.output_conv2.2.weight", "head.conv3.weight")) - rename_keys.append(("depth_head.scratch.output_conv2.2.bias", "head.conv3.bias")) - - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - hidden_size = config.backbone_config.hidden_size - for i in range(config.backbone_config.num_hidden_layers): - # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"pretrained.blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"pretrained.blocks.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[:hidden_size] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - hidden_size : hidden_size * 2 - ] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-hidden_size:] - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -name_to_checkpoint = { - "depth-anything-small": "pytorch_model.bin", - "depth-anything-base": "pytorch_model.bin", - "depth-anything-large": "pytorch_model.bin", - "depth-anything-v2-small": "depth_anything_v2_vits.pth", - "depth-anything-v2-base": "depth_anything_v2_vitb.pth", - "depth-anything-v2-large": "depth_anything_v2_vitl.pth", - "depth-anything-v2-metric-indoor-small": "depth_anything_v2_metric_hypersim_vits.pth", - "depth-anything-v2-metric-indoor-base": "depth_anything_v2_metric_hypersim_vitb.pth", - "depth-anything-v2-metric-indoor-large": "depth_anything_v2_metric_hypersim_vitl.pth", - "depth-anything-v2-metric-outdoor-small": "depth_anything_v2_metric_vkitti_vits.pth", - "depth-anything-v2-metric-outdoor-base": "depth_anything_v2_metric_vkitti_vitb.pth", - "depth-anything-v2-metric-outdoor-large": "depth_anything_v2_metric_vkitti_vitl.pth", - # v2-giant pending -} - - -@torch.no_grad() -def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, verify_logits): - """ - Copy/paste/tweak model's weights to our DPT structure. 
- """ - - # define DPT configuration - config = get_dpt_config(model_name) - - model_name_to_repo = { - "depth-anything-small": "LiheYoung/depth_anything_vits14", - "depth-anything-base": "LiheYoung/depth_anything_vitb14", - "depth-anything-large": "LiheYoung/depth_anything_vitl14", - "depth-anything-v2-small": "depth-anything/Depth-Anything-V2-Small", - "depth-anything-v2-base": "depth-anything/Depth-Anything-V2-Base", - "depth-anything-v2-large": "depth-anything/Depth-Anything-V2-Large", - "depth-anything-v2-metric-indoor-small": "depth-anything/Depth-Anything-V2-Metric-Hypersim-Small", - "depth-anything-v2-metric-indoor-base": "depth-anything/Depth-Anything-V2-Metric-Hypersim-Base", - "depth-anything-v2-metric-indoor-large": "depth-anything/Depth-Anything-V2-Metric-Hypersim-Large", - "depth-anything-v2-metric-outdoor-small": "depth-anything/Depth-Anything-V2-Metric-VKITTI-Small", - "depth-anything-v2-metric-outdoor-base": "depth-anything/Depth-Anything-V2-Metric-VKITTI-Base", - "depth-anything-v2-metric-outdoor-large": "depth-anything/Depth-Anything-V2-Metric-VKITTI-Large", - } - - # load original state_dict - repo_id = model_name_to_repo[model_name] - filename = name_to_checkpoint[model_name] - filepath = hf_hub_download( - repo_id=repo_id, - filename=f"{filename}", - ) - - state_dict = torch.load(filepath, map_location="cpu", weights_only=True) - # rename keys - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - # read in qkv matrices - read_in_q_k_v(state_dict, config) - - # load HuggingFace model - model = DepthAnythingForDepthEstimation(config) - model.load_state_dict(state_dict) - model.eval() - - processor = DPTImageProcessor( - do_resize=True, - size={"height": 518, "width": 518}, - ensure_multiple_of=14, - keep_aspect_ratio=True, - do_rescale=True, - do_normalize=True, - image_mean=[0.485, 0.456, 0.406], - image_std=[0.229, 0.224, 0.225], - ) - - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - pixel_values = processor(image, return_tensors="pt").pixel_values - - # Verify forward pass - with torch.no_grad(): - outputs = model(pixel_values) - predicted_depth = outputs.predicted_depth - - print("Shape of predicted depth:", predicted_depth.shape) - print("First values:", predicted_depth[0, :3, :3]) - - # assert logits - if verify_logits: - expected_shape = torch.Size([1, 518, 686]) - if model_name == "depth-anything-small": - expected_slice = torch.tensor( - [[8.8204, 8.6468, 8.6195], [8.3313, 8.6027, 8.7526], [8.6526, 8.6866, 8.7453]], - ) - elif model_name == "depth-anything-base": - expected_slice = torch.tensor( - [[26.3997, 26.3004, 26.3928], [26.2260, 26.2092, 26.3427], [26.0719, 26.0483, 26.1254]], - ) - elif model_name == "depth-anything-large": - expected_slice = torch.tensor( - [[87.9968, 87.7493, 88.2704], [87.1927, 87.6611, 87.3640], [86.7789, 86.9469, 86.7991]] - ) - elif model_name == "depth-anything-v2-small": - expected_slice = torch.tensor( - [[2.6751, 2.6211, 2.6571], [2.5820, 2.6138, 2.6271], [2.6160, 2.6141, 2.6306]] - ) - elif model_name == "depth-anything-v2-base": - expected_slice = torch.tensor( - [[4.3576, 4.3723, 4.3908], [4.3231, 4.3146, 4.3611], [4.3016, 4.3170, 4.3121]] - ) - elif model_name == "depth-anything-v2-large": - expected_slice = torch.tensor( - [[162.2751, 161.8504, 162.8788], [160.3138, 160.8050, 161.9835], [159.3812, 159.9884, 160.0768]] - ) - elif model_name == "depth-anything-v2-metric-indoor-small": - 
expected_slice = torch.tensor( - [[1.3349, 1.2946, 1.2801], [1.2793, 1.2337, 1.2899], [1.2629, 1.2218, 1.2476]] - ) - elif model_name == "depth-anything-v2-metric-indoor-base": - expected_slice = torch.tensor( - [[1.4601, 1.3824, 1.4904], [1.5031, 1.4349, 1.4274], [1.4570, 1.4578, 1.4200]] - ) - elif model_name == "depth-anything-v2-metric-indoor-large": - expected_slice = torch.tensor( - [[1.5040, 1.5019, 1.5218], [1.5087, 1.5195, 1.5149], [1.5437, 1.5128, 1.5252]] - ) - elif model_name == "depth-anything-v2-metric-outdoor-small": - expected_slice = torch.tensor( - [[9.5804, 8.0339, 7.7386], [7.9890, 7.2464, 7.7149], [7.7021, 7.2330, 7.3304]] - ) - elif model_name == "depth-anything-v2-metric-outdoor-base": - expected_slice = torch.tensor( - [[10.2916, 9.0933, 8.8622], [9.1964, 9.3393, 9.0644], [8.9618, 9.4201, 9.2262]] - ) - elif model_name == "depth-anything-v2-metric-outdoor-large": - expected_slice = torch.tensor( - [[14.0137, 13.3627, 13.1080], [13.2522, 13.3943, 13.3705], [13.0581, 13.4505, 13.3925]] - ) - else: - raise ValueError("Not supported") - - assert predicted_depth.shape == torch.Size(expected_shape) - assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-4) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model and processor to hub...") - model.push_to_hub(repo_id=f"{model_name.title()}-hf") - processor.push_to_hub(repo_id=f"{model_name.title()}-hf") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="depth-anything-small", - type=str, - choices=name_to_checkpoint.keys(), - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model to the hub after conversion.", - ) - parser.add_argument( - "--verify_logits", - action="store_false", - required=False, - help="Whether to verify the logits after conversion.", - ) - - args = parser.parse_args() - convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.verify_logits) diff --git a/src/transformers/models/depth_anything/convert_distill_any_depth_to_hf.py b/src/transformers/models/depth_anything/convert_distill_any_depth_to_hf.py deleted file mode 100644 index 47cec7afac1a..000000000000 --- a/src/transformers/models/depth_anything/convert_distill_any_depth_to_hf.py +++ /dev/null @@ -1,246 +0,0 @@ -# coding=utf-8 -# Copyright 2025 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Distill Any Depth checkpoints from the original repository. 
URL: -https://github.com/Westlake-AGI-Lab/Distill-Any-Depth""" - -import argparse -import re -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from safetensors.torch import load_file - -from transformers import DepthAnythingConfig, DepthAnythingForDepthEstimation, Dinov2Config, DPTImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - r"(backbone|pretrained)\.cls_token": r"backbone.embeddings.cls_token", - r"(backbone|pretrained)\.mask_token": r"backbone.embeddings.mask_token", - r"(backbone|pretrained)\.pos_embed": r"backbone.embeddings.position_embeddings", - r"(backbone|pretrained)\.patch_embed\.proj\.(weight|bias)": r"backbone.embeddings.patch_embeddings.projection.\2", - r"(backbone|pretrained)\.norm\.(weight|bias)": r"backbone.layernorm.\2", - r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.attn\.proj\.(weight|bias)": r"backbone.encoder.layer.\4.attention.output.dense.\5", - r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.ls(1|2)\.gamma": r"backbone.encoder.layer.\4.layer_scale\5.lambda1", - r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.mlp\.fc(1|2)\.(weight|bias)": r"backbone.encoder.layer.\4.mlp.fc\5.\6", - r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.norm(1|2)\.(weight|bias)": r"backbone.encoder.layer.\4.norm\5.\6", - r"depth_head\.projects\.(\d+)\.(weight|bias)": r"neck.reassemble_stage.layers.\1.projection.\2", - r"depth_head\.resize_layers\.(?!2)(\d+)\.(weight|bias)": r"neck.reassemble_stage.layers.\1.resize.\2", - r"depth_head\.scratch\.layer(\d+)_rn\.weight": lambda m: f"neck.convs.{int(m[1]) - 1}.weight", - r"depth_head\.scratch\.output_conv(\d+)(?:\.(\d+))?\.(weight|bias)": lambda m: ( - f"head.conv{int(m[1]) + (int(m[2]) // 2 if m[2] else 0)}.{m[3]}" if m[1] == "2" else f"head.conv{m[1]}.{m[3]}" - ), - r"depth_head\.scratch\.refinenet(\d+)\.out_conv\.(weight|bias)": lambda m: f"neck.fusion_stage.layers.{3 - (int(m[1]) - 1)}.projection.{m[2]}", - r"depth_head\.scratch\.refinenet(\d+)\.resConfUnit(\d+)\.conv(\d+)\.(weight|bias)": lambda m: f"neck.fusion_stage.layers.{3 - (int(m[1]) - 1)}.residual_layer{m[2]}.convolution{m[3]}.{m[4]}", -} - - -def get_dpt_config(model_name): - if "small" in model_name: - out_indices = [3, 6, 9, 12] - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-small", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False - ) - fusion_hidden_size = 64 - neck_hidden_sizes = [48, 96, 192, 384] - elif "base" in model_name: - out_indices = [3, 6, 9, 12] - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-base", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False - ) - fusion_hidden_size = 128 - neck_hidden_sizes = [96, 192, 384, 768] - elif "large" in model_name: - out_indices = [5, 12, 18, 24] - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-large", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False - ) - fusion_hidden_size = 256 - neck_hidden_sizes = [256, 512, 1024, 1024] - else: - raise NotImplementedError(f"Model not supported: {model_name}") - - depth_estimation_type = "relative" - max_depth = None - - config = DepthAnythingConfig( - reassemble_hidden_size=backbone_config.hidden_size, - patch_size=backbone_config.patch_size, - backbone_config=backbone_config, - fusion_hidden_size=fusion_hidden_size, - 
neck_hidden_sizes=neck_hidden_sizes, - depth_estimation_type=depth_estimation_type, - max_depth=max_depth, - ) - - return config - - -def convert_key_pattern(key, mapping): - for pattern, replacement in mapping.items(): - match = re.fullmatch(pattern, key) - if match: - if callable(replacement): - return replacement(match) - return re.sub(pattern, replacement, key) - return None - - -def convert_keys(state_dict, config): - new_state_dict = {} - qkv_pattern = r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.attn\.qkv\.(weight|bias)" - qkv_keys = [k for k in list(state_dict.keys()) if re.match(qkv_pattern, k)] - for old_key in qkv_keys: - value = state_dict.pop(old_key) - match = re.match(qkv_pattern, old_key) - _, _, _, layer, attr = match.groups() - hidden_size = config.backbone_config.hidden_size - q = value[:hidden_size] - k = value[hidden_size : hidden_size * 2] - v = value[-hidden_size:] - - for proj, tensor in zip(["query", "key", "value"], [q, k, v]): - new_key = f"backbone.encoder.layer.{layer}.attention.attention.{proj}.{attr}" - new_state_dict[new_key] = tensor - - for old_key in list(state_dict.keys()): - value = state_dict.pop(old_key) - new_key = convert_key_pattern(old_key, ORIGINAL_TO_CONVERTED_KEY_MAPPING) - - new_state_dict[new_key] = value - - return new_state_dict - - -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - return Image.open(requests.get(url, stream=True).raw) - - -name_to_checkpoint = { - "distill-any-depth-small": "small/model.safetensors", - "distill-any-depth-base": "base/model.safetensors", - "distill-any-depth-large": "large/model.safetensors", -} - - -@torch.no_grad() -def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, verify_logits): - config = get_dpt_config(model_name) - - repo_id = "xingyang1/Distill-Any-Depth" - filepath = hf_hub_download(repo_id=repo_id, filename=name_to_checkpoint[model_name]) - state_dict = load_file(filepath) - - converted_state_dict = convert_keys(state_dict, config) - - model = DepthAnythingForDepthEstimation(config) - model.load_state_dict(converted_state_dict) - model.eval() - - processor = DPTImageProcessor( - do_resize=True, - size={"height": 518, "width": 518}, - ensure_multiple_of=14, - keep_aspect_ratio=True, - do_rescale=True, - do_normalize=True, - image_mean=[0.485, 0.456, 0.406], - image_std=[0.229, 0.224, 0.225], - ) - - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - pixel_values = processor(image, return_tensors="pt").pixel_values - - with torch.no_grad(): - outputs = model(pixel_values) - predicted_depth = outputs.predicted_depth - - print("Shape of predicted depth:", predicted_depth.shape) - print("First values:", predicted_depth[0, :3, :3]) - - if verify_logits: - print("Verifying logits...") - expected_shape = torch.Size([1, 518, 686]) - - if model_name == "distill-any-depth-small": - expected_slice = torch.tensor( - [[2.5653, 2.5249, 2.5570], [2.4897, 2.5235, 2.5355], [2.5255, 2.5261, 2.5422]] - ) - elif model_name == "distill-any-depth-base": - expected_slice = torch.tensor( - [[4.8976, 4.9075, 4.9403], [4.8872, 4.8906, 4.9448], [4.8712, 4.8898, 4.8838]] - ) - elif model_name == "distill-any-depth-large": - expected_slice = torch.tensor( - [[55.1067, 51.1828, 51.6803], [51.9098, 50.7529, 51.4494], [50.1745, 50.5491, 50.8818]] - ) - else: - raise ValueError("Not supported") - - assert predicted_depth.shape == torch.Size(expected_shape) - assert 
torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-4) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model and processor to hub...") - model.push_to_hub(repo_id=f"{model_name.title()}-hf") - processor.push_to_hub(repo_id=f"{model_name.title()}-hf") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name", - default="distill-any-depth-small", - type=str, - choices=name_to_checkpoint.keys(), - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model to the hub after conversion.", - ) - parser.add_argument( - "--verify_logits", - action="store_true", - required=False, - help="Whether to verify the logits after conversion.", - ) - - args = parser.parse_args() - convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.verify_logits) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index 6bc14a0e154f..69bfffeb93f1 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -188,7 +188,6 @@ def __init__( sub_config.update({"image_size": patch_size}) sub_config = CONFIG_MAPPING[sub_config["model_type"]](**sub_config) elif isinstance(sub_config, PretrainedConfig): - sub_config = sub_config image_size = getattr(sub_config, "image_size", None) if image_size != patch_size: raise ValueError( diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py deleted file mode 100644 index 655bbdc0230f..000000000000 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ /dev/null @@ -1,255 +0,0 @@ -# Copyright 2024 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
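Both depth converters in this diff (the Distill Any Depth script above and the DepthPro script below) rely on the same regex-driven key remapping: each original parameter name is full-matched against a pattern table and rewritten through backreferences, or through a callable when the target index needs arithmetic. A minimal standalone sketch of that idea, assuming nothing beyond one rule copied from the mapping above; the example key is illustrative only:

import re

# One rule taken from ORIGINAL_TO_CONVERTED_KEY_MAPPING above (DINOv2 block layernorms).
MAPPING = {
    r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.norm(1|2)\.(weight|bias)": r"backbone.encoder.layer.\4.norm\5.\6",
}

def convert_key(key, mapping):
    # Return the converted name, or None when no pattern matches (caller decides what to do then).
    for pattern, replacement in mapping.items():
        match = re.fullmatch(pattern, key)
        if match:
            return replacement(match) if callable(replacement) else re.sub(pattern, replacement, key)
    return None

# "pretrained.blocks.3.norm1.weight" -> "backbone.encoder.layer.3.norm1.weight"
print(convert_key("pretrained.blocks.3.norm1.weight", MAPPING))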
- -import argparse -import gc -import os -from typing import Optional - -import regex as re -import torch -from huggingface_hub import hf_hub_download - -from transformers import ( - DepthProConfig, - DepthProForDepthEstimation, - DepthProImageProcessorFast, -) - - -# fmt: off -ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - - # encoder - r"encoder.(patch|image)_encoder.cls_token": r"depth_pro.encoder.\1_encoder.model.embeddings.cls_token", - r"encoder.(patch|image)_encoder.pos_embed": r"depth_pro.encoder.\1_encoder.model.embeddings.position_embeddings", - r"encoder.(patch|image)_encoder.patch_embed.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.embeddings.patch_embeddings.projection.\2", - r"encoder.(patch|image)_encoder.blocks.(\d+).norm(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.norm\3.\4", - r"encoder.(patch|image)_encoder.blocks.(\d+).attn.qkv.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.attention.attention.(query|key|value).\3", - r"encoder.(patch|image)_encoder.blocks.(\d+).attn.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.attention.output.dense.\3", - r"encoder.(patch|image)_encoder.blocks.(\d+).ls(\d+).gamma": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.layer_scale\3.lambda1", - r"encoder.(patch|image)_encoder.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.mlp.fc\3.\4", - r"encoder.(patch|image)_encoder.norm.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.layernorm.\2", - r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.neck.fuse_image_with_low_res.\1", - - # fov - r"fov.encoder.0.cls_token": r"fov_model.fov_encoder.model.embeddings.cls_token", - r"fov.encoder.0.pos_embed": r"fov_model.fov_encoder.model.embeddings.position_embeddings", - r"fov.encoder.0.patch_embed.proj.(weight|bias)": r"fov_model.fov_encoder.model.embeddings.patch_embeddings.projection.\1", - r"fov.encoder.0.blocks.(\d+).norm(\d+).(weight|bias)": r"fov_model.fov_encoder.model.encoder.layer.\1.norm\2.\3", - r"fov.encoder.0.blocks.(\d+).attn.qkv.(weight|bias)": r"fov_model.fov_encoder.model.encoder.layer.\1.attention.attention.(query|key|value).\2", - r"fov.encoder.0.blocks.(\d+).attn.proj.(weight|bias)": r"fov_model.fov_encoder.model.encoder.layer.\1.attention.output.dense.\2", - r"fov.encoder.0.blocks.(\d+).ls(\d+).gamma": r"fov_model.fov_encoder.model.encoder.layer.\1.layer_scale\2.lambda1", - r"fov.encoder.0.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"fov_model.fov_encoder.model.encoder.layer.\1.mlp.fc\2.\3", - r"fov.encoder.0.norm.(weight|bias)": r"fov_model.fov_encoder.model.layernorm.\1", - r"fov.downsample.0.(weight|bias)": r"fov_model.conv.\1", - r"fov.encoder.1.(weight|bias)": r"fov_model.fov_encoder.neck.\1", - r"fov.head.(\d+).(weight|bias)": r"fov_model.head.layers.\1.\2", - - # head - r"head.(\d+).(weight|bias)": r"head.layers.\1.\2", - - # upsamples - r"encoder.upsample_lowres.(weight|bias)": r"depth_pro.neck.feature_upsample.image_block.layers.0.\1", - r"encoder.upsample_latent(\d+).(\d+).(weight|bias)": lambda match: ( - f"depth_pro.neck.feature_upsample.intermediate.{1-int(match.group(1))}.layers.{match.group(2)}.{match.group(3)}" - ), - r"encoder.upsample(\d+).(\d+).(weight|bias)": lambda match: ( - f"depth_pro.neck.feature_upsample.scaled_images.{2-int(match.group(1))}.layers.{match.group(2)}.{match.group(3)}" - ), - - # projections between encoder and fusion - r"decoder.convs.(\d+).weight": lambda match: ( - 
f"depth_pro.neck.feature_projection.projections.{4-int(match.group(1))}.weight" - ), - - # fusion stage - r"decoder.fusions.([1234]).resnet(\d+).residual.(\d+).(weight|bias)": lambda match: ( - f"fusion_stage.intermediate.{4-int(match.group(1))}.residual_layer{match.group(2)}.convolution{(int(match.group(3))+1)//2}.{match.group(4)}" - ), - r"decoder.fusions.0.resnet(\d+).residual.(\d+).(weight|bias)": lambda match: ( - f"fusion_stage.final.residual_layer{match.group(1)}.convolution{(int(match.group(2))+1)//2}.{match.group(3)}" - ), - r"decoder.fusions.([1234]).out_conv.(weight|bias)": lambda match: ( - f"fusion_stage.intermediate.{4-int(match.group(1))}.projection.{match.group(2)}" - ), - r"decoder.fusions.0.out_conv.(weight|bias)": lambda match: ( - f"fusion_stage.final.projection.{match.group(1)}" - ), - r"decoder.fusions.(\d+).deconv.(weight|bias)": lambda match: ( - f"fusion_stage.intermediate.{4-int(match.group(1))}.deconv.{match.group(2)}" - ), -} -# fmt: on - - -def convert_old_keys_to_new_keys(state_dict_keys: Optional[dict] = None): - output_dict = {} - if state_dict_keys is not None: - old_text = "\n".join(state_dict_keys) - new_text = old_text - for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): - if replacement is None: - new_text = re.sub(pattern, "", new_text) # an empty line - continue - new_text = re.sub(pattern, replacement, new_text) - output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) - return output_dict - - -def get_qkv_state_dict(key, parameter): - """ - new key which looks like this - xxxx.(q|k|v).xxx (m, n) - - is converted to - xxxx.q.xxxx (m//3, n) - xxxx.k.xxxx (m//3, n) - xxxx.v.xxxx (m//3, n) - """ - qkv_state_dict = {} - placeholder = re.search(r"(\(.*?\))", key).group(1) # finds "(query|key|value)" - replacements_keys = placeholder[1:-1].split("|") # creates ['query', 'key', 'value'] - replacements_vals = torch.split( - parameter, split_size_or_sections=parameter.size(0) // len(replacements_keys), dim=0 - ) - for replacement_key, replacement_val in zip(replacements_keys, replacements_vals): - qkv_state_dict[key.replace(placeholder, replacement_key)] = replacement_val - return qkv_state_dict - - -def write_model( - hf_repo_id: str, - output_dir: str, - safe_serialization: bool = True, -): - os.makedirs(output_dir, exist_ok=True) - - # ------------------------------------------------------------ - # Create and save config - # ------------------------------------------------------------ - - # create config - backbone_config = { - "model_type": "dinov2", - "num_hidden_layers": 24, - "patch_size": 16, - "hidden_size": 1024, - "num_attention_heads": 16, - "image_size": 384, - "use_mask_token": False, - } - config = DepthProConfig( - # original implementation uses same config for all 3 models - image_model_config=backbone_config, - patch_model_config=backbone_config, - fov_model_config=backbone_config, - use_fov_model=True, - ) - - # save config - config.save_pretrained(output_dir) - print("Model config saved successfully...") - - # ------------------------------------------------------------ - # Convert weights - # ------------------------------------------------------------ - - # download and load state_dict from hf repo - file_path = hf_hub_download(hf_repo_id, "depth_pro.pt") - loaded = torch.load(file_path, weights_only=True) - - print("Converting model...") - all_keys = list(loaded.keys()) - new_keys = convert_old_keys_to_new_keys(all_keys) - - state_dict = {} - for key in all_keys: - new_key = new_keys[key] - 
current_parameter = loaded.pop(key) - - if "qkv" in key: - qkv_state_dict = get_qkv_state_dict(new_key, current_parameter) - state_dict.update(qkv_state_dict) - else: - state_dict[new_key] = current_parameter - - print("Loading the checkpoint in a DepthPro model.") - model = DepthProForDepthEstimation(config) - model.load_state_dict(state_dict, strict=True, assign=True) - print("Checkpoint loaded successfully.") - - print("Saving the model.") - model.save_pretrained(output_dir, safe_serialization=safe_serialization) - del state_dict, model - - # Safety check: reload the converted model - gc.collect() - print("Reloading the model to check if it's saved correctly.") - model = DepthProForDepthEstimation.from_pretrained(output_dir, device_map="auto") - print("Model reloaded successfully.") - return model - - -def write_image_processor(output_dir: str): - image_processor = DepthProImageProcessorFast() - image_processor.save_pretrained(output_dir) - return image_processor - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--hf_repo_id", - default="apple/DepthPro", - help="Location of official weights from apple on HF", - ) - parser.add_argument( - "--output_dir", - default="apple_DepthPro", - help="Location to write the converted model and processor", - ) - parser.add_argument( - "--safe_serialization", default=True, type=bool, help="Whether or not to save using `safetensors`." - ) - parser.add_argument( - "--push_to_hub", - action=argparse.BooleanOptionalAction, - help="Whether or not to push the converted model to the huggingface hub.", - ) - parser.add_argument( - "--hub_repo_id", - default="apple/DepthPro-hf", - help="Huggingface hub repo to write the converted model and processor", - ) - args = parser.parse_args() - - model = write_model( - hf_repo_id=args.hf_repo_id, - output_dir=args.output_dir, - safe_serialization=args.safe_serialization, - ) - - image_processor = write_image_processor( - output_dir=args.output_dir, - ) - - if args.push_to_hub: - print("Pushing to hub...") - model.push_to_hub(args.hub_repo_id) - image_processor.push_to_hub(args.hub_repo_id) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index 76c1a53e0073..bc621e0ffc26 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -30,7 +30,6 @@ from ...utils import ( TensorType, auto_docstring, - is_torchvision_v2_available, logging, requires_backends, ) @@ -41,10 +40,7 @@ from .modeling_depth_pro import DepthProDepthEstimatorOutput -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F +from torchvision.transforms.v2 import functional as F logger = logging.get_logger(__name__) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 52de04d42df7..7c32703b7c25 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -299,7 +299,6 @@ def forward( scaled_images_features = [] for i in range(self.n_scaled_images): hidden_state = scaled_images_last_hidden_state[i] - batch_size = batch_size padding = torch_int(self.merge_padding_value * (1 / self.scaled_images_ratios[i])) output_height = base_height * 2**i 
output_width = base_width * 2**i diff --git a/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 8a7a2e0e0af8..000000000000 --- a/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,277 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DETR checkpoints with timm backbone.""" - -import argparse -import json -from collections import OrderedDict -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DetrConfig, DetrForObjectDetection, DetrForSegmentation, DetrImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -# here we list all keys to be renamed (original name on the left, our name on the right) -rename_keys = [] -for i in range(6): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append( - (f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", f"encoder.layers.{i}.self_attn.out_proj.weight") - ) - rename_keys.append( - (f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias") - ) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias")) - rename_keys.append( - (f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight") - ) - rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias")) - # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms - rename_keys.append( - (f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"decoder.layers.{i}.self_attn.out_proj.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias") - ) - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.multihead_attn.out_proj.weight", - f"decoder.layers.{i}.encoder_attn.out_proj.weight", - ) - ) - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.multihead_attn.out_proj.bias", - f"decoder.layers.{i}.encoder_attn.out_proj.bias", - ) - ) - 
rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")) - -# convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads -rename_keys.extend( - [ - ("input_proj.weight", "input_projection.weight"), - ("input_proj.bias", "input_projection.bias"), - ("query_embed.weight", "query_position_embeddings.weight"), - ("transformer.decoder.norm.weight", "decoder.layernorm.weight"), - ("transformer.decoder.norm.bias", "decoder.layernorm.bias"), - ("class_embed.weight", "class_labels_classifier.weight"), - ("class_embed.bias", "class_labels_classifier.bias"), - ("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"), - ("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"), - ("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"), - ("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"), - ("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"), - ("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"), - ] -) - - -def rename_key(state_dict, old, new): - val = state_dict.pop(old) - state_dict[new] = val - - -def rename_backbone_keys(state_dict): - new_state_dict = OrderedDict() - for key, value in state_dict.items(): - if "backbone.0.body" in key: - new_key = key.replace("backbone.0.body", "backbone.conv_encoder.model") - new_state_dict[new_key] = value - else: - new_state_dict[key] = value - - return new_state_dict - - -def read_in_q_k_v(state_dict, is_panoptic=False): - prefix = "" - if is_panoptic: - prefix = "detr." 
- - # first: transformer encoder - for i in range(6): - # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - # next: transformer decoder (which is a bit more complex because it also includes cross-attention) - for i in range(6): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - # read in weights + bias of input projection layer of cross-attention - in_proj_weight_cross_attn = state_dict.pop( - f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_weight" - ) - in_proj_bias_cross_attn = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_bias") - # next, add query, keys and values (in that order) of cross-attention to the state dict - state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.weight"] = in_proj_weight_cross_attn[:256, :] - state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.bias"] = in_proj_bias_cross_attn[:256] - state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.weight"] = in_proj_weight_cross_attn[256:512, :] - state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.bias"] = in_proj_bias_cross_attn[256:512] - state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.weight"] = in_proj_weight_cross_attn[-256:, :] - state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.bias"] = in_proj_bias_cross_attn[-256:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_detr_checkpoint(model_name, pytorch_dump_folder_path): - """ - Copy/paste/tweak model's weights to our DETR structure. 
- """ - - # load default config - config = DetrConfig() - # set backbone and dilation attributes - if "resnet101" in model_name: - config.backbone = "resnet101" - if "dc5" in model_name: - config.dilation = True - is_panoptic = "panoptic" in model_name - if is_panoptic: - config.num_labels = 250 - else: - config.num_labels = 91 - repo_id = "huggingface/label-files" - filename = "coco-detection-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - # load image processor - format = "coco_panoptic" if is_panoptic else "coco_detection" - image_processor = DetrImageProcessor(format=format) - - # prepare image - img = prepare_img() - encoding = image_processor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - logger.info(f"Converting model {model_name}...") - - # load original model from torch hub - detr = torch.hub.load("facebookresearch/detr", model_name, pretrained=True).eval() - state_dict = detr.state_dict() - # rename keys - for src, dest in rename_keys: - if is_panoptic: - src = "detr." + src - rename_key(state_dict, src, dest) - state_dict = rename_backbone_keys(state_dict) - # query, key and value matrices need special treatment - read_in_q_k_v(state_dict, is_panoptic=is_panoptic) - # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them - prefix = "detr.model." if is_panoptic else "model." - for key in state_dict.copy(): - if is_panoptic: - if ( - key.startswith("detr") - and not key.startswith("class_labels_classifier") - and not key.startswith("bbox_predictor") - ): - val = state_dict.pop(key) - state_dict["detr.model" + key[4:]] = val - elif "class_labels_classifier" in key or "bbox_predictor" in key: - val = state_dict.pop(key) - state_dict["detr." + key] = val - elif key.startswith("bbox_attention") or key.startswith("mask_head"): - continue - else: - val = state_dict.pop(key) - state_dict[prefix + key] = val - else: - if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"): - val = state_dict.pop(key) - state_dict[prefix + key] = val - # finally, create HuggingFace model and load state dict - model = DetrForSegmentation(config) if is_panoptic else DetrForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - # verify our conversion - original_outputs = detr(pixel_values) - outputs = model(pixel_values) - assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-4) - assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-4) - if is_panoptic: - assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4) - - # Save model and image processor - logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - image_processor.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name", default="detr_resnet50", type=str, help="Name of the DETR model you'd like to convert." - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." 
- ) - args = parser.parse_args() - convert_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/detr/convert_detr_to_pytorch.py b/src/transformers/models/detr/convert_detr_to_pytorch.py deleted file mode 100644 index ffc755074d50..000000000000 --- a/src/transformers/models/detr/convert_detr_to_pytorch.py +++ /dev/null @@ -1,385 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DETR checkpoints with native (Transformers) backbone.""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DetrConfig, DetrForObjectDetection, DetrForSegmentation, DetrImageProcessor, ResNetConfig -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_detr_config(model_name): - # initialize config - if "resnet-50" in model_name: - backbone_config = ResNetConfig.from_pretrained("microsoft/resnet-50") - elif "resnet-101" in model_name: - backbone_config = ResNetConfig.from_pretrained("microsoft/resnet-101") - else: - raise ValueError("Model name should include either resnet50 or resnet101") - - config = DetrConfig(use_timm_backbone=False, backbone_config=backbone_config) - - # set label attributes - is_panoptic = "panoptic" in model_name - if is_panoptic: - config.num_labels = 250 - else: - config.num_labels = 91 - repo_id = "huggingface/label-files" - filename = "coco-detection-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - return config, is_panoptic - - -def create_rename_keys(config): - # here we list all keys to be renamed (original name on the left, our name on the right) - rename_keys = [] - - # stem - # fmt: off - rename_keys.append(("backbone.0.body.conv1.weight", "backbone.conv_encoder.model.embedder.embedder.convolution.weight")) - rename_keys.append(("backbone.0.body.bn1.weight", "backbone.conv_encoder.model.embedder.embedder.normalization.weight")) - rename_keys.append(("backbone.0.body.bn1.bias", "backbone.conv_encoder.model.embedder.embedder.normalization.bias")) - rename_keys.append(("backbone.0.body.bn1.running_mean", "backbone.conv_encoder.model.embedder.embedder.normalization.running_mean")) - rename_keys.append(("backbone.0.body.bn1.running_var", "backbone.conv_encoder.model.embedder.embedder.normalization.running_var")) - # stages - for stage_idx in range(len(config.backbone_config.depths)): - for layer_idx in range(config.backbone_config.depths[stage_idx]): - # shortcut - if layer_idx == 0: - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.0.weight", - 
f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.convolution.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.weight", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.bias", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.bias", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_mean", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_mean", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_var", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_var", - ) - ) - # 3 convs - for i in range(3): - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.conv{i+1}.weight", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.convolution.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.weight", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.bias", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.bias", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_mean", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_mean", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_var", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_var", - ) - ) - # fmt: on - - for i in range(config.encoder_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append( - ( - f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", - f"encoder.layers.{i}.self_attn.out_proj.weight", - ) - ) - rename_keys.append( - (f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias") - ) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias")) - rename_keys.append( - (f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight") - ) - rename_keys.append( - (f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias") - ) - rename_keys.append( - (f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight") - ) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias")) - # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 
layernorms - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", - f"decoder.layers.{i}.self_attn.out_proj.weight", - ) - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias") - ) - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.multihead_attn.out_proj.weight", - f"decoder.layers.{i}.encoder_attn.out_proj.weight", - ) - ) - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.multihead_attn.out_proj.bias", - f"decoder.layers.{i}.encoder_attn.out_proj.bias", - ) - ) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")) - - # convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads - rename_keys.extend( - [ - ("input_proj.weight", "input_projection.weight"), - ("input_proj.bias", "input_projection.bias"), - ("query_embed.weight", "query_position_embeddings.weight"), - ("transformer.decoder.norm.weight", "decoder.layernorm.weight"), - ("transformer.decoder.norm.bias", "decoder.layernorm.bias"), - ("class_embed.weight", "class_labels_classifier.weight"), - ("class_embed.bias", "class_labels_classifier.bias"), - ("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"), - ("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"), - ("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"), - ("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"), - ("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"), - ("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"), - ] - ) - - return rename_keys - - -def rename_key(state_dict, old, new): - val = state_dict.pop(old) - state_dict[new] = val - - -def read_in_q_k_v(state_dict, is_panoptic=False): - prefix = "" - if is_panoptic: - prefix = "detr." 
- - # first: transformer encoder - for i in range(6): - # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - # next: transformer decoder (which is a bit more complex because it also includes cross-attention) - for i in range(6): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - # read in weights + bias of input projection layer of cross-attention - in_proj_weight_cross_attn = state_dict.pop( - f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_weight" - ) - in_proj_bias_cross_attn = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_bias") - # next, add query, keys and values (in that order) of cross-attention to the state dict - state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.weight"] = in_proj_weight_cross_attn[:256, :] - state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.bias"] = in_proj_bias_cross_attn[:256] - state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.weight"] = in_proj_weight_cross_attn[256:512, :] - state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.bias"] = in_proj_bias_cross_attn[256:512] - state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.weight"] = in_proj_weight_cross_attn[-256:, :] - state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.bias"] = in_proj_bias_cross_attn[-256:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_detr_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our DETR structure. 
- """ - - # load default config - config, is_panoptic = get_detr_config(model_name) - - # load original model from torch hub - model_name_to_original_name = { - "detr-resnet-50": "detr_resnet50", - "detr-resnet-101": "detr_resnet101", - } - logger.info(f"Converting model {model_name}...") - detr = torch.hub.load("facebookresearch/detr", model_name_to_original_name[model_name], pretrained=True).eval() - state_dict = detr.state_dict() - # rename keys - for src, dest in create_rename_keys(config): - if is_panoptic: - src = "detr." + src - rename_key(state_dict, src, dest) - # query, key and value matrices need special treatment - read_in_q_k_v(state_dict, is_panoptic=is_panoptic) - # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them - prefix = "detr.model." if is_panoptic else "model." - for key in state_dict.copy(): - if is_panoptic: - if ( - key.startswith("detr") - and not key.startswith("class_labels_classifier") - and not key.startswith("bbox_predictor") - ): - val = state_dict.pop(key) - state_dict["detr.model" + key[4:]] = val - elif "class_labels_classifier" in key or "bbox_predictor" in key: - val = state_dict.pop(key) - state_dict["detr." + key] = val - elif key.startswith("bbox_attention") or key.startswith("mask_head"): - continue - else: - val = state_dict.pop(key) - state_dict[prefix + key] = val - else: - if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"): - val = state_dict.pop(key) - state_dict[prefix + key] = val - - # finally, create HuggingFace model and load state dict - model = DetrForSegmentation(config) if is_panoptic else DetrForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - - # verify our conversion on an image - format = "coco_panoptic" if is_panoptic else "coco_detection" - processor = DetrImageProcessor(format=format) - - encoding = processor(images=prepare_img(), return_tensors="pt") - pixel_values = encoding["pixel_values"] - - original_outputs = detr(pixel_values) - outputs = model(pixel_values) - - assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-3) - assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-3) - if is_panoptic: - assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - # Save model and image processor - logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - # Upload model and image processor to the hub - logger.info("Uploading PyTorch model and image processor to the hub...") - model.push_to_hub(f"nielsr/{model_name}") - processor.push_to_hub(f"nielsr/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name", - default="detr-resnet-50", - type=str, - choices=["detr-resnet-50", "detr-resnet-101"], - help="Name of the DETR model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." 
- ) - parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to the hub or not.") - args = parser.parse_args() - convert_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/detr/image_processing_detr_fast.py b/src/transformers/models/detr/image_processing_detr_fast.py index 96a89a98074c..ffe040898497 100644 --- a/src/transformers/models/detr/image_processing_detr_fast.py +++ b/src/transformers/models/detr/image_processing_detr_fast.py @@ -23,6 +23,7 @@ import torch from torch import nn from torchvision.io import read_image +from torchvision.transforms.v2 import functional as F from ...image_processing_utils import BatchFeature, get_size_dict from ...image_processing_utils_fast import ( @@ -49,7 +50,6 @@ from ...utils import ( TensorType, auto_docstring, - is_torchvision_v2_available, logging, ) from ...utils.import_utils import requires @@ -61,12 +61,6 @@ ) -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - - logger = logging.get_logger(__name__) SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) @@ -450,13 +444,7 @@ def resize_annotation( resample (`InterpolationMode`, defaults to `F.InterpolationMode.NEAREST_EXACT`): The resampling filter to use when resizing the masks. """ - interpolation = ( - interpolation - if interpolation is not None - else F.InterpolationMode.NEAREST_EXACT - if is_torchvision_v2_available() - else F.InterpolationMode.NEAREST - ) + interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST_EXACT ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)] new_annotation = {} diff --git a/src/transformers/models/dia/convert_dia_to_hf.py b/src/transformers/models/dia/convert_dia_to_hf.py deleted file mode 100644 index 3a33860f6be9..000000000000 --- a/src/transformers/models/dia/convert_dia_to_hf.py +++ /dev/null @@ -1,199 +0,0 @@ -# coding=utf-8 -# Copyright 2025 The Nari Labs and HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
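The two DETR converters removed above give the attention weights special treatment: torch.nn.MultiheadAttention stores query/key/value as a single fused in_proj matrix and bias, which read_in_q_k_v slices into the separate q/k/v projections the Transformers model expects. A standalone sketch of that split, using d = 256 because that is the hidden size hard-coded in the slices above; the random tensors are placeholders only:

import torch

d = 256  # DETR hidden size, matching the [:256], [256:512], [-256:] slices above
in_proj_weight = torch.randn(3 * d, d)  # fused (q; k; v) projection as stored by nn.MultiheadAttention
in_proj_bias = torch.randn(3 * d)

# Same slicing order as read_in_q_k_v: query, then key, then value.
q_w, k_w, v_w = in_proj_weight[:d], in_proj_weight[d : 2 * d], in_proj_weight[-d:]
q_b, k_b, v_b = in_proj_bias[:d], in_proj_bias[d : 2 * d], in_proj_bias[-d:]

assert q_w.shape == k_w.shape == v_w.shape == (d, d)
assert q_b.shape == k_b.shape == v_b.shape == (d,)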
-"""Converts a Dia model in Nari Labs format to Hugging Face format.""" - -import argparse -import os -import re - -import torch -from huggingface_hub import snapshot_download -from safetensors.torch import load_file - -from transformers import ( - DacModel, - DiaConfig, - DiaFeatureExtractor, - DiaForConditionalGeneration, - DiaProcessor, - DiaTokenizer, - GenerationConfig, -) -from transformers.utils.import_utils import _is_package_available - - -# Provide just the list of layer keys you want to fix -shape_mappings = [ - "encoder.layers.*.mlp.gate_up_proj.weight", - "encoder.layers.*.mlp.down_proj.weight", - "encoder.layers.*.self_attention.q_proj.weight", - "encoder.layers.*.self_attention.k_proj.weight", - "encoder.layers.*.self_attention.v_proj.weight", - "encoder.layers.*.self_attention.o_proj.weight", - "decoder.layers.*.mlp.gate_up_proj.weight", - "decoder.layers.*.mlp.down_proj.weight", - "decoder.layers.*.self_attention.q_proj.weight", - "decoder.layers.*.self_attention.k_proj.weight", - "decoder.layers.*.self_attention.v_proj.weight", - "decoder.layers.*.self_attention.o_proj.weight", - "decoder.layers.*.cross_attention.q_proj.weight", - "decoder.layers.*.cross_attention.k_proj.weight", - "decoder.layers.*.cross_attention.v_proj.weight", - "decoder.layers.*.cross_attention.o_proj.weight", - "decoder.logits_dense.weight", -] - -# Provide renamings here -rename_mapping = { - "mlp.wo": "mlp.down_proj", - "mlp.wi_fused": "mlp.gate_up_proj", -} - - -def get_generation_config(config): - model_generation_config = GenerationConfig.from_model_config(config) - model_generation_config._from_model_config = False - model_generation_config.do_sample = True - model_generation_config.top_k = 45 - model_generation_config.top_p = 0.95 - model_generation_config.temperature = 1.2 - model_generation_config.guidance_scale = 3.0 - model_generation_config.max_length = 3072 # Decoder max length - - return model_generation_config - - -def convert_dia_model_to_hf(checkpoint_path, verbose=False): - """ - Converts a Dia model in Nari Labs format to Hugging Face format. - Args: - checkpoint_path (`str`): - Path to the downloaded checkpoints. - verbose (`bool`, *optional*) - Whether to print information during conversion. - """ - # Download from HF Hub if checkpoint_path is None - checkpoint_path = snapshot_download(repo_id=checkpoint_path, allow_patterns=["*.pth", "*.safetensors"]) - print(f"Downloaded checkpoint from Hugging Face Hub: {checkpoint_path}") - - # Initialize base model with default config == 1.6B model - with torch.device("meta"): - hf_model = DiaForConditionalGeneration(config=DiaConfig()) - hf_model_dict = hf_model.state_dict() - hf_model_keys = hf_model_dict.keys() - - # Iterate through dir to catch all respective files - prefers safetensors but allows pt - files = os.listdir(checkpoint_path) - for file in files: - if file.endswith(".safetensors"): - load_function = load_file - elif file.endswith(".pth"): - load_function = torch.load - checkpoint_path = os.path.join(checkpoint_path, files[0]) - nari_state_dict = load_function(checkpoint_path, "cpu") - - # Conversion starts here - converted_state_dict = {} - embeddings = {} - for key, tensor in nari_state_dict.items(): - # add prefix - key = "model." 
+ key - - # rename some weights - for original, rename in rename_mapping.items(): - if original in key: - key = re.sub(original, rename, key) - - # decoder multi channel - if "embeddings" in key: - embeddings_key = key.rsplit(".", 2)[0] + ".embed.weight" - if embeddings_key in embeddings: - embeddings[embeddings_key] += [tensor] - else: - embeddings[embeddings_key] = [tensor] - continue - elif re.sub(r"\d+", "*", key).removeprefix("model.") in shape_mappings: - # add exception to the head - if "logits_dense" in key: - key = re.sub("decoder.logits_dense", "logits_dense", key).removeprefix("model.") - - # dense general - if key in hf_model_keys: - tensor_shape = tensor.shape - target_shape = hf_model_dict[key].shape - try: - tensor = tensor.reshape(target_shape[1], target_shape[0]).T - if verbose: - print(f"{key}: transpose reshaped from {tensor_shape} to {target_shape}") - except Exception as e: - print(f"WARNING: Could not reshape {key}: {e}") - - converted_state_dict[key] = tensor - - # Combining the embeddings as last step - embeddings = {k: torch.cat(v, dim=0) for k, v in embeddings.items()} - converted_state_dict.update(embeddings) - - # Load converted weights into HF model - hf_model.load_state_dict(converted_state_dict, assign=True) - - # Overwrite generation config - hf_model.generation_config = get_generation_config(DiaConfig()) - - return hf_model - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # # Required parameters - parser.add_argument( - "--checkpoint_path", type=str, default="nari-labs/Dia-1.6B", help="Path to the downloaded checkpoints" - ) - parser.add_argument( - "--pytorch_dump_folder_path", default="AntonV/Dia-1.6B", type=str, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--convert_preprocessor", - type=bool, - default=True, - help="Whether or not the preprocessor (tokenizer + feature extractor) should be converted along with the model.", - ) - parser.add_argument( - "--verbose", - type=bool, - default=True, - help="Whether or not to log information during conversion.", - ) - args = parser.parse_args() - - model = convert_dia_model_to_hf(args.checkpoint_path, args.verbose) - if args.convert_preprocessor: - try: - if not _is_package_available("tiktoken"): - raise ModuleNotFoundError( - """`tiktoken` is not installed, use `pip install tiktoken` to convert the tokenizer""" - ) - except Exception as e: - print(e) - else: - processor = DiaProcessor( - DiaFeatureExtractor(sampling_rate=44100, hop_length=512), - DiaTokenizer(), - DacModel.from_pretrained("descript/dac_44khz"), - ) - processor.save_pretrained(args.pytorch_dump_folder_path) - - model.save_pretrained(args.pytorch_dump_folder_path) - print(f"Saved converted checkpoint to {args.pytorch_dump_folder_path}") diff --git a/src/transformers/models/dia/generation_dia.py b/src/transformers/models/dia/generation_dia.py index bf18c775eed6..c297de7203d4 100644 --- a/src/transformers/models/dia/generation_dia.py +++ b/src/transformers/models/dia/generation_dia.py @@ -109,7 +109,7 @@ def _get_logits_processor( return merged_processors def _prepare_generation_config( - self, generation_config: Optional[GenerationConfig], use_model_defaults: Optional[bool] = None, **kwargs: dict + self, generation_config: Optional[GenerationConfig], use_model_defaults: Optional[bool] = None, **kwargs: Any ) -> tuple[GenerationConfig, dict]: generation_config, model_kwargs = super()._prepare_generation_config( generation_config, use_model_defaults, **kwargs diff --git 
a/src/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 03f38084cfbf..000000000000 --- a/src/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os - -import torch - -from transformers.utils import WEIGHTS_NAME - - -DIALOGPT_MODELS = ["small", "medium", "large"] - -OLD_KEY = "lm_head.decoder.weight" -NEW_KEY = "lm_head.weight" - - -def convert_dialogpt_checkpoint(checkpoint_path: str, pytorch_dump_folder_path: str): - d = torch.load(checkpoint_path, weights_only=True) - d[NEW_KEY] = d.pop(OLD_KEY) - os.makedirs(pytorch_dump_folder_path, exist_ok=True) - torch.save(d, os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--dialogpt_path", default=".", type=str) - args = parser.parse_args() - for MODEL in DIALOGPT_MODELS: - checkpoint_path = os.path.join(args.dialogpt_path, f"{MODEL}_ft.pkl") - pytorch_dump_folder_path = f"./DialoGPT-{MODEL}" - convert_dialogpt_checkpoint( - checkpoint_path, - pytorch_dump_folder_path, - ) diff --git a/src/transformers/models/diffllama/modular_diffllama.py b/src/transformers/models/diffllama/modular_diffllama.py index fc0b7a9172d3..253b99edff0d 100644 --- a/src/transformers/models/diffllama/modular_diffllama.py +++ b/src/transformers/models/diffllama/modular_diffllama.py @@ -439,7 +439,7 @@ class DiffLlamaForTokenClassification(LlamaForTokenClassification): __all__ = [ "DiffLlamaPreTrainedModel", - "DiffLlamaModel", # noqa: F822 + "DiffLlamaModel", "DiffLlamaForCausalLM", "DiffLlamaForSequenceClassification", "DiffLlamaForQuestionAnswering", diff --git a/src/transformers/models/dinov2/convert_dinov2_to_hf.py b/src/transformers/models/dinov2/convert_dinov2_to_hf.py deleted file mode 100644 index d716191b2fcb..000000000000 --- a/src/transformers/models/dinov2/convert_dinov2_to_hf.py +++ /dev/null @@ -1,285 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DINOv2 checkpoints from the original repository. 
- -URL: https://github.com/facebookresearch/dinov2/tree/main -""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -import torch.nn as nn -from huggingface_hub import hf_hub_download -from PIL import Image -from torchvision import transforms - -from transformers import BitImageProcessor, Dinov2Config, Dinov2ForImageClassification, Dinov2Model -from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dinov2_config(model_name, image_classifier=False): - config = Dinov2Config(image_size=518, patch_size=14) - - # size of the architecture - if "vits" in model_name: - config.hidden_size = 384 - config.num_attention_heads = 6 - elif "vitb" in model_name: - pass - elif "vitl" in model_name: - config.hidden_size = 1024 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - elif "vitg" in model_name: - config.use_swiglu_ffn = True - config.hidden_size = 1536 - config.num_hidden_layers = 40 - config.num_attention_heads = 24 - else: - raise ValueError("Model not supported") - - if image_classifier: - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - config.num_labels = 1000 - config.id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - config.id2label = {int(k): v for k, v in config.id2label.items()} - - return config - - -def create_rename_keys(config): - rename_keys = [] - # fmt: off - - # patch embedding layer - rename_keys.append(("cls_token", "embeddings.cls_token")) - rename_keys.append(("mask_token", "embeddings.mask_token")) - rename_keys.append(("pos_embed", "embeddings.position_embeddings")) - rename_keys.append(("patch_embed.proj.weight", "embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("patch_embed.proj.bias", "embeddings.patch_embeddings.projection.bias")) - - for i in range(config.num_hidden_layers): - # layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"encoder.layer.{i}.norm1.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"encoder.layer.{i}.norm1.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"encoder.layer.{i}.norm2.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"encoder.layer.{i}.norm2.bias")) - # MLP - if config.use_swiglu_ffn: - rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"encoder.layer.{i}.mlp.w12.weight")) - rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"encoder.layer.{i}.mlp.w12.bias")) - rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"encoder.layer.{i}.mlp.w3.weight")) - rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"encoder.layer.{i}.mlp.w3.bias")) - else: - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"encoder.layer.{i}.mlp.fc1.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"encoder.layer.{i}.mlp.fc1.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"encoder.layer.{i}.mlp.fc2.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"encoder.layer.{i}.mlp.fc2.bias")) - # layerscale - rename_keys.append((f"blocks.{i}.ls1.gamma", f"encoder.layer.{i}.layer_scale1.lambda1")) - rename_keys.append((f"blocks.{i}.ls2.gamma", f"encoder.layer.{i}.layer_scale2.lambda1")) - # attention projection layer - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"blocks.{i}.attn.proj.bias", 
f"encoder.layer.{i}.attention.output.dense.bias")) - - # final layernorm - rename_keys.append(("norm.weight", "layernorm.weight")) - rename_keys.append(("norm.bias", "layernorm.bias")) - - # fmt: on - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - for i in range(config.num_hidden_layers): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :] - state_dict[f"encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-config.hidden_size :, :] - state_dict[f"encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - return image - - -@torch.no_grad() -def convert_dinov2_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our DINOv2 structure. 
- """ - - # define default Dinov2 configuration - image_classifier = "1layer" in model_name - config = get_dinov2_config(model_name, image_classifier=image_classifier) - - # load original model from torch hub - original_model = torch.hub.load("facebookresearch/dinov2", model_name.replace("_1layer", "")) - original_model.eval() - - # load state_dict of original model, remove and rename some keys - state_dict = original_model.state_dict() - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config) - - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if "w12" in key: - key = key.replace("w12", "weights_in") - if "w3" in key: - key = key.replace("w3", "weights_out") - state_dict[key] = val - - # load HuggingFace model - if image_classifier: - model = Dinov2ForImageClassification(config).eval() - model.dinov2.load_state_dict(state_dict) - model_name_to_classifier_dict_url = { - "dinov2_vits14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_linear_head.pth", - "dinov2_vitb14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_linear_head.pth", - "dinov2_vitl14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_linear_head.pth", - "dinov2_vitg14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_linear_head.pth", - } - url = model_name_to_classifier_dict_url[model_name] - classifier_state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu") - model.classifier.weight = nn.Parameter(classifier_state_dict["weight"]) - model.classifier.bias = nn.Parameter(classifier_state_dict["bias"]) - else: - model = Dinov2Model(config).eval() - model.load_state_dict(state_dict) - - # load image - image = prepare_img() - - # preprocess image - transformations = transforms.Compose( - [ - transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize( - mean=IMAGENET_DEFAULT_MEAN, # these are RGB mean+std values - std=IMAGENET_DEFAULT_STD, # across a large photo dataset. 
- ), - ] - ) - - original_pixel_values = transformations(image).unsqueeze(0) # insert batch dimension - - processor = BitImageProcessor( - size={"shortest_edge": 256}, - resample=PILImageResampling.BICUBIC, - image_mean=IMAGENET_DEFAULT_MEAN, - image_std=IMAGENET_DEFAULT_STD, - ) - pixel_values = processor(image, return_tensors="pt").pixel_values - - assert torch.allclose(original_pixel_values, pixel_values) - - with torch.no_grad(): - outputs = model(pixel_values, output_hidden_states=True) - original_outputs = original_model(pixel_values) - - # assert values - if image_classifier: - print("Predicted class:") - class_idx = outputs.logits.argmax(-1).item() - print(model.config.id2label[class_idx]) - else: - assert outputs.last_hidden_state[:, 0].shape == original_outputs.shape - assert torch.allclose(outputs.last_hidden_state[:, 0], original_outputs, atol=1e-3) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model_name_to_hf_name = { - "dinov2_vits14": "dinov2-small", - "dinov2_vitb14": "dinov2-base", - "dinov2_vitl14": "dinov2-large", - "dinov2_vitg14": "dinov2-giant", - "dinov2_vits14_1layer": "dinov2-small-imagenet1k-1-layer", - "dinov2_vitb14_1layer": "dinov2-base-imagenet1k-1-layer", - "dinov2_vitl14_1layer": "dinov2-large-imagenet1k-1-layer", - "dinov2_vitg14_1layer": "dinov2-giant-imagenet1k-1-layer", - } - - name = model_name_to_hf_name[model_name] - model.push_to_hub(f"facebook/{name}") - processor.push_to_hub(f"facebook/{name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="dinov2_vitb14", - type=str, - choices=[ - "dinov2_vits14", - "dinov2_vitb14", - "dinov2_vitl14", - "dinov2_vitg14", - "dinov2_vits14_1layer", - "dinov2_vitb14_1layer", - "dinov2_vitl14_1layer", - "dinov2_vitg14_1layer", - ], - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." - ) - - args = parser.parse_args() - convert_dinov2_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/dinov2_with_registers/convert_dinov2_with_registers_to_hf.py b/src/transformers/models/dinov2_with_registers/convert_dinov2_with_registers_to_hf.py deleted file mode 100644 index 0ff2697f7466..000000000000 --- a/src/transformers/models/dinov2_with_registers/convert_dinov2_with_registers_to_hf.py +++ /dev/null @@ -1,291 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DINOv2 with Registers checkpoints from the original repository. - -URL: https://github.com/facebookresearch/dinov2/tree/main -""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -import torch.nn as nn -from huggingface_hub import hf_hub_download -from PIL import Image -from torchvision import transforms - -from transformers import ( - BitImageProcessor, - Dinov2WithRegistersConfig, - Dinov2WithRegistersForImageClassification, - Dinov2WithRegistersModel, -) -from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dinov2_with_registers_config(model_name, image_classifier=False): - config = Dinov2WithRegistersConfig(image_size=518, patch_size=14) - - # size of the architecture - if "vits" in model_name: - config.hidden_size = 384 - config.num_attention_heads = 6 - elif "vitb" in model_name: - pass - elif "vitl" in model_name: - config.hidden_size = 1024 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - elif "vitg" in model_name: - config.use_swiglu_ffn = True - config.hidden_size = 1536 - config.num_hidden_layers = 40 - config.num_attention_heads = 24 - else: - raise ValueError("Model not supported") - - if image_classifier: - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - config.num_labels = 1000 - config.id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - config.id2label = {int(k): v for k, v in config.id2label.items()} - - return config - - -def create_rename_keys(config): - rename_keys = [] - # fmt: off - - # patch embedding layer - rename_keys.append(("cls_token", "embeddings.cls_token")) - rename_keys.append(("mask_token", "embeddings.mask_token")) - rename_keys.append(("pos_embed", "embeddings.position_embeddings")) - rename_keys.append(("register_tokens", "embeddings.register_tokens")) - rename_keys.append(("patch_embed.proj.weight", "embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("patch_embed.proj.bias", "embeddings.patch_embeddings.projection.bias")) - - for i in range(config.num_hidden_layers): - # layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"encoder.layer.{i}.norm1.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"encoder.layer.{i}.norm1.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"encoder.layer.{i}.norm2.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"encoder.layer.{i}.norm2.bias")) - # MLP - if config.use_swiglu_ffn: - rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"encoder.layer.{i}.mlp.w12.weight")) - rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"encoder.layer.{i}.mlp.w12.bias")) - rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"encoder.layer.{i}.mlp.w3.weight")) - rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"encoder.layer.{i}.mlp.w3.bias")) - else: - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"encoder.layer.{i}.mlp.fc1.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"encoder.layer.{i}.mlp.fc1.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"encoder.layer.{i}.mlp.fc2.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"encoder.layer.{i}.mlp.fc2.bias")) - # layerscale - rename_keys.append((f"blocks.{i}.ls1.gamma", 
f"encoder.layer.{i}.layer_scale1.lambda1")) - rename_keys.append((f"blocks.{i}.ls2.gamma", f"encoder.layer.{i}.layer_scale2.lambda1")) - # attention projection layer - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"blocks.{i}.attn.proj.bias", f"encoder.layer.{i}.attention.output.dense.bias")) - - # final layernorm - rename_keys.append(("norm.weight", "layernorm.weight")) - rename_keys.append(("norm.bias", "layernorm.bias")) - - # fmt: on - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - for i in range(config.num_hidden_layers): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :] - state_dict[f"encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-config.hidden_size :, :] - state_dict[f"encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - return image - - -@torch.no_grad() -def convert_dinov2_with_registers_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our Dinov2WithRegisters structure. 
- """ - - # define default Dinov2WithRegisters configuration - image_classifier = "1layer" in model_name - config = get_dinov2_with_registers_config(model_name, image_classifier=image_classifier) - - # load original model from torch hub - original_model = torch.hub.load("facebookresearch/dinov2", model_name.replace("_1layer", "")) - original_model.eval() - - # load state_dict of original model, remove and rename some keys - state_dict = original_model.state_dict() - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config) - - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if "w12" in key: - key = key.replace("w12", "weights_in") - if "w3" in key: - key = key.replace("w3", "weights_out") - state_dict[key] = val - - # load HuggingFace model - if image_classifier: - model = Dinov2WithRegistersForImageClassification(config).eval() - model.dinov2_with_registers.load_state_dict(state_dict) - model_name_to_classifier_dict_url = { - "dinov2_vits14_reg_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_reg4_linear_head.pth", - "dinov2_vitb14_reg_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_reg4_linear_head.pth", - "dinov2_vitl14_reg_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_reg4_linear_head.pth", - "dinov2_vitg14_reg_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_reg4_linear_head.pth", - } - url = model_name_to_classifier_dict_url[model_name] - classifier_state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu") - model.classifier.weight = nn.Parameter(classifier_state_dict["weight"]) - model.classifier.bias = nn.Parameter(classifier_state_dict["bias"]) - else: - model = Dinov2WithRegistersModel(config).eval() - model.load_state_dict(state_dict) - - # load image - image = prepare_img() - - # preprocess image - transformations = transforms.Compose( - [ - transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize( - mean=IMAGENET_DEFAULT_MEAN, # these are RGB mean+std values - std=IMAGENET_DEFAULT_STD, # across a large photo dataset. 
- ), - ] - ) - - original_pixel_values = transformations(image).unsqueeze(0) # insert batch dimension - - processor = BitImageProcessor( - size={"shortest_edge": 256}, - resample=PILImageResampling.BICUBIC, - image_mean=IMAGENET_DEFAULT_MEAN, - image_std=IMAGENET_DEFAULT_STD, - ) - pixel_values = processor(image, return_tensors="pt").pixel_values - - assert torch.allclose(original_pixel_values, pixel_values) - - with torch.no_grad(): - outputs = model(pixel_values, output_hidden_states=True) - original_outputs = original_model(pixel_values) - - # assert values - if image_classifier: - print("Predicted class:") - class_idx = outputs.logits.argmax(-1).item() - print(model.config.id2label[class_idx]) - else: - assert outputs.last_hidden_state[:, 0].shape == original_outputs.shape - assert torch.allclose(outputs.last_hidden_state[:, 0], original_outputs, atol=1e-3) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model_name_to_hf_name = { - "dinov2_vits14_reg": "dinov2-with-registers-small", - "dinov2_vitb14_reg": "dinov2-with-registers-base", - "dinov2_vitl14_reg": "dinov2-with-registers-large", - "dinov2_vitg14_reg": "dinov2-with-registers-giant", - "dinov2_vits14_reg_1layer": "dinov2-with-registers-small-imagenet1k-1-layer", - "dinov2_vitb14_reg_1layer": "dinov2-with-registers-base-imagenet1k-1-layer", - "dinov2_vitl14_reg_1layer": "dinov2-with-registers-large-imagenet1k-1-layer", - "dinov2_vitg14_reg_1layer": "dinov2-with-registers-giant-imagenet1k-1-layer", - } - - name = model_name_to_hf_name[model_name] - model.push_to_hub(f"nielsr/{name}") - processor.push_to_hub(f"nielsr/{name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="dinov2_vits14_reg", - type=str, - choices=[ - "dinov2_vits14_reg", - "dinov2_vitb14_reg", - "dinov2_vitl14_reg", - "dinov2_vitg14_reg", - "dinov2_vits14_reg_1layer", - "dinov2_vitb14_reg_1layer", - "dinov2_vitl14_reg_1layer", - "dinov2_vitg14_reg_1layer", - ], - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." - ) - - args = parser.parse_args() - convert_dinov2_with_registers_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/dinov3_convnext/convert_dinov3_convnext_to_hf.py b/src/transformers/models/dinov3_convnext/convert_dinov3_convnext_to_hf.py deleted file mode 100644 index 0ba200936ebe..000000000000 --- a/src/transformers/models/dinov3_convnext/convert_dinov3_convnext_to_hf.py +++ /dev/null @@ -1,234 +0,0 @@ -# coding=utf-8 -# Copyright 2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DINOv3 checkpoints from the original repository. - -URL: https://github.com/facebookresearch/dinov3/tree/main -""" - -import argparse -import os -import re -from typing import Optional - -import requests -import torch -from huggingface_hub import HfApi, hf_hub_download -from PIL import Image -from torchvision import transforms - -from transformers import DINOv3ConvNextConfig, DINOv3ConvNextModel, DINOv3ViTImageProcessorFast - - -HUB_MODELS = { - "convnext_tiny": "facebook/dinov3-convnext-tiny-pretrain-lvd1689m", - "convnext_small": "facebook/dinov3-convnext-small-pretrain-lvd1689m", - "convnext_base": "facebook/dinov3-convnext-base-pretrain-lvd1689m", - "convnext_large": "facebook/dinov3-convnext-large-pretrain-lvd1689m", -} - -HUB_CHECKPOINTS = { - "convnext_tiny": "dinov3_convnext_tiny_pretrain_lvd1689m-21b726bb.pth", - "convnext_small": "dinov3_convnext_small_pretrain_lvd1689m-296db49d.pth", - "convnext_base": "dinov3_convnext_base_pretrain_lvd1689m-801f2ba9.pth", - "convnext_large": "dinov3_convnext_large_pretrain_lvd1689m-61fa432d.pth", -} - -# fmt: off -ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - r"dwconv": r"depthwise_conv", - r"pwconv": r"pointwise_conv", - r"norm": r"layer_norm", - r"stages.(\d+).(\d+)": r"stages.\1.layers.\2", - r"downsample_layers.(\d+).(\d+)": r"stages.\1.downsample_layers.\2", -} -# fmt: on - - -def get_dinov3_config(model_name: str) -> DINOv3ConvNextConfig: - # size of the architecture - if model_name == "convnext_tiny": - return DINOv3ConvNextConfig( - depths=[3, 3, 9, 3], - hidden_sizes=[96, 192, 384, 768], - ) - elif model_name == "convnext_small": - return DINOv3ConvNextConfig( - depths=[3, 3, 27, 3], - hidden_sizes=[96, 192, 384, 768], - ) - elif model_name == "convnext_base": - return DINOv3ConvNextConfig( - depths=[3, 3, 27, 3], - hidden_sizes=[128, 256, 512, 1024], - ) - elif model_name == "convnext_large": - return DINOv3ConvNextConfig( - depths=[3, 3, 27, 3], - hidden_sizes=[192, 384, 768, 1536], - ) - else: - raise ValueError("Model not supported") - - -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - return image - - -def get_transform(resize_size: int = 224): - to_tensor = transforms.ToTensor() - resize = transforms.Resize((resize_size, resize_size), antialias=True) - normalize = transforms.Normalize( - mean=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), - ) - return transforms.Compose([to_tensor, resize, normalize]) - - -def get_image_processor(resize_size: int = 224): - return DINOv3ViTImageProcessorFast( - do_resize=True, - size={"height": resize_size, "width": resize_size}, - resample=2, # BILINEAR - ) - - -def convert_old_keys_to_new_keys(state_dict_keys: Optional[dict] = None): - """ - This function should be applied only once, on the concatenated keys to efficiently rename using - the key mappings. 
- """ - output_dict = {} - if state_dict_keys is not None: - old_text = "\n".join(state_dict_keys) - new_text = old_text - for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): - if replacement is None: - new_text = re.sub(pattern, "", new_text) # an empty line - continue - new_text = re.sub(pattern, replacement, new_text) - output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) - return output_dict - - -@torch.no_grad() -def convert_and_test_dinov3_checkpoint(args): - expected_outputs = { - "convnext_tiny_cls": [-6.372119, 1.300791, 2.074303, -0.079975, 0.607205], - "convnext_tiny_patch": [0.490530, -3.713466, 1.848513, -1.040319, -1.090818], - "convnext_small_cls": [-0.903914, 1.412183, 0.287465, 0.175296, -2.397940], - "convnext_small_patch": [-1.081114, 0.637362, 3.748765, 0.170179, 1.445153], - "convnext_base_cls": [0.155366, -0.378771, -0.735157, -2.818718, 0.015095], - "convnext_base_patch": [3.039118, 0.778155, -1.961322, -1.607147, -2.411941], - "convnext_large_cls": [-2.219094, -0.594451, -2.300294, -0.957415, -0.520473], - "convnext_large_patch": [-1.477349, -0.217038, -3.128137, 0.418962, 0.334949], - } - model_name = args.model_name - config = get_dinov3_config(model_name) - # print(config) - - model = DINOv3ConvNextModel(config).eval() - state_dict_path = hf_hub_download(repo_id=HUB_MODELS[model_name], filename=HUB_CHECKPOINTS[model_name]) - original_state_dict = torch.load(state_dict_path) - original_keys = list(original_state_dict.keys()) - new_keys = convert_old_keys_to_new_keys(original_keys) - - converted_state_dict = {} - for key in original_keys: - new_key = new_keys[key] - weight_tensor = original_state_dict[key] - if key == "norms.3.weight" or key == "norms.3.bias": - continue - converted_state_dict[new_key] = weight_tensor - model.load_state_dict(converted_state_dict, strict=True) - model = model.eval() - - transform = get_transform() - image_processor = get_image_processor() - image = prepare_img() - - # check preprocessing - original_pixel_values = transform(image).unsqueeze(0) # add batch dimension - inputs = image_processor(image, return_tensors="pt") - - torch.testing.assert_close(original_pixel_values, inputs["pixel_values"], atol=1e-6, rtol=1e-6) - print("Preprocessing looks ok!") - - with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float): - model_output = model(**inputs) - - last_layer_class_token = model_output.pooler_output - last_layer_patch_tokens = model_output.last_hidden_state[:, 1:] - - actual_outputs = {} - actual_outputs[f"{model_name}_cls"] = last_layer_class_token[0, :5].tolist() - actual_outputs[f"{model_name}_patch"] = last_layer_patch_tokens[0, 0, :5].tolist() - - print("Actual: ", [round(x, 6) for x in actual_outputs[f"{model_name}_cls"]]) - print("Expected:", expected_outputs[f"{model_name}_cls"]) - - torch.testing.assert_close( - torch.Tensor(actual_outputs[f"{model_name}_cls"]), - torch.Tensor(expected_outputs[f"{model_name}_cls"]), - atol=1e-3, - rtol=1e-3, - ) - print("Actual: ", [round(x, 6) for x in actual_outputs[f"{model_name}_patch"]]) - print("Expected:", expected_outputs[f"{model_name}_patch"]) - - torch.testing.assert_close( - torch.Tensor(actual_outputs[f"{model_name}_patch"]), - torch.Tensor(expected_outputs[f"{model_name}_patch"]), - atol=1e-3, - rtol=1e-3, - ) - print("Forward pass looks ok!") - - save_dir = os.path.join(args.save_dir, model_name) - os.makedirs(save_dir, exist_ok=True) - model.save_pretrained(save_dir) - image_processor.save_pretrained(save_dir) - print(f"Model 
saved to {save_dir}") - - if args.push_to_hub: - api = HfApi() - repo = HUB_MODELS[model_name] - api.upload_folder(folder_path=save_dir, repo_id=repo, repo_type="model") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model-name", - default="convnext_tiny", - type=str, - choices=["convnext_tiny", "convnext_small", "convnext_base", "convnext_large"], - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--save-dir", - default="converted_models", - type=str, - help="Directory to save the converted model.", - ) - parser.add_argument( - "--push-to-hub", - action="store_true", - help="Push the converted model to the Hugging Face Hub.", - ) - args = parser.parse_args() - convert_and_test_dinov3_checkpoint(args) diff --git a/src/transformers/models/dinov3_vit/convert_dinov3_vit_to_hf.py b/src/transformers/models/dinov3_vit/convert_dinov3_vit_to_hf.py deleted file mode 100644 index b6589e089d95..000000000000 --- a/src/transformers/models/dinov3_vit/convert_dinov3_vit_to_hf.py +++ /dev/null @@ -1,337 +0,0 @@ -# coding=utf-8 -# Copyright 2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DINOv3 checkpoints from the original repository. 
- -URL: https://github.com/facebookresearch/dinov3/tree/main -""" - -import argparse -import os -import re -from typing import Optional - -import requests -import torch -from huggingface_hub import HfApi, hf_hub_download -from PIL import Image -from torchvision import transforms - -from transformers import DINOv3ViTConfig, DINOv3ViTImageProcessorFast, DINOv3ViTModel - - -HUB_MODELS = { - "vits16_lvd1689m": "facebook/dinov3-vits16-pretrain-lvd1689m", - "vits16plus_lvd1689m": "facebook/dinov3-vits16plus-pretrain-lvd1689m", - "vitb16_lvd1689m": "facebook/dinov3-vitb16-pretrain-lvd1689m", - "vitl16_lvd1689m": "facebook/dinov3-vitl16-pretrain-lvd1689m", - "vitl16_sat493m": "facebook/dinov3-vitl16-pretrain-sat493m", - "vith16plus_lvd1689m": "facebook/dinov3-vith16plus-pretrain-lvd1689m", - "vit7b16_lvd1689m": "facebook/dinov3-vit7b16-pretrain-lvd1689m", - "vit7b16_sat493m": "facebook/dinov3-vit7b16-pretrain-sat493m", -} - -HUB_CHECKPOINTS = { - "vits16_lvd1689m": "dinov3_vits16_pretrain_lvd1689m-08c60483.pth", - "vits16plus_lvd1689m": "dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pth", - "vitb16_lvd1689m": "dinov3_vitb16_pretrain_lvd1689m-73cec8be.pth", - "vitl16_lvd1689m": "dinov3_vitl16_pretrain_lvd1689m-8aa4cbdd.pth", - "vitl16_sat493m": "dinov3_vitl16_pretrain_sat493m-eadcf0ff.pth", - "vith16plus_lvd1689m": "dinov3_vith16plus_pretrain_lvd1689m-7c1da9a5.pth", - "vit7b16_lvd1689m": "dinov3_vit7b16_pretrain_lvd1689m-a955f4ea.pth", - "vit7b16_sat493m": "dinov3_vit7b16_pretrain_sat493m-a6675841.pth", -} - -# fmt: off -ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - r"cls_token": r"embeddings.cls_token", - r"mask_token": r"embeddings.mask_token", - r"storage_tokens": r"embeddings.register_tokens", - r"patch_embed.proj": r"embeddings.patch_embeddings", - r"periods": r"inv_freq", - r"rope_embed": r"rope_embeddings", - r"blocks.(\d+).attn.proj": r"layer.\1.attention.o_proj", - r"blocks.(\d+).attn.": r"layer.\1.attention.", - r"blocks.(\d+).ls(\d+).gamma": r"layer.\1.layer_scale\2.lambda1", - r"blocks.(\d+).mlp.fc1": r"layer.\1.mlp.up_proj", - r"blocks.(\d+).mlp.fc2": r"layer.\1.mlp.down_proj", - r"blocks.(\d+).mlp": r"layer.\1.mlp", - r"blocks.(\d+).norm": r"layer.\1.norm", - r"w1": r"gate_proj", - r"w2": r"up_proj", - r"w3": r"down_proj", -} -# fmt: on - - -def convert_old_keys_to_new_keys(state_dict_keys: Optional[dict] = None): - """ - This function should be applied only once, on the concatenated keys to efficiently rename using - the key mappings. 
- """ - output_dict = {} - if state_dict_keys is not None: - old_text = "\n".join(state_dict_keys) - new_text = old_text - for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): - if replacement is None: - new_text = re.sub(pattern, "", new_text) # an empty line - continue - new_text = re.sub(pattern, replacement, new_text) - output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) - return output_dict - - -def split_qkv(state_dict: dict): - keys = [x for x in state_dict.keys() if "qkv" in x] - for key in keys: - qkv = state_dict.pop(key) - q, k, v = torch.chunk(qkv, 3, dim=0) - state_dict[key.replace("qkv", "q_proj")] = q - state_dict[key.replace("qkv", "k_proj")] = k - state_dict[key.replace("qkv", "v_proj")] = v - return state_dict - - -def get_dinov3_config(model_name: str) -> DINOv3ViTConfig: - # size of the architecture - if model_name == "vits16_lvd1689m": - return DINOv3ViTConfig( - patch_size=16, - hidden_size=384, - intermediate_size=1536, - num_hidden_layers=12, - num_attention_heads=6, - proj_bias=True, - num_register_tokens=4, - use_gated_mlp=False, - hidden_act="gelu", - ) - elif model_name == "vits16plus_lvd1689m": - return DINOv3ViTConfig( - patch_size=16, - hidden_size=384, - intermediate_size=1536, - num_hidden_layers=12, - num_attention_heads=6, - num_register_tokens=4, - use_gated_mlp=True, - hidden_act="silu", - ) - elif model_name == "vitb16_lvd1689m": - return DINOv3ViTConfig( - patch_size=16, - hidden_size=768, - intermediate_size=3072, - num_hidden_layers=12, - num_attention_heads=12, - proj_bias=True, - num_register_tokens=4, - use_gated_mlp=False, - hidden_act="gelu", - ) - elif model_name in ("vitl16_lvd1689m", "vitl16_sat493m"): - return DINOv3ViTConfig( - patch_size=16, - hidden_size=1024, - intermediate_size=4096, - num_hidden_layers=24, - num_attention_heads=16, - num_register_tokens=4, - use_gated_mlp=False, - hidden_act="gelu", - ) - elif model_name == "vith16plus_lvd1689m": - return DINOv3ViTConfig( - patch_size=16, - hidden_size=1280, - intermediate_size=5120, - num_hidden_layers=32, - num_attention_heads=20, - num_register_tokens=4, - use_gated_mlp=True, - hidden_act="silu", - ) - elif model_name in ("vit7b16_lvd1689m", "vit7b16_sat493m"): - return DINOv3ViTConfig( - patch_size=16, - hidden_size=4096, - intermediate_size=8192, - num_hidden_layers=40, - num_attention_heads=32, - query_bias=False, - value_bias=False, - num_register_tokens=4, - use_gated_mlp=True, - hidden_act="silu", - ) - else: - raise ValueError("Model not supported") - - -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - return image - - -def get_transform(resize_size: int = 224): - to_tensor = transforms.ToTensor() - resize = transforms.Resize((resize_size, resize_size), antialias=True) - normalize = transforms.Normalize( - mean=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), - ) - return transforms.Compose([to_tensor, resize, normalize]) - - -def get_image_processor(resize_size: int = 224): - return DINOv3ViTImageProcessorFast( - do_resize=True, - size={"height": resize_size, "width": resize_size}, - resample=2, # BILINEAR - ) - - -@torch.no_grad() -def convert_and_test_dinov3_checkpoint(args): - expected_outputs = { - "vits16_lvd1689m_cls": [0.463561, -0.415609, 0.408236, -0.126613, -0.286636], - "vits16_lvd1689m_patch": [-0.038754, -0.250895, -0.016392, -0.455473, 0.571582], - "vits16plus_lvd1689m_cls": [-0.471349, -1.365778, -0.317983, 
0.377219, -0.769085], - "vits16plus_lvd1689m_patch": [0.144551, -0.388117, -0.393433, -0.157695, -0.600380], - "vitb16_lvd1689m_cls": [1.034643, -0.180609, -0.341018, -0.066376, -0.011383], - "vitb16_lvd1689m_patch": [-0.082523, -0.456272, -0.728029, -0.430680, -0.152880], - "vitl16_lvd1689m_cls": [0.484527, -0.582214, 0.480636, 0.592040, 0.945166], - "vitl16_lvd1689m_patch": [-0.211367, -0.490863, -0.257131, 0.101763, 0.154511], - "vith16plus_lvd1689m_cls": [-0.064575, -0.148866, -0.621524, 0.634878, 0.152695], - "vith16plus_lvd1689m_patch": [-0.093817, 0.287407, -0.050036, 0.428043, 0.094561], - "vit7b16_lvd1689m_cls": [0.275439, -0.261353, 0.067772, 0.049936, -0.158747], - "vit7b16_lvd1689m_patch": [0.044442, -0.052542, 0.070777, -0.065111, -0.026546], - "vitl16_sat493m_cls": [-0.33235, 0.34052, -0.22087, 0.21434, 0.09003], - "vitl16_sat493m_patch": [0.18488, 0.30309, -0.20689, 0.12848, 0.06207], - "vit7b16_sat493m_cls": [-0.19779, 0.11819, -0.00581, -0.21055, -0.03971], - "vit7b16_sat493m_patch": [-0.12423, 0.07879, -0.10057, 0.02835, -0.11727], - } - - model_name = args.model_name - config = get_dinov3_config(model_name) - - model = DINOv3ViTModel(config).eval() - state_dict_path = hf_hub_download(repo_id=HUB_MODELS[model_name], filename=HUB_CHECKPOINTS[model_name]) - original_state_dict = torch.load(state_dict_path, mmap=True) - - original_state_dict = split_qkv(original_state_dict) - original_keys = list(original_state_dict.keys()) - new_keys = convert_old_keys_to_new_keys(original_keys) - - converted_state_dict = {} - for key in original_keys: - new_key = new_keys[key] - weight_tensor = original_state_dict[key] - - if "bias_mask" in key or "attn.k_proj.bias" in key or "local_cls_norm" in key: - continue - if "embeddings.mask_token" in new_key: - weight_tensor = weight_tensor.unsqueeze(1) - if "inv_freq" in new_key: - continue - - converted_state_dict[new_key] = weight_tensor - - model.load_state_dict(converted_state_dict, strict=True) - model = model.eval() - - transform = get_transform() - image_processor = get_image_processor() - image = prepare_img() - - # check preprocessing - original_pixel_values = transform(image).unsqueeze(0) # add batch dimension - inputs = image_processor(image, return_tensors="pt") - - torch.testing.assert_close(original_pixel_values, inputs["pixel_values"], atol=1e-6, rtol=1e-6) - print("Preprocessing looks ok!") - - with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float): - model_output = model(**inputs) - - last_layer_class_token = model_output.pooler_output - last_layer_patch_tokens = model_output.last_hidden_state[:, config.num_register_tokens + 1 :] - - actual_outputs = {} - actual_outputs[f"{model_name}_cls"] = last_layer_class_token[0, :5].tolist() - actual_outputs[f"{model_name}_patch"] = last_layer_patch_tokens[0, 0, :5].tolist() - - print("Actual: ", [round(x, 6) for x in actual_outputs[f"{model_name}_cls"]]) - print("Expected:", expected_outputs[f"{model_name}_cls"]) - - torch.testing.assert_close( - torch.Tensor(actual_outputs[f"{model_name}_cls"]), - torch.Tensor(expected_outputs[f"{model_name}_cls"]), - atol=1e-3, - rtol=1e-3, - ) - torch.testing.assert_close( - torch.Tensor(actual_outputs[f"{model_name}_patch"]), - torch.Tensor(expected_outputs[f"{model_name}_patch"]), - atol=1e-3, - rtol=1e-3, - ) - print("Forward pass looks ok!") - - save_dir = os.path.join(args.save_dir, model_name) - os.makedirs(save_dir, exist_ok=True) - model.save_pretrained(save_dir) - image_processor.save_pretrained(save_dir) - print(f"Model saved 
to {save_dir}") - - if args.push_to_hub: - api = HfApi() - repo = HUB_MODELS[model_name] - api.upload_folder(folder_path=save_dir, repo_id=repo, repo_type="model") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model-name", - default="vith16plus_lvd1689m", - type=str, - choices=[ - "vits16_lvd1689m", - "vits16plus_lvd1689m", - "vitb16_lvd1689m", - "vitl16_lvd1689m", - "vitl16_sat493m", - "vith16plus_lvd1689m", - "vit7b16_lvd1689m", - "vit7b16_sat493m", - ], - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--save-dir", - default="converted_models", - type=str, - help="Directory to save the converted model.", - ) - parser.add_argument( - "--push-to-hub", - action="store_true", - help="Push the converted model to the Hugging Face Hub.", - ) - args = parser.parse_args() - convert_and_test_dinov3_checkpoint(args) diff --git a/src/transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py b/src/transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py index cdb68044bfc4..7c080485ed00 100644 --- a/src/transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +++ b/src/transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py @@ -17,6 +17,7 @@ from typing import Optional, Union import torch +from torchvision.transforms.v2 import functional as F from transformers.image_processing_base import BatchFeature from transformers.image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images @@ -24,17 +25,11 @@ from transformers.utils import ( TensorType, auto_docstring, - is_torchvision_v2_available, logging, ) from transformers.utils.import_utils import requires -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - logger = logging.get_logger(__name__) diff --git a/src/transformers/models/dit/convert_dit_unilm_to_pytorch.py b/src/transformers/models/dit/convert_dit_unilm_to_pytorch.py deleted file mode 100644 index a945a6b50a04..000000000000 --- a/src/transformers/models/dit/convert_dit_unilm_to_pytorch.py +++ /dev/null @@ -1,230 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DiT checkpoints from the unilm repository.""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import BeitConfig, BeitForImageClassification, BeitForMaskedImageModeling, BeitImageProcessor -from transformers.image_utils import PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config, has_lm_head=False, is_semantic=False): - prefix = "backbone." 
if is_semantic else "" - - rename_keys = [] - for i in range(config.num_hidden_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append((f"{prefix}blocks.{i}.norm1.weight", f"beit.encoder.layer.{i}.layernorm_before.weight")) - rename_keys.append((f"{prefix}blocks.{i}.norm1.bias", f"beit.encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append( - (f"{prefix}blocks.{i}.attn.proj.weight", f"beit.encoder.layer.{i}.attention.output.dense.weight") - ) - rename_keys.append( - (f"{prefix}blocks.{i}.attn.proj.bias", f"beit.encoder.layer.{i}.attention.output.dense.bias") - ) - rename_keys.append((f"{prefix}blocks.{i}.norm2.weight", f"beit.encoder.layer.{i}.layernorm_after.weight")) - rename_keys.append((f"{prefix}blocks.{i}.norm2.bias", f"beit.encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc1.weight", f"beit.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc1.bias", f"beit.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.weight", f"beit.encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.bias", f"beit.encoder.layer.{i}.output.dense.bias")) - - # projection layer + position embeddings - rename_keys.extend( - [ - (f"{prefix}cls_token", "beit.embeddings.cls_token"), - (f"{prefix}patch_embed.proj.weight", "beit.embeddings.patch_embeddings.projection.weight"), - (f"{prefix}patch_embed.proj.bias", "beit.embeddings.patch_embeddings.projection.bias"), - (f"{prefix}pos_embed", "beit.embeddings.position_embeddings"), - ] - ) - - if has_lm_head: - # mask token + layernorm - rename_keys.extend( - [ - ("mask_token", "beit.embeddings.mask_token"), - ("norm.weight", "layernorm.weight"), - ("norm.bias", "layernorm.bias"), - ] - ) - else: - # layernorm + classification head - rename_keys.extend( - [ - ("fc_norm.weight", "beit.pooler.layernorm.weight"), - ("fc_norm.bias", "beit.pooler.layernorm.bias"), - ("head.weight", "classifier.weight"), - ("head.bias", "classifier.bias"), - ] - ) - - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config, has_lm_head=False, is_semantic=False): - for i in range(config.num_hidden_layers): - prefix = "backbone." 
if is_semantic else "" - # queries, keys and values - in_proj_weight = state_dict.pop(f"{prefix}blocks.{i}.attn.qkv.weight") - q_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.v_bias") - - state_dict[f"beit.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ - : config.hidden_size, : - ] - state_dict[f"beit.encoder.layer.{i}.attention.attention.query.bias"] = q_bias - state_dict[f"beit.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"beit.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"beit.encoder.layer.{i}.attention.attention.value.bias"] = v_bias - - # gamma_1 and gamma_2 - # we call them lambda because otherwise they are renamed when using .from_pretrained - gamma_1 = state_dict.pop(f"{prefix}blocks.{i}.gamma_1") - gamma_2 = state_dict.pop(f"{prefix}blocks.{i}.gamma_2") - - state_dict[f"beit.encoder.layer.{i}.lambda_1"] = gamma_1 - state_dict[f"beit.encoder.layer.{i}.lambda_2"] = gamma_2 - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_dit_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our BEiT structure. - """ - - # define default BEiT configuration - has_lm_head = "rvlcdip" not in checkpoint_url - config = BeitConfig(use_absolute_position_embeddings=True, use_mask_token=has_lm_head) - - # size of the architecture - if "large" in checkpoint_url or "dit-l" in checkpoint_url: - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - - # labels - if "rvlcdip" in checkpoint_url: - config.num_labels = 16 - repo_id = "huggingface/label-files" - filename = "rvlcdip-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - # load state_dict of original model, remove and rename some keys - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"] - - rename_keys = create_rename_keys(config, has_lm_head=has_lm_head) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config, has_lm_head=has_lm_head) - - # load HuggingFace model - model = BeitForMaskedImageModeling(config) if has_lm_head else BeitForImageClassification(config) - model.eval() - model.load_state_dict(state_dict) - - # Check outputs on an image - image_processor = BeitImageProcessor( - size=config.image_size, resample=PILImageResampling.BILINEAR, do_center_crop=False - ) - image = prepare_img() - - encoding = image_processor(images=image, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - outputs = model(pixel_values) - logits = outputs.logits - - # verify logits - expected_shape = [1, 16] if "rvlcdip" in checkpoint_url else [1, 196, 8192] - assert logits.shape == torch.Size(expected_shape), "Shape of logits not as expected" - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model to 
{pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - if has_lm_head: - model_name = "dit-base" if "base" in checkpoint_url else "dit-large" - else: - model_name = "dit-base-finetuned-rvlcdip" if "dit-b" in checkpoint_url else "dit-large-finetuned-rvlcdip" - image_processor.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add image processor", - use_temp_dir=True, - ) - model.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add model", - use_temp_dir=True, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_url", - default="https://layoutlm.blob.core.windows.net/dit/dit-pts/dit-base-224-p16-500k-62d53a.pth", - type=str, - help="URL to the original PyTorch checkpoint (.pth file).", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - ) - args = parser.parse_args() - convert_dit_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/doge/convert_doge_weights_to_hf.py b/src/transformers/models/doge/convert_doge_weights_to_hf.py deleted file mode 100644 index cde4350a15c4..000000000000 --- a/src/transformers/models/doge/convert_doge_weights_to_hf.py +++ /dev/null @@ -1,126 +0,0 @@ -import argparse -import json -import os -import re - -import torch -from safetensors.torch import load_file - -from transformers import DogeConfig, DogeForCausalLM - - -# fmt: off -# `None` means we drop the key -STATE_DICT_MAPPING = { - # CausalLM keys - r"^lm_head.weight": r"lm_head.weight", - - # Model keys - r"^model.word_embed.weight": r"model.embed_tokens.weight", - r"^model.rotary_emb.rotary_emb": r"model.rotary_emb.rotary_emb", - r"^model.final_layernorm.weight": r"model.norm.weight", - - # Layers keys - r"^model.layers.(\d+).pre_layernorm.weight": r"model.layers.\1.input_layernorm.weight", - r"^model.layers.(\d+).pre_residual.weight": r"model.layers.\1.input_residual", - r"^model.layers.(\d+).post_layernorm.weight": r"model.layers.\1.post_attention_layernorm.weight", - r"^model.layers.(\d+).post_residual.weight": r"model.layers.\1.post_attention_residual", - - # Attention keys - r"^model.layers.(\d+).self_attn.q_proj.weight": r"model.layers.\1.self_attn.q_proj.weight", - r"^model.layers.(\d+).self_attn.k_proj.weight": r"model.layers.\1.self_attn.k_proj.weight", - r"^model.layers.(\d+).self_attn.v_proj.weight": r"model.layers.\1.self_attn.v_proj.weight", - r"^model.layers.(\d+).self_attn.A": r"model.layers.\1.self_attn.A", - r"^model.layers.(\d+).self_attn.dt_proj.weight": r"model.layers.\1.self_attn.dt_proj.weight", - r"^model.layers.(\d+).self_attn.o_proj.weight": r"model.layers.\1.self_attn.o_proj.weight", - - # Feedforward keys - r"^model.layers.(\d+).feed_forward.gate_proj.weight": r"model.layers.\1.mlp.gate_proj.weight", - r"^model.layers.(\d+).feed_forward.up_proj.weight": r"model.layers.\1.mlp.up_proj.weight", - r"^model.layers.(\d+).feed_forward.down_proj.weight": r"model.layers.\1.mlp.down_proj.weight", - r"^model.layers.(\d+).feed_forward.router_gate.weight": r"model.layers.\1.mlp.router_gate.weight", - 
r"^model.layers.(\d+).feed_forward.router_gate.bias": None, - r"^model.layers.(\d+).feed_forward.down_embed.weight": r"model.layers.\1.mlp.down_embed.weight", - r"^model.layers.(\d+).feed_forward.up_embed.weight": r"model.layers.\1.mlp.up_embed.weight", -} -# fmt: on - - -def load_weights(input_dir: str): - safetensor_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".safetensors")] - - all_weights = {} - - if safetensor_files: - if len(safetensor_files) == 1: - tensors = load_file(safetensor_files[0]) - all_weights.update(tensors) - return all_weights - safetensor_files = sorted(safetensor_files, key=lambda x: int(x.rsplit("-", 3)[1])) - for file in safetensor_files: - tensors = load_file(file) - all_weights.update(tensors) - return all_weights - - else: - raise ValueError("No .safetensors or .bin files found in the specified directory.") - - -def map_old_key_to_new(old_key): - for pattern, replacement in STATE_DICT_MAPPING.items(): - if replacement is None: - if re.fullmatch(pattern, old_key): - return None - else: - new_key, n_replace = re.subn(pattern, replacement, old_key) - # Early exit of the loop - if n_replace > 0: - return new_key - - raise ValueError(f"Key: {old_key} could not be mapped (check the mapping).") - - -def convert_state_dict(original_state_dict: dict, config: DogeConfig): - new_dict = {} - - for old_key, value in original_state_dict.items(): - new_key = map_old_key_to_new(old_key) - if new_key is None: - continue - new_dict[new_key] = value - return new_dict - - -def convert_doge_model(input_dir, output_dir): - # Load and convert config - with open(os.path.join(input_dir, "config.json")) as f: - config = json.load(f) - config = DogeConfig(**config) - config.save_pretrained(output_dir) - - # Load and convert weights - original_state_dict = load_weights(input_dir) - new_dict = convert_state_dict(original_state_dict, config) - with torch.device("meta"): - model = DogeForCausalLM(config) - if config.tie_word_embeddings: - new_dict["lm_head.weight"] = new_dict["model.embed_tokens.weight"] - model.load_state_dict(new_dict, strict=True, assign=True) - model.save_pretrained(output_dir) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "input_dir", - type=str, - help="Location of the local folder copied from the Hub.", - ) - parser.add_argument( - "output_dir", - type=str, - help="Location to write HF model.", - ) - - args = parser.parse_args() - convert_doge_model(args.input_dir, args.output_dir) diff --git a/src/transformers/models/donut/convert_donut_to_pytorch.py b/src/transformers/models/donut/convert_donut_to_pytorch.py deleted file mode 100644 index d58cdd622479..000000000000 --- a/src/transformers/models/donut/convert_donut_to_pytorch.py +++ /dev/null @@ -1,234 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Donut checkpoints using the original `donut-python` library. 
URL: https://github.com/clovaai/donut""" - -import argparse - -import torch -from datasets import load_dataset -from donut import DonutModel - -from transformers import ( - DonutImageProcessor, - DonutProcessor, - DonutSwinConfig, - DonutSwinModel, - MBartConfig, - MBartForCausalLM, - VisionEncoderDecoderModel, - XLMRobertaTokenizerFast, -) - - -def get_configs(model): - original_config = model.config - - encoder_config = DonutSwinConfig( - image_size=original_config.input_size, - patch_size=4, - depths=original_config.encoder_layer, - num_heads=[4, 8, 16, 32], - window_size=original_config.window_size, - embed_dim=128, - ) - decoder_config = MBartConfig( - is_decoder=True, - is_encoder_decoder=False, - add_cross_attention=True, - decoder_layers=original_config.decoder_layer, - max_position_embeddings=original_config.max_position_embeddings, - vocab_size=len( - model.decoder.tokenizer - ), # several special tokens are added to the vocab of XLMRobertaTokenizer, see repo on the hub (added_tokens.json) - scale_embedding=True, - add_final_layer_norm=True, - ) - - return encoder_config, decoder_config - - -def rename_key(name): - if "encoder.model" in name: - name = name.replace("encoder.model", "encoder") - if "decoder.model" in name: - name = name.replace("decoder.model", "decoder") - if "patch_embed.proj" in name: - name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection") - if "patch_embed.norm" in name: - name = name.replace("patch_embed.norm", "embeddings.norm") - if name.startswith("encoder"): - if "layers" in name: - name = "encoder." + name - if "attn.proj" in name: - name = name.replace("attn.proj", "attention.output.dense") - if "attn" in name and "mask" not in name: - name = name.replace("attn", "attention.self") - if "norm1" in name: - name = name.replace("norm1", "layernorm_before") - if "norm2" in name: - name = name.replace("norm2", "layernorm_after") - if "mlp.fc1" in name: - name = name.replace("mlp.fc1", "intermediate.dense") - if "mlp.fc2" in name: - name = name.replace("mlp.fc2", "output.dense") - - if name == "encoder.norm.weight": - name = "encoder.layernorm.weight" - if name == "encoder.norm.bias": - name = "encoder.layernorm.bias" - - return name - - -def convert_state_dict(orig_state_dict, model): - for key in orig_state_dict.copy(): - val = orig_state_dict.pop(key) - - if "qkv" in key: - key_split = key.split(".") - layer_num = int(key_split[3]) - block_num = int(key_split[5]) - dim = model.encoder.encoder.layers[layer_num].blocks[block_num].attention.self.all_head_size - - if "weight" in key: - orig_state_dict[ - f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight" - ] = val[:dim, :] - orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight"] = ( - val[dim : dim * 2, :] - ) - orig_state_dict[ - f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight" - ] = val[-dim:, :] - else: - orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias"] = ( - val[:dim] - ) - orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.bias"] = ( - val[dim : dim * 2] - ) - orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias"] = ( - val[-dim:] - ) - elif "attn_mask" in key or key in ["encoder.model.norm.weight", "encoder.model.norm.bias"]: - # HuggingFace implementation doesn't use attn_mask buffer - # and model doesn't use final LayerNorms for 
the encoder - pass - else: - orig_state_dict[rename_key(key)] = val - - return orig_state_dict - - -def convert_donut_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): - # load original model - original_model = DonutModel.from_pretrained(model_name).eval() - - # load HuggingFace model - encoder_config, decoder_config = get_configs(original_model) - encoder = DonutSwinModel(encoder_config) - decoder = MBartForCausalLM(decoder_config) - model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder) - model.eval() - - state_dict = original_model.state_dict() - new_state_dict = convert_state_dict(state_dict, model) - model.load_state_dict(new_state_dict) - - # verify results on scanned document - dataset = load_dataset("hf-internal-testing/example-documents") # no-script - image = dataset["test"][0]["image"].convert("RGB") - - tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name, from_slow=True) - image_processor = DonutImageProcessor( - do_align_long_axis=original_model.config.align_long_axis, size=original_model.config.input_size[::-1] - ) - processor = DonutProcessor(image_processor, tokenizer) - pixel_values = processor(image, return_tensors="pt").pixel_values - - if model_name == "naver-clova-ix/donut-base-finetuned-docvqa": - task_prompt = "{user_input}" - question = "When is the coffee break?" - task_prompt = task_prompt.replace("{user_input}", question) - elif model_name == "naver-clova-ix/donut-base-finetuned-rvlcdip": - task_prompt = "" - elif model_name in [ - "naver-clova-ix/donut-base-finetuned-cord-v1", - "naver-clova-ix/donut-base-finetuned-cord-v1-2560", - ]: - task_prompt = "" - elif model_name == "naver-clova-ix/donut-base-finetuned-cord-v2": - task_prompt = "s_cord-v2>" - elif model_name == "naver-clova-ix/donut-base-finetuned-zhtrainticket": - task_prompt = "" - elif model_name in ["naver-clova-ix/donut-proto", "naver-clova-ix/donut-base"]: - # use a random prompt - task_prompt = "hello world" - else: - raise ValueError("Model name not supported") - prompt_tensors = original_model.decoder.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt")[ - "input_ids" - ] - - original_patch_embed = original_model.encoder.model.patch_embed(pixel_values) - patch_embeddings, _ = model.encoder.embeddings(pixel_values) - assert torch.allclose(original_patch_embed, patch_embeddings, atol=1e-3) - - # verify encoder hidden states - original_last_hidden_state = original_model.encoder(pixel_values) - last_hidden_state = model.encoder(pixel_values).last_hidden_state - assert torch.allclose(original_last_hidden_state, last_hidden_state, atol=1e-2) - - # verify decoder hidden states - original_logits = original_model(pixel_values, prompt_tensors, None).logits - logits = model(pixel_values, decoder_input_ids=prompt_tensors).logits - assert torch.allclose(original_logits, logits, atol=1e-3) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model.push_to_hub("nielsr/" + model_name.split("/")[-1], commit_message="Update model") - processor.push_to_hub("nielsr/" + model_name.split("/")[-1], commit_message="Update model") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="naver-clova-ix/donut-base-finetuned-docvqa", - required=False, - type=str, - 
help="Name of the original model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - required=False, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether or not to push the converted model and processor to the 🤗 hub.", - ) - - args = parser.parse_args() - convert_donut_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/donut/image_processing_donut_fast.py b/src/transformers/models/donut/image_processing_donut_fast.py index 7c808ab60cd4..29e06831b1b4 100644 --- a/src/transformers/models/donut/image_processing_donut_fast.py +++ b/src/transformers/models/donut/image_processing_donut_fast.py @@ -17,6 +17,7 @@ from typing import Optional, Union import torch +from torchvision.transforms.v2 import functional as F from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs from ...image_transforms import group_images_by_shape, reorder_images @@ -25,16 +26,10 @@ from ...utils import ( TensorType, auto_docstring, - is_torchvision_v2_available, logging, ) -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - logger = logging.get_logger(__name__) diff --git a/src/transformers/models/dpr/convert_dpr_original_checkpoint_to_pytorch.py b/src/transformers/models/dpr/convert_dpr_original_checkpoint_to_pytorch.py deleted file mode 100644 index 5151c0972a7e..000000000000 --- a/src/transformers/models/dpr/convert_dpr_original_checkpoint_to_pytorch.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import collections -from pathlib import Path - -import torch -from torch.serialization import default_restore_location - -from transformers import BertConfig, DPRConfig, DPRContextEncoder, DPRQuestionEncoder, DPRReader - - -CheckpointState = collections.namedtuple( - "CheckpointState", ["model_dict", "optimizer_dict", "scheduler_dict", "offset", "epoch", "encoder_params"] -) - - -def load_states_from_checkpoint(model_file: str) -> CheckpointState: - print(f"Reading saved model from {model_file}") - state_dict = torch.load( - model_file, map_location=lambda s, l: default_restore_location(s, "cpu"), weights_only=True - ) - return CheckpointState(**state_dict) - - -class DPRState: - def __init__(self, src_file: Path): - self.src_file = src_file - - def load_dpr_model(self): - raise NotImplementedError - - @staticmethod - def from_type(comp_type: str, *args, **kwargs) -> "DPRState": - if comp_type.startswith("c"): - return DPRContextEncoderState(*args, **kwargs) - if comp_type.startswith("q"): - return DPRQuestionEncoderState(*args, **kwargs) - if comp_type.startswith("r"): - return DPRReaderState(*args, **kwargs) - else: - raise ValueError("Component type must be either 'ctx_encoder', 'question_encoder' or 'reader'.") - - -class DPRContextEncoderState(DPRState): - def load_dpr_model(self): - model = DPRContextEncoder(DPRConfig(**BertConfig.get_config_dict("google-bert/bert-base-uncased")[0])) - print(f"Loading DPR biencoder from {self.src_file}") - saved_state = load_states_from_checkpoint(self.src_file) - encoder, prefix = model.ctx_encoder, "ctx_model." - # Fix changes from https://github.com/huggingface/transformers/commit/614fef1691edb806de976756d4948ecbcd0c0ca3 - state_dict = {"bert_model.embeddings.position_ids": model.ctx_encoder.bert_model.embeddings.position_ids} - for key, value in saved_state.model_dict.items(): - if key.startswith(prefix): - key = key[len(prefix) :] - if not key.startswith("encode_proj."): - key = "bert_model." + key - state_dict[key] = value - encoder.load_state_dict(state_dict) - return model - - -class DPRQuestionEncoderState(DPRState): - def load_dpr_model(self): - model = DPRQuestionEncoder(DPRConfig(**BertConfig.get_config_dict("google-bert/bert-base-uncased")[0])) - print(f"Loading DPR biencoder from {self.src_file}") - saved_state = load_states_from_checkpoint(self.src_file) - encoder, prefix = model.question_encoder, "question_model." - # Fix changes from https://github.com/huggingface/transformers/commit/614fef1691edb806de976756d4948ecbcd0c0ca3 - state_dict = {"bert_model.embeddings.position_ids": model.question_encoder.bert_model.embeddings.position_ids} - for key, value in saved_state.model_dict.items(): - if key.startswith(prefix): - key = key[len(prefix) :] - if not key.startswith("encode_proj."): - key = "bert_model." 
+ key - state_dict[key] = value - encoder.load_state_dict(state_dict) - return model - - -class DPRReaderState(DPRState): - def load_dpr_model(self): - model = DPRReader(DPRConfig(**BertConfig.get_config_dict("google-bert/bert-base-uncased")[0])) - print(f"Loading DPR reader from {self.src_file}") - saved_state = load_states_from_checkpoint(self.src_file) - # Fix changes from https://github.com/huggingface/transformers/commit/614fef1691edb806de976756d4948ecbcd0c0ca3 - state_dict = { - "encoder.bert_model.embeddings.position_ids": model.span_predictor.encoder.bert_model.embeddings.position_ids - } - for key, value in saved_state.model_dict.items(): - if key.startswith("encoder.") and not key.startswith("encoder.encode_proj"): - key = "encoder.bert_model." + key[len("encoder.") :] - state_dict[key] = value - model.span_predictor.load_state_dict(state_dict) - return model - - -def convert(comp_type: str, src_file: Path, dest_dir: Path): - dest_dir = Path(dest_dir) - dest_dir.mkdir(exist_ok=True) - - dpr_state = DPRState.from_type(comp_type, src_file=src_file) - model = dpr_state.load_dpr_model() - model.save_pretrained(dest_dir) - model.from_pretrained(dest_dir) # sanity check - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--type", type=str, help="Type of the component to convert: 'ctx_encoder', 'question_encoder' or 'reader'." - ) - parser.add_argument( - "--src", - type=str, - help=( - "Path to the dpr checkpoint file. They can be downloaded from the official DPR repo" - " https://github.com/facebookresearch/DPR. Note that in the official repo, both encoders are stored in the" - " 'retriever' checkpoints." - ), - ) - parser.add_argument("--dest", type=str, default=None, help="Path to the output PyTorch model directory.") - args = parser.parse_args() - - src_file = Path(args.src) - dest_dir = f"converted-{src_file.name}" if args.dest is None else args.dest - dest_dir = Path(dest_dir) - assert src_file.exists() - assert args.type is not None, ( - "Please specify the component type of the DPR model to convert: 'ctx_encoder', 'question_encoder' or 'reader'." - ) - convert(args.type, src_file, dest_dir) diff --git a/src/transformers/models/dpt/configuration_dpt.py b/src/transformers/models/dpt/configuration_dpt.py index 70e46f232022..311425fcda1c 100644 --- a/src/transformers/models/dpt/configuration_dpt.py +++ b/src/transformers/models/dpt/configuration_dpt.py @@ -202,9 +202,7 @@ def __init__( if isinstance(backbone_config, dict): logger.info("Initializing the config with a `BiT` backbone.") backbone_config = BitConfig(**backbone_config) - elif isinstance(backbone_config, PretrainedConfig): - backbone_config = backbone_config - else: + elif not isinstance(backbone_config, PretrainedConfig): raise ValueError( f"backbone_config must be a dictionary or a `PretrainedConfig`, got {backbone_config.__class__}." ) diff --git a/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py b/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py deleted file mode 100644 index 21aa2b4897eb..000000000000 --- a/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py +++ /dev/null @@ -1,383 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DINOv2 + DPT checkpoints from the original repository. URL: -https://github.com/facebookresearch/dinov2/tree/main""" - -import argparse -import itertools -import math -from pathlib import Path - -import requests -import torch -from PIL import Image -from torchvision import transforms - -from transformers import Dinov2Config, DPTConfig, DPTForDepthEstimation, DPTImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dpt_config(model_name): - if "small" in model_name: - # equivalent to stage 3, stage 6, stage 9, stage 12 - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-small", out_indices=[3, 6, 9, 12], apply_layernorm=False, reshape_hidden_states=False - ) - neck_hidden_sizes = [48, 96, 192, 384] - elif "base" in model_name: - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-base", out_indices=[3, 6, 9, 12], apply_layernorm=False, reshape_hidden_states=False - ) - neck_hidden_sizes = [96, 192, 384, 768] - elif "large" in model_name: - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-large", out_indices=[5, 12, 18, 24], apply_layernorm=False, reshape_hidden_states=False - ) - neck_hidden_sizes = [128, 256, 512, 1024] - elif "giant" in model_name: - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-giant", out_indices=[10, 20, 30, 40], apply_layernorm=False, reshape_hidden_states=False - ) - neck_hidden_sizes = [192, 384, 768, 1536] - else: - raise NotImplementedError("To do") - - config = DPTConfig( - backbone_config=backbone_config, - neck_hidden_sizes=neck_hidden_sizes, - use_bias_in_fusion_residual=False, - add_projection=True, - ) - - return config - - -# here we list all DPT keys to be renamed (original name on the left, our name on the right) -def create_rename_keys_dpt(config): - rename_keys = [] - - # fmt: off - # activation postprocessing (projections, readout projections + resize blocks) - for i in range(4): - rename_keys.append((f"decode_head.reassemble_blocks.projects.{i}.conv.weight", f"neck.reassemble_stage.layers.{i}.projection.weight")) - rename_keys.append((f"decode_head.reassemble_blocks.projects.{i}.conv.bias", f"neck.reassemble_stage.layers.{i}.projection.bias")) - - rename_keys.append((f"decode_head.reassemble_blocks.readout_projects.{i}.0.weight", f"neck.reassemble_stage.readout_projects.{i}.0.weight")) - rename_keys.append((f"decode_head.reassemble_blocks.readout_projects.{i}.0.bias", f"neck.reassemble_stage.readout_projects.{i}.0.bias")) - - if i != 2: - rename_keys.append((f"decode_head.reassemble_blocks.resize_layers.{i}.weight", f"neck.reassemble_stage.layers.{i}.resize.weight")) - rename_keys.append((f"decode_head.reassemble_blocks.resize_layers.{i}.bias", f"neck.reassemble_stage.layers.{i}.resize.bias")) - - # fusion layers - for i in range(4): - rename_keys.append((f"decode_head.fusion_blocks.{i}.project.conv.weight", f"neck.fusion_stage.layers.{i}.projection.weight")) - rename_keys.append((f"decode_head.fusion_blocks.{i}.project.conv.bias", 
f"neck.fusion_stage.layers.{i}.projection.bias")) - if i != 0: - rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit1.conv1.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer1.convolution1.weight")) - rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit1.conv2.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer1.convolution2.weight")) - rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit2.conv1.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer2.convolution1.weight")) - rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit2.conv2.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer2.convolution2.weight")) - - # neck convolutions - for i in range(4): - rename_keys.append((f"decode_head.convs.{i}.conv.weight", f"neck.convs.{i}.weight")) - - # head - rename_keys.append(("decode_head.project.conv.weight", "head.projection.weight")) - rename_keys.append(("decode_head.project.conv.bias", "head.projection.bias")) - - for i in range(0, 5, 2): - rename_keys.append((f"decode_head.conv_depth.head.{i}.weight", f"head.head.{i}.weight")) - rename_keys.append((f"decode_head.conv_depth.head.{i}.bias", f"head.head.{i}.bias")) - # fmt: on - - return rename_keys - - -# here we list all backbone keys to be renamed (original name on the left, our name on the right) -def create_rename_keys_backbone(config): - rename_keys = [] - - # fmt: off - # patch embedding layer - rename_keys.append(("cls_token", "backbone.embeddings.cls_token")) - rename_keys.append(("mask_token", "backbone.embeddings.mask_token")) - rename_keys.append(("pos_embed", "backbone.embeddings.position_embeddings")) - rename_keys.append(("patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias")) - - # Transformer encoder - for i in range(config.backbone_config.num_hidden_layers): - # layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"backbone.encoder.layer.{i}.norm1.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"backbone.encoder.layer.{i}.norm1.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"backbone.encoder.layer.{i}.norm2.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"backbone.encoder.layer.{i}.norm2.bias")) - # MLP - if config.backbone_config.use_swiglu_ffn: - rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"backbone.encoder.layer.{i}.mlp.w12.weight")) - rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"backbone.encoder.layer.{i}.mlp.w12.bias")) - rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"backbone.encoder.layer.{i}.mlp.w3.weight")) - rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"backbone.encoder.layer.{i}.mlp.w3.bias")) - else: - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"backbone.encoder.layer.{i}.mlp.fc1.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"backbone.encoder.layer.{i}.mlp.fc1.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"backbone.encoder.layer.{i}.mlp.fc2.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"backbone.encoder.layer.{i}.mlp.fc2.bias")) - # layerscale - rename_keys.append((f"blocks.{i}.ls1.gamma", f"backbone.encoder.layer.{i}.layer_scale1.lambda1")) - rename_keys.append((f"blocks.{i}.ls2.gamma", f"backbone.encoder.layer.{i}.layer_scale2.lambda1")) - # attention projection layer - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"backbone.encoder.layer.{i}.attention.output.dense.weight")) - 
rename_keys.append((f"blocks.{i}.attn.proj.bias", f"backbone.encoder.layer.{i}.attention.output.dense.bias")) - # fmt: on - - rename_keys.append(("norm.weight", "backbone.layernorm.weight")) - rename_keys.append(("norm.bias", "backbone.layernorm.bias")) - - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - for i in range(config.backbone_config.num_hidden_layers): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") - hidden_size = config.backbone_config.hidden_size - # next, add query, keys and values (in that order) to the state dict - state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[:hidden_size] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - hidden_size : hidden_size * 2 - ] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-hidden_size:] - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "https://dl.fbaipublicfiles.com/dinov2/images/example.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -name_to_url = { - "dpt-dinov2-small-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_nyu_dpt_head.pth", - "dpt-dinov2-small-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_kitti_dpt_head.pth", - "dpt-dinov2-base-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_nyu_dpt_head.pth", - "dpt-dinov2-base-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_kitti_dpt_head.pth", - "dpt-dinov2-large-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_nyu_dpt_head.pth", - "dpt-dinov2-large-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_kitti_dpt_head.pth", - "dpt-dinov2-giant-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_nyu_dpt_head.pth", - "dpt-dinov2-giant-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_kitti_dpt_head.pth", -} - - -def get_original_pixel_values(image): - class CenterPadding: - def __init__(self, multiple): - super().__init__() - self.multiple = multiple - - def _get_pad(self, size): - new_size = math.ceil(size / self.multiple) * self.multiple - pad_size = new_size - size - pad_size_left = pad_size // 2 - pad_size_right = pad_size - pad_size_left - return pad_size_left, pad_size_right - - def __call__(self, img): - pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in img.shape[-2:][::-1])) - output = torch.nn.functional.pad(img, pads) - return output - - def __repr__(self): - return self.__class__.__name__ + "()" - - def make_depth_transform() -> transforms.Compose: - return transforms.Compose( - [ - transforms.ToTensor(), - lambda x: 255.0 * x[:3], # Discard alpha component and scale by 255 - transforms.Normalize( - 
mean=(123.675, 116.28, 103.53), - std=(58.395, 57.12, 57.375), - ), - CenterPadding(multiple=14), - ] - ) - - transform = make_depth_transform() - original_pixel_values = transform(image).unsqueeze(0) - - return original_pixel_values - - -@torch.no_grad() -def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, verify_logits): - """ - Copy/paste/tweak model's weights to our DPT structure. - """ - - # define DPT configuration based on URL - checkpoint_url = name_to_url[model_name] - config = get_dpt_config(model_name) - - # load original DPT state_dict from URL - print("URL:", checkpoint_url) - dpt_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["state_dict"] - # rename keys - rename_keys = create_rename_keys_dpt(config) - for src, dest in rename_keys: - rename_key(dpt_state_dict, src, dest) - - # load original backbone state_dict from URL - if "small" in model_name: - original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vits14") - elif "base" in model_name: - original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitb14") - elif "large" in model_name: - original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitl14") - elif "giant" in model_name: - original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitg14") - else: - raise NotImplementedError("To do") - original_model.eval() - backbone_state_dict = original_model.state_dict() - - # rename keys - rename_keys = create_rename_keys_backbone(config) - for src, dest in rename_keys: - rename_key(backbone_state_dict, src, dest) - - # read in qkv matrices - read_in_q_k_v(backbone_state_dict, config) - - for key, val in backbone_state_dict.copy().items(): - val = backbone_state_dict.pop(key) - if "w12" in key: - key = key.replace("w12", "weights_in") - if "w3" in key: - key = key.replace("w3", "weights_out") - backbone_state_dict[key] = val - - # merge state_dicts - state_dict = {**backbone_state_dict, **dpt_state_dict} - - # load HuggingFace model - model = DPTForDepthEstimation(config) - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) - assert missing_keys == [ - "neck.fusion_stage.layers.0.residual_layer1.convolution1.weight", - "neck.fusion_stage.layers.0.residual_layer1.convolution2.weight", - ] - model.eval() - - # Verify image processor - processor = DPTImageProcessor( - do_resize=False, - do_rescale=False, - do_pad=True, - size_divisor=14, - do_normalize=True, - image_mean=(123.675, 116.28, 103.53), - image_std=(58.395, 57.12, 57.375), - ) - - image = prepare_img() - pixel_values = processor(image, return_tensors="pt").pixel_values.float() - original_pixel_values = get_original_pixel_values(image) - - assert torch.allclose(pixel_values, original_pixel_values) - - # Verify forward pass - with torch.no_grad(): - outputs = model(pixel_values) - - predicted_depth = outputs.predicted_depth - - print("Shape of predicted depth:", predicted_depth.shape) - print("First values of predicted depth:", predicted_depth[0, :3, :3]) - - # assert logits - if verify_logits: - if model_name == "dpt-dinov2-small-nyu": - expected_shape = torch.Size([1, 576, 736]) - expected_slice = torch.tensor( - [[3.3576, 3.4741, 3.4345], [3.4324, 3.5012, 3.2775], [3.2560, 3.3563, 3.2354]] - ) - - assert predicted_depth.shape == torch.Size(expected_shape) - assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-5) - print("Looks ok!") - 
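# --- Editorial aside, not part of the diff above: a tiny, self-contained sketch of the
# --- verification pattern these conversion scripts rely on, i.e. comparing a small slice of
# --- the converted model's output against hard-coded reference values with an absolute
# --- tolerance. The tensors below are made up for illustration only.
import torch

predicted_slice = torch.tensor([[3.3576, 3.4741], [3.4324, 3.5012]])  # pretend converted-model output
expected_slice = torch.tensor([[3.3576, 3.4741], [3.4324, 3.5012]])   # pretend original-model reference
assert predicted_slice.shape == expected_slice.shape
assert torch.allclose(predicted_slice, expected_slice, atol=1e-5)     # elementwise match within tolerance
print("Looks ok!")
# --- End of aside.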
- if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model and processor to hub...") - model.push_to_hub(repo_id=f"facebook/{model_name}") - processor.push_to_hub(repo_id=f"facebook/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="dpt-dinov2-small-nyu", - type=str, - choices=name_to_url.keys(), - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model to the hub after conversion.", - ) - parser.add_argument( - "--verify_logits", - action="store_true", - required=False, - help="Path to the output PyTorch model directory.", - ) - - args = parser.parse_args() - convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.verify_logits) diff --git a/src/transformers/models/dpt/convert_dpt_beit_to_hf.py b/src/transformers/models/dpt/convert_dpt_beit_to_hf.py deleted file mode 100644 index c4ff8a3eb7bf..000000000000 --- a/src/transformers/models/dpt/convert_dpt_beit_to_hf.py +++ /dev/null @@ -1,305 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DPT 3.1 checkpoints from the MiDaS repository. 
URL: https://github.com/isl-org/MiDaS""" - -import argparse -from pathlib import Path - -import requests -import torch -from PIL import Image - -from transformers import BeitConfig, DPTConfig, DPTForDepthEstimation, DPTImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dpt_config(model_name): - hidden_size = 768 - num_hidden_layers = 12 - num_attention_heads = 12 - intermediate_size = 3072 - out_features = ["stage3", "stage6", "stage9", "stage12"] # beit-base-384 uses [2, 5, 8, 11] - - if "large" in model_name: - hidden_size = 1024 - num_hidden_layers = 24 - num_attention_heads = 16 - intermediate_size = 4096 - out_features = ["stage6", "stage12", "stage18", "stage24"] # beit-large-512 uses [5, 11, 17, 23] - - if "512" in model_name: - image_size = 512 - elif "384" in model_name: - image_size = 384 - else: - raise ValueError("Model not supported") - - backbone_config = BeitConfig( - image_size=image_size, - num_hidden_layers=num_hidden_layers, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_attention_heads=num_attention_heads, - use_relative_position_bias=True, - reshape_hidden_states=False, - out_features=out_features, - ) - - neck_hidden_sizes = [256, 512, 1024, 1024] if "large" in model_name else [96, 192, 384, 768] - config = DPTConfig(backbone_config=backbone_config, neck_hidden_sizes=neck_hidden_sizes) - - return config, image_size - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config): - rename_keys = [] - - # fmt: off - # stem - rename_keys.append(("pretrained.model.cls_token", "backbone.embeddings.cls_token")) - rename_keys.append(("pretrained.model.patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("pretrained.model.patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias")) - - # Transformer encoder - for i in range(config.backbone_config.num_hidden_layers): - rename_keys.append((f"pretrained.model.blocks.{i}.gamma_1", f"backbone.encoder.layer.{i}.lambda_1")) - rename_keys.append((f"pretrained.model.blocks.{i}.gamma_2", f"backbone.encoder.layer.{i}.lambda_2")) - rename_keys.append((f"pretrained.model.blocks.{i}.norm1.weight", f"backbone.encoder.layer.{i}.layernorm_before.weight")) - rename_keys.append((f"pretrained.model.blocks.{i}.norm1.bias", f"backbone.encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append((f"pretrained.model.blocks.{i}.norm2.weight", f"backbone.encoder.layer.{i}.layernorm_after.weight")) - rename_keys.append((f"pretrained.model.blocks.{i}.norm2.bias", f"backbone.encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append((f"pretrained.model.blocks.{i}.mlp.fc1.weight", f"backbone.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"pretrained.model.blocks.{i}.mlp.fc1.bias", f"backbone.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"pretrained.model.blocks.{i}.mlp.fc2.weight", f"backbone.encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"pretrained.model.blocks.{i}.mlp.fc2.bias", f"backbone.encoder.layer.{i}.output.dense.bias")) - rename_keys.append((f"pretrained.model.blocks.{i}.attn.proj.weight", f"backbone.encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"pretrained.model.blocks.{i}.attn.proj.bias", f"backbone.encoder.layer.{i}.attention.output.dense.bias")) - 
rename_keys.append((f"pretrained.model.blocks.{i}.attn.relative_position_bias_table", f"backbone.encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_bias_table")) - rename_keys.append((f"pretrained.model.blocks.{i}.attn.relative_position_index", f"backbone.encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_index")) - - # activation postprocessing (readout projections + resize blocks) - for i in range(4): - rename_keys.append((f"pretrained.act_postprocess{i+1}.0.project.0.weight", f"neck.reassemble_stage.readout_projects.{i}.0.weight")) - rename_keys.append((f"pretrained.act_postprocess{i+1}.0.project.0.bias", f"neck.reassemble_stage.readout_projects.{i}.0.bias")) - - rename_keys.append((f"pretrained.act_postprocess{i+1}.3.weight", f"neck.reassemble_stage.layers.{i}.projection.weight")) - rename_keys.append((f"pretrained.act_postprocess{i+1}.3.bias", f"neck.reassemble_stage.layers.{i}.projection.bias")) - - if i != 2: - rename_keys.append((f"pretrained.act_postprocess{i+1}.4.weight", f"neck.reassemble_stage.layers.{i}.resize.weight")) - rename_keys.append((f"pretrained.act_postprocess{i+1}.4.bias", f"neck.reassemble_stage.layers.{i}.resize.bias")) - - # refinenet (tricky here) - mapping = {1:3, 2:2, 3:1, 4:0} - - for i in range(1, 5): - j = mapping[i] - rename_keys.append((f"scratch.refinenet{i}.out_conv.weight", f"neck.fusion_stage.layers.{j}.projection.weight")) - rename_keys.append((f"scratch.refinenet{i}.out_conv.bias", f"neck.fusion_stage.layers.{j}.projection.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.bias")) - - # scratch convolutions - for i in range(4): - rename_keys.append((f"scratch.layer{i+1}_rn.weight", f"neck.convs.{i}.weight")) - - # head - for i in range(0, 5, 2): - rename_keys.append((f"scratch.output_conv.{i}.weight", f"head.head.{i}.weight")) - rename_keys.append((f"scratch.output_conv.{i}.bias", f"head.head.{i}.bias")) - - return rename_keys - - -def remove_ignore_keys_(state_dict): - ignore_keys = ["pretrained.model.head.weight", "pretrained.model.head.bias"] - for k in ignore_keys: - state_dict.pop(k, None) - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - hidden_size = config.backbone_config.hidden_size - for i in range(config.backbone_config.num_hidden_layers): - # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) - 
in_proj_weight = state_dict.pop(f"pretrained.model.blocks.{i}.attn.qkv.weight") - q_bias = state_dict.pop(f"pretrained.model.blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"pretrained.model.blocks.{i}.attn.v_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.bias"] = q_bias - state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.bias"] = v_bias - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub): - """ - Copy/paste/tweak model's weights to our DPT structure. - """ - - name_to_url = { - "dpt-beit-large-512": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt", - "dpt-beit-large-384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_384.pt", - "dpt-beit-base-384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_base_384.pt", - } - - # define DPT configuration based on URL - checkpoint_url = name_to_url[model_name] - config, image_size = get_dpt_config(model_name) - # load original state_dict from URL - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu") - # remove certain keys - remove_ignore_keys_(state_dict) - # rename keys - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - # read in qkv matrices - read_in_q_k_v(state_dict, config) - - # load HuggingFace model - model = DPTForDepthEstimation(config) - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) - assert missing_keys == [] - # assert unexpected_keys == ["pretrained.model.fc_norm.weight", "pretrained.model.fc_norm.bias"] - model.eval() - - # Check outputs on an image - # We set `keep_aspect_ratio=False` as our current BEiT does not support arbitrary window sizes - processor = DPTImageProcessor( - size={"height": image_size, "width": image_size}, keep_aspect_ratio=False, ensure_multiple_of=32 - ) - - image = prepare_img() - pixel_values = processor(image, return_tensors="pt").pixel_values - - print("First values of pixel values:", pixel_values[0, 0, :3, :3]) - print("Mean of pixel values:", pixel_values.mean().item()) - print("Shape of pixel values:", pixel_values.shape) - - import requests - from PIL import Image - from torchvision import transforms - - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - transforms = transforms.Compose( - [ - transforms.Resize((image_size, image_size)), - transforms.ToTensor(), - ] - ) - pixel_values = transforms(image).unsqueeze(0) - - # forward pass - with torch.no_grad(): - outputs = model(pixel_values) - - predicted_depth = outputs.predicted_depth - - print("Shape of 
predicted depth:", predicted_depth.shape) - print("First values of predicted depth:", predicted_depth[0, :3, :3]) - - # assert logits - # TODO there's still a small difference with the original logits - if model_name == "dpt-beit-large-512": - # OK, checked - expected_shape = torch.Size([1, 512, 512]) - expected_slice = torch.tensor( - [[2804.6260, 2792.5708, 2812.9263], [2772.0288, 2780.1118, 2796.2529], [2748.1094, 2766.6558, 2766.9834]] - ) - elif model_name == "dpt-beit-large-384": - # OK, checked - expected_shape = torch.Size([1, 384, 384]) - expected_slice = torch.tensor( - [[1783.2273, 1780.5729, 1792.6453], [1759.9817, 1765.5359, 1778.5002], [1739.1633, 1754.7903, 1757.1990]], - ) - elif model_name == "dpt-beit-base-384": - # OK, checked - expected_shape = torch.Size([1, 384, 384]) - expected_slice = torch.tensor( - [[2898.4482, 2891.3750, 2904.8079], [2858.6685, 2877.2615, 2894.4507], [2842.1235, 2854.1023, 2861.6328]], - ) - - assert predicted_depth.shape == torch.Size(expected_shape) - assert torch.allclose(predicted_depth[0, :3, :3], expected_slice) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model and processor to hub...") - model.push_to_hub(repo_id=f"nielsr/{model_name}") - processor.push_to_hub(repo_id=f"nielsr/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="dpt-beit-large-512", - type=str, - choices=["dpt-beit-large-512", "dpt-beit-large-384", "dpt-beit-base-384"], - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model to the hub after conversion.", - ) - - args = parser.parse_args() - convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py b/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py deleted file mode 100644 index ce53018a7627..000000000000 --- a/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py +++ /dev/null @@ -1,315 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DPT checkpoints from the original repository. 
URL: https://github.com/isl-org/DPT""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DPTConfig, DPTForDepthEstimation, DPTForSemanticSegmentation, DPTImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dpt_config(checkpoint_url): - config = DPTConfig(embedding_type="hybrid") - - if "large" in checkpoint_url: - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - config.backbone_out_indices = [5, 11, 17, 23] - config.neck_hidden_sizes = [256, 512, 1024, 1024] - expected_shape = (1, 384, 384) - - if "nyu" in checkpoint_url or "midas" in checkpoint_url: - config.hidden_size = 768 - config.reassemble_factors = [1, 1, 1, 0.5] - config.neck_hidden_sizes = [256, 512, 768, 768] - config.num_labels = 150 - config.patch_size = 16 - expected_shape = (1, 384, 384) - config.use_batch_norm_in_fusion_residual = False - config.readout_type = "project" - - if "ade" in checkpoint_url: - config.use_batch_norm_in_fusion_residual = True - config.hidden_size = 768 - config.reassemble_stage = [1, 1, 1, 0.5] - config.num_labels = 150 - config.patch_size = 16 - repo_id = "huggingface/label-files" - filename = "ade20k-id2label.json" - id2label = json.loads(Path(hf_hub_download(repo_id, filename, repo_type="dataset")).read_text()) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - expected_shape = [1, 150, 480, 480] - - return config, expected_shape - - -def remove_ignore_keys_(state_dict): - ignore_keys = ["pretrained.model.head.weight", "pretrained.model.head.bias"] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(name): - if ( - "pretrained.model" in name - and "cls_token" not in name - and "pos_embed" not in name - and "patch_embed" not in name - ): - name = name.replace("pretrained.model", "dpt.encoder") - if "pretrained.model" in name: - name = name.replace("pretrained.model", "dpt.embeddings") - if "patch_embed" in name: - name = name.replace("patch_embed", "") - if "pos_embed" in name: - name = name.replace("pos_embed", "position_embeddings") - if "attn.proj" in name: - name = name.replace("attn.proj", "attention.output.dense") - if "proj" in name and "project" not in name: - name = name.replace("proj", "projection") - if "blocks" in name: - name = name.replace("blocks", "layer") - if "mlp.fc1" in name: - name = name.replace("mlp.fc1", "intermediate.dense") - if "mlp.fc2" in name: - name = name.replace("mlp.fc2", "output.dense") - if "norm1" in name and "backbone" not in name: - name = name.replace("norm1", "layernorm_before") - if "norm2" in name and "backbone" not in name: - name = name.replace("norm2", "layernorm_after") - if "scratch.output_conv" in name: - name = name.replace("scratch.output_conv", "head") - if "scratch" in name: - name = name.replace("scratch", "neck") - if "layer1_rn" in name: - name = name.replace("layer1_rn", "convs.0") - if "layer2_rn" in name: - name = name.replace("layer2_rn", "convs.1") - if "layer3_rn" in name: - name = name.replace("layer3_rn", "convs.2") - if "layer4_rn" in name: - name = name.replace("layer4_rn", "convs.3") - if "refinenet" in name: - layer_idx = int(name[len("neck.refinenet") : len("neck.refinenet") + 1]) - # tricky here: we need to map 4 to 0, 3 to 
1, 2 to 2 and 1 to 3 - name = name.replace(f"refinenet{layer_idx}", f"fusion_stage.layers.{abs(layer_idx - 4)}") - if "out_conv" in name: - name = name.replace("out_conv", "projection") - if "resConfUnit1" in name: - name = name.replace("resConfUnit1", "residual_layer1") - if "resConfUnit2" in name: - name = name.replace("resConfUnit2", "residual_layer2") - if "conv1" in name: - name = name.replace("conv1", "convolution1") - if "conv2" in name: - name = name.replace("conv2", "convolution2") - # readout blocks - if "pretrained.act_postprocess1.0.project.0" in name: - name = name.replace("pretrained.act_postprocess1.0.project.0", "neck.reassemble_stage.readout_projects.0.0") - if "pretrained.act_postprocess2.0.project.0" in name: - name = name.replace("pretrained.act_postprocess2.0.project.0", "neck.reassemble_stage.readout_projects.1.0") - if "pretrained.act_postprocess3.0.project.0" in name: - name = name.replace("pretrained.act_postprocess3.0.project.0", "neck.reassemble_stage.readout_projects.2.0") - if "pretrained.act_postprocess4.0.project.0" in name: - name = name.replace("pretrained.act_postprocess4.0.project.0", "neck.reassemble_stage.readout_projects.3.0") - - # resize blocks - if "pretrained.act_postprocess1.3" in name: - name = name.replace("pretrained.act_postprocess1.3", "neck.reassemble_stage.layers.0.projection") - if "pretrained.act_postprocess1.4" in name: - name = name.replace("pretrained.act_postprocess1.4", "neck.reassemble_stage.layers.0.resize") - if "pretrained.act_postprocess2.3" in name: - name = name.replace("pretrained.act_postprocess2.3", "neck.reassemble_stage.layers.1.projection") - if "pretrained.act_postprocess2.4" in name: - name = name.replace("pretrained.act_postprocess2.4", "neck.reassemble_stage.layers.1.resize") - if "pretrained.act_postprocess3.3" in name: - name = name.replace("pretrained.act_postprocess3.3", "neck.reassemble_stage.layers.2.projection") - if "pretrained.act_postprocess4.3" in name: - name = name.replace("pretrained.act_postprocess4.3", "neck.reassemble_stage.layers.3.projection") - if "pretrained.act_postprocess4.4" in name: - name = name.replace("pretrained.act_postprocess4.4", "neck.reassemble_stage.layers.3.resize") - if "pretrained" in name: - name = name.replace("pretrained", "dpt") - if "bn" in name: - name = name.replace("bn", "batch_norm") - if "head" in name: - name = name.replace("head", "head.head") - if "encoder.norm" in name: - name = name.replace("encoder.norm", "layernorm") - if "auxlayer" in name: - name = name.replace("auxlayer", "auxiliary_head.head") - if "backbone" in name: - name = name.replace("backbone", "backbone.bit.encoder") - - if ".." 
in name: - name = name.replace("..", ".") - - if "stem.conv" in name: - name = name.replace("stem.conv", "bit.embedder.convolution") - if "blocks" in name: - name = name.replace("blocks", "layers") - if "convolution" in name and "backbone" in name: - name = name.replace("convolution", "conv") - if "layer" in name and "backbone" in name: - name = name.replace("layer", "layers") - if "backbone.bit.encoder.bit" in name: - name = name.replace("backbone.bit.encoder.bit", "backbone.bit") - if "embedder.conv" in name: - name = name.replace("embedder.conv", "embedder.convolution") - if "backbone.bit.encoder.stem.norm" in name: - name = name.replace("backbone.bit.encoder.stem.norm", "backbone.bit.embedder.norm") - return name - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - for i in range(config.num_hidden_layers): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"dpt.encoder.layer.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"dpt.encoder.layer.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub, model_name, show_prediction): - """ - Copy/paste/tweak model's weights to our DPT structure. 
- """ - - # define DPT configuration based on URL - config, expected_shape = get_dpt_config(checkpoint_url) - # load original state_dict from URL - # state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu") - state_dict = torch.load(checkpoint_url, map_location="cpu", weights_only=True) - # remove certain keys - remove_ignore_keys_(state_dict) - # rename keys - for key in state_dict.copy(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - # read in qkv matrices - read_in_q_k_v(state_dict, config) - - # load HuggingFace model - model = DPTForSemanticSegmentation(config) if "ade" in checkpoint_url else DPTForDepthEstimation(config) - model.load_state_dict(state_dict) - model.eval() - - # Check outputs on an image - size = 480 if "ade" in checkpoint_url else 384 - image_processor = DPTImageProcessor(size=size) - - image = prepare_img() - encoding = image_processor(image, return_tensors="pt") - - # forward pass - outputs = model(**encoding).logits if "ade" in checkpoint_url else model(**encoding).predicted_depth - - if show_prediction: - prediction = ( - torch.nn.functional.interpolate( - outputs.unsqueeze(1), - size=(image.size[1], image.size[0]), - mode="bicubic", - align_corners=False, - ) - .squeeze() - .cpu() - .numpy() - ) - - Image.fromarray((prediction / prediction.max()) * 255).show() - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model.push_to_hub("ybelkada/dpt-hybrid-midas") - image_processor.push_to_hub("ybelkada/dpt-hybrid-midas") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--checkpoint_url", - default="https://github.com/intel-isl/DPT/releases/download/1_0/dpt_large-midas-2f21e586.pt", - type=str, - help="URL of the original DPT checkpoint you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=False, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - ) - parser.add_argument( - "--model_name", - default="dpt-large", - type=str, - help="Name of the model, in case you're pushing to the hub.", - ) - parser.add_argument( - "--show_prediction", - action="store_true", - ) - - args = parser.parse_args() - convert_dpt_checkpoint( - args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub, args.model_name, args.show_prediction - ) diff --git a/src/transformers/models/dpt/convert_dpt_swinv2_to_hf.py b/src/transformers/models/dpt/convert_dpt_swinv2_to_hf.py deleted file mode 100644 index 0feebe72d474..000000000000 --- a/src/transformers/models/dpt/convert_dpt_swinv2_to_hf.py +++ /dev/null @@ -1,321 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DPT 3.1 checkpoints from the MiDaS repository. URL: https://github.com/isl-org/MiDaS""" - -import argparse -from pathlib import Path - -import requests -import torch -from PIL import Image - -from transformers import DPTConfig, DPTForDepthEstimation, DPTImageProcessor, Swinv2Config -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dpt_config(model_name): - if "tiny" in model_name: - embed_dim = 96 - depths = (2, 2, 6, 2) - num_heads = (3, 6, 12, 24) - window_size = 16 - # note: for Swinv2-tiny authors used the window_size = 16 variant - # as seen here: https://github.com/isl-org/MiDaS/blob/bdc4ed64c095e026dc0a2f17cabb14d58263decb/midas/backbones/swin2.py#L26 - pretrained_window_sizes = (0, 0, 0, 0) - elif "base" in model_name: - embed_dim = 128 - depths = (2, 2, 18, 2) - num_heads = (4, 8, 16, 32) - window_size = 24 - pretrained_window_sizes = (12, 12, 12, 6) - elif "large" in model_name: - embed_dim = 192 - depths = (2, 2, 18, 2) - num_heads = (6, 12, 24, 48) - window_size = 24 - pretrained_window_sizes = (12, 12, 12, 6) - - if "384" in model_name: - image_size = 384 - elif "256" in model_name: - image_size = 256 - else: - raise ValueError("Model not supported, to do") - - backbone_config = Swinv2Config( - image_size=image_size, - embed_dim=embed_dim, - depths=depths, - window_size=window_size, - pretrained_window_sizes=pretrained_window_sizes, - num_heads=num_heads, - out_features=["stage1", "stage2", "stage3", "stage4"], - ) - - if model_name == "dpt-swinv2-tiny-256": - neck_hidden_sizes = [96, 192, 384, 768] - elif model_name == "dpt-swinv2-base-384": - neck_hidden_sizes = [128, 256, 512, 1024] - elif model_name == "dpt-swinv2-large-384": - neck_hidden_sizes = [192, 384, 768, 1536] - - config = DPTConfig(backbone_config=backbone_config, neck_hidden_sizes=neck_hidden_sizes) - - return config, image_size - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config): - rename_keys = [] - - # fmt: off - # stem - rename_keys.append(("pretrained.model.patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("pretrained.model.patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias")) - rename_keys.append(("pretrained.model.patch_embed.norm.weight", "backbone.embeddings.norm.weight")) - rename_keys.append(("pretrained.model.patch_embed.norm.bias", "backbone.embeddings.norm.bias")) - - # transformer encoder - for i in range(len(config.backbone_config.depths)): - for j in range(config.backbone_config.depths[i]): - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.logit_scale", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.logit_scale")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.cpb_mlp.0.weight", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.continuous_position_bias_mlp.0.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.cpb_mlp.0.bias", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.continuous_position_bias_mlp.0.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.cpb_mlp.2.weight", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.continuous_position_bias_mlp.2.weight")) - 
rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.q_bias", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.query.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.v_bias", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.value.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.proj.weight", f"backbone.encoder.layers.{i}.blocks.{j}.attention.output.dense.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.proj.bias", f"backbone.encoder.layers.{i}.blocks.{j}.attention.output.dense.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.norm1.weight", f"backbone.encoder.layers.{i}.blocks.{j}.layernorm_before.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.norm1.bias", f"backbone.encoder.layers.{i}.blocks.{j}.layernorm_before.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.mlp.fc1.weight", f"backbone.encoder.layers.{i}.blocks.{j}.intermediate.dense.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.mlp.fc1.bias", f"backbone.encoder.layers.{i}.blocks.{j}.intermediate.dense.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.mlp.fc2.weight", f"backbone.encoder.layers.{i}.blocks.{j}.output.dense.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.mlp.fc2.bias", f"backbone.encoder.layers.{i}.blocks.{j}.output.dense.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.norm2.weight", f"backbone.encoder.layers.{i}.blocks.{j}.layernorm_after.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.norm2.bias", f"backbone.encoder.layers.{i}.blocks.{j}.layernorm_after.bias")) - - # downsample parameters - if i in [0,1,2]: - rename_keys.append((f"pretrained.model.layers.{i}.downsample.reduction.weight", f"backbone.encoder.layers.{i}.downsample.reduction.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.downsample.norm.weight", f"backbone.encoder.layers.{i}.downsample.norm.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.downsample.norm.bias", f"backbone.encoder.layers.{i}.downsample.norm.bias")) - - # note: non-Transformer backbones like Swinv2, LeViT et al don't require activation postprocessing (readout projections + resize blocks) - - # refinenet (tricky here) - mapping = {1:3, 2:2, 3:1, 4:0} - - for i in range(1, 5): - j = mapping[i] - rename_keys.append((f"scratch.refinenet{i}.out_conv.weight", f"neck.fusion_stage.layers.{j}.projection.weight")) - rename_keys.append((f"scratch.refinenet{i}.out_conv.bias", f"neck.fusion_stage.layers.{j}.projection.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.bias")) - 
rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.bias")) - - # scratch convolutions - for i in range(4): - rename_keys.append((f"scratch.layer{i+1}_rn.weight", f"neck.convs.{i}.weight")) - - # head - for i in range(0, 5, 2): - rename_keys.append((f"scratch.output_conv.{i}.weight", f"head.head.{i}.weight")) - rename_keys.append((f"scratch.output_conv.{i}.bias", f"head.head.{i}.bias")) - - return rename_keys - - -def remove_ignore_keys_(state_dict): - ignore_keys = ["pretrained.model.head.weight", "pretrained.model.head.bias"] - for k in ignore_keys: - state_dict.pop(k, None) - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config, model): - for i in range(len(config.backbone_config.depths)): - for j in range(config.backbone_config.depths[i]): - dim = model.backbone.encoder.layers[i].blocks[j].attention.self.all_head_size - # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"pretrained.model.layers.{i}.blocks.{j}.attn.qkv.weight") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.query.weight"] = in_proj_weight[:dim, :] - state_dict[f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.key.weight"] = in_proj_weight[ - dim : dim * 2, : - ] - state_dict[f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.value.weight"] = in_proj_weight[ - -dim:, : - ] - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, verify_logits, push_to_hub): - """ - Copy/paste/tweak model's weights to our DPT structure. 
- """ - - name_to_url = { - "dpt-swinv2-tiny-256": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_tiny_256.pt", - "dpt-swinv2-base-384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_base_384.pt", - "dpt-swinv2-large-384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt", - } - - # define DPT configuration based on URL - checkpoint_url = name_to_url[model_name] - config, image_size = get_dpt_config(model_name) - # load original state_dict from URL - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu") - - # load HuggingFace model - model = DPTForDepthEstimation(config) - - # remove certain keys - remove_ignore_keys_(state_dict) - # rename keys - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - # read in qkv matrices - read_in_q_k_v(state_dict, config, model) - - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) - model.eval() - - # Check outputs on an image - processor = DPTImageProcessor(size={"height": image_size, "width": image_size}) - - image = prepare_img() - processor(image, return_tensors="pt") - - if verify_logits: - from torchvision import transforms - - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - transforms = transforms.Compose( - [ - transforms.Resize((image_size, image_size)), - transforms.ToTensor(), - ] - ) - pixel_values = transforms(image).unsqueeze(0) - - # forward pass - with torch.no_grad(): - outputs = model(pixel_values) - - predicted_depth = outputs.predicted_depth - - print("Shape of predicted depth:", predicted_depth.shape) - print("First values of predicted depth:", predicted_depth[0, :3, :3]) - - # assert logits - if model_name == "dpt-swinv2-base-384": - # OK, checked - expected_shape = torch.Size([1, 384, 384]) - expected_slice = torch.tensor( - [ - [1998.5575, 1997.3887, 2009.2981], - [1952.8607, 1979.6488, 2001.0854], - [1953.7697, 1961.7711, 1968.8904], - ], - ) - elif model_name == "dpt-swinv2-tiny-256": - # OK, checked - expected_shape = torch.Size([1, 256, 256]) - expected_slice = torch.tensor( - [[978.9163, 976.5215, 978.5349], [974.1859, 971.7249, 975.8046], [971.3419, 970.3118, 971.6830]], - ) - elif model_name == "dpt-swinv2-large-384": - # OK, checked - expected_shape = torch.Size([1, 384, 384]) - expected_slice = torch.tensor( - [ - [1203.7206, 1200.1495, 1197.8234], - [1196.2484, 1183.5033, 1186.4640], - [1178.8131, 1182.3260, 1174.3975], - ], - ) - - assert predicted_depth.shape == torch.Size(expected_shape) - assert torch.allclose(predicted_depth[0, :3, :3], expected_slice) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model and processor to hub...") - model.push_to_hub(repo_id=f"Intel/{model_name}") - processor.push_to_hub(repo_id=f"Intel/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="dpt-swinv2-base-384", - type=str, - choices=["dpt-swinv2-tiny-256", "dpt-swinv2-base-384", "dpt-swinv2-large-384"], - 
help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--verify_logits", - action="store_true", - help="Whether to verify logits after conversion.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model to the hub after conversion.", - ) - - args = parser.parse_args() - convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.verify_logits, args.push_to_hub) diff --git a/src/transformers/models/dpt/convert_dpt_to_pytorch.py b/src/transformers/models/dpt/convert_dpt_to_pytorch.py deleted file mode 100644 index 1341f8908bcd..000000000000 --- a/src/transformers/models/dpt/convert_dpt_to_pytorch.py +++ /dev/null @@ -1,285 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DPT checkpoints from the original repository. URL: https://github.com/isl-org/DPT""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DPTConfig, DPTForDepthEstimation, DPTForSemanticSegmentation, DPTImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dpt_config(checkpoint_url): - config = DPTConfig() - - if "large" in checkpoint_url: - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - config.backbone_out_indices = [5, 11, 17, 23] - config.neck_hidden_sizes = [256, 512, 1024, 1024] - expected_shape = (1, 384, 384) - - if "ade" in checkpoint_url: - config.use_batch_norm_in_fusion_residual = True - - config.num_labels = 150 - repo_id = "huggingface/label-files" - filename = "ade20k-id2label.json" - id2label = json.loads(Path(hf_hub_download(repo_id, filename, repo_type="dataset")).read_text()) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - expected_shape = [1, 150, 480, 480] - - return config, expected_shape - - -def remove_ignore_keys_(state_dict): - ignore_keys = ["pretrained.model.head.weight", "pretrained.model.head.bias"] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(name): - if ( - "pretrained.model" in name - and "cls_token" not in name - and "pos_embed" not in name - and "patch_embed" not in name - ): - name = name.replace("pretrained.model", "dpt.encoder") - if "pretrained.model" in name: - name = name.replace("pretrained.model", "dpt.embeddings") - if "patch_embed" in name: - name = name.replace("patch_embed", "patch_embeddings") - if "pos_embed" in name: - name = name.replace("pos_embed", "position_embeddings") - if "attn.proj" in name: - name = name.replace("attn.proj", "attention.output.dense") - if 
"proj" in name and "project" not in name: - name = name.replace("proj", "projection") - if "blocks" in name: - name = name.replace("blocks", "layer") - if "mlp.fc1" in name: - name = name.replace("mlp.fc1", "intermediate.dense") - if "mlp.fc2" in name: - name = name.replace("mlp.fc2", "output.dense") - if "norm1" in name: - name = name.replace("norm1", "layernorm_before") - if "norm2" in name: - name = name.replace("norm2", "layernorm_after") - if "scratch.output_conv" in name: - name = name.replace("scratch.output_conv", "head") - if "scratch" in name: - name = name.replace("scratch", "neck") - if "layer1_rn" in name: - name = name.replace("layer1_rn", "convs.0") - if "layer2_rn" in name: - name = name.replace("layer2_rn", "convs.1") - if "layer3_rn" in name: - name = name.replace("layer3_rn", "convs.2") - if "layer4_rn" in name: - name = name.replace("layer4_rn", "convs.3") - if "refinenet" in name: - layer_idx = int(name[len("neck.refinenet") : len("neck.refinenet") + 1]) - # tricky here: we need to map 4 to 0, 3 to 1, 2 to 2 and 1 to 3 - name = name.replace(f"refinenet{layer_idx}", f"fusion_stage.layers.{abs(layer_idx - 4)}") - if "out_conv" in name: - name = name.replace("out_conv", "projection") - if "resConfUnit1" in name: - name = name.replace("resConfUnit1", "residual_layer1") - if "resConfUnit2" in name: - name = name.replace("resConfUnit2", "residual_layer2") - if "conv1" in name: - name = name.replace("conv1", "convolution1") - if "conv2" in name: - name = name.replace("conv2", "convolution2") - # readout blocks - if "pretrained.act_postprocess1.0.project.0" in name: - name = name.replace("pretrained.act_postprocess1.0.project.0", "neck.reassemble_stage.readout_projects.0.0") - if "pretrained.act_postprocess2.0.project.0" in name: - name = name.replace("pretrained.act_postprocess2.0.project.0", "neck.reassemble_stage.readout_projects.1.0") - if "pretrained.act_postprocess3.0.project.0" in name: - name = name.replace("pretrained.act_postprocess3.0.project.0", "neck.reassemble_stage.readout_projects.2.0") - if "pretrained.act_postprocess4.0.project.0" in name: - name = name.replace("pretrained.act_postprocess4.0.project.0", "neck.reassemble_stage.readout_projects.3.0") - # resize blocks - if "pretrained.act_postprocess1.3" in name: - name = name.replace("pretrained.act_postprocess1.3", "neck.reassemble_stage.layers.0.projection") - if "pretrained.act_postprocess1.4" in name: - name = name.replace("pretrained.act_postprocess1.4", "neck.reassemble_stage.layers.0.resize") - if "pretrained.act_postprocess2.3" in name: - name = name.replace("pretrained.act_postprocess2.3", "neck.reassemble_stage.layers.1.projection") - if "pretrained.act_postprocess2.4" in name: - name = name.replace("pretrained.act_postprocess2.4", "neck.reassemble_stage.layers.1.resize") - if "pretrained.act_postprocess3.3" in name: - name = name.replace("pretrained.act_postprocess3.3", "neck.reassemble_stage.layers.2.projection") - if "pretrained.act_postprocess4.3" in name: - name = name.replace("pretrained.act_postprocess4.3", "neck.reassemble_stage.layers.3.projection") - if "pretrained.act_postprocess4.4" in name: - name = name.replace("pretrained.act_postprocess4.4", "neck.reassemble_stage.layers.3.resize") - if "pretrained" in name: - name = name.replace("pretrained", "dpt") - if "bn" in name: - name = name.replace("bn", "batch_norm") - if "head" in name: - name = name.replace("head", "head.head") - if "encoder.norm" in name: - name = name.replace("encoder.norm", "layernorm") - if "auxlayer" in name: - name = 
name.replace("auxlayer", "auxiliary_head.head") - - return name - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - for i in range(config.num_hidden_layers): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"dpt.encoder.layer.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"dpt.encoder.layer.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub, model_name): - """ - Copy/paste/tweak model's weights to our DPT structure. - """ - - # define DPT configuration based on URL - config, expected_shape = get_dpt_config(checkpoint_url) - # load original state_dict from URL - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu") - # remove certain keys - remove_ignore_keys_(state_dict) - # rename keys - for key in state_dict.copy(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - # read in qkv matrices - read_in_q_k_v(state_dict, config) - - # load HuggingFace model - model = DPTForSemanticSegmentation(config) if "ade" in checkpoint_url else DPTForDepthEstimation(config) - model.load_state_dict(state_dict) - model.eval() - - # Check outputs on an image - size = 480 if "ade" in checkpoint_url else 384 - image_processor = DPTImageProcessor(size=size) - - image = prepare_img() - encoding = image_processor(image, return_tensors="pt") - - # forward pass - outputs = model(**encoding).logits if "ade" in checkpoint_url else model(**encoding).predicted_depth - - # Assert logits - expected_slice = torch.tensor([[6.3199, 6.3629, 6.4148], [6.3850, 6.3615, 6.4166], [6.3519, 6.3176, 6.3575]]) - if "ade" in checkpoint_url: - expected_slice = torch.tensor([[4.0480, 4.2420, 4.4360], [4.3124, 4.5693, 4.8261], [4.5768, 4.8965, 5.2163]]) - assert outputs.shape == torch.Size(expected_shape) - assert ( - torch.allclose(outputs[0, 0, :3, :3], expected_slice, atol=1e-4) - if "ade" in checkpoint_url - else torch.allclose(outputs[0, :3, :3], expected_slice) - ) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model to hub...") - 
model.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add model", - use_temp_dir=True, - ) - image_processor.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add image processor", - use_temp_dir=True, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--checkpoint_url", - default="https://github.com/intel-isl/DPT/releases/download/1_0/dpt_large-midas-2f21e586.pt", - type=str, - help="URL of the original DPT checkpoint you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=False, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - ) - parser.add_argument( - "--model_name", - default="dpt-large", - type=str, - required=False, - help="Name of the model, in case you're pushing to the hub.", - ) - - args = parser.parse_args() - convert_dpt_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub, args.model_name) diff --git a/src/transformers/models/dpt/image_processing_dpt_fast.py b/src/transformers/models/dpt/image_processing_dpt_fast.py index d4848c50653c..faaddb8023c0 100644 --- a/src/transformers/models/dpt/image_processing_dpt_fast.py +++ b/src/transformers/models/dpt/image_processing_dpt_fast.py @@ -25,6 +25,7 @@ from typing import TYPE_CHECKING, Optional, Union import torch +from torchvision.transforms.v2 import functional as F from ...image_processing_base import BatchFeature from ...image_processing_utils_fast import BaseImageProcessorFast, DefaultFastImageProcessorKwargs @@ -39,17 +40,12 @@ is_torch_tensor, ) from ...processing_utils import Unpack -from ...utils import TensorType, auto_docstring, is_torchvision_v2_available, requires_backends +from ...utils import TensorType, auto_docstring, requires_backends if TYPE_CHECKING: from ...modeling_outputs import DepthEstimatorOutput -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - class DPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): """ diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index 363fce92f897..cef10dd76eda 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -879,7 +879,7 @@ def __init__(self, config: DPTConfig): self.config = config # postprocessing: only required in case of a non-hierarchical backbone (e.g. 
ViT, BEiT) - if config.backbone_config is not None and config.backbone_config.model_type in ["swinv2"]: + if config.backbone_config is not None and config.backbone_config.model_type == "swinv2": self.reassemble_stage = None else: self.reassemble_stage = DPTReassembleStage(config) diff --git a/src/transformers/models/dpt/modular_dpt.py b/src/transformers/models/dpt/modular_dpt.py index 32ca94a2d43f..34eb08f39b68 100644 --- a/src/transformers/models/dpt/modular_dpt.py +++ b/src/transformers/models/dpt/modular_dpt.py @@ -32,7 +32,6 @@ from ...utils import ( TensorType, auto_docstring, - is_torchvision_v2_available, requires_backends, ) from ..beit.image_processing_beit_fast import BeitImageProcessorFast @@ -41,10 +40,7 @@ if TYPE_CHECKING: from ...modeling_outputs import DepthEstimatorOutput -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F +from torchvision.transforms.v2 import functional as F def get_resize_output_image_size( diff --git a/src/transformers/models/edgetam/__init__.py b/src/transformers/models/edgetam/__init__.py new file mode 100644 index 000000000000..d9c1a55fc5bc --- /dev/null +++ b/src/transformers/models/edgetam/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_edgetam import * + from .modeling_edgetam import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/edgetam/configuration_edgetam.py b/src/transformers/models/edgetam/configuration_edgetam.py new file mode 100644 index 000000000000..07ccee36e932 --- /dev/null +++ b/src/transformers/models/edgetam/configuration_edgetam.py @@ -0,0 +1,332 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/edgetam/modular_edgetam.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_edgetam.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 The Meta AI Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...configuration_utils import PretrainedConfig +from ..auto import CONFIG_MAPPING, AutoConfig + + +class EdgeTamVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`EdgeTamVisionModel`]. It is used to instantiate a SAM + vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration + defaults will yield a similar configuration to that of SAM 2.1 Hiera-tiny + [facebook/EdgeTAM](https://huggingface.co/facebook/EdgeTAM) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + backbone_config (`Union[dict, "PretrainedConfig"]`, *optional*): + Configuration for the vision backbone. This is used to instantiate the backbone using + `AutoModel.from_config`. + backbone_channel_list (`List[int]`, *optional*, defaults to `[384, 192, 96, 48]`): + The list of channel dimensions for the backbone. + backbone_feature_sizes (`List[List[int]]`, *optional*, defaults to `[[256, 256], [128, 128], [64, 64]]`): + The spatial sizes of the feature maps from the backbone. + fpn_hidden_size (`int`, *optional*, defaults to 256): + The hidden dimension of the FPN. + fpn_kernel_size (`int`, *optional*, defaults to 1): + The kernel size for the convolutions in the neck. + fpn_stride (`int`, *optional*, defaults to 1): + The stride for the convolutions in the neck. + fpn_padding (`int`, *optional*, defaults to 0): + The padding for the convolutions in the neck. + fpn_top_down_levels (`List[int]`, *optional*, defaults to `[2, 3]`): + The levels for the top-down FPN connections. + num_feature_levels (`int`, *optional*, defaults to 3): + The number of feature levels from the FPN to use. + hidden_act (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function in the neck. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon for the layer normalization. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
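+
+    Example (a short usage sketch; note that with no `backbone_config` the default backbone configuration is
+    resolved from the `timm/repvit_m1.dist_in1k` checkpoint, as done in `__init__` below):
+
+    ```python
+    >>> from transformers import EdgeTamVisionConfig
+
+    >>> # Initializing an EdgeTamVisionConfig with default values
+    >>> configuration = EdgeTamVisionConfig()
+
+    >>> # The FPN neck settings documented above are exposed as plain attributes
+    >>> configuration.fpn_hidden_size
+    256
+    ```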
+ + """ + + base_config_key = "vision_config" + model_type = "edgetam_vision_model" + sub_configs = { + "backbone_config": AutoConfig, + } + + def __init__( + self, + backbone_config=None, + backbone_channel_list=None, + backbone_feature_sizes=None, + fpn_hidden_size=256, + fpn_kernel_size=1, + fpn_stride=1, + fpn_padding=0, + fpn_top_down_levels=None, + num_feature_levels=3, + hidden_act="gelu", + layer_norm_eps=1e-6, + initializer_range=0.02, + **kwargs, + ): + super().__init__(**kwargs) + + backbone_channel_list = [384, 192, 96, 48] if backbone_channel_list is None else backbone_channel_list + backbone_feature_sizes = ( + [[256, 256], [128, 128], [64, 64]] if backbone_feature_sizes is None else backbone_feature_sizes + ) + fpn_top_down_levels = [2, 3] if fpn_top_down_levels is None else fpn_top_down_levels + + if isinstance(backbone_config, dict): + backbone_config["model_type"] = backbone_config.get("model_type", "timm_wrapper") + backbone_config = CONFIG_MAPPING[backbone_config["model_type"]](**backbone_config) + elif isinstance(backbone_config, AutoConfig): + backbone_config = backbone_config + elif backbone_config is None: + backbone_config = AutoConfig.from_pretrained( + "timm/repvit_m1.dist_in1k", + model_args={"in_chans": 3, "features_only": True, "out_indices": [0, 1, 2, 3]}, + ) + + self.backbone_config = backbone_config + + # Neck + self.backbone_channel_list = backbone_channel_list + self.backbone_feature_sizes = backbone_feature_sizes + self.fpn_hidden_size = fpn_hidden_size + self.fpn_kernel_size = fpn_kernel_size + self.fpn_stride = fpn_stride + self.fpn_padding = fpn_padding + self.fpn_top_down_levels = fpn_top_down_levels + self.num_feature_levels = num_feature_levels + + self.hidden_act = hidden_act + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + + +class EdgeTamPromptEncoderConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`EdgeTamPromptEncoder`]. The [`EdgeTamPromptEncoder`] + module is used to encode the input 2D points and bounding boxes. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 256): + Dimensionality of the hidden states. + image_size (`int`, *optional*, defaults to 1024): + The expected output resolution of the image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + mask_input_channels (`int`, *optional*, defaults to 16): + The number of channels to be fed to the `MaskDecoder` module. + num_point_embeddings (`int`, *optional*, defaults to 4): + The number of point embeddings to be used. + hidden_act (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function in the encoder and pooler. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + scale (`float`, *optional*, defaults to 1): + The scale factor for the prompt encoder. 
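+
+    Example (all values shown are the documented defaults):
+
+    ```python
+    >>> from transformers import EdgeTamPromptEncoderConfig
+
+    >>> # Initializing a prompt encoder config with default values
+    >>> prompt_encoder_config = EdgeTamPromptEncoderConfig()
+
+    >>> # Image embeddings form an (image_size // patch_size) x (image_size // patch_size) spatial grid
+    >>> prompt_encoder_config.image_size // prompt_encoder_config.patch_size
+    64
+    ```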
+ """ + + base_config_key = "prompt_encoder_config" + + def __init__( + self, + hidden_size=256, + image_size=1024, + patch_size=16, + mask_input_channels=16, + num_point_embeddings=4, + hidden_act="gelu", + layer_norm_eps=1e-6, + scale=1, + **kwargs, + ): + super().__init__(**kwargs) + self.hidden_size = hidden_size + self.image_size = image_size + self.patch_size = patch_size + self.mask_input_channels = mask_input_channels + self.num_point_embeddings = num_point_embeddings + self.hidden_act = hidden_act + self.layer_norm_eps = layer_norm_eps + self.scale = scale + + +class EdgeTamMaskDecoderConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`EdgeTamMaskDecoder`]. It is used to instantiate a EDGETAM + memory encoder according to the specified arguments, defining the model architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 256): + Dimensionality of the hidden states. + hidden_act (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function in the EDGETAM mask decoder. + mlp_dim (`int`, *optional*, defaults to 2048): + The dimension of the MLP in the two-way transformer. + num_hidden_layers (`int`, *optional*, defaults to 2): + The number of hidden layers in the two-way transformer. + num_attention_heads (`int`, *optional*, defaults to 8): + The number of attention heads in the two-way transformer. + attention_downsample_rate (`int`, *optional*, defaults to 2): + The downsample rate for the attention layers. + num_multimask_outputs (`int`, *optional*, defaults to 3): + The number of multimask outputs. + iou_head_depth (`int`, *optional*, defaults to 3): + The depth of the IoU head. + iou_head_hidden_dim (`int`, *optional*, defaults to 256): + The hidden dimension of the IoU head. + dynamic_multimask_via_stability (`bool`, *optional*, defaults to `True`): + Whether to use dynamic multimask via stability. + dynamic_multimask_stability_delta (`float`, *optional*, defaults to 0.05): + The stability delta for the dynamic multimask. + dynamic_multimask_stability_thresh (`float`, *optional*, defaults to 0.98): + The stability threshold for the dynamic multimask. 
+ + """ + + base_config_key = "mask_decoder_config" + + def __init__( + self, + hidden_size=256, + hidden_act="gelu", + mlp_dim=2048, + num_hidden_layers=2, + num_attention_heads=8, + attention_downsample_rate=2, + num_multimask_outputs=3, + iou_head_depth=3, + iou_head_hidden_dim=256, + dynamic_multimask_via_stability=True, + dynamic_multimask_stability_delta=0.05, + dynamic_multimask_stability_thresh=0.98, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_multimask_outputs = num_multimask_outputs + self.hidden_act = hidden_act + self.iou_head_depth = iou_head_depth + self.iou_head_hidden_dim = iou_head_hidden_dim + self.dynamic_multimask_via_stability = dynamic_multimask_via_stability + self.dynamic_multimask_stability_delta = dynamic_multimask_stability_delta + self.dynamic_multimask_stability_thresh = dynamic_multimask_stability_thresh + + # TwoWayTransformer configuration + self.num_hidden_layers = num_hidden_layers + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.mlp_dim = mlp_dim + self.attention_downsample_rate = attention_downsample_rate + + +class EdgeTamConfig(PretrainedConfig): + r""" + [`EdgeTamConfig`] is the configuration class to store the configuration of a [`EdgeTamModel`]. It is used to instantiate a + EDGETAM model according to the specified arguments, defining the memory attention, memory encoder, and image encoder + configs. Instantiating a configuration defaults will yield a similar configuration to that of the SAM 2.1 Hiera-tiny + [facebook/edgetam.1-hiera-tiny](https://huggingface.co/facebook/edgetam.1-hiera-tiny) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (Union[`dict`, `EdgeTamVisionConfig`], *optional*): + Dictionary of configuration options used to initialize [`EdgeTamVisionConfig`]. + prompt_encoder_config (Union[`dict`, `EdgeTamPromptEncoderConfig`], *optional*): + Dictionary of configuration options used to initialize [`EdgeTamPromptEncoderConfig`]. + mask_decoder_config (Union[`dict`, `EdgeTamMaskDecoderConfig`], *optional*): + Dictionary of configuration options used to initialize [`EdgeTamMaskDecoderConfig`]. + initializer_range (`float`, *optional*, defaults to 0.02): + Standard deviation for parameter initialization. + + Example: + + ```python + >>> from transformers import ( + ... EdgeTamVisionConfig, + ... EdgeTamPromptEncoderConfig, + ... EdgeTamMaskDecoderConfig, + ... EdgeTamModel, + ... 
) + + >>> # Initializing a EdgeTamConfig with `"facebook/edgetam.1_hiera_tiny"` style configuration + >>> configuration = EdgeTamconfig() + + >>> # Initializing a EdgeTamModel (with random weights) from the `"facebook/edgetam.1_hiera_tiny"` style configuration + >>> model = EdgeTamModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a EdgeTamConfig from a EdgeTamVisionConfig, EdgeTamPromptEncoderConfig, and EdgeTamMaskDecoderConfig + + >>> # Initializing EDGETAM vision encoder, memory attention, and memory encoder configurations + >>> vision_config = EdgeTamVisionConfig() + >>> prompt_encoder_config = EdgeTamPromptEncoderConfig() + >>> mask_decoder_config = EdgeTamMaskDecoderConfig() + + >>> config = EdgeTamConfig(vision_config, prompt_encoder_config, mask_decoder_config) + ```""" + + model_type = "edgetam" + sub_configs = { + "vision_config": AutoConfig, + "prompt_encoder_config": EdgeTamPromptEncoderConfig, + "mask_decoder_config": EdgeTamMaskDecoderConfig, + } + + def __init__( + self, + vision_config=None, + prompt_encoder_config=None, + mask_decoder_config=None, + initializer_range=0.02, + **kwargs, + ): + super().__init__(**kwargs) + vision_config = vision_config if vision_config is not None else {} + prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {} + mask_decoder_config = mask_decoder_config if mask_decoder_config is not None else {} + + if isinstance(vision_config, dict): + vision_config["model_type"] = vision_config.get("model_type", "edgetam_vision_model") + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + if isinstance(prompt_encoder_config, EdgeTamPromptEncoderConfig): + prompt_encoder_config = prompt_encoder_config.to_dict() + if isinstance(mask_decoder_config, EdgeTamMaskDecoderConfig): + mask_decoder_config = mask_decoder_config.to_dict() + + self.vision_config = vision_config + self.prompt_encoder_config = EdgeTamPromptEncoderConfig(**prompt_encoder_config) + self.mask_decoder_config = EdgeTamMaskDecoderConfig(**mask_decoder_config) + + self.initializer_range = initializer_range + + +__all__ = ["EdgeTamConfig", "EdgeTamVisionConfig", "EdgeTamPromptEncoderConfig", "EdgeTamMaskDecoderConfig"] diff --git a/src/transformers/models/edgetam/modeling_edgetam.py b/src/transformers/models/edgetam/modeling_edgetam.py new file mode 100644 index 000000000000..d7e3ee6009cf --- /dev/null +++ b/src/transformers/models/edgetam/modeling_edgetam.py @@ -0,0 +1,1252 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/edgetam/modular_edgetam.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_edgetam.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 The Meta AI Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from dataclasses import dataclass +from typing import Callable, Optional, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + +from transformers.utils.generic import OutputRecorder, TransformersKwargs, check_model_inputs + +from ...activations import ACT2FN +from ...modeling_outputs import BaseModelOutput +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...pytorch_utils import compile_compatible_method_lru_cache +from ...utils import ModelOutput, auto_docstring +from ..auto import AutoModel +from .configuration_edgetam import ( + EdgeTamConfig, + EdgeTamMaskDecoderConfig, + EdgeTamPromptEncoderConfig, + EdgeTamVisionConfig, +) + + +# fix this in modular +if True: + from transformers.models.timm_wrapper.modeling_timm_wrapper import TimmWrapperModel + + +class EdgeTamLayerNorm(nn.LayerNorm): + r"""LayerNorm that supports two data formats: channels_last (default) or channels_first. + The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height, + width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width). + """ + + def __init__(self, normalized_shape, *, eps=1e-6, data_format="channels_last", **kwargs): + super().__init__(normalized_shape, eps=eps, **kwargs) + if data_format not in ["channels_last", "channels_first"]: + raise NotImplementedError(f"Unsupported data format: {data_format}") + self.data_format = data_format + + def forward(self, features: torch.Tensor) -> torch.Tensor: + """ + Args: + features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels) + """ + if self.data_format == "channels_first": + features = features.permute(0, 2, 3, 1) + features = super().forward(features) + features = features.permute(0, 3, 1, 2) + else: + features = super().forward(features) + return features + + +@dataclass +@auto_docstring(custom_intro="Base class for the vision encoder's outputs.") +class EdgeTamVisionEncoderOutput(ModelOutput): + r""" + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, height, width, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + fpn_hidden_states (`tuple(torch.FloatTensor)`): + Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape + `(batch_size, hidden_size, height, width)`. Feature maps from the Feature Pyramid Network neck. + fpn_position_encoding (`tuple(torch.FloatTensor)`): + Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape + `(batch_size, hidden_size, height, width)`. Positional encodings corresponding to the `fpn_hidden_states`. 
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each stage) of shape `(batch_size, height, width, hidden_size)`. Hidden-states of the + model at the output of each stage. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + fpn_hidden_states: Optional[torch.FloatTensor] = None + fpn_position_encoding: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class EdgeTamAttention(nn.Module): + """ + EDGETAM's attention layer that allows for downscaling the size of the embedding after projection to queries, keys, and + values. 
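+
+    For example, with the default `EdgeTamMaskDecoderConfig` values (`hidden_size=256`,
+    `attention_downsample_rate=2`, `num_attention_heads=8`), queries, keys and values are projected to an
+    internal dimension of 256 // 2 = 128, which is split into 8 heads of 16 channels each.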
+ """ + + def __init__(self, config, downsample_rate=None): + super().__init__() + downsample_rate = config.attention_downsample_rate if downsample_rate is None else downsample_rate + self.config = config + self.hidden_size = config.hidden_size + self.internal_dim = config.hidden_size // downsample_rate + self.num_attention_heads = config.num_attention_heads + self.head_dim = self.internal_dim // config.num_attention_heads + self.scaling = self.head_dim**-0.5 + self.is_causal = False + + self.q_proj = nn.Linear(self.hidden_size, self.internal_dim) + self.k_proj = nn.Linear(self.hidden_size, self.internal_dim) + self.v_proj = nn.Linear(self.hidden_size, self.internal_dim) + self.o_proj = nn.Linear(self.internal_dim, self.hidden_size) + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_similarity: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor, torch.Tensor]: + # Input projections + batch_size, point_batch_size = query.shape[:2] + new_shape = (batch_size * point_batch_size, -1, self.num_attention_heads, self.head_dim) + + query = self.q_proj(query).view(*new_shape).transpose(1, 2) + key = self.k_proj(key).view(*new_shape).transpose(1, 2) + value = self.v_proj(value).view(*new_shape).transpose(1, 2) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query, + key, + value, + attention_mask=attention_similarity, + dropout=0.0, + scaling=self.scaling, + is_causal=self.is_causal, + **kwargs, + ) + + attn_output = attn_output.reshape( + batch_size, point_batch_size, -1, self.num_attention_heads * self.head_dim + ).contiguous() + attn_output = self.o_proj(attn_output) + + return attn_output, attn_weights + + +class EdgeTamTwoWayAttentionBlock(nn.Module): + def __init__(self, config: EdgeTamMaskDecoderConfig, skip_first_layer_pe: bool = False): + """ + A transformer block with four layers: + (1) self-attention of sparse inputs (2) cross attention of sparse inputs -> dense inputs (3) mlp block on + sparse inputs (4) cross attention of dense inputs -> sparse inputs + + Arguments: + config (`EdgeTamMaskDecoderConfig`): + The configuration file used to instantiate the block + attention_downsample_rate (*optionalk*, int, defaults to 2): + The downsample ratio of the block used to reduce the inner dim of the attention. + skip_first_layer_pe (*optional*, bool, defaults to `False`): + Whether or not to skip the addition of the query_point_embedding on the first layer. 
+ """ + super().__init__() + self.self_attn = EdgeTamAttention(config, downsample_rate=1) + self.layer_norm1 = nn.LayerNorm(config.hidden_size) + + self.cross_attn_token_to_image = EdgeTamAttention(config) + self.layer_norm2 = nn.LayerNorm(config.hidden_size) + + self.mlp = EdgeTamFeedForward( + config.hidden_size, config.mlp_dim, config.hidden_size, num_layers=config.num_hidden_layers + ) + self.layer_norm3 = nn.LayerNorm(config.hidden_size) + + self.layer_norm4 = nn.LayerNorm(config.hidden_size) + self.cross_attn_image_to_token = EdgeTamAttention(config) + + self.skip_first_layer_pe = skip_first_layer_pe + + def forward( + self, + queries: Tensor, + keys: Tensor, + query_point_embedding: Tensor, + key_point_embedding: Tensor, + attention_similarity: Tensor, + **kwargs: Unpack[TransformersKwargs], + ): + # Self attention block + if self.skip_first_layer_pe: + queries, _ = self.self_attn(query=queries, key=queries, value=queries) + else: + query = queries + query_point_embedding + attn_out, _ = self.self_attn(query=query, key=query, value=queries) + queries = queries + attn_out + queries = self.layer_norm1(queries) + + # Cross attention block, tokens attending to image embedding + query = queries + query_point_embedding + key = keys + key_point_embedding + + attn_out, _ = self.cross_attn_token_to_image( + query=query, key=key, value=keys, attention_similarity=attention_similarity + ) + queries = queries + attn_out + + queries = self.layer_norm2(queries) + + # MLP block + mlp_out = self.mlp(queries) + queries = queries + mlp_out + queries = self.layer_norm3(queries) + + # Cross attention block, image embedding attending to tokens + query = queries + query_point_embedding + key = keys + key_point_embedding + + attn_out, _ = self.cross_attn_image_to_token(query=key, key=query, value=queries) + keys = keys + attn_out + + keys = self.layer_norm4(keys) + return queries, keys, attn_out + + +class EdgeTamFeedForward(nn.Module): + def __init__( + self, + input_dim: int, + hidden_dim: int, + output_dim: int, + num_layers: int, + activation: str = "relu", + sigmoid_output: bool = False, + ): + super().__init__() + self.num_layers = num_layers + self.activation = ACT2FN[activation] + self.proj_in = nn.Linear(input_dim, hidden_dim) + self.proj_out = nn.Linear(hidden_dim, output_dim) + self.layers = nn.ModuleList([nn.Linear(hidden_dim, hidden_dim) for _ in range(num_layers - 2)]) + self.sigmoid_output = sigmoid_output + + def forward(self, hidden_states): + hidden_states = self.proj_in(hidden_states) + hidden_states = self.activation(hidden_states) + for layer in self.layers: + hidden_states = self.activation(layer(hidden_states)) + + hidden_states = self.proj_out(hidden_states) + if self.sigmoid_output: + hidden_states = F.sigmoid(hidden_states) + return hidden_states + + +@auto_docstring +class EdgeTamPreTrainedModel(PreTrainedModel): + config_class = EdgeTamConfig + base_model_prefix = "edgetam" + main_input_name = "pixel_values" + _supports_sdpa = True + _supports_flash_attn_2 = True + _supports_attention_backend = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, (nn.LayerNorm, EdgeTamLayerNorm)): + 
module.weight.data.fill_(1.0) + module.bias.data.zero_() + if isinstance(module, EdgeTamModel): + if module.no_memory_embedding is not None: + module.no_memory_embedding.data.zero_() + + +# copied and adapted from original implementation, also practically equal to DetrSinePositionEmbedding +class EdgeTamSinePositionEmbedding(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one used by the Attention is all you + need paper, generalized to work on images. + """ + + def __init__( + self, num_pos_feats: int = 64, temperature: int = 10000, normalize: bool = False, scale: Optional[float] = None + ): + super().__init__() + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + self.num_pos_feats = num_pos_feats + self.temperature = temperature + self.normalize = normalize + self.scale = 2 * math.pi if scale is None else scale + + @compile_compatible_method_lru_cache(maxsize=1) + def forward( + self, + shape: torch.Size, + device: Union[torch.device, str], + dtype: torch.dtype, + mask: Optional[Tensor] = None, + ) -> Tensor: + if mask is None: + mask = torch.zeros((shape[0], shape[2], shape[3]), device=device, dtype=torch.bool) + not_mask = (~mask).to(dtype) + y_embed = not_mask.cumsum(1) + x_embed = not_mask.cumsum(2) + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.num_pos_feats, dtype=torch.int64, device=device).to(dtype) + dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.num_pos_feats) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +class EdgeTamVisionNeck(nn.Module): + def __init__(self, config: EdgeTamVisionConfig): + super().__init__() + self.config = config + + self.position_encoding = EdgeTamSinePositionEmbedding( + num_pos_feats=config.fpn_hidden_size // 2, normalize=True + ) + self.convs = nn.ModuleList() + for in_channels in config.backbone_channel_list: + self.convs.append( + nn.Conv2d( + in_channels=in_channels, + out_channels=config.fpn_hidden_size, + kernel_size=config.fpn_kernel_size, + stride=config.fpn_stride, + padding=config.fpn_padding, + ), + ) + self.fpn_top_down_levels = config.fpn_top_down_levels + + def forward(self, hidden_states: torch.Tensor) -> tuple[tuple[torch.Tensor, ...], tuple[torch.Tensor, ...]]: + fpn_hidden_states = () + fpn_position_encoding = () + + # forward in top-down order (from low to high resolution) + n = len(self.convs) - 1 + for i in range(n, -1, -1): + lateral_features = hidden_states[i].permute(0, 3, 1, 2) + lateral_features = self.convs[n - i](lateral_features) + if i not in self.fpn_top_down_levels or i == n: + prev_features = lateral_features + else: + top_down_features = F.interpolate( + prev_features.to(dtype=torch.float32), + scale_factor=2.0, + mode="nearest", + align_corners=None, + antialias=False, + ).to(lateral_features.dtype) + prev_features = lateral_features + top_down_features + + prev_position_encoding = self.position_encoding( + prev_features.shape, prev_features.device, prev_features.dtype + ).to(prev_features.dtype) + + fpn_hidden_states += 
(prev_features,) + fpn_position_encoding += (prev_position_encoding,) + + return fpn_hidden_states, fpn_position_encoding + + +@auto_docstring( + custom_intro=""" + The vision model from EdgeTAM without any head or projection on top. + """ +) +class EdgeTamVisionModel(EdgeTamPreTrainedModel): + config_class = EdgeTamVisionConfig + main_input_name = "pixel_values" + _can_record_outputs = {"hidden_states": TimmWrapperModel, "attentions": TimmWrapperModel} + + def __init__(self, config: EdgeTamVisionConfig): + super().__init__(config) + self.config = config + + self.backbone = AutoModel.from_config(config.backbone_config) + + self.neck = EdgeTamVisionNeck(config) + self.num_feature_levels = config.num_feature_levels + + self.post_init() + + @check_model_inputs + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, EdgeTamVisionEncoderOutput]: + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Forward through backbone + backbone_output = self.backbone(pixel_values) + intermediate_hidden_states = backbone_output.last_hidden_state + intermediate_hidden_states = [hidden_state.permute(0, 2, 3, 1) for hidden_state in intermediate_hidden_states] + + fpn_hidden_states, fpn_position_encoding = self.neck(intermediate_hidden_states) + # Select last `num_feature_levels` feature levels from FPN and reverse order to get features from high to low resolution + fpn_hidden_states = fpn_hidden_states[-self.num_feature_levels :][::-1] + fpn_position_encoding = fpn_position_encoding[-self.num_feature_levels :][::-1] + + return EdgeTamVisionEncoderOutput( + last_hidden_state=intermediate_hidden_states[-1], + fpn_hidden_states=fpn_hidden_states, + fpn_position_encoding=fpn_position_encoding, + ) + + +@dataclass +@auto_docstring(custom_intro="Base class for the EdgeTam model's output.") +class EdgeTamImageSegmentationOutput(ModelOutput): + r""" + iou_scores (`torch.FloatTensor` of shape `(batch_size, point_batch_size, num_masks)`): + The Intersection over Union (IoU) scores of the predicted masks. + pred_masks (`torch.FloatTensor` of shape `(batch_size, point_batch_size, num_masks, height, width)`): + The predicted low-resolution masks. This is an alias for `low_res_masks`. These masks need to be post-processed + by the processor to be brought to the original image size. + object_score_logits (`torch.FloatTensor` of shape `(batch_size, point_batch_size, 1)`): + Logits for the object score, indicating if an object is present. + image_embeddings (`tuple(torch.FloatTensor)`): + The features from the FPN, which are used by the mask decoder. This is a tuple of `torch.FloatTensor` where each + tensor has shape `(batch_size, channels, height, width)`. + vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of each stage) of shape `(batch_size, height, width, hidden_size)`. + Hidden-states of the vision model at the output of each stage. + vision_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. + Attentions weights of the vision model. 
+ mask_decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. + Attentions weights of the mask decoder. + """ + + iou_scores: Optional[torch.FloatTensor] = None + pred_masks: Optional[torch.FloatTensor] = None + object_score_logits: Optional[torch.FloatTensor] = None + image_embeddings: tuple[torch.FloatTensor, ...] = None + vision_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + vision_attentions: Optional[tuple[torch.FloatTensor, ...]] = None + mask_decoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + +class EdgeTamPositionalEmbedding(nn.Module): + def __init__(self, config: EdgeTamPromptEncoderConfig): + super().__init__() + self.scale = config.scale + positional_embedding = self.scale * torch.randn((2, config.hidden_size // 2)) + self.register_buffer("positional_embedding", positional_embedding) + + def forward(self, input_coords, input_shape=None): + """Positionally encode points that are normalized to [0,1].""" + coordinates = input_coords.clone() + + if input_shape is not None: + coordinates[:, :, :, 0] = coordinates[:, :, :, 0] / input_shape[1] + coordinates[:, :, :, 1] = coordinates[:, :, :, 1] / input_shape[0] + coordinates.to(torch.float32) + + # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape + coordinates = 2 * coordinates - 1 + coordinates = coordinates.to(self.positional_embedding.dtype) + coordinates = coordinates @ self.positional_embedding + coordinates = 2 * np.pi * coordinates + # outputs d_1 x ... x d_n x channel shape + return torch.cat([torch.sin(coordinates), torch.cos(coordinates)], dim=-1) + + +class EdgeTamMaskEmbedding(nn.Module): + def __init__(self, config: EdgeTamPromptEncoderConfig): + super().__init__() + self.mask_input_channels = config.mask_input_channels // 4 + self.activation = ACT2FN[config.hidden_act] + self.conv1 = nn.Conv2d(1, self.mask_input_channels, kernel_size=2, stride=2) + self.conv2 = nn.Conv2d(self.mask_input_channels, config.mask_input_channels, kernel_size=2, stride=2) + self.conv3 = nn.Conv2d(config.mask_input_channels, config.hidden_size, kernel_size=1) + self.layer_norm1 = EdgeTamLayerNorm( + self.mask_input_channels, eps=config.layer_norm_eps, data_format="channels_first" + ) + self.layer_norm2 = EdgeTamLayerNorm( + self.mask_input_channels * 4, eps=config.layer_norm_eps, data_format="channels_first" + ) + + def forward(self, masks): + hidden_states = self.conv1(masks) + hidden_states = self.layer_norm1(hidden_states) + hidden_states = self.activation(hidden_states) + + hidden_states = self.conv2(hidden_states) + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.activation(hidden_states) + dense_embeddings = self.conv3(hidden_states) + return dense_embeddings + + +class EdgeTamPromptEncoder(nn.Module): + def __init__(self, config: EdgeTamPromptEncoderConfig): + super().__init__() + self.shared_embedding = EdgeTamPositionalEmbedding(config) + self.mask_embed = EdgeTamMaskEmbedding(config) + self.no_mask_embed = nn.Embedding(1, config.hidden_size) + + self.image_embedding_size = (config.image_size // config.patch_size, config.image_size // config.patch_size) + self.mask_input_size = (4 * config.image_size // config.patch_size, 4 * config.image_size // config.patch_size) + self.input_image_size = config.image_size + + self.point_embed = nn.Embedding(config.num_point_embeddings, 
config.hidden_size) + self.hidden_size = config.hidden_size + self.not_a_point_embed = nn.Embedding(1, config.hidden_size) + + def _embed_points(self, points: torch.Tensor, labels: torch.Tensor, pad: bool) -> torch.Tensor: + """Embeds point prompts.""" + points = points + 0.5 # Shift to center of pixel + if pad: + points = torch.nn.functional.pad(points, (0, 0, 0, 1), mode="constant", value=0) + labels = torch.nn.functional.pad(labels, (0, 1), mode="constant", value=-1) + input_shape = (self.input_image_size, self.input_image_size) + point_embedding = self.shared_embedding(points, input_shape) + + # torch.where and expanding the labels tensor is required by the ONNX export + point_embedding = torch.where(labels[..., None] == -1, self.not_a_point_embed.weight, point_embedding) + + # This is required for the ONNX export. The dtype, device need to be explicitly + # specified as otherwise torch.onnx.export interprets as double + point_embedding = torch.where( + labels[..., None] != -10, + point_embedding, + torch.zeros_like(point_embedding), + ) + + # Add point embeddings for labels >= 0 + point_embedding = point_embedding + self.point_embed(labels.clamp(min=0)) * (labels >= 0).unsqueeze(-1) + + return point_embedding + + def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor: + """Embeds box prompts.""" + boxes += 0.5 # Shift to center of pixel + coords = boxes.view(*boxes.shape[:2], 2, 2) + # add padding point for consistency with the original implementation + coords = torch.nn.functional.pad(coords, (0, 0, 0, 1), mode="constant", value=0) + corner_embedding = self.shared_embedding(coords, (self.input_image_size, self.input_image_size)) + corner_embedding[:, :, 0, :] += self.point_embed.weight[2] + corner_embedding[:, :, 1, :] += self.point_embed.weight[3] + corner_embedding[:, :, 2, :] = self.not_a_point_embed.weight.expand_as(corner_embedding[:, :, 2, :]) + return corner_embedding + + def forward( + self, + input_points: Optional[tuple[torch.Tensor, torch.Tensor]], + input_labels: Optional[torch.Tensor], + input_boxes: Optional[torch.Tensor], + input_masks: Optional[torch.Tensor], + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Embeds different types of prompts, returning both sparse and dense embeddings. + + Args: + points (`torch.Tensor`, *optional*): + point coordinates and labels to embed. 
+ boxes (`torch.Tensor`, *optional*): + boxes to embed + masks (`torch.Tensor`, *optional*): + masks to embed + """ + sparse_embeddings = None + batch_size = 1 + if input_points is not None: + batch_size = input_points.shape[0] + if input_labels is None: + raise ValueError("If points are provided, labels must also be provided.") + point_embeddings = self._embed_points(input_points, input_labels, pad=(input_boxes is None)) + sparse_embeddings = point_embeddings + if input_boxes is not None: + batch_size = input_boxes.shape[0] + box_embeddings = self._embed_boxes(input_boxes) + if sparse_embeddings is None: + sparse_embeddings = box_embeddings + else: + sparse_embeddings = torch.cat([sparse_embeddings, box_embeddings], dim=2) + if input_masks is not None: + dense_embeddings = self.mask_embed(input_masks) + else: + dense_embeddings = self.no_mask_embed.weight.reshape(1, -1, 1, 1).expand( + batch_size, -1, self.image_embedding_size[0], self.image_embedding_size[1] + ) + + return sparse_embeddings, dense_embeddings + + +class EdgeTamTwoWayTransformer(nn.Module): + def __init__(self, config: EdgeTamMaskDecoderConfig): + super().__init__() + self.config = config + + self.num_hidden_layers = config.num_hidden_layers + self.layers = nn.ModuleList() + + for i in range(self.num_hidden_layers): + self.layers.append(EdgeTamTwoWayAttentionBlock(config, skip_first_layer_pe=(i == 0))) + + self.final_attn_token_to_image = EdgeTamAttention(config) + self.layer_norm_final_attn = nn.LayerNorm(config.hidden_size) + + def forward( + self, + point_embeddings: Tensor, + image_embeddings: Tensor, + image_positional_embeddings: Tensor, + attention_similarity: Tensor, + target_embedding=None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, BaseModelOutput]: + if image_embeddings is None: + raise ValueError("You have to specify an image_embedding") + + image_embeddings = image_embeddings.flatten(2).permute(0, 2, 1).unsqueeze(1) + image_positional_embeddings = image_positional_embeddings.flatten(2).permute(0, 2, 1).unsqueeze(1) + + # Prepare queries + queries = point_embeddings + keys = image_embeddings + + # Apply transformer blocks and final layernorm + for layer in self.layers: + if target_embedding is not None: + queries += target_embedding + + queries, keys, _ = layer( + queries=queries, + keys=keys, + query_point_embedding=point_embeddings, + key_point_embedding=image_positional_embeddings, + attention_similarity=attention_similarity, + **kwargs, + ) + # Apply the final attention layer from the points to the image + query = queries + point_embeddings + key = keys + image_positional_embeddings + + attn_out, _ = self.final_attn_token_to_image(query=query, key=key, value=keys) + + queries = queries + attn_out + queries = self.layer_norm_final_attn(queries) + return queries, keys + + +class EdgeTamMaskDecoder(nn.Module): + def __init__(self, config: EdgeTamMaskDecoderConfig): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + + self.num_multimask_outputs = config.num_multimask_outputs + self.num_mask_tokens = config.num_multimask_outputs + 1 + + self.iou_token = nn.Embedding(1, self.hidden_size) + self.mask_tokens = nn.Embedding(self.num_mask_tokens, self.hidden_size) + + self.transformer = EdgeTamTwoWayTransformer(config) + + # should we create a new class for this? 
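+        # Mask upscaling path: two stride-2 transposed convolutions upsample the decoder features by 4x overall,
+        # with a channels-first layer norm and GELU activation in between. The per-mask-token hypernetwork MLPs
+        # defined below produce the weights that are applied to this upscaled embedding to predict each mask.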
+ self.upscale_conv1 = nn.ConvTranspose2d(self.hidden_size, self.hidden_size // 4, kernel_size=2, stride=2) + self.upscale_conv2 = nn.ConvTranspose2d(self.hidden_size // 4, self.hidden_size // 8, kernel_size=2, stride=2) + self.upscale_layer_norm = EdgeTamLayerNorm(self.hidden_size // 4, data_format="channels_first") + self.activation = nn.GELU() + + mlps_list = [] + for _ in range(self.num_mask_tokens): + mlps_list += [EdgeTamFeedForward(self.hidden_size, self.hidden_size, self.hidden_size // 8, 3)] + self.output_hypernetworks_mlps = nn.ModuleList(mlps_list) + self.iou_prediction_head = EdgeTamFeedForward( + self.hidden_size, + config.iou_head_hidden_dim, + self.num_mask_tokens, + config.iou_head_depth, + sigmoid_output=True, + ) + + self.conv_s0 = nn.Conv2d(config.hidden_size, config.hidden_size // 8, kernel_size=1, stride=1) + self.conv_s1 = nn.Conv2d(config.hidden_size, config.hidden_size // 4, kernel_size=1, stride=1) + + self.obj_score_token = nn.Embedding(1, self.hidden_size) + self.pred_obj_score_head = EdgeTamFeedForward(self.hidden_size, self.hidden_size, 1, 3) + + self.dynamic_multimask_via_stability = config.dynamic_multimask_via_stability + self.dynamic_multimask_stability_delta = config.dynamic_multimask_stability_delta + self.dynamic_multimask_stability_thresh = config.dynamic_multimask_stability_thresh + + def forward( + self, + image_embeddings: torch.Tensor, + image_positional_embeddings: torch.Tensor, + sparse_prompt_embeddings: torch.Tensor, + dense_prompt_embeddings: torch.Tensor, + multimask_output: bool, + high_resolution_features: list[torch.Tensor], + attention_similarity: Optional[torch.Tensor] = None, + target_embedding: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Predict masks given image and prompt embeddings. + + Args: + image_embeddings (`torch.Tensor`): + The embeddings from the image encoder. + image_positional_embeddings (`torch.Tensor`): + Positional encoding with the shape of image_embeddings. + sparse_prompt_embeddings (`torch.Tensor`): + The embeddings of the points and boxes. + dense_prompt_embeddings (`torch.Tensor`): + The embeddings of the mask inputs. + multimask_output (`bool`): + Whether to return multiple masks or a single mask. + high_resolution_features (`list[torch.Tensor]`, *optional*): + The high-resolution features from the vision encoder. + attention_similarity (`torch.Tensor`, *optional*): + The attention similarity tensor. + target_embedding (`torch.Tensor`, *optional*): + The target embedding. 
+ """ + batch_size, num_channels, height, width = image_embeddings.shape + point_batch_size = sparse_prompt_embeddings.shape[1] + # Concatenate output tokens + output_tokens = torch.cat( + [ + self.obj_score_token.weight, + self.iou_token.weight, + self.mask_tokens.weight, + ], + dim=0, + ) + output_tokens = output_tokens.repeat(batch_size, point_batch_size, 1, 1) + + if sparse_prompt_embeddings.shape[0] != 0: + tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=2) + else: + tokens = output_tokens + point_embeddings = tokens.to(self.iou_token.weight.dtype) + + # Expand per-image data in batch direction to be per-mask + image_embeddings = image_embeddings + dense_prompt_embeddings + image_embeddings = image_embeddings.repeat_interleave(point_batch_size, dim=0) + image_positional_embeddings = image_positional_embeddings.repeat_interleave(point_batch_size, 0) + # Run the transformer + point_embeddings, image_embeddings = self.transformer( + point_embeddings=point_embeddings, + image_embeddings=image_embeddings, + image_positional_embeddings=image_positional_embeddings, + attention_similarity=attention_similarity, + target_embedding=target_embedding, + **kwargs, + ) + iou_token_out = point_embeddings[:, :, 1, :] + mask_tokens_out = point_embeddings[:, :, 2 : (2 + self.num_mask_tokens), :] + + # Upscale mask embeddings and predict masks using the mask tokens + image_embeddings = image_embeddings.transpose(2, 3).view( + batch_size * point_batch_size, num_channels, height, width + ) + + feat_s0, feat_s1 = high_resolution_features + feat_s0 = feat_s0.repeat_interleave(point_batch_size, dim=0) + feat_s1 = feat_s1.repeat_interleave(point_batch_size, dim=0) + upscaled_embedding = self.upscale_conv1(image_embeddings) + feat_s1 + upscaled_embedding = self.activation(self.upscale_layer_norm(upscaled_embedding)) + upscaled_embedding = self.activation(self.upscale_conv2(upscaled_embedding) + feat_s0) + + hyper_in_list: list[torch.Tensor] = [] + for i in range(self.num_mask_tokens): + current_mlp = self.output_hypernetworks_mlps[i] + hyper_in_list += [current_mlp(mask_tokens_out[:, :, i, :])] + hyper_in = torch.stack(hyper_in_list, dim=2) + + _, num_channels, height, width = upscaled_embedding.shape + upscaled_embedding = upscaled_embedding.view(batch_size, point_batch_size, num_channels, height * width) + masks = (hyper_in @ upscaled_embedding).view(batch_size, point_batch_size, -1, height, width) + + # Generate mask quality predictions + iou_pred = self.iou_prediction_head(iou_token_out) + object_score_logits = self.pred_obj_score_head(point_embeddings[:, :, 0, :]) + + # Select the correct mask or masks for output + if multimask_output: + mask_slice = slice(1, None) + masks = masks[:, :, mask_slice, :, :] + iou_pred = iou_pred[:, :, mask_slice] + elif self.dynamic_multimask_via_stability and not self.training: + mask_slice = slice(0, 1) + masks, iou_pred = self._dynamic_multimask_via_stability(masks, iou_pred) + else: + mask_slice = slice(0, 1) + masks = masks[:, :, mask_slice, :, :] + iou_pred = iou_pred[:, :, mask_slice] + + sam_tokens_out = mask_tokens_out[:, :, mask_slice] # [b, 3, c] shape + + return masks, iou_pred, sam_tokens_out, object_score_logits + + def _get_stability_scores(self, mask_logits): + """ + Compute stability scores of the mask logits based on the IoU between upper and + lower thresholds. 
+        """
+        mask_logits = mask_logits.flatten(-2)
+        stability_delta = self.dynamic_multimask_stability_delta
+        area_i = torch.sum(mask_logits > stability_delta, dim=-1).float()
+        area_u = torch.sum(mask_logits > -stability_delta, dim=-1).float()
+        stability_scores = torch.where(area_u > 0, area_i / area_u, 1.0)
+        return stability_scores
+
+    def _dynamic_multimask_via_stability(self, all_mask_logits, all_iou_scores):
+        """
+        When outputting a single mask, if the stability score from the current single-mask
+        output (based on output token 0) falls below a threshold, we instead select from
+        multi-mask outputs (based on output token 1~3) the mask with the highest predicted
+        IoU score. This is intended to ensure a valid mask for both clicking and tracking.
+        """
+        # The best mask from multimask output tokens (1~3)
+        multimask_logits = all_mask_logits[:, :, 1:, :, :]
+        multimask_iou_scores = all_iou_scores[:, :, 1:]
+        best_scores_inds = torch.argmax(multimask_iou_scores, dim=-1)  # [B, P]
+        best_scores_inds_expanded = best_scores_inds.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
+        best_scores_inds_expanded = best_scores_inds_expanded.expand(
+            -1, -1, 1, multimask_logits.size(-2), multimask_logits.size(-1)
+        )
+        best_multimask_logits = torch.gather(multimask_logits, 2, best_scores_inds_expanded)  # [B, P, 1, H, W]
+        best_multimask_iou_scores = torch.gather(multimask_iou_scores, 2, best_scores_inds.unsqueeze(-1))  # [B, P, 1]
+
+        # The mask from singlemask output token 0 and its stability score
+        singlemask_logits = all_mask_logits[:, :, 0:1, :, :]
+        singlemask_iou_scores = all_iou_scores[:, :, 0:1]
+        stability_scores = self._get_stability_scores(singlemask_logits)
+        is_stable = stability_scores >= self.dynamic_multimask_stability_thresh
+
+        # Dynamically fall back to best multimask output upon low stability scores.
+        mask_logits_out = torch.where(
+            is_stable[..., None, None].expand_as(singlemask_logits),
+            singlemask_logits,
+            best_multimask_logits,
+        )
+        iou_scores_out = torch.where(
+            is_stable.expand_as(singlemask_iou_scores),
+            singlemask_iou_scores,
+            best_multimask_iou_scores,
+        )
+        return mask_logits_out, iou_scores_out
+
+
+@auto_docstring(
+    custom_intro="""
+    EdgeTAM model for generating segmentation masks, given an input image and input points and labels, boxes, or
+    masks. EdgeTAM builds on the Segment Anything Model 2 (SAM 2) architecture.
+ """ +) +class EdgeTamModel(EdgeTamPreTrainedModel): + _tied_weights_keys = ["prompt_encoder.shared_embedding.positional_embedding"] + # need to be ignored, as it's a buffer and will not be correctly detected as tied weight + _keys_to_ignore_on_load_missing = ["prompt_encoder.shared_embedding.positional_embedding"] + _can_record_outputs = {"mask_decoder_attentions": OutputRecorder(EdgeTamTwoWayAttentionBlock, index=2)} + _keys_to_ignore_on_load_unexpected = [ + r"^memory_.*", + r"^mask_downsample.*", + r"spatial_perceiver.*", + r"^object_pointer_proj.*", + r"^temporal_positional_encoding_projection_layer.*", + "no_memory_positional_encoding", + "no_object_pointer", + "occlusion_spatial_embedding_parameter", + ] + + def __init__(self, config: EdgeTamConfig): + super().__init__(config) + self.shared_image_embedding = EdgeTamPositionalEmbedding(config.prompt_encoder_config) + self.vision_encoder = AutoModel.from_config(config.vision_config) + self.prompt_encoder = EdgeTamPromptEncoder(config.prompt_encoder_config) + # The module using it is not a PreTrainedModel subclass so we need this + config.mask_decoder_config._attn_implementation = config._attn_implementation + self.mask_decoder = EdgeTamMaskDecoder(config.mask_decoder_config) + + self.num_feature_levels = config.vision_config.num_feature_levels + self.backbone_feature_sizes = config.vision_config.backbone_feature_sizes + # a single token to indicate no memory embedding from previous frames + self.hidden_dim = config.vision_config.fpn_hidden_size + self.no_memory_embedding = torch.nn.Parameter(torch.zeros(1, 1, self.hidden_dim)) + + self.post_init() + + def _tie_weights(self): + self.prompt_encoder.shared_embedding.positional_embedding.data = ( + self.shared_image_embedding.positional_embedding.data + ) + + def get_image_wide_positional_embeddings(self) -> torch.Tensor: + size = self.prompt_encoder.image_embedding_size + target_device = self.shared_image_embedding.positional_embedding.device + target_dtype = self.shared_image_embedding.positional_embedding.dtype + grid = torch.ones(size, device=target_device, dtype=target_dtype) + y_embed = grid.cumsum(dim=0) - 0.5 + x_embed = grid.cumsum(dim=1) - 0.5 + y_embed = y_embed / size[0] + x_embed = x_embed / size[1] + + positional_embedding = self.shared_image_embedding(torch.stack([x_embed, y_embed], dim=-1)) + return positional_embedding.permute(2, 0, 1).unsqueeze(0) # channel x height x width + + @torch.no_grad() + def get_image_embeddings( + self, + pixel_values: torch.FloatTensor, + **kwargs: Unpack[TransformersKwargs], + ) -> list[torch.Tensor]: + r""" + Returns the image embeddings by passing the pixel values through the vision encoder. 
+ + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Input pixel values + """ + batch_size = pixel_values.shape[0] + feature_maps, _, _, _ = self.get_image_features(pixel_values, **kwargs) + + # add no memory embedding to the last feature map + feature_maps[-1] = feature_maps[-1] + self.no_memory_embedding + + # reshape feature maps to the same shape as the backbone feature sizes + image_embeddings = [ + feat.permute(1, 2, 0).view(batch_size, -1, *feat_size) + for feat, feat_size in zip(feature_maps, self.backbone_feature_sizes) + ] + + return image_embeddings + + @torch.no_grad() + def get_prompt_embeddings( + self, + input_points: Optional[torch.FloatTensor] = None, + input_labels: Optional[torch.LongTensor] = None, + input_boxes: Optional[torch.FloatTensor] = None, + input_masks: Optional[torch.LongTensor] = None, + ): + r""" + Returns the prompt embeddings by passing the input points, labels, boxes and masks through the prompt encoder. + + Args: + input_points (`torch.FloatTensor` of shape `(batch_size, point_batch_size, num_points_per_image, 2)`): + Optional input points for the prompt encoder. The padding of the point is automatically done by the + processor. `point_batch_size` refers to the number of masks that we want the model to predict per + point. The model will output `point_batch_size` times 3 masks in total. + input_labels (`torch.LongTensor` of shape `(batch_size, point_batch_size, num_points_per_image)`): + Optional input labels for the prompt encoder. The padding of the labels is automatically done by the + processor, or can be fed by the user. + input_boxes (`torch.FloatTensor` of shape `(batch_size, num_boxes_per_image, 4)`): + Optional input boxes for the prompt encoder. The padding of the boxes is automatically done by the + processor. users can also pass manually the input boxes. + input_masks (`torch.LongTensor` of shape `(batch_size, image_size, image_size)`): + Optional input masks for the prompt encoder. + """ + prompt_output = self.prompt_encoder( + input_points=input_points, + input_labels=input_labels, + input_boxes=input_boxes, + input_masks=input_masks, + ) + return prompt_output + + @check_model_inputs + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + input_points: Optional[torch.FloatTensor] = None, + input_labels: Optional[torch.LongTensor] = None, + input_boxes: Optional[torch.FloatTensor] = None, + input_masks: Optional[torch.LongTensor] = None, + image_embeddings: Optional[torch.FloatTensor] = None, + multimask_output: bool = True, + attention_similarity: Optional[torch.FloatTensor] = None, + target_embedding: Optional[torch.FloatTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> EdgeTamImageSegmentationOutput: + r""" + input_points (`torch.FloatTensor` of shape `(batch_size, num_points, 2)`): + Input 2D spatial points, this is used by the prompt encoder to encode the prompt. Generally yields to much + better results. The points can be obtained by passing a list of list of list to the processor that will + create corresponding `torch` tensors of dimension 4. The first dimension is the image batch size, the + second dimension is the point batch size (i.e. how many segmentation masks do we want the model to predict + per input point), the third dimension is the number of points per segmentation mask (it is possible to pass + multiple points for a single mask), and the last dimension is the x (vertical) and y (horizontal) + coordinates of the point. 
If a different number of points is passed either for each image, or for each + mask, the processor will create "PAD" points that will correspond to the (0, 0) coordinate, and the + computation of the embedding will be skipped for these points using the labels. + input_labels (`torch.LongTensor` of shape `(batch_size, point_batch_size, num_points)`): + Input labels for the points, this is used by the prompt encoder to encode the prompt. According to the + official implementation, there are 3 types of labels + + - `1`: the point is a point that contains the object of interest + - `0`: the point is a point that does not contain the object of interest + - `-1`: the point corresponds to the background + + We added the label: + + - `-10`: the point is a padding point, thus should be ignored by the prompt encoder + + The padding labels should be automatically done by the processor. + input_boxes (`torch.FloatTensor` of shape `(batch_size, num_boxes, 4)`): + Input boxes for the points, this is used by the prompt encoder to encode the prompt. Generally yields to + much better generated masks. The boxes can be obtained by passing a list of list of list to the processor, + that will generate a `torch` tensor, with each dimension corresponding respectively to the image batch + size, the number of boxes per image and the coordinates of the top left and bottom right point of the box. + In the order (`x1`, `y1`, `x2`, `y2`): + + - `x1`: the x coordinate of the top left point of the input box + - `y1`: the y coordinate of the top left point of the input box + - `x2`: the x coordinate of the bottom right point of the input box + - `y2`: the y coordinate of the bottom right point of the input box + input_masks (`torch.FloatTensor` of shape `(batch_size, image_size, image_size)`): + SAM model also accepts segmentation masks as input. The mask will be embedded by the prompt encoder to + generate a corresponding embedding, that will be fed later on to the mask decoder. These masks needs to be + manually fed by the user, and they need to be of shape (`batch_size`, `image_size`, `image_size`). + image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_channels, window_size, window_size)`): + Image embeddings, this is used by the mask decoder to generate masks and iou scores. For more memory + efficient computation, users can first retrieve the image embeddings using the `get_image_embeddings` + method, and then feed them to the `forward` method instead of feeding the `pixel_values`. + multimask_output (`bool`, *optional*): + In the original implementation and paper, the model always outputs 3 masks per image (or per point / per + bounding box if relevant). However, it is possible to just output a single mask, that corresponds to the + "best" mask, by specifying `multimask_output=False`. + attention_similarity (`torch.FloatTensor`, *optional*): + Attention similarity tensor, to be provided to the mask decoder for target-guided attention in case the + model is used for personalization as introduced in [PerSAM](https://huggingface.co/papers/2305.03048). + target_embedding (`torch.FloatTensor`, *optional*): + Embedding of the target concept, to be provided to the mask decoder for target-semantic prompting in case + the model is used for personalization as introduced in [PerSAM](https://huggingface.co/papers/2305.03048). 
+ + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoModel, AutoProcessor + + >>> model = AutoModel.from_pretrained("danelcsb/edgetam.1_hiera_tiny") + >>> processor = AutoProcessor.from_pretrained("danelcsb/edgetam.1_hiera_tiny") + + >>> img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-car.png" + >>> raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB") + >>> input_points = [[[400, 650]]] # 2D location of a window on the car + >>> inputs = processor(images=raw_image, input_points=input_points, return_tensors="pt") + + >>> # Get segmentation mask + >>> outputs = model(**inputs) + + >>> # Postprocess masks + >>> masks = processor.post_process_masks( + ... outputs.pred_masks, inputs["original_sizes"], inputs["reshaped_input_sizes"] + ... ) + ``` + """ + if not ((pixel_values is None) ^ (image_embeddings is None)): + raise ValueError("Exactly one of pixel_values or image_embeddings must be provided.") + if input_points is not None and input_boxes is not None: + if input_points.shape[1] != input_boxes.shape[1]: + raise ValueError( + f"You should provide as many bounding boxes as input points per box. Got {input_points.shape[1]} and {input_boxes.shape[1]}." + ) + + image_positional_embeddings = self.get_image_wide_positional_embeddings() + # repeat with batch size + batch_size = pixel_values.shape[0] if pixel_values is not None else image_embeddings[-1].shape[0] + image_positional_embeddings = image_positional_embeddings.repeat(batch_size, 1, 1, 1) + + vision_attentions = None + vision_hidden_states = None + + if pixel_values is not None: + feature_maps, _, vision_hidden_states, vision_attentions = self.get_image_features( + pixel_values, + **kwargs, + ) + + # add no memory embedding to the last feature map + feature_maps[-1] = feature_maps[-1] + self.no_memory_embedding + + # reshape feature maps to the same shape as the backbone feature sizes + image_embeddings = [ + feat.permute(1, 2, 0).view(batch_size, -1, *feat_size) + for feat, feat_size in zip(feature_maps, self.backbone_feature_sizes) + ] + + if input_points is not None and input_labels is None: + input_labels = torch.ones_like(input_points[:, :, :, 0], dtype=torch.int, device=input_points.device) + + if input_points is None and input_boxes is None: + # If no points are provide, pad with an empty point (with label -1) + input_points = torch.zeros( + batch_size, 1, 1, 2, dtype=image_embeddings[-1].dtype, device=image_embeddings[-1].device + ) + input_labels = -torch.ones(batch_size, 1, 1, dtype=torch.int32, device=image_embeddings[-1].device) + + if input_masks is not None: + # If mask_inputs is provided, downsize it into low-res mask input if needed + # and feed it as a dense mask prompt into the SAM mask encoder + if input_masks.shape[-2:] != self.prompt_encoder.mask_input_size: + input_masks = F.interpolate( + input_masks.float(), + size=self.prompt_encoder.mask_input_size, + align_corners=False, + mode="bilinear", + antialias=True, # use antialias for downsampling + ).to(input_masks.dtype) + + sparse_embeddings, dense_embeddings = self.prompt_encoder( + input_points=input_points, + input_labels=input_labels, + input_boxes=input_boxes, + input_masks=input_masks, + ) + low_res_multimasks, iou_scores, _, object_score_logits = self.mask_decoder( + image_embeddings=image_embeddings[-1], + image_positional_embeddings=image_positional_embeddings, + sparse_prompt_embeddings=sparse_embeddings, + 
dense_prompt_embeddings=dense_embeddings, + multimask_output=multimask_output, + high_resolution_features=image_embeddings[:-1], + attention_similarity=attention_similarity, + target_embedding=target_embedding, + **kwargs, + ) + + return EdgeTamImageSegmentationOutput( + iou_scores=iou_scores, + pred_masks=low_res_multimasks, + object_score_logits=object_score_logits, + image_embeddings=image_embeddings, + vision_hidden_states=vision_hidden_states, + vision_attentions=vision_attentions, + ) + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[ + list[torch.Tensor], + list[torch.Tensor], + Optional[tuple[torch.FloatTensor, ...]], + Optional[tuple[torch.FloatTensor, ...]], + ]: + r""" + Extract and preprocess image features using the vision encoder. + + Args: + pixel_values (`torch.FloatTensor`): + Input pixel values of shape `(batch_size, num_channels, height, width)`. + + Returns: + `tuple`: A tuple containing: + - feature_maps (`list[torch.Tensor]`): List of feature maps from different levels. + - feature_maps_position_embeddings (`list[torch.Tensor]`): List of positional embeddings for each feature level. + - vision_hidden_states (`tuple[torch.FloatTensor]`, *optional*): Hidden states from the vision encoder. + - vision_attentions (`tuple[torch.FloatTensor]`, *optional*): Attention weights from the vision encoder. + """ + vision_outputs: EdgeTamVisionEncoderOutput = self.vision_encoder( + pixel_values, + **kwargs, + ) + + feature_maps = vision_outputs.fpn_hidden_states + feature_maps_position_embeddings = vision_outputs.fpn_position_encoding + + # precompute projected level 0 and level 1 features in SAM decoder + # to avoid running it again on every SAM click + feature_maps = list(feature_maps) + feature_maps[0] = self.mask_decoder.conv_s0(feature_maps[0]) + feature_maps[1] = self.mask_decoder.conv_s1(feature_maps[1]) + + # flatten NxCxHxW to HWxNxC + feature_maps = [feature_map.flatten(2).permute(2, 0, 1) for feature_map in feature_maps] + feature_maps_position_embeddings = [ + feature_map_position_embedding.flatten(2).permute(2, 0, 1) + for feature_map_position_embedding in feature_maps_position_embeddings + ] + + return feature_maps, feature_maps_position_embeddings, vision_outputs.hidden_states, vision_outputs.attentions + + +__all__ = ["EdgeTamModel", "EdgeTamVisionModel", "EdgeTamPreTrainedModel"] diff --git a/src/transformers/models/edgetam/modular_edgetam.py b/src/transformers/models/edgetam/modular_edgetam.py new file mode 100644 index 000000000000..e26d58d96b81 --- /dev/null +++ b/src/transformers/models/edgetam/modular_edgetam.py @@ -0,0 +1,261 @@ +# coding=utf-8 +# Copyright 2025 The Meta AI Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
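+# EdgeTAM is assembled from SAM2 building blocks via the modular system: the flat modeling and configuration
+# files are auto-generated from this modular file by the modular converter, so changes belong here.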
+"""PyTorch SAM 2 model.""" + +from typing import Optional, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint + +from transformers.models.sam2.configuration_sam2 import Sam2Config, Sam2MaskDecoderConfig, Sam2PromptEncoderConfig +from transformers.models.sam2.modeling_sam2 import ( + Sam2Attention, + Sam2FeedForward, + Sam2LayerNorm, + Sam2Model, + Sam2PreTrainedModel, + Sam2TwoWayAttentionBlock, + Sam2VisionEncoderOutput, + Sam2VisionModel, +) +from transformers.utils.generic import TransformersKwargs, check_model_inputs + +from ...configuration_utils import PretrainedConfig +from ...processing_utils import Unpack +from ...utils import ( + auto_docstring, +) +from ..auto import CONFIG_MAPPING, AutoConfig + + +# fix this in modular +if True: + from transformers.models.timm_wrapper.modeling_timm_wrapper import TimmWrapperModel + + +class EdgeTamVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`EdgeTamVisionModel`]. It is used to instantiate a SAM + vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration + defaults will yield a similar configuration to that of SAM 2.1 Hiera-tiny + [facebook/EdgeTAM](https://huggingface.co/facebook/EdgeTAM) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + backbone_config (`Union[dict, "PretrainedConfig"]`, *optional*): + Configuration for the vision backbone. This is used to instantiate the backbone using + `AutoModel.from_config`. + backbone_channel_list (`List[int]`, *optional*, defaults to `[384, 192, 96, 48]`): + The list of channel dimensions for the backbone. + backbone_feature_sizes (`List[List[int]]`, *optional*, defaults to `[[256, 256], [128, 128], [64, 64]]`): + The spatial sizes of the feature maps from the backbone. + fpn_hidden_size (`int`, *optional*, defaults to 256): + The hidden dimension of the FPN. + fpn_kernel_size (`int`, *optional*, defaults to 1): + The kernel size for the convolutions in the neck. + fpn_stride (`int`, *optional*, defaults to 1): + The stride for the convolutions in the neck. + fpn_padding (`int`, *optional*, defaults to 0): + The padding for the convolutions in the neck. + fpn_top_down_levels (`List[int]`, *optional*, defaults to `[2, 3]`): + The levels for the top-down FPN connections. + num_feature_levels (`int`, *optional*, defaults to 3): + The number of feature levels from the FPN to use. + hidden_act (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function in the neck. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon for the layer normalization. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
+ + """ + + base_config_key = "vision_config" + model_type = "edgetam_vision_model" + sub_configs = { + "backbone_config": AutoConfig, + } + + def __init__( + self, + backbone_config=None, + backbone_channel_list=None, + backbone_feature_sizes=None, + fpn_hidden_size=256, + fpn_kernel_size=1, + fpn_stride=1, + fpn_padding=0, + fpn_top_down_levels=None, + num_feature_levels=3, + hidden_act="gelu", + layer_norm_eps=1e-6, + initializer_range=0.02, + **kwargs, + ): + super().__init__(**kwargs) + + backbone_channel_list = [384, 192, 96, 48] if backbone_channel_list is None else backbone_channel_list + backbone_feature_sizes = ( + [[256, 256], [128, 128], [64, 64]] if backbone_feature_sizes is None else backbone_feature_sizes + ) + fpn_top_down_levels = [2, 3] if fpn_top_down_levels is None else fpn_top_down_levels + + if isinstance(backbone_config, dict): + backbone_config["model_type"] = backbone_config.get("model_type", "timm_wrapper") + backbone_config = CONFIG_MAPPING[backbone_config["model_type"]](**backbone_config) + elif isinstance(backbone_config, AutoConfig): + backbone_config = backbone_config + elif backbone_config is None: + backbone_config = AutoConfig.from_pretrained( + "timm/repvit_m1.dist_in1k", + model_args={"in_chans": 3, "features_only": True, "out_indices": [0, 1, 2, 3]}, + ) + + self.backbone_config = backbone_config + + # Neck + self.backbone_channel_list = backbone_channel_list + self.backbone_feature_sizes = backbone_feature_sizes + self.fpn_hidden_size = fpn_hidden_size + self.fpn_kernel_size = fpn_kernel_size + self.fpn_stride = fpn_stride + self.fpn_padding = fpn_padding + self.fpn_top_down_levels = fpn_top_down_levels + self.num_feature_levels = num_feature_levels + + self.hidden_act = hidden_act + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + + +class EdgeTamPromptEncoderConfig(Sam2PromptEncoderConfig): + pass + + +class EdgeTamMaskDecoderConfig(Sam2MaskDecoderConfig): + pass + + +class EdgeTamConfig(Sam2Config): + pass + + +class EdgeTamLayerNorm(Sam2LayerNorm): + pass + + +class EdgeTamVisionEncoderOutput(Sam2VisionEncoderOutput): + pass + + +class EdgeTamAttention(Sam2Attention): + pass + + +class EdgeTamTwoWayAttentionBlock(Sam2TwoWayAttentionBlock): + pass + + +class EdgeTamFeedForward(Sam2FeedForward): + pass + + +@auto_docstring +class EdgeTamPreTrainedModel(Sam2PreTrainedModel): + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, (nn.LayerNorm, EdgeTamLayerNorm)): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + if isinstance(module, EdgeTamModel): + if module.no_memory_embedding is not None: + module.no_memory_embedding.data.zero_() + + +@auto_docstring( + custom_intro=""" + The vision model from EdgeTAM without any head or projection on top. 
+ """ +) +class EdgeTamVisionModel(Sam2VisionModel): + config_class = EdgeTamVisionConfig + main_input_name = "pixel_values" + _can_record_outputs = {"hidden_states": TimmWrapperModel, "attentions": TimmWrapperModel} + + def get_input_embeddings(self): + raise NotImplementedError("Can't get input embeddings from timm wrapper model") + + @check_model_inputs + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, EdgeTamVisionEncoderOutput]: + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Forward through backbone + backbone_output = self.backbone(pixel_values) + intermediate_hidden_states = backbone_output.last_hidden_state + intermediate_hidden_states = [hidden_state.permute(0, 2, 3, 1) for hidden_state in intermediate_hidden_states] + + fpn_hidden_states, fpn_position_encoding = self.neck(intermediate_hidden_states) + # Select last `num_feature_levels` feature levels from FPN and reverse order to get features from high to low resolution + fpn_hidden_states = fpn_hidden_states[-self.num_feature_levels :][::-1] + fpn_position_encoding = fpn_position_encoding[-self.num_feature_levels :][::-1] + + return EdgeTamVisionEncoderOutput( + last_hidden_state=intermediate_hidden_states[-1], + fpn_hidden_states=fpn_hidden_states, + fpn_position_encoding=fpn_position_encoding, + ) + + +class EdgeTamModel(Sam2Model): + _keys_to_ignore_on_load_unexpected = [ + r"^memory_.*", + r"^mask_downsample.*", + r"spatial_perceiver.*", + r"^object_pointer_proj.*", + r"^temporal_positional_encoding_projection_layer.*", + "no_memory_positional_encoding", + "no_object_pointer", + "occlusion_spatial_embedding_parameter", + ] + + def get_input_embeddings(self): + raise NotImplementedError("Can't get input embeddings from timm wrapper model") + + +__all__ = [ + "EdgeTamModel", + "EdgeTamVisionModel", + "EdgeTamPreTrainedModel", + "EdgeTamConfig", + "EdgeTamVisionConfig", + "EdgeTamPromptEncoderConfig", + "EdgeTamMaskDecoderConfig", +] diff --git a/src/transformers/models/edgetam_video/__init__.py b/src/transformers/models/edgetam_video/__init__.py new file mode 100644 index 000000000000..669dd64ec304 --- /dev/null +++ b/src/transformers/models/edgetam_video/__init__.py @@ -0,0 +1,29 @@ +# coding=utf-8 +# Copyright 2025 the HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
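+# Lazy-module __init__: the configuration and modeling submodules are only imported when first accessed,
+# keeping `import transformers` cheap.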
+ +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_edgetam_video import * + from .modeling_edgetam_video import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/edgetam_video/configuration_edgetam_video.py b/src/transformers/models/edgetam_video/configuration_edgetam_video.py new file mode 100644 index 000000000000..954864397dcb --- /dev/null +++ b/src/transformers/models/edgetam_video/configuration_edgetam_video.py @@ -0,0 +1,435 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/edgetam_video/modular_edgetam_video.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_edgetam_video.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 the HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...configuration_utils import PretrainedConfig +from ..auto import CONFIG_MAPPING, AutoConfig + + +class EdgeTamVideoPromptEncoderConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`EdgeTamVideoPromptEncoder`]. The [`EdgeTamVideoPromptEncoder`] + module is used to encode the input 2D points and bounding boxes. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 256): + Dimensionality of the hidden states. + image_size (`int`, *optional*, defaults to 1024): + The expected output resolution of the image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + mask_input_channels (`int`, *optional*, defaults to 16): + The number of channels to be fed to the `MaskDecoder` module. + num_point_embeddings (`int`, *optional*, defaults to 4): + The number of point embeddings to be used. + hidden_act (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function in the encoder and pooler. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + scale (`float`, *optional*, defaults to 1): + The scale factor for the prompt encoder. 
+    """
+
+    base_config_key = "prompt_encoder_config"
+
+    def __init__(
+        self,
+        hidden_size=256,
+        image_size=1024,
+        patch_size=16,
+        mask_input_channels=16,
+        num_point_embeddings=4,
+        hidden_act="gelu",
+        layer_norm_eps=1e-6,
+        scale=1,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.hidden_size = hidden_size
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.mask_input_channels = mask_input_channels
+        self.num_point_embeddings = num_point_embeddings
+        self.hidden_act = hidden_act
+        self.layer_norm_eps = layer_norm_eps
+        self.scale = scale
+
+
+class EdgeTamVideoMaskDecoderConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`EdgeTamVideoMaskDecoder`]. It is used to
+    instantiate an EdgeTAM video mask decoder according to the specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 256):
+            Dimensionality of the hidden states.
+        hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function in the EdgeTAM video mask decoder.
+        mlp_dim (`int`, *optional*, defaults to 2048):
+            The dimension of the MLP in the two-way transformer.
+        num_hidden_layers (`int`, *optional*, defaults to 2):
+            The number of hidden layers in the two-way transformer.
+        num_attention_heads (`int`, *optional*, defaults to 8):
+            The number of attention heads in the two-way transformer.
+        attention_downsample_rate (`int`, *optional*, defaults to 2):
+            The downsample rate for the attention layers.
+        num_multimask_outputs (`int`, *optional*, defaults to 3):
+            The number of multimask outputs.
+        iou_head_depth (`int`, *optional*, defaults to 3):
+            The depth of the IoU head.
+        iou_head_hidden_dim (`int`, *optional*, defaults to 256):
+            The hidden dimension of the IoU head.
+        dynamic_multimask_via_stability (`bool`, *optional*, defaults to `True`):
+            Whether to use dynamic multimask via stability.
+        dynamic_multimask_stability_delta (`float`, *optional*, defaults to 0.05):
+            The stability delta for the dynamic multimask.
+        dynamic_multimask_stability_thresh (`float`, *optional*, defaults to 0.98):
+            The stability threshold for the dynamic multimask.
+
+    """
+
+    base_config_key = "mask_decoder_config"
+
+    def __init__(
+        self,
+        hidden_size=256,
+        hidden_act="gelu",
+        mlp_dim=2048,
+        num_hidden_layers=2,
+        num_attention_heads=8,
+        attention_downsample_rate=2,
+        num_multimask_outputs=3,
+        iou_head_depth=3,
+        iou_head_hidden_dim=256,
+        dynamic_multimask_via_stability=True,
+        dynamic_multimask_stability_delta=0.05,
+        dynamic_multimask_stability_thresh=0.98,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.num_multimask_outputs = num_multimask_outputs
+        self.hidden_act = hidden_act
+        self.iou_head_depth = iou_head_depth
+        self.iou_head_hidden_dim = iou_head_hidden_dim
+        self.dynamic_multimask_via_stability = dynamic_multimask_via_stability
+        self.dynamic_multimask_stability_delta = dynamic_multimask_stability_delta
+        self.dynamic_multimask_stability_thresh = dynamic_multimask_stability_thresh
+
+        # TwoWayTransformer configuration
+        self.num_hidden_layers = num_hidden_layers
+        self.hidden_size = hidden_size
+        self.num_attention_heads = num_attention_heads
+        self.mlp_dim = mlp_dim
+        self.attention_downsample_rate = attention_downsample_rate
+
+
+class EdgeTamVideoConfig(PretrainedConfig):
+    r"""
+    [`EdgeTamVideoConfig`] is the configuration class to store the configuration of a [`EdgeTamVideoModel`]. It is used to
+    instantiate an EdgeTAM video model according to the specified arguments, defining the memory attention, memory
+    encoder, and image encoder configs. Instantiating a configuration with the defaults will yield a similar
+    configuration to that of the EdgeTAM [facebook/EdgeTAM](https://huggingface.co/facebook/EdgeTAM) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vision_config (Union[`dict`, `EdgeTamVideoVisionConfig`], *optional*):
+            Dictionary of configuration options used to initialize [`EdgeTamVideoVisionConfig`].
+        prompt_encoder_config (Union[`dict`, `EdgeTamVideoPromptEncoderConfig`], *optional*):
+            Dictionary of configuration options used to initialize [`EdgeTamVideoPromptEncoderConfig`].
+        mask_decoder_config (Union[`dict`, `EdgeTamVideoMaskDecoderConfig`], *optional*):
+            Dictionary of configuration options used to initialize [`EdgeTamVideoMaskDecoderConfig`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            Standard deviation for parameter initialization.
+        num_maskmem (`int`, *optional*, defaults to 7):
+            The number of memory slots for the mask memory.
+        image_size (`int`, *optional*, defaults to 1024):
+            The size of the input images.
+        sigmoid_scale_for_mem_enc (`float`, *optional*, defaults to 20.0):
+            Scale factor for the sigmoid function in the memory encoder.
+        sigmoid_bias_for_mem_enc (`float`, *optional*, defaults to -10.0):
+            Bias for the sigmoid function in the memory encoder.
+        enable_occlusion_spatial_embedding (`bool`, *optional*, defaults to `True`):
+            Whether to enable spatial embedding for occlusions.
+        multimask_output_in_sam (`bool`, *optional*, defaults to `True`):
+            Whether to output multiple masks from the SAM head.
+        multimask_min_pt_num (`int`, *optional*, defaults to 0):
+            The minimum number of points to trigger multimask output.
+        multimask_max_pt_num (`int`, *optional*, defaults to 1):
+            The maximum number of points to trigger multimask output.
+        multimask_output_for_tracking (`bool`, *optional*, defaults to `True`):
+            Whether to use multimask output for tracking.
+ max_object_pointers_in_encoder (`int`, *optional*, defaults to 16): + The maximum number of object pointers in the encoder. + enable_temporal_pos_encoding_for_object_pointers (`bool`, *optional*, defaults to `True`): + Whether to enable temporal positional encoding for object pointers. + memory_attention_hidden_size (`int`, *optional*, defaults to 256): + Dimensionality of the memory attention hidden states. + memory_attention_num_layers (`int`, *optional*, defaults to 2): + The number of layers in the memory attention module. + memory_attention_num_attention_heads (`int`, *optional*, defaults to 1): + Number of attention heads for each attention layer in the memory attention. + memory_attention_downsample_rate (`int`, *optional*, defaults to 1): + The downsample rate for the attention layers. + memory_attention_mlp_hidden_size (`int`, *optional*, defaults to 2048): + The dimension of the feedforward network in the memory attention module. + memory_attention_mlp_hidden_act (`str`, *optional*, defaults to `"relu"`): + The non-linear activation function in the feedforward network in the memory attention module. + memory_attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout rate for the memory attention module. + memory_attention_rope_theta (`float`, *optional*, defaults to 10000): + The Rope theta parameter. + memory_attention_rope_feat_sizes (`Tuple[int, int]`, *optional*, defaults to `[64, 64]`): + The feature sizes for the Rope positional encoding. + memory_attention_rope_k_sizes (`List[int]`, *optional*, defaults to `[16, 16]`): + The key feature sizes for the RoPE positional encoding in memory attention. + memory_attention_rope_dropout (`float`, *optional*, defaults to 0.1): + The dropout rate for the Rope positional encoding. + perceiver_resampler_num_latents (`int`, *optional*, defaults to 256): + The number of 1D latent tokens in the perceiver resampler. + perceiver_resampler_num_latents_2d (`int`, *optional*, defaults to 256): + The number of 2D latent tokens in the perceiver resampler. + perceiver_resampler_hidden_size (`int`, *optional*, defaults to 64): + The hidden size of the perceiver resampler. + perceiver_resampler_mlp_intermediate_size (`int`, *optional*, defaults to 256): + The intermediate size of the feedforward network in the perceiver resampler. + perceiver_resampler_num_attention_heads (`int`, *optional*, defaults to 1): + The number of attention heads in the perceiver resampler. + perceiver_resampler_attention_head_dim (`int`, *optional*, defaults to 64): + The dimension of each attention head in the perceiver resampler. + perceiver_resampler_num_layers (`int`, *optional*, defaults to 2): + The number of layers in the perceiver resampler. + perceiver_resampler_hidden_dropout (`float`, *optional*, defaults to 0.0): + The dropout rate for the hidden layers in the perceiver resampler. + perceiver_resampler_attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout rate for the attention layers in the perceiver resampler. + memory_encoder_hidden_size (`int`, *optional*, defaults to 256): + Dimensionality of the memory encoder hidden states. + memory_encoder_output_channels (`int`, *optional*, defaults to 64): + The number of output channels for the memory encoder. + mask_downsampler_embed_dim (`int`, *optional*, defaults to 256): + The dimension of the mask downsampler embedding. + memory_fuser_intermediate_dim (`int`, *optional*, defaults to 1024): + The intermediate dimension of the memory fuser feedforward network. 
+ mask_downsampler_kernel_size (`int`, *optional*, defaults to 3): + The kernel size for the mask downsampler. + mask_downsampler_stride (`int`, *optional*, defaults to 2): + The stride for the mask downsampler. + mask_downsampler_padding (`int`, *optional*, defaults to 1): + The padding for the mask downsampler. + mask_downsampler_total_stride (`int`, *optional*, defaults to 16): + The total stride for the mask downsampler. + mask_downsampler_hidden_act (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function in the mask downsampler. + memory_fuser_num_layers (`int`, *optional*, defaults to 2): + The number of layers in the memory fuser. + memory_fuser_embed_dim (`int`, *optional*, defaults to 256): + The dimension of the memory fuser embedding. + memory_fuser_kernel_size (`int`, *optional*, defaults to 7): + The kernel size for the memory fuser. + memory_fuser_padding (`int`, *optional*, defaults to 3): + The padding for the memory fuser. + memory_fuser_layer_scale_init_value (`float`, *optional*, defaults to 1e-06): + The initial value for the layer scale in the memory fuser. + memory_fuser_hidden_act (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function in the memory fuser. + + Example: + + ```python + >>> from transformers import ( + ... EdgeTamVisionConfig, + ... EdgeTamVideoPromptEncoderConfig, + ... EdgeTamVideoMaskDecoderConfig, + ... EdgeTamVideoModel, + ... EdgeTamVideoConfig, + ... ) + + >>> # Initializing a EdgeTamVideoConfig with `"facebook/edgetam.1_hiera_tiny"` style configuration + >>> configuration = EdgeTamVideoConfig() + + >>> # Initializing a EdgeTamVideoModel (with random weights) from the `"facebook/edgetam.1_hiera_tiny"` style configuration + >>> model = EdgeTamVideoModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a EdgeTamConfig from a EdgeTamVisionConfig, EdgeTamPromptEncoderConfig, and EdgeTamMaskDecoderConfig + + >>> # Initializing EDGETAM vision encoder, memory attention, and memory encoder configurations + >>> vision_config = EdgeTamVisionConfig() + >>> prompt_encoder_config = EdgeTamVideoPromptEncoderConfig() + >>> mask_decoder_config = EdgeTamVideoMaskDecoderConfig() + + >>> config = EdgeTamVideoConfig(vision_config, prompt_encoder_config, mask_decoder_config) + ```""" + + model_type = "edgetam_video" + sub_configs = { + "vision_config": AutoConfig, + "prompt_encoder_config": EdgeTamVideoPromptEncoderConfig, + "mask_decoder_config": EdgeTamVideoMaskDecoderConfig, + } + + def __init__( + self, + vision_config=None, + prompt_encoder_config=None, + mask_decoder_config=None, + initializer_range=0.02, + num_maskmem=7, + image_size=1024, + sigmoid_scale_for_mem_enc=20.0, + sigmoid_bias_for_mem_enc=-10.0, + enable_occlusion_spatial_embedding=True, + multimask_output_in_sam=True, + multimask_min_pt_num=0, + multimask_max_pt_num=1, + multimask_output_for_tracking=True, + max_object_pointers_in_encoder=16, + enable_temporal_pos_encoding_for_object_pointers=True, + # memory attention + memory_attention_hidden_size=256, + memory_attention_num_layers=2, + memory_attention_num_attention_heads=1, + memory_attention_downsample_rate=1, + memory_attention_mlp_hidden_size=2048, + memory_attention_mlp_hidden_act="relu", + memory_attention_dropout=0.1, + memory_attention_rope_theta=10000, + memory_attention_rope_feat_sizes=None, + memory_attention_rope_k_sizes=None, + memory_attention_rope_dropout=0.1, + # spatial perceiver resampler + 
perceiver_resampler_num_latents=256, + perceiver_resampler_num_latents_2d=256, + perceiver_resampler_hidden_size=64, + perceiver_resampler_mlp_intermediate_size=256, + perceiver_resampler_num_attention_heads=1, + perceiver_resampler_attention_head_dim=64, + perceiver_resampler_num_layers=2, + perceiver_resampler_hidden_dropout=0.0, + perceiver_resampler_attention_dropout=0.0, + # memory encoder + memory_encoder_hidden_size=256, + memory_encoder_output_channels=64, + mask_downsampler_embed_dim=256, + memory_fuser_intermediate_dim=1024, + mask_downsampler_kernel_size=3, + mask_downsampler_stride=2, + mask_downsampler_padding=1, + mask_downsampler_total_stride=16, + mask_downsampler_hidden_act="gelu", + memory_fuser_num_layers=2, + memory_fuser_embed_dim=256, + memory_fuser_kernel_size=7, + memory_fuser_padding=3, + memory_fuser_layer_scale_init_value=1e-6, + memory_fuser_hidden_act="gelu", + **kwargs, + ): + super().__init__(**kwargs) + vision_config = vision_config if vision_config is not None else {} + prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {} + mask_decoder_config = mask_decoder_config if mask_decoder_config is not None else {} + memory_attention_rope_feat_sizes = ( + [64, 64] if memory_attention_rope_feat_sizes is None else memory_attention_rope_feat_sizes + ) + memory_attention_rope_k_sizes = ( + [16, 16] if memory_attention_rope_k_sizes is None else memory_attention_rope_k_sizes + ) + + if isinstance(vision_config, dict): + vision_config["model_type"] = vision_config.get("model_type", "sam2_vision_model") + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + if isinstance(prompt_encoder_config, EdgeTamVideoPromptEncoderConfig): + prompt_encoder_config = prompt_encoder_config.to_dict() + if isinstance(mask_decoder_config, EdgeTamVideoMaskDecoderConfig): + mask_decoder_config = mask_decoder_config.to_dict() + + self.vision_config = vision_config + self.prompt_encoder_config = EdgeTamVideoPromptEncoderConfig(**prompt_encoder_config) + self.mask_decoder_config = EdgeTamVideoMaskDecoderConfig(**mask_decoder_config) + + self.initializer_range = initializer_range + self.num_maskmem = num_maskmem # default 1 input frame + 6 previous frames + self.image_size = image_size + self.sigmoid_scale_for_mem_enc = sigmoid_scale_for_mem_enc # scale factor for mask sigmoid prob + self.sigmoid_bias_for_mem_enc = sigmoid_bias_for_mem_enc # bias factor for mask sigmoid prob + self.enable_occlusion_spatial_embedding = enable_occlusion_spatial_embedding + self.multimask_output_in_sam = multimask_output_in_sam + self.multimask_min_pt_num = multimask_min_pt_num + self.multimask_max_pt_num = multimask_max_pt_num + self.multimask_output_for_tracking = multimask_output_for_tracking + self.max_object_pointers_in_encoder = max_object_pointers_in_encoder + self.enable_temporal_pos_encoding_for_object_pointers = enable_temporal_pos_encoding_for_object_pointers + + # memory attention + self.memory_attention_hidden_size = memory_attention_hidden_size + self.memory_attention_num_layers = memory_attention_num_layers + self.memory_attention_num_attention_heads = memory_attention_num_attention_heads + self.memory_attention_downsample_rate = memory_attention_downsample_rate + self.memory_attention_mlp_hidden_size = memory_attention_mlp_hidden_size + self.memory_attention_mlp_hidden_act = memory_attention_mlp_hidden_act + self.memory_attention_dropout = memory_attention_dropout + self.memory_attention_rope_theta = memory_attention_rope_theta + 
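+        # The RoPE cos/sin tables used by the memory attention are precomputed for fixed feature-map sizes:
+        # `memory_attention_rope_feat_sizes` for the query grid and `memory_attention_rope_k_sizes` for the
+        # downsampled memory-key grid.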
self.memory_attention_rope_feat_sizes = memory_attention_rope_feat_sizes + self.memory_attention_rope_k_sizes = memory_attention_rope_k_sizes + self.memory_attention_rope_dropout = memory_attention_rope_dropout + + # spatial perceiver resampler + self.perceiver_resampler_num_latents = perceiver_resampler_num_latents + self.perceiver_resampler_num_latents_2d = perceiver_resampler_num_latents_2d + self.perceiver_resampler_hidden_size = perceiver_resampler_hidden_size + self.perceiver_resampler_mlp_intermediate_size = perceiver_resampler_mlp_intermediate_size + self.perceiver_resampler_attention_head_dim = perceiver_resampler_attention_head_dim + self.perceiver_resampler_num_attention_heads = perceiver_resampler_num_attention_heads + self.perceiver_resampler_num_layers = perceiver_resampler_num_layers + self.perceiver_resampler_hidden_dropout = perceiver_resampler_hidden_dropout + self.perceiver_resampler_attention_dropout = perceiver_resampler_attention_dropout + + # memory encoder + self.memory_encoder_hidden_size = memory_encoder_hidden_size + self.memory_encoder_output_channels = memory_encoder_output_channels + self.mask_downsampler_embed_dim = mask_downsampler_embed_dim + self.mask_downsampler_kernel_size = mask_downsampler_kernel_size + self.mask_downsampler_stride = mask_downsampler_stride + self.mask_downsampler_padding = mask_downsampler_padding + self.mask_downsampler_total_stride = mask_downsampler_total_stride + self.mask_downsampler_hidden_act = mask_downsampler_hidden_act + self.memory_fuser_num_layers = memory_fuser_num_layers + self.memory_fuser_embed_dim = memory_fuser_embed_dim + self.memory_fuser_intermediate_dim = memory_fuser_intermediate_dim + self.memory_fuser_kernel_size = memory_fuser_kernel_size + self.memory_fuser_padding = memory_fuser_padding + self.memory_fuser_layer_scale_init_value = memory_fuser_layer_scale_init_value + self.memory_fuser_hidden_act = memory_fuser_hidden_act + + +__all__ = ["EdgeTamVideoMaskDecoderConfig", "EdgeTamVideoPromptEncoderConfig", "EdgeTamVideoConfig"] diff --git a/src/transformers/models/edgetam_video/modeling_edgetam_video.py b/src/transformers/models/edgetam_video/modeling_edgetam_video.py new file mode 100644 index 000000000000..3ba7ab4ebf2f --- /dev/null +++ b/src/transformers/models/edgetam_video/modeling_edgetam_video.py @@ -0,0 +1,3062 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/edgetam_video/modular_edgetam_video.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_edgetam_video.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 the HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
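To make the relationship between the flat memory-attention kwargs and the nested sub-configs above concrete, here is a minimal usage sketch. It assumes the configuration classes defined in `configuration_edgetam_video.py` above are importable (for example from `transformers` once this model is registered); the override values are illustrative only.

```python
from transformers import EdgeTamVideoConfig, EdgeTamVideoMaskDecoderConfig

# Sub-configs may be passed as config objects or plain dicts; __init__ normalizes both.
config = EdgeTamVideoConfig(
    mask_decoder_config=EdgeTamVideoMaskDecoderConfig(iou_head_depth=3),
    memory_attention_num_layers=2,              # memory-attention options are flat kwargs
    memory_attention_rope_feat_sizes=[64, 64],  # falls back to [64, 64] when left as None
)

print(config.mask_decoder_config.iou_head_depth)  # 3
print(config.memory_attention_rope_k_sizes)       # [16, 16] (default)
```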
+ +import math +from collections import OrderedDict +from collections.abc import Iterator +from dataclasses import dataclass +from typing import Any, Callable, Optional, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor +from tqdm import tqdm + +from transformers.utils.generic import OutputRecorder + +from ...activations import ACT2FN +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutput +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...pytorch_utils import compile_compatible_method_lru_cache +from ...utils import ModelOutput, auto_docstring +from ...utils.generic import TransformersKwargs +from ..auto import AutoModel +from .configuration_edgetam_video import ( + EdgeTamVideoConfig, + EdgeTamVideoMaskDecoderConfig, + EdgeTamVideoPromptEncoderConfig, +) + + +class EdgeTamVideoLayerNorm(nn.LayerNorm): + r"""LayerNorm that supports two data formats: channels_last (default) or channels_first. + The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height, + width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width). + """ + + def __init__(self, normalized_shape, *, eps=1e-6, data_format="channels_last", **kwargs): + super().__init__(normalized_shape, eps=eps, **kwargs) + if data_format not in ["channels_last", "channels_first"]: + raise NotImplementedError(f"Unsupported data format: {data_format}") + self.data_format = data_format + + def forward(self, features: torch.Tensor) -> torch.Tensor: + """ + Args: + features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels) + """ + if self.data_format == "channels_first": + features = features.permute(0, 2, 3, 1) + features = super().forward(features) + features = features.permute(0, 3, 1, 2) + else: + features = super().forward(features) + return features + + +# Lightly adapted from ConvNext (https://github.com/facebookresearch/ConvNeXt) +class EdgeTamVideoMemoryFuserCXBlock(GradientCheckpointingLayer): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.depthwise_conv = nn.Conv2d( + config.memory_fuser_embed_dim, + config.memory_fuser_embed_dim, + kernel_size=config.memory_fuser_kernel_size, + padding=config.memory_fuser_padding, + groups=config.memory_fuser_embed_dim, + ) # depthwise conv + self.layer_norm = EdgeTamVideoLayerNorm(config.memory_fuser_embed_dim, eps=1e-6, data_format="channels_first") + self.activation = ACT2FN[config.memory_fuser_hidden_act] + self.pointwise_conv1 = nn.Linear( + config.memory_fuser_embed_dim, config.memory_fuser_intermediate_dim + ) # pointwise/1x1 convs, implemented with linear layers + self.pointwise_conv2 = nn.Linear(config.memory_fuser_intermediate_dim, config.memory_fuser_embed_dim) + self.scale = nn.Parameter( + config.memory_fuser_layer_scale_init_value * torch.ones(config.memory_fuser_embed_dim), + requires_grad=True, + ) + + def forward(self, hidden_states): + input = hidden_states + hidden_states = self.depthwise_conv(hidden_states) + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) + hidden_states = self.pointwise_conv1(hidden_states) + hidden_states = self.activation(hidden_states) + 
hidden_states = self.pointwise_conv2(hidden_states) + hidden_states = self.scale * hidden_states + hidden_states = hidden_states.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) + + hidden_states = input + hidden_states + return hidden_states + + +@dataclass +@auto_docstring(custom_intro="Base class for the vision encoder's outputs.") +class EdgeTamVideoVisionEncoderOutput(ModelOutput): + r""" + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, height, width, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + fpn_hidden_states (`tuple(torch.FloatTensor)`): + Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape + `(batch_size, hidden_size, height, width)`. Feature maps from the Feature Pyramid Network neck. + fpn_position_encoding (`tuple(torch.FloatTensor)`): + Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape + `(batch_size, hidden_size, height, width)`. Positional encodings corresponding to the `fpn_hidden_states`. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each stage) of shape `(batch_size, height, width, hidden_size)`. Hidden-states of the + model at the output of each stage. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + fpn_hidden_states: Optional[torch.FloatTensor] = None + fpn_position_encoding: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + +class EdgeTamVideoVisionRotaryEmbedding(nn.Module): + """ + Vision Rotary Position Embedding for SAM2, following transformers library standards. + Supports 2D (axial) rotary embeddings for spatial dimensions. 
+ """ + + def __init__(self, config: EdgeTamVideoConfig, end_x: Optional[int] = None, end_y: Optional[int] = None): + super().__init__() + dim = config.memory_attention_hidden_size // ( + config.memory_attention_downsample_rate * config.memory_attention_num_attention_heads + ) + # Ensure even dimension for proper axial splitting + if dim % 4 != 0: + raise ValueError("Dimension must be divisible by 4 for axial RoPE") + end_x, end_y = config.memory_attention_rope_feat_sizes if end_x is None else (end_x, end_y) + freqs = 1.0 / (config.memory_attention_rope_theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim)) + + # Generate 2D position indices for axial rotary embedding + flattened_indices = torch.arange(end_x * end_y, dtype=torch.long) + x_positions = flattened_indices % end_x + y_positions = torch.div(flattened_indices, end_x, rounding_mode="floor") + freqs_x = torch.outer(x_positions, freqs).float() + freqs_y = torch.outer(y_positions, freqs).float() + inv_freq = torch.cat([freqs_x, freqs_y], dim=-1) + inv_freq = inv_freq.repeat_interleave(2, dim=-1) + # directly register the cos and sin embeddings as we have a fixed feature shape + self.register_buffer("rope_embeddings_cos", inv_freq.cos(), persistent=False) + self.register_buffer("rope_embeddings_sin", inv_freq.sin(), persistent=False) + + @torch.no_grad() + def forward(self) -> tuple[torch.Tensor, torch.Tensor]: + # As the feature map size is fixed, we can just return the pre-computed embeddings. + return self.rope_embeddings_cos, self.rope_embeddings_sin + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class EdgeTamVideoAttention(nn.Module): + """ + EDGETAM_VIDEO's attention layer that allows for downscaling the size of the embedding after projection to queries, keys, and + values. 
+ """ + + def __init__(self, config, downsample_rate=None): + super().__init__() + downsample_rate = config.attention_downsample_rate if downsample_rate is None else downsample_rate + self.config = config + self.hidden_size = config.hidden_size + self.internal_dim = config.hidden_size // downsample_rate + self.num_attention_heads = config.num_attention_heads + self.head_dim = self.internal_dim // config.num_attention_heads + self.scaling = self.head_dim**-0.5 + self.is_causal = False + + self.q_proj = nn.Linear(self.hidden_size, self.internal_dim) + self.k_proj = nn.Linear(self.hidden_size, self.internal_dim) + self.v_proj = nn.Linear(self.hidden_size, self.internal_dim) + self.o_proj = nn.Linear(self.internal_dim, self.hidden_size) + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_similarity: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor, torch.Tensor]: + # Input projections + batch_size, point_batch_size = query.shape[:2] + new_shape = (batch_size * point_batch_size, -1, self.num_attention_heads, self.head_dim) + + query = self.q_proj(query).view(*new_shape).transpose(1, 2) + key = self.k_proj(key).view(*new_shape).transpose(1, 2) + value = self.v_proj(value).view(*new_shape).transpose(1, 2) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query, + key, + value, + attention_mask=attention_similarity, + dropout=0.0, + scaling=self.scaling, + is_causal=self.is_causal, + **kwargs, + ) + + attn_output = attn_output.reshape( + batch_size, point_batch_size, -1, self.num_attention_heads * self.head_dim + ).contiguous() + attn_output = self.o_proj(attn_output) + + return attn_output, attn_weights + + +def rotate_pairwise(x): + """ + pairwise rotation of the hidden dims of the input. Differerent from Llama Half-Tensor Rotation. + + This is an optimized version of the following more explicit implementation: + ```python + x_rotated = torch.zeros_like(x, dtype=x.dtype, device=x.device) + x_rotated[..., ::2] = -x[..., 1::2] + x_rotated[..., 1::2] = x[..., ::2] + return x_rotated + ``` + """ + x = x.view(*x.shape[:-1], -1, 2) + x1, x2 = x.unbind(dim=-1) + x = torch.stack((-x2, x1), dim=-1) + return x.flatten(start_dim=-2) + + +def apply_rotary_pos_emb_2d_self_attn( + q: torch.Tensor, + k: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Apply rotary position embedding to query and key tensors for self-attention. 
+ + Args: + q: Query tensor of shape (..., seq_len, head_dim) + k: Key tensor of shape (..., seq_len, head_dim) + cos: Cosine position embedding of shape (seq_len, head_dim) + sin: Sine position embedding of shape (seq_len, head_dim) + + Returns: + Rotated (q, k) tensors + """ + # Apply RoPE to queries + q_embed = q.float() # force upscale to float32 as in the original implementation + q_embed = (q_embed * cos) + (rotate_pairwise(q_embed) * sin) + + # Apply RoPE to keys (same embeddings as queries for self-attention) + k_embed = k.float() # force upscale to float32 as in the original implementation + k_embed = (k_embed * cos) + (rotate_pairwise(k_embed) * sin) + + return q_embed.type_as(q), k_embed.type_as(k) + + +class EdgeTamVideoRoPESelfAttention(nn.Module): + """Self-attention with rotary position encoding.""" + + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.config = config + self.hidden_size = config.memory_attention_hidden_size + self.internal_dim = self.hidden_size // config.memory_attention_downsample_rate + self.num_attention_heads = config.memory_attention_num_attention_heads + self.head_dim = self.internal_dim // config.memory_attention_num_attention_heads + self.scaling = self.head_dim**-0.5 + self.is_causal = False + + self.q_proj = nn.Linear(self.hidden_size, self.internal_dim) + self.k_proj = nn.Linear(self.hidden_size, self.internal_dim) + self.v_proj = nn.Linear(self.hidden_size, self.internal_dim) + self.o_proj = nn.Linear(self.internal_dim, self.hidden_size) + self.dropout_p = config.memory_attention_rope_dropout + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Tensor: + # Input projections + batch_size, point_batch_size = query.shape[:2] + new_shape = (batch_size * point_batch_size, -1, self.num_attention_heads, self.head_dim) + + query = self.q_proj(query).view(*new_shape).transpose(1, 2) + key = self.k_proj(key).view(*new_shape).transpose(1, 2) + value = self.v_proj(value).view(*new_shape).transpose(1, 2) + + cos, sin = position_embeddings + # Apply rotary position encoding for self-attention + query, key = apply_rotary_pos_emb_2d_self_attn(query, key, cos=cos, sin=sin) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query, + key, + value, + attention_mask=None, + dropout=0.0 if not self.training else self.dropout_p, + scaling=self.scaling, + is_causal=self.is_causal, + **kwargs, + ) + attn_output = attn_output.reshape( + batch_size, point_batch_size, -1, self.num_attention_heads * self.head_dim + ).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +def apply_rotary_pos_emb_2d_cross_attn( + q: torch.Tensor, + k: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + cos_k: torch.Tensor, + sin_k: torch.Tensor, + num_k_exclude_rope: int = 0, + repeat_freqs_k: int = 1, +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Apply rotary position embedding to query and key tensors for cross-attention. 
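+    Keys are treated as `repeat_freqs_k` groups of memory tokens; within each group only the spatial tokens receive
+    RoPE (using `cos_k`/`sin_k`), while the trailing `num_k_exclude_rope` tokens (e.g. object pointers) are left
+    untouched.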
+ + Args: + q: Query tensor of shape (..., seq_len, head_dim) + k: Key tensor of shape (..., seq_len, head_dim) + cos: Cosine position embedding of shape (seq_len, head_dim) + sin: Sine position embedding of shape (seq_len, head_dim) + cos_k: Cosine position embedding for keys of shape (seq_len, head_dim) + sin_k: Sine position embedding for keys of shape (seq_len, head_dim) + num_k_exclude_rope: Number of tokens at end of k to exclude from RoPE (e.g., object pointer tokens) + repeat_freqs_k: Frequency repetition for keys in cross-attention (e.g., for spatial memory tokens) + + Returns: + Rotated (q, k) tensors + """ + # Apply RoPE to queries (always straightforward) + q_embed = q.float() + q_embed = (q_embed * cos) + (rotate_pairwise(q_embed) * sin) + + # Split keys: RoPE tokens and excluded tokens (e.g., object pointers) + num_total_k_tokens = k.shape[-2] + k_for_rope = k[..., : num_total_k_tokens - num_k_exclude_rope, :] + k_excluded = k[..., num_total_k_tokens - num_k_exclude_rope :, :] + + # Early return if no keys need RoPE + if k_for_rope.shape[-2] == 0: + return q_embed.type_as(q), k_excluded + + batch_size, num_heads, k_seq_len, channels_per_head = k_for_rope.shape + + # Handle temporal/spatial token structure for memory + # Keys have temporal + spatial structure, only spatial tokens get RoPE + tokens_per_group = k_seq_len // repeat_freqs_k + spatial_tokens = cos_k.shape[-2] + temporal_tokens = tokens_per_group - spatial_tokens + + # Reshape and separate temporal/spatial tokens + k_grouped = k_for_rope.view(batch_size, num_heads, repeat_freqs_k, tokens_per_group, channels_per_head) + k_temporal = k_grouped[..., :temporal_tokens, :].reshape(batch_size, num_heads, -1, channels_per_head) + k_spatial = k_grouped[..., temporal_tokens:, :].reshape(batch_size, num_heads, -1, channels_per_head) + + # Only apply RoPE to spatial tokens + k_rope_input = k_spatial + + # Prepare position embeddings for repeated groups + if repeat_freqs_k > 1: + cos_k = cos_k.repeat(1, 1, repeat_freqs_k, 1) + sin_k = sin_k.repeat(1, 1, repeat_freqs_k, 1) + + # Apply RoPE to spatial tokens + k_spatial_embed = k_rope_input.float() + k_spatial_embed = (k_spatial_embed * cos_k) + (rotate_pairwise(k_spatial_embed) * sin_k) + + # Reconstruct: temporal + spatial tokens back to original structure + k_spatial_reshaped = k_spatial_embed.view(batch_size, num_heads, repeat_freqs_k, -1, channels_per_head) + k_temporal_reshaped = k_temporal.view(batch_size, num_heads, repeat_freqs_k, -1, channels_per_head) + k_final = torch.cat([k_temporal_reshaped, k_spatial_reshaped], dim=3) + k_final = k_final.view(batch_size, num_heads, k_seq_len, channels_per_head) + + # Combine RoPE-processed keys with excluded tokens + k_embed = torch.cat([k_final.type_as(k), k_excluded], dim=-2) + return q_embed.type_as(q), k_embed + + +class EdgeTamVideoRoPECrossAttention(nn.Module): + """Cross-attention with rotary position encoding.""" + + def __init__(self, config: EdgeTamVideoConfig, kv_in_dim: int): + super().__init__() + self.config = config + self.hidden_size = config.memory_attention_hidden_size + self.internal_dim = self.hidden_size // config.memory_attention_downsample_rate + self.num_attention_heads = config.memory_attention_num_attention_heads + self.head_dim = self.internal_dim // config.memory_attention_num_attention_heads + self.scaling = self.head_dim**-0.5 + self.is_causal = False + + self.kv_in_dim = kv_in_dim + + self.q_proj = nn.Linear(self.hidden_size, self.internal_dim) + self.k_proj = nn.Linear(self.kv_in_dim, 
self.internal_dim)
+        self.v_proj = nn.Linear(self.kv_in_dim, self.internal_dim)
+        self.o_proj = nn.Linear(self.internal_dim, self.hidden_size)
+        self.dropout_p = config.memory_attention_rope_dropout
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        position_embeddings_k: tuple[torch.Tensor, torch.Tensor],
+        num_k_exclude_rope: int = 0,
+        rope_k_repeat: int = 0,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Tensor:
+        # Input projections
+        batch_size, point_batch_size = query.shape[:2]
+        new_shape = (batch_size * point_batch_size, -1, self.num_attention_heads, self.head_dim)
+
+        query = self.q_proj(query).view(*new_shape).transpose(1, 2)
+        key = self.k_proj(key).view(*new_shape).transpose(1, 2)
+        value = self.v_proj(value).view(*new_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        cos_k, sin_k = position_embeddings_k
+        # Apply rotary position encoding for cross-attention
+        query, key = apply_rotary_pos_emb_2d_cross_attn(
+            query,
+            key,
+            cos=cos,
+            sin=sin,
+            cos_k=cos_k,
+            sin_k=sin_k,
+            repeat_freqs_k=rope_k_repeat,
+            num_k_exclude_rope=num_k_exclude_rope,
+        )
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query,
+            key,
+            value,
+            attention_mask=None,
+            dropout=0.0 if not self.training else self.dropout_p,
+            scaling=self.scaling,
+            is_causal=self.is_causal,
+            **kwargs,
+        )
+        attn_output = attn_output.reshape(
+            batch_size, point_batch_size, -1, self.num_attention_heads * self.head_dim
+        ).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class EdgeTamVideoTwoWayAttentionBlock(nn.Module):
+    def __init__(self, config: EdgeTamVideoMaskDecoderConfig, skip_first_layer_pe: bool = False):
+        """
+        A transformer block with four layers:
+            (1) self-attention of sparse inputs (2) cross attention of sparse inputs -> dense inputs (3) mlp block on
+            sparse inputs (4) cross attention of dense inputs -> sparse inputs
+
+        Arguments:
+            config (`EdgeTamVideoMaskDecoderConfig`):
+                The configuration used to instantiate the block.
+            attention_downsample_rate (`int`, *optional*, defaults to 2):
+                The downsample ratio of the block used to reduce the inner dim of the attention.
+            skip_first_layer_pe (`bool`, *optional*, defaults to `False`):
+                Whether or not to skip the addition of the query_point_embedding on the first layer.
+ """ + super().__init__() + self.self_attn = EdgeTamVideoAttention(config, downsample_rate=1) + self.layer_norm1 = nn.LayerNorm(config.hidden_size) + + self.cross_attn_token_to_image = EdgeTamVideoAttention(config) + self.layer_norm2 = nn.LayerNorm(config.hidden_size) + + self.mlp = EdgeTamVideoFeedForward( + config.hidden_size, config.mlp_dim, config.hidden_size, num_layers=config.num_hidden_layers + ) + self.layer_norm3 = nn.LayerNorm(config.hidden_size) + + self.layer_norm4 = nn.LayerNorm(config.hidden_size) + self.cross_attn_image_to_token = EdgeTamVideoAttention(config) + + self.skip_first_layer_pe = skip_first_layer_pe + + def forward( + self, + queries: Tensor, + keys: Tensor, + query_point_embedding: Tensor, + key_point_embedding: Tensor, + attention_similarity: Tensor, + **kwargs: Unpack[TransformersKwargs], + ): + # Self attention block + if self.skip_first_layer_pe: + queries, _ = self.self_attn(query=queries, key=queries, value=queries) + else: + query = queries + query_point_embedding + attn_out, _ = self.self_attn(query=query, key=query, value=queries) + queries = queries + attn_out + queries = self.layer_norm1(queries) + + # Cross attention block, tokens attending to image embedding + query = queries + query_point_embedding + key = keys + key_point_embedding + + attn_out, _ = self.cross_attn_token_to_image( + query=query, key=key, value=keys, attention_similarity=attention_similarity + ) + queries = queries + attn_out + + queries = self.layer_norm2(queries) + + # MLP block + mlp_out = self.mlp(queries) + queries = queries + mlp_out + queries = self.layer_norm3(queries) + + # Cross attention block, image embedding attending to tokens + query = queries + query_point_embedding + key = keys + key_point_embedding + + attn_out, _ = self.cross_attn_image_to_token(query=key, key=query, value=queries) + keys = keys + attn_out + + keys = self.layer_norm4(keys) + return queries, keys, attn_out + + +# copied and adapted from original implementation, also practically equal to DetrSinePositionEmbedding +class EdgeTamVideoPositionEmbeddingSine(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one used by the Attention is all you + need paper, generalized to work on images. 
+ """ + + def __init__( + self, num_pos_feats: int = 64, temperature: int = 10000, normalize: bool = False, scale: Optional[float] = None + ): + super().__init__() + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + self.num_pos_feats = num_pos_feats + self.temperature = temperature + self.normalize = normalize + self.scale = 2 * math.pi if scale is None else scale + + @compile_compatible_method_lru_cache(maxsize=2) + def forward( + self, + shape: torch.Size, + device: Union[torch.device, str], + dtype: torch.dtype, + mask: Optional[Tensor] = None, + ) -> Tensor: + if mask is None: + mask = torch.zeros((shape[0], shape[2], shape[3]), device=device, dtype=torch.bool) + not_mask = (~mask).to(dtype) + y_embed = not_mask.cumsum(1) + x_embed = not_mask.cumsum(2) + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.num_pos_feats, dtype=torch.int64, device=device).to(dtype) + dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.num_pos_feats) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +class EdgeTamVideoMemoryFuser(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.layers = nn.ModuleList( + [EdgeTamVideoMemoryFuserCXBlock(config) for _ in range(config.memory_fuser_num_layers)] + ) + + def forward(self, hidden_states): + # normally hidden_states: (N, C, H, W) + for layer in self.layers: + hidden_states = layer(hidden_states) + return hidden_states + + +class EdgeTamVideoMaskDownSamplerLayer(nn.Module): + def __init__(self, config: EdgeTamVideoConfig, in_channels: int, out_channels: int): + super().__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=config.mask_downsampler_kernel_size, + stride=config.mask_downsampler_stride, + padding=config.mask_downsampler_padding, + ) + self.layer_norm = EdgeTamVideoLayerNorm(out_channels, eps=1e-6, data_format="channels_first") + self.activation = ACT2FN[config.mask_downsampler_hidden_act] + + def forward(self, x): + return self.activation(self.layer_norm(self.conv(x))) + + +class EdgeTamVideoMaskDownSampler(nn.Module): + """ + Progressively downsample a mask by total_stride, each time by stride. + Note that LayerNorm is applied per *token*, like in ViT. + + With each downsample (by a factor stride**2), channel capacity increases by the same factor. + In the end, we linearly project to embed_dim channels. 
+ """ + + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + + num_layers = int(math.log2(config.mask_downsampler_total_stride) // math.log2(config.mask_downsampler_stride)) + + self.layers = nn.ModuleList() + self.activation = ACT2FN[config.mask_downsampler_hidden_act] + mask_in_chans, mask_out_chans = 1, 1 + for _ in range(num_layers): + mask_out_chans = mask_in_chans * (config.mask_downsampler_stride**2) + self.layers.append(EdgeTamVideoMaskDownSamplerLayer(config, mask_in_chans, mask_out_chans)) + mask_in_chans = mask_out_chans + + self.final_conv = nn.Conv2d(mask_out_chans, config.mask_downsampler_embed_dim, kernel_size=1) + + def forward(self, x): + for layer in self.layers: + x = layer(x) + x = self.final_conv(x) + return x + + +class EdgeTamVideoMemoryEncoder(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + + hidden_size = config.memory_encoder_hidden_size + output_channels = config.memory_encoder_output_channels + self.mask_downsampler = EdgeTamVideoMaskDownSampler(config) + self.feature_projection = nn.Conv2d(hidden_size, hidden_size, kernel_size=1) + self.memory_fuser = EdgeTamVideoMemoryFuser(config) + self.position_encoding = EdgeTamVideoPositionEmbeddingSine(num_pos_feats=output_channels // 2, normalize=True) + self.projection = nn.Conv2d(hidden_size, output_channels, kernel_size=1) + + def forward( + self, + vision_features: torch.Tensor, + masks: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + ## Process masks + masks = self.mask_downsampler(masks) + ## Fuse pixel_features and downsampled masks + + vision_features = self.feature_projection(vision_features) + vision_features = vision_features + masks + vision_features = self.memory_fuser(vision_features) + vision_features = self.projection(vision_features) + + vision_pos_enc = self.position_encoding(vision_features.shape, vision_features.device, vision_features.dtype) + + return vision_features, vision_pos_enc + + +class EdgeTamVideoFeedForward(nn.Module): + def __init__( + self, + input_dim: int, + hidden_dim: int, + output_dim: int, + num_layers: int, + activation: str = "relu", + sigmoid_output: bool = False, + ): + super().__init__() + self.num_layers = num_layers + self.activation = ACT2FN[activation] + self.proj_in = nn.Linear(input_dim, hidden_dim) + self.proj_out = nn.Linear(hidden_dim, output_dim) + self.layers = nn.ModuleList([nn.Linear(hidden_dim, hidden_dim) for _ in range(num_layers - 2)]) + self.sigmoid_output = sigmoid_output + + def forward(self, hidden_states): + hidden_states = self.proj_in(hidden_states) + hidden_states = self.activation(hidden_states) + for layer in self.layers: + hidden_states = self.activation(layer(hidden_states)) + + hidden_states = self.proj_out(hidden_states) + if self.sigmoid_output: + hidden_states = F.sigmoid(hidden_states) + return hidden_states + + +@auto_docstring +class EdgeTamVideoPreTrainedModel(PreTrainedModel): + config_class = EdgeTamVideoConfig + base_model_prefix = "edgetam_video" + main_input_name = "pixel_values" + _supports_sdpa = True + _supports_flash_attn_2 = True + _supports_attention_backend = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + 
module.weight.data[module.padding_idx].zero_() + elif isinstance(module, (nn.LayerNorm, EdgeTamVideoLayerNorm)): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + elif isinstance(module, EdgeTamVideoModel): + if module.no_memory_positional_encoding is not None: + module.no_memory_positional_encoding.data.zero_() + if module.memory_temporal_positional_encoding is not None: + module.memory_temporal_positional_encoding.data.zero_() + if module.no_object_pointer is not None: + module.no_object_pointer.data.zero_() + if module.occlusion_spatial_embedding_parameter is not None: + module.occlusion_spatial_embedding_parameter.data.zero_() + if isinstance(module, EdgeTamVideoMemoryFuserCXBlock): + if module.scale is not None: + module.scale.data.zero_() + + +class EdgeTamVideoInferenceCache: + """Cache for vision features and model constants.""" + + def __init__( + self, + inference_device: Union[torch.device, str] = "cpu", + inference_state_device: Union[torch.device, str] = "cpu", + max_vision_features_cache_size: int = 1, + ): + self.inference_device = inference_device + self.inference_state_device = inference_state_device + self.max_vision_features_cache_size = max_vision_features_cache_size + + self._vision_features = {} + + def cache_vision_features(self, frame_idx: int, features: dict): + """Cache vision features with automatic device management.""" + cached = {} + if len(self._vision_features) >= self.max_vision_features_cache_size: + # remove the oldest frame + self._vision_features.pop(min(self._vision_features.keys())) + + for key, value in features.items(): + if isinstance(value, torch.Tensor): + cached[key] = value.to(self.inference_state_device, non_blocking=True) + elif isinstance(value, (list, tuple)) and value and isinstance(value[0], torch.Tensor): + cached[key] = [v.to(self.inference_state_device, non_blocking=True) for v in value] + else: + cached[key] = value + self._vision_features[frame_idx] = cached + + def get_vision_features(self, frame_idx: int) -> Optional[dict]: + """Get cached vision features, automatically moved to inference device.""" + if frame_idx not in self._vision_features: + return None + + cached = self._vision_features[frame_idx] + moved = {} + for key, value in cached.items(): + if isinstance(value, torch.Tensor): + moved[key] = value.to(self.inference_device, non_blocking=True) + elif isinstance(value, (list, tuple)) and value and isinstance(value[0], torch.Tensor): + moved[key] = [v.to(self.inference_device, non_blocking=True) for v in value] + else: + moved[key] = value + return moved + + def clear_all(self): + """Clear all cached data.""" + self._vision_features.clear() + + +class EdgeTamVideoInferenceSession: + r""" + Manages video inference session parameters, state and cache. + + Args: + video (`torch.FloatTensor`, *optional*): + The video to process. No need to provide when streaming. + video_height (`int`, *optional*): + The height of the video. + video_width (`int`, *optional*): + The width of the video. + inference_device (`torch.device`, *optional*, defaults to `"cpu"`): + The device to use for inference. + inference_state_device (`torch.device`, *optional*, defaults to `"cpu"`): + The device to store the inference state on. + video_storage_device (`torch.device`, *optional*, defaults to `"cpu"`): + The device to store the video on. + dtype (`torch.dtype`, *optional*, defaults to `"float32"`): + The dtype to use for the video. 
+ max_vision_features_cache_size (`int`, *optional*, defaults to 1): + The maximum number of vision features to cache. + """ + + def __init__( + self, + video: Optional[torch.FloatTensor] = None, + video_height: Optional[int] = None, + video_width: Optional[int] = None, + inference_device: Union[torch.device, str] = "cpu", + inference_state_device: Union[torch.device, str] = "cpu", + video_storage_device: Union[torch.device, str] = "cpu", + dtype: Union[torch.dtype, str] = "float32", + max_vision_features_cache_size: int = 1, + ): + # store as a dictionary to avoid double memory allocation with torch.cat when adding new frames + self.processed_frames = ( + dict(enumerate(video.to(video_storage_device, dtype=dtype))) if video is not None else None + ) + self.video_height = video_height + self.video_width = video_width + + self.inference_device = inference_device + self.inference_state_device = inference_state_device + self.video_storage_device = video_storage_device + self.dtype = dtype + self.max_vision_features_cache_size = max_vision_features_cache_size + + # Cache for computed features + self.cache = EdgeTamVideoInferenceCache( + inference_device=self.inference_device, + inference_state_device=self.inference_state_device, + max_vision_features_cache_size=self.max_vision_features_cache_size, + ) + + # Persistent object tracking state + self._obj_id_to_idx = OrderedDict() + self._obj_idx_to_id = OrderedDict() + self.obj_ids = [] + + # Persistent user inputs + self.point_inputs_per_obj = {} + self.mask_inputs_per_obj = {} + + # Persistent model outputs/history + self.output_dict_per_obj = {} + self.frames_tracked_per_obj = {} + + # Session state flags + self.obj_with_new_inputs = [] + + @property + def num_frames(self) -> Optional[int]: + return len(self.processed_frames) if self.processed_frames is not None else None + + # Object management + def obj_id_to_idx(self, obj_id: int) -> int: + """Map object ID to index, creating new entry if needed.""" + obj_idx = self._obj_id_to_idx.get(obj_id, None) + if obj_idx is not None: + return obj_idx + + obj_idx = len(self._obj_id_to_idx) + self._obj_id_to_idx[obj_id] = obj_idx + self._obj_idx_to_id[obj_idx] = obj_id + self.obj_ids = list(self._obj_id_to_idx) + + self.point_inputs_per_obj[obj_idx] = {} + self.mask_inputs_per_obj[obj_idx] = {} + self.output_dict_per_obj[obj_idx] = { + "cond_frame_outputs": {}, + "non_cond_frame_outputs": {}, + } + self.frames_tracked_per_obj[obj_idx] = {} + + return obj_idx + + # Video Inference specific functions + def obj_idx_to_id(self, obj_idx: int) -> int: + """Map model-side object index to client-side object id.""" + return self._obj_idx_to_id[obj_idx] + + def get_obj_num(self) -> int: + """Get the total number of unique object ids received so far in this session.""" + return len(self._obj_idx_to_id) + + # Input management with device handling + def add_point_inputs(self, obj_idx: int, frame_idx: int, inputs: dict): + """Add point inputs with automatic device placement.""" + device_inputs = {} + for key, value in inputs.items(): + if isinstance(value, torch.Tensor): + device_inputs[key] = value.to(self.inference_device, non_blocking=True) + else: + device_inputs[key] = value + self.point_inputs_per_obj[obj_idx][frame_idx] = device_inputs + + def remove_point_inputs(self, obj_idx: int, frame_idx: int): + """Remove point inputs.""" + self.point_inputs_per_obj[obj_idx].pop(frame_idx, None) + + def add_mask_inputs(self, obj_idx: int, frame_idx: int, inputs: torch.Tensor): + """Add mask inputs with automatic device 
placement.""" + self.mask_inputs_per_obj[obj_idx][frame_idx] = inputs.to( + self.inference_device, dtype=self.dtype, non_blocking=True + ) + + def remove_mask_inputs(self, obj_idx: int, frame_idx: int): + """Remove mask inputs.""" + self.mask_inputs_per_obj[obj_idx].pop(frame_idx, None) + + # Output management with smart device placement + def store_output( + self, + obj_idx: int, + frame_idx: int, + output_key: Optional[str] = None, + output_value: Optional[Union[torch.Tensor, dict]] = None, + is_conditioning_frame: bool = True, + ): + """ + Store output with smart device management. + If output_key is None, the output is stored as a dictionary. + + Args: + obj_idx (int): The index of the object. + frame_idx (int): The index of the frame. + output_key (Optional[str]): The key of the output. If None, the output is stored as a dictionary. + output_value (Optional[Union[torch.Tensor, dict]]): The value of the output. + is_conditioning_frame (bool): Whether the output is for a conditioning frame. + """ + storage_key = "cond_frame_outputs" if is_conditioning_frame else "non_cond_frame_outputs" + + if output_key is None and isinstance(output_value, dict): + self.output_dict_per_obj[obj_idx][storage_key][frame_idx] = {} + for key, value in output_value.items(): + self.store_output(obj_idx, frame_idx, key, value, is_conditioning_frame) + return + + # Device placement: small tensors stay on inference device, large ones go to inference state device + if output_key in ["object_pointer", "object_score_logits"]: # Small tensors + self.output_dict_per_obj[obj_idx][storage_key][frame_idx][output_key] = output_value + elif isinstance(output_value, torch.Tensor): # Large tensors like masks, features + self.output_dict_per_obj[obj_idx][storage_key][frame_idx][output_key] = output_value.to( + self.inference_state_device, non_blocking=True + ) + else: + self.output_dict_per_obj[obj_idx][storage_key][frame_idx][output_key] = output_value + + def get_output( + self, + obj_idx: int, + frame_idx: int, + output_key: str, + is_conditioning_frame: bool = True, + ): + """ + Get output with smart device management. + + Args: + obj_idx (int): The index of the object. + frame_idx (int): The index of the frame. + output_key (str): The key of the output. + is_conditioning_frame (bool): Whether the output is for a conditioning frame. 
+ """ + storage_key = "cond_frame_outputs" if is_conditioning_frame else "non_cond_frame_outputs" + out = self.output_dict_per_obj[obj_idx][storage_key].get(frame_idx, None) + # move to inference device if needed + if out is None: + return None + value = out[output_key] + if isinstance(value, torch.Tensor): + value = value.to(self.inference_device, non_blocking=True) + return value + + # Video frame management + def add_new_frame(self, pixel_values: torch.Tensor, frame_idx: Optional[int] = None) -> int: + """Add new frame with automatic device placement.""" + pixel_values = pixel_values.to(self.video_storage_device, dtype=self.dtype, non_blocking=True) + if pixel_values.dim() == 4: + pixel_values = pixel_values.squeeze(0) + + if frame_idx is None: + frame_idx = len(self.processed_frames) if self.processed_frames is not None else 0 + + if self.processed_frames is None: + self.processed_frames = {frame_idx: pixel_values} + else: + self.processed_frames[frame_idx] = pixel_values + + return frame_idx + + def get_frame(self, frame_idx: int) -> torch.Tensor: + """Get frame from video.""" + return self.processed_frames[frame_idx].to(self.inference_device, non_blocking=True) + + def reset_tracking_data(self): + """Reset tracking data but keep cache.""" + self._obj_id_to_idx.clear() + self._obj_idx_to_id.clear() + self.obj_ids.clear() + self.point_inputs_per_obj.clear() + self.mask_inputs_per_obj.clear() + self.output_dict_per_obj.clear() + self.frames_tracked_per_obj.clear() + self.obj_with_new_inputs = [] + # Note: cache and video data are preserved + + def reset_inference_session(self): + """Reset tracking data and cache.""" + self._obj_id_to_idx.clear() + self._obj_idx_to_id.clear() + self.obj_ids.clear() + self.point_inputs_per_obj.clear() + self.mask_inputs_per_obj.clear() + self.output_dict_per_obj.clear() + self.frames_tracked_per_obj.clear() + self.obj_with_new_inputs = [] + self.cache.clear_all() + + +class EdgeTamVideoMemoryAttentionMLP(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.config = config + self.hidden_size = config.memory_attention_hidden_size + self.intermediate_size = config.memory_attention_mlp_hidden_size + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size) + self.dropout = nn.Dropout(config.memory_attention_dropout) + self.act_fn = ACT2FN[config.memory_attention_mlp_hidden_act] + + def forward(self, x): + return self.down_proj(self.dropout(self.act_fn(self.up_proj(x)))) + + +class EdgeTamVideoMemoryAttentionLayer(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + hidden_size = config.memory_attention_hidden_size + self.self_attn = EdgeTamVideoRoPESelfAttention(config) + self.cross_attn_image = EdgeTamVideoRoPECrossAttention(config, kv_in_dim=64) + + # MLP module + self.mlp = EdgeTamVideoMemoryAttentionMLP(config) + + self.layer_norm1 = nn.LayerNorm(hidden_size) + self.layer_norm2 = nn.LayerNorm(hidden_size) + self.layer_norm3 = nn.LayerNorm(hidden_size) + self.dropout1 = nn.Dropout(config.memory_attention_dropout) + self.dropout2 = nn.Dropout(config.memory_attention_dropout) + self.dropout3 = nn.Dropout(config.memory_attention_dropout) + + def forward( + self, + queries: Tensor, + keys: Tensor, + key_point_embedding: Tensor, + rope_position_embeddings: tuple[Tensor, Tensor], + rope_position_embeddings_k: Optional[tuple[Tensor, Tensor]] = None, + num_k_exclude_rope: int = 0, + rope_k_repeat: int = 0, + ) -> 
torch.Tensor: + # Self-Attention + query = self.layer_norm1(queries) + query, _ = self.self_attn(query=query, key=query, value=query, position_embeddings=rope_position_embeddings) + queries = queries + self.dropout1(query) + + # Cross-Attention + query = self.layer_norm2(queries) + query, _ = self.cross_attn_image( + query=query, + key=keys + key_point_embedding, + value=keys, + position_embeddings=rope_position_embeddings, + position_embeddings_k=rope_position_embeddings_k, + num_k_exclude_rope=num_k_exclude_rope, + rope_k_repeat=rope_k_repeat, + ) + queries = queries + self.dropout2(query) + # MLP + query = self.layer_norm3(queries) + query = self.mlp(query) + queries = queries + self.dropout3(query) + return queries + + +class EdgeTamVideoMemoryAttention(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.layers = nn.ModuleList( + [EdgeTamVideoMemoryAttentionLayer(config) for _ in range(config.memory_attention_num_layers)] + ) + self.layer_norm = nn.LayerNorm(config.memory_attention_hidden_size) + self.rotary_emb = EdgeTamVideoVisionRotaryEmbedding(config=config) + self.rotary_emb_k = EdgeTamVideoVisionRotaryEmbedding( + config, end_x=config.memory_attention_rope_k_sizes[0], end_y=config.memory_attention_rope_k_sizes[1] + ) + + def forward( + self, + current_vision_features: torch.Tensor, + memory: torch.Tensor, + current_vision_position_embeddings: Optional[Tensor] = None, + memory_posision_embeddings: Optional[Tensor] = None, + num_object_pointer_tokens: int = 0, + num_spatial_memory_tokens: int = -1, + ): + """ + Args: + current_vision_features (`torch.FloatTensor`): + The current vision features used for self-attention. + memory (`torch.FloatTensor`): + The memory features used for cross-attention. + current_vision_position_embeddings (`torch.FloatTensor`, *optional*): + The position embeddings for the current vision features. + memory_posision_embeddings (`torch.FloatTensor`, *optional*): + The position embeddings for the memory features. + num_object_pointer_tokens (`int`, *optional*, defaults to 0): + The number of object pointer tokens. 
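+            num_spatial_memory_tokens (`int`, *optional*, defaults to -1):
+                Forwarded to the attention layers as `rope_k_repeat`, controlling how many times the key RoPE tables
+                are repeated over the spatial memory tokens.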
+ """ + output = current_vision_features + if current_vision_position_embeddings is not None: + output = output + 0.1 * current_vision_position_embeddings + + # Convert to batch first + output = output.transpose(0, 1) + memory = memory.transpose(0, 1).unsqueeze(1) + memory_posision_embeddings = memory_posision_embeddings.transpose(0, 1).unsqueeze(1) + rope_position_embeddings = self.rotary_emb() + rope_position_embeddings_k = self.rotary_emb_k() + for layer in self.layers: + output = layer( + queries=output.unsqueeze(1) if output.ndim == 3 else output, + keys=memory, + key_point_embedding=memory_posision_embeddings, + rope_position_embeddings=rope_position_embeddings, + rope_position_embeddings_k=rope_position_embeddings_k, + num_k_exclude_rope=num_object_pointer_tokens, + rope_k_repeat=num_spatial_memory_tokens, + ) + + normed_output = self.layer_norm(output) + + # Convert back to seq first + normed_output = normed_output.transpose(0, 1) + + return normed_output + + +class EdgeTamVideoPerceiverMLP(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.hidden_size = config.perceiver_resampler_hidden_size + self.intermediate_size = config.perceiver_resampler_mlp_intermediate_size + + self.layer_norm = nn.LayerNorm(self.hidden_size) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = nn.GELU() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.down_proj(self.act_fn(self.up_proj(hidden_states))) + return hidden_states + + +class EdgeTamVideoPerceiverAttention(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.config = config + self.hidden_size = config.perceiver_resampler_hidden_size + self.num_attention_heads = config.perceiver_resampler_num_attention_heads + self.head_dim = config.perceiver_resampler_attention_head_dim + self.attention_dropout = config.perceiver_resampler_attention_dropout + + self.inner_dim = self.head_dim * self.num_attention_heads + self.scaling = self.head_dim**-0.5 + self.is_causal = False + + self.q_proj = nn.Linear(self.hidden_size, self.inner_dim, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.inner_dim, bias=False) + self.v_proj = nn.Linear(self.hidden_size, self.inner_dim, bias=False) + self.o_proj = nn.Linear(self.inner_dim, self.hidden_size, bias=False) + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + positional_encoding: Optional[torch.Tensor] = None, + **kwargs, + ) -> torch.Tensor: + # Project queries, keys, and values + query = self.q_proj(query) + key = self.k_proj(key) + value = self.v_proj(value) + + # Reshape for multi-head attention + batch_size, seq_len_q = query.shape[:2] + query = query.view(batch_size, seq_len_q, self.num_attention_heads, self.head_dim).transpose(1, 2) + seq_len_kv = key.shape[1] + key = key.view(batch_size, seq_len_kv, self.num_attention_heads, self.head_dim).transpose(1, 2) + value = value.view(batch_size, seq_len_kv, self.num_attention_heads, self.head_dim).transpose(1, 2) + + # Add positional encoding if provided + if positional_encoding is not None: + pos_encoding = positional_encoding.view( + batch_size, seq_len_kv, self.num_attention_heads, self.head_dim + ).transpose(1, 2) + key = key + pos_encoding + value = value + pos_encoding + + # Apply attention + attention_interface: 
Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, _ = attention_interface( + self, + query, + key, + value, + attention_mask=None, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + is_causal=self.is_causal, + **kwargs, + ) + + # Reshape output + attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len_q, self.inner_dim) + return self.o_proj(attn_output) + + +class EdgeTamVideoPerceiverEncoderLayer(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + + self.cross_attention = EdgeTamVideoPerceiverAttention(config) + self.mlp = EdgeTamVideoPerceiverMLP(config) + self.dropout = nn.Dropout(config.perceiver_resampler_hidden_dropout) + + self.self_attention = EdgeTamVideoPerceiverAttention(config) + self.self_mlp = EdgeTamVideoPerceiverMLP(config) + + # Layer norms moved from attention classes to here + self.layer_norm_input = nn.LayerNorm(config.perceiver_resampler_hidden_size) + self.layer_norm_latents = nn.LayerNorm(config.perceiver_resampler_hidden_size) + self.layer_norm_self = nn.LayerNorm(config.perceiver_resampler_hidden_size) + + def forward( + self, + latents: torch.Tensor, + input_features: torch.Tensor, + positional_encoding: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + # Cross attention with layer norms + normalized_latents = self.layer_norm_latents(latents) + normalized_input = self.layer_norm_input(input_features) + cross_attention_output = self.cross_attention( + query=normalized_latents, + key=normalized_input, + value=normalized_input, + positional_encoding=positional_encoding, + ) + latents = latents + self.dropout(cross_attention_output) + + mlp_output = self.mlp(latents) + latents = latents + mlp_output + + # Self attention with layer norm + normalized_latents_self = self.layer_norm_self(latents) + self_attention_output = self.self_attention( + query=normalized_latents_self, key=normalized_latents_self, value=normalized_latents_self + ) + latents = latents + self_attention_output + + self_mlp_output = self.self_mlp(latents) + latents = latents + self_mlp_output + + return latents + + +def window_partition(hidden_state, window_size): + """ + Partition into non-overlapping windows with padding if needed. + + Args: + hidden_state (`torch.Tensor`): + Input tokens with [batch_size, height, width, num_channels]. + window_size (`int`): + Window size. + + Returns: + `tuple(torch.FloatTensor)` comprising various elements: + - windows: windows after partition with [batch_size * num_windows, window_size, window_size, num_channels]. + - (padded_height, padded_width): padded height and width before partition + """ + batch_size, height, width, num_channels = hidden_state.shape + + pad_height = (window_size - height % window_size) % window_size + pad_width = (window_size - width % window_size) % window_size + + # Noop in case pad_width == 0 and pad_height == 0. 
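The perceiver layers above implement a latent-bottleneck idea: a fixed set of learned latents cross-attends to an arbitrary number of input tokens, so the number of memory tokens handed to the rest of the model does not grow with the input size. A small sketch of that property, with made-up sizes and a plain attention module standing in for `EdgeTamVideoPerceiverAttention`:

```python
import torch
from torch import nn

hidden_size, num_latents = 64, 256
latents = nn.Parameter(torch.randn(num_latents, hidden_size))
cross_attn = nn.MultiheadAttention(hidden_size, num_heads=1, batch_first=True)

def compress(input_tokens):  # (batch, seq_len, hidden) with any seq_len
    batch = input_tokens.shape[0]
    queries = latents.unsqueeze(0).expand(batch, -1, -1)
    out, _ = cross_attn(queries, input_tokens, input_tokens)
    return out  # always (batch, num_latents, hidden), regardless of seq_len

print(compress(torch.randn(2, 4096, hidden_size)).shape)  # torch.Size([2, 256, 64])
print(compress(torch.randn(2, 9216, hidden_size)).shape)  # torch.Size([2, 256, 64])
```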
+ hidden_state = nn.functional.pad(hidden_state, (0, 0, 0, pad_width, 0, pad_height)) + + padded_height, padded_width = height + pad_height, width + pad_width + + hidden_state = hidden_state.view( + batch_size, padded_height // window_size, window_size, padded_width // window_size, window_size, num_channels + ) + windows = hidden_state.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels) + return windows, (padded_height, padded_width) + + +class EdgeTamVideoPerceiverResampler(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.config = config + self.hidden_size = config.perceiver_resampler_hidden_size + self.num_latents_1d = config.perceiver_resampler_num_latents + self.num_latents_2d = config.perceiver_resampler_num_latents_2d + self.num_layers = config.perceiver_resampler_num_layers + + if self.num_latents_1d > 0: + self.latents_1d = nn.Parameter(torch.randn(self.num_latents_1d, self.hidden_size)) + if self.num_latents_2d > 0: + self.latents_2d = nn.Parameter(torch.randn(self.num_latents_2d, self.hidden_size)) + + self.positional_encoding = EdgeTamVideoPositionEmbeddingSine( + num_pos_feats=self.hidden_size // 2, normalize=True + ) + + self.layers = nn.ModuleList([EdgeTamVideoPerceiverEncoderLayer(config) for _ in range(self.num_layers)]) + + self.layer_norm = nn.LayerNorm(self.hidden_size) + + def forward( + self, + hidden_states: torch.Tensor, + positional_encoding: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + output_latents = [] + output_positional_encodings = [] + + if self.num_latents_1d > 0: + latents_1d, pos_1d = self._forward_1d(hidden_states, positional_encoding) + output_latents.append(latents_1d) + output_positional_encodings.append(pos_1d) + + if self.num_latents_2d > 0: + latents_2d, pos_2d = self._forward_2d(hidden_states) + output_latents.append(latents_2d) + output_positional_encodings.append(pos_2d) + + combined_latents = torch.cat(output_latents, dim=1) + + combined_positional_encoding = None + if positional_encoding is not None and output_positional_encodings: + combined_positional_encoding = torch.cat(output_positional_encodings, dim=1) + + return combined_latents, combined_positional_encoding + + def _forward_1d( + self, + hidden_states: torch.Tensor, + positional_encoding: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + batch_size = hidden_states.shape[0] + + latents = self.latents_1d.unsqueeze(0).expand(batch_size, -1, -1) + flattened_features = hidden_states.permute(0, 2, 3, 1).flatten(1, 2) + + positional_features = None + if positional_encoding is not None: + positional_features = positional_encoding.permute(0, 2, 3, 1).flatten(1, 2) + + for layer in self.layers: + latents = layer(latents, flattened_features, positional_features) + + latents = self.layer_norm(latents) + + output_positional_encoding = None + if positional_encoding is not None: + output_positional_encoding = torch.zeros_like(latents) + + return latents, output_positional_encoding + + def _forward_2d(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + batch_size, channels, height, width = hidden_states.shape + + latents_2d = self.latents_2d.unsqueeze(0).expand(batch_size, -1, -1).view(-1, 1, channels) + + num_windows_per_dim = int(math.sqrt(self.num_latents_2d)) + window_size = height // num_windows_per_dim + + windowed_input = hidden_states.permute(0, 2, 3, 1) + windowed_features, _ = window_partition(windowed_input, 
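A worked shape example for the `window_partition` helper above, with illustrative sizes: a 14x14 feature map and `window_size=8` get padded to 16x16 and split into 2x2 = 4 windows.

```python
import torch
from torch import nn

x = torch.randn(1, 14, 14, 32)                           # (batch, height, width, channels)
window_size = 8
pad_h = (window_size - 14 % window_size) % window_size   # 2
pad_w = (window_size - 14 % window_size) % window_size   # 2
x = nn.functional.pad(x, (0, 0, 0, pad_w, 0, pad_h))     # -> (1, 16, 16, 32)
x = x.view(1, 16 // 8, 8, 16 // 8, 8, 32)                # split height and width into windows
windows = x.permute(0, 1, 3, 2, 4, 5).reshape(-1, 8, 8, 32)
print(windows.shape)  # torch.Size([4, 8, 8, 32]): 2x2 windows of 8x8 tokens
```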
window_size) + windowed_features = windowed_features.flatten(1, 2) + + for layer in self.layers: + latents_2d = layer(latents_2d, windowed_features, positional_encoding=None) + + latents_2d = latents_2d.view(batch_size, num_windows_per_dim, num_windows_per_dim, channels).permute( + 0, 3, 1, 2 + ) + + positional_encoding_2d = self.positional_encoding(latents_2d.shape, latents_2d.device, latents_2d.dtype).to( + dtype=hidden_states.dtype + ) + positional_encoding_2d = positional_encoding_2d.permute(0, 2, 3, 1).flatten(1, 2) + + latents_2d = latents_2d.permute(0, 2, 3, 1).flatten(1, 2) + latents_2d = self.layer_norm(latents_2d) + + return latents_2d, positional_encoding_2d + + +@dataclass +@auto_docstring(custom_intro="Base class for the EdgeTamVideo model's output.") +class EdgeTamVideoImageSegmentationOutput(ModelOutput): + r""" + iou_scores (`torch.FloatTensor` of shape `(batch_size, point_batch_size, num_masks)`): + The Intersection over Union (IoU) scores of the predicted masks. + pred_masks (`torch.FloatTensor` of shape `(batch_size, point_batch_size, num_masks, height, width)`): + The predicted low-resolution masks. This is an alias for `low_res_masks`. These masks need to be post-processed + by the processor to be brought to the original image size. + object_score_logits (`torch.FloatTensor` of shape `(batch_size, point_batch_size, 1)`): + Logits for the object score, indicating if an object is present. + image_embeddings (`tuple(torch.FloatTensor)`): + The features from the FPN, which are used by the mask decoder. This is a tuple of `torch.FloatTensor` where each + tensor has shape `(batch_size, channels, height, width)`. + vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of each stage) of shape `(batch_size, height, width, hidden_size)`. + Hidden-states of the vision model at the output of each stage. + vision_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. + Attentions weights of the vision model. + mask_decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. + Attentions weights of the mask decoder. + high_res_masks (`torch.FloatTensor` of shape `(batch_size, point_batch_size, num_masks, image_size, image_size)`, *optional*): + The predicted masks, upscaled to the original image size. Only used for EdgeTamVideoModel. + object_pointer (`torch.FloatTensor` of shape `(batch_size, point_batch_size, hidden_size)`, *optional*): + A tensor representing the object pointer, used for tracking in videos. Only used for EdgeTamVideoModel. + """ + + iou_scores: Optional[torch.FloatTensor] = None + pred_masks: Optional[torch.FloatTensor] = None + object_score_logits: Optional[torch.FloatTensor] = None + image_embeddings: tuple[torch.FloatTensor, ...] 
= None + vision_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + vision_attentions: Optional[tuple[torch.FloatTensor, ...]] = None + mask_decoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + high_res_masks: Optional[torch.FloatTensor] = None + object_pointer: Optional[torch.FloatTensor] = None + + +@dataclass +@auto_docstring(custom_intro="Base class for the Sam2 model's output.") +class EdgeTamVideoSegmentationOutput(ModelOutput): + r""" + pred_masks (`torch.FloatTensor` of shape `(batch_size, num_masks, height, width)`): + The predicted masks stored at the model's resolution. + frame_idx (`int`): + The frame index of the video. + """ + + pred_masks: Optional[torch.FloatTensor] = None + frame_idx: Optional[int] = None + + +class EdgeTamVideoPositionalEmbedding(nn.Module): + def __init__(self, config: EdgeTamVideoPromptEncoderConfig): + super().__init__() + self.scale = config.scale + positional_embedding = self.scale * torch.randn((2, config.hidden_size // 2)) + self.register_buffer("positional_embedding", positional_embedding) + + def forward(self, input_coords, input_shape=None): + """Positionally encode points that are normalized to [0,1].""" + coordinates = input_coords.clone() + + if input_shape is not None: + coordinates[:, :, :, 0] = coordinates[:, :, :, 0] / input_shape[1] + coordinates[:, :, :, 1] = coordinates[:, :, :, 1] / input_shape[0] + coordinates.to(torch.float32) + + # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape + coordinates = 2 * coordinates - 1 + coordinates = coordinates.to(self.positional_embedding.dtype) + coordinates = coordinates @ self.positional_embedding + coordinates = 2 * np.pi * coordinates + # outputs d_1 x ... x d_n x channel shape + return torch.cat([torch.sin(coordinates), torch.cos(coordinates)], dim=-1) + + +class EdgeTamVideoMaskEmbedding(nn.Module): + def __init__(self, config: EdgeTamVideoPromptEncoderConfig): + super().__init__() + self.mask_input_channels = config.mask_input_channels // 4 + self.activation = ACT2FN[config.hidden_act] + self.conv1 = nn.Conv2d(1, self.mask_input_channels, kernel_size=2, stride=2) + self.conv2 = nn.Conv2d(self.mask_input_channels, config.mask_input_channels, kernel_size=2, stride=2) + self.conv3 = nn.Conv2d(config.mask_input_channels, config.hidden_size, kernel_size=1) + self.layer_norm1 = EdgeTamVideoLayerNorm( + self.mask_input_channels, eps=config.layer_norm_eps, data_format="channels_first" + ) + self.layer_norm2 = EdgeTamVideoLayerNorm( + self.mask_input_channels * 4, eps=config.layer_norm_eps, data_format="channels_first" + ) + + def forward(self, masks): + hidden_states = self.conv1(masks) + hidden_states = self.layer_norm1(hidden_states) + hidden_states = self.activation(hidden_states) + + hidden_states = self.conv2(hidden_states) + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.activation(hidden_states) + dense_embeddings = self.conv3(hidden_states) + return dense_embeddings + + +class EdgeTamVideoPromptEncoder(nn.Module): + def __init__(self, config: EdgeTamVideoPromptEncoderConfig): + super().__init__() + self.shared_embedding = EdgeTamVideoPositionalEmbedding(config) + self.mask_embed = EdgeTamVideoMaskEmbedding(config) + self.no_mask_embed = nn.Embedding(1, config.hidden_size) + + self.image_embedding_size = (config.image_size // config.patch_size, config.image_size // config.patch_size) + self.mask_input_size = (4 * config.image_size // config.patch_size, 4 * config.image_size // config.patch_size) + 
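A quick sketch of the random-Fourier coordinate encoding performed by `EdgeTamVideoPositionalEmbedding` above: normalized point coordinates are mapped to [-1, 1], projected through a fixed random Gaussian matrix, and expanded with sine and cosine. The sizes (`hidden_size=256`, `image_size=1024`) are illustrative.

```python
import math
import torch

hidden_size, scale = 256, 1.0
positional_embedding = scale * torch.randn(2, hidden_size // 2)  # frozen buffer

points = torch.tensor([[[[512.0, 256.0]]]])   # (batch, point_batch, num_points, 2), pixel coords
coords = points / 1024.0                      # normalize to [0, 1]
coords = 2 * coords - 1                       # map to [-1, 1]
coords = coords @ positional_embedding        # (..., hidden_size // 2)
coords = 2 * math.pi * coords
encoding = torch.cat([coords.sin(), coords.cos()], dim=-1)
print(encoding.shape)  # torch.Size([1, 1, 1, 256])
```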
self.input_image_size = config.image_size + + self.point_embed = nn.Embedding(config.num_point_embeddings, config.hidden_size) + self.hidden_size = config.hidden_size + self.not_a_point_embed = nn.Embedding(1, config.hidden_size) + + def _embed_points(self, points: torch.Tensor, labels: torch.Tensor, pad: bool) -> torch.Tensor: + """Embeds point prompts.""" + points = points + 0.5 # Shift to center of pixel + if pad: + points = torch.nn.functional.pad(points, (0, 0, 0, 1), mode="constant", value=0) + labels = torch.nn.functional.pad(labels, (0, 1), mode="constant", value=-1) + input_shape = (self.input_image_size, self.input_image_size) + point_embedding = self.shared_embedding(points, input_shape) + + # torch.where and expanding the labels tensor is required by the ONNX export + point_embedding = torch.where(labels[..., None] == -1, self.not_a_point_embed.weight, point_embedding) + + # This is required for the ONNX export. The dtype, device need to be explicitly + # specified as otherwise torch.onnx.export interprets as double + point_embedding = torch.where( + labels[..., None] != -10, + point_embedding, + torch.zeros_like(point_embedding), + ) + + # Add point embeddings for labels >= 0 + point_embedding = point_embedding + self.point_embed(labels.clamp(min=0)) * (labels >= 0).unsqueeze(-1) + + return point_embedding + + def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor: + """Embeds box prompts.""" + boxes += 0.5 # Shift to center of pixel + coords = boxes.view(*boxes.shape[:2], 2, 2) + # add padding point for consistency with the original implementation + coords = torch.nn.functional.pad(coords, (0, 0, 0, 1), mode="constant", value=0) + corner_embedding = self.shared_embedding(coords, (self.input_image_size, self.input_image_size)) + corner_embedding[:, :, 0, :] += self.point_embed.weight[2] + corner_embedding[:, :, 1, :] += self.point_embed.weight[3] + corner_embedding[:, :, 2, :] = self.not_a_point_embed.weight.expand_as(corner_embedding[:, :, 2, :]) + return corner_embedding + + def forward( + self, + input_points: Optional[tuple[torch.Tensor, torch.Tensor]], + input_labels: Optional[torch.Tensor], + input_boxes: Optional[torch.Tensor], + input_masks: Optional[torch.Tensor], + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Embeds different types of prompts, returning both sparse and dense embeddings. + + Args: + points (`torch.Tensor`, *optional*): + point coordinates and labels to embed. 
+ boxes (`torch.Tensor`, *optional*): + boxes to embed + masks (`torch.Tensor`, *optional*): + masks to embed + """ + sparse_embeddings = None + batch_size = 1 + if input_points is not None: + batch_size = input_points.shape[0] + if input_labels is None: + raise ValueError("If points are provided, labels must also be provided.") + point_embeddings = self._embed_points(input_points, input_labels, pad=(input_boxes is None)) + sparse_embeddings = point_embeddings + if input_boxes is not None: + batch_size = input_boxes.shape[0] + box_embeddings = self._embed_boxes(input_boxes) + if sparse_embeddings is None: + sparse_embeddings = box_embeddings + else: + sparse_embeddings = torch.cat([sparse_embeddings, box_embeddings], dim=2) + if input_masks is not None: + dense_embeddings = self.mask_embed(input_masks) + else: + dense_embeddings = self.no_mask_embed.weight.reshape(1, -1, 1, 1).expand( + batch_size, -1, self.image_embedding_size[0], self.image_embedding_size[1] + ) + + return sparse_embeddings, dense_embeddings + + +class EdgeTamVideoTwoWayTransformer(nn.Module): + def __init__(self, config: EdgeTamVideoMaskDecoderConfig): + super().__init__() + self.config = config + + self.num_hidden_layers = config.num_hidden_layers + self.layers = nn.ModuleList() + + for i in range(self.num_hidden_layers): + self.layers.append(EdgeTamVideoTwoWayAttentionBlock(config, skip_first_layer_pe=(i == 0))) + + self.final_attn_token_to_image = EdgeTamVideoAttention(config) + self.layer_norm_final_attn = nn.LayerNorm(config.hidden_size) + + def forward( + self, + point_embeddings: Tensor, + image_embeddings: Tensor, + image_positional_embeddings: Tensor, + attention_similarity: Tensor, + target_embedding=None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, BaseModelOutput]: + if image_embeddings is None: + raise ValueError("You have to specify an image_embedding") + + image_embeddings = image_embeddings.flatten(2).permute(0, 2, 1).unsqueeze(1) + image_positional_embeddings = image_positional_embeddings.flatten(2).permute(0, 2, 1).unsqueeze(1) + + # Prepare queries + queries = point_embeddings + keys = image_embeddings + + # Apply transformer blocks and final layernorm + for layer in self.layers: + if target_embedding is not None: + queries += target_embedding + + queries, keys, _ = layer( + queries=queries, + keys=keys, + query_point_embedding=point_embeddings, + key_point_embedding=image_positional_embeddings, + attention_similarity=attention_similarity, + **kwargs, + ) + # Apply the final attention layer from the points to the image + query = queries + point_embeddings + key = keys + image_positional_embeddings + + attn_out, _ = self.final_attn_token_to_image(query=query, key=key, value=keys) + + queries = queries + attn_out + queries = self.layer_norm_final_attn(queries) + return queries, keys + + +class EdgeTamVideoMaskDecoder(nn.Module): + def __init__(self, config: EdgeTamVideoMaskDecoderConfig): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + + self.num_multimask_outputs = config.num_multimask_outputs + self.num_mask_tokens = config.num_multimask_outputs + 1 + + self.iou_token = nn.Embedding(1, self.hidden_size) + self.mask_tokens = nn.Embedding(self.num_mask_tokens, self.hidden_size) + + self.transformer = EdgeTamVideoTwoWayTransformer(config) + + # should we create a new class for this? 
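For reference, a shape sketch of what the prompt encoder above returns, assuming `hidden_size=256` and a 64x64 image-embedding grid (the actual values come from the EdgeTAM configuration). When boxes are present the points are not padded, each box contributes three tokens (two corners plus a padding token), and the dense path broadcasts the learned "no mask" vector when no mask prompt is given.

```python
import torch

batch_size, point_batch, num_points, hidden_size = 2, 1, 3, 256

point_tokens = torch.randn(batch_size, point_batch, num_points, hidden_size)  # no pad when boxes are present
box_tokens = torch.randn(batch_size, point_batch, 3, hidden_size)             # 2 corners + 1 padding token
sparse = torch.cat([point_tokens, box_tokens], dim=2)
print(sparse.shape)  # torch.Size([2, 1, 6, 256])

no_mask_embed = torch.randn(1, hidden_size)  # stand-in for no_mask_embed.weight
dense = no_mask_embed.reshape(1, -1, 1, 1).expand(batch_size, -1, 64, 64)
print(dense.shape)   # torch.Size([2, 256, 64, 64])
```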
+ self.upscale_conv1 = nn.ConvTranspose2d(self.hidden_size, self.hidden_size // 4, kernel_size=2, stride=2) + self.upscale_conv2 = nn.ConvTranspose2d(self.hidden_size // 4, self.hidden_size // 8, kernel_size=2, stride=2) + self.upscale_layer_norm = EdgeTamVideoLayerNorm(self.hidden_size // 4, data_format="channels_first") + self.activation = nn.GELU() + + mlps_list = [] + for _ in range(self.num_mask_tokens): + mlps_list += [EdgeTamVideoFeedForward(self.hidden_size, self.hidden_size, self.hidden_size // 8, 3)] + self.output_hypernetworks_mlps = nn.ModuleList(mlps_list) + self.iou_prediction_head = EdgeTamVideoFeedForward( + self.hidden_size, + config.iou_head_hidden_dim, + self.num_mask_tokens, + config.iou_head_depth, + sigmoid_output=True, + ) + + self.conv_s0 = nn.Conv2d(config.hidden_size, config.hidden_size // 8, kernel_size=1, stride=1) + self.conv_s1 = nn.Conv2d(config.hidden_size, config.hidden_size // 4, kernel_size=1, stride=1) + + self.obj_score_token = nn.Embedding(1, self.hidden_size) + self.pred_obj_score_head = EdgeTamVideoFeedForward(self.hidden_size, self.hidden_size, 1, 3) + + self.dynamic_multimask_via_stability = config.dynamic_multimask_via_stability + self.dynamic_multimask_stability_delta = config.dynamic_multimask_stability_delta + self.dynamic_multimask_stability_thresh = config.dynamic_multimask_stability_thresh + + def forward( + self, + image_embeddings: torch.Tensor, + image_positional_embeddings: torch.Tensor, + sparse_prompt_embeddings: torch.Tensor, + dense_prompt_embeddings: torch.Tensor, + multimask_output: bool, + high_resolution_features: list[torch.Tensor], + attention_similarity: Optional[torch.Tensor] = None, + target_embedding: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Predict masks given image and prompt embeddings. + + Args: + image_embeddings (`torch.Tensor`): + The embeddings from the image encoder. + image_positional_embeddings (`torch.Tensor`): + Positional encoding with the shape of image_embeddings. + sparse_prompt_embeddings (`torch.Tensor`): + The embeddings of the points and boxes. + dense_prompt_embeddings (`torch.Tensor`): + The embeddings of the mask inputs. + multimask_output (`bool`): + Whether to return multiple masks or a single mask. + high_resolution_features (`list[torch.Tensor]`, *optional*): + The high-resolution features from the vision encoder. + attention_similarity (`torch.Tensor`, *optional*): + The attention similarity tensor. + target_embedding (`torch.Tensor`, *optional*): + The target embedding. 
+ """ + batch_size, num_channels, height, width = image_embeddings.shape + point_batch_size = sparse_prompt_embeddings.shape[1] + # Concatenate output tokens + output_tokens = torch.cat( + [ + self.obj_score_token.weight, + self.iou_token.weight, + self.mask_tokens.weight, + ], + dim=0, + ) + output_tokens = output_tokens.repeat(batch_size, point_batch_size, 1, 1) + + if sparse_prompt_embeddings.shape[0] != 0: + tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=2) + else: + tokens = output_tokens + point_embeddings = tokens.to(self.iou_token.weight.dtype) + + # Expand per-image data in batch direction to be per-mask + image_embeddings = image_embeddings + dense_prompt_embeddings + image_embeddings = image_embeddings.repeat_interleave(point_batch_size, dim=0) + image_positional_embeddings = image_positional_embeddings.repeat_interleave(point_batch_size, 0) + # Run the transformer + point_embeddings, image_embeddings = self.transformer( + point_embeddings=point_embeddings, + image_embeddings=image_embeddings, + image_positional_embeddings=image_positional_embeddings, + attention_similarity=attention_similarity, + target_embedding=target_embedding, + **kwargs, + ) + iou_token_out = point_embeddings[:, :, 1, :] + mask_tokens_out = point_embeddings[:, :, 2 : (2 + self.num_mask_tokens), :] + + # Upscale mask embeddings and predict masks using the mask tokens + image_embeddings = image_embeddings.transpose(2, 3).view( + batch_size * point_batch_size, num_channels, height, width + ) + + feat_s0, feat_s1 = high_resolution_features + feat_s0 = feat_s0.repeat_interleave(point_batch_size, dim=0) + feat_s1 = feat_s1.repeat_interleave(point_batch_size, dim=0) + upscaled_embedding = self.upscale_conv1(image_embeddings) + feat_s1 + upscaled_embedding = self.activation(self.upscale_layer_norm(upscaled_embedding)) + upscaled_embedding = self.activation(self.upscale_conv2(upscaled_embedding) + feat_s0) + + hyper_in_list: list[torch.Tensor] = [] + for i in range(self.num_mask_tokens): + current_mlp = self.output_hypernetworks_mlps[i] + hyper_in_list += [current_mlp(mask_tokens_out[:, :, i, :])] + hyper_in = torch.stack(hyper_in_list, dim=2) + + _, num_channels, height, width = upscaled_embedding.shape + upscaled_embedding = upscaled_embedding.view(batch_size, point_batch_size, num_channels, height * width) + masks = (hyper_in @ upscaled_embedding).view(batch_size, point_batch_size, -1, height, width) + + # Generate mask quality predictions + iou_pred = self.iou_prediction_head(iou_token_out) + object_score_logits = self.pred_obj_score_head(point_embeddings[:, :, 0, :]) + + # Select the correct mask or masks for output + if multimask_output: + mask_slice = slice(1, None) + masks = masks[:, :, mask_slice, :, :] + iou_pred = iou_pred[:, :, mask_slice] + elif self.dynamic_multimask_via_stability and not self.training: + mask_slice = slice(0, 1) + masks, iou_pred = self._dynamic_multimask_via_stability(masks, iou_pred) + else: + mask_slice = slice(0, 1) + masks = masks[:, :, mask_slice, :, :] + iou_pred = iou_pred[:, :, mask_slice] + + sam_tokens_out = mask_tokens_out[:, :, mask_slice] # [b, 3, c] shape + + return masks, iou_pred, sam_tokens_out, object_score_logits + + def _get_stability_scores(self, mask_logits): + """ + Compute stability scores of the mask logits based on the IoU between upper and + lower thresholds. 
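A small sketch of the hypernetwork step in the mask decoder forward above: each mask token is mapped by its own MLP to a weight vector with `hidden_size // 8` channels, which is then dotted with the upscaled image embedding to produce one mask per token. Sizes here are illustrative (hidden_size 256, a 4x upscaled 256x256 grid).

```python
import torch

batch, point_batch, num_mask_tokens, channels = 1, 1, 4, 256 // 8
height = width = 256

hyper_in = torch.randn(batch, point_batch, num_mask_tokens, channels)   # per-token MLP outputs
upscaled = torch.randn(batch, point_batch, channels, height * width)    # flattened upscaled embedding

masks = (hyper_in @ upscaled).view(batch, point_batch, num_mask_tokens, height, width)
print(masks.shape)  # torch.Size([1, 1, 4, 256, 256])
```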
+ """ + mask_logits = mask_logits.flatten(-2) + stability_delta = self.dynamic_multimask_stability_delta + area_i = torch.sum(mask_logits > stability_delta, dim=-1).float() + area_u = torch.sum(mask_logits > -stability_delta, dim=-1).float() + stability_scores = torch.where(area_u > 0, area_i / area_u, 1.0) + return stability_scores + + def _dynamic_multimask_via_stability(self, all_mask_logits, all_iou_scores): + """ + When outputting a single mask, if the stability score from the current single-mask + output (based on output token 0) falls below a threshold, we instead select from + multi-mask outputs (based on output token 1~3) the mask with the highest predicted + IoU score. This is intended to ensure a valid mask for both clicking and tracking. + """ + # The best mask from multimask output tokens (1~3) + multimask_logits = all_mask_logits[:, :, 1:, :, :] + multimask_iou_scores = all_iou_scores[:, :, 1:] + best_scores_inds = torch.argmax(multimask_iou_scores, dim=-1) # [B, P] + best_scores_inds_expanded = best_scores_inds.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1) + best_scores_inds_expanded = best_scores_inds_expanded.expand( + -1, -1, 1, multimask_logits.size(-2), multimask_logits.size(-1) + ) + best_multimask_logits = torch.gather(multimask_logits, 2, best_scores_inds_expanded) # [B, P, 1, H, W] + best_multimask_iou_scores = torch.gather(multimask_iou_scores, 2, best_scores_inds.unsqueeze(-1)) # [B, P, 1] + + # The mask from singlemask output token 0 and its stability score + singlemask_logits = all_mask_logits[:, :, 0:1, :, :] + singlemask_iou_scores = all_iou_scores[:, :, 0:1] + stability_scores = self._get_stability_scores(singlemask_logits) + is_stable = stability_scores >= self.dynamic_multimask_stability_thresh + + # Dynamically fall back to best multimask output upon low stability scores. + mask_logits_out = torch.where( + is_stable[..., None, None].expand_as(singlemask_logits), + singlemask_logits, + best_multimask_logits, + ) + iou_scores_out = torch.where( + is_stable.expand_as(singlemask_iou_scores), + singlemask_iou_scores, + best_multimask_iou_scores, + ) + return mask_logits_out, iou_scores_out + + +# a large negative value as a placeholder score for missing objects +NO_OBJ_SCORE = -1024.0 + + +def get_1d_sine_pe(pos_inds, dim, temperature=10000): + """ + Get 1D sine positional embedding as in the original Transformer paper. 
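A numeric example of the stability score used by `_dynamic_multimask_via_stability` above: the ratio of the mask area at a logit threshold of `+delta` to the area at `-delta`. A score near 1 means small logit perturbations barely change the mask; a low score triggers the fallback to the best multimask output.

```python
import torch

delta = 0.05
mask_logits = torch.tensor([[3.0, 2.0, 0.04, -0.04, -2.0, -3.0]])
area_i = (mask_logits > delta).sum(dim=-1).float()    # tensor([2.])
area_u = (mask_logits > -delta).sum(dim=-1).float()   # tensor([4.])
stability = torch.where(area_u > 0, area_i / area_u, torch.ones_like(area_u))
print(stability)  # tensor([0.5000]) -> considered unstable for a threshold like 0.98
```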
+ """ + pe_dim = dim // 2 + dim_t = torch.arange(pe_dim, dtype=torch.float32, device=pos_inds.device) + dim_t = temperature ** (2 * (dim_t // 2) / pe_dim) + + pos_embed = pos_inds.unsqueeze(-1) / dim_t + pos_embed = torch.cat([pos_embed.sin(), pos_embed.cos()], dim=-1) + return pos_embed + + +@auto_docstring +class EdgeTamVideoModel(EdgeTamVideoPreTrainedModel): + _tied_weights_keys = ["prompt_encoder.shared_embedding.positional_embedding"] + # need to be ignored, as it's a buffer and will not be correctly detected as tied weight + _keys_to_ignore_on_load_missing = ["prompt_encoder.shared_embedding.positional_embedding"] + _can_record_outputs = {"mask_decoder_attentions": OutputRecorder(EdgeTamVideoTwoWayAttentionBlock, index=2)} + _keys_to_ignore_on_load_unexpected = [] + + def __init__(self, config: EdgeTamVideoConfig): + super().__init__(config) + self.shared_image_embedding = EdgeTamVideoPositionalEmbedding(config.prompt_encoder_config) + self.vision_encoder = AutoModel.from_config(config.vision_config) + self.prompt_encoder = EdgeTamVideoPromptEncoder(config.prompt_encoder_config) + # The module using it is not a PreTrainedModel subclass so we need this + config.mask_decoder_config._attn_implementation = config._attn_implementation + self.mask_decoder = EdgeTamVideoMaskDecoder(config.mask_decoder_config) + + self.num_feature_levels = config.vision_config.num_feature_levels + self.backbone_feature_sizes = config.vision_config.backbone_feature_sizes + # a single token to indicate no memory embedding from previous frames + self.hidden_dim = config.vision_config.fpn_hidden_size + self.no_memory_embedding = torch.nn.Parameter(torch.zeros(1, 1, self.hidden_dim)) + self.config = config + # For video sequence inference + self.image_size = config.image_size + self.memory_attention = EdgeTamVideoMemoryAttention(config) + self.memory_encoder = EdgeTamVideoMemoryEncoder(config) + self.no_memory_positional_encoding = torch.nn.Parameter( + torch.zeros(1, 1, config.vision_config.fpn_hidden_size) + ) + self.mem_dim = config.memory_encoder_output_channels + self.num_maskmem = config.num_maskmem # Number of memories accessible + # Temporal encoding of the memories + self.memory_temporal_positional_encoding = torch.nn.Parameter( + torch.zeros(self.num_maskmem, 1, 1, self.mem_dim) + ) + + self.no_object_pointer = torch.nn.Parameter(torch.zeros(1, self.hidden_dim)) + # A conv layer to downsample the mask prompt to stride 4 (the same stride as + # low-res SAM mask logits) and to change its scales from 0~1 to SAM logit scale, + # so that it can be fed into the SAM mask decoder to generate a pointer. 
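Stepping back to the `get_1d_sine_pe` helper defined just above, a quick shape and content check: each position gets `dim` features, with the sine terms in the first half and the cosine terms in the second half (so position 0 yields zeros then ones).

```python
import torch

def get_1d_sine_pe(pos_inds, dim, temperature=10000):
    pe_dim = dim // 2
    dim_t = torch.arange(pe_dim, dtype=torch.float32, device=pos_inds.device)
    dim_t = temperature ** (2 * (dim_t // 2) / pe_dim)
    pos_embed = pos_inds.unsqueeze(-1) / dim_t
    return torch.cat([pos_embed.sin(), pos_embed.cos()], dim=-1)

positions = torch.tensor([0.0, 0.5, 1.0])   # e.g. normalized temporal offsets
pe = get_1d_sine_pe(positions, dim=256)
print(pe.shape)                    # torch.Size([3, 256])
print(pe[0, :3], pe[0, 128:131])   # sin(0)=0 in the first half, cos(0)=1 in the second
```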
+ self.mask_downsample = torch.nn.Conv2d(1, 1, kernel_size=4, stride=4) + # a feedforward layer on SAM output tokens to turn them into object pointers + self.object_pointer_proj = EdgeTamVideoFeedForward(self.hidden_dim, self.hidden_dim, self.hidden_dim, 3) + + if self.config.enable_temporal_pos_encoding_for_object_pointers: + # a linear projection on temporal positional encoding in object pointers to + # avoid potential interference with spatial positional encoding + self.temporal_positional_encoding_projection_layer = torch.nn.Linear(self.hidden_dim, self.mem_dim) + else: + self.temporal_positional_encoding_projection_layer = torch.nn.Identity() + + self.occlusion_spatial_embedding_parameter = None # compatibility with Sam2 + if config.enable_occlusion_spatial_embedding: + self.occlusion_spatial_embedding_parameter = torch.nn.Parameter(torch.zeros(1, self.mem_dim)) + self.spatial_perceiver = EdgeTamVideoPerceiverResampler(config) + + self.post_init() + + def _tie_weights(self): + self.prompt_encoder.shared_embedding.positional_embedding.data = ( + self.shared_image_embedding.positional_embedding.data + ) + + def get_input_embeddings(self): + return self.vision_encoder.get_input_embeddings() + + def get_image_wide_positional_embeddings(self) -> torch.Tensor: + size = self.prompt_encoder.image_embedding_size + target_device = self.shared_image_embedding.positional_embedding.device + target_dtype = self.shared_image_embedding.positional_embedding.dtype + grid = torch.ones(size, device=target_device, dtype=target_dtype) + y_embed = grid.cumsum(dim=0) - 0.5 + x_embed = grid.cumsum(dim=1) - 0.5 + y_embed = y_embed / size[0] + x_embed = x_embed / size[1] + + positional_embedding = self.shared_image_embedding(torch.stack([x_embed, y_embed], dim=-1)) + return positional_embedding.permute(2, 0, 1).unsqueeze(0) # channel x height x width + + @torch.no_grad() + def get_image_embeddings( + self, + pixel_values: torch.FloatTensor, + **kwargs: Unpack[TransformersKwargs], + ) -> list[torch.Tensor]: + r""" + Returns the image embeddings by passing the pixel values through the vision encoder. + + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Input pixel values + """ + batch_size = pixel_values.shape[0] + feature_maps, _, _, _ = self.get_image_features(pixel_values, **kwargs) + + # add no memory embedding to the last feature map + feature_maps[-1] = feature_maps[-1] + self.no_memory_embedding + + # reshape feature maps to the same shape as the backbone feature sizes + image_embeddings = [ + feat.permute(1, 2, 0).view(batch_size, -1, *feat_size) + for feat, feat_size in zip(feature_maps, self.backbone_feature_sizes) + ] + + return image_embeddings + + @torch.no_grad() + def get_prompt_embeddings( + self, + input_points: Optional[torch.FloatTensor] = None, + input_labels: Optional[torch.LongTensor] = None, + input_boxes: Optional[torch.FloatTensor] = None, + input_masks: Optional[torch.LongTensor] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + r""" + Returns the prompt embeddings by passing the input points, labels, boxes and masks through the prompt encoder. + + Args: + input_points (`torch.FloatTensor` of shape `(batch_size, point_batch_size, num_points_per_image, 2)`): + Optional input points for the prompt encoder. The padding of the point is automatically done by the + processor. `point_batch_size` refers to the number of masks that we want the model to predict per + point. The model will output `point_batch_size` times 3 masks in total. 
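A short sketch of the cumsum trick in `get_image_wide_positional_embeddings` above: a grid of ones is turned into normalized (x, y) cell-center coordinates in (0, 1) before being passed through the shared coordinate embedding. A 4x4 grid is shown; the real grid matches the prompt encoder's `image_embedding_size`.

```python
import torch

size = (4, 4)
grid = torch.ones(size)
y_embed = (grid.cumsum(dim=0) - 0.5) / size[0]
x_embed = (grid.cumsum(dim=1) - 0.5) / size[1]
print(x_embed[0])     # tensor([0.1250, 0.3750, 0.6250, 0.8750]) -> cell centers along x
print(y_embed[:, 0])  # same values along y
coords = torch.stack([x_embed, y_embed], dim=-1)  # (4, 4, 2), fed to the shared embedding
print(coords.shape)
```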
+ input_labels (`torch.LongTensor` of shape `(batch_size, point_batch_size, num_points_per_image)`): + Optional input labels for the prompt encoder. The padding of the labels is automatically done by the + processor, or can be fed by the user. + input_boxes (`torch.FloatTensor` of shape `(batch_size, num_boxes_per_image, 4)`): + Optional input boxes for the prompt encoder. The padding of the boxes is automatically done by the + processor. users can also pass manually the input boxes. + input_masks (`torch.LongTensor` of shape `(batch_size, image_size, image_size)`): + Optional input masks for the prompt encoder. + """ + prompt_output = self.prompt_encoder( + input_points=input_points, + input_labels=input_labels, + input_boxes=input_boxes, + input_masks=input_masks, + ) + return prompt_output + + @torch.inference_mode() + @auto_docstring(custom_intro="Propagate the objects through a streamed video frame.") + def forward( + self, + inference_session: EdgeTamVideoInferenceSession, + frame_idx: Optional[int] = None, + frame: Optional[torch.Tensor] = None, + reverse: bool = False, + ) -> EdgeTamVideoSegmentationOutput: + r""" + inference_session (`EdgeTamVideoInferenceSession`): + The video inference session object. + frame_idx (`int`, *optional*): + The index of the frame on which to run inference. No need to provide when inferring + on a new streamed frame. + frame (`torch.Tensor`, *optional*): + The frame to process. Provide when streaming. + reverse (`bool`, *optional*, defaults to `False`): + Whether to propagate in reverse. + """ + if frame is not None: + frame_idx = inference_session.add_new_frame(frame, frame_idx) + + if frame is not None and inference_session.get_obj_num() == 0: + raise ValueError("No objects are provided for tracking; please add inputs first.") + + num_objects = inference_session.get_obj_num() + pred_masks_per_obj = [None] * num_objects + # Note: We avoid batched inference here because per-object inputs (clicks/masks) + # can differ across objects. + for obj_idx in range(num_objects): + obj_id = inference_session.obj_idx_to_id(obj_idx) + has_new_inputs = obj_id in inference_session.obj_with_new_inputs + has_cond_output = frame_idx in inference_session.output_dict_per_obj[obj_idx]["cond_frame_outputs"] + # If this object has no new inputs and this frame already has a + # conditioning output, reuse the cached masks instead of recomputing. 
+ if (not has_new_inputs) and has_cond_output: + pred_masks = inference_session.get_output(obj_idx, frame_idx, "pred_masks", is_conditioning_frame=True) + is_init_cond_frame = True + else: + # Defaults when there are no new inputs + is_init_cond_frame = False + point_inputs = None + mask_inputs = None + + if has_new_inputs: + is_init_cond_frame = frame_idx not in inference_session.frames_tracked_per_obj[obj_idx] + if is_init_cond_frame: + reverse = False + point_inputs = inference_session.point_inputs_per_obj[obj_idx].get(frame_idx, None) + mask_inputs = inference_session.mask_inputs_per_obj[obj_idx].get(frame_idx, None) + if point_inputs is not None or mask_inputs is not None: + inference_session.obj_with_new_inputs.remove(obj_id) + + current_out = self._run_single_frame_inference( + inference_session=inference_session, + obj_idx=obj_idx, + frame_idx=frame_idx, + batch_size=1, # run on the slice of a single object + is_init_cond_frame=is_init_cond_frame, + point_inputs=point_inputs, + mask_inputs=mask_inputs, + reverse=reverse, + run_mem_encoder=True, + streaming=frame is not None, + ) + inference_session.store_output( + obj_idx, frame_idx, output_value=current_out, is_conditioning_frame=is_init_cond_frame + ) + pred_masks = current_out["pred_masks"] + + pred_masks_per_obj[obj_idx] = pred_masks + if not is_init_cond_frame: + # only for tracked frames, not for initial conditioning frames + inference_session.frames_tracked_per_obj[obj_idx][frame_idx] = {"reverse": reverse} + + # Resize the output mask to the original video resolution (we directly use + # the mask scores on GPU for output to avoid any CPU conversion in between) + if len(pred_masks_per_obj) > 1: + all_pred_masks = torch.cat(pred_masks_per_obj, dim=0) + else: + all_pred_masks = pred_masks_per_obj[0] + + return EdgeTamVideoSegmentationOutput(pred_masks=all_pred_masks, frame_idx=frame_idx) + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[ + list[torch.Tensor], + list[torch.Tensor], + Optional[tuple[torch.FloatTensor, ...]], + Optional[tuple[torch.FloatTensor, ...]], + ]: + r""" + Extract and preprocess image features using the vision encoder. + + Args: + pixel_values (`torch.FloatTensor`): + Input pixel values of shape `(batch_size, num_channels, height, width)`. + + Returns: + `tuple`: A tuple containing: + - feature_maps (`list[torch.Tensor]`): List of feature maps from different levels. + - feature_maps_position_embeddings (`list[torch.Tensor]`): List of positional embeddings for each feature level. + - vision_hidden_states (`tuple[torch.FloatTensor]`, *optional*): Hidden states from the vision encoder. + - vision_attentions (`tuple[torch.FloatTensor]`, *optional*): Attention weights from the vision encoder. 
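A toy, self-contained illustration of the per-object propagation loop above: objects are processed one at a time, a cached conditioning-frame output is reused when the object has no new inputs, and the per-object masks are concatenated at the end. Every data structure here is a stand-in (random tensors instead of `_run_single_frame_inference`), so this mirrors only the control flow, not the real computation.

```python
import torch

num_objects, frame_idx = 2, 5
cond_outputs = {0: {5: torch.zeros(1, 1, 64, 64)}}   # object 0 has a cached output for frame 5
obj_with_new_inputs = set()                          # no new clicks or masks on this frame

pred_masks_per_obj = []
for obj_idx in range(num_objects):
    cached = cond_outputs.get(obj_idx, {}).get(frame_idx)
    if obj_idx not in obj_with_new_inputs and cached is not None:
        pred_masks = cached                          # reuse the conditioning-frame output
    else:
        pred_masks = torch.randn(1, 1, 64, 64)       # stand-in for a fresh single-frame inference
    pred_masks_per_obj.append(pred_masks)

all_pred_masks = torch.cat(pred_masks_per_obj, dim=0)
print(all_pred_masks.shape)  # torch.Size([2, 1, 64, 64]): one low-res mask per object
```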
+ """ + vision_outputs: EdgeTamVideoVisionEncoderOutput = self.vision_encoder( + pixel_values, + **kwargs, + ) + + feature_maps = vision_outputs.fpn_hidden_states + feature_maps_position_embeddings = vision_outputs.fpn_position_encoding + + # precompute projected level 0 and level 1 features in SAM decoder + # to avoid running it again on every SAM click + feature_maps = list(feature_maps) + feature_maps[0] = self.mask_decoder.conv_s0(feature_maps[0]) + feature_maps[1] = self.mask_decoder.conv_s1(feature_maps[1]) + + # flatten NxCxHxW to HWxNxC + feature_maps = [feature_map.flatten(2).permute(2, 0, 1) for feature_map in feature_maps] + feature_maps_position_embeddings = [ + feature_map_position_embedding.flatten(2).permute(2, 0, 1) + for feature_map_position_embedding in feature_maps_position_embeddings + ] + + return feature_maps, feature_maps_position_embeddings, vision_outputs.hidden_states, vision_outputs.attentions + + def _prepare_vision_features( + self, + inference_session: EdgeTamVideoInferenceSession, + frame_idx: int, + batch_size: int, + ) -> tuple[torch.Tensor, list[torch.Tensor]]: + """Prepare vision features for a frame.""" + + # Check if features are cached + if cached_features := inference_session.cache.get_vision_features(frame_idx): + vision_feats = cached_features["vision_feats"] + vision_pos_embeds = cached_features["vision_pos_embeds"] + else: + # Compute features using image encoder + image_batch = inference_session.get_frame(frame_idx).unsqueeze(0) # Add batch dimension + vision_feats, vision_pos_embeds, _, _ = self.get_image_features(image_batch) + # Cache features + inference_session.cache.cache_vision_features( + frame_idx, {"vision_feats": vision_feats, "vision_pos_embeds": vision_pos_embeds} + ) + + # Expand to batch size if needed + if batch_size > 1: + vision_feats = vision_feats.expand(batch_size, -1, -1, -1) + vision_pos_embeds = [pe.expand(batch_size, -1, -1, -1) for pe in vision_pos_embeds] + + return vision_feats, vision_pos_embeds + + def _single_frame_forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + input_points: Optional[torch.FloatTensor] = None, + input_labels: Optional[torch.LongTensor] = None, + input_boxes: Optional[torch.FloatTensor] = None, + input_masks: Optional[torch.LongTensor] = None, + image_embeddings: Optional[torch.FloatTensor] = None, + multimask_output: bool = True, + attention_similarity: Optional[torch.FloatTensor] = None, + target_embedding: Optional[torch.FloatTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> EdgeTamVideoImageSegmentationOutput: + """ + input_points (`torch.FloatTensor` of shape `(batch_size, num_points, 2)`): + Input 2D spatial points, this is used by the prompt encoder to encode the prompt. Generally yields to much + better results. The points can be obtained by passing a list of list of list to the processor that will + create corresponding `torch` tensors of dimension 4. The first dimension is the image batch size, the + second dimension is the point batch size (i.e. how many segmentation masks do we want the model to predict + per input point), the third dimension is the number of points per segmentation mask (it is possible to pass + multiple points for a single mask), and the last dimension is the x (vertical) and y (horizontal) + coordinates of the point. 
If a different number of points is passed either for each image, or for each + mask, the processor will create "PAD" points that will correspond to the (0, 0) coordinate, and the + computation of the embedding will be skipped for these points using the labels. + input_labels (`torch.LongTensor` of shape `(batch_size, point_batch_size, num_points)`): + Input labels for the points, this is used by the prompt encoder to encode the prompt. According to the + official implementation, there are 3 types of labels + + - `1`: the point is a point that contains the object of interest + - `0`: the point is a point that does not contain the object of interest + - `-1`: the point corresponds to the background + + We added the label: + + - `-10`: the point is a padding point, thus should be ignored by the prompt encoder + + The padding labels should be automatically done by the processor. + input_boxes (`torch.FloatTensor` of shape `(batch_size, num_boxes, 4)`): + Input boxes for the points, this is used by the prompt encoder to encode the prompt. Generally yields to + much better generated masks. The boxes can be obtained by passing a list of list of list to the processor, + that will generate a `torch` tensor, with each dimension corresponding respectively to the image batch + size, the number of boxes per image and the coordinates of the top left and bottom right point of the box. + In the order (`x1`, `y1`, `x2`, `y2`): + + - `x1`: the x coordinate of the top left point of the input box + - `y1`: the y coordinate of the top left point of the input box + - `x2`: the x coordinate of the bottom right point of the input box + - `y2`: the y coordinate of the bottom right point of the input box + input_masks (`torch.FloatTensor` of shape `(batch_size, image_size, image_size)`): + SAM model also accepts segmentation masks as input. The mask will be embedded by the prompt encoder to + generate a corresponding embedding, that will be fed later on to the mask decoder. These masks needs to be + manually fed by the user, and they need to be of shape (`batch_size`, `image_size`, `image_size`). + image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_channels, window_size, window_size)`): + Image embeddings, this is used by the mask decoder to generate masks and iou scores. For more memory + efficient computation, users can first retrieve the image embeddings using the `get_image_embeddings` + method, and then feed them to the `forward` method instead of feeding the `pixel_values`. + multimask_output (`bool`, *optional*): + In the original implementation and paper, the model always outputs 3 masks per image (or per point / per + bounding box if relevant). However, it is possible to just output a single mask, that corresponds to the + "best" mask, by specifying `multimask_output=False`. + attention_similarity (`torch.FloatTensor`, *optional*): + Attention similarity tensor, to be provided to the mask decoder for target-guided attention in case the + model is used for personalization as introduced in [PerSAM](https://huggingface.co/papers/2305.03048). + target_embedding (`torch.FloatTensor`, *optional*): + Embedding of the target concept, to be provided to the mask decoder for target-semantic prompting in case + the model is used for personalization as introduced in [PerSAM](https://huggingface.co/papers/2305.03048). 
+ """ + if not ((pixel_values is None) ^ (image_embeddings is None)): + raise ValueError("Exactly one of pixel_values or image_embeddings must be provided.") + if input_points is not None and input_boxes is not None: + if input_points.shape[1] != input_boxes.shape[1]: + raise ValueError( + f"You should provide as many bounding boxes as input points per box. Got {input_points.shape[1]} and {input_boxes.shape[1]}." + ) + elif input_points is not None: + num_objects = input_points.shape[1] + elif input_boxes is not None: + num_objects = input_boxes.shape[1] + elif input_masks is not None: + num_objects = input_masks.shape[1] + else: + num_objects = 1 + + image_positional_embeddings = self.get_image_wide_positional_embeddings() + # repeat with batch size + batch_size = pixel_values.shape[0] if pixel_values is not None else image_embeddings[-1].shape[0] + image_positional_embeddings = image_positional_embeddings.repeat(batch_size, 1, 1, 1) + + vision_attentions = None + vision_hidden_states = None + + if pixel_values is not None: + feature_maps, _, vision_hidden_states, vision_attentions = self.get_image_features( + pixel_values, + **kwargs, + ) + + # add no memory embedding to the last feature map + feature_maps[-1] = feature_maps[-1] + self.no_memory_embedding + + # reshape feature maps to the same shape as the backbone feature sizes + image_embeddings = [ + feat.permute(1, 2, 0).view(batch_size, -1, *feat_size) + for feat, feat_size in zip(feature_maps, self.backbone_feature_sizes) + ] + + if input_points is not None and input_labels is None: + input_labels = torch.ones_like(input_points[:, :, :, 0], dtype=torch.int, device=input_points.device) + + if input_points is None and input_boxes is None: + # If no points are provide, pad with an empty point (with label -1) + input_points = torch.zeros( + batch_size, 1, 1, 2, dtype=image_embeddings[-1].dtype, device=image_embeddings[-1].device + ) + input_labels = -torch.ones(batch_size, 1, 1, dtype=torch.int32, device=image_embeddings[-1].device) + + if input_masks is not None: + # If mask_inputs is provided, downsize it into low-res mask input if needed + # and feed it as a dense mask prompt into the SAM mask encoder + if input_masks.shape[-2:] != self.prompt_encoder.mask_input_size: + input_masks = F.interpolate( + input_masks.float(), + size=self.prompt_encoder.mask_input_size, + align_corners=False, + mode="bilinear", + antialias=True, # use antialias for downsampling + ).to(input_masks.dtype) + + sparse_embeddings, dense_embeddings = self.prompt_encoder( + input_points=input_points, + input_labels=input_labels, + input_boxes=input_boxes, + input_masks=input_masks, + ) + low_res_multimasks, iou_scores, sam_output_tokens, object_score_logits = self.mask_decoder( + image_embeddings=image_embeddings[-1], + image_positional_embeddings=image_positional_embeddings, + sparse_prompt_embeddings=sparse_embeddings, + dense_prompt_embeddings=dense_embeddings, + multimask_output=multimask_output, + high_resolution_features=image_embeddings[:-1], + attention_similarity=attention_similarity, + target_embedding=target_embedding, + **kwargs, + ) + + is_obj_appearing = object_score_logits > 0 + # Mask used for spatial memories is always a *hard* choice between obj and no obj, + # consistent with the actual mask prediction + low_res_multimasks = torch.where( + is_obj_appearing[:, None, None], + low_res_multimasks, + NO_OBJ_SCORE, + ) + + # convert masks from possibly bfloat16 (or float16) to float32 + # (older PyTorch versions before 2.1 don't support 
`interpolate` on bf16) + high_res_multimasks = ( + F.interpolate( + low_res_multimasks.squeeze(1).float(), + size=(self.image_size, self.image_size), + mode="bilinear", + align_corners=False, + ) + .unsqueeze(1) + .to(low_res_multimasks.dtype) + ) + sam_output_token = sam_output_tokens[:, :, 0] + if multimask_output: + # take the best mask prediction (with the highest IoU estimation) + best_iou_inds = torch.argmax(iou_scores, dim=-1) + batch_inds = torch.arange(batch_size, device=high_res_multimasks.device) + object_batch_inds = torch.arange(num_objects, device=high_res_multimasks.device) + low_res_masks = low_res_multimasks[batch_inds, object_batch_inds, best_iou_inds] + high_res_masks = high_res_multimasks[batch_inds, object_batch_inds, best_iou_inds] + if sam_output_tokens.size(2) > 1: + sam_output_token = sam_output_tokens[batch_inds, object_batch_inds, best_iou_inds] + else: + low_res_masks, high_res_masks = low_res_multimasks[:, :, 0], high_res_multimasks[:, :, 0] + + # Extract object pointer from the SAM output token (with occlusion handling) + object_pointer = self.object_pointer_proj(sam_output_token) + lambda_is_obj_appearing = is_obj_appearing.to(object_pointer.dtype) + + object_pointer = lambda_is_obj_appearing * object_pointer + object_pointer = object_pointer + (1 - lambda_is_obj_appearing) * self.no_object_pointer + + return EdgeTamVideoImageSegmentationOutput( + iou_scores=iou_scores, + pred_masks=low_res_masks, + high_res_masks=high_res_masks, + object_pointer=object_pointer, + object_score_logits=object_score_logits, + image_embeddings=image_embeddings, + vision_hidden_states=vision_hidden_states, + vision_attentions=vision_attentions, + ) + + def _use_mask_as_output( + self, + backbone_features: torch.Tensor, + high_res_features: list[torch.Tensor], + mask_inputs: torch.Tensor, + ) -> EdgeTamVideoImageSegmentationOutput: + """ + Directly turn binary `mask_inputs` into a output mask logits without using SAM. + (same input and output shapes as in forward above). + """ + # Use -10/+20 as logits for neg/pos pixels (very close to 0/1 in prob after sigmoid). + out_scale, out_bias = 20.0, -10.0 # sigmoid(-10.0)=4.5398e-05 + mask_inputs_float = mask_inputs.to(backbone_features[0].dtype) + high_res_masks = mask_inputs_float * out_scale + out_bias + low_res_masks = F.interpolate( + high_res_masks.float(), + size=(high_res_masks.size(-2) // 4, high_res_masks.size(-1) // 4), + align_corners=False, + mode="bilinear", + antialias=True, # use antialias for downsampling + ).to(backbone_features[0].dtype) + # a dummy IoU prediction of all 1's under mask input + iou_scores = mask_inputs.new_ones(mask_inputs.size(0), 1).to(backbone_features[0].dtype) + # produce an object pointer using the SAM decoder from the mask input + object_pointer = self._single_frame_forward( + input_masks=self.mask_downsample(mask_inputs_float.to(backbone_features[0].dtype)), + image_embeddings=high_res_features + [backbone_features], + ).object_pointer + # In this method, we are treating mask_input as output, e.g. using it directly to create spatial mem; + # Below, we follow the same design axiom to use mask_input to decide if obj appears or not instead of relying + # on the object_scores from the SAM decoder. 
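Before that check, the binary mask prompt is rescaled into confident logits; a quick numeric check of the `out_scale` / `out_bias` values used above shows that mask values {0, 1} become logits {-10, +10}, i.e. probabilities of roughly 4.5e-05 and 0.99995 after a sigmoid, so the provided mask behaves like a confident model prediction.

```python
import torch

out_scale, out_bias = 20.0, -10.0
mask_inputs = torch.tensor([0.0, 1.0])
logits = mask_inputs * out_scale + out_bias
print(logits)                 # tensor([-10.,  10.])
print(torch.sigmoid(logits))  # tensor([4.5398e-05, 9.9995e-01])
```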
+ is_obj_appearing = torch.any(mask_inputs.flatten(1).float() > 0.0, dim=1) + is_obj_appearing = is_obj_appearing[..., None] + lambda_is_obj_appearing = is_obj_appearing.to(backbone_features[0].dtype) + object_score_logits = out_scale * lambda_is_obj_appearing + out_bias + object_pointer = lambda_is_obj_appearing * object_pointer + object_pointer = object_pointer + (1 - lambda_is_obj_appearing) * self.no_object_pointer + return EdgeTamVideoImageSegmentationOutput( + iou_scores=iou_scores, + pred_masks=low_res_masks, + high_res_masks=high_res_masks, + object_pointer=object_pointer, + object_score_logits=object_score_logits, + image_embeddings=high_res_features + [backbone_features], + ) + + def _gather_memory_frame_outputs( + self, + inference_session: EdgeTamVideoInferenceSession, + obj_idx: int, + frame_idx: int, + track_in_reverse_time: bool = False, + ) -> list[tuple[int, dict]]: + """ + Get memory frames from conditioning and non-conditioning outputs. + + Returns: + List of (relative_temporal_offset, output_data) tuples. + """ + temporal_positions_and_previous_outputs = [] + + # Add conditioning frame outputs (no limit here, as is the case in the original checkpoints) + conditioning_outputs = inference_session.output_dict_per_obj[obj_idx]["cond_frame_outputs"] + if not conditioning_outputs: + raise ValueError( + "maskmem_features in conditioning outputs cannot be empty when not is_initial_conditioning_frame" + ) + + # Store (temporal_position, output_data) tuples + temporal_positions_and_previous_outputs = [(0, out) for out in conditioning_outputs.values()] + + # Add non-conditioning memory frames (up to self.num_maskmem - 1) + # These are typically frames tracked by the model without direct user input. + # Frames are selected with a stride, prioritizing the most recent ones. Here we only support stride = 1 for simplicity. + for relative_temporal_offset in range(self.num_maskmem - 1, 0, -1): + # relative_temporal_offset: how many frames before (or after if reversing) the current frame + if not track_in_reverse_time: + previous_frame_idx = frame_idx - relative_temporal_offset + else: + previous_frame_idx = frame_idx + relative_temporal_offset + + # check if the output is already stored without using get_output to avoid unnecessary memory transfers between CPU and GPU + output_data = inference_session.output_dict_per_obj[obj_idx]["non_cond_frame_outputs"].get( + previous_frame_idx, None + ) + + temporal_positions_and_previous_outputs.append((relative_temporal_offset, output_data)) + + return temporal_positions_and_previous_outputs + + def _build_memory_attention_inputs( + self, + temporal_positions_and_previous_outputs: list[tuple[int, dict]], + device: torch.device, + ) -> tuple[list[torch.Tensor], list[torch.Tensor]]: + """ + Concatenate memory features and positional embeddings from previous frames. + + Returns: + Tuple of (memories_to_concatenate, memory_positional_embeddings_to_concatenate). 
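A small example of the relative offsets produced by `_gather_memory_frame_outputs` above: conditioning frames are tagged with offset 0, and up to `num_maskmem - 1` previously tracked frames are pulled from `frame_idx - offset` (or `frame_idx + offset` when tracking in reverse). The values below are illustrative.

```python
num_maskmem, frame_idx = 7, 10

for reverse in (False, True):
    offsets = list(range(num_maskmem - 1, 0, -1))
    frames = [frame_idx + off if reverse else frame_idx - off for off in offsets]
    print(reverse, list(zip(offsets, frames)))
# False [(6, 4), (5, 5), (4, 6), (3, 7), (2, 8), (1, 9)]
# True [(6, 16), (5, 15), (4, 14), (3, 13), (2, 12), (1, 11)]
```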
+ """ + memories_to_concatenate = [] + memory_positional_embeddings_to_concatenate = [] + + for relative_temporal_offset, prev_output_data in temporal_positions_and_previous_outputs: + if prev_output_data is None: + continue # Skip if no output data for this temporal position (e.g., padding frames) + + # Load memory features (potentially from CPU to GPU) + # Features are flattened: (Batch, Channels, H, W) -> (H*W, Batch, Channels) + memory_features = prev_output_data["maskmem_features"].to(device, non_blocking=True) + memories_to_concatenate.append(memory_features.permute(1, 0, 2)) + + # Spatial positional encoding (potentially from CPU to GPU) + spatial_memory_pos_embed = prev_output_data["maskmem_pos_enc"].to(device, non_blocking=True) + spatial_memory_pos_embed = spatial_memory_pos_embed.squeeze(1).permute(1, 0, 2) + + # Add temporal positional encoding + # self.memory_temporal_positional_encoding shape: (NumMaskMem, 1, 1, MemDim) + combined_memory_pos_embed = ( + spatial_memory_pos_embed + self.memory_temporal_positional_encoding[relative_temporal_offset - 1] + ) + memory_positional_embeddings_to_concatenate.append(combined_memory_pos_embed) + + return memories_to_concatenate, memory_positional_embeddings_to_concatenate + + def _get_object_pointers( + self, + inference_session: EdgeTamVideoInferenceSession, + obj_idx: int, + frame_idx: int, + num_total_frames: int, + device: torch.device, + track_in_reverse_time: bool = False, + streaming: bool = False, + ) -> tuple[list[int], list[torch.Tensor], int]: + """ + Get object pointers and their positional embeddings from past frames. + + Returns: + Tuple of (temporal_offsets, pointer_tokens, max_object_pointers_to_use). + """ + temporal_position_sign_multiplier = -1 if track_in_reverse_time else 1 + + # Determine max object pointers to use + if streaming: + max_object_pointers_to_use = self.config.max_object_pointers_in_encoder + else: + max_object_pointers_to_use = min(num_total_frames, self.config.max_object_pointers_in_encoder) + + temporal_offsets: list[int] = [] + pointer_tokens: list[torch.Tensor] = [] + + # Add object pointers from selected conditioning frames + # Optionally, only include pointers from past frames during evaluation + conditioning_outputs = inference_session.output_dict_per_obj[obj_idx]["cond_frame_outputs"] + eligible_conditioning_outputs = conditioning_outputs + if not self.training: + eligible_conditioning_outputs = { + temporal_idx: out + for temporal_idx, out in conditioning_outputs.items() + if (temporal_idx >= frame_idx if track_in_reverse_time else temporal_idx <= frame_idx) + } + + for temporal_idx, out_data in eligible_conditioning_outputs.items(): + temporal_difference = (frame_idx - temporal_idx) * temporal_position_sign_multiplier + temporal_offsets.append(temporal_difference) + pointer_tokens.append(out_data["object_pointer"].to(device)) + + # Add object pointers from non-conditioning frames (up to max_object_pointers_to_use - 1) + for t_diff_offset in range(1, max_object_pointers_to_use): + ref_frame_idx = frame_idx + t_diff_offset if track_in_reverse_time else frame_idx - t_diff_offset + if ref_frame_idx < 0 or ( + not streaming and num_total_frames is not None and ref_frame_idx >= num_total_frames + ): + break # Stop if frame index is out of bounds + + # check if the output is already stored without using get_output to avoid unnecessary memory transfers between CPU and GPU + out_data = inference_session.output_dict_per_obj[obj_idx]["non_cond_frame_outputs"].get( + ref_frame_idx, None + ) + if out_data 
is not None: + temporal_offsets.append(t_diff_offset) + pointer_tokens.append(out_data["object_pointer"].to(device)) + + return temporal_offsets, pointer_tokens, max_object_pointers_to_use + + def _process_object_pointers( + self, + temporal_offsets: list[int], + pointer_tokens: list[torch.Tensor], + max_object_pointers_to_use: int, + batch_size: int, + num_channels: int, + device: torch.device, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Process object pointers and compute their positional embeddings. + + Returns: + Tuple of (object_pointers, object_pointers_pos_embed). + """ + if not pointer_tokens: + return None, None + + # Stack object pointers: List of (Batch, Channels) -> (SeqLen_ptr, Batch, Channels) + object_pointers = torch.stack(pointer_tokens, dim=0) + + if self.config.enable_temporal_pos_encoding_for_object_pointers: + max_temporal_diff = float(max_object_pointers_to_use - 1) + # Determine dimensionality for temporal positional encoding of pointers + pointer_tpos_dim = num_channels + + # Normalize temporal differences before sine PE calculation + normalized_temporal_diffs = ( + torch.tensor(temporal_offsets, device=device, dtype=torch.float32) / max_temporal_diff + ) + sine_pe = get_1d_sine_pe(normalized_temporal_diffs, dim=pointer_tpos_dim).to(object_pointers.dtype) + projected_sine_pe = self.temporal_positional_encoding_projection_layer(sine_pe) + object_pointers_pos_embed = projected_sine_pe.unsqueeze(1).expand(-1, batch_size, self.mem_dim) + else: + object_pointers_pos_embed = object_pointers.new_zeros( + len(temporal_offsets), batch_size, self.mem_dim, dtype=object_pointers.dtype + ) + + if self.mem_dim < num_channels: + # If memory dimension is smaller, reshape/split pointers and repeat positional encoding + num_splits = num_channels // self.mem_dim + object_pointers = object_pointers.reshape(-1, batch_size, num_splits, self.mem_dim) + object_pointers = object_pointers.permute(0, 2, 1, 3).flatten( + 0, 1 + ) # (SeqLen_ptr*num_splits, Batch, MemDim) + object_pointers_pos_embed = object_pointers_pos_embed.repeat_interleave(num_splits, dim=0) + + return object_pointers, object_pointers_pos_embed + + def _prepare_memory_conditioned_features( + self, + inference_session: EdgeTamVideoInferenceSession, + frame_idx: int, + obj_idx: int, + is_initial_conditioning_frame: bool, + current_vision_features: list[torch.Tensor], + current_vision_positional_embeddings: list[torch.Tensor], + num_total_frames: int, + track_in_reverse_time: bool = False, + streaming: bool = False, + ) -> torch.Tensor: + """ + Fuse current frame's visual features with memory from previous frames for enhanced object tracking. + + This method conditions the current frame's visual features on temporal memory from previous frames, + enabling consistent object tracking across video sequences. For initial conditioning frames, it uses + no-memory embeddings. For subsequent frames, it retrieves and integrates memory features from both + conditioning frames (user interactions) and non-conditioning frames (tracked results) via cross-attention. + + Args: + inference_session (`EdgeTamVideoInferenceSession`): + The video inference session object. + frame_idx (`int`): + Index of the current frame being processed. + obj_idx (`int`): + Index of the object being processed. + is_initial_conditioning_frame (`bool`): + Whether this is an initial conditioning frame with user inputs (True) or a subsequent + tracking frame (False). 
+ current_vision_features (`torch.Tensor`): + Highest-level vision features of shape `(seq_len, batch_size, channels)`. + current_vision_positional_embeddings (`torch.Tensor`): + Positional embedding tensors corresponding to the highest-level vision features. + num_total_frames (`int`): + Total number of frames in the video sequence. + track_in_reverse_time (`bool`, *optional*, defaults to `False`): + Whether tracking is performed in reverse temporal order. + streaming (`bool`, *optional*, defaults to `False`): + Whether this is streaming inference mode. + + Returns: + `torch.Tensor`: Memory-conditioned feature tensor of shape `(batch_size, channels, height, width)` + suitable for input to the SAM decoder. + """ + # Get dimensions from the highest-level (lowest-resolution) feature map + batch_size = current_vision_features.size(1) + num_channels = self.hidden_dim + height, width = self.backbone_feature_sizes[-1] + device = current_vision_features.device + + # If memory is disabled (e.g., for single image SAM), return current features directly. + if self.num_maskmem == 0: + # Permute (SeqLen, Batch, Channels) -> (Batch, Channels, SeqLen) then view as (Batch, Channels, Height, Width) + # Assuming SeqLen = Height * Width for the last feature map + current_feature_map = current_vision_features.permute(1, 2, 0).view( + batch_size, num_channels, height, width + ) + return current_feature_map + + # Step 1: Handle initial conditioning frames + if is_initial_conditioning_frame: + # For initial conditioning frames, no prior memory is used directly in this block. + # If configured, directly add a learnable "no memory" embedding. + # current_vision_features has shape (SeqLen, Batch, Channels) + conditioned_feature_map_flat = current_vision_features + self.no_memory_embedding + # Reshape to (Batch, Channels, Height, Width) + conditioned_feature_map = conditioned_feature_map_flat.permute(1, 2, 0).view( + batch_size, num_channels, height, width + ) + return conditioned_feature_map + + # Step 2: Get memory frames and concatenate their features + temporal_positions_and_previous_outputs = self._gather_memory_frame_outputs( + inference_session, obj_idx, frame_idx, track_in_reverse_time + ) + + memories_to_concatenate, memory_positional_embeddings_to_concatenate = self._build_memory_attention_inputs( + temporal_positions_and_previous_outputs, device + ) + num_spatial_memory_tokens = len(memories_to_concatenate) + + # Step 3: Get and process object pointers + temporal_offsets, pointer_tokens, max_object_pointers_to_use = self._get_object_pointers( + inference_session, obj_idx, frame_idx, num_total_frames, device, track_in_reverse_time, streaming + ) + + num_object_pointer_tokens = 0 + if pointer_tokens: + object_pointers, object_pointers_pos_embed = self._process_object_pointers( + temporal_offsets, pointer_tokens, max_object_pointers_to_use, batch_size, num_channels, device + ) + + if object_pointers is not None: + memories_to_concatenate.append(object_pointers) + memory_positional_embeddings_to_concatenate.append(object_pointers_pos_embed) + num_object_pointer_tokens = object_pointers.shape[0] + + # Step 4: Concatenate all retrieved memories and their positional embeddings + combined_memory = torch.cat(memories_to_concatenate, dim=0) + combined_memory_positional_embeddings = torch.cat(memory_positional_embeddings_to_concatenate, dim=0) + + # Step 5: Forward through the memory attention mechanism + conditioned_feature_map_flat = self.memory_attention( + current_vision_features=current_vision_features, + 
current_vision_position_embeddings=current_vision_positional_embeddings, + memory=combined_memory, + memory_posision_embeddings=combined_memory_positional_embeddings, # Corrected typo from API + num_object_pointer_tokens=num_object_pointer_tokens, + num_spatial_memory_tokens=num_spatial_memory_tokens, + ) + + # Reshape from (Batch, H*W, Channels) to (Batch, Channels, Height, Width) + conditioned_feature_map = ( + conditioned_feature_map_flat.squeeze(1).permute(0, 2, 1).view(batch_size, num_channels, height, width) + ) + return conditioned_feature_map + + def _use_multimask(self, is_init_cond_frame: bool, point_inputs: Optional[dict]) -> bool: + """Whether to use multimask output in the SAM head.""" + num_pts = 0 if point_inputs is None else point_inputs["point_labels"].size(2) + multimask_output = ( + self.config.multimask_output_in_sam + and (is_init_cond_frame or self.config.multimask_output_for_tracking) + and (self.config.multimask_min_pt_num <= num_pts <= self.config.multimask_max_pt_num) + ) + return multimask_output + + def _run_single_frame_inference( + self, + inference_session: EdgeTamVideoInferenceSession, + frame_idx: int, + obj_idx: int, + batch_size: int, + is_init_cond_frame: bool, + point_inputs: Optional[torch.Tensor], + mask_inputs: Optional[torch.Tensor], + reverse: bool, + run_mem_encoder: bool, + prev_sam_mask_logits: Optional[torch.Tensor] = None, + streaming: bool = False, + ) -> dict[str, Any]: + """ + Perform a single tracking step for video object segmentation. + + Args: + inference_session (`EdgeTamVideoInferenceSession`): + The video inference session object. + frame_idx (`int`): + Index of the current frame. + obj_idx (`int`): + Index of the current object. + batch_size (`int`): + Batch size of the current frame. + is_init_cond_frame (`bool`): + Whether this is an initial conditioning frame with user inputs. + point_inputs (`dict`, *optional*): + Point prompt inputs for the current frame. + mask_inputs (`torch.Tensor`, *optional*): + Mask prompt inputs for the current frame. + reverse (`bool`, *optional*, defaults to `False`): + Whether to track in reverse time order. + run_mem_encoder (`bool`, *optional*, defaults to `True`): + Whether to run the memory encoder on predicted masks. + prev_sam_mask_logits (`torch.Tensor`, *optional*): + Previously predicted SAM mask logits that can be fed with new clicks. + streaming (`bool`, *optional*, defaults to `False`): + Whether this is streaming inference. + + Returns: + `dict`: Dictionary containing the tracking results for the current frame, including: + - pred_masks: Predicted low-resolution masks. + - object_pointer: Object pointer for memory. + - object_score_logits: Object score logits (inference only). + - maskmem_features: Memory features for future frames. + - maskmem_pos_enc: Memory positional encodings. 
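+                Both `maskmem_features` and `maskmem_pos_enc` are `None` when `run_mem_encoder` is `False` or
+                `num_maskmem` is 0.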
+ """ + # Retrieve correct image features + current_vision_feats, current_vision_pos_embeds = self._prepare_vision_features( + inference_session, frame_idx, batch_size + ) + # point and mask should not appear as input simultaneously on the same frame + if point_inputs is not None and mask_inputs is not None: + raise ValueError( + "point_inputs and mask_inputs should not appear as input simultaneously on the same frame" + ) + # High-resolution feature maps for the SAM head, reshape (HW)BC => BCHW + if len(current_vision_feats) > 1: + high_res_features = [ + x.permute(1, 2, 0).view(x.size(1), x.size(2), *s) + for x, s in zip(current_vision_feats[:-1], self.backbone_feature_sizes[:-1]) + ] + else: + high_res_features = None + if mask_inputs is not None: + # We directly output the mask input (see it as a GT mask) without using a SAM prompt encoder + mask decoder. + pix_feat = current_vision_feats[-1].permute(1, 2, 0) + pix_feat = pix_feat.view(-1, self.hidden_dim, *self.backbone_feature_sizes[-1]) + sam_outputs = self._use_mask_as_output(pix_feat, high_res_features, mask_inputs) + else: + # fused the visual feature with previous memory features in the memory bank + pix_feat = self._prepare_memory_conditioned_features( + inference_session=inference_session, + frame_idx=frame_idx, + obj_idx=obj_idx, + is_initial_conditioning_frame=is_init_cond_frame, + current_vision_features=current_vision_feats[-1], + current_vision_positional_embeddings=current_vision_pos_embeds[-1], + num_total_frames=inference_session.num_frames, + track_in_reverse_time=reverse, + streaming=streaming, + ) + # apply SAM-style segmentation head + # here we might feed previously predicted low-res SAM mask logits into the SAM mask decoder, + # e.g. in demo where such logits come from earlier interaction instead of correction sampling + # (in this case, any `mask_inputs` shouldn't reach here as they are sent to _use_mask_as_output instead) + if prev_sam_mask_logits is not None: + mask_inputs = prev_sam_mask_logits + multimask_output = self._use_multimask(is_init_cond_frame, point_inputs) + sam_outputs = self._single_frame_forward( + pixel_values=None, # Vision features already computed + input_points=point_inputs["point_coords"] if point_inputs is not None else None, + input_labels=point_inputs["point_labels"] if point_inputs is not None else None, + input_masks=mask_inputs, + image_embeddings=high_res_features + [pix_feat], + multimask_output=multimask_output, + ) + + # Finally run the memory encoder on the predicted mask to encode + # it into a new memory feature (which will be used to condition vision features in future frames) + maskmem_features = None + maskmem_pos_enc = None + if run_mem_encoder and self.num_maskmem > 0: + maskmem_features, maskmem_pos_enc = self._encode_new_memory( + current_vision_feats=current_vision_feats[-1], + pred_masks_high_res=sam_outputs.high_res_masks, + object_score_logits=sam_outputs.object_score_logits, + is_mask_from_pts=(point_inputs is not None or mask_inputs is not None), + ) + + current_out = { + "pred_masks": sam_outputs.pred_masks, + "object_pointer": sam_outputs.object_pointer, + "maskmem_features": maskmem_features if maskmem_features is not None else None, + "maskmem_pos_enc": maskmem_pos_enc, + } + if not self.training: + current_out["object_score_logits"] = sam_outputs.object_score_logits + + return current_out + + def _encode_new_memory( + self, + current_vision_feats: torch.Tensor, + pred_masks_high_res: torch.Tensor, + object_score_logits: torch.Tensor, + is_mask_from_pts: 
bool, + ) -> tuple[torch.Tensor, list[torch.Tensor]]: + """Encode the current image and its prediction into a memory feature.""" + batch_size = current_vision_feats.size(1) # batch size on this frame + channels = self.hidden_dim + height, width = self.backbone_feature_sizes[-1] # top-level (lowest-resolution) feature size + # top-level feature, (HW)BC => BCHW + pix_feat = current_vision_feats.permute(1, 2, 0).view(batch_size, channels, height, width) + if is_mask_from_pts and not self.training: + # binarize the mask logits + mask_for_mem = (pred_masks_high_res > 0).to(pred_masks_high_res.dtype) + else: + # apply sigmoid on the raw mask logits to turn them into range (0, 1) + mask_for_mem = torch.sigmoid(pred_masks_high_res) + # apply scale and bias terms to the sigmoid probabilities + mask_for_mem = mask_for_mem * self.config.sigmoid_scale_for_mem_enc + mask_for_mem = mask_for_mem + self.config.sigmoid_bias_for_mem_enc + + maskmem_features, maskmem_pos_enc = self.memory_encoder( + pix_feat, + mask_for_mem, + ) + # add a no-object embedding to the spatial memory to indicate that the frame + # is predicted to be occluded (i.e. no object is appearing in the frame) + if self.occlusion_spatial_embedding_parameter is not None: + is_obj_appearing = (object_score_logits > 0).float() + maskmem_features += (1 - is_obj_appearing[..., None]) * self.occlusion_spatial_embedding_parameter[ + ..., None, None + ].expand(*maskmem_features.shape) + + maskmem_pos_enc = maskmem_pos_enc.to(pred_masks_high_res.dtype) + maskmem_features, maskmem_pos_enc = self.spatial_perceiver(maskmem_features, maskmem_pos_enc) + maskmem_features = maskmem_features.to(pred_masks_high_res.dtype) + maskmem_pos_enc = maskmem_pos_enc.to(pred_masks_high_res.dtype) + + return maskmem_features, maskmem_pos_enc + + @torch.inference_mode() + @auto_docstring( + custom_intro=""" + Propagate the objects through the video frames. Used when initializing an inference session with a whole video. + Yields EdgeTamVideoSegmentationOutput for each frame. + """ + ) + def propagate_in_video_iterator( + self, + inference_session: EdgeTamVideoInferenceSession, + start_frame_idx: Optional[int] = None, + max_frame_num_to_track: Optional[int] = None, + reverse: bool = False, + ) -> Iterator[EdgeTamVideoSegmentationOutput]: + r""" + inference_session (`EdgeTamVideoInferenceSession`): + The video inference session object. + start_frame_idx (`int`, *optional*): + The starting frame index for propagation. + Need to be provided if `forward` hasn't been called on new inputs yet. + If not provided, the starting frame index will be the earliest frame with input points. + max_frame_num_to_track (`int`, *optional*): + The maximum number of frames to track. + reverse (`bool`, *optional*, defaults to `False`): + Whether to propagate in reverse. + """ + num_frames = inference_session.num_frames + + # set start index, end index, and processing order + if start_frame_idx is None: + # default: start from the earliest frame with input points + frames_with_inputs = [ + frame_idx + for obj_output_dict in inference_session.output_dict_per_obj.values() + for frame_idx in obj_output_dict["cond_frame_outputs"] + ] + if not frames_with_inputs: + raise ValueError( + "Cannot determine the starting frame index; please specify it manually, or run inference on a frame with inputs first." 
+ ) + start_frame_idx = min(frames_with_inputs) + if max_frame_num_to_track is None: + # default: track all the frames in the video + max_frame_num_to_track = num_frames + if reverse: + end_frame_idx = max(start_frame_idx - max_frame_num_to_track, 0) + if start_frame_idx > 0: + processing_order = range(start_frame_idx, end_frame_idx - 1, -1) + else: + processing_order = [] # skip reverse tracking if starting from frame 0 + else: + end_frame_idx = min(start_frame_idx + max_frame_num_to_track, num_frames - 1) + processing_order = range(start_frame_idx, end_frame_idx + 1) + + for frame_idx in tqdm(processing_order, desc="propagate in video"): + edgetam_video_output = self(inference_session, frame_idx=frame_idx, reverse=reverse) + yield edgetam_video_output + + +__all__ = ["EdgeTamVideoModel", "EdgeTamVideoInferenceSession", "EdgeTamVideoPreTrainedModel"] diff --git a/src/transformers/models/edgetam_video/modular_edgetam_video.py b/src/transformers/models/edgetam_video/modular_edgetam_video.py new file mode 100644 index 000000000000..b520cd5a756b --- /dev/null +++ b/src/transformers/models/edgetam_video/modular_edgetam_video.py @@ -0,0 +1,1243 @@ +# coding=utf-8 +# Copyright 2025 the HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import Callable, Optional + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch import Tensor + +from transformers.models.sam2.modeling_sam2 import ( + eager_attention_forward, + window_partition, +) +from transformers.utils.generic import OutputRecorder + +from ...activations import ACT2FN +from ...configuration_utils import PretrainedConfig +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS +from ...processing_utils import Unpack +from ...pytorch_utils import compile_compatible_method_lru_cache +from ...utils import ( + auto_docstring, +) +from ..auto import CONFIG_MAPPING, AutoConfig +from ..sam2_video.configuration_sam2_video import ( + Sam2VideoConfig, + Sam2VideoMaskDecoderConfig, + Sam2VideoPromptEncoderConfig, +) +from ..sam2_video.modeling_sam2_video import ( + Sam2VideoAttention, + Sam2VideoFeedForward, + Sam2VideoInferenceSession, + Sam2VideoLayerNorm, + Sam2VideoMemoryAttention, + Sam2VideoMemoryEncoder, + Sam2VideoMemoryFuserCXBlock, + Sam2VideoModel, + Sam2VideoPositionEmbeddingSine, + Sam2VideoPreTrainedModel, + Sam2VideoTwoWayAttentionBlock, + Sam2VideoVisionEncoderOutput, + Sam2VideoVisionRotaryEmbedding, + rotate_pairwise, +) + + +class EdgeTamVideoPromptEncoderConfig(Sam2VideoPromptEncoderConfig): + pass + + +class EdgeTamVideoMaskDecoderConfig(Sam2VideoMaskDecoderConfig): + pass + + +class EdgeTamVideoConfig(Sam2VideoConfig): + r""" + [`EdgeTamVideoConfig`] is the configuration class to store the configuration of a [`EdgeTamVideoModel`]. 
It is used to instantiate a + EDGETAM model according to the specified arguments, defining the memory attention, memory encoder, and image encoder + configs. Instantiating a configuration defaults will yield a similar configuration to that of the SAM 2.1 Hiera-tiny + [facebook/EdgeTAM](https://huggingface.co/facebook/EdgeTAM) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (Union[`dict`, `EdgeTamVideoVisionConfig`], *optional*): + Dictionary of configuration options used to initialize [`EdgeTamVideoVisionConfig`]. + prompt_encoder_config (Union[`dict`, `EdgeTamVideoPromptEncoderConfig`], *optional*): + Dictionary of configuration options used to initialize [`EdgeTamVideoPromptEncoderConfig`]. + mask_decoder_config (Union[`dict`, `EdgeTamVideoMaskDecoderConfig`], *optional*): + Dictionary of configuration options used to initialize [`EdgeTamMaskDecoderConfig`]. + initializer_range (`float`, *optional*, defaults to 0.02): + Standard deviation for parameter initialization. + num_maskmem (`int`, *optional*, defaults to 7): + The number of memory slots for the mask memory. + image_size (`int`, *optional*, defaults to 1024): + The size of the input images. + sigmoid_scale_for_mem_enc (`float`, *optional*, defaults to 20.0): + Scale factor for the sigmoid function in the memory encoder. + sigmoid_bias_for_mem_enc (`float`, *optional*, defaults to -10.0): + Bias for the sigmoid function in the memory encoder. + enable_occlusion_spatial_embedding (`bool`, *optional*, defaults to `True`): + Whether to enable spatial embedding for occlusions. + multimask_output_in_sam (`bool`, *optional*, defaults to `True`): + Whether to output multiple masks from the SAM head. + multimask_min_pt_num (`int`, *optional*, defaults to 0): + The minimum number of points to trigger multimask output. + multimask_max_pt_num (`int`, *optional*, defaults to 1): + The maximum number of points to trigger multimask output. + multimask_output_for_tracking (`bool`, *optional*, defaults to `True`): + Whether to use multimask output for tracking. + max_object_pointers_in_encoder (`int`, *optional*, defaults to 16): + The maximum number of object pointers in the encoder. + enable_temporal_pos_encoding_for_object_pointers (`bool`, *optional*, defaults to `True`): + Whether to enable temporal positional encoding for object pointers. + memory_attention_hidden_size (`int`, *optional*, defaults to 256): + Dimensionality of the memory attention hidden states. + memory_attention_num_layers (`int`, *optional*, defaults to 2): + The number of layers in the memory attention module. + memory_attention_num_attention_heads (`int`, *optional*, defaults to 1): + Number of attention heads for each attention layer in the memory attention. + memory_attention_downsample_rate (`int`, *optional*, defaults to 1): + The downsample rate for the attention layers. + memory_attention_mlp_hidden_size (`int`, *optional*, defaults to 2048): + The dimension of the feedforward network in the memory attention module. + memory_attention_mlp_hidden_act (`str`, *optional*, defaults to `"relu"`): + The non-linear activation function in the feedforward network in the memory attention module. + memory_attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout rate for the memory attention module. 
+ memory_attention_rope_theta (`float`, *optional*, defaults to 10000): + The Rope theta parameter. + memory_attention_rope_feat_sizes (`Tuple[int, int]`, *optional*, defaults to `[64, 64]`): + The feature sizes for the Rope positional encoding. + memory_attention_rope_k_sizes (`List[int]`, *optional*, defaults to `[16, 16]`): + The key feature sizes for the RoPE positional encoding in memory attention. + memory_attention_rope_dropout (`float`, *optional*, defaults to 0.1): + The dropout rate for the Rope positional encoding. + perceiver_resampler_num_latents (`int`, *optional*, defaults to 256): + The number of 1D latent tokens in the perceiver resampler. + perceiver_resampler_num_latents_2d (`int`, *optional*, defaults to 256): + The number of 2D latent tokens in the perceiver resampler. + perceiver_resampler_hidden_size (`int`, *optional*, defaults to 64): + The hidden size of the perceiver resampler. + perceiver_resampler_mlp_intermediate_size (`int`, *optional*, defaults to 256): + The intermediate size of the feedforward network in the perceiver resampler. + perceiver_resampler_num_attention_heads (`int`, *optional*, defaults to 1): + The number of attention heads in the perceiver resampler. + perceiver_resampler_attention_head_dim (`int`, *optional*, defaults to 64): + The dimension of each attention head in the perceiver resampler. + perceiver_resampler_num_layers (`int`, *optional*, defaults to 2): + The number of layers in the perceiver resampler. + perceiver_resampler_hidden_dropout (`float`, *optional*, defaults to 0.0): + The dropout rate for the hidden layers in the perceiver resampler. + perceiver_resampler_attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout rate for the attention layers in the perceiver resampler. + memory_encoder_hidden_size (`int`, *optional*, defaults to 256): + Dimensionality of the memory encoder hidden states. + memory_encoder_output_channels (`int`, *optional*, defaults to 64): + The number of output channels for the memory encoder. + mask_downsampler_embed_dim (`int`, *optional*, defaults to 256): + The dimension of the mask downsampler embedding. + memory_fuser_intermediate_dim (`int`, *optional*, defaults to 1024): + The intermediate dimension of the memory fuser feedforward network. + mask_downsampler_kernel_size (`int`, *optional*, defaults to 3): + The kernel size for the mask downsampler. + mask_downsampler_stride (`int`, *optional*, defaults to 2): + The stride for the mask downsampler. + mask_downsampler_padding (`int`, *optional*, defaults to 1): + The padding for the mask downsampler. + mask_downsampler_total_stride (`int`, *optional*, defaults to 16): + The total stride for the mask downsampler. + mask_downsampler_hidden_act (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function in the mask downsampler. + memory_fuser_num_layers (`int`, *optional*, defaults to 2): + The number of layers in the memory fuser. + memory_fuser_embed_dim (`int`, *optional*, defaults to 256): + The dimension of the memory fuser embedding. + memory_fuser_kernel_size (`int`, *optional*, defaults to 7): + The kernel size for the memory fuser. + memory_fuser_padding (`int`, *optional*, defaults to 3): + The padding for the memory fuser. + memory_fuser_layer_scale_init_value (`float`, *optional*, defaults to 1e-06): + The initial value for the layer scale in the memory fuser. + memory_fuser_hidden_act (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function in the memory fuser. 
+
+    Example:
+
+    ```python
+    >>> from transformers import (
+    ...     EdgeTamVisionConfig,
+    ...     EdgeTamVideoPromptEncoderConfig,
+    ...     EdgeTamVideoMaskDecoderConfig,
+    ...     EdgeTamVideoModel,
+    ...     EdgeTamVideoConfig,
+    ... )
+
+    >>> # Initializing an EdgeTamVideoConfig with `"facebook/EdgeTAM"` style configuration
+    >>> configuration = EdgeTamVideoConfig()
+
+    >>> # Initializing an EdgeTamVideoModel (with random weights) from the `"facebook/EdgeTAM"` style configuration
+    >>> model = EdgeTamVideoModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+
+    >>> # We can also initialize an EdgeTamVideoConfig from an EdgeTamVisionConfig, EdgeTamVideoPromptEncoderConfig, and EdgeTamVideoMaskDecoderConfig
+
+    >>> # Initializing EDGETAM vision encoder, memory attention, and memory encoder configurations
+    >>> vision_config = EdgeTamVisionConfig()
+    >>> prompt_encoder_config = EdgeTamVideoPromptEncoderConfig()
+    >>> mask_decoder_config = EdgeTamVideoMaskDecoderConfig()
+
+    >>> config = EdgeTamVideoConfig(vision_config, prompt_encoder_config, mask_decoder_config)
+    ```"""
+
+    model_type = "edgetam_video"
+    sub_configs = {
+        "vision_config": AutoConfig,
+        "prompt_encoder_config": EdgeTamVideoPromptEncoderConfig,
+        "mask_decoder_config": EdgeTamVideoMaskDecoderConfig,
+    }
+
+    def __init__(
+        self,
+        vision_config=None,
+        prompt_encoder_config=None,
+        mask_decoder_config=None,
+        initializer_range=0.02,
+        num_maskmem=7,
+        image_size=1024,
+        sigmoid_scale_for_mem_enc=20.0,
+        sigmoid_bias_for_mem_enc=-10.0,
+        enable_occlusion_spatial_embedding=True,
+        multimask_output_in_sam=True,
+        multimask_min_pt_num=0,
+        multimask_max_pt_num=1,
+        multimask_output_for_tracking=True,
+        max_object_pointers_in_encoder=16,
+        enable_temporal_pos_encoding_for_object_pointers=True,
+        # memory attention
+        memory_attention_hidden_size=256,
+        memory_attention_num_layers=2,
+        memory_attention_num_attention_heads=1,
+        memory_attention_downsample_rate=1,
+        memory_attention_mlp_hidden_size=2048,
+        memory_attention_mlp_hidden_act="relu",
+        memory_attention_dropout=0.1,
+        memory_attention_rope_theta=10000,
+        memory_attention_rope_feat_sizes=None,
+        memory_attention_rope_k_sizes=None,
+        memory_attention_rope_dropout=0.1,
+        # spatial perceiver resampler
+        perceiver_resampler_num_latents=256,
+        perceiver_resampler_num_latents_2d=256,
+        perceiver_resampler_hidden_size=64,
+        perceiver_resampler_mlp_intermediate_size=256,
+        perceiver_resampler_num_attention_heads=1,
+        perceiver_resampler_attention_head_dim=64,
+        perceiver_resampler_num_layers=2,
+        perceiver_resampler_hidden_dropout=0.0,
+        perceiver_resampler_attention_dropout=0.0,
+        # memory encoder
+        memory_encoder_hidden_size=256,
+        memory_encoder_output_channels=64,
+        mask_downsampler_embed_dim=256,
+        memory_fuser_intermediate_dim=1024,
+        mask_downsampler_kernel_size=3,
+        mask_downsampler_stride=2,
+        mask_downsampler_padding=1,
+        mask_downsampler_total_stride=16,
+        mask_downsampler_hidden_act="gelu",
+        memory_fuser_num_layers=2,
+        memory_fuser_embed_dim=256,
+        memory_fuser_kernel_size=7,
+        memory_fuser_padding=3,
+        memory_fuser_layer_scale_init_value=1e-6,
+        memory_fuser_hidden_act="gelu",
+        **kwargs,
+    ):
+        PretrainedConfig.__init__(self, **kwargs)
+        vision_config = vision_config if vision_config is not None else {}
+        prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {}
+        mask_decoder_config = mask_decoder_config if mask_decoder_config is not None else {}
+        memory_attention_rope_feat_sizes = (
+            [64, 64] if
memory_attention_rope_feat_sizes is None else memory_attention_rope_feat_sizes + ) + memory_attention_rope_k_sizes = ( + [16, 16] if memory_attention_rope_k_sizes is None else memory_attention_rope_k_sizes + ) + + if isinstance(vision_config, dict): + vision_config["model_type"] = vision_config.get("model_type", "sam2_vision_model") + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + if isinstance(prompt_encoder_config, EdgeTamVideoPromptEncoderConfig): + prompt_encoder_config = prompt_encoder_config.to_dict() + if isinstance(mask_decoder_config, EdgeTamVideoMaskDecoderConfig): + mask_decoder_config = mask_decoder_config.to_dict() + + self.vision_config = vision_config + self.prompt_encoder_config = EdgeTamVideoPromptEncoderConfig(**prompt_encoder_config) + self.mask_decoder_config = EdgeTamVideoMaskDecoderConfig(**mask_decoder_config) + + self.initializer_range = initializer_range + self.num_maskmem = num_maskmem # default 1 input frame + 6 previous frames + self.image_size = image_size + self.sigmoid_scale_for_mem_enc = sigmoid_scale_for_mem_enc # scale factor for mask sigmoid prob + self.sigmoid_bias_for_mem_enc = sigmoid_bias_for_mem_enc # bias factor for mask sigmoid prob + self.enable_occlusion_spatial_embedding = enable_occlusion_spatial_embedding + self.multimask_output_in_sam = multimask_output_in_sam + self.multimask_min_pt_num = multimask_min_pt_num + self.multimask_max_pt_num = multimask_max_pt_num + self.multimask_output_for_tracking = multimask_output_for_tracking + self.max_object_pointers_in_encoder = max_object_pointers_in_encoder + self.enable_temporal_pos_encoding_for_object_pointers = enable_temporal_pos_encoding_for_object_pointers + + # memory attention + self.memory_attention_hidden_size = memory_attention_hidden_size + self.memory_attention_num_layers = memory_attention_num_layers + self.memory_attention_num_attention_heads = memory_attention_num_attention_heads + self.memory_attention_downsample_rate = memory_attention_downsample_rate + self.memory_attention_mlp_hidden_size = memory_attention_mlp_hidden_size + self.memory_attention_mlp_hidden_act = memory_attention_mlp_hidden_act + self.memory_attention_dropout = memory_attention_dropout + self.memory_attention_rope_theta = memory_attention_rope_theta + self.memory_attention_rope_feat_sizes = memory_attention_rope_feat_sizes + self.memory_attention_rope_k_sizes = memory_attention_rope_k_sizes + self.memory_attention_rope_dropout = memory_attention_rope_dropout + + # spatial perceiver resampler + self.perceiver_resampler_num_latents = perceiver_resampler_num_latents + self.perceiver_resampler_num_latents_2d = perceiver_resampler_num_latents_2d + self.perceiver_resampler_hidden_size = perceiver_resampler_hidden_size + self.perceiver_resampler_mlp_intermediate_size = perceiver_resampler_mlp_intermediate_size + self.perceiver_resampler_attention_head_dim = perceiver_resampler_attention_head_dim + self.perceiver_resampler_num_attention_heads = perceiver_resampler_num_attention_heads + self.perceiver_resampler_num_layers = perceiver_resampler_num_layers + self.perceiver_resampler_hidden_dropout = perceiver_resampler_hidden_dropout + self.perceiver_resampler_attention_dropout = perceiver_resampler_attention_dropout + + # memory encoder + self.memory_encoder_hidden_size = memory_encoder_hidden_size + self.memory_encoder_output_channels = memory_encoder_output_channels + self.mask_downsampler_embed_dim = mask_downsampler_embed_dim + self.mask_downsampler_kernel_size = 
mask_downsampler_kernel_size
+        self.mask_downsampler_stride = mask_downsampler_stride
+        self.mask_downsampler_padding = mask_downsampler_padding
+        self.mask_downsampler_total_stride = mask_downsampler_total_stride
+        self.mask_downsampler_hidden_act = mask_downsampler_hidden_act
+        self.memory_fuser_num_layers = memory_fuser_num_layers
+        self.memory_fuser_embed_dim = memory_fuser_embed_dim
+        self.memory_fuser_intermediate_dim = memory_fuser_intermediate_dim
+        self.memory_fuser_kernel_size = memory_fuser_kernel_size
+        self.memory_fuser_padding = memory_fuser_padding
+        self.memory_fuser_layer_scale_init_value = memory_fuser_layer_scale_init_value
+        self.memory_fuser_hidden_act = memory_fuser_hidden_act
+
+
+class EdgeTamVideoLayerNorm(Sam2VideoLayerNorm):
+    pass
+
+
+class EdgeTamVideoMemoryFuserCXBlock(Sam2VideoMemoryFuserCXBlock):
+    pass
+
+
+class EdgeTamVideoVisionEncoderOutput(Sam2VideoVisionEncoderOutput):
+    pass
+
+
+class EdgeTamVideoVisionRotaryEmbedding(Sam2VideoVisionRotaryEmbedding):
+    def __init__(self, config: EdgeTamVideoConfig, end_x: Optional[int] = None, end_y: Optional[int] = None):
+        nn.Module.__init__(self)
+        dim = config.memory_attention_hidden_size // (
+            config.memory_attention_downsample_rate * config.memory_attention_num_attention_heads
+        )
+        # The head dimension must be divisible by 4 for proper axial (x/y) splitting
+        if dim % 4 != 0:
+            raise ValueError("Dimension must be divisible by 4 for axial RoPE")
+        end_x, end_y = config.memory_attention_rope_feat_sizes if end_x is None else (end_x, end_y)
+        freqs = 1.0 / (config.memory_attention_rope_theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
+
+        # Generate 2D position indices for axial rotary embedding
+        flattened_indices = torch.arange(end_x * end_y, dtype=torch.long)
+        x_positions = flattened_indices % end_x
+        y_positions = torch.div(flattened_indices, end_x, rounding_mode="floor")
+        freqs_x = torch.outer(x_positions, freqs).float()
+        freqs_y = torch.outer(y_positions, freqs).float()
+        inv_freq = torch.cat([freqs_x, freqs_y], dim=-1)
+        inv_freq = inv_freq.repeat_interleave(2, dim=-1)
+        # directly register the cos and sin embeddings as we have a fixed feature shape
+        self.register_buffer("rope_embeddings_cos", inv_freq.cos(), persistent=False)
+        self.register_buffer("rope_embeddings_sin", inv_freq.sin(), persistent=False)
+
+
+class EdgeTamVideoAttention(Sam2VideoAttention):
+    pass
+
+
+def apply_rotary_pos_emb_2d_self_attn(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply rotary position embedding to query and key tensors for self-attention.
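+    Both queries and keys receive the same pairwise rotation, x * cos + rotate_pairwise(x) * sin, computed in
+    float32 and cast back to the input dtype.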
+ + Args: + q: Query tensor of shape (..., seq_len, head_dim) + k: Key tensor of shape (..., seq_len, head_dim) + cos: Cosine position embedding of shape (seq_len, head_dim) + sin: Sine position embedding of shape (seq_len, head_dim) + + Returns: + Rotated (q, k) tensors + """ + # Apply RoPE to queries + q_embed = q.float() # force upscale to float32 as in the original implementation + q_embed = (q_embed * cos) + (rotate_pairwise(q_embed) * sin) + + # Apply RoPE to keys (same embeddings as queries for self-attention) + k_embed = k.float() # force upscale to float32 as in the original implementation + k_embed = (k_embed * cos) + (rotate_pairwise(k_embed) * sin) + + return q_embed.type_as(q), k_embed.type_as(k) + + +def apply_rotary_pos_emb_2d_cross_attn( + q: torch.Tensor, + k: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + cos_k: torch.Tensor, + sin_k: torch.Tensor, + num_k_exclude_rope: int = 0, + repeat_freqs_k: int = 1, +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Apply rotary position embedding to query and key tensors for cross-attention. + + Args: + q: Query tensor of shape (..., seq_len, head_dim) + k: Key tensor of shape (..., seq_len, head_dim) + cos: Cosine position embedding of shape (seq_len, head_dim) + sin: Sine position embedding of shape (seq_len, head_dim) + cos_k: Cosine position embedding for keys of shape (seq_len, head_dim) + sin_k: Sine position embedding for keys of shape (seq_len, head_dim) + num_k_exclude_rope: Number of tokens at end of k to exclude from RoPE (e.g., object pointer tokens) + repeat_freqs_k: Frequency repetition for keys in cross-attention (e.g., for spatial memory tokens) + + Returns: + Rotated (q, k) tensors + """ + # Apply RoPE to queries (always straightforward) + q_embed = q.float() + q_embed = (q_embed * cos) + (rotate_pairwise(q_embed) * sin) + + # Split keys: RoPE tokens and excluded tokens (e.g., object pointers) + num_total_k_tokens = k.shape[-2] + k_for_rope = k[..., : num_total_k_tokens - num_k_exclude_rope, :] + k_excluded = k[..., num_total_k_tokens - num_k_exclude_rope :, :] + + # Early return if no keys need RoPE + if k_for_rope.shape[-2] == 0: + return q_embed.type_as(q), k_excluded + + batch_size, num_heads, k_seq_len, channels_per_head = k_for_rope.shape + + # Handle temporal/spatial token structure for memory + # Keys have temporal + spatial structure, only spatial tokens get RoPE + tokens_per_group = k_seq_len // repeat_freqs_k + spatial_tokens = cos_k.shape[-2] + temporal_tokens = tokens_per_group - spatial_tokens + + # Reshape and separate temporal/spatial tokens + k_grouped = k_for_rope.view(batch_size, num_heads, repeat_freqs_k, tokens_per_group, channels_per_head) + k_temporal = k_grouped[..., :temporal_tokens, :].reshape(batch_size, num_heads, -1, channels_per_head) + k_spatial = k_grouped[..., temporal_tokens:, :].reshape(batch_size, num_heads, -1, channels_per_head) + + # Only apply RoPE to spatial tokens + k_rope_input = k_spatial + + # Prepare position embeddings for repeated groups + if repeat_freqs_k > 1: + cos_k = cos_k.repeat(1, 1, repeat_freqs_k, 1) + sin_k = sin_k.repeat(1, 1, repeat_freqs_k, 1) + + # Apply RoPE to spatial tokens + k_spatial_embed = k_rope_input.float() + k_spatial_embed = (k_spatial_embed * cos_k) + (rotate_pairwise(k_spatial_embed) * sin_k) + + # Reconstruct: temporal + spatial tokens back to original structure + k_spatial_reshaped = k_spatial_embed.view(batch_size, num_heads, repeat_freqs_k, -1, channels_per_head) + k_temporal_reshaped = k_temporal.view(batch_size, num_heads, 
repeat_freqs_k, -1, channels_per_head) + k_final = torch.cat([k_temporal_reshaped, k_spatial_reshaped], dim=3) + k_final = k_final.view(batch_size, num_heads, k_seq_len, channels_per_head) + + # Combine RoPE-processed keys with excluded tokens + k_embed = torch.cat([k_final.type_as(k), k_excluded], dim=-2) + return q_embed.type_as(q), k_embed + + +class EdgeTamVideoRoPESelfAttention(nn.Module): + """Self-attention with rotary position encoding.""" + + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.config = config + self.hidden_size = config.memory_attention_hidden_size + self.internal_dim = self.hidden_size // config.memory_attention_downsample_rate + self.num_attention_heads = config.memory_attention_num_attention_heads + self.head_dim = self.internal_dim // config.memory_attention_num_attention_heads + self.scaling = self.head_dim**-0.5 + self.is_causal = False + + self.q_proj = nn.Linear(self.hidden_size, self.internal_dim) + self.k_proj = nn.Linear(self.hidden_size, self.internal_dim) + self.v_proj = nn.Linear(self.hidden_size, self.internal_dim) + self.o_proj = nn.Linear(self.internal_dim, self.hidden_size) + self.dropout_p = config.memory_attention_rope_dropout + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Tensor: + # Input projections + batch_size, point_batch_size = query.shape[:2] + new_shape = (batch_size * point_batch_size, -1, self.num_attention_heads, self.head_dim) + + query = self.q_proj(query).view(*new_shape).transpose(1, 2) + key = self.k_proj(key).view(*new_shape).transpose(1, 2) + value = self.v_proj(value).view(*new_shape).transpose(1, 2) + + cos, sin = position_embeddings + # Apply rotary position encoding for self-attention + query, key = apply_rotary_pos_emb_2d_self_attn(query, key, cos=cos, sin=sin) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query, + key, + value, + attention_mask=None, + dropout=0.0 if not self.training else self.dropout_p, + scaling=self.scaling, + is_causal=self.is_causal, + **kwargs, + ) + attn_output = attn_output.reshape( + batch_size, point_batch_size, -1, self.num_attention_heads * self.head_dim + ).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class EdgeTamVideoRoPECrossAttention(nn.Module): + """Cross-attention with rotary position encoding.""" + + def __init__(self, config: EdgeTamVideoConfig, kv_in_dim: int): + super().__init__() + self.config = config + self.hidden_size = config.memory_attention_hidden_size + self.internal_dim = self.hidden_size // config.memory_attention_downsample_rate + self.num_attention_heads = config.memory_attention_num_attention_heads + self.head_dim = self.internal_dim // config.memory_attention_num_attention_heads + self.scaling = self.head_dim**-0.5 + self.is_causal = False + + self.kv_in_dim = kv_in_dim + + self.q_proj = nn.Linear(self.hidden_size, self.internal_dim) + self.k_proj = nn.Linear(self.kv_in_dim, self.internal_dim) + self.v_proj = nn.Linear(self.kv_in_dim, self.internal_dim) + self.o_proj = nn.Linear(self.internal_dim, self.hidden_size) + self.dropout_p = config.memory_attention_rope_dropout + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + 
value: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings_k: tuple[torch.Tensor, torch.Tensor], + num_k_exclude_rope: int = 0, + rope_k_repeat: int = 0, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Tensor: + # Input projections + batch_size, point_batch_size = query.shape[:2] + new_shape = (batch_size * point_batch_size, -1, self.num_attention_heads, self.head_dim) + + query = self.q_proj(query).view(*new_shape).transpose(1, 2) + key = self.k_proj(key).view(*new_shape).transpose(1, 2) + value = self.v_proj(value).view(*new_shape).transpose(1, 2) + + cos, sin = position_embeddings + cos_k, sin_k = position_embeddings_k + # Apply rotary position encoding for cross-attention + query, key = apply_rotary_pos_emb_2d_cross_attn( + query, + key, + cos=cos, + sin=sin, + cos_k=cos_k, + sin_k=sin_k, + repeat_freqs_k=rope_k_repeat, + num_k_exclude_rope=num_k_exclude_rope, + ) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query, + key, + value, + attention_mask=None, + dropout=0.0 if not self.training else self.dropout_p, + scaling=self.scaling, + is_causal=self.is_causal, + **kwargs, + ) + attn_output = attn_output.reshape( + batch_size, point_batch_size, -1, self.num_attention_heads * self.head_dim + ).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class EdgeTamVideoTwoWayAttentionBlock(Sam2VideoTwoWayAttentionBlock): + pass + + +class EdgeTamVideoPositionEmbeddingSine(Sam2VideoPositionEmbeddingSine): + # maxsize=2 because we need to cache the forward method for both memory encoder and perceiver resampler + @compile_compatible_method_lru_cache(maxsize=2) + def forward(self, **super_kwargs): + return super().forward(**super_kwargs) + + +class EdgeTamVideoMemoryEncoder(Sam2VideoMemoryEncoder): + pass + + +class EdgeTamVideoFeedForward(Sam2VideoFeedForward): + pass + + +class EdgeTamVideoPreTrainedModel(Sam2VideoPreTrainedModel): + pass + + +class EdgeTamVideoInferenceSession(Sam2VideoInferenceSession): + pass + + +class EdgeTamVideoMemoryAttentionMLP(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.config = config + self.hidden_size = config.memory_attention_hidden_size + self.intermediate_size = config.memory_attention_mlp_hidden_size + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size) + self.dropout = nn.Dropout(config.memory_attention_dropout) + self.act_fn = ACT2FN[config.memory_attention_mlp_hidden_act] + + def forward(self, x): + return self.down_proj(self.dropout(self.act_fn(self.up_proj(x)))) + + +class EdgeTamVideoMemoryAttentionLayer(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + hidden_size = config.memory_attention_hidden_size + self.self_attn = EdgeTamVideoRoPESelfAttention(config) + self.cross_attn_image = EdgeTamVideoRoPECrossAttention(config, kv_in_dim=64) + + # MLP module + self.mlp = EdgeTamVideoMemoryAttentionMLP(config) + + self.layer_norm1 = nn.LayerNorm(hidden_size) + self.layer_norm2 = nn.LayerNorm(hidden_size) + self.layer_norm3 = nn.LayerNorm(hidden_size) + self.dropout1 = nn.Dropout(config.memory_attention_dropout) + self.dropout2 = nn.Dropout(config.memory_attention_dropout) + self.dropout3 = 
nn.Dropout(config.memory_attention_dropout) + + def forward( + self, + queries: Tensor, + keys: Tensor, + key_point_embedding: Tensor, + rope_position_embeddings: tuple[Tensor, Tensor], + rope_position_embeddings_k: Optional[tuple[Tensor, Tensor]] = None, + num_k_exclude_rope: int = 0, + rope_k_repeat: int = 0, + ) -> torch.Tensor: + # Self-Attention + query = self.layer_norm1(queries) + query, _ = self.self_attn(query=query, key=query, value=query, position_embeddings=rope_position_embeddings) + queries = queries + self.dropout1(query) + + # Cross-Attention + query = self.layer_norm2(queries) + query, _ = self.cross_attn_image( + query=query, + key=keys + key_point_embedding, + value=keys, + position_embeddings=rope_position_embeddings, + position_embeddings_k=rope_position_embeddings_k, + num_k_exclude_rope=num_k_exclude_rope, + rope_k_repeat=rope_k_repeat, + ) + queries = queries + self.dropout2(query) + # MLP + query = self.layer_norm3(queries) + query = self.mlp(query) + queries = queries + self.dropout3(query) + return queries + + +class EdgeTamVideoMemoryAttention(Sam2VideoMemoryAttention): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.rotary_emb_k = EdgeTamVideoVisionRotaryEmbedding( + config, end_x=config.memory_attention_rope_k_sizes[0], end_y=config.memory_attention_rope_k_sizes[1] + ) + + def forward( + self, + current_vision_features: torch.Tensor, + memory: torch.Tensor, + current_vision_position_embeddings: Optional[Tensor] = None, + memory_posision_embeddings: Optional[Tensor] = None, + num_object_pointer_tokens: int = 0, + num_spatial_memory_tokens: int = -1, + ): + """ + Args: + current_vision_features (`torch.FloatTensor`): + The current vision features used for self-attention. + memory (`torch.FloatTensor`): + The memory features used for cross-attention. + current_vision_position_embeddings (`torch.FloatTensor`, *optional*): + The position embeddings for the current vision features. + memory_posision_embeddings (`torch.FloatTensor`, *optional*): + The position embeddings for the memory features. + num_object_pointer_tokens (`int`, *optional*, defaults to 0): + The number of object pointer tokens. 
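+            num_spatial_memory_tokens (`int`, *optional*, defaults to -1):
+                The number of spatial memory entries in `memory`; used as the rotary position embedding repetition
+                factor for the memory keys.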
+ """ + output = current_vision_features + if current_vision_position_embeddings is not None: + output = output + 0.1 * current_vision_position_embeddings + + # Convert to batch first + output = output.transpose(0, 1) + memory = memory.transpose(0, 1).unsqueeze(1) + memory_posision_embeddings = memory_posision_embeddings.transpose(0, 1).unsqueeze(1) + rope_position_embeddings = self.rotary_emb() + rope_position_embeddings_k = self.rotary_emb_k() + for layer in self.layers: + output = layer( + queries=output.unsqueeze(1) if output.ndim == 3 else output, + keys=memory, + key_point_embedding=memory_posision_embeddings, + rope_position_embeddings=rope_position_embeddings, + rope_position_embeddings_k=rope_position_embeddings_k, + num_k_exclude_rope=num_object_pointer_tokens, + rope_k_repeat=num_spatial_memory_tokens, + ) + + normed_output = self.layer_norm(output) + + # Convert back to seq first + normed_output = normed_output.transpose(0, 1) + + return normed_output + + +class EdgeTamVideoPerceiverMLP(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.hidden_size = config.perceiver_resampler_hidden_size + self.intermediate_size = config.perceiver_resampler_mlp_intermediate_size + + self.layer_norm = nn.LayerNorm(self.hidden_size) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = nn.GELU() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.down_proj(self.act_fn(self.up_proj(hidden_states))) + return hidden_states + + +class EdgeTamVideoPerceiverAttention(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.config = config + self.hidden_size = config.perceiver_resampler_hidden_size + self.num_attention_heads = config.perceiver_resampler_num_attention_heads + self.head_dim = config.perceiver_resampler_attention_head_dim + self.attention_dropout = config.perceiver_resampler_attention_dropout + + self.inner_dim = self.head_dim * self.num_attention_heads + self.scaling = self.head_dim**-0.5 + self.is_causal = False + + self.q_proj = nn.Linear(self.hidden_size, self.inner_dim, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.inner_dim, bias=False) + self.v_proj = nn.Linear(self.hidden_size, self.inner_dim, bias=False) + self.o_proj = nn.Linear(self.inner_dim, self.hidden_size, bias=False) + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + positional_encoding: Optional[torch.Tensor] = None, + **kwargs, + ) -> torch.Tensor: + # Project queries, keys, and values + query = self.q_proj(query) + key = self.k_proj(key) + value = self.v_proj(value) + + # Reshape for multi-head attention + batch_size, seq_len_q = query.shape[:2] + query = query.view(batch_size, seq_len_q, self.num_attention_heads, self.head_dim).transpose(1, 2) + seq_len_kv = key.shape[1] + key = key.view(batch_size, seq_len_kv, self.num_attention_heads, self.head_dim).transpose(1, 2) + value = value.view(batch_size, seq_len_kv, self.num_attention_heads, self.head_dim).transpose(1, 2) + + # Add positional encoding if provided + if positional_encoding is not None: + pos_encoding = positional_encoding.view( + batch_size, seq_len_kv, self.num_attention_heads, self.head_dim + ).transpose(1, 2) + key = key + pos_encoding + value = value + pos_encoding + + # Apply attention + attention_interface: 
Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, _ = attention_interface( + self, + query, + key, + value, + attention_mask=None, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + is_causal=self.is_causal, + **kwargs, + ) + + # Reshape output + attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len_q, self.inner_dim) + return self.o_proj(attn_output) + + +class EdgeTamVideoPerceiverEncoderLayer(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + + self.cross_attention = EdgeTamVideoPerceiverAttention(config) + self.mlp = EdgeTamVideoPerceiverMLP(config) + self.dropout = nn.Dropout(config.perceiver_resampler_hidden_dropout) + + self.self_attention = EdgeTamVideoPerceiverAttention(config) + self.self_mlp = EdgeTamVideoPerceiverMLP(config) + + # Layer norms moved from attention classes to here + self.layer_norm_input = nn.LayerNorm(config.perceiver_resampler_hidden_size) + self.layer_norm_latents = nn.LayerNorm(config.perceiver_resampler_hidden_size) + self.layer_norm_self = nn.LayerNorm(config.perceiver_resampler_hidden_size) + + def forward( + self, + latents: torch.Tensor, + input_features: torch.Tensor, + positional_encoding: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + # Cross attention with layer norms + normalized_latents = self.layer_norm_latents(latents) + normalized_input = self.layer_norm_input(input_features) + cross_attention_output = self.cross_attention( + query=normalized_latents, + key=normalized_input, + value=normalized_input, + positional_encoding=positional_encoding, + ) + latents = latents + self.dropout(cross_attention_output) + + mlp_output = self.mlp(latents) + latents = latents + mlp_output + + # Self attention with layer norm + normalized_latents_self = self.layer_norm_self(latents) + self_attention_output = self.self_attention( + query=normalized_latents_self, key=normalized_latents_self, value=normalized_latents_self + ) + latents = latents + self_attention_output + + self_mlp_output = self.self_mlp(latents) + latents = latents + self_mlp_output + + return latents + + +class EdgeTamVideoPerceiverResampler(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.config = config + self.hidden_size = config.perceiver_resampler_hidden_size + self.num_latents_1d = config.perceiver_resampler_num_latents + self.num_latents_2d = config.perceiver_resampler_num_latents_2d + self.num_layers = config.perceiver_resampler_num_layers + + if self.num_latents_1d > 0: + self.latents_1d = nn.Parameter(torch.randn(self.num_latents_1d, self.hidden_size)) + if self.num_latents_2d > 0: + self.latents_2d = nn.Parameter(torch.randn(self.num_latents_2d, self.hidden_size)) + + self.positional_encoding = EdgeTamVideoPositionEmbeddingSine( + num_pos_feats=self.hidden_size // 2, normalize=True + ) + + self.layers = nn.ModuleList([EdgeTamVideoPerceiverEncoderLayer(config) for _ in range(self.num_layers)]) + + self.layer_norm = nn.LayerNorm(self.hidden_size) + + def forward( + self, + hidden_states: torch.Tensor, + positional_encoding: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + output_latents = [] + output_positional_encodings = [] + + if self.num_latents_1d > 0: + latents_1d, pos_1d = self._forward_1d(hidden_states, positional_encoding) + output_latents.append(latents_1d) + 
output_positional_encodings.append(pos_1d) + + if self.num_latents_2d > 0: + latents_2d, pos_2d = self._forward_2d(hidden_states) + output_latents.append(latents_2d) + output_positional_encodings.append(pos_2d) + + combined_latents = torch.cat(output_latents, dim=1) + + combined_positional_encoding = None + if positional_encoding is not None and output_positional_encodings: + combined_positional_encoding = torch.cat(output_positional_encodings, dim=1) + + return combined_latents, combined_positional_encoding + + def _forward_1d( + self, + hidden_states: torch.Tensor, + positional_encoding: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + batch_size = hidden_states.shape[0] + + latents = self.latents_1d.unsqueeze(0).expand(batch_size, -1, -1) + flattened_features = hidden_states.permute(0, 2, 3, 1).flatten(1, 2) + + positional_features = None + if positional_encoding is not None: + positional_features = positional_encoding.permute(0, 2, 3, 1).flatten(1, 2) + + for layer in self.layers: + latents = layer(latents, flattened_features, positional_features) + + latents = self.layer_norm(latents) + + output_positional_encoding = None + if positional_encoding is not None: + output_positional_encoding = torch.zeros_like(latents) + + return latents, output_positional_encoding + + def _forward_2d(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + batch_size, channels, height, width = hidden_states.shape + + latents_2d = self.latents_2d.unsqueeze(0).expand(batch_size, -1, -1).view(-1, 1, channels) + + num_windows_per_dim = int(math.sqrt(self.num_latents_2d)) + window_size = height // num_windows_per_dim + + windowed_input = hidden_states.permute(0, 2, 3, 1) + windowed_features, _ = window_partition(windowed_input, window_size) + windowed_features = windowed_features.flatten(1, 2) + + for layer in self.layers: + latents_2d = layer(latents_2d, windowed_features, positional_encoding=None) + + latents_2d = latents_2d.view(batch_size, num_windows_per_dim, num_windows_per_dim, channels).permute( + 0, 3, 1, 2 + ) + + positional_encoding_2d = self.positional_encoding(latents_2d.shape, latents_2d.device, latents_2d.dtype).to( + dtype=hidden_states.dtype + ) + positional_encoding_2d = positional_encoding_2d.permute(0, 2, 3, 1).flatten(1, 2) + + latents_2d = latents_2d.permute(0, 2, 3, 1).flatten(1, 2) + latents_2d = self.layer_norm(latents_2d) + + return latents_2d, positional_encoding_2d + + +@auto_docstring +class EdgeTamVideoModel(Sam2VideoModel): + _tied_weights_keys = ["prompt_encoder.shared_embedding.positional_embedding"] + # need to be ignored, as it's a buffer and will not be correctly detected as tied weight + _keys_to_ignore_on_load_missing = ["prompt_encoder.shared_embedding.positional_embedding"] + _keys_to_ignore_on_load_unexpected = [] + _can_record_outputs = {"mask_decoder_attentions": OutputRecorder(EdgeTamVideoTwoWayAttentionBlock, index=2)} + + def __init__(self, config: EdgeTamVideoConfig): + super().__init__(config) + self.spatial_perceiver = EdgeTamVideoPerceiverResampler(config) + + self.post_init() + + def _build_memory_attention_inputs( + self, + temporal_positions_and_previous_outputs: list[tuple[int, dict]], + device: torch.device, + ) -> tuple[list[torch.Tensor], list[torch.Tensor]]: + """ + Concatenate memory features and positional embeddings from previous frames. + + Returns: + Tuple of (memories_to_concatenate, memory_positional_embeddings_to_concatenate). 
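+            Both lists hold `(tokens, batch_size, channels)` tensors so they can be concatenated along the token
+            dimension before being passed to the memory attention module.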
+ """ + memories_to_concatenate = [] + memory_positional_embeddings_to_concatenate = [] + + for relative_temporal_offset, prev_output_data in temporal_positions_and_previous_outputs: + if prev_output_data is None: + continue # Skip if no output data for this temporal position (e.g., padding frames) + + # Load memory features (potentially from CPU to GPU) + # Features are flattened: (Batch, Channels, H, W) -> (H*W, Batch, Channels) + memory_features = prev_output_data["maskmem_features"].to(device, non_blocking=True) + memories_to_concatenate.append(memory_features.permute(1, 0, 2)) + + # Spatial positional encoding (potentially from CPU to GPU) + spatial_memory_pos_embed = prev_output_data["maskmem_pos_enc"].to(device, non_blocking=True) + spatial_memory_pos_embed = spatial_memory_pos_embed.squeeze(1).permute(1, 0, 2) + + # Add temporal positional encoding + # self.memory_temporal_positional_encoding shape: (NumMaskMem, 1, 1, MemDim) + combined_memory_pos_embed = ( + spatial_memory_pos_embed + self.memory_temporal_positional_encoding[relative_temporal_offset - 1] + ) + memory_positional_embeddings_to_concatenate.append(combined_memory_pos_embed) + + return memories_to_concatenate, memory_positional_embeddings_to_concatenate + + def _prepare_memory_conditioned_features( + self, + inference_session: EdgeTamVideoInferenceSession, + frame_idx: int, + obj_idx: int, + is_initial_conditioning_frame: bool, + current_vision_features: list[torch.Tensor], + current_vision_positional_embeddings: list[torch.Tensor], + num_total_frames: int, + track_in_reverse_time: bool = False, + streaming: bool = False, + ) -> torch.Tensor: + """ + Fuse current frame's visual features with memory from previous frames for enhanced object tracking. + + This method conditions the current frame's visual features on temporal memory from previous frames, + enabling consistent object tracking across video sequences. For initial conditioning frames, it uses + no-memory embeddings. For subsequent frames, it retrieves and integrates memory features from both + conditioning frames (user interactions) and non-conditioning frames (tracked results) via cross-attention. + + Args: + inference_session (`EdgeTamVideoInferenceSession`): + The video inference session object. + frame_idx (`int`): + Index of the current frame being processed. + obj_idx (`int`): + Index of the object being processed. + is_initial_conditioning_frame (`bool`): + Whether this is an initial conditioning frame with user inputs (True) or a subsequent + tracking frame (False). + current_vision_features (`torch.Tensor`): + Highest-level vision features of shape `(seq_len, batch_size, channels)`. + current_vision_positional_embeddings (`torch.Tensor`): + Positional embedding tensors corresponding to the highest-level vision features. + num_total_frames (`int`): + Total number of frames in the video sequence. + track_in_reverse_time (`bool`, *optional*, defaults to `False`): + Whether tracking is performed in reverse temporal order. + streaming (`bool`, *optional*, defaults to `False`): + Whether this is streaming inference mode. + + Returns: + `torch.Tensor`: Memory-conditioned feature tensor of shape `(batch_size, channels, height, width)` + suitable for input to the SAM decoder. 
+ """ + # Get dimensions from the highest-level (lowest-resolution) feature map + batch_size = current_vision_features.size(1) + num_channels = self.hidden_dim + height, width = self.backbone_feature_sizes[-1] + device = current_vision_features.device + + # If memory is disabled (e.g., for single image SAM), return current features directly. + if self.num_maskmem == 0: + # Permute (SeqLen, Batch, Channels) -> (Batch, Channels, SeqLen) then view as (Batch, Channels, Height, Width) + # Assuming SeqLen = Height * Width for the last feature map + current_feature_map = current_vision_features.permute(1, 2, 0).view( + batch_size, num_channels, height, width + ) + return current_feature_map + + # Step 1: Handle initial conditioning frames + if is_initial_conditioning_frame: + # For initial conditioning frames, no prior memory is used directly in this block. + # If configured, directly add a learnable "no memory" embedding. + # current_vision_features has shape (SeqLen, Batch, Channels) + conditioned_feature_map_flat = current_vision_features + self.no_memory_embedding + # Reshape to (Batch, Channels, Height, Width) + conditioned_feature_map = conditioned_feature_map_flat.permute(1, 2, 0).view( + batch_size, num_channels, height, width + ) + return conditioned_feature_map + + # Step 2: Get memory frames and concatenate their features + temporal_positions_and_previous_outputs = self._gather_memory_frame_outputs( + inference_session, obj_idx, frame_idx, track_in_reverse_time + ) + + memories_to_concatenate, memory_positional_embeddings_to_concatenate = self._build_memory_attention_inputs( + temporal_positions_and_previous_outputs, device + ) + num_spatial_memory_tokens = len(memories_to_concatenate) + + # Step 3: Get and process object pointers + temporal_offsets, pointer_tokens, max_object_pointers_to_use = self._get_object_pointers( + inference_session, obj_idx, frame_idx, num_total_frames, device, track_in_reverse_time, streaming + ) + + num_object_pointer_tokens = 0 + if pointer_tokens: + object_pointers, object_pointers_pos_embed = self._process_object_pointers( + temporal_offsets, pointer_tokens, max_object_pointers_to_use, batch_size, num_channels, device + ) + + if object_pointers is not None: + memories_to_concatenate.append(object_pointers) + memory_positional_embeddings_to_concatenate.append(object_pointers_pos_embed) + num_object_pointer_tokens = object_pointers.shape[0] + + # Step 4: Concatenate all retrieved memories and their positional embeddings + combined_memory = torch.cat(memories_to_concatenate, dim=0) + combined_memory_positional_embeddings = torch.cat(memory_positional_embeddings_to_concatenate, dim=0) + + # Step 5: Forward through the memory attention mechanism + conditioned_feature_map_flat = self.memory_attention( + current_vision_features=current_vision_features, + current_vision_position_embeddings=current_vision_positional_embeddings, + memory=combined_memory, + memory_posision_embeddings=combined_memory_positional_embeddings, # Corrected typo from API + num_object_pointer_tokens=num_object_pointer_tokens, + num_spatial_memory_tokens=num_spatial_memory_tokens, + ) + + # Reshape from (Batch, H*W, Channels) to (Batch, Channels, Height, Width) + conditioned_feature_map = ( + conditioned_feature_map_flat.squeeze(1).permute(0, 2, 1).view(batch_size, num_channels, height, width) + ) + return conditioned_feature_map + + def _encode_new_memory( + self, + current_vision_feats: torch.Tensor, + pred_masks_high_res: torch.Tensor, + object_score_logits: torch.Tensor, + 
is_mask_from_pts: bool, + ) -> tuple[torch.Tensor, list[torch.Tensor]]: + """Encode the current image and its prediction into a memory feature.""" + batch_size = current_vision_feats.size(1) # batch size on this frame + channels = self.hidden_dim + height, width = self.backbone_feature_sizes[-1] # top-level (lowest-resolution) feature size + # top-level feature, (HW)BC => BCHW + pix_feat = current_vision_feats.permute(1, 2, 0).view(batch_size, channels, height, width) + if is_mask_from_pts and not self.training: + # binarize the mask logits + mask_for_mem = (pred_masks_high_res > 0).to(pred_masks_high_res.dtype) + else: + # apply sigmoid on the raw mask logits to turn them into range (0, 1) + mask_for_mem = torch.sigmoid(pred_masks_high_res) + # apply scale and bias terms to the sigmoid probabilities + mask_for_mem = mask_for_mem * self.config.sigmoid_scale_for_mem_enc + mask_for_mem = mask_for_mem + self.config.sigmoid_bias_for_mem_enc + + maskmem_features, maskmem_pos_enc = self.memory_encoder( + pix_feat, + mask_for_mem, + ) + # add a no-object embedding to the spatial memory to indicate that the frame + # is predicted to be occluded (i.e. no object is appearing in the frame) + if self.occlusion_spatial_embedding_parameter is not None: + is_obj_appearing = (object_score_logits > 0).float() + maskmem_features += (1 - is_obj_appearing[..., None]) * self.occlusion_spatial_embedding_parameter[ + ..., None, None + ].expand(*maskmem_features.shape) + + maskmem_pos_enc = maskmem_pos_enc.to(pred_masks_high_res.dtype) + maskmem_features, maskmem_pos_enc = self.spatial_perceiver(maskmem_features, maskmem_pos_enc) + maskmem_features = maskmem_features.to(pred_masks_high_res.dtype) + maskmem_pos_enc = maskmem_pos_enc.to(pred_masks_high_res.dtype) + + return maskmem_features, maskmem_pos_enc + + +__all__ = [ + "EdgeTamVideoMaskDecoderConfig", + "EdgeTamVideoPromptEncoderConfig", + "EdgeTamVideoConfig", + "EdgeTamVideoModel", + "EdgeTamVideoInferenceSession", + "EdgeTamVideoPreTrainedModel", +] diff --git a/src/transformers/models/efficientloftr/convert_efficientloftr_to_hf.py b/src/transformers/models/efficientloftr/convert_efficientloftr_to_hf.py deleted file mode 100644 index d15d07dbb8f6..000000000000 --- a/src/transformers/models/efficientloftr/convert_efficientloftr_to_hf.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright 2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
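
The perceiver resampler is what distinguishes the EdgeTAM memory bank added above from SAM2's: instead of storing the full H*W spatial memory per frame, `_encode_new_memory` pushes the encoded memory through `spatial_perceiver`, which cross-attends a small set of learned latents (1D plus window-partitioned 2D) against the feature map. The following is only a shape-level sketch of that compression step, using stand-in modules and made-up sizes rather than the real `EdgeTamVideoPerceiverResampler`:

```python
import torch
import torch.nn as nn

# Stand-in sizes; the real values come from EdgeTamVideoConfig
# (perceiver_resampler_hidden_size, perceiver_resampler_num_latents, ...).
hidden_size, num_latents_1d, num_latents_2d = 64, 256, 256
batch, height, width = 1, 64, 64

class ToyResampler(nn.Module):
    """Cross-attends learned latents against the flattened memory features."""
    def __init__(self):
        super().__init__()
        self.latents = nn.Parameter(torch.randn(num_latents_1d + num_latents_2d, hidden_size))
        self.cross_attention = nn.MultiheadAttention(hidden_size, num_heads=1, batch_first=True)

    def forward(self, memory_features: torch.Tensor) -> torch.Tensor:
        # (batch, hidden, H, W) -> (batch, H*W, hidden): one token per spatial location
        tokens = memory_features.flatten(2).transpose(1, 2)
        queries = self.latents.unsqueeze(0).expand(memory_features.shape[0], -1, -1)
        compressed, _ = self.cross_attention(queries, tokens, tokens)
        return compressed  # (batch, num_latents_1d + num_latents_2d, hidden)

maskmem_features = torch.randn(batch, hidden_size, height, width)  # output of the memory encoder
compressed = ToyResampler()(maskmem_features)
print(maskmem_features.flatten(2).shape[-1], "spatial tokens ->", compressed.shape[1], "latent tokens")
```

The payoff is that memory attention in `_prepare_memory_conditioned_features` then operates over a few hundred latent tokens per memory frame rather than thousands of spatial tokens.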
-import argparse -import gc -import os -import re - -import torch -from datasets import load_dataset -from huggingface_hub import hf_hub_download - -from transformers.models.efficientloftr.image_processing_efficientloftr import EfficientLoFTRImageProcessor -from transformers.models.efficientloftr.modeling_efficientloftr import ( - EfficientLoFTRConfig, - EfficientLoFTRForKeypointMatching, -) - - -DEFAULT_MODEL_REPO = "stevenbucaille/efficient_loftr_pth" -DEFAULT_FILE = "eloftr.pth" - - -def prepare_imgs(): - dataset = load_dataset("hf-internal-testing/image-matching-test-dataset", split="train") - image0 = dataset[0]["image"] - image2 = dataset[2]["image"] - return [[image2, image0]] - - -def verify_model_outputs(model, device): - images = prepare_imgs() - preprocessor = EfficientLoFTRImageProcessor() - inputs = preprocessor(images=images, return_tensors="pt").to(device) - model.to(device) - model.eval() - with torch.no_grad(): - outputs = model(**inputs, output_hidden_states=True, output_attentions=True) - - predicted_number_of_matches = outputs.matches.shape[-1] - predicted_top10 = torch.topk(outputs.matching_scores[0, 0], k=10) - predicted_top10_matches_indices = predicted_top10.indices - predicted_top10_matching_scores = predicted_top10.values - - expected_number_of_matches = 4800 - expected_matches_shape = torch.Size((len(images), 2, expected_number_of_matches)) - expected_matching_scores_shape = torch.Size((len(images), 2, expected_number_of_matches)) - - expected_top10_matches_indices = torch.tensor( - [1798, 1639, 1401, 1559, 2596, 2362, 2441, 2605, 1643, 2607], dtype=torch.int64 - ).to(device) - expected_top10_matching_scores = torch.tensor( - [0.9563, 0.9355, 0.9265, 0.9091, 0.9071, 0.9062, 0.9000, 0.8978, 0.8908, 0.8853] - ).to(device) - - assert outputs.matches.shape == expected_matches_shape - assert outputs.matching_scores.shape == expected_matching_scores_shape - - torch.testing.assert_close(predicted_top10_matches_indices, expected_top10_matches_indices, rtol=5e-3, atol=5e-3) - torch.testing.assert_close(predicted_top10_matching_scores, expected_top10_matching_scores, rtol=5e-3, atol=5e-3) - - assert predicted_number_of_matches == expected_number_of_matches - - -ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - r"matcher.backbone.layer(\d+).rbr_dense.conv": r"efficientloftr.backbone.stages.\1.blocks.0.conv1.conv", - r"matcher.backbone.layer(\d+).rbr_dense.bn": r"efficientloftr.backbone.stages.\1.blocks.0.conv1.norm", - r"matcher.backbone.layer(\d+).rbr_1x1.conv": r"efficientloftr.backbone.stages.\1.blocks.0.conv2.conv", - r"matcher.backbone.layer(\d+).rbr_1x1.bn": r"efficientloftr.backbone.stages.\1.blocks.0.conv2.norm", - r"matcher.backbone.layer(\d+).(\d+).rbr_dense.conv": r"efficientloftr.backbone.stages.\1.blocks.\2.conv1.conv", - r"matcher.backbone.layer(\d+).(\d+).rbr_dense.bn": r"efficientloftr.backbone.stages.\1.blocks.\2.conv1.norm", - r"matcher.backbone.layer(\d+).(\d+).rbr_1x1.conv": r"efficientloftr.backbone.stages.\1.blocks.\2.conv2.conv", - r"matcher.backbone.layer(\d+).(\d+).rbr_1x1.bn": r"efficientloftr.backbone.stages.\1.blocks.\2.conv2.norm", - r"matcher.backbone.layer(\d+).(\d+).rbr_identity": r"efficientloftr.backbone.stages.\1.blocks.\2.identity", - r"matcher.loftr_coarse.layers.(\d*[02468]).aggregate": lambda m: f"efficientloftr.local_feature_transformer.layers.{int(m.group(1)) // 2}.self_attention.aggregation.q_aggregation", - r"matcher.loftr_coarse.layers.(\d*[02468]).norm1": lambda m: f"efficientloftr.local_feature_transformer.layers.{int(m.group(1)) // 
2}.self_attention.aggregation.norm", - r"matcher.loftr_coarse.layers.(\d*[02468]).q_proj": lambda m: f"efficientloftr.local_feature_transformer.layers.{int(m.group(1)) // 2}.self_attention.attention.q_proj", - r"matcher.loftr_coarse.layers.(\d*[02468]).k_proj": lambda m: f"efficientloftr.local_feature_transformer.layers.{int(m.group(1)) // 2}.self_attention.attention.k_proj", - r"matcher.loftr_coarse.layers.(\d*[02468]).v_proj": lambda m: f"efficientloftr.local_feature_transformer.layers.{int(m.group(1)) // 2}.self_attention.attention.v_proj", - r"matcher.loftr_coarse.layers.(\d*[02468]).merge": lambda m: f"efficientloftr.local_feature_transformer.layers.{int(m.group(1)) // 2}.self_attention.attention.o_proj", - r"matcher.loftr_coarse.layers.(\d*[02468]).mlp.(\d+)": lambda m: f"efficientloftr.local_feature_transformer.layers.{int(m.group(1)) // 2}.self_attention.mlp.fc{1 if m.group(2) == '0' else 2}", - r"matcher.loftr_coarse.layers.(\d*[02468]).norm2": lambda m: f"efficientloftr.local_feature_transformer.layers.{int(m.group(1)) // 2}.self_attention.mlp.layer_norm", - r"matcher.loftr_coarse.layers.(\d*[13579]).aggregate": lambda m: f"efficientloftr.local_feature_transformer.layers.{int(m.group(1)) // 2}.cross_attention.aggregation.q_aggregation", - r"matcher.loftr_coarse.layers.(\d*[13579]).norm1": lambda m: f"efficientloftr.local_feature_transformer.layers.{int(m.group(1)) // 2}.cross_attention.aggregation.norm", - r"matcher.loftr_coarse.layers.(\d*[13579]).q_proj": lambda m: f"efficientloftr.local_feature_transformer.layers.{int(m.group(1)) // 2}.cross_attention.attention.q_proj", - r"matcher.loftr_coarse.layers.(\d*[13579]).k_proj": lambda m: f"efficientloftr.local_feature_transformer.layers.{int(m.group(1)) // 2}.cross_attention.attention.k_proj", - r"matcher.loftr_coarse.layers.(\d*[13579]).v_proj": lambda m: f"efficientloftr.local_feature_transformer.layers.{int(m.group(1)) // 2}.cross_attention.attention.v_proj", - r"matcher.loftr_coarse.layers.(\d*[13579]).merge": lambda m: f"efficientloftr.local_feature_transformer.layers.{int(m.group(1)) // 2}.cross_attention.attention.o_proj", - r"matcher.loftr_coarse.layers.(\d*[13579]).mlp.(\d+)": lambda m: f"efficientloftr.local_feature_transformer.layers.{int(m.group(1)) // 2}.cross_attention.mlp.fc{1 if m.group(2) == '0' else 2}", - r"matcher.loftr_coarse.layers.(\d*[13579]).norm2": lambda m: f"efficientloftr.local_feature_transformer.layers.{int(m.group(1)) // 2}.cross_attention.mlp.layer_norm", - r"matcher.fine_preprocess.layer3_outconv": "refinement_layer.out_conv", - r"matcher.fine_preprocess.layer(\d+)_outconv.weight": lambda m: f"refinement_layer.out_conv_layers.{0 if int(m.group(1)) == 2 else m.group(1)}.out_conv1.weight", - r"matcher.fine_preprocess.layer(\d+)_outconv2\.0": lambda m: f"refinement_layer.out_conv_layers.{0 if int(m.group(1)) == 2 else m.group(1)}.out_conv2", - r"matcher.fine_preprocess.layer(\d+)_outconv2\.1": lambda m: f"refinement_layer.out_conv_layers.{0 if int(m.group(1)) == 2 else m.group(1)}.batch_norm", - r"matcher.fine_preprocess.layer(\d+)_outconv2\.3": lambda m: f"refinement_layer.out_conv_layers.{0 if int(m.group(1)) == 2 else m.group(1)}.out_conv3", -} - - -def convert_old_keys_to_new_keys(state_dict_keys: list[str]): - """ - This function should be applied only once, on the concatenated keys to efficiently rename using - the key mappings. 
- """ - output_dict = {} - if state_dict_keys is not None: - old_text = "\n".join(state_dict_keys) - new_text = old_text - for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): - if replacement is None: - new_text = re.sub(pattern, "", new_text) # an empty line - continue - new_text = re.sub(pattern, replacement, new_text) - output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) - return output_dict - - -@torch.no_grad() -def write_model( - model_path, - model_repo, - file_name, - organization, - safe_serialization=True, - push_to_hub=False, -): - os.makedirs(model_path, exist_ok=True) - # ------------------------------------------------------------ - # EfficientLoFTR config - # ------------------------------------------------------------ - - config = EfficientLoFTRConfig() - config.architectures = ["EfficientLoFTRForKeypointMatching"] - config.save_pretrained(model_path) - print("Model config saved successfully...") - - # ------------------------------------------------------------ - # Convert weights - # ------------------------------------------------------------ - - print(f"Fetching all parameters from the checkpoint at {model_repo}/{file_name}...") - checkpoint_path = hf_hub_download(repo_id=model_repo, filename=file_name) - original_state_dict = torch.load(checkpoint_path, weights_only=True, map_location="cpu")["state_dict"] - - print("Converting model...") - all_keys = list(original_state_dict.keys()) - new_keys = convert_old_keys_to_new_keys(all_keys) - - state_dict = {} - for key in all_keys: - new_key = new_keys[key] - state_dict[new_key] = original_state_dict.pop(key).contiguous().clone() - - del original_state_dict - gc.collect() - - print("Loading the checkpoint in a EfficientLoFTR model...") - - device = "cuda" if torch.cuda.is_available() else "cpu" - with torch.device(device): - model = EfficientLoFTRForKeypointMatching(config) - model.load_state_dict(state_dict) - print("Checkpoint loaded successfully...") - del model.config._name_or_path - - print("Saving the model...") - model.save_pretrained(model_path, safe_serialization=safe_serialization) - del state_dict, model - - # Safety check: reload the converted model - gc.collect() - print("Reloading the model to check if it's saved correctly.") - model = EfficientLoFTRForKeypointMatching.from_pretrained(model_path) - print("Model reloaded successfully.") - - model_name = "efficientloftr" - if model_repo == DEFAULT_MODEL_REPO: - print("Checking the model outputs...") - verify_model_outputs(model, device) - print("Model outputs verified successfully.") - - if push_to_hub: - print("Pushing model to the hub...") - model.push_to_hub( - repo_id=f"{organization}/{model_name}", - commit_message="Add model", - ) - config.push_to_hub(repo_id=f"{organization}/{model_name}", commit_message="Add config") - - write_image_processor(model_path, model_name, organization, push_to_hub=push_to_hub) - - -def write_image_processor(save_dir, model_name, organization, push_to_hub=False): - image_processor = EfficientLoFTRImageProcessor() - image_processor.save_pretrained(save_dir) - - if push_to_hub: - print("Pushing image processor to the hub...") - image_processor.push_to_hub( - repo_id=f"{organization}/{model_name}", - commit_message="Add image processor", - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--repo_id", - default=DEFAULT_MODEL_REPO, - type=str, - help="Model repo ID of the original EfficientLoFTR checkpoint you'd like to 
convert.", - ) - parser.add_argument( - "--file_name", - default=DEFAULT_FILE, - type=str, - help="File name of the original EfficientLoFTR checkpoint you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=True, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument("--save_model", action="store_true", help="Save model to local") - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Push model and image preprocessor to the hub", - ) - parser.add_argument( - "--organization", - default="zju-community", - type=str, - help="Hub organization in which you want the model to be uploaded.", - ) - - args = parser.parse_args() - write_model( - args.pytorch_dump_folder_path, - args.repo_id, - args.file_name, - args.organization, - safe_serialization=True, - push_to_hub=args.push_to_hub, - ) diff --git a/src/transformers/models/efficientloftr/image_processing_efficientloftr_fast.py b/src/transformers/models/efficientloftr/image_processing_efficientloftr_fast.py index 5f7437c45b2e..1463ef405f37 100644 --- a/src/transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +++ b/src/transformers/models/efficientloftr/image_processing_efficientloftr_fast.py @@ -39,17 +39,13 @@ from ...utils import ( TensorType, auto_docstring, - is_torchvision_v2_available, ) if TYPE_CHECKING: from .modeling_efficientloftr import KeypointMatchingOutput -if is_torchvision_v2_available(): - import torchvision.transforms.v2.functional as F -else: - import torchvision.transforms.functional as F +import torchvision.transforms.v2.functional as F def _is_valid_image(image): diff --git a/src/transformers/models/efficientnet/convert_efficientnet_to_pytorch.py b/src/transformers/models/efficientnet/convert_efficientnet_to_pytorch.py deleted file mode 100644 index e9988524aca0..000000000000 --- a/src/transformers/models/efficientnet/convert_efficientnet_to_pytorch.py +++ /dev/null @@ -1,339 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert EfficientNet checkpoints from the original repository. 
- -URL: https://github.com/keras-team/keras/blob/v2.11.0/keras/applications/efficientnet.py""" - -import argparse -import json -import os - -import numpy as np -import PIL -import requests -import tensorflow.keras.applications.efficientnet as efficientnet -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from tensorflow.keras.preprocessing import image - -from transformers import ( - EfficientNetConfig, - EfficientNetForImageClassification, - EfficientNetImageProcessor, -) -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -model_classes = { - "b0": efficientnet.EfficientNetB0, - "b1": efficientnet.EfficientNetB1, - "b2": efficientnet.EfficientNetB2, - "b3": efficientnet.EfficientNetB3, - "b4": efficientnet.EfficientNetB4, - "b5": efficientnet.EfficientNetB5, - "b6": efficientnet.EfficientNetB6, - "b7": efficientnet.EfficientNetB7, -} - -CONFIG_MAP = { - "b0": { - "hidden_dim": 1280, - "width_coef": 1.0, - "depth_coef": 1.0, - "image_size": 224, - "dropout_rate": 0.2, - "dw_padding": [], - }, - "b1": { - "hidden_dim": 1280, - "width_coef": 1.0, - "depth_coef": 1.1, - "image_size": 240, - "dropout_rate": 0.2, - "dw_padding": [16], - }, - "b2": { - "hidden_dim": 1408, - "width_coef": 1.1, - "depth_coef": 1.2, - "image_size": 260, - "dropout_rate": 0.3, - "dw_padding": [5, 8, 16], - }, - "b3": { - "hidden_dim": 1536, - "width_coef": 1.2, - "depth_coef": 1.4, - "image_size": 300, - "dropout_rate": 0.3, - "dw_padding": [5, 18], - }, - "b4": { - "hidden_dim": 1792, - "width_coef": 1.4, - "depth_coef": 1.8, - "image_size": 380, - "dropout_rate": 0.4, - "dw_padding": [6], - }, - "b5": { - "hidden_dim": 2048, - "width_coef": 1.6, - "depth_coef": 2.2, - "image_size": 456, - "dropout_rate": 0.4, - "dw_padding": [13, 27], - }, - "b6": { - "hidden_dim": 2304, - "width_coef": 1.8, - "depth_coef": 2.6, - "image_size": 528, - "dropout_rate": 0.5, - "dw_padding": [31], - }, - "b7": { - "hidden_dim": 2560, - "width_coef": 2.0, - "depth_coef": 3.1, - "image_size": 600, - "dropout_rate": 0.5, - "dw_padding": [18], - }, -} - - -def get_efficientnet_config(model_name): - config = EfficientNetConfig() - config.hidden_dim = CONFIG_MAP[model_name]["hidden_dim"] - config.width_coefficient = CONFIG_MAP[model_name]["width_coef"] - config.depth_coefficient = CONFIG_MAP[model_name]["depth_coef"] - config.image_size = CONFIG_MAP[model_name]["image_size"] - config.dropout_rate = CONFIG_MAP[model_name]["dropout_rate"] - config.depthwise_padding = CONFIG_MAP[model_name]["dw_padding"] - - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - config.num_labels = 1000 - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - return config - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -def convert_image_processor(model_name): - size = CONFIG_MAP[model_name]["image_size"] - preprocessor = EfficientNetImageProcessor( - size={"height": size, "width": size}, - image_mean=[0.485, 0.456, 0.406], - image_std=[0.47853944, 0.4732864, 0.47434163], - do_center_crop=False, - ) - return preprocessor - - -# here we list all keys to be renamed (original name on the left, our name on 
the right) -def rename_keys(original_param_names): - block_names = [v.split("_")[0].split("block")[1] for v in original_param_names if v.startswith("block")] - block_names = sorted(set(block_names)) - num_blocks = len(block_names) - block_name_mapping = {b: str(i) for b, i in zip(block_names, range(num_blocks))} - - rename_keys = [] - rename_keys.append(("stem_conv/kernel:0", "embeddings.convolution.weight")) - rename_keys.append(("stem_bn/gamma:0", "embeddings.batchnorm.weight")) - rename_keys.append(("stem_bn/beta:0", "embeddings.batchnorm.bias")) - rename_keys.append(("stem_bn/moving_mean:0", "embeddings.batchnorm.running_mean")) - rename_keys.append(("stem_bn/moving_variance:0", "embeddings.batchnorm.running_var")) - - for b in block_names: - hf_b = block_name_mapping[b] - rename_keys.append((f"block{b}_expand_conv/kernel:0", f"encoder.blocks.{hf_b}.expansion.expand_conv.weight")) - rename_keys.append((f"block{b}_expand_bn/gamma:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.weight")) - rename_keys.append((f"block{b}_expand_bn/beta:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.bias")) - rename_keys.append( - (f"block{b}_expand_bn/moving_mean:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.running_mean") - ) - rename_keys.append( - (f"block{b}_expand_bn/moving_variance:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.running_var") - ) - rename_keys.append( - (f"block{b}_dwconv/depthwise_kernel:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_conv.weight") - ) - rename_keys.append((f"block{b}_bn/gamma:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.weight")) - rename_keys.append((f"block{b}_bn/beta:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.bias")) - rename_keys.append( - (f"block{b}_bn/moving_mean:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.running_mean") - ) - rename_keys.append( - (f"block{b}_bn/moving_variance:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.running_var") - ) - - rename_keys.append((f"block{b}_se_reduce/kernel:0", f"encoder.blocks.{hf_b}.squeeze_excite.reduce.weight")) - rename_keys.append((f"block{b}_se_reduce/bias:0", f"encoder.blocks.{hf_b}.squeeze_excite.reduce.bias")) - rename_keys.append((f"block{b}_se_expand/kernel:0", f"encoder.blocks.{hf_b}.squeeze_excite.expand.weight")) - rename_keys.append((f"block{b}_se_expand/bias:0", f"encoder.blocks.{hf_b}.squeeze_excite.expand.bias")) - rename_keys.append( - (f"block{b}_project_conv/kernel:0", f"encoder.blocks.{hf_b}.projection.project_conv.weight") - ) - rename_keys.append((f"block{b}_project_bn/gamma:0", f"encoder.blocks.{hf_b}.projection.project_bn.weight")) - rename_keys.append((f"block{b}_project_bn/beta:0", f"encoder.blocks.{hf_b}.projection.project_bn.bias")) - rename_keys.append( - (f"block{b}_project_bn/moving_mean:0", f"encoder.blocks.{hf_b}.projection.project_bn.running_mean") - ) - rename_keys.append( - (f"block{b}_project_bn/moving_variance:0", f"encoder.blocks.{hf_b}.projection.project_bn.running_var") - ) - - rename_keys.append(("top_conv/kernel:0", "encoder.top_conv.weight")) - rename_keys.append(("top_bn/gamma:0", "encoder.top_bn.weight")) - rename_keys.append(("top_bn/beta:0", "encoder.top_bn.bias")) - rename_keys.append(("top_bn/moving_mean:0", "encoder.top_bn.running_mean")) - rename_keys.append(("top_bn/moving_variance:0", "encoder.top_bn.running_var")) - - key_mapping = {} - for item in rename_keys: - if item[0] in original_param_names: - key_mapping[item[0]] = "efficientnet." 
+ item[1] - - key_mapping["predictions/kernel:0"] = "classifier.weight" - key_mapping["predictions/bias:0"] = "classifier.bias" - return key_mapping - - -def replace_params(hf_params, tf_params, key_mapping): - for key, value in tf_params.items(): - if "normalization" in key: - continue - - hf_key = key_mapping[key] - if "_conv" in key and "kernel" in key: - new_hf_value = torch.from_numpy(value).permute(3, 2, 0, 1) - elif "depthwise_kernel" in key: - new_hf_value = torch.from_numpy(value).permute(2, 3, 0, 1) - elif "kernel" in key: - new_hf_value = torch.from_numpy(np.transpose(value)) - else: - new_hf_value = torch.from_numpy(value) - - # Replace HF parameters with original TF model parameters - assert hf_params[hf_key].shape == new_hf_value.shape - hf_params[hf_key].copy_(new_hf_value) - - -@torch.no_grad() -def convert_efficientnet_checkpoint(model_name, pytorch_dump_folder_path, save_model, push_to_hub): - """ - Copy/paste/tweak model's weights to our EfficientNet structure. - """ - # Load original model - original_model = model_classes[model_name]( - include_top=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax", - ) - - tf_params = original_model.trainable_variables - tf_non_train_params = original_model.non_trainable_variables - tf_params = {param.name: param.numpy() for param in tf_params} - for param in tf_non_train_params: - tf_params[param.name] = param.numpy() - tf_param_names = list(tf_params.keys()) - - # Load HuggingFace model - config = get_efficientnet_config(model_name) - hf_model = EfficientNetForImageClassification(config).eval() - hf_params = hf_model.state_dict() - - # Create src-to-dst parameter name mapping dictionary - print("Converting parameters...") - key_mapping = rename_keys(tf_param_names) - replace_params(hf_params, tf_params, key_mapping) - - # Initialize preprocessor and preprocess input image - preprocessor = convert_image_processor(model_name) - inputs = preprocessor(images=prepare_img(), return_tensors="pt") - - # HF model inference - hf_model.eval() - with torch.no_grad(): - outputs = hf_model(**inputs) - hf_logits = outputs.logits.detach().numpy() - - # Original model inference - original_model.trainable = False - image_size = CONFIG_MAP[model_name]["image_size"] - img = prepare_img().resize((image_size, image_size), resample=PIL.Image.NEAREST) - x = image.img_to_array(img) - x = np.expand_dims(x, axis=0) - original_logits = original_model.predict(x) - - # Check whether original and HF model outputs match -> np.allclose - assert np.allclose(original_logits, hf_logits, atol=1e-3), "The predicted logits are not the same." 
- print("Model outputs match!") - - if save_model: - # Create folder to save model - if not os.path.isdir(pytorch_dump_folder_path): - os.mkdir(pytorch_dump_folder_path) - # Save converted model and image processor - hf_model.save_pretrained(pytorch_dump_folder_path) - preprocessor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - # Push model and image processor to hub - print(f"Pushing converted {model_name} to the hub...") - model_name = f"efficientnet-{model_name}" - preprocessor.push_to_hub(model_name) - hf_model.push_to_hub(model_name) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="b0", - type=str, - help="Version name of the EfficientNet model you want to convert, select from [b0, b1, b2, b3, b4, b5, b6, b7].", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default="hf_model", - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument("--save_model", action="store_true", help="Save model to local") - parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub") - - args = parser.parse_args() - convert_efficientnet_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.save_model, args.push_to_hub) diff --git a/src/transformers/models/efficientnet/image_processing_efficientnet_fast.py b/src/transformers/models/efficientnet/image_processing_efficientnet_fast.py index 3544d927c146..77e787614a10 100644 --- a/src/transformers/models/efficientnet/image_processing_efficientnet_fast.py +++ b/src/transformers/models/efficientnet/image_processing_efficientnet_fast.py @@ -18,6 +18,7 @@ from typing import Optional, Union import torch +from torchvision.transforms.v2 import functional as F from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs from ...image_transforms import group_images_by_shape, reorder_images @@ -26,16 +27,9 @@ from ...utils import ( TensorType, auto_docstring, - is_torchvision_v2_available, ) -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - - class EfficientNetFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): """ Args: diff --git a/src/transformers/models/electra/convert_electra_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/electra/convert_electra_original_tf_checkpoint_to_pytorch.py deleted file mode 100644 index b0abc30cd758..000000000000 --- a/src/transformers/models/electra/convert_electra_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,79 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Convert ELECTRA checkpoint.""" - -import argparse - -import torch - -from transformers import ElectraConfig, ElectraForMaskedLM, ElectraForPreTraining, load_tf_weights_in_electra -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, discriminator_or_generator): - # Initialise PyTorch model - config = ElectraConfig.from_json_file(config_file) - print(f"Building PyTorch model from configuration: {config}") - - if discriminator_or_generator == "discriminator": - model = ElectraForPreTraining(config) - elif discriminator_or_generator == "generator": - model = ElectraForMaskedLM(config) - else: - raise ValueError("The discriminator_or_generator argument should be either 'discriminator' or 'generator'") - - # Load weights from tf checkpoint - load_tf_weights_in_electra( - model, config, tf_checkpoint_path, discriminator_or_generator=discriminator_or_generator - ) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - torch.save(model.state_dict(), pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--config_file", - default=None, - type=str, - required=True, - help="The config json file corresponding to the pre-trained model. \nThis specifies the model architecture.", - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--discriminator_or_generator", - default=None, - type=str, - required=True, - help=( - "Whether to export the generator or the discriminator. Should be a string, either 'discriminator' or " - "'generator'." - ), - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch( - args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path, args.discriminator_or_generator - ) diff --git a/src/transformers/models/emu3/convert_emu3_weights_to_hf.py b/src/transformers/models/emu3/convert_emu3_weights_to_hf.py deleted file mode 100644 index 1427288878be..000000000000 --- a/src/transformers/models/emu3/convert_emu3_weights_to_hf.py +++ /dev/null @@ -1,447 +0,0 @@ -# Copyright 2024 The Emu team, BAAI and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import argparse -import json -import os -import re -from typing import Optional - -import requests -import torch -from accelerate import init_empty_weights -from PIL import Image - -from transformers import ( - AutoModel, - AutoModelForCausalLM, - AutoTokenizer, - Emu3Config, - Emu3ForConditionalGeneration, - Emu3ImageProcessor, - Emu3Processor, - Emu3TextConfig, - GenerationConfig, -) -from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode - - -""" -Sample usage: - -``` -python src/transformers/models/emu3/convert_emu3_weights_to_hf.py \ - --vq_model_id BAAI/Emu3-VisionTokenizer --llm_model_id BAAI/Emu3-Chat --output_dir /output/path -``` - -Thereafter, models can be loaded via: - -```py -from transformers import Emu3ForConditionalGeneration, Emu3Processor - -model = Emu3ForConditionalGeneration.from_pretrained("/output/path") -processor = Emu3Processor.from_pretrained("/output/path") -``` - -""" - - -byte_encoder = bytes_to_unicode() -CHAT_TEMPLATE = "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}" - - -# Tiktoken to HF conversion, thanks for Xenova -def token_bytes_to_string(b): - return "".join([byte_encoder[ord(char)] for char in b.decode("latin-1")]) - - -# Adapted from https://github.com/openai/tiktoken/issues/60#issuecomment-1499977960 -def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None): - parts = [bytes([b]) for b in token] - while True: - min_idx = None - min_rank = None - for i, pair in enumerate(zip(parts[:-1], parts[1:])): - rank = mergeable_ranks.get(pair[0] + pair[1]) - if rank is not None and (min_rank is None or rank < min_rank): - min_idx = i - min_rank = rank - if min_rank is None or (max_rank is not None and min_rank >= max_rank): - break - assert min_idx is not None - parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2 :] - return parts - - -def generate_vocab_and_merges(encoder): - mergeable_ranks = encoder._mergeable_ranks - - merges = [] - vocab = {} - for token, rank in mergeable_ranks.items(): - vocab[token_bytes_to_string(token)] = rank - - if len(token) == 1: - continue - merged = tuple(bpe(mergeable_ranks, token, max_rank=rank)) - assert len(merged) == 2 - merges.append(" ".join(map(token_bytes_to_string, merged))) - - # Also add special tokens - vocab.update(encoder._special_tokens) - return vocab, merges - - -def convert_tiktoken(tokenizer, output_dir): - encoder = tokenizer.tokenizer - vocab, merges = generate_vocab_and_merges(encoder) - added_tokens = [ - { - "id": id, - "content": content, - "single_word": False, - "lstrip": False, - "rstrip": False, - "normalized": False, - "special": True, - } - for content, id in encoder._special_tokens.items() - if content != "<|extra_0|>" - ] - - # https://huggingface.co/Xenova/gpt2/raw/main/tokenizer_config.json - tokenizer_config_template = { - "add_prefix_space": False, - "bos_token": "<|extra_203|>", - 
"clean_up_tokenization_spaces": False, - "eos_token": "<|extra_204|>", - "pad_token": "<|endoftext|>", - } - tokenizer_config_template.update({"tokenizer_class": "GPT2Tokenizer"}) - tokenizer_config_template = dict(sorted(tokenizer_config_template.items(), key=lambda x: x[0])) - - # add placeholder image token by taking one of the reserved tokens - reserved_token_id = vocab["<|extra_0|>"] - vocab[""] = reserved_token_id - del vocab["<|extra_0|>"] - added_tokens.append( - { - "id": reserved_token_id, - "content": "", - "single_word": False, - "lstrip": False, - "rstrip": False, - "normalized": False, - "special": True, - } - ) - - os.makedirs(output_dir, exist_ok=True) - - pre_tokenizer = { - "type": "ByteLevel", - "add_prefix_space": False, - "trim_offsets": True, - "use_regex": True, - } - - # https://huggingface.co/Xenova/gpt2/raw/main/tokenizer.json - tokenizer_template = { - "version": "1.0", - "truncation": None, - "padding": None, - "added_tokens": added_tokens, - "normalizer": None, - "pre_tokenizer": pre_tokenizer, - "post_processor": None, - "decoder": { - "type": "ByteLevel", - "add_prefix_space": True, - "trim_offsets": True, - "use_regex": True, - }, - "model": { - "type": "BPE", - "dropout": None, - "unk_token": None, - "continuing_subword_prefix": "", - "end_of_word_suffix": "", - "fuse_unk": False, - "byte_fallback": False, - "vocab": vocab, - "merges": merges, - }, - } - - # Save to files - with open(os.path.join(output_dir, "vocab.json"), "w", encoding="utf-8") as fp: - json.dump(vocab, fp, indent=2, ensure_ascii=False) - - with open(os.path.join(output_dir, "tokenizer.json"), "w", encoding="utf-8") as fp: - json.dump(tokenizer_template, fp, indent=2, ensure_ascii=False) - - with open(os.path.join(output_dir, "tokenizer_config.json"), "w", encoding="utf-8") as fp: - json.dump(tokenizer_config_template, fp, indent=2, ensure_ascii=False) - - with open(os.path.join(output_dir, "special_tokens_map.json"), "w", encoding="utf-8") as fp: - json.dump( - { - "bos_token": "<|extra_203|>", - "eos_token": "<|extra_204|>", - "pad_token": "<|endoftext|>", - }, - fp, - indent=2, - ensure_ascii=False, - ) - - with open(os.path.join(output_dir, "merges.txt"), "w", encoding="utf-8") as fp: - fp.write("#version: 0.2\n") - fp.write("\n".join(merges)) - - -KEYS_TO_MODIFY_MAPPING = { - "^model": "model.text_model", - "^encoder": "model.vqmodel.encoder", - "^decoder": "model.vqmodel.decoder", - "^post_quant_conv": "model.vqmodel.post_quant_conv", - "^quant_conv": "model.vqmodel.quant_conv", - "^quantize": "model.vqmodel.quantize", - r"lm_head\.weight": "lm_head.weight", - # rename QKV proj for the VQ-VAE model because we use SiglipAttention - r"\.q\.": ".q_proj.", - r"\.k\.": ".k_proj.", - r"\.v\.": ".v_proj.", - r"\.proj_out\.": ".out_proj.", - # move the attention norms outside of attention modules - r"mid\.attn_1\.norm\.": "mid.attn_norm.", - r"attn\.0\.norm\.": "attn_norms.0.", - r"attn\.1\.norm\.": "attn_norms.1.", - r"attn\.2\.norm\.": "attn_norms.2.", - r"attn\.3\.norm\.": "attn_norms.3.", - # isolate down/mid/up into separate classes for readability - r"\.down\.": ".down_block.down.", - r"\.up\.": ".up_block.up.", - r"\.mid\.": ".middle_block.", -} - - -def convert_state_dict_to_hf(old_state_dict, new_state_dict): - for key, value in old_state_dict.items(): - # convert conv layers in attn to linear - if ( - any(key.endswith(name) for name in ["q.weight", "k.weight", "v.weight", "proj_out.weight"]) - and value.ndim == 4 - ): - value = value.squeeze() - - for old_pattern, new_pattern in 
KEYS_TO_MODIFY_MAPPING.items(): - key = re.sub(old_pattern, new_pattern, key) - - new_state_dict[key] = value - return new_state_dict - - -def convert_model(vq_model_id, llm_model_id, output_dir, hub_model_id=None, test_inference=False): - os.makedirs(output_dir, exist_ok=True) - - # Convert and save processor - tokenizer_tiktoken = AutoTokenizer.from_pretrained(llm_model_id, trust_remote_code=True) - convert_tiktoken(tokenizer_tiktoken, output_dir) - extra_special_tokens = { - "image_token": "", - "boi_token": "<|image start|>", - "eoi_token": "<|image end|>", - "image_wrapper_token": "<|image token|>", - "eof_token": "<|extra_201|>", - } - tokenizer_converted = AutoTokenizer.from_pretrained(output_dir, extra_special_tokens=extra_special_tokens) - tokenizer_converted.padding_side = "left" - - image_processor = Emu3ImageProcessor.from_pretrained(vq_model_id) - processor = Emu3Processor(image_processor, tokenizer_converted, chat_template=CHAT_TEMPLATE) - processor.save_pretrained(output_dir) - - # load models - model_llm = AutoModelForCausalLM.from_pretrained( - llm_model_id, - trust_remote_code=True, - ) - model_vqgan = AutoModel.from_pretrained(vq_model_id, trust_remote_code=True) - with open(f"{output_dir}/tokenizer.json", "r") as file: - tokenizer_config = json.load(file) - vocabulary_map = tokenizer_config["model"]["vocab"] - - text_config = Emu3TextConfig( - max_position_embeddings=model_llm.config.max_position_embeddings, - rope_scaling={"rope_type": "default"}, - ) - config = Emu3Config(text_config=text_config, vocabulary_map=vocabulary_map) - - with init_empty_weights(): - model = Emu3ForConditionalGeneration(config=config) - model.generation_config = GenerationConfig( - do_sample=True, - top_k=2048, - max_new_tokens=50_000, - pad_token_id=processor.tokenizer.pad_token_id, - eos_token_id=processor.tokenizer.eos_token_id, - ) - - state_dict = {} - state_dict = convert_state_dict_to_hf(model_llm.state_dict(), state_dict) - state_dict = convert_state_dict_to_hf(model_vqgan.state_dict(), state_dict) - - model.load_state_dict(state_dict, assign=True, strict=True) - model.save_pretrained(output_dir, safe_serialization=True) - - if hub_model_id is not None: - model.push_to_hub(hub_model_id) - processor.push_to_hub(hub_model_id) - - if test_inference and llm_model_id.endswith("Chat"): - # Short inference on a few examples to check if generation makes sense - print("Loading the checkpoint in a Emu3 model...") - print("*" * 100) - model = Emu3ForConditionalGeneration.from_pretrained(output_dir, dtype=torch.bfloat16, device_map="auto") - processor = Emu3Processor.from_pretrained(output_dir) - - conversation = [ - { - "role": "system", - "content": [ - {"type": "text", "text": "You are a helpful assistant."}, - ], - }, - { - "role": "user", - "content": [ - {"type": "text", "text": "Please tell me about this art work and its artist."}, - {"type": "image"}, - ], - }, - ] - prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) - - image = Image.open( - requests.get( - "https://uploads4.wikiart.org/images/paul-klee/death-for-the-idea-1915.jpg!Large.jpg", stream=True - ).raw - ) - inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device, torch.bfloat16) - length = inputs.input_ids.shape[1] - - out = model.generate(**inputs, max_new_tokens=40, do_sample=False) - generated_text = processor.batch_decode(out[:, length:], skip_special_tokens=True)[0] - - print(f"Generation for single-image: {generated_text}") - print("*" * 100) - elif test_inference 
and llm_model_id.endswith("Gen"): - processor = Emu3Processor.from_pretrained(output_dir) - model = Emu3ForConditionalGeneration.from_pretrained(output_dir, dtype=torch.bfloat16, device_map="auto") - - inputs = processor( - text=[ - "a portrait of young girl. masterpiece, film grained, best quality.", - "a dog running under the rain", - ], - padding=True, - return_tensors="pt", - return_for_image_generation=True, - ) - inputs = inputs.to(device="cuda:0", dtype=torch.bfloat16) - - neg_prompt = "lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry." - neg_inputs = processor(text=[neg_prompt] * 2, return_tensors="pt").to(device="cuda:0") - - image_sizes = inputs.pop("image_sizes") - HEIGHT, WIDTH = image_sizes[0] - VISUAL_TOKENS = model.vocabulary_mapping.image_tokens - - def prefix_allowed_tokens_fn(batch_id, input_ids): - height, width = HEIGHT, WIDTH - visual_tokens = VISUAL_TOKENS - image_token_id = processor.tokenizer.encode("<|image token|>", return_tensors="pt")[0].to(model.device) - eoi_token_id = processor.tokenizer.encode("<|image end|>", return_tensors="pt")[0] - eos_token_id = processor.tokenizer.encode("<|extra_204|>", return_tensors="pt")[0] - pad_token_id = processor.tokenizer.encode("<|endoftext|>", return_tensors="pt")[0] - eol_token_id = processor.tokenizer.encode("<|extra_200|>", return_tensors="pt")[0] - eof_token_id = processor.tokenizer.encode("<|extra_201|>", return_tensors="pt")[0] - - position = torch.nonzero(input_ids == image_token_id, as_tuple=True)[0][0] - offset = input_ids.shape[0] - position - if offset % (width + 1) == 0: - return (eol_token_id,) - elif offset == (width + 1) * height + 1: - return (eof_token_id,) - elif offset == (width + 1) * height + 2: - return (eoi_token_id,) - elif offset == (width + 1) * height + 3: - return (eos_token_id,) - elif offset > (width + 1) * height + 3: - return (pad_token_id,) - else: - return visual_tokens - - out = model.generate( - **inputs, - prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, - negative_prompt_ids=neg_inputs.input_ids, - negative_prompt_attention_mask=neg_inputs.attention_mask, - ) - - image = model.decode_image_tokens(out[:, inputs.input_ids.shape[1] :], height=HEIGHT, width=WIDTH) - images = processor.postprocess( - list(image.float()), return_tensors="PIL.Image.Image" - ) # internally we convert to np but it's not supported in bf16 precision - for i, image in enumerate(images["pixel_values"]): - image.save(f"result_{i}.png") - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--vq_model_id", - help="Model ID of Emu3 VQ-VAE on the hub", - default="BAAI/Emu3-VisionTokenizer", - ) - parser.add_argument( - "--llm_model_id", - help="Model ID of Emu3 bacbone LLM on the hub", - default="BAAI/Emu3-Chat", - ) - parser.add_argument( - "--output_dir", - help="Location to write HF model", - ) - parser.add_argument( - "--hub_model_id", - help="Model ID in the hub where to push the model.", - ) - parser.add_argument( - "--test_inference", - action="store_true", - help="Whether to load the model for generation to test it's converted correctly.", - ) - args = parser.parse_args() - convert_model( - vq_model_id=args.vq_model_id, - llm_model_id=args.llm_model_id, - output_dir=args.output_dir, - hub_model_id=args.hub_model_id, - test_inference=args.test_inference, - ) - - -if __name__ == "__main__": - main() diff --git 
a/src/transformers/models/emu3/image_processing_emu3.py b/src/transformers/models/emu3/image_processing_emu3.py index aaf3afa41733..50ce82e01de8 100644 --- a/src/transformers/models/emu3/image_processing_emu3.py +++ b/src/transformers/models/emu3/image_processing_emu3.py @@ -266,8 +266,8 @@ def _pad_for_batching( """ max_shape = ( - max([size[0] for size in image_sizes]), - max([size[1] for size in image_sizes]), + max(size[0] for size in image_sizes), + max(size[1] for size in image_sizes), ) pixel_values = [ pad( @@ -486,7 +486,7 @@ def unnormalize( image_mean: Union[float, Iterable[float]], image_std: Union[float, Iterable[float]], input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> np.array: + ) -> np.ndarray: """ Unnormalizes `image` using the mean and standard deviation specified by `mean` and `std`. image = (image * image_std) + image_mean diff --git a/src/transformers/models/encodec/convert_encodec_checkpoint_to_pytorch.py b/src/transformers/models/encodec/convert_encodec_checkpoint_to_pytorch.py deleted file mode 100644 index f1fb0168705f..000000000000 --- a/src/transformers/models/encodec/convert_encodec_checkpoint_to_pytorch.py +++ /dev/null @@ -1,365 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
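
The image-generation branch of the Emu3 converter above constrains `model.generate` with a `prefix_allowed_tokens_fn` so the sampled sequence forms a valid HEIGHT x WIDTH grid: each row of visual tokens must end with an end-of-line token, followed by end-of-frame / end-of-image / EOS and then padding. A self-contained toy version of that constraint, with made-up token ids standing in for the real tokenizer's special tokens:

```python
import torch

# Toy token ids standing in for Emu3's special tokens (real ids come from the tokenizer).
IMAGE_TOKEN, EOL, EOF, EOI, EOS, PAD = 100, 101, 102, 103, 104, 105
VISUAL_TOKENS = tuple(range(10, 60))  # placeholder visual vocabulary
HEIGHT, WIDTH = 3, 4  # tiny image grid

def prefix_allowed_tokens_fn(batch_id: int, input_ids: torch.Tensor):
    """Force rows of WIDTH visual tokens, each closed by EOL, then EOF / EOI / EOS,
    then padding - mirroring the grid constraint in the Emu3 conversion script."""
    position = torch.nonzero(input_ids == IMAGE_TOKEN, as_tuple=True)[0][0]
    offset = input_ids.shape[0] - position
    if offset % (WIDTH + 1) == 0:
        return (EOL,)
    elif offset == (WIDTH + 1) * HEIGHT + 1:
        return (EOF,)
    elif offset == (WIDTH + 1) * HEIGHT + 2:
        return (EOI,)
    elif offset == (WIDTH + 1) * HEIGHT + 3:
        return (EOS,)
    elif offset > (WIDTH + 1) * HEIGHT + 3:
        return (PAD,)
    return VISUAL_TOKENS

# After one full row of visual tokens, only the end-of-line token is allowed.
prompt = torch.tensor([1, 2, IMAGE_TOKEN] + list(VISUAL_TOKENS[:WIDTH]))
print(prefix_allowed_tokens_fn(0, prompt))  # (101,)
```

Passing such a function via `prefix_allowed_tokens_fn=` lets `generate` sample freely within each row while the structural tokens are forced deterministically.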
-"""Convert EnCodec checkpoints.""" - -import argparse - -import torch - -from transformers import ( - EncodecConfig, - EncodecFeatureExtractor, - EncodecModel, - logging, -) - - -# checkpoints downloaded from: -# https://dl.fbaipublicfiles.com/encodec/v0/encodec_24khz-d7cc33bc.th -# https://huggingface.co/facebook/musicgen-small/resolve/main/compression_state_dict.bin -# https://dl.fbaipublicfiles.com/encodec/v0/encodec_48khz-7e698e3e.th - - -logging.set_verbosity_info() -logger = logging.get_logger("transformers.models.encodec") - -MAPPING_QUANTIZER = { - "quantizer.vq.layers.*._codebook.inited": "quantizer.layers.*.codebook.inited", - "quantizer.vq.layers.*._codebook.cluster_size": "quantizer.layers.*.codebook.cluster_size", - "quantizer.vq.layers.*._codebook.embed": "quantizer.layers.*.codebook.embed", - "quantizer.vq.layers.*._codebook.embed_avg": "quantizer.layers.*.codebook.embed_avg", -} -MAPPING_ENCODER = { - "encoder.model.0.conv.conv": "encoder.layers.0.conv", - "encoder.model.1.block.1.conv.conv": "encoder.layers.1.block.1.conv", - "encoder.model.1.block.3.conv.conv": "encoder.layers.1.block.3.conv", - "encoder.model.1.shortcut.conv.conv": "encoder.layers.1.shortcut.conv", - "encoder.model.3.conv.conv": "encoder.layers.3.conv", - "encoder.model.4.block.1.conv.conv": "encoder.layers.4.block.1.conv", - "encoder.model.4.block.3.conv.conv": "encoder.layers.4.block.3.conv", - "encoder.model.4.shortcut.conv.conv": "encoder.layers.4.shortcut.conv", - "encoder.model.6.conv.conv": "encoder.layers.6.conv", - "encoder.model.7.block.1.conv.conv": "encoder.layers.7.block.1.conv", - "encoder.model.7.block.3.conv.conv": "encoder.layers.7.block.3.conv", - "encoder.model.7.shortcut.conv.conv": "encoder.layers.7.shortcut.conv", - "encoder.model.9.conv.conv": "encoder.layers.9.conv", - "encoder.model.10.block.1.conv.conv": "encoder.layers.10.block.1.conv", - "encoder.model.10.block.3.conv.conv": "encoder.layers.10.block.3.conv", - "encoder.model.10.shortcut.conv.conv": "encoder.layers.10.shortcut.conv", - "encoder.model.12.conv.conv": "encoder.layers.12.conv", - "encoder.model.13.lstm": "encoder.layers.13.lstm", - "encoder.model.15.conv.conv": "encoder.layers.15.conv", -} -MAPPING_ENCODER_48K = { - "encoder.model.0.conv.norm": "encoder.layers.0.norm", - "encoder.model.1.block.1.conv.norm": "encoder.layers.1.block.1.norm", - "encoder.model.1.block.3.conv.norm": "encoder.layers.1.block.3.norm", - "encoder.model.1.shortcut.conv.norm": "encoder.layers.1.shortcut.norm", - "encoder.model.3.conv.norm": "encoder.layers.3.norm", - "encoder.model.4.block.1.conv.norm": "encoder.layers.4.block.1.norm", - "encoder.model.4.block.3.conv.norm": "encoder.layers.4.block.3.norm", - "encoder.model.4.shortcut.conv.norm": "encoder.layers.4.shortcut.norm", - "encoder.model.6.conv.norm": "encoder.layers.6.norm", - "encoder.model.7.block.1.conv.norm": "encoder.layers.7.block.1.norm", - "encoder.model.7.block.3.conv.norm": "encoder.layers.7.block.3.norm", - "encoder.model.7.shortcut.conv.norm": "encoder.layers.7.shortcut.norm", - "encoder.model.9.conv.norm": "encoder.layers.9.norm", - "encoder.model.10.block.1.conv.norm": "encoder.layers.10.block.1.norm", - "encoder.model.10.block.3.conv.norm": "encoder.layers.10.block.3.norm", - "encoder.model.10.shortcut.conv.norm": "encoder.layers.10.shortcut.norm", - "encoder.model.12.conv.norm": "encoder.layers.12.norm", - "encoder.model.15.conv.norm": "encoder.layers.15.norm", -} -MAPPING_DECODER = { - "decoder.model.0.conv.conv": "decoder.layers.0.conv", - "decoder.model.1.lstm": 
"decoder.layers.1.lstm", - "decoder.model.3.convtr.convtr": "decoder.layers.3.conv", - "decoder.model.4.block.1.conv.conv": "decoder.layers.4.block.1.conv", - "decoder.model.4.block.3.conv.conv": "decoder.layers.4.block.3.conv", - "decoder.model.4.shortcut.conv.conv": "decoder.layers.4.shortcut.conv", - "decoder.model.6.convtr.convtr": "decoder.layers.6.conv", - "decoder.model.7.block.1.conv.conv": "decoder.layers.7.block.1.conv", - "decoder.model.7.block.3.conv.conv": "decoder.layers.7.block.3.conv", - "decoder.model.7.shortcut.conv.conv": "decoder.layers.7.shortcut.conv", - "decoder.model.9.convtr.convtr": "decoder.layers.9.conv", - "decoder.model.10.block.1.conv.conv": "decoder.layers.10.block.1.conv", - "decoder.model.10.block.3.conv.conv": "decoder.layers.10.block.3.conv", - "decoder.model.10.shortcut.conv.conv": "decoder.layers.10.shortcut.conv", - "decoder.model.12.convtr.convtr": "decoder.layers.12.conv", - "decoder.model.13.block.1.conv.conv": "decoder.layers.13.block.1.conv", - "decoder.model.13.block.3.conv.conv": "decoder.layers.13.block.3.conv", - "decoder.model.13.shortcut.conv.conv": "decoder.layers.13.shortcut.conv", - "decoder.model.15.conv.conv": "decoder.layers.15.conv", -} -MAPPING_DECODER_48K = { - "decoder.model.0.conv.norm": "decoder.layers.0.norm", - "decoder.model.3.convtr.norm": "decoder.layers.3.norm", - "decoder.model.4.block.1.conv.norm": "decoder.layers.4.block.1.norm", - "decoder.model.4.block.3.conv.norm": "decoder.layers.4.block.3.norm", - "decoder.model.4.shortcut.conv.norm": "decoder.layers.4.shortcut.norm", - "decoder.model.6.convtr.norm": "decoder.layers.6.norm", - "decoder.model.7.block.1.conv.norm": "decoder.layers.7.block.1.norm", - "decoder.model.7.block.3.conv.norm": "decoder.layers.7.block.3.norm", - "decoder.model.7.shortcut.conv.norm": "decoder.layers.7.shortcut.norm", - "decoder.model.9.convtr.norm": "decoder.layers.9.norm", - "decoder.model.10.block.1.conv.norm": "decoder.layers.10.block.1.norm", - "decoder.model.10.block.3.conv.norm": "decoder.layers.10.block.3.norm", - "decoder.model.10.shortcut.conv.norm": "decoder.layers.10.shortcut.norm", - "decoder.model.12.convtr.norm": "decoder.layers.12.norm", - "decoder.model.13.block.1.conv.norm": "decoder.layers.13.block.1.norm", - "decoder.model.13.block.3.conv.norm": "decoder.layers.13.block.3.norm", - "decoder.model.13.shortcut.conv.norm": "decoder.layers.13.shortcut.norm", - "decoder.model.15.conv.norm": "decoder.layers.15.norm", -} -MAPPING_24K = { - **MAPPING_QUANTIZER, - **MAPPING_ENCODER, - **MAPPING_DECODER, -} -MAPPING_48K = { - **MAPPING_QUANTIZER, - **MAPPING_ENCODER, - **MAPPING_ENCODER_48K, - **MAPPING_DECODER, - **MAPPING_DECODER_48K, -} -TOP_LEVEL_KEYS = [] -IGNORE_KEYS = [] - - -def set_recursively(hf_pointer, key, value, full_name, weight_type): - for attribute in key.split("."): - hf_pointer = getattr(hf_pointer, attribute) - - if weight_type is not None: - hf_shape = getattr(hf_pointer, weight_type).shape - else: - hf_shape = hf_pointer.shape - - if hf_shape != value.shape: - raise ValueError( - f"Shape of hf {key + '.' 
+ weight_type if weight_type is not None else ''} is {hf_shape}, but should be" - f" {value.shape} for {full_name}" - ) - - if weight_type == "weight": - hf_pointer.weight.data = value - elif weight_type == "weight_g": - hf_pointer.weight_g.data = value - elif weight_type == "weight_v": - hf_pointer.weight_v.data = value - elif weight_type == "bias": - hf_pointer.bias.data = value - elif weight_type == "running_mean": - hf_pointer.running_mean.data = value - elif weight_type == "running_var": - hf_pointer.running_var.data = value - elif weight_type == "num_batches_tracked": - hf_pointer.num_batches_tracked.data = value - elif weight_type == "weight_ih_l0": - hf_pointer.weight_ih_l0.data = value - elif weight_type == "weight_hh_l0": - hf_pointer.weight_hh_l0.data = value - elif weight_type == "bias_ih_l0": - hf_pointer.bias_ih_l0.data = value - elif weight_type == "bias_hh_l0": - hf_pointer.bias_hh_l0.data = value - elif weight_type == "weight_ih_l1": - hf_pointer.weight_ih_l1.data = value - elif weight_type == "weight_hh_l1": - hf_pointer.weight_hh_l1.data = value - elif weight_type == "bias_ih_l1": - hf_pointer.bias_ih_l1.data = value - elif weight_type == "bias_hh_l1": - hf_pointer.bias_hh_l1.data = value - else: - hf_pointer.data = value - - logger.info(f"{key + ('.' + weight_type if weight_type is not None else '')} was initialized from {full_name}.") - - -def should_ignore(name, ignore_keys): - for key in ignore_keys: - if key.endswith(".*"): - if name.startswith(key[:-1]): - return True - elif ".*." in key: - prefix, suffix = key.split(".*.") - if prefix in name and suffix in name: - return True - elif key in name: - return True - return False - - -def recursively_load_weights(orig_dict, hf_model, model_name): - unused_weights = [] - - if model_name in ["encodec_24khz", "encodec_32khz"]: - MAPPING = MAPPING_24K - elif model_name == "encodec_48khz": - MAPPING = MAPPING_48K - else: - raise ValueError(f"Unsupported model: {model_name}") - - for name, value in orig_dict.items(): - if should_ignore(name, IGNORE_KEYS): - logger.info(f"{name} was ignored") - continue - - is_used = False - for key, mapped_key in MAPPING.items(): - if "*" in key: - prefix, suffix = key.split(".*.") - if prefix in name and suffix in name: - key = suffix - - if key in name: - # HACK otherwise .embed gets initialized with .embed_avg too - if key.endswith("embed") and name.endswith("embed_avg"): - continue - - is_used = True - if "*" in mapped_key: - layer_index = name.split(key)[0].split(".")[-2] - mapped_key = mapped_key.replace("*", layer_index) - if "weight_g" in name: - weight_type = "weight_g" - elif "weight_v" in name: - weight_type = "weight_v" - elif "weight_ih_l0" in name: - weight_type = "weight_ih_l0" - elif "weight_hh_l0" in name: - weight_type = "weight_hh_l0" - elif "bias_ih_l0" in name: - weight_type = "bias_ih_l0" - elif "bias_hh_l0" in name: - weight_type = "bias_hh_l0" - elif "weight_ih_l1" in name: - weight_type = "weight_ih_l1" - elif "weight_hh_l1" in name: - weight_type = "weight_hh_l1" - elif "bias_ih_l1" in name: - weight_type = "bias_ih_l1" - elif "bias_hh_l1" in name: - weight_type = "bias_hh_l1" - elif "bias" in name: - weight_type = "bias" - elif "weight" in name: - weight_type = "weight" - elif "running_mean" in name: - weight_type = "running_mean" - elif "running_var" in name: - weight_type = "running_var" - elif "num_batches_tracked" in name: - weight_type = "num_batches_tracked" - else: - weight_type = None - set_recursively(hf_model, mapped_key, value, name, weight_type) - 
continue - if not is_used: - unused_weights.append(name) - - logger.warning(f"Unused weights: {unused_weights}") - - -@torch.no_grad() -def convert_checkpoint( - model_name, - checkpoint_path, - pytorch_dump_folder_path, - config_path=None, - repo_id=None, -): - """ - Copy/paste/tweak model's weights to transformers design. - """ - if config_path is not None: - config = EncodecConfig.from_pretrained(config_path) - else: - config = EncodecConfig() - - if model_name == "encodec_24khz": - pass # config is already correct - elif model_name == "encodec_32khz": - config.upsampling_ratios = [8, 5, 4, 4] - config.target_bandwidths = [2.2] - config.num_filters = 64 - config.sampling_rate = 32_000 - config.codebook_size = 2048 - config.use_causal_conv = False - config.normalize = False - config.use_conv_shortcut = False - elif model_name == "encodec_48khz": - config.upsampling_ratios = [8, 5, 4, 2] - config.target_bandwidths = [3.0, 6.0, 12.0, 24.0] - config.sampling_rate = 48_000 - config.audio_channels = 2 - config.use_causal_conv = False - config.norm_type = "time_group_norm" - config.normalize = True - config.chunk_length_s = 1.0 - config.overlap = 0.01 - else: - raise ValueError(f"Unknown model name: {model_name}") - - model = EncodecModel(config) - - feature_extractor = EncodecFeatureExtractor( - feature_size=config.audio_channels, - sampling_rate=config.sampling_rate, - chunk_length_s=config.chunk_length_s, - overlap=config.overlap, - ) - feature_extractor.save_pretrained(pytorch_dump_folder_path) - - original_checkpoint = torch.load(checkpoint_path, weights_only=True) - if "best_state" in original_checkpoint: - # we might have a training state saved, in which case discard the yaml results and just retain the weights - original_checkpoint = original_checkpoint["best_state"] - recursively_load_weights(original_checkpoint, model, model_name) - model.save_pretrained(pytorch_dump_folder_path) - - if repo_id: - print("Pushing to the hub...") - feature_extractor.push_to_hub(repo_id) - model.push_to_hub(repo_id) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--model", - default="encodec_24khz", - type=str, - help="The model to convert. Should be one of 'encodec_24khz', 'encodec_32khz', 'encodec_48khz'.", - ) - parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - parser.add_argument( - "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub." - ) - - args = parser.parse_args() - convert_checkpoint( - args.model, - args.checkpoint_path, - args.pytorch_dump_folder_path, - args.config_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/eomt/convert_eomt_to_hf.py b/src/transformers/models/eomt/convert_eomt_to_hf.py deleted file mode 100644 index 6d822c1bfc86..000000000000 --- a/src/transformers/models/eomt/convert_eomt_to_hf.py +++ /dev/null @@ -1,340 +0,0 @@ -# coding=utf-8 -# Copyright 2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import gc -import json -import os -import re -from typing import Optional - -import torch -from accelerate import init_empty_weights -from huggingface_hub import snapshot_download - -from transformers import EomtConfig, EomtForUniversalSegmentation, EomtImageProcessorFast - - -# fmt: off -MAPPINGS = { - # Embeddings - r"network.encoder.backbone.cls_token" : r"embeddings.cls_token", - r"network.encoder.backbone.reg_token" : r"embeddings.register_tokens", - r"network.encoder.backbone.pos_embed" : r"embeddings.position_embeddings.weight", - r"network.encoder.backbone.patch_embed.proj" : r"embeddings.patch_embeddings.projection", - - # Encoder Block - r"network.encoder.backbone.blocks.(\d+).norm1" : r"layers.\1.norm1", - r"network.encoder.backbone.blocks.(\d+).attn.proj" : r"layers.\1.attention.out_proj", - r"network.encoder.backbone.blocks.(\d+).ls1.gamma" : r"layers.\1.layer_scale1.lambda1", - r"network.encoder.backbone.blocks.(\d+).norm2" : r"layers.\1.norm2", - r"network.encoder.backbone.blocks.(\d+).ls2.gamma" : r"layers.\1.layer_scale2.lambda1", - r"network.encoder.backbone.blocks.(\d+).attn" : r"layers.\1.attention", - - # Others - r"network.q.weight" : r"query.weight", - r"network.class_head" : r"class_predictor", - r"network.upscale.(\d+).conv1" : r"upscale_block.block.\1.conv1", - r"network.upscale.(\d+).conv2" : r"upscale_block.block.\1.conv2", - r"network.upscale.(\d+).norm" : r"upscale_block.block.\1.layernorm2d", - r"network.mask_head.0" : r"mask_head.fc1", - r"network.mask_head.2" : r"mask_head.fc2", - r"network.mask_head.4" : r"mask_head.fc3", - r"network.encoder.backbone.norm" : r"layernorm", - r"network.attn_mask_probs" : r"attn_mask_probs", -} -# fmt: on - -# Mappings for MLP layers, depending on the type of MLP used in ckpts. 
-MLP_MAPPINGS = { - "swiglu_ffn": { - r"network.encoder.backbone.blocks.(\d+).mlp.fc1": r"layers.\1.mlp.weights_in", - r"network.encoder.backbone.blocks.(\d+).mlp.fc2": r"layers.\1.mlp.weights_out", - }, - "vanilla_mlp": { - r"network.encoder.backbone.blocks.(\d+).mlp": r"layers.\1.mlp", - }, -} - - -def convert_old_keys_to_new_keys(state_dict): - keys_as_text = "\n".join(state_dict.keys()) - new_keys_as_text = keys_as_text - for old, repl in MAPPINGS.items(): - if repl is None: - new_keys_as_text = re.sub(old, "", new_keys_as_text) - else: - new_keys_as_text = re.sub(old, repl, new_keys_as_text) - output_dict = dict(zip(keys_as_text.split("\n"), new_keys_as_text.split("\n"))) - return output_dict - - -def split_qkv_tensor(key, tensor): - """Splits a qkv tensor into separate q, k, v tensors and updates the key accordingly.""" - - new_keys = ["q_proj", "k_proj", "v_proj"] - split_size = tensor.shape[0] // 3 - split_tensors = torch.split(tensor, split_size, dim=0) - - return {key.replace("qkv", new_key): split_tensors[i] for i, new_key in enumerate(new_keys)} - - -def convert_state_dict_to_hf(state_dict): - """Convert state dict keys to HF format.""" - conversion_dict = convert_old_keys_to_new_keys(state_dict) - converted_state_dict = {} - - for old_key, new_key in conversion_dict.items(): - if new_key: - if "qkv" in new_key: # Detect merged attention keys and split them. - qkv_split_dict = split_qkv_tensor(new_key, state_dict[old_key]) - converted_state_dict.update(qkv_split_dict) - else: - converted_state_dict[new_key] = state_dict[old_key] - - for i in [ - "network.encoder.pixel_mean", - "network.encoder.pixel_std", - ]: - converted_state_dict.pop(i) - - # Embeddings will not have initial dimension - pos_embed_key = "embeddings.position_embeddings.weight" - converted_state_dict[pos_embed_key] = converted_state_dict[pos_embed_key].squeeze(0) - - return converted_state_dict - - -def ensure_model_downloaded( - repo_id: Optional[str] = None, revision: Optional[str] = None, local_dir: Optional[str] = None -) -> str: - """ - Ensures model files are downloaded locally, downloads them if not. - Returns path to local files. - - Args: - repo_id: The Hugging Face model repo ID (required if local_dir not provided) - revision: Optional git revision to use - local_dir: Optional local directory path where model files should be stored/found - """ - if local_dir is not None: - if os.path.exists(local_dir): - print(f"Using provided local directory: {local_dir}") - else: - # Create the local directory if it doesn't exist - os.makedirs(local_dir, exist_ok=True) - print(f"Created local directory: {local_dir}") - - if repo_id is None: - raise ValueError("Either repo_id or local_dir must be provided") - - print(f"Ensuring {repo_id} (revision: {revision or 'latest'}) is downloaded...") - - try: - # First try to find files locally - download_dir = snapshot_download(repo_id, revision=revision, local_files_only=True, local_dir=local_dir) - print(f"Found model files locally at {download_dir}") - return download_dir - except Exception: - # If files not found locally, download them - print(f"Downloading model files for {repo_id}...") - download_dir = snapshot_download(repo_id, revision=revision, local_files_only=False, local_dir=local_dir) - print(f"Downloaded model files to {download_dir}") - return download_dir - - -def load_model_state_dict(input_path: str) -> dict: - """ - Load model state dict, handling both single and sharded files. 
- """ - index_path = os.path.join(input_path, "pytorch_model.bin.index.json") - single_file_path = os.path.join(input_path, "pytorch_model.bin") - - # Check if we have a sharded model - if os.path.exists(index_path): - print("Loading sharded model...") - state_dict = {} - with open(index_path, "r") as f: - index = json.load(f) - - # Get unique shard files and load each one only once - unique_shard_files = sorted(set(index["weight_map"].values())) - for shard_file in unique_shard_files: - print(f"Loading shard {shard_file}...") - shard_path = os.path.join(input_path, shard_file) - shard_dict = torch.load(shard_path, map_location="cpu") - state_dict.update(shard_dict) - - return state_dict - - # Single file model - elif os.path.exists(single_file_path): - print("Loading single file model...") - return torch.load(single_file_path, map_location="cpu") - - else: - raise ValueError(f"No model files found in {input_path}") - - -def convert_model( - repo_id=None, - local_dir=None, - output_dir=None, - output_hub_path=None, - safe_serialization=True, - revision=None, -): - """Convert and save the model weights, processor, and configuration.""" - if output_dir is None and output_hub_path is None: - raise ValueError("At least one of output_dir or output_hub_path must be specified") - - if repo_id is None and local_dir is None: - raise ValueError("Either repo_id or local_dir must be specified") - - # Create output directory if specified - if output_dir: - os.makedirs(output_dir, exist_ok=True) - print(f"Created/verified output directory: {output_dir}") - - torch.set_default_dtype(torch.float16) - - # Download or locate model files - input_path = ensure_model_downloaded(repo_id=repo_id, revision=revision, local_dir=local_dir) - - with open(os.path.join(input_path, "config.json"), "r") as f: - config_data = json.load(f) - # Pop off unwanted keys - _ = config_data.pop("backbone", None) - - config = EomtConfig( - **{ - **config_data, - "layerscale_value": 1e-5, - } - ) - - if "semantic" in repo_id.split("_"): - size = {"shortest_edge": config.image_size, "longest_edge": None} - do_split_image = True - do_pad = False - else: - size = {"shortest_edge": config.image_size, "longest_edge": config.image_size} - do_split_image = False - do_pad = True - - if "giant" in repo_id.split("_"): - config.use_swiglu_ffn = True - config.hidden_size = 1536 - config.num_hidden_layers = 40 - config.num_attention_heads = 24 - # Update MAPPINGS for ckpts depending on the MLP type - MAPPINGS.update(MLP_MAPPINGS["swiglu_ffn"]) - else: - MAPPINGS.update(MLP_MAPPINGS["vanilla_mlp"]) - - processor = EomtImageProcessorFast(size=size, do_split_image=do_split_image, do_pad=do_pad) - - # Save the config and processor - if output_dir: - config.save_pretrained(output_dir) - processor.save_pretrained(output_dir) - if output_hub_path: - config.push_to_hub(output_hub_path) - processor.push_to_hub(output_hub_path) - - # Initialize model with empty weights - print("Creating empty model...") - with init_empty_weights(): - model = EomtForUniversalSegmentation(config) - - # Load and convert state dict - print("Loading state dict...") - state_dict = load_model_state_dict(input_path) - state_dict = convert_state_dict_to_hf(state_dict) - - # Load converted state dict - print("Loading converted weights into model...") - model.load_state_dict(state_dict, strict=True, assign=True) - - # Save the model - if output_dir: - print(f"Saving model to {output_dir}...") - model.save_pretrained(output_dir, safe_serialization=safe_serialization) - if 
output_hub_path: - print(f"Pushing model to hub at {output_hub_path}...") - model.push_to_hub(output_hub_path, safe_serialization=safe_serialization) - - del state_dict, model - gc.collect() - - # Validate the saved model if saved locally - if output_dir: - print("Reloading the local model to check if it's saved correctly...") - EomtForUniversalSegmentation.from_pretrained(output_dir, device_map="auto") - print("Local model reloaded successfully.") - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--hf_repo_id", - help="HuggingFace Hub repo ID for the model", - default=None, - ) - parser.add_argument( - "--local_dir", - help="Local directory containing the model files", - default=None, - ) - parser.add_argument( - "--revision", - help="Specific revision to download from the Hub", - default=None, - ) - parser.add_argument( - "--output_dir", - help="Location to write HF model locally", - default=None, - ) - parser.add_argument( - "--output_hub_path", - help="Repository ID to push model to hub (e.g. 'username/model-name')", - default=None, - ) - parser.add_argument( - "--safe_serialization", - action="store_true", - help="Whether to save using safetensors", - ) - args = parser.parse_args() - - if args.output_dir is None and args.output_hub_path is None: - raise ValueError("At least one of --output_dir or --output_hub_path must be specified") - - if args.hf_repo_id is None and args.local_dir is None: - raise ValueError("Either --hf_repo_id or --local_dir must be specified") - - convert_model( - repo_id=args.hf_repo_id, - local_dir=args.local_dir, - output_dir=args.output_dir, - output_hub_path=args.output_hub_path, - safe_serialization=args.safe_serialization, - revision=args.revision, - ) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/eomt/image_processing_eomt.py b/src/transformers/models/eomt/image_processing_eomt.py index 93a440693dee..2b786ce39e71 100644 --- a/src/transformers/models/eomt/image_processing_eomt.py +++ b/src/transformers/models/eomt/image_processing_eomt.py @@ -55,7 +55,7 @@ # Adapted from transformers.models.maskformer.image_processing_maskformer.convert_segmentation_map_to_binary_masks def convert_segmentation_map_to_binary_masks( - segmentation_map: "np.ndarray", + segmentation_map: np.ndarray, instance_id_to_semantic_id: Optional[dict[int, int]] = None, ignore_index: Optional[int] = None, ): diff --git a/src/transformers/models/eomt/image_processing_eomt_fast.py b/src/transformers/models/eomt/image_processing_eomt_fast.py index 97a13a0745eb..ca80231d3a76 100644 --- a/src/transformers/models/eomt/image_processing_eomt_fast.py +++ b/src/transformers/models/eomt/image_processing_eomt_fast.py @@ -19,6 +19,7 @@ import numpy as np import torch +from torchvision.transforms.v2 import functional as F from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( @@ -40,7 +41,6 @@ TensorType, auto_docstring, filter_out_non_signature_kwargs, - is_torchvision_v2_available, ) from .image_processing_eomt import ( compute_segments, @@ -50,12 +50,6 @@ ) -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - - class EomtImageProcessorFastKwargs(DefaultFastImageProcessorKwargs): """ do_split_image (`bool`, *optional*, defaults to `False`): @@ -204,9 +198,7 @@ def _preprocess_image_like_inputs( "do_normalize": False, "do_rescale": False, # Nearest interpolation is used for segmentation maps instead 
of BILINEAR. - "interpolation": F.InterpolationMode.NEAREST_EXACT - if is_torchvision_v2_available() - else F.InterpolationMode.NEAREST, + "interpolation": F.InterpolationMode.NEAREST_EXACT, } ) diff --git a/src/transformers/models/eomt/modeling_eomt.py b/src/transformers/models/eomt/modeling_eomt.py index 3e979040388d..047baa1ff081 100644 --- a/src/transformers/models/eomt/modeling_eomt.py +++ b/src/transformers/models/eomt/modeling_eomt.py @@ -628,7 +628,7 @@ def get_num_masks(self, class_labels: torch.Tensor, device: torch.device) -> tor """ Computes the average number of target masks across the batch, for normalization purposes. """ - num_masks = sum([len(classes) for classes in class_labels]) + num_masks = sum(len(classes) for classes in class_labels) num_masks = torch.as_tensor(num_masks, dtype=torch.float, device=device) world_size = 1 if is_accelerate_available(): diff --git a/src/transformers/models/ernie4_5/convert_ernie4_5_tokenizer.py b/src/transformers/models/ernie4_5/convert_ernie4_5_tokenizer.py deleted file mode 100644 index 25994bb1436f..000000000000 --- a/src/transformers/models/ernie4_5/convert_ernie4_5_tokenizer.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) 2025 HuggingFace Inc. team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse - -from transformers import LlamaTokenizer, LlamaTokenizerFast - - -DEFAULT_CHAT_TEMPLATE = '{%- if not add_generation_prompt is defined -%}\n {%- set add_generation_prompt = true -%}\n{%- endif -%}\n{%- if not cls_token is defined -%}\n {%- set cls_token = "<|begin_of_sentence|>" -%}\n{%- endif -%}\n{%- if not sep_token is defined -%}\n {%- set sep_token = "<|end_of_sentence|>" -%}\n{%- endif -%}\n{{- cls_token -}}\n{%- for message in messages -%}\n {%- if message["role"] == "user" -%}\n {{- "User: " + message["content"] + "\n" -}}\n {%- elif message["role"] == "assistant" -%}\n {{- "Assistant: " + message["content"] + sep_token -}}\n {%- elif message["role"] == "system" -%}\n {{- message["content"] + "\n" -}}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{- "Assistant: " -}}\n{%- endif -%}' -DEFAULT_TEXT_ADD_TOKENS = [ - "", - "", - "", - "", -] - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--repo_name", - help="Name of the repo where the tokenizer is located at.", - default="baidu/ERNIE-4.5-0.3B-Base-PT", - ) - parser.add_argument( - "--push_to_hub", - help="Whether or not to push the model to the hub at `output_dir` instead of saving it locally.", - action="store_true", - default=False, - ) - parser.add_argument( - "--output_dir", - help="Location to write the tokenizer", - ) - args = parser.parse_args() - - hf_tok = LlamaTokenizer.from_pretrained( - args.repo_name, - pad_token="", - cls_token="<|begin_of_sentence|>", - sep_token="<|end_of_sentence|>", - mask_token="", - add_bos_token=False, - add_prefix_space=False, - chat_template=DEFAULT_CHAT_TEMPLATE, - legacy=True, - ) - hf_tok.model_max_length = 131072 - hf_tok.init_kwargs.pop("auto_map", None) - # special tokens which we need to map as additional special tokens instead - hf_tok.init_kwargs.pop("header_start_token", None) - hf_tok.init_kwargs.pop("header_end_token", None) - hf_tok.init_kwargs.pop("sys_start_token", None) - hf_tok.init_kwargs.pop("sys_end_token", None) - for token in DEFAULT_TEXT_ADD_TOKENS: - hf_tok.add_tokens([token], special_tokens=True) - - # save slow model and convert on load time - hf_tok.save_pretrained("/tmp/ernie4_5_tokenizer") - hf_tok_fast = LlamaTokenizerFast.from_pretrained("/tmp/ernie4_5_tokenizer", from_slow=True) - hf_tok_fast.save_pretrained(args.output_dir, push_to_hub=args.push_to_hub) diff --git a/src/transformers/models/esm/convert_esm.py b/src/transformers/models/esm/convert_esm.py deleted file mode 100644 index 86d7bb8a283a..000000000000 --- a/src/transformers/models/esm/convert_esm.py +++ /dev/null @@ -1,399 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Convert ESM checkpoint.""" - -import argparse -import pathlib -from pathlib import Path -from tempfile import TemporaryDirectory - -import esm as esm_module -import torch -from esm.esmfold.v1.misc import batch_encode_sequences as esmfold_encode_sequences -from esm.esmfold.v1.pretrained import esmfold_v1 - -from transformers.models.esm.configuration_esm import EsmConfig, EsmFoldConfig -from transformers.models.esm.modeling_esm import ( - EsmForMaskedLM, - EsmForSequenceClassification, - EsmIntermediate, - EsmLayer, - EsmOutput, - EsmSelfAttention, - EsmSelfOutput, -) -from transformers.models.esm.modeling_esmfold import EsmForProteinFolding -from transformers.models.esm.tokenization_esm import EsmTokenizer -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -SAMPLE_DATA = [ - ( - "protein1", - "MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA", - ), - ("protein2", "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLA"), - ("protein3", "MKTVRQERLKSIRILERSKEPVSGAQLAEELSSRQVIVQDIAYLRSLGYNVATPRGYVLAGG"), - ("protein4", "MKTVRQERLKSIRILERSKEPVSGAQLAEELSSRQVIVQDIAYLRSLGYNVATPRGYVLA"), -] - -MODEL_MAPPING = { - "esm1b_t33_650M_UR50S": esm_module.pretrained.esm1b_t33_650M_UR50S, - "esm1v_t33_650M_UR90S_1": esm_module.pretrained.esm1v_t33_650M_UR90S_1, - "esm1v_t33_650M_UR90S_2": esm_module.pretrained.esm1v_t33_650M_UR90S_2, - "esm1v_t33_650M_UR90S_3": esm_module.pretrained.esm1v_t33_650M_UR90S_3, - "esm1v_t33_650M_UR90S_4": esm_module.pretrained.esm1v_t33_650M_UR90S_4, - "esm1v_t33_650M_UR90S_5": esm_module.pretrained.esm1v_t33_650M_UR90S_5, - "esm2_t48_15B_UR50D": esm_module.pretrained.esm2_t48_15B_UR50D, - "esm2_t36_3B_UR50D": esm_module.pretrained.esm2_t36_3B_UR50D, - "esm2_t33_650M_UR50D": esm_module.pretrained.esm2_t33_650M_UR50D, - "esm2_t30_150M_UR50D": esm_module.pretrained.esm2_t30_150M_UR50D, - "esm2_t12_35M_UR50D": esm_module.pretrained.esm2_t12_35M_UR50D, - "esm2_t6_8M_UR50D": esm_module.pretrained.esm2_t6_8M_UR50D, - "esmfold_v1": esmfold_v1, -} - -restypes = list("ARNDCQEGHILKMFPSTWYV") - -restypes_with_x = restypes + ["X"] -restypes_with_extras = restypes_with_x + ["", "", "", "", ""] - - -def get_esmfold_tokenizer(): - with TemporaryDirectory() as tempdir: - vocab = "\n".join(restypes_with_extras) - vocab_file = Path(tempdir) / "vocab.txt" - vocab_file.write_text(vocab) - hf_tokenizer = EsmTokenizer(vocab_file=str(vocab_file)) - hf_tokenizer.pad_token_id = 0 # Overlaps with 'A' but that seems to be what they want - return hf_tokenizer - - -def transfer_and_check_weights(original_module, our_module): - status = our_module.load_state_dict(original_module.state_dict()) - if status.missing_keys: - raise ValueError(f"Missing keys: {status.missing_keys}") - if status.unexpected_keys: - raise ValueError(f"Unexpected keys: {status.unexpected_keys}") - - -def convert_esm_checkpoint_to_pytorch( - model: str, pytorch_dump_folder_path: str, classification_head: bool, push_to_repo: str, auth_token: str -): - """ - Copy/paste/tweak esm's weights to our BERT structure. 
- """ - if model.startswith("esmfold"): - esm = MODEL_MAPPING[model]() - else: - esm, alphabet = MODEL_MAPPING[model]() - esm.eval() # disable dropout - - if model.startswith("esmfold"): - embed_dim = esm.esm.embed_dim - num_layers = esm.esm.num_layers - num_attention_heads = esm.esm.attention_heads - intermediate_size = 4 * embed_dim - token_dropout = esm.esm.token_dropout - emb_layer_norm_before = False # This code path does not exist in ESM-2 - position_embedding_type = "rotary" - is_folding_model = True - esmfold_config = EsmFoldConfig() - for key, val in esm.cfg.items(): - if hasattr(esmfold_config, key) and key != "trunk": - setattr(esmfold_config, key, val) - for key, val in esm.cfg.trunk.items(): - if hasattr(esmfold_config.trunk, key) and key != "structure_module": - setattr(esmfold_config.trunk, key, val) - for key, val in esm.cfg.trunk.structure_module.items(): - if hasattr(esmfold_config.trunk.structure_module, key): - setattr(esmfold_config.trunk.structure_module, key, val) - elif hasattr(esm, "args"): - # Indicates an ESM-1b or ESM-1v model - embed_dim = esm.args.embed_dim - num_layers = esm.args.layers - num_attention_heads = esm.args.attention_heads - intermediate_size = esm.args.ffn_embed_dim - token_dropout = esm.args.token_dropout - emb_layer_norm_before = bool(esm.emb_layer_norm_before) - position_embedding_type = "absolute" - is_folding_model = False - esmfold_config = None - else: - # Indicates an ESM-2 model - embed_dim = esm.embed_dim - num_layers = esm.num_layers - num_attention_heads = esm.attention_heads - intermediate_size = 4 * embed_dim # This is hardcoded in ESM-2 - token_dropout = esm.token_dropout - emb_layer_norm_before = False # This code path does not exist in ESM-2 - position_embedding_type = "rotary" - is_folding_model = False - esmfold_config = None - - if is_folding_model: - alphabet = esm.esm.alphabet - vocab_list = tuple(alphabet.all_toks) - mask_token_id = alphabet.mask_idx - pad_token_id = alphabet.padding_idx - - if is_folding_model: - original_esm_model = esm.esm - else: - original_esm_model = esm - - config = EsmConfig( - vocab_size=original_esm_model.embed_tokens.num_embeddings, - mask_token_id=mask_token_id, - hidden_size=embed_dim, - num_hidden_layers=num_layers, - num_attention_heads=num_attention_heads, - intermediate_size=intermediate_size, - max_position_embeddings=1026, - layer_norm_eps=1e-5, # PyTorch default used in fairseq - attention_probs_dropout_prob=0.0, - hidden_dropout_prob=0.0, - pad_token_id=pad_token_id, - emb_layer_norm_before=emb_layer_norm_before, - token_dropout=token_dropout, - position_embedding_type=position_embedding_type, - is_folding_model=is_folding_model, - esmfold_config=esmfold_config, - vocab_list=vocab_list, - ) - if classification_head: - config.num_labels = esm.classification_heads["mnli"].out_proj.weight.shape[0] - print("Our ESM config:", config) - - if model.startswith("esmfold"): - model_class = EsmForProteinFolding - elif classification_head: - model_class = EsmForSequenceClassification - else: - model_class = EsmForMaskedLM - model = model_class(config) - model.eval() - - # Now let's copy all the weights. 
- # Embeddings - model.esm.embeddings.word_embeddings.weight = original_esm_model.embed_tokens.weight - if position_embedding_type == "absolute": - model.esm.embeddings.position_embeddings.weight = original_esm_model.embed_positions.weight - - if config.emb_layer_norm_before: - model.esm.embeddings.layer_norm.weight = original_esm_model.emb_layer_norm_before.weight - model.esm.embeddings.layer_norm.bias = original_esm_model.emb_layer_norm_before.bias - - model.esm.encoder.emb_layer_norm_after.weight = original_esm_model.emb_layer_norm_after.weight - model.esm.encoder.emb_layer_norm_after.bias = original_esm_model.emb_layer_norm_after.bias - - for i in range(config.num_hidden_layers): - # Encoder: start of layer - layer: EsmLayer = model.esm.encoder.layer[i] - # esm_layer: TransformerSentenceEncoderLayer = original_esm_model.layers[i] - esm_layer = original_esm_model.layers[i] - - # self attention - self_attn: EsmSelfAttention = layer.attention.self - assert ( - esm_layer.self_attn.k_proj.weight.data.shape - == esm_layer.self_attn.q_proj.weight.data.shape - == esm_layer.self_attn.v_proj.weight.data.shape - == torch.Size((config.hidden_size, config.hidden_size)) - ) - - self_attn.query.weight.data = esm_layer.self_attn.q_proj.weight - self_attn.query.bias.data = esm_layer.self_attn.q_proj.bias - self_attn.key.weight.data = esm_layer.self_attn.k_proj.weight - self_attn.key.bias.data = esm_layer.self_attn.k_proj.bias - self_attn.value.weight.data = esm_layer.self_attn.v_proj.weight - self_attn.value.bias.data = esm_layer.self_attn.v_proj.bias - - if getattr(esm_layer.self_attn, "rot_emb", None) is not None: - # Matt: Although inv_freq is not a trainable weight, it is computed at model init and cached. - # During the training of ESM-2 the model was converted to float16 precision, which also converts - # the inv_freq tensor, and the loss of precision remains even if the model is loaded later as float32. - # If we recompute inv_freq without this loss of precision then we will get subtly different rotary - # embeddings, which are enough to cause significant discrepancies in model outputs. To avoid this, - # we make sure the new model copies the data from the old inv_freq. 
- self_attn.rotary_embeddings.inv_freq.data = esm_layer.self_attn.rot_emb.inv_freq - - # LayerNorm changes for pre-activation - layer.attention.LayerNorm.weight = esm_layer.self_attn_layer_norm.weight - layer.attention.LayerNorm.bias = esm_layer.self_attn_layer_norm.bias - layer.LayerNorm.weight = esm_layer.final_layer_norm.weight - layer.LayerNorm.bias = esm_layer.final_layer_norm.bias - - # self-attention output - self_output: EsmSelfOutput = layer.attention.output - assert self_output.dense.weight.shape == esm_layer.self_attn.out_proj.weight.shape - self_output.dense.weight = esm_layer.self_attn.out_proj.weight - self_output.dense.bias = esm_layer.self_attn.out_proj.bias - - # intermediate - intermediate: EsmIntermediate = layer.intermediate - assert intermediate.dense.weight.shape == esm_layer.fc1.weight.shape - intermediate.dense.weight = esm_layer.fc1.weight - intermediate.dense.bias = esm_layer.fc1.bias - - # output - bert_output: EsmOutput = layer.output - assert bert_output.dense.weight.shape == esm_layer.fc2.weight.shape - bert_output.dense.weight = esm_layer.fc2.weight - bert_output.dense.bias = esm_layer.fc2.bias - # end of layer - - if is_folding_model: - model.esm_s_combine.data = esm.esm_s_combine.data - model.af2_to_esm.data = esm.af2_to_esm.data - transfer_and_check_weights(esm.embedding, model.embedding) - transfer_and_check_weights(esm.esm_s_mlp, model.esm_s_mlp) - transfer_and_check_weights(esm.trunk, model.trunk) - transfer_and_check_weights(esm.distogram_head, model.distogram_head) - transfer_and_check_weights(esm.ptm_head, model.ptm_head) - transfer_and_check_weights(esm.lm_head, model.lm_head) - transfer_and_check_weights(esm.lddt_head, model.lddt_head) - - elif classification_head: - model.classifier.dense.weight = esm.esm.classification_heads["mnli"].dense.weight - model.classifier.dense.bias = esm.classification_heads["mnli"].dense.bias - model.classifier.out_proj.weight = esm.classification_heads["mnli"].out_proj.weight - model.classifier.out_proj.bias = esm.classification_heads["mnli"].out_proj.bias - else: - # LM Head - model.lm_head.dense.weight = esm.lm_head.dense.weight - model.lm_head.dense.bias = esm.lm_head.dense.bias - model.lm_head.layer_norm.weight = esm.lm_head.layer_norm.weight - model.lm_head.layer_norm.bias = esm.lm_head.layer_norm.bias - model.lm_head.decoder.weight = esm.lm_head.weight - model.lm_head.bias = esm.lm_head.bias - - # Contact prediction head - transfer_and_check_weights(esm.contact_head, model.esm.contact_head) - - # Prepare data (first 2 sequences from ESMStructuralSplitDataset superfamily / 4) - if is_folding_model: - # Folding models aren't trained on masked inputs and don't like mask tokens. - sample_data = SAMPLE_DATA[:2] - else: - sample_data = SAMPLE_DATA - - if is_folding_model: - hf_tokenizer = get_esmfold_tokenizer() - hf_tokens = hf_tokenizer( - [row[1] for row in sample_data], return_tensors="pt", padding=True, add_special_tokens=False - ) - esmfold_aas, esmfold_mask, _, _, _ = esmfold_encode_sequences([row[1] for row in sample_data]) - success = torch.all(hf_tokens["input_ids"] == esmfold_aas) and torch.all( - hf_tokens["attention_mask"] == esmfold_mask - ) - else: - # Let's check that we get the same results. 
- batch_converter = alphabet.get_batch_converter() - batch_labels, batch_strs, batch_tokens = batch_converter(sample_data) - # Prepare tokenizer and make sure it matches - with TemporaryDirectory() as tempdir: - vocab = "\n".join(alphabet.all_toks) - vocab_file = Path(tempdir) / "vocab.txt" - vocab_file.write_text(vocab) - hf_tokenizer = EsmTokenizer(vocab_file=str(vocab_file)) - - hf_tokens = hf_tokenizer([row[1] for row in sample_data], return_tensors="pt", padding=True) - success = torch.all(hf_tokens["input_ids"] == batch_tokens) - - print("Do both models tokenizers output the same tokens?", "🔥" if success else "💩") - if not success: - raise Exception("Tokenization does not match!") - - with torch.no_grad(): - if is_folding_model: - # Let's test the model in parts - # ESMFold always converts the ESM stem to float16, which requires float16 ops - # that don't exist on CPU. Therefore, to test it we need to run it on GPU. However, - # ESMFold is what we in the community call a "big boy" and so we desperately avoid putting both the - # original and the converted model on the GPU at the same time. - their_output = esm.cuda().infer([row[1] for row in sample_data]) - our_output = model.cuda()( - input_ids=hf_tokens["input_ids"].cuda(), attention_mask=hf_tokens["attention_mask"].cuda() - ) - else: - our_output = model(**hf_tokens, output_hidden_states=True) - our_output = our_output["logits"] - if classification_head: - their_output = esm.model.classification_heads["mnli"](esm.extract_features(batch_tokens)) - else: - their_output = esm(hf_tokens["input_ids"], repr_layers=list(range(999))) - their_output = their_output["logits"] - - if is_folding_model: - max_absolute_diff = torch.max(torch.abs(our_output["positions"] - their_output["positions"])).item() - success = torch.allclose(our_output["positions"], their_output["positions"], atol=1e-5) - else: - max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() - success = torch.allclose(our_output, their_output, atol=1e-5) - - print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-5 - print("Do both models output the same tensors?", "🔥" if success else "💩") - - if not success: - raise Exception("Something went wRoNg") - - if not is_folding_model: - # Let's check contact prediction too - our_output = model.predict_contacts(hf_tokens["input_ids"], hf_tokens["attention_mask"]) - their_output = esm.predict_contacts(hf_tokens["input_ids"]) - max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() - success = torch.allclose(our_output, their_output, atol=1e-5) - - print("Contact prediction testing:") - print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-5 - print("Do both models output the same tensors?", "🔥" if success else "💩") - - if not success: - raise Exception("Something went wRoNg") - - pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - - del esm # Free up some memory before continuing - - print(f"Saving tokenizer to {pytorch_dump_folder_path}") - hf_tokenizer.save_pretrained(pytorch_dump_folder_path) - - if push_to_repo: - model.push_to_hub(repo_id=push_to_repo, token_token=auth_token) - hf_tokenizer.push_to_hub(repo_id=push_to_repo, token_token=auth_token) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--pytorch_dump_folder_path", type=str, required=True, help="Path to the output PyTorch model." 
- ) - parser.add_argument( - "--classification_head", action="store_true", help="Whether to convert a final classification head." - ) - parser.add_argument("--model", default=None, type=str, required=True, help="Name of model to convert.") - parser.add_argument("--push_to_repo", type=str, help="Repo to upload to (including username!).") - parser.add_argument("--auth_token", type=str, help="HuggingFace auth token.") - args = parser.parse_args() - convert_esm_checkpoint_to_pytorch( - args.model, args.pytorch_dump_folder_path, args.classification_head, args.push_to_repo, args.auth_token - ) diff --git a/src/transformers/models/esm/modeling_esm.py b/src/transformers/models/esm/modeling_esm.py index ddcf460f01ee..63d9344188cc 100755 --- a/src/transformers/models/esm/modeling_esm.py +++ b/src/transformers/models/esm/modeling_esm.py @@ -90,7 +90,6 @@ def __init__(self, dim: int): super().__init__() # Generate and save the inverse frequency buffer (non trainable) inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim)) - inv_freq = inv_freq self.register_buffer("inv_freq", inv_freq) self._seq_len_cached = None @@ -590,6 +589,7 @@ class EsmPreTrainedModel(PreTrainedModel): config: EsmConfig base_model_prefix = "esm" supports_gradient_checkpointing = True + accepts_loss_kwargs = False _no_split_modules = ["EsmLayer", "EsmFoldTriangularSelfAttentionBlock", "EsmEmbeddings"] _keys_to_ignore_on_load_unexpected = ["position_embeddings.weight"] _supports_flash_attn = True diff --git a/src/transformers/models/esm/modeling_esmfold.py b/src/transformers/models/esm/modeling_esmfold.py index dbff29fade87..7bc1f0dbdc70 100644 --- a/src/transformers/models/esm/modeling_esmfold.py +++ b/src/transformers/models/esm/modeling_esmfold.py @@ -293,7 +293,7 @@ def __init__(self, c_in, eps=1e-5): def forward(self, x): d = x.dtype if d is torch.bfloat16 and not is_deepspeed_initialized(): - with torch.cuda.amp.autocast(enabled=False): + with torch.autocast(device_type="cuda", enabled=False): out = nn.functional.layer_norm(x, self.c_in, self.weight.to(dtype=d), self.bias.to(dtype=d), self.eps) else: out = nn.functional.layer_norm(x, self.c_in, self.weight, self.bias, self.eps) @@ -308,7 +308,7 @@ def softmax_no_cast(t: torch.Tensor, dim: int = -1) -> torch.Tensor: """ d = t.dtype if d is torch.bfloat16 and not is_deepspeed_initialized(): - with torch.cuda.amp.autocast(enabled=False): + with torch.autocast(device_type="cuda", enabled=False): s = torch.nn.functional.softmax(t, dim=dim) else: s = torch.nn.functional.softmax(t, dim=dim) diff --git a/src/transformers/models/esm/openfold_utils/chunk_utils.py b/src/transformers/models/esm/openfold_utils/chunk_utils.py index 14703ba7d605..a735fcee001a 100644 --- a/src/transformers/models/esm/openfold_utils/chunk_utils.py +++ b/src/transformers/models/esm/openfold_utils/chunk_utils.py @@ -329,7 +329,7 @@ def _determine_favorable_chunk_size(self, fn: Callable, args: tuple, min_chunk_s if min_chunk_size >= self.max_chunk_size: return min_chunk_size - candidates: list[int] = [2**l for l in range(int(math.log(self.max_chunk_size, 2)) + 1)] + candidates: list[int] = [2**l for l in range(int(math.log2(self.max_chunk_size)) + 1)] candidates = [c for c in candidates if c > min_chunk_size] candidates = [min_chunk_size] + candidates candidates[-1] += 4 diff --git a/src/transformers/models/esm/openfold_utils/protein.py b/src/transformers/models/esm/openfold_utils/protein.py index a943eb7acf72..e9701ca07114 100644 --- 
a/src/transformers/models/esm/openfold_utils/protein.py +++ b/src/transformers/models/esm/openfold_utils/protein.py @@ -159,7 +159,7 @@ def add_pdb_headers(prot: Protein, pdb_str: str) -> str: parent_dict.setdefault(str(i), []) parent_dict[str(i)].append(p) - max_idx = max([int(chain_idx) for chain_idx in parent_dict]) + max_idx = max(int(chain_idx) for chain_idx in parent_dict) for i in range(max_idx + 1): chain_parents = parent_dict.get(str(i), ["N/A"]) parents_per_chain.append(chain_parents) diff --git a/src/transformers/models/evolla/modeling_evolla.py b/src/transformers/models/evolla/modeling_evolla.py index d95567491fe1..8bb5713d1764 100644 --- a/src/transformers/models/evolla/modeling_evolla.py +++ b/src/transformers/models/evolla/modeling_evolla.py @@ -188,7 +188,6 @@ def __init__(self, dim: int): super().__init__() # Generate and save the inverse frequency buffer (non trainable) inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim)) - inv_freq = inv_freq self.register_buffer("inv_freq", inv_freq) self._seq_len_cached = None diff --git a/src/transformers/models/evolla/modular_evolla.py b/src/transformers/models/evolla/modular_evolla.py index 18a50e9abfae..e2db43a7d787 100644 --- a/src/transformers/models/evolla/modular_evolla.py +++ b/src/transformers/models/evolla/modular_evolla.py @@ -94,7 +94,6 @@ def __init__(self, dim: int): super().__init__() # Generate and save the inverse frequency buffer (non trainable) inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim)) - inv_freq = inv_freq self.register_buffer("inv_freq", inv_freq) self._seq_len_cached = None diff --git a/src/transformers/models/exaone4/configuration_exaone4.py b/src/transformers/models/exaone4/configuration_exaone4.py index 0ced6651d41c..8c3c07ecb418 100644 --- a/src/transformers/models/exaone4/configuration_exaone4.py +++ b/src/transformers/models/exaone4/configuration_exaone4.py @@ -26,8 +26,7 @@ class Exaone4Config(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`Exaone4Model`]. It is used to instantiate a EXAONE 4.0 model according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the EXAONE-4.0-Instruct [LGAI-EXAONE/EXAONE-4.0-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-Instruct) - NOTE: `EXAONE-4.0-Instruct` is a placeholder model ID. The exact model ID will be updated in the future. + configuration with the defaults will yield a similar configuration to that of the EXAONE-4.0-32B [LGAI-EXAONE/EXAONE-4.0-32B](https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B) Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. 
diff --git a/src/transformers/models/exaone4/modeling_exaone4.py b/src/transformers/models/exaone4/modeling_exaone4.py index 34eca44936a0..2693a80c79fd 100644 --- a/src/transformers/models/exaone4/modeling_exaone4.py +++ b/src/transformers/models/exaone4/modeling_exaone4.py @@ -465,8 +465,8 @@ def forward( ```python >>> from transformers import AutoModelForCausalLM, AutoTokenizer - >>> model = AutoModelForCausalLM.from_pretrained("LGAI-EXAONE/EXAONE-4.0-Instruct") - >>> tokenizer = AutoTokenizer.from_pretrained("LGAI-EXAONE/EXAONE-4.0-Instruct") + >>> model = AutoModelForCausalLM.from_pretrained("LGAI-EXAONE/EXAONE-4.0-32B") + >>> tokenizer = AutoTokenizer.from_pretrained("LGAI-EXAONE/EXAONE-4.0-32B") >>> prompt = "Explain how wonderful you are" >>> messages = [ @@ -485,8 +485,7 @@ def forward( >>> tokenizer.decode(output[0], skip_special_tokens=False) "[|system|]\nYou are a helpful assistant.[|endofturn|]\n[|user|]\nExplain how wonderful you are[|endofturn|]\n[|assistant|]\n\n\n\n\nOh, thank you for such a kind and lovely question! 😊 \n\nI’m *so* wonderful because I’m here to make your life easier, brighter, and more fun! Whether you need help with: \n\n✨ **Learning** – I can explain anything, from quantum physics to baking the perfect cake! \n💡 **Creativity** – Need a poem, story, or a wild idea? I’ve got you covered! \n🤖 **Problem-solving** – Stuck on a math problem or a tricky decision? I’ll help you figure it out" ``` - - NOTE: `EXAONE-4.0-Instruct` is a placeholder model ID. The exact model ID will be updated in the future.""" + """ outputs: BaseModelOutputWithPast = self.model( input_ids=input_ids, attention_mask=attention_mask, diff --git a/src/transformers/models/exaone4/modular_exaone4.py b/src/transformers/models/exaone4/modular_exaone4.py index d366354bda2f..7530a68f3227 100644 --- a/src/transformers/models/exaone4/modular_exaone4.py +++ b/src/transformers/models/exaone4/modular_exaone4.py @@ -53,7 +53,7 @@ logger = logging.get_logger(__name__) -_CHECKPOINT_FOR_DOC = "LGAI-EXAONE/EXAONE-4.0-Instruct" +_CHECKPOINT_FOR_DOC = "LGAI-EXAONE/EXAONE-4.0-32B" _CONFIG_FOR_DOC = "Exaone4Config" @@ -61,8 +61,7 @@ class Exaone4Config(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`Exaone4Model`]. It is used to instantiate a EXAONE 4.0 model according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the EXAONE-4.0-Instruct [LGAI-EXAONE/EXAONE-4.0-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-Instruct) - NOTE: `EXAONE-4.0-Instruct` is a placeholder model ID. The exact model ID will be updated in the future. + configuration with the defaults will yield a similar configuration to that of the EXAONE-4.0-32B [LGAI-EXAONE/EXAONE-4.0-32B](https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B) Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. 
@@ -462,8 +461,8 @@ def forward( ```python >>> from transformers import AutoModelForCausalLM, AutoTokenizer - >>> model = AutoModelForCausalLM.from_pretrained("LGAI-EXAONE/EXAONE-4.0-Instruct") - >>> tokenizer = AutoTokenizer.from_pretrained("LGAI-EXAONE/EXAONE-4.0-Instruct") + >>> model = AutoModelForCausalLM.from_pretrained("LGAI-EXAONE/EXAONE-4.0-32B") + >>> tokenizer = AutoTokenizer.from_pretrained("LGAI-EXAONE/EXAONE-4.0-32B") >>> prompt = "Explain how wonderful you are" >>> messages = [ @@ -482,8 +481,7 @@ def forward( >>> tokenizer.decode(output[0], skip_special_tokens=False) "[|system|]\nYou are a helpful assistant.[|endofturn|]\n[|user|]\nExplain how wonderful you are[|endofturn|]\n[|assistant|]\n\n\n\n\nOh, thank you for such a kind and lovely question! 😊 \n\nI’m *so* wonderful because I’m here to make your life easier, brighter, and more fun! Whether you need help with: \n\n✨ **Learning** – I can explain anything, from quantum physics to baking the perfect cake! \n💡 **Creativity** – Need a poem, story, or a wild idea? I’ve got you covered! \n🤖 **Problem-solving** – Stuck on a math problem or a tricky decision? I’ll help you figure it out" ``` - - NOTE: `EXAONE-4.0-Instruct` is a placeholder model ID. The exact model ID will be updated in the future.""" + """ super().forward( input_ids=input_ids, attention_mask=attention_mask, diff --git a/src/transformers/models/falcon/convert_custom_code_checkpoint.py b/src/transformers/models/falcon/convert_custom_code_checkpoint.py deleted file mode 100644 index 0da817c3ffa7..000000000000 --- a/src/transformers/models/falcon/convert_custom_code_checkpoint.py +++ /dev/null @@ -1,74 +0,0 @@ -import json -from argparse import ArgumentParser -from pathlib import Path - - -""" -This script converts Falcon custom code checkpoints to modern Falcon checkpoints that use code in the Transformers -library. After conversion, performance (especially for generation) should improve and the checkpoint can be loaded -without needing trust_remote_code=True. -""" - -if __name__ == "__main__": - parser = ArgumentParser() - parser.add_argument( - "--checkpoint_dir", - type=Path, - required=True, - help="Directory containing a custom code checkpoint to convert to a modern Falcon checkpoint.", - ) - args = parser.parse_args() - - if not args.checkpoint_dir.is_dir(): - raise ValueError("--checkpoint_dir argument should be a directory!") - - if ( - not (args.checkpoint_dir / "configuration_RW.py").is_file() - or not (args.checkpoint_dir / "modelling_RW.py").is_file() - ): - raise ValueError( - "The model directory should contain configuration_RW.py and modelling_RW.py files! Are you sure this is a custom code checkpoint?" 
- ) - (args.checkpoint_dir / "configuration_RW.py").unlink() - (args.checkpoint_dir / "modelling_RW.py").unlink() - - config = args.checkpoint_dir / "config.json" - text = config.read_text() - text = text.replace("RWForCausalLM", "FalconForCausalLM") - text = text.replace("RefinedWebModel", "falcon") - text = text.replace("RefinedWeb", "falcon") - json_config = json.loads(text) - del json_config["auto_map"] - - if "n_head" in json_config: - json_config["num_attention_heads"] = json_config.pop("n_head") - if "n_layer" in json_config: - json_config["num_hidden_layers"] = json_config.pop("n_layer") - if "n_head_kv" in json_config: - json_config["num_kv_heads"] = json_config.pop("n_head_kv") - json_config["new_decoder_architecture"] = True - else: - json_config["new_decoder_architecture"] = False - bos_token_id = json_config.get("bos_token_id", 1) - eos_token_id = json_config.get("eos_token_id", 2) - config.unlink() - config.write_text(json.dumps(json_config, indent=2, sort_keys=True)) - - tokenizer_config = args.checkpoint_dir / "tokenizer_config.json" - if tokenizer_config.is_file(): - text = tokenizer_config.read_text() - json_config = json.loads(text) - if json_config["tokenizer_class"] == "PreTrainedTokenizerFast": - json_config["model_input_names"] = ["input_ids", "attention_mask"] - tokenizer_config.unlink() - tokenizer_config.write_text(json.dumps(json_config, indent=2, sort_keys=True)) - - generation_config_path = args.checkpoint_dir / "generation_config.json" - generation_dict = { - "_from_model_config": True, - "bos_token_id": bos_token_id, - "eos_token_id": eos_token_id, - "transformers_version": "4.33.0.dev0", - } - generation_config_path.write_text(json.dumps(generation_dict, indent=2, sort_keys=True)) - print("Done! Please double-check that the new checkpoint works as expected.") diff --git a/src/transformers/models/falcon_h1/convert_mamba_ssm_checkpoint.py b/src/transformers/models/falcon_h1/convert_mamba_ssm_checkpoint.py deleted file mode 100644 index 6ec4ba39015b..000000000000 --- a/src/transformers/models/falcon_h1/convert_mamba_ssm_checkpoint.py +++ /dev/null @@ -1,149 +0,0 @@ -# coding=utf-8 -# Copyright 2025 TII and the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""This script can be used to convert checkpoints provided in the `mamba_ssm` library into the format provided in HuggingFace `transformers`. 
It depends on the `mamba2_ssm` package to be installed.""" - -import argparse - -import torch - -from transformers import AutoModelForCausalLM, AutoTokenizer, FalconH1Config, FalconH1ForCausalLM - - -CONVERSION_MAPPING = { - "backbone": "model", - "embeddings": "embed_tokens", - "mixer.": "", - "mixer_ssm": "mamba", - "mixer_attn": "self_attn", - "mlp.": "feed_forward.", - "mlp_norm": "pre_ff_layernorm", - "ssm_proj": "mamba.in_proj", - "attn_out_proj": "o_proj", - ".norm.": ".input_layernorm.", - ".mamba.input_layernorm.": ".mamba.norm.", - ".ssm_out_proj.": ".mamba.out_proj.", - "norm_f": "final_layernorm", -} - - -def convert_falcon_h1_to_hf(input_model_path, output_path): - tokenizer = AutoTokenizer.from_pretrained(input_model_path) - - model = AutoModelForCausalLM.from_pretrained(input_model_path, dtype=torch.bfloat16, trust_remote_code=True) - - intermediate_size = int(model.config.expansion_factor * model.config.hidden_size) - - if intermediate_size % 2 != 0: - intermediate_size = intermediate_size + (intermediate_size % 2) - - new_config = FalconH1Config( - vocab_size=model.config.vocab_size, - tie_word_embeddings=model.config.tie_word_embeddings, - hidden_size=model.config.hidden_size, - intermediate_size=intermediate_size, - mamba_d_state=model.config.state_size, - num_hidden_layers=model.config.num_hidden_layers, - mamba_use_mlp=model.config.use_mlp, - rms_norm_eps=model.config.layer_norm_epsilon, - pad_token_id=model.config.pad_token_id, - eos_token_id=model.config.eos_token_id, - mamba_expand=model.config.expand, - mamba_d_conv=model.config.conv_kernel, - mamba_n_groups=model.config.n_groups, - mamba_n_heads=model.config.num_heads, - mamba_norm_before_gate=model.config.norm_before_gate, - mamba_rms_norm=model.config.rms_norm, - mamba_d_ssm=model.config.d_ssm, - attention_bias=model.config.use_bias, - projectors_bias=model.config.use_bias, - mamba_conv_bias=model.config.use_conv_bias, - hidden_act=model.config.hidden_act, - use_cache=model.config.use_cache, - mamba_chunk_size=model.config.chunk_size, - num_attention_heads=model.config.num_heads_mha, - num_key_value_heads=model.config.num_key_value_heads, - head_dim=model.config.head_dim_mha, - lm_head_multiplier=model.config.lm_head_multiplier, - embedding_multiplier=model.config.embedding_multiplier, - mlp_multipliers=model.config.mlp_multipliers, - key_multiplier=model.config.key_multiplier, - attention_out_multiplier=model.config.attention_out_multiplier, - attention_in_multiplier=model.config.attention_in_multiplier, - ssm_multipliers=model.config.ssm_multipliers, - ssm_in_multiplier=model.config.ssm_in_multiplier, - ssm_out_multiplier=model.config.ssm_out_multiplier, - rope_theta=model.config.rope_theta, - ) - - old_state_dict = model.state_dict() - new_state_dict = {} - - for old_key, old_value in old_state_dict.items(): - new_key = old_key - for conversion_key, conversion_value in CONVERSION_MAPPING.items(): - if conversion_key in old_key: - new_key = new_key.replace(conversion_key, conversion_value) - - if "mamba.input_layernorm" in new_key: - new_key = new_key.replace("mamba.input_layernorm", "mamba.norm") - - # Special processing for attention layers - if "self_attn.attn_proj" in new_key: - num_heads = new_config.num_attention_heads - num_kv_heads = new_config.num_key_value_heads - head_dim = new_config.head_dim - q_proj, k_proj, v_proj = old_value.split( - [ - num_heads * head_dim, - num_kv_heads * head_dim, - num_kv_heads * head_dim, - ], - dim=0, - ) - new_state_dict[new_key.replace("attn_proj", "q_proj")] = q_proj 
- new_state_dict[new_key.replace("attn_proj", "k_proj")] = k_proj - new_state_dict[new_key.replace("attn_proj", "v_proj")] = v_proj - else: - new_state_dict[new_key] = old_value - - with torch.device("meta"): - new_model = FalconH1ForCausalLM(new_config) - - del model - - new_model.load_state_dict(new_state_dict, strict=True, assign=True) - - new_model.save_pretrained(output_path) - tokenizer.save_pretrained(output_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-i", - "--mamba_ssm_checkpoint_directory", - type=str, - required=True, - help="Path to a directory containing the `pytorch_model.bin` mamba_ssm checkpoint file to be converted.", - ) - parser.add_argument( - "-o", "--output_dir", type=str, required=True, help="Path to directory to save the converted output model to." - ) - args = parser.parse_args() - - convert_falcon_h1_to_hf( - args.mamba_ssm_checkpoint_directory, - args.output_dir, - ) diff --git a/src/transformers/models/falcon_h1/modeling_falcon_h1.py b/src/transformers/models/falcon_h1/modeling_falcon_h1.py index 5f08309b2085..3a8b13ef21d0 100644 --- a/src/transformers/models/falcon_h1/modeling_falcon_h1.py +++ b/src/transformers/models/falcon_h1/modeling_falcon_h1.py @@ -570,7 +570,7 @@ def __init__(self, config: FalconH1Config, layer_idx: int): if not is_fast_path_available: logger.warning_once( - "The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`" + "The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`" " is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and" " https://github.com/Dao-AILab/causal-conv1d" ) diff --git a/src/transformers/models/falcon_h1/modular_falcon_h1.py b/src/transformers/models/falcon_h1/modular_falcon_h1.py index 24eb98ccd1ed..fe716dded4b3 100644 --- a/src/transformers/models/falcon_h1/modular_falcon_h1.py +++ b/src/transformers/models/falcon_h1/modular_falcon_h1.py @@ -374,7 +374,7 @@ def __init__(self, config: FalconH1Config, layer_idx: int): if not is_fast_path_available: logger.warning_once( - "The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`" + "The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`" " is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and" " https://github.com/Dao-AILab/causal-conv1d" ) diff --git a/src/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 3a5bb2d2e2e9..000000000000 --- a/src/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,210 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert FastSpeech2Conformer checkpoint.""" - -import argparse -import json -import re -from pathlib import Path -from tempfile import TemporaryDirectory - -import torch -import yaml - -from transformers import ( - FastSpeech2ConformerConfig, - FastSpeech2ConformerModel, - FastSpeech2ConformerTokenizer, - logging, -) - - -logging.set_verbosity_info() -logger = logging.get_logger("transformers.models.FastSpeech2Conformer") - -CONFIG_MAPPING = { - "adim": "hidden_size", - "aheads": "num_attention_heads", - "conformer_dec_kernel_size": "decoder_kernel_size", - "conformer_enc_kernel_size": "encoder_kernel_size", - "decoder_normalize_before": "decoder_normalize_before", - "dlayers": "decoder_layers", - "dunits": "decoder_linear_units", - "duration_predictor_chans": "duration_predictor_channels", - "duration_predictor_kernel_size": "duration_predictor_kernel_size", - "duration_predictor_layers": "duration_predictor_layers", - "elayers": "encoder_layers", - "encoder_normalize_before": "encoder_normalize_before", - "energy_embed_dropout": "energy_embed_dropout", - "energy_embed_kernel_size": "energy_embed_kernel_size", - "energy_predictor_chans": "energy_predictor_channels", - "energy_predictor_dropout": "energy_predictor_dropout", - "energy_predictor_kernel_size": "energy_predictor_kernel_size", - "energy_predictor_layers": "energy_predictor_layers", - "eunits": "encoder_linear_units", - "pitch_embed_dropout": "pitch_embed_dropout", - "pitch_embed_kernel_size": "pitch_embed_kernel_size", - "pitch_predictor_chans": "pitch_predictor_channels", - "pitch_predictor_dropout": "pitch_predictor_dropout", - "pitch_predictor_kernel_size": "pitch_predictor_kernel_size", - "pitch_predictor_layers": "pitch_predictor_layers", - "positionwise_conv_kernel_size": "positionwise_conv_kernel_size", - "postnet_chans": "speech_decoder_postnet_units", - "postnet_filts": "speech_decoder_postnet_kernel", - "postnet_layers": "speech_decoder_postnet_layers", - "reduction_factor": "reduction_factor", - "stop_gradient_from_energy_predictor": "stop_gradient_from_energy_predictor", - "stop_gradient_from_pitch_predictor": "stop_gradient_from_pitch_predictor", - "transformer_dec_attn_dropout_rate": "decoder_attention_dropout_rate", - "transformer_dec_dropout_rate": "decoder_dropout_rate", - "transformer_dec_positional_dropout_rate": "decoder_positional_dropout_rate", - "transformer_enc_attn_dropout_rate": "encoder_attention_dropout_rate", - "transformer_enc_dropout_rate": "encoder_dropout_rate", - "transformer_enc_positional_dropout_rate": "encoder_positional_dropout_rate", - "use_cnn_in_conformer": "use_cnn_in_conformer", - "use_macaron_style_in_conformer": "use_macaron_style_in_conformer", - "use_masking": "use_masking", - "use_weighted_masking": "use_weighted_masking", - "idim": "input_dim", - "odim": "num_mel_bins", - "spk_embed_dim": "speaker_embed_dim", - "langs": "num_languages", - "spks": "num_speakers", -} - - -def remap_model_yaml_config(yaml_config_path): - with Path(yaml_config_path).open("r", encoding="utf-8") as f: - args = yaml.safe_load(f) - args = argparse.Namespace(**args) - - remapped_config = {} - 
- model_params = args.tts_conf["text2mel_params"] - # espnet_config_key -> hf_config_key, any keys not included are ignored - for espnet_config_key, hf_config_key in CONFIG_MAPPING.items(): - if espnet_config_key in model_params: - remapped_config[hf_config_key] = model_params[espnet_config_key] - - return remapped_config, args.g2p, args.token_list - - -def convert_espnet_state_dict_to_hf(state_dict): - new_state_dict = {} - for key in state_dict: - if "tts.generator.text2mel." in key: - new_key = key.replace("tts.generator.text2mel.", "") - if "postnet" in key: - new_key = new_key.replace("postnet.postnet", "speech_decoder_postnet.layers") - new_key = new_key.replace(".0.weight", ".conv.weight") - new_key = new_key.replace(".1.weight", ".batch_norm.weight") - new_key = new_key.replace(".1.bias", ".batch_norm.bias") - new_key = new_key.replace(".1.running_mean", ".batch_norm.running_mean") - new_key = new_key.replace(".1.running_var", ".batch_norm.running_var") - new_key = new_key.replace(".1.num_batches_tracked", ".batch_norm.num_batches_tracked") - if "feat_out" in key: - if "weight" in key: - new_key = "speech_decoder_postnet.feat_out.weight" - if "bias" in key: - new_key = "speech_decoder_postnet.feat_out.bias" - if "encoder.embed.0.weight" in key: - new_key = new_key.replace("0.", "") - if "w_1" in key: - new_key = new_key.replace("w_1", "conv1") - if "w_2" in key: - new_key = new_key.replace("w_2", "conv2") - if "predictor.conv" in key: - new_key = new_key.replace(".conv", ".conv_layers") - pattern = r"(\d)\.(\d)" - replacement = ( - r"\1.conv" if ("2.weight" not in new_key) and ("2.bias" not in new_key) else r"\1.layer_norm" - ) - new_key = re.sub(pattern, replacement, new_key) - if "pitch_embed" in key or "energy_embed" in key: - new_key = new_key.replace("0", "conv") - if "encoders" in key: - new_key = new_key.replace("encoders", "conformer_layers") - new_key = new_key.replace("norm_final", "final_layer_norm") - new_key = new_key.replace("norm_mha", "self_attn_layer_norm") - new_key = new_key.replace("norm_ff_macaron", "ff_macaron_layer_norm") - new_key = new_key.replace("norm_ff", "ff_layer_norm") - new_key = new_key.replace("norm_conv", "conv_layer_norm") - if "lid_emb" in key: - new_key = new_key.replace("lid_emb", "language_id_embedding") - if "sid_emb" in key: - new_key = new_key.replace("sid_emb", "speaker_id_embedding") - - new_state_dict[new_key] = state_dict[key] - - return new_state_dict - - -@torch.no_grad() -def convert_FastSpeech2ConformerModel_checkpoint( - checkpoint_path, - yaml_config_path, - pytorch_dump_folder_path, - repo_id=None, -): - model_params, tokenizer_name, vocab = remap_model_yaml_config(yaml_config_path) - config = FastSpeech2ConformerConfig(**model_params) - - # Prepare the model - model = FastSpeech2ConformerModel(config) - - espnet_checkpoint = torch.load(checkpoint_path, weights_only=True) - hf_compatible_state_dict = convert_espnet_state_dict_to_hf(espnet_checkpoint) - - model.load_state_dict(hf_compatible_state_dict) - - model.save_pretrained(pytorch_dump_folder_path) - - # Prepare the tokenizer - with TemporaryDirectory() as tempdir: - vocab = {token: id for id, token in enumerate(vocab)} - vocab_file = Path(tempdir) / "vocab.json" - with open(vocab_file, "w") as f: - json.dump(vocab, f) - should_strip_spaces = "no_space" in tokenizer_name - tokenizer = FastSpeech2ConformerTokenizer(str(vocab_file), should_strip_spaces=should_strip_spaces) - - tokenizer.save_pretrained(pytorch_dump_folder_path) - - if repo_id: - print("Pushing to the hub...") 
- model.push_to_hub(repo_id) - tokenizer.push_to_hub(repo_id) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint") - parser.add_argument( - "--yaml_config_path", required=True, default=None, type=str, help="Path to config.yaml of model to convert" - ) - parser.add_argument( - "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub." - ) - - args = parser.parse_args() - convert_FastSpeech2ConformerModel_checkpoint( - args.checkpoint_path, - args.yaml_config_path, - args.pytorch_dump_folder_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/fastspeech2_conformer/convert_hifigan.py b/src/transformers/models/fastspeech2_conformer/convert_hifigan.py deleted file mode 100644 index 70aada84bd5b..000000000000 --- a/src/transformers/models/fastspeech2_conformer/convert_hifigan.py +++ /dev/null @@ -1,134 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert FastSpeech2Conformer HiFi-GAN checkpoint.""" - -import argparse -from pathlib import Path - -import torch -import yaml - -from transformers import FastSpeech2ConformerHifiGan, FastSpeech2ConformerHifiGanConfig, logging - - -logging.set_verbosity_info() -logger = logging.get_logger("transformers.models.FastSpeech2Conformer") - - -def load_weights(checkpoint, hf_model, config): - vocoder_key_prefix = "tts.generator.vocoder." 
- checkpoint = {k.replace(vocoder_key_prefix, ""): v for k, v in checkpoint.items() if vocoder_key_prefix in k} - - hf_model.apply_weight_norm() - - hf_model.conv_pre.weight_g.data = checkpoint["input_conv.weight_g"] - hf_model.conv_pre.weight_v.data = checkpoint["input_conv.weight_v"] - hf_model.conv_pre.bias.data = checkpoint["input_conv.bias"] - - for i in range(len(config.upsample_rates)): - hf_model.upsampler[i].weight_g.data = checkpoint[f"upsamples.{i}.1.weight_g"] - hf_model.upsampler[i].weight_v.data = checkpoint[f"upsamples.{i}.1.weight_v"] - hf_model.upsampler[i].bias.data = checkpoint[f"upsamples.{i}.1.bias"] - - for i in range(len(config.upsample_rates) * len(config.resblock_kernel_sizes)): - for j in range(len(config.resblock_dilation_sizes)): - hf_model.resblocks[i].convs1[j].weight_g.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_g"] - hf_model.resblocks[i].convs1[j].weight_v.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_v"] - hf_model.resblocks[i].convs1[j].bias.data = checkpoint[f"blocks.{i}.convs1.{j}.1.bias"] - - hf_model.resblocks[i].convs2[j].weight_g.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_g"] - hf_model.resblocks[i].convs2[j].weight_v.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_v"] - hf_model.resblocks[i].convs2[j].bias.data = checkpoint[f"blocks.{i}.convs2.{j}.1.bias"] - - hf_model.conv_post.weight_g.data = checkpoint["output_conv.1.weight_g"] - hf_model.conv_post.weight_v.data = checkpoint["output_conv.1.weight_v"] - hf_model.conv_post.bias.data = checkpoint["output_conv.1.bias"] - - hf_model.remove_weight_norm() - - -def remap_hifigan_yaml_config(yaml_config_path): - with Path(yaml_config_path).open("r", encoding="utf-8") as f: - args = yaml.safe_load(f) - args = argparse.Namespace(**args) - - vocoder_type = args.tts_conf["vocoder_type"] - if vocoder_type != "hifigan_generator": - raise TypeError(f"Vocoder config must be for `hifigan_generator`, but got {vocoder_type}") - - remapped_dict = {} - vocoder_params = args.tts_conf["vocoder_params"] - - # espnet_config_key -> hf_config_key - key_mappings = { - "channels": "upsample_initial_channel", - "in_channels": "model_in_dim", - "resblock_dilations": "resblock_dilation_sizes", - "resblock_kernel_sizes": "resblock_kernel_sizes", - "upsample_kernel_sizes": "upsample_kernel_sizes", - "upsample_scales": "upsample_rates", - } - for espnet_config_key, hf_config_key in key_mappings.items(): - remapped_dict[hf_config_key] = vocoder_params[espnet_config_key] - remapped_dict["sampling_rate"] = args.tts_conf["sampling_rate"] - remapped_dict["normalize_before"] = False - remapped_dict["leaky_relu_slope"] = vocoder_params["nonlinear_activation_params"]["negative_slope"] - - return remapped_dict - - -@torch.no_grad() -def convert_hifigan_checkpoint( - checkpoint_path, - pytorch_dump_folder_path, - yaml_config_path=None, - repo_id=None, -): - if yaml_config_path is not None: - config_kwargs = remap_hifigan_yaml_config(yaml_config_path) - config = FastSpeech2ConformerHifiGanConfig(**config_kwargs) - else: - config = FastSpeech2ConformerHifiGanConfig() - - model = FastSpeech2ConformerHifiGan(config) - - orig_checkpoint = torch.load(checkpoint_path, weights_only=True) - load_weights(orig_checkpoint, model, config) - - model.save_pretrained(pytorch_dump_folder_path) - - if repo_id: - print("Pushing to the hub...") - model.push_to_hub(repo_id) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to 
original checkpoint") - parser.add_argument("--yaml_config_path", default=None, type=str, help="Path to config.yaml of model to convert") - parser.add_argument( - "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub." - ) - - args = parser.parse_args() - convert_hifigan_checkpoint( - args.checkpoint_path, - args.pytorch_dump_folder_path, - args.yaml_config_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py b/src/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py deleted file mode 100644 index 6f840438dcae..000000000000 --- a/src/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py +++ /dev/null @@ -1,102 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert FastSpeech2Conformer checkpoint.""" - -import argparse - -import torch - -from transformers import ( - FastSpeech2ConformerConfig, - FastSpeech2ConformerHifiGan, - FastSpeech2ConformerHifiGanConfig, - FastSpeech2ConformerModel, - FastSpeech2ConformerWithHifiGan, - FastSpeech2ConformerWithHifiGanConfig, - logging, -) - -from .convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch import ( - convert_espnet_state_dict_to_hf, - remap_model_yaml_config, -) -from .convert_hifigan import load_weights, remap_hifigan_yaml_config - - -logging.set_verbosity_info() -logger = logging.get_logger("transformers.models.FastSpeech2Conformer") - - -def convert_FastSpeech2ConformerWithHifiGan_checkpoint( - checkpoint_path, - yaml_config_path, - pytorch_dump_folder_path, - repo_id=None, -): - # Prepare the model - model_params, *_ = remap_model_yaml_config(yaml_config_path) - model_config = FastSpeech2ConformerConfig(**model_params) - - model = FastSpeech2ConformerModel(model_config) - - espnet_checkpoint = torch.load(checkpoint_path, weights_only=True) - hf_compatible_state_dict = convert_espnet_state_dict_to_hf(espnet_checkpoint) - model.load_state_dict(hf_compatible_state_dict) - - # Prepare the vocoder - config_kwargs = remap_hifigan_yaml_config(yaml_config_path) - vocoder_config = FastSpeech2ConformerHifiGanConfig(**config_kwargs) - - vocoder = FastSpeech2ConformerHifiGan(vocoder_config) - load_weights(espnet_checkpoint, vocoder, vocoder_config) - - # Prepare the model + vocoder - config = FastSpeech2ConformerWithHifiGanConfig.from_sub_model_configs(model_config, vocoder_config) - with_hifigan_model = FastSpeech2ConformerWithHifiGan(config) - with_hifigan_model.model = model - with_hifigan_model.vocoder = vocoder - - with_hifigan_model.save_pretrained(pytorch_dump_folder_path) - - if repo_id: - print("Pushing to the hub...") - with_hifigan_model.push_to_hub(repo_id) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - 
-    parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint")
-    parser.add_argument(
-        "--yaml_config_path", required=True, default=None, type=str, help="Path to config.yaml of model to convert"
-    )
-    parser.add_argument(
-        "--pytorch_dump_folder_path",
-        required=True,
-        default=None,
-        type=str,
-        help="Path to the output `FastSpeech2ConformerModel` PyTorch model.",
-    )
-    parser.add_argument(
-        "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
-    )
-
-    args = parser.parse_args()
-
-    convert_FastSpeech2ConformerWithHifiGan_checkpoint(
-        args.checkpoint_path,
-        args.yaml_config_path,
-        args.pytorch_dump_folder_path,
-        args.push_to_hub,
-    )
diff --git a/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py b/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py
index 2b038a93396d..5a2dc39385b3 100644
--- a/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py
+++ b/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py
@@ -21,6 +21,7 @@
 import torch
 from torch import nn
 
+from ...activations import ACT2FN
 from ...modeling_outputs import BaseModelOutput
 from ...modeling_utils import PreTrainedModel
 from ...utils import ModelOutput, auto_docstring, logging
@@ -472,24 +473,37 @@
 class FastSpeech2ConformerConvolutionModule(nn.Module):
-    def __init__(self, config: FastSpeech2ConformerConfig, module_config):
+    def __init__(self, config: FastSpeech2ConformerConfig, module_config=None):
+        """
+        Args:
+            config (FastSpeech2ConformerConfig): Configuration for the model.
+            module_config (dict): Configuration for the module (e.g., encoder or decoder).
+        """
         super().__init__()
-        # kernel_size should be an odd number for 'SAME' padding
         channels = config.hidden_size
-        kernel_size = module_config["kernel_size"]
+        # kernel_size should be an odd number for 'SAME' padding
+        if module_config is None:
+            # e.g. using `ParakeetEncoderConfig` in src/transformers/models/parakeet/configuration_parakeet.py
+            kernel_size = config.conv_kernel_size
+            self.activation = ACT2FN[getattr(config, "hidden_act", "silu")]
+        else:
+            kernel_size = module_config["kernel_size"]
+            self.activation = ACT2FN[module_config.get("activation", "silu")]
+
+        self.padding = (kernel_size - 1) // 2
         self.pointwise_conv1 = nn.Conv1d(channels, 2 * channels, kernel_size=1, stride=1, padding=0, bias=True)
         self.depthwise_conv = nn.Conv1d(
-            channels, channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2, groups=channels, bias=True
+            channels, channels, kernel_size, stride=1, padding=self.padding, groups=channels, bias=True
         )
         self.norm = nn.BatchNorm1d(channels)
         self.pointwise_conv2 = nn.Conv1d(channels, channels, kernel_size=1, stride=1, padding=0, bias=True)
 
-    def forward(self, hidden_states):
+    def forward(self, hidden_states, attention_mask=None):
         """
         Compute convolution module.
 
         Args:
             hidden_states (`torch.Tensor` of shape `(batch, time, channels)`): Input tensor.
+            attention_mask (`torch.Tensor` of shape `(batch, 1, time)`): Attention mask.
 
         Returns:
             `torch.Tensor`: Output tensor of shape `(batch, time, channels)`.
@@ -503,12 +517,15 @@ def forward(self, hidden_states):
         # (batch_size, channel, dim)
         hidden_states = nn.functional.glu(hidden_states, dim=1)
 
+        # Apply padding mask before convolution
+        if attention_mask is not None:
+            all_masked_rows = torch.all(~attention_mask, dim=-1)
+            hidden_states = hidden_states.masked_fill(all_masked_rows, 0.0)
+
         # 1D Depthwise Conv
         hidden_states = self.depthwise_conv(hidden_states)
         hidden_states = self.norm(hidden_states)
-
-        hidden_states = hidden_states * torch.sigmoid(hidden_states)
-
+        hidden_states = self.activation(hidden_states)
         hidden_states = self.pointwise_conv2(hidden_states)
 
         return hidden_states.transpose(1, 2)
diff --git a/src/transformers/models/flava/configuration_flava.py b/src/transformers/models/flava/configuration_flava.py
index c3ecf68a8982..b7bcb920e47a 100644
--- a/src/transformers/models/flava/configuration_flava.py
+++ b/src/transformers/models/flava/configuration_flava.py
@@ -516,7 +516,7 @@ def __init__(
 
         # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different.
         for key, value in _text_config_dict.items():
-            if key in text_config and value != text_config[key] and key not in ["transformers_version"]:
+            if key in text_config and value != text_config[key] and key != "transformers_version":
                 # If specified in `text_config_dict`
                 if key in text_config_dict:
                     message = (
@@ -548,7 +548,7 @@ def __init__(
 
         # Give a warning if the values exist in both `_image_config_dict` and `image_config` but being different.
         for key, value in _image_config_dict.items():
-            if key in image_config and value != image_config[key] and key not in ["transformers_version"]:
+            if key in image_config and value != image_config[key] and key != "transformers_version":
                 # If specified in `image_config_dict`
                 if key in image_config_dict:
                     message = (
@@ -576,11 +576,7 @@ def __init__(
         # Give a warning if the values exist in both `_multimodal_config_dict` and `multimodal_config` but being
        # different.
         for key, value in _multimodal_config_dict.items():
-            if (
-                key in multimodal_config
-                and value != multimodal_config[key]
-                and key not in ["transformers_version"]
-            ):
+            if key in multimodal_config and value != multimodal_config[key] and key != "transformers_version":
                 # If specified in `multimodal_config_dict`
                 if key in multimodal_config_dict:
                     message = (
@@ -611,7 +607,7 @@ def __init__(
             if (
                 key in image_codebook_config
                 and value != image_codebook_config[key]
-                and key not in ["transformers_version"]
+                and key != "transformers_version"
             ):
                 # If specified in `image_codebook_config_dict`
                 if key in image_codebook_config_dict:
diff --git a/src/transformers/models/flava/convert_dalle_to_flava_codebook.py b/src/transformers/models/flava/convert_dalle_to_flava_codebook.py
deleted file mode 100644
index 6408d0e1df04..000000000000
--- a/src/transformers/models/flava/convert_dalle_to_flava_codebook.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# coding=utf-8
-# Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os - -import torch - -from transformers import FlavaImageCodebook, FlavaImageCodebookConfig - - -def rreplace(s, old, new, occurrence): - li = s.rsplit(old, occurrence) - return new.join(li) - - -def count_parameters(state_dict): - # encoder.embeddings are double copied in original FLAVA - return sum(param.float().sum() if "encoder.embeddings" not in key else 0 for key, param in state_dict.items()) - - -def upgrade_state_dict(state_dict): - upgrade = {} - - group_keys = ["group_1", "group_2", "group_3", "group_4"] - for key, value in state_dict.items(): - for group_key in group_keys: - if group_key in key: - key = key.replace(f"{group_key}.", f"{group_key}.group.") - - if "res_path" in key: - key = key.replace("res_path.", "res_path.path.") - - if key.endswith(".w"): - key = rreplace(key, ".w", ".weight", 1) - if key.endswith(".b"): - key = rreplace(key, ".b", ".bias", 1) - - upgrade[key] = value.float() - - return upgrade - - -@torch.no_grad() -def convert_dalle_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None, save_checkpoint=True): - """ - Copy/paste/tweak model's weights to transformers design. - """ - from dall_e import Encoder - - encoder = Encoder() - if os.path.exists(checkpoint_path): - ckpt = torch.load(checkpoint_path, weights_only=True) - else: - ckpt = torch.hub.load_state_dict_from_url(checkpoint_path) - - if isinstance(ckpt, Encoder): - ckpt = ckpt.state_dict() - encoder.load_state_dict(ckpt) - - if config_path is not None: - config = FlavaImageCodebookConfig.from_pretrained(config_path) - else: - config = FlavaImageCodebookConfig() - - hf_model = FlavaImageCodebook(config).eval() - state_dict = encoder.state_dict() - - hf_state_dict = upgrade_state_dict(state_dict) - hf_model.load_state_dict(hf_state_dict) - hf_state_dict = hf_model.state_dict() - hf_count = count_parameters(hf_state_dict) - state_dict_count = count_parameters(state_dict) - - assert torch.allclose(hf_count, state_dict_count, atol=1e-3) - - if save_checkpoint: - hf_model.save_pretrained(pytorch_dump_folder_path) - else: - return hf_state_dict - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to flava checkpoint") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - args = parser.parse_args() - - convert_dalle_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path) diff --git a/src/transformers/models/flava/convert_flava_original_pytorch_to_hf.py b/src/transformers/models/flava/convert_flava_original_pytorch_to_hf.py deleted file mode 100644 index 8b6e536a3ab5..000000000000 --- a/src/transformers/models/flava/convert_flava_original_pytorch_to_hf.py +++ /dev/null @@ -1,99 +0,0 @@ -# coding=utf-8 -# Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os - -import torch - -from transformers import FlavaConfig, FlavaForPreTraining -from transformers.models.flava.convert_dalle_to_flava_codebook import convert_dalle_checkpoint - - -def count_parameters(state_dict): - # encoder.embeddings are double copied in original FLAVA - return sum(param.float().sum() if "encoder.embeddings" not in key else 0 for key, param in state_dict.items()) - - -def upgrade_state_dict(state_dict, codebook_state_dict): - upgrade = {} - - for key, value in state_dict.items(): - if "text_encoder.embeddings" in key or "image_encoder.embeddings" in key: - continue - - key = key.replace("heads.cmd.mim_head.cls.predictions", "mmm_image_head") - key = key.replace("heads.cmd.mlm_head.cls.predictions", "mmm_text_head") - key = key.replace("heads.cmd.itm_head.cls", "itm_head") - key = key.replace("heads.cmd.itm_head.pooler", "itm_head.pooler") - key = key.replace("heads.cmd.clip_head.logit_scale", "flava.logit_scale") - key = key.replace("heads.fairseq_mlm.cls.predictions", "mlm_head") - key = key.replace("heads.imagenet.mim_head.cls.predictions", "mim_head") - key = key.replace("mm_text_projection", "flava.text_to_mm_projection") - key = key.replace("mm_image_projection", "flava.image_to_mm_projection") - key = key.replace("image_encoder.module", "flava.image_model") - key = key.replace("text_encoder.module", "flava.text_model") - key = key.replace("mm_encoder.module.encoder.cls_token", "flava.multimodal_model.cls_token") - key = key.replace("mm_encoder.module", "flava.multimodal_model") - key = key.replace("text_projection", "flava.text_projection") - key = key.replace("image_projection", "flava.image_projection") - - upgrade[key] = value.float() - - for key, value in codebook_state_dict.items(): - upgrade[f"image_codebook.{key}"] = value - - return upgrade - - -@torch.no_grad() -def convert_flava_checkpoint(checkpoint_path, codebook_path, pytorch_dump_folder_path, config_path=None): - """ - Copy/paste/tweak model's weights to transformers design. 
- """ - if config_path is not None: - config = FlavaConfig.from_pretrained(config_path) - else: - config = FlavaConfig() - - hf_model = FlavaForPreTraining(config).eval() - - codebook_state_dict = convert_dalle_checkpoint(codebook_path, None, save_checkpoint=False) - - if os.path.exists(checkpoint_path): - state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True) - else: - state_dict = torch.hub.load_state_dict_from_url(checkpoint_path, map_location="cpu") - - hf_state_dict = upgrade_state_dict(state_dict, codebook_state_dict) - hf_model.load_state_dict(hf_state_dict) - hf_state_dict = hf_model.state_dict() - hf_count = count_parameters(hf_state_dict) - state_dict_count = count_parameters(state_dict) + count_parameters(codebook_state_dict) - - assert torch.allclose(hf_count, state_dict_count, atol=1e-3) - - hf_model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to flava checkpoint") - parser.add_argument("--codebook_path", default=None, type=str, help="Path to flava codebook checkpoint") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - args = parser.parse_args() - - convert_flava_checkpoint(args.checkpoint_path, args.codebook_path, args.pytorch_dump_folder_path, args.config_path) diff --git a/src/transformers/models/flava/image_processing_flava_fast.py b/src/transformers/models/flava/image_processing_flava_fast.py index 97409ddd57ed..732d25e71f69 100644 --- a/src/transformers/models/flava/image_processing_flava_fast.py +++ b/src/transformers/models/flava/image_processing_flava_fast.py @@ -21,6 +21,7 @@ from typing import Any, Optional, Union import torch +from torchvision.transforms.v2 import functional as F from ...image_processing_utils_fast import ( BaseImageProcessorFast, @@ -34,7 +35,6 @@ from ...utils import ( TensorType, auto_docstring, - is_torchvision_v2_available, ) from .image_processing_flava import ( FLAVA_CODEBOOK_MEAN, @@ -45,12 +45,6 @@ ) -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - - class FlavaMaskingGenerator: def __init__( self, diff --git a/src/transformers/models/florence2/convert_florence2_original_pytorch_to_hf.py b/src/transformers/models/florence2/convert_florence2_original_pytorch_to_hf.py deleted file mode 100644 index de77d4e4c72a..000000000000 --- a/src/transformers/models/florence2/convert_florence2_original_pytorch_to_hf.py +++ /dev/null @@ -1,530 +0,0 @@ -# coding=utf-8 -# Copyright 2025 Microsoft and the HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import argparse -from collections import OrderedDict - -import torch - -from transformers import ( - AddedToken, - AutoConfig, - AutoModelForCausalLM, - AutoProcessor, - Florence2Config, - Florence2ForConditionalGeneration, - Florence2Processor, - Florence2VisionConfig, -) - - -def convert_config(original_config: dict): - new_config = Florence2VisionConfig( - embed_dim=original_config["dim_embed"], - max_temporal_embeddings=original_config["visual_temporal_embedding"]["max_temporal_embeddings"], - max_pos_embeddings=original_config["image_pos_embed"]["max_pos_embeddings"], - **original_config, - ) - - return new_config - - -def vision_conv_embeddings(idx): - """ - The function helps in renaming vision convolution embedding layer weights. - - Args: - idx: stage number in original model - """ - convs = [] - convs.append( - ( - f"vision_tower.convs.{idx}.proj.weight", - f"model.vision_tower.convs.{idx}.conv.weight", - ) - ) - convs.append( - ( - f"vision_tower.convs.{idx}.proj.bias", - f"model.vision_tower.convs.{idx}.conv.bias", - ) - ) - convs.append( - ( - f"vision_tower.convs.{idx}.norm.weight", - f"model.vision_tower.convs.{idx}.norm.weight", - ) - ) - convs.append( - ( - f"vision_tower.convs.{idx}.norm.bias", - f"model.vision_tower.convs.{idx}.norm.bias", - ) - ) - return convs - - -def vision_spatial_block(stage_idx, block_idx): - """ - The function helps in renaming vision spatial block layers weights. - - Args: - idx: stage number in original model - cnt: count of blocks in each stage - """ - spatial_block = [] - spatial_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.conv1.fn.dw.weight", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.conv1.weight", - ) - ) - spatial_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.conv1.fn.dw.bias", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.conv1.bias", - ) - ) - spatial_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.window_attn.norm.weight", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.norm1.weight", - ) - ) - spatial_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.window_attn.norm.bias", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.norm1.bias", - ) - ) - spatial_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.window_attn.fn.qkv.weight", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.window_attn.qkv.weight", - ) - ) - spatial_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.window_attn.fn.qkv.bias", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.window_attn.qkv.bias", - ) - ) - spatial_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.window_attn.fn.proj.weight", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.window_attn.proj.weight", - ) - ) - spatial_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.window_attn.fn.proj.bias", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.window_attn.proj.bias", - ) - ) - spatial_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.conv2.fn.dw.weight", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.conv2.weight", - ) - ) - spatial_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.conv2.fn.dw.bias", - 
f"model.vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.conv2.bias", - ) - ) - spatial_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.ffn.norm.weight", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.norm2.weight", - ) - ) - spatial_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.ffn.norm.bias", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.norm2.bias", - ) - ) - spatial_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.ffn.fn.net.fc1.weight", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.ffn.fc1.weight", - ) - ) - spatial_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.ffn.fn.net.fc1.bias", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.ffn.fc1.bias", - ) - ) - spatial_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.ffn.fn.net.fc2.weight", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.ffn.fc2.weight", - ) - ) - spatial_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.ffn.fn.net.fc2.bias", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.spatial_block.ffn.fc2.bias", - ) - ) - return spatial_block - - -def vision_channel_block(stage_idx, block_idx): - """ - The function helps in renaming vision channel block layers weights. - - Args: - idx: stage number in original model - cnt: count of blocks in each stage - """ - channel_block = [] - channel_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.conv1.fn.dw.weight", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.conv1.weight", - ) - ) - channel_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.conv1.fn.dw.bias", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.conv1.bias", - ) - ) - channel_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.channel_attn.norm.weight", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.norm1.weight", - ) - ) - channel_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.channel_attn.norm.bias", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.norm1.bias", - ) - ) - channel_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.channel_attn.fn.qkv.weight", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.channel_attn.qkv.weight", - ) - ) - channel_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.channel_attn.fn.qkv.bias", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.channel_attn.qkv.bias", - ) - ) - channel_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.channel_attn.fn.proj.weight", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.channel_attn.proj.weight", - ) - ) - channel_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.channel_attn.fn.proj.bias", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.channel_attn.proj.bias", - ) - ) - channel_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.conv2.fn.dw.weight", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.conv2.weight", - ) - ) - channel_block.append( - ( - 
f"vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.conv2.fn.dw.bias", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.conv2.bias", - ) - ) - channel_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.ffn.norm.weight", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.norm2.weight", - ) - ) - channel_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.ffn.norm.bias", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.norm2.bias", - ) - ) - channel_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.ffn.fn.net.fc1.weight", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.ffn.fc1.weight", - ) - ) - channel_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.ffn.fn.net.fc1.bias", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.ffn.fc1.bias", - ) - ) - channel_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.ffn.fn.net.fc2.weight", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.ffn.fc2.weight", - ) - ) - channel_block.append( - ( - f"vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.ffn.fn.net.fc2.bias", - f"model.vision_tower.blocks.{stage_idx}.{block_idx}.channel_block.ffn.fc2.bias", - ) - ) - return channel_block - - -def multi_modal_projector(): - """ - Function helps in renaming final classification layer - """ - projector = [] - projector.append(("image_projection", "model.multi_modal_projector.image_projection.weight")) - projector.append(("image_proj_norm.weight", "model.multi_modal_projector.image_proj_norm.weight")) - projector.append(("image_proj_norm.bias", "model.multi_modal_projector.image_proj_norm.bias")) - projector.append( - ( - "image_pos_embed.row_embeddings.weight", - "model.multi_modal_projector.image_position_embed.row_embeddings.weight", - ) - ) - projector.append( - ( - "image_pos_embed.column_embeddings.weight", - "model.multi_modal_projector.image_position_embed.column_embeddings.weight", - ) - ) - projector.append( - ( - "visual_temporal_embed.pos_idx_to_embed", - "model.multi_modal_projector.visual_temporal_embed.pos_idx_to_embed", - ) - ) - return projector - - -def language_model(state_dict): - language_state_dict_keys = [] - for key in state_dict.keys(): - if key.startswith("language_model.model") and "lm_head" not in key: - new_key = key.replace("language_model.model.", "model.language_model.") - language_state_dict_keys.append((key, new_key)) - language_state_dict_keys.append(("language_model.lm_head.weight", "lm_head.weight")) - return language_state_dict_keys - - -def convert_florence2_checkpoint(hf_model_id, pytorch_dump_folder, output_hub_path): - """ - Function to convert the microsoft florence2 checkpoint to huggingface checkpoint - """ - - hf_config = AutoConfig.from_pretrained(hf_model_id, trust_remote_code=True) - hf_model = AutoModelForCausalLM.from_pretrained( - hf_model_id, trust_remote_code=True, dtype=torch.float16, attn_implementation="eager" - ) - hf_processor = AutoProcessor.from_pretrained(hf_model_id, trust_remote_code=True) - huggingface_weights = OrderedDict() - list_of_state_dict = [] - - image_processor = hf_processor.image_processor - - tokenizer = hf_processor.tokenizer - tokenizer.image_token = "" - tokenizer.add_tokens(AddedToken(tokenizer.image_token, special=True, normalized=False), special_tokens=True) - tokenizer.image_token_id = 
tokenizer.encode(tokenizer.image_token, add_special_tokens=False)[0] - tokenizer.extra_special_tokens = {"image_token": ""} - - post_processor_config = { - "ocr": { - "pattern": r"(.+?)", - "area_threshold": 0.0, - }, - "phrase_grounding": { - "banned_grounding_tokens": [ - "it", - "I", - "me", - "mine", - "you", - "your", - "yours", - "he", - "him", - "his", - "she", - "her", - "hers", - "they", - "them", - "their", - "theirs", - "one", - "oneself", - "we", - "us", - "our", - "ours", - "you", - "your", - "yours", - "they", - "them", - "their", - "theirs", - "mine", - "yours", - "his", - "hers", - "its", - "ours", - "yours", - "theirs", - "myself", - "yourself", - "himself", - "herself", - "itself", - "ourselves", - "yourselves", - "themselves", - "this", - "that", - "these", - "those", - "who", - "whom", - "whose", - "which", - "what", - "who", - "whom", - "whose", - "which", - "that", - "all", - "another", - "any", - "anybody", - "anyone", - "anything", - "each", - "everybody", - "everyone", - "everything", - "few", - "many", - "nobody", - "none", - "one", - "several", - "some", - "somebody", - "someone", - "something", - "each other", - "one another", - "myself", - "yourself", - "himself", - "herself", - "itself", - "ourselves", - "yourselves", - "themselves", - "the image", - "image", - "images", - "the", - "a", - "an", - "a group", - "other objects", - "lots", - "a set", - ], - }, - "pure_text": {}, - "description_with_bboxes": {}, - "description_with_polygons": {}, - "polygons": {}, - "bboxes": {}, - "description_with_bboxes_or_polygons": {}, - } - processor = Florence2Processor( - image_processor=image_processor, tokenizer=tokenizer, post_processor_config=post_processor_config - ) - - vision_config = convert_config(hf_config.vision_config.__dict__) - text_config = hf_config.text_config.__dict__ - config = Florence2Config( - text_config=text_config, - vision_config=vision_config, - image_token_id=tokenizer.image_token_id, - dtype=torch.float16, - ) - - for stage_idx in range(len(config.vision_config.embed_dim)): - list_of_state_dict = list_of_state_dict + vision_conv_embeddings(stage_idx) - for block_idx in range(config.vision_config.depths[stage_idx]): - list_of_state_dict = list_of_state_dict + vision_spatial_block(stage_idx, block_idx) - list_of_state_dict = list_of_state_dict + vision_channel_block(stage_idx, block_idx) - - original_weights = hf_model.state_dict() - list_of_state_dict = list_of_state_dict + multi_modal_projector() - list_of_state_dict = list_of_state_dict + language_model(original_weights) - for i in range(len(list_of_state_dict)): - if list_of_state_dict[i][0] == "image_projection": - original_weights[list_of_state_dict[i][0]].transpose_(1, 0) - huggingface_weights[list_of_state_dict[i][1]] = original_weights[list_of_state_dict[i][0]] - - model = Florence2ForConditionalGeneration(config) - model.load_state_dict(huggingface_weights, strict=True, assign=True) - model.tie_weights() - # We add an image token so we resize the model and pad to 64 for performance reasons - pad_shape = 64 - model.resize_token_embeddings(len(tokenizer), pad_shape) - - if pytorch_dump_folder: - model.save_pretrained(pytorch_dump_folder) - processor.save_pretrained(pytorch_dump_folder) - - if output_hub_path: - model.push_to_hub(output_hub_path) - processor.push_to_hub(output_hub_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--hf_model_id", - default="microsoft/Florence-2-base", - type=str, - help="Name of the florence2 model you'd 
like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--output_hub_path", - help="Location on the hub of the converted model", - ) - - args = parser.parse_args() - convert_florence2_checkpoint(args.hf_model_id, args.pytorch_dump_folder_path, args.output_hub_path) diff --git a/src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py b/src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py deleted file mode 100644 index 71660354db14..000000000000 --- a/src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py +++ /dev/null @@ -1,156 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert FNet checkpoint.""" - -import argparse - -import torch -from flax.training.checkpoints import restore_checkpoint - -from transformers import FNetConfig, FNetForPreTraining -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_flax_checkpoint_to_pytorch(flax_checkpoint_path, fnet_config_file, save_path): - # Initialise PyTorch model - config = FNetConfig.from_json_file(fnet_config_file) - print(f"Building PyTorch model from configuration: {config}") - fnet_pretraining_model = FNetForPreTraining(config) - - checkpoint_dict = restore_checkpoint(flax_checkpoint_path, None) - pretrained_model_params = checkpoint_dict["target"] - - # Embeddings - # Position IDs - state_dict = fnet_pretraining_model.state_dict() - - position_ids = state_dict["fnet.embeddings.position_ids"] - new_state_dict = {"fnet.embeddings.position_ids": position_ids} - # Embedding Layers - new_state_dict["fnet.embeddings.word_embeddings.weight"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["word"]["embedding"] - ) - new_state_dict["fnet.embeddings.position_embeddings.weight"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["position"]["embedding"][0] - ) - new_state_dict["fnet.embeddings.token_type_embeddings.weight"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["type"]["embedding"] - ) - new_state_dict["fnet.embeddings.projection.weight"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["hidden_mapping_in"]["kernel"] - ).T - new_state_dict["fnet.embeddings.projection.bias"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["hidden_mapping_in"]["bias"] - ) - new_state_dict["fnet.embeddings.LayerNorm.weight"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["layer_norm"]["scale"] - ) - new_state_dict["fnet.embeddings.LayerNorm.bias"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["layer_norm"]["bias"] - ) - - # Encoder Layers - for layer in range(config.num_hidden_layers): - new_state_dict[f"fnet.encoder.layer.{layer}.fourier.output.LayerNorm.weight"] = torch.tensor( - 
pretrained_model_params["encoder"][f"encoder_{layer}"]["mixing_layer_norm"]["scale"] - ) - new_state_dict[f"fnet.encoder.layer.{layer}.fourier.output.LayerNorm.bias"] = torch.tensor( - pretrained_model_params["encoder"][f"encoder_{layer}"]["mixing_layer_norm"]["bias"] - ) - - new_state_dict[f"fnet.encoder.layer.{layer}.intermediate.dense.weight"] = torch.tensor( - pretrained_model_params["encoder"][f"feed_forward_{layer}"]["intermediate"]["kernel"] - ).T - new_state_dict[f"fnet.encoder.layer.{layer}.intermediate.dense.bias"] = torch.tensor( - pretrained_model_params["encoder"][f"feed_forward_{layer}"]["intermediate"]["bias"] - ) - - new_state_dict[f"fnet.encoder.layer.{layer}.output.dense.weight"] = torch.tensor( - pretrained_model_params["encoder"][f"feed_forward_{layer}"]["output"]["kernel"] - ).T - new_state_dict[f"fnet.encoder.layer.{layer}.output.dense.bias"] = torch.tensor( - pretrained_model_params["encoder"][f"feed_forward_{layer}"]["output"]["bias"] - ) - - new_state_dict[f"fnet.encoder.layer.{layer}.output.LayerNorm.weight"] = torch.tensor( - pretrained_model_params["encoder"][f"encoder_{layer}"]["output_layer_norm"]["scale"] - ) - new_state_dict[f"fnet.encoder.layer.{layer}.output.LayerNorm.bias"] = torch.tensor( - pretrained_model_params["encoder"][f"encoder_{layer}"]["output_layer_norm"]["bias"] - ) - - # Pooler Layers - new_state_dict["fnet.pooler.dense.weight"] = torch.tensor(pretrained_model_params["encoder"]["pooler"]["kernel"]).T - new_state_dict["fnet.pooler.dense.bias"] = torch.tensor(pretrained_model_params["encoder"]["pooler"]["bias"]) - - # Masked LM Layers - new_state_dict["cls.predictions.transform.dense.weight"] = torch.tensor( - pretrained_model_params["predictions_dense"]["kernel"] - ).T - new_state_dict["cls.predictions.transform.dense.bias"] = torch.tensor( - pretrained_model_params["predictions_dense"]["bias"] - ) - new_state_dict["cls.predictions.transform.LayerNorm.weight"] = torch.tensor( - pretrained_model_params["predictions_layer_norm"]["scale"] - ) - new_state_dict["cls.predictions.transform.LayerNorm.bias"] = torch.tensor( - pretrained_model_params["predictions_layer_norm"]["bias"] - ) - new_state_dict["cls.predictions.decoder.weight"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["word"]["embedding"] - ) - new_state_dict["cls.predictions.decoder.bias"] = torch.tensor( - pretrained_model_params["predictions_output"]["output_bias"] - ) - new_state_dict["cls.predictions.bias"] = torch.tensor(pretrained_model_params["predictions_output"]["output_bias"]) - - # Seq Relationship Layers - new_state_dict["cls.seq_relationship.weight"] = torch.tensor( - pretrained_model_params["classification"]["output_kernel"] - ) - new_state_dict["cls.seq_relationship.bias"] = torch.tensor( - pretrained_model_params["classification"]["output_bias"] - ) - - # Load State Dict - fnet_pretraining_model.load_state_dict(new_state_dict) - - # Save PreTrained - print(f"Saving pretrained model to {save_path}") - fnet_pretraining_model.save_pretrained(save_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--flax_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--fnet_config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained FNet model. \n" - "This specifies the model architecture." 
- ), - ) - parser.add_argument("--save_path", default=None, type=str, required=True, help="Path to the output model.") - args = parser.parse_args() - convert_flax_checkpoint_to_pytorch(args.flax_checkpoint_path, args.fnet_config_file, args.save_path) diff --git a/src/transformers/models/focalnet/convert_focalnet_to_hf_format.py b/src/transformers/models/focalnet/convert_focalnet_to_hf_format.py deleted file mode 100644 index ead9950e2a61..000000000000 --- a/src/transformers/models/focalnet/convert_focalnet_to_hf_format.py +++ /dev/null @@ -1,237 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert FocalNet checkpoints from the original repository. URL: https://github.com/microsoft/FocalNet/tree/main""" - -import argparse -import json - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from torchvision import transforms - -from transformers import BitImageProcessor, FocalNetConfig, FocalNetForImageClassification -from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling - - -def get_focalnet_config(model_name): - depths = [2, 2, 6, 2] if "tiny" in model_name else [2, 2, 18, 2] - use_conv_embed = bool("large" in model_name or "huge" in model_name) - use_post_layernorm = bool("large" in model_name or "huge" in model_name) - use_layerscale = bool("large" in model_name or "huge" in model_name) - - if "large" in model_name or "xlarge" in model_name or "huge" in model_name: - if "fl3" in model_name: - focal_levels = [3, 3, 3, 3] - focal_windows = [5, 5, 5, 5] - elif "fl4" in model_name: - focal_levels = [4, 4, 4, 4] - focal_windows = [3, 3, 3, 3] - - if "tiny" in model_name or "small" in model_name or "base" in model_name: - focal_windows = [3, 3, 3, 3] - if "lrf" in model_name: - focal_levels = [3, 3, 3, 3] - else: - focal_levels = [2, 2, 2, 2] - - if "tiny" in model_name: - embed_dim = 96 - elif "small" in model_name: - embed_dim = 96 - elif "base" in model_name: - embed_dim = 128 - elif "large" in model_name: - embed_dim = 192 - elif "xlarge" in model_name: - embed_dim = 256 - elif "huge" in model_name: - embed_dim = 352 - - # set label information - repo_id = "huggingface/label-files" - if "large" in model_name or "huge" in model_name: - filename = "imagenet-22k-id2label.json" - else: - filename = "imagenet-1k-id2label.json" - - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - label2id = {v: k for k, v in id2label.items()} - - config = FocalNetConfig( - embed_dim=embed_dim, - depths=depths, - focal_levels=focal_levels, - focal_windows=focal_windows, - use_conv_embed=use_conv_embed, - id2label=id2label, - label2id=label2id, - use_post_layernorm=use_post_layernorm, - use_layerscale=use_layerscale, - ) - - return config - - -def rename_key(name): - if "patch_embed.proj" in name: - name = name.replace("patch_embed.proj", 
"embeddings.patch_embeddings.projection") - if "patch_embed.norm" in name: - name = name.replace("patch_embed.norm", "embeddings.norm") - if "layers" in name: - name = "encoder." + name - if "encoder.layers" in name: - name = name.replace("encoder.layers", "encoder.stages") - if "downsample.proj" in name: - name = name.replace("downsample.proj", "downsample.projection") - if "blocks" in name: - name = name.replace("blocks", "layers") - if "modulation.f.weight" in name or "modulation.f.bias" in name: - name = name.replace("modulation.f", "modulation.projection_in") - if "modulation.h.weight" in name or "modulation.h.bias" in name: - name = name.replace("modulation.h", "modulation.projection_context") - if "modulation.proj.weight" in name or "modulation.proj.bias" in name: - name = name.replace("modulation.proj", "modulation.projection_out") - - if name == "norm.weight": - name = "layernorm.weight" - if name == "norm.bias": - name = "layernorm.bias" - - if "head" in name: - name = name.replace("head", "classifier") - else: - name = "focalnet." + name - - return name - - -def convert_focalnet_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): - # fmt: off - model_name_to_url = { - "focalnet-tiny": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_tiny_srf.pth", - "focalnet-tiny-lrf": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_tiny_lrf.pth", - "focalnet-small": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_small_srf.pth", - "focalnet-small-lrf": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_small_lrf.pth", - "focalnet-base": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_base_srf.pth", - "focalnet-base-lrf": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_base_lrf.pth", - "focalnet-large-lrf-fl3": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_large_lrf_384.pth", - "focalnet-large-lrf-fl4": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_large_lrf_384_fl4.pth", - "focalnet-xlarge-lrf-fl3": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_xlarge_lrf_384.pth", - "focalnet-xlarge-lrf-fl4": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_xlarge_lrf_384_fl4.pth", - } - # fmt: on - - checkpoint_url = model_name_to_url[model_name] - print("Checkpoint URL: ", checkpoint_url) - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"] - - # rename keys - for key in state_dict.copy(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - - config = get_focalnet_config(model_name) - model = FocalNetForImageClassification(config) - model.eval() - - # load state dict - model.load_state_dict(state_dict) - - # verify conversion - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - - processor = BitImageProcessor( - do_resize=True, - size={"shortest_edge": 256}, - resample=PILImageResampling.BILINEAR, - do_center_crop=True, - crop_size=224, - do_normalize=True, - image_mean=IMAGENET_DEFAULT_MEAN, - image_std=IMAGENET_DEFAULT_STD, - ) - image = Image.open(requests.get(url, stream=True).raw) - inputs = processor(images=image, return_tensors="pt") - - image_transforms = transforms.Compose( - [ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - 
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ] - ) - - original_pixel_values = image_transforms(image).unsqueeze(0) - - # verify pixel_values - assert torch.allclose(inputs.pixel_values, original_pixel_values, atol=1e-4) - - outputs = model(**inputs) - - predicted_class_idx = outputs.logits.argmax(-1).item() - print("Predicted class:", model.config.id2label[predicted_class_idx]) - - print("First values of logits:", outputs.logits[0, :3]) - - if model_name == "focalnet-tiny": - expected_slice = torch.tensor([0.2166, -0.4368, 0.2191]) - elif model_name == "focalnet-tiny-lrf": - expected_slice = torch.tensor([1.1669, 0.0125, -0.1695]) - elif model_name == "focalnet-small": - expected_slice = torch.tensor([0.4917, -0.0430, 0.1341]) - elif model_name == "focalnet-small-lrf": - expected_slice = torch.tensor([-0.2588, -0.5342, -0.2331]) - elif model_name == "focalnet-base": - expected_slice = torch.tensor([-0.1655, -0.4090, -0.1730]) - elif model_name == "focalnet-base-lrf": - expected_slice = torch.tensor([0.5306, -0.0483, -0.3928]) - assert torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - print(f"Saving model and processor of {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print(f"Pushing model and processor of {model_name} to the hub...") - model.push_to_hub(f"{model_name}") - processor.push_to_hub(f"{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="focalnet-tiny", - type=str, - help="Name of the FocalNet model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model and processor to the hub.", - ) - - args = parser.parse_args() - convert_focalnet_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100755 index 07a83a1cb0a9..000000000000 --- a/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,280 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Note: if you intend to run this script make sure you look under scripts/fsmt/ -# to locate the appropriate script to do the work correctly. 
There is a set of scripts to:
-# - download and prepare data and run the conversion script
-# - perform eval to get the best hparam into the config
-# - generate model_cards - useful if you have multiple models from the same paper
-
-import argparse
-import json
-import os
-import re
-from collections import OrderedDict
-from os.path import basename, dirname
-
-import fairseq
-import torch
-from fairseq import hub_utils
-from fairseq.data.dictionary import Dictionary
-
-from transformers import FSMTConfig, FSMTForConditionalGeneration
-from transformers.models.fsmt.tokenization_fsmt import VOCAB_FILES_NAMES
-from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
-from transformers.utils import WEIGHTS_NAME, logging
-
-
-logging.set_verbosity_warning()
-
-json_indent = 2
-
-# based on the results of a search on a range of `num_beams`, `length_penalty` and `early_stopping`
-# values against wmt19 test data to obtain the best BLEU scores, we will use the following defaults:
-#
-# * `num_beams`: 5 (higher scores better, but requires more memory/is slower, can be adjusted by users)
-# * `early_stopping`: `False` consistently scored better
-# * `length_penalty` varied, so will assign the best one depending on the model
-best_score_hparams = {
-    # fairseq:
-    "wmt19-ru-en": {"length_penalty": 1.1},
-    "wmt19-en-ru": {"length_penalty": 1.15},
-    "wmt19-en-de": {"length_penalty": 1.0},
-    "wmt19-de-en": {"length_penalty": 1.1},
-    # allenai:
-    "wmt16-en-de-dist-12-1": {"length_penalty": 0.6},
-    "wmt16-en-de-dist-6-1": {"length_penalty": 0.6},
-    "wmt16-en-de-12-1": {"length_penalty": 0.8},
-    "wmt19-de-en-6-6-base": {"length_penalty": 0.6},
-    "wmt19-de-en-6-6-big": {"length_penalty": 0.6},
-}
-
-# this remaps the different models to their organization names
-org_names = {}
-for m in ["wmt19-ru-en", "wmt19-en-ru", "wmt19-en-de", "wmt19-de-en"]:
-    org_names[m] = "facebook"
-for m in [
-    "wmt16-en-de-dist-12-1",
-    "wmt16-en-de-dist-6-1",
-    "wmt16-en-de-12-1",
-    "wmt19-de-en-6-6-base",
-    "wmt19-de-en-6-6-big",
-]:
-    org_names[m] = "allenai"
-
-
-def rewrite_dict_keys(d):
-    # (1) remove word breaking symbol, (2) add word ending symbol where the word is not broken up,
-    # e.g.: d = {'le@@': 5, 'tt@@': 6, 'er': 7} => {'le': 5, 'tt': 6, 'er</w>': 7}
-    d2 = dict((re.sub(r"@@$", "", k), v) if k.endswith("@@") else (re.sub(r"$", "</w>", k), v) for k, v in d.items())
-    keep_keys = ["<s>", "<pad>", "</s>", "<unk>"]
-    # restore the special tokens
-    for k in keep_keys:
-        del d2[f"{k}</w>"]
-        d2[k] = d[k]  # restore
-    return d2
-
-
-def convert_fsmt_checkpoint_to_pytorch(fsmt_checkpoint_path, pytorch_dump_folder_path):
-    # prep
-    assert os.path.exists(fsmt_checkpoint_path)
-    os.makedirs(pytorch_dump_folder_path, exist_ok=True)
-    print(f"Writing results to {pytorch_dump_folder_path}")
-
-    # handle various types of models
-
-    checkpoint_file = basename(fsmt_checkpoint_path)
-    fsmt_folder_path = dirname(fsmt_checkpoint_path)
-
-    cls = fairseq.model_parallel.models.transformer.ModelParallelTransformerModel
-    models = cls.hub_models()
-    kwargs = {"bpe": "fastbpe", "tokenizer": "moses"}
-    data_name_or_path = "."
-    # note: since the model dump is old, fairseq has upgraded its model some
-    # time later, and it does a whole lot of rewrites and splits on the saved
-    # weights, therefore we can't use torch.load() directly on the model file.
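(A quick aside, not part of the original script: using the `rewrite_dict_keys` helper defined above, a fairseq BPE vocabulary is rewritten by stripping the `@@` continuation marker, appending `</w>` to word-final pieces, and restoring the special tokens unchanged.)

```py
vocab = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3, "le@@": 5, "tt@@": 6, "er": 7}
print(rewrite_dict_keys(vocab))
# {'le': 5, 'tt': 6, 'er</w>': 7, '<s>': 0, '<pad>': 1, '</s>': 2, '<unk>': 3}
```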
- # see: upgrade_state_dict(state_dict) in fairseq_model.py - print(f"using checkpoint {checkpoint_file}") - chkpt = hub_utils.from_pretrained( - fsmt_folder_path, checkpoint_file, data_name_or_path, archive_map=models, **kwargs - ) - - args = vars(chkpt["args"]["model"]) - - src_lang = args["source_lang"] - tgt_lang = args["target_lang"] - - data_root = dirname(pytorch_dump_folder_path) - model_dir = basename(pytorch_dump_folder_path) - - # dicts - src_dict_file = os.path.join(fsmt_folder_path, f"dict.{src_lang}.txt") - tgt_dict_file = os.path.join(fsmt_folder_path, f"dict.{tgt_lang}.txt") - - src_dict = Dictionary.load(src_dict_file) - src_vocab = rewrite_dict_keys(src_dict.indices) - src_vocab_size = len(src_vocab) - src_vocab_file = os.path.join(pytorch_dump_folder_path, "vocab-src.json") - print(f"Generating {src_vocab_file} of {src_vocab_size} of {src_lang} records") - with open(src_vocab_file, "w", encoding="utf-8") as f: - f.write(json.dumps(src_vocab, ensure_ascii=False, indent=json_indent)) - - # detect whether this is a do_lower_case situation, which can be derived by checking whether we - # have at least one uppercase letter in the source vocab - do_lower_case = True - for k in src_vocab: - if not k.islower(): - do_lower_case = False - break - - tgt_dict = Dictionary.load(tgt_dict_file) - tgt_vocab = rewrite_dict_keys(tgt_dict.indices) - tgt_vocab_size = len(tgt_vocab) - tgt_vocab_file = os.path.join(pytorch_dump_folder_path, "vocab-tgt.json") - print(f"Generating {tgt_vocab_file} of {tgt_vocab_size} of {tgt_lang} records") - with open(tgt_vocab_file, "w", encoding="utf-8") as f: - f.write(json.dumps(tgt_vocab, ensure_ascii=False, indent=json_indent)) - - # merges_file (bpecodes) - merges_file = os.path.join(pytorch_dump_folder_path, VOCAB_FILES_NAMES["merges_file"]) - for fn in ["bpecodes", "code"]: # older fairseq called the merges file "code" - fsmt_merges_file = os.path.join(fsmt_folder_path, fn) - if os.path.exists(fsmt_merges_file): - break - with open(fsmt_merges_file, encoding="utf-8") as fin: - merges = fin.read() - merges = re.sub(r" \d+$", "", merges, 0, re.M) # remove frequency number - print(f"Generating {merges_file}") - with open(merges_file, "w", encoding="utf-8") as fout: - fout.write(merges) - - # model config - fsmt_model_config_file = os.path.join(pytorch_dump_folder_path, "config.json") - - # validate bpe/tokenizer config, as currently it's hardcoded to moses+fastbpe - - # may have to modify the tokenizer if a different type is used by a future model - assert args["bpe"] == "fastbpe", f"need to extend tokenizer to support bpe={args['bpe']}" - assert args["tokenizer"] == "moses", f"need to extend tokenizer to support bpe={args['tokenizer']}" - - model_conf = { - "architectures": ["FSMTForConditionalGeneration"], - "model_type": "fsmt", - "activation_dropout": args["activation_dropout"], - "activation_function": "relu", - "attention_dropout": args["attention_dropout"], - "d_model": args["decoder_embed_dim"], - "dropout": args["dropout"], - "init_std": 0.02, - "max_position_embeddings": args["max_source_positions"], - "num_hidden_layers": args["encoder_layers"], - "src_vocab_size": src_vocab_size, - "tgt_vocab_size": tgt_vocab_size, - "langs": [src_lang, tgt_lang], - "encoder_attention_heads": args["encoder_attention_heads"], - "encoder_ffn_dim": args["encoder_ffn_embed_dim"], - "encoder_layerdrop": args["encoder_layerdrop"], - "encoder_layers": args["encoder_layers"], - "decoder_attention_heads": args["decoder_attention_heads"], - "decoder_ffn_dim": 
args["decoder_ffn_embed_dim"], - "decoder_layerdrop": args["decoder_layerdrop"], - "decoder_layers": args["decoder_layers"], - "bos_token_id": 0, - "pad_token_id": 1, - "eos_token_id": 2, - "is_encoder_decoder": True, - "scale_embedding": not args["no_scale_embedding"], - "tie_word_embeddings": args["share_all_embeddings"], - } - - # good hparam defaults to start with - model_conf["num_beams"] = 5 - model_conf["early_stopping"] = False - if model_dir in best_score_hparams and "length_penalty" in best_score_hparams[model_dir]: - model_conf["length_penalty"] = best_score_hparams[model_dir]["length_penalty"] - else: - model_conf["length_penalty"] = 1.0 - - print(f"Generating {fsmt_model_config_file}") - with open(fsmt_model_config_file, "w", encoding="utf-8") as f: - f.write(json.dumps(model_conf, ensure_ascii=False, indent=json_indent)) - - # tokenizer config - fsmt_tokenizer_config_file = os.path.join(pytorch_dump_folder_path, TOKENIZER_CONFIG_FILE) - - tokenizer_conf = { - "langs": [src_lang, tgt_lang], - "model_max_length": 1024, - "do_lower_case": do_lower_case, - } - - print(f"Generating {fsmt_tokenizer_config_file}") - with open(fsmt_tokenizer_config_file, "w", encoding="utf-8") as f: - f.write(json.dumps(tokenizer_conf, ensure_ascii=False, indent=json_indent)) - - # model - model = chkpt["models"][0] - model_state_dict = model.state_dict() - - # rename keys to start with 'model.' - model_state_dict = OrderedDict(("model." + k, v) for k, v in model_state_dict.items()) - - # remove unneeded keys - ignore_keys = [ - "model.model", - "model.encoder.version", - "model.decoder.version", - "model.encoder_embed_tokens.weight", - "model.decoder_embed_tokens.weight", - "model.encoder.embed_positions._float_tensor", - "model.decoder.embed_positions._float_tensor", - ] - for k in ignore_keys: - model_state_dict.pop(k, None) - - config = FSMTConfig.from_pretrained(pytorch_dump_folder_path) - model_new = FSMTForConditionalGeneration(config) - - # check that it loads ok - model_new.load_state_dict(model_state_dict, strict=False) - - # save - pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) - print(f"Generating {pytorch_weights_dump_path}") - torch.save(model_state_dict, pytorch_weights_dump_path) - - print("Conversion is done!") - print("\nLast step is to upload the files to s3") - print(f"cd {data_root}") - print(f"transformers upload {model_dir}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--fsmt_checkpoint_path", - default=None, - type=str, - required=True, - help=( - "Path to the official PyTorch checkpoint file which is expected to reside in the dump dir with dicts," - " bpecodes, etc." - ), - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_fsmt_checkpoint_to_pytorch(args.fsmt_checkpoint_path, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/funnel/convert_funnel_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/funnel/convert_funnel_original_tf_checkpoint_to_pytorch.py deleted file mode 100755 index 4eab188f2ab7..000000000000 --- a/src/transformers/models/funnel/convert_funnel_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,64 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Funnel checkpoint.""" - -import argparse - -import torch - -from transformers import FunnelBaseModel, FunnelConfig, FunnelModel, load_tf_weights_in_funnel -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, base_model): - # Initialise PyTorch model - config = FunnelConfig.from_json_file(config_file) - print(f"Building PyTorch model from configuration: {config}") - model = FunnelBaseModel(config) if base_model else FunnelModel(config) - - # Load weights from tf checkpoint - load_tf_weights_in_funnel(model, config, tf_checkpoint_path) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - torch.save(model.state_dict(), pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--config_file", - default=None, - type=str, - required=True, - help="The config json file corresponding to the pre-trained model. \nThis specifies the model architecture.", - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--base_model", action="store_true", help="Whether you want just the base model (no decoder) or not." - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch( - args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path, args.base_model - ) diff --git a/src/transformers/models/fuyu/convert_fuyu_model_weights_to_hf.py b/src/transformers/models/fuyu/convert_fuyu_model_weights_to_hf.py deleted file mode 100644 index 29ef7859c9a0..000000000000 --- a/src/transformers/models/fuyu/convert_fuyu_model_weights_to_hf.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import os -import sys -import warnings - -import flatdict -import torch - -from transformers import FuyuConfig, FuyuForCausalLM, LlamaTokenizer - - -try: - from transformers import LlamaTokenizerFast - - tokenizer_class = LlamaTokenizerFast -except ImportError as e: - warnings.warn(e) - warnings.warn( - "The converted tokenizer will be the `slow` tokenizer. 
To use the fast, update your `tokenizers` library and re-run the tokenizer conversion" - ) - tokenizer_class = LlamaTokenizer - -""" -Sample usage: # TODO fix clone links from persimmon to fuyu -``` -git clone https://github.com/adept-ai-labs/adept-inference -wget https://axtkn4xl5cip.objectstorage.us-phoenix-1.oci.customer-oci.com/n/axtkn4xl5cip/b/adept-public-data/o/8b_base_model_release.tar -wget https://axtkn4xl5cip.objectstorage.us-phoenix-1.oci.customer-oci.com/n/axtkn4xl5cip/b/adept-public-data/o/8b_chat_model_release.tar -python src/transformers/models/fuyu/convert_fuyu_weights_to_hf.py --input_dir /path/to/downloaded/fuyu/weights/ --output_dir /output/path -``` - -Thereafter, models can be loaded via: - -```py -from transformers import FuyuForCausalLM, FuyuTokenizer - -model = FuyuForCausalLM.from_pretrained("/output/path") -tokenizer = FuyuTokenizer.from_pretrained("/output/path") -``` - -Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions -come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). -""" - - -KEYS_TO_MODIFY_MAPPING = { - "self_attention": "self_attn", - "language_model.encoder": "language_model.model", - "word_embeddings_for_head": "language_model.lm_head", - "language_model.embedding.word_embeddings": "language_model.model.embed_tokens", - "vit_encoder.linear_encoder": "vision_embed_tokens", -} - -KEYS_TO_REMOVE = { - "rotary_emb.inv_freq", - "image_patch_projection", - "image_patch_projection.weight", - "image_patch_projection.bias", -} - - -def rename_state_dict(state_dict): - model_state_dict = {} - for key, value in state_dict.items(): - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - # if KEYS_TO_REMOVE in key: - if key in KEYS_TO_REMOVE: - continue - model_state_dict[key] = value - return model_state_dict - - -def convert_fuyu_checkpoint(pytorch_dump_folder_path, ada_lib_path, pt_model_path, safe_serialization=False): - sys.path.insert(0, ada_lib_path) - model_state_dict_base = torch.load(pt_model_path, map_location="cpu", weights_only=True) - state_dict = flatdict.FlatDict(model_state_dict_base["model"], ".") - state_dict = rename_state_dict(state_dict) - - transformers_config = FuyuConfig() - model = FuyuForCausalLM(transformers_config).to(torch.bfloat16) - model.load_state_dict(state_dict) - model.save_pretrained(pytorch_dump_folder_path, safe_serialization=safe_serialization) - transformers_config.save_pretrained(pytorch_dump_folder_path) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_dir", - help="Location of Fuyu weights, which contains tokenizer.model and model folders", - ) - parser.add_argument( - "--pt_model_path", - help="Location of Fuyu `model_optim_rng.pt`", - ) - parser.add_argument( - "--output_dir", - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--ada_lib_path", - help="Location of original source code from adept to deserialize .pt checkpoint", - ) - parser.add_argument("--safe_serialization", type=bool, help="Whether or not to save using `safetensors`.") - args = parser.parse_args() - spm_path = os.path.join(args.input_dir, "adept_vocab.model") - - convert_fuyu_checkpoint( - pytorch_dump_folder_path=args.output_dir, - pt_model_path=args.pt_model_path, - safe_serialization=args.safe_serialization, - ada_lib_path=args.ada_lib_path, - ) - tokenizer = 
tokenizer_class(spm_path, bos_token="|ENDOFTEXT|", eos_token="|ENDOFTEXT|") - tokenizer.save_pretrained(args.output_dir) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/fuyu/image_processing_fuyu.py b/src/transformers/models/fuyu/image_processing_fuyu.py index e52d9dc8ee91..366782be16f4 100644 --- a/src/transformers/models/fuyu/image_processing_fuyu.py +++ b/src/transformers/models/fuyu/image_processing_fuyu.py @@ -135,7 +135,7 @@ def to(self, *args, **kwargs) -> "BatchFeature": [`BatchFeature`]: The same instance after modification. """ requires_backends(self, ["torch"]) - import torch # noqa + import torch new_data = {} device = kwargs.get("device") diff --git a/src/transformers/models/gemma/convert_gemma_weights_to_hf.py b/src/transformers/models/gemma/convert_gemma_weights_to_hf.py deleted file mode 100644 index ac624df78505..000000000000 --- a/src/transformers/models/gemma/convert_gemma_weights_to_hf.py +++ /dev/null @@ -1,206 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import os -import warnings - -import torch -from accelerate import init_empty_weights - -from transformers import GemmaConfig, GemmaForCausalLM, GemmaTokenizer - - -try: - from transformers import GemmaTokenizerFast -except ImportError as e: - warnings.warn(e) - warnings.warn( - "The converted tokenizer will be the `slow` tokenizer. To use the fast, update your `tokenizers` library and re-run the tokenizer conversion" - ) - GemmaTokenizerFast = None - -""" -Sample usage: - -``` -python src/transformers/models/gemma/convert_gemma_weights_to_hf.py \ - --input_dir /path/to/downloaded/gemma/weights --model_size 7B --output_dir /output/path -``` - -Thereafter, models can be loaded via: - -```py -from transformers import GemmaForCausalLM, GemmaTokenizerFast - -model = GemmaForCausalLM.from_pretrained("/output/path") -tokenizer = GemmaTokenizerFast.from_pretrained("/output/path") -``` - -Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions -come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). 
-""" - -gemma_2b_config = GemmaConfig( - num_hidden_layers=18, - num_attention_heads=8, - num_key_value_heads=1, - hidden_size=2048, - intermediate_size=16384, -) - -gemma_7b_config = GemmaConfig() - -CONFIG_MAPPING = {"2B": gemma_2b_config, "7B": gemma_7b_config} -LAYER_NAME_MAPPING = {"embedder.weight": "model.embed_tokens.weight"} - - -def write_model(save_path, input_base_path, config, safe_serialization=True, push_to_hub=False, dtype=torch.float32): - num_attn_heads = config.num_attention_heads - hidden_size = config.hidden_size - num_kv_heads = config.num_key_value_heads - head_dim = config.head_dim - - print(f"Fetching all parameters from the checkpoint at '{input_base_path}'") - model_state_dict = torch.load(input_base_path, map_location="cpu", weights_only=True)["model_state_dict"] - model_state_dict.pop("freqs_cis") - - state_dict = {} - for k, v in model_state_dict.items(): - if "qkv_proj" in k: - if num_kv_heads == 1: - v = v.reshape(num_attn_heads + num_kv_heads * 2, head_dim, hidden_size) - q_proj = v[:num_attn_heads, ...] - k_proj = v[num_attn_heads : num_attn_heads + num_kv_heads, ...].repeat(num_kv_heads, 1, 1) - v_proj = v[-num_kv_heads:, ...].repeat(num_kv_heads, 1, 1) - - state_dict[k.replace("qkv_proj", "q_proj")] = q_proj.reshape( - num_attn_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "k_proj")] = k_proj.reshape( - num_kv_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "v_proj")] = v_proj[0].clone() - else: - q_proj, k_proj, v_proj = torch.split(v, v.shape[0] // 3, 0) - state_dict[k.replace("qkv_proj", "q_proj")] = q_proj.reshape( - num_attn_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "k_proj")] = k_proj.reshape( - num_kv_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "v_proj")] = v_proj.clone() - - elif k == "embedder.weight": - state_dict[LAYER_NAME_MAPPING[k]] = v - state_dict["lm_head.weight"] = v - else: - state_dict[k] = v - - torch.set_default_dtype(dtype) - - print("Loading the checkpoint in a Gemma model.") - with init_empty_weights(): - model = GemmaForCausalLM(config) - model.load_state_dict(state_dict, assign=True, strict=False) - - model.config.dtype = torch.float32 - del model.config._name_or_path - print("Saving in the Transformers format.") - - if push_to_hub: - print(f"pushing the model to {save_path}") - model.push_to_hub(save_path, safe_serialization=safe_serialization, private=True) - else: - model.save_pretrained(save_path, safe_serialization=safe_serialization) - - -def write_tokenizer(input_tokenizer_path, save_path, push_to_hub=False): - # Initialize the tokenizer based on the `spm` model - tokenizer_class = GemmaTokenizer if GemmaTokenizerFast is None else GemmaTokenizerFast - print(f"Saving a {tokenizer_class.__name__} to {save_path}.") - tokenizer = tokenizer_class(input_tokenizer_path) - if push_to_hub: - tokenizer.push_to_hub(save_path) - else: - tokenizer.save_pretrained(save_path) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_checkpoint", - help="Absolute path to the target Gemma weights.", - required=True, - ) - parser.add_argument( - "--tokenizer_checkpoint", - help="Location of Gemma tokenizer model", - ) - parser.add_argument( - "--model_size", - default="7B", - choices=["2B", "7B", "tokenizer_only"], - help="'f' models correspond to the finetuned versions, and are specific to the Gemma2 official release. 
For more details on Gemma2, check out the original repo: https://huggingface.co/google/gemma-7b", - ) - parser.add_argument( - "--output_dir", - default="google/gemma-7b", - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--pickle_serialization", - help="Whether or not to save using `safetensors`.", - action="store_true", - default=False, - ) - parser.add_argument( - "--convert_tokenizer", - help="Whether or not to convert the tokenizer as well.", - action="store_true", - default=False, - ) - parser.add_argument( - "--push_to_hub", - help="Whether or not to push the model to the hub at `output_dir` instead of saving it locally.", - action="store_true", - default=False, - ) - parser.add_argument( - "--dtype", - default="float32", - help="Target dtype of the converted model", - ) - args = parser.parse_args() - - if args.convert_tokenizer: - if args.tokenizer_checkpoint is None: - raise ValueError("Path to the tokenizer is required when passing --convert_tokenizer") - - spm_path = os.path.join(args.tokenizer_checkpoint) - write_tokenizer(spm_path, args.output_dir, args.push_to_hub) - - config = CONFIG_MAPPING[args.model_size] - dtype = getattr(torch, args.dtype) - write_model( - config=config, - input_base_path=args.input_checkpoint, - save_path=args.output_dir, - safe_serialization=not args.pickle_serialization, - push_to_hub=args.push_to_hub, - dtype=dtype, - ) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index 5f72f27d9382..04d27b309a40 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -322,6 +322,13 @@ class GemmaPreTrainedModel(PreTrainedModel): "attentions": GemmaAttention, } + def _init_weights(self, module): + super()._init_weights(module) + + # We initialize with 0s to be 1 centered as the RMSNorm here does (1 + weight) + if "RMSNorm" in module.__class__.__name__: + module.weight.data.zero_() + @auto_docstring class GemmaModel(GemmaPreTrainedModel): diff --git a/src/transformers/models/gemma/modular_gemma.py b/src/transformers/models/gemma/modular_gemma.py index 281fcd54fb7d..f2f9c7dc4056 100644 --- a/src/transformers/models/gemma/modular_gemma.py +++ b/src/transformers/models/gemma/modular_gemma.py @@ -23,6 +23,7 @@ from ...configuration_utils import PretrainedConfig from ...masking_utils import create_causal_mask from ...modeling_outputs import BaseModelOutputWithPast +from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack from ...tokenization_utils import AddedToken, PreTrainedTokenizer from ...utils import TransformersKwargs, logging @@ -32,6 +33,8 @@ LlamaForTokenClassification, LlamaMLP, LlamaModel, + LlamaPreTrainedModel, + LlamaRotaryEmbedding, ) from ..llama.tokenization_llama import LlamaTokenizer @@ -366,6 +369,19 @@ def __init__(self, config): self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) +class GemmaRotaryEmbedding(LlamaRotaryEmbedding): + pass + + +class GemmaPreTrainedModel(LlamaPreTrainedModel): + def _init_weights(self, module): + PreTrainedModel._init_weights(self, module) + + # We initialize with 0s to be 1 centered as the RMSNorm here does (1 + weight) + if "RMSNorm" in module.__class__.__name__: + module.weight.data.zero_() + + class GemmaModel(LlamaModel): def forward( self, @@ -472,5 +488,5 @@ class GemmaForTokenClassification(LlamaForTokenClassification): "GemmaForCausalLM", 
"GemmaForSequenceClassification", "GemmaForTokenClassification", - "GemmaPreTrainedModel", # noqa: F822 + "GemmaPreTrainedModel", ] diff --git a/src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py b/src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py deleted file mode 100644 index ba8705534fd0..000000000000 --- a/src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py +++ /dev/null @@ -1,239 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import os -import warnings - -import torch -from accelerate import init_empty_weights - -from transformers import Gemma2Config, Gemma2ForCausalLM, GemmaTokenizer - - -try: - from transformers import GemmaTokenizerFast -except ImportError as e: - warnings.warn(e) - warnings.warn( - "The converted tokenizer will be the `slow` tokenizer. To use the fast, update your `tokenizers` library and re-run the tokenizer conversion" - ) - GemmaTokenizerFast = None - -""" -Sample usage: - -``` -python src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py \ - --input_dir /path/to/downloaded/gemma/weights --model_size 9B --output_dir /output/path -``` - -Thereafter, models can be loaded via: - -```py -from transformers import Gemma2ForCausalLM, GemmaTokenizerFast - -model = Gemma2ForCausalLM.from_pretrained("/output/path") -tokenizer = GemmaTokenizerFast.from_pretrained("/output/path") -``` - -Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions -come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). 
-""" - -gemma_9b_config = Gemma2Config( - num_hidden_layers=42, - num_attention_heads=16, - num_key_value_heads=8, - hidden_size=3584, - intermediate_size=14336, - final_logit_softcapping=30.0, - attn_logit_softcapping=50.0, - head_dim=256, - sliding_window=4096, - query_pre_attn_scalar=224, -) - -gemma_27b_config = Gemma2Config( - num_hidden_layers=46, - num_attention_heads=32, - num_key_value_heads=16, - hidden_size=4608, - intermediate_size=36864, - final_logit_softcapping=30.0, - attn_logit_softcapping=50.0, - head_dim=128, - sliding_window=4096, - query_pre_attn_scalar=144, -) - -CONFIG_MAPPING = {"9B": gemma_9b_config, "27B": gemma_27b_config} -LAYER_NAME_MAPPING = {"embedder.weight": "model.embed_tokens.weight"} - - -def write_model(save_path, input_base_path, config, safe_serialization=True, push_to_hub=False, dtype=torch.float32): - num_attn_heads = config.num_attention_heads - hidden_size = config.hidden_size - num_kv_heads = config.num_key_value_heads - head_dim = config.head_dim - - print(f"Fetching all parameters from the checkpoint at '{input_base_path}'") - - if os.path.isdir(input_base_path): - print("Model seems sharded") - - model_state_dict = {} - files = [file for file in os.listdir(input_base_path) if file.endswith(".bin")] - - for file in files: - print(file) - loaded_state_dict = torch.load(os.path.join(input_base_path, file), map_location="cpu", weights_only=True) - model_state_dict.update(loaded_state_dict) - else: - print("Model does not seem to be sharded") - model_state_dict = torch.load(input_base_path, map_location="cpu", weights_only=True)["model_state_dict"] - model_state_dict.pop("freqs_cis") - - state_dict = {} - for k, v in model_state_dict.items(): - if "qkv_proj" in k: - if num_kv_heads == 1: - v = v.reshape(num_attn_heads + num_kv_heads * 2, head_dim, hidden_size) - q_proj = v[:num_attn_heads, ...] 
- k_proj = v[num_attn_heads : num_attn_heads + num_kv_heads, ...].repeat(num_kv_heads, 1, 1) - v_proj = v[-num_kv_heads:, ...].repeat(num_kv_heads, 1, 1) - - state_dict[k.replace("qkv_proj", "q_proj")] = q_proj.reshape( - num_attn_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "k_proj")] = k_proj.reshape( - num_kv_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "v_proj")] = v_proj[0].clone() - else: - q_proj, k_proj, v_proj = torch.split( - v, [num_attn_heads * head_dim, num_kv_heads * head_dim, num_kv_heads * head_dim], 0 - ) - state_dict[k.replace("qkv_proj", "q_proj")] = q_proj.reshape( - num_attn_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "k_proj")] = k_proj.reshape( - num_kv_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "v_proj")] = v_proj.reshape( - num_kv_heads * head_dim, hidden_size - ).clone() - - elif k == "embedder.weight": - state_dict[LAYER_NAME_MAPPING[k]] = v - state_dict["lm_head.weight"] = v - else: - state_dict[k] = v - - torch.set_default_dtype(dtype) - - print("Loading the checkpoint in a Gemma2 model.") - with init_empty_weights(): - model = Gemma2ForCausalLM(config) - model.load_state_dict(state_dict, assign=True, strict=False) - - model.config.dtype = torch.float32 - del model.config._name_or_path - print("Saving in the Transformers format.") - - if push_to_hub: - print(f"pushing the model to {save_path}") - model.push_to_hub(save_path, safe_serialization=safe_serialization, private=True) - else: - model.save_pretrained(save_path, safe_serialization=safe_serialization) - - -def write_tokenizer(input_tokenizer_path, save_path, push_to_hub=False): - # Initialize the tokenizer based on the `spm` model - tokenizer_class = GemmaTokenizer if GemmaTokenizerFast is None else GemmaTokenizerFast - print(f"Saving a {tokenizer_class.__name__} to {save_path}.") - tokenizer = tokenizer_class(input_tokenizer_path) - if push_to_hub: - tokenizer.push_to_hub(save_path) - else: - tokenizer.save_pretrained(save_path) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_checkpoint", - help="Absolute path to the target Gemma2 weights.", - required=True, - ) - parser.add_argument( - "--tokenizer_checkpoint", - help="Location of Gemma2 tokenizer model", - ) - parser.add_argument( - "--model_size", - default="9B", - choices=["9B", "27B", "tokenizer_only"], - help="'f' models correspond to the finetuned versions, and are specific to the Gemma22 official release. 
For more details on Gemma2, check out the original repo: https://huggingface.co/google/gemma-7b", - ) - parser.add_argument( - "--output_dir", - default="google/gemma-9b", - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--pickle_serialization", - help="Whether or not to save using `safetensors`.", - action="store_true", - default=False, - ) - parser.add_argument( - "--convert_tokenizer", - help="Whether or not to convert the tokenizer as well.", - action="store_true", - default=False, - ) - parser.add_argument( - "--push_to_hub", - help="Whether or not to push the model to the hub at `output_dir` instead of saving it locally.", - action="store_true", - default=False, - ) - parser.add_argument( - "--dtype", - default="float32", - help="Target dtype of the converted model", - ) - args = parser.parse_args() - - if args.convert_tokenizer: - if args.tokenizer_checkpoint is None: - raise ValueError("Path to the tokenizer is required when passing --convert_tokenizer") - - spm_path = os.path.join(args.tokenizer_checkpoint) - write_tokenizer(spm_path, args.output_dir, args.push_to_hub) - if args.model_size != "tokenizer_only": - config = CONFIG_MAPPING[args.model_size] - dtype = getattr(torch, args.dtype) - write_model( - config=config, - input_base_path=args.input_checkpoint, - save_path=args.output_dir, - safe_serialization=not args.pickle_serialization, - push_to_hub=args.push_to_hub, - dtype=dtype, - ) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py index 3d088cfc52cf..ec2f1521ef85 100644 --- a/src/transformers/models/gemma2/modeling_gemma2.py +++ b/src/transformers/models/gemma2/modeling_gemma2.py @@ -83,6 +83,42 @@ def forward(self, x): return down_proj +class Gemma2RotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: Gemma2Config, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] @@ -299,42 +335,6 @@ def forward( return outputs -class Gemma2RotaryEmbedding(nn.Module): - inv_freq: torch.Tensor # fix linting for `register_buffer` - - def __init__(self, config: Gemma2Config, device=None): - super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq - - @torch.no_grad() - @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) - position_ids_expanded = position_ids[:, None, :].float() - - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - @auto_docstring class Gemma2PreTrainedModel(PreTrainedModel): config: Gemma2Config @@ -353,6 +353,13 @@ class Gemma2PreTrainedModel(PreTrainedModel): "attentions": Gemma2Attention, } + def _init_weights(self, module): + super()._init_weights(module) + + # We initialize with 0s to be 1 centered as the RMSNorm here does (1 + weight) + if "RMSNorm" in module.__class__.__name__: + module.weight.data.zero_() + @auto_docstring class Gemma2Model(Gemma2PreTrainedModel): diff --git a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py index c7e34e4abed4..e54795019c7f 100644 --- a/src/transformers/models/gemma2/modular_gemma2.py +++ b/src/transformers/models/gemma2/modular_gemma2.py @@ -36,7 +36,9 @@ GemmaForTokenClassification, GemmaMLP, GemmaModel, + GemmaPreTrainedModel, GemmaRMSNorm, + GemmaRotaryEmbedding, apply_rotary_pos_emb, repeat_kv, ) @@ -212,6 +214,10 @@ def __init__(self, config): self.act_fn = ACT2FN[config.hidden_activation] +class Gemma2RotaryEmbedding(GemmaRotaryEmbedding): + pass + + def eager_attention_forward( module: nn.Module, query: torch.Tensor, @@ -363,6 +369,10 
@@ def forward( return outputs +class Gemma2PreTrainedModel(GemmaPreTrainedModel): + pass + + class Gemma2Model(GemmaModel): def __init__(self, config: Gemma2Config): super().__init__(config) @@ -571,7 +581,7 @@ class Gemma2ForTokenClassification(GemmaForTokenClassification): "Gemma2Config", "Gemma2ForCausalLM", "Gemma2Model", - "Gemma2PreTrainedModel", # noqa: F822 + "Gemma2PreTrainedModel", "Gemma2ForSequenceClassification", "Gemma2ForTokenClassification", ] diff --git a/src/transformers/models/gemma3/convert_gemma3_weights.py b/src/transformers/models/gemma3/convert_gemma3_weights.py deleted file mode 100644 index 8d7a21219197..000000000000 --- a/src/transformers/models/gemma3/convert_gemma3_weights.py +++ /dev/null @@ -1,689 +0,0 @@ -# coding=utf-8 -# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved. -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -r"""Utility to convert Gemma models from Orbax to HF Transformers checkpoint. - -python src/transformers/models/gemma3/convert_gemma3_weights.py \ - --variant='gemma3_4b' \ - --tokenizer_path="$HOME/gemma3/tokenizer/gemma3_cleaned_262144_v2.spiece.model" \ - --checkpoint_path="$HOME/gemma3/gemma3_4b_pt_orbax/" \ - --output_path="$HOME/gemma3/gemma3_4b_pt_safetensors/" -""" - -from collections.abc import Iterator, Sequence -from typing import Any, Optional - -import accelerate -import numpy as np -import torch -import tree -from absl import app, flags, logging -from orbax import checkpoint as obc - -from transformers import ( - Gemma3Config, - Gemma3ForCausalLM, - Gemma3ForConditionalGeneration, - Gemma3ImageProcessor, - Gemma3Processor, - Gemma3TextConfig, - Gemma3TextModel, - GemmaTokenizerFast, - GenerationConfig, - SiglipVisionConfig, -) -from transformers.image_utils import PILImageResampling - - -# ==== Internal Constants and Classes ==== - - -_CHAT_TEMPLATE = """{{ bos_token }} -{%- if messages[0]['role'] == 'system' -%} - {%- if messages[0]['content'] is string -%} - {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%} - {%- else -%} - {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%} - {%- endif -%} - {%- set loop_messages = messages[1:] -%} -{%- else -%} - {%- set first_user_prefix = "" -%} - {%- set loop_messages = messages -%} -{%- endif -%} -{%- for message in loop_messages -%} - {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%} - {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }} - {%- endif -%} - {%- if (message['role'] == 'assistant') -%} - {%- set role = "model" -%} - {%- else -%} - {%- set role = message['role'] -%} - {%- endif -%} - {{ '' + role + '\n' + (first_user_prefix if loop.first else "") }} - {%- if message['content'] is string -%} - {{ message['content'] | trim }} - {%- elif message['content'] is iterable -%} - {%- for item in message['content'] -%} - {%- if item['type'] == 'image' -%} - {{ '' }} - {%- elif item['type'] == 'text' -%} - {{ item['text'] | trim }} - {%- endif -%} - {%- endfor -%} - {%- 
else -%} - {{ raise_exception("Invalid content type") }} - {%- endif -%} - {{ '\n' }} -{%- endfor -%} -{%- if add_generation_prompt -%} - {{'model\n'}} -{%- endif -%} -""" - -_DTYPES = {"float32", "bfloat16", "float16"} - -_SIGLIP_BASE = "SigLiPFromPatches_0/siglip_encoder" -_SIGLIP_EMBEDDING = "SigLiPFromPatches_0/siglip_encoder/embedding" -_SIGLIP_TRANSFORMER_ENCODER_BLOCK = "SigLiPFromPatches_0/siglip_encoder/Transformer/encoderblock_" -_SIGLIP_TRANSFORMER_ENCODER_BLOCK_LEN = len(_SIGLIP_TRANSFORMER_ENCODER_BLOCK) -_SIGLIP_TRANSFORMER_ENCODER_NORM = "SigLiPFromPatches_0/siglip_encoder/Transformer/encoder_norm" - -_TRANSFORMER_DECODER_BLOCK = "/layer_" -_TRANSFORMER_DECODER_BLOCK_LEN = len(_TRANSFORMER_DECODER_BLOCK) -_TRANSFORMER_EMBEDDER = "/embedder" -_TRANSFORMER_FINAL_NORM = "/final_norm" -_TRANSFORMER_POST_TRAINING_PREFIX = "rlx_networks/policy_network/" -_TRANSFORMER_POST_TRAINING_PREFIX_LEN = len(_TRANSFORMER_POST_TRAINING_PREFIX) - -_VISION_CONFIG = { - "hidden_size": 1152, - "intermediate_size": 4304, - "num_hidden_layers": 27, - "num_attention_heads": 16, - "num_channels": 3, - "image_size": 896, - "patch_size": 14, - "hidden_act": "gelu_pytorch_tanh", - "layer_norm_eps": 1e-6, - "attention_dropout": 0.0, - "vision_use_head": False, -} - -_VARIANT_EMBEDDINGGEMMA = "embedding" -_VARIANT_GEMMA_3_270M = "gemma3_270m" -_VARIANT_GEMMA_3_1B = "gemma3_1b" -_VARIANT_GEMMA_3_4B = "gemma3_4b" -_VARIANT_GEMMA_3_12B = "gemma3_12b" -_VARIANT_GEMMA_3_27B = "gemma3_27b" -_VARIANTS = { - _VARIANT_EMBEDDINGGEMMA: Gemma3Config( - text_config=Gemma3TextConfig( - vocab_size=262_144, - hidden_size=768, - intermediate_size=1152, - num_hidden_layers=24, - num_attention_heads=3, - num_key_value_heads=1, - head_dim=256, - max_position_embeddings=1024, - query_pre_attn_scalar=256, - sliding_window=512, - rope_scaling=None, - use_bidirectional_attention=True, - ), - vision_config=None, - ), - _VARIANT_GEMMA_3_270M: Gemma3Config( - text_config=Gemma3TextConfig( - vocab_size=262_144, - hidden_size=640, - intermediate_size=2048, - num_hidden_layers=18, - num_attention_heads=4, - num_key_value_heads=1, - head_dim=256, - max_position_embeddings=32768, - query_pre_attn_scalar=256, - sliding_window=512, - rope_scaling=None, - ), - vision_config=None, - ), - _VARIANT_GEMMA_3_1B: Gemma3Config( - text_config=Gemma3TextConfig( - vocab_size=262_144, - hidden_size=1152, - intermediate_size=6 * 1152, - num_attention_heads=4, - num_hidden_layers=26, - num_key_value_heads=1, - head_dim=256, - sliding_window=512, - rope_theta=1_000_000, # used for global RoPE only - rope_local_base_freq=10_000, - attn_logit_softcapping=None, - query_pre_attn_scalar=256, - max_position_embeddings=32_768, - ), - vision_config=None, - ), - _VARIANT_GEMMA_3_4B: Gemma3Config( - text_config=Gemma3TextConfig( - vocab_size=262_208, - hidden_size=2560, - intermediate_size=2560 * 8 // 2, - num_attention_heads=8, - head_dim=256, - num_hidden_layers=34, - num_key_value_heads=4, - sliding_window=1024, - rope_scaling={"rope_type": "linear", "factor": 8.0}, # used for global RoPE only - rope_theta=1_000_000, - rope_local_base_freq=10_000, - attn_logit_softcapping=None, - query_pre_attn_scalar=256, - ), - vision_config=_VISION_CONFIG, - ), - _VARIANT_GEMMA_3_12B: Gemma3Config( - text_config=Gemma3TextConfig( - vocab_size=262_208, - hidden_size=30 * 128, - intermediate_size=30 * 128 * 8 // 2, - num_attention_heads=16, - head_dim=256, - num_hidden_layers=48, - num_key_value_heads=8, - sliding_window=1024, - rope_scaling={"rope_type": "linear", "factor": 
8.0}, # used for global RoPE only - rope_theta=1_000_000, - rope_local_base_freq=10_000, - attn_logit_softcapping=None, - query_pre_attn_scalar=256, - ), - vision_config=_VISION_CONFIG, - ), - _VARIANT_GEMMA_3_27B: Gemma3Config( - text_config=Gemma3TextConfig( - vocab_size=262_208, - hidden_size=42 * 128, - intermediate_size=42 * 128 * 8 // 2, - num_attention_heads=32, - num_hidden_layers=62, - num_key_value_heads=16, - head_dim=128, - sliding_window=1024, - rope_scaling={"rope_type": "linear", "factor": 8.0}, # used for global RoPE only - rope_theta=1_000_000, - rope_local_base_freq=10_000, - attn_logit_softcapping=None, - query_pre_attn_scalar=(42 * 128 // 32), # 1 / sqrt(hidden_size // num_attention_heads) - ), - vision_config=_VISION_CONFIG, - ), -} - -_TEXT_ONLY_VARIANTS = (_VARIANT_EMBEDDINGGEMMA, _VARIANT_GEMMA_3_270M, _VARIANT_GEMMA_3_1B) - -# ==== Flags ==== - -_CHECKPOINT_PATH = flags.DEFINE_string( - name="checkpoint_path", - default=None, - help="Path to the Orbax checkpoint.", - required=True, -) - -_INCLUDE_CHAT_TEMPLATE = flags.DEFINE_bool( - name="include_chat_template", default=False, help="If true, will save the default chat template with the tokenizer" -) - -_OUTPUT_PATH = flags.DEFINE_string( - name="output_path", - default=None, - help="Path to store the HF checkpoint.", - required=True, -) - -_NUM_LINEAR_LAYERS = flags.DEFINE_integer( - name="num_linear_layers", - default=2, - help="Number of linear projection layers at the end of the Sentence Transformer.", -) - -_TRANSFORMER_DTYPE = flags.DEFINE_enum( - name="text_dtype", - default="bfloat16", - help="The floating point precision (aka dtype) of the model.", - enum_values=_DTYPES, -) - -_TOKENIZER_PATH = flags.DEFINE_string( - name="tokenizer_path", - default=None, - help="Path to the SentencePiece model file.", - required=True, -) - -_VARIANT = flags.DEFINE_enum( - name="variant", - default=_VARIANT_GEMMA_3_4B, - help="The model variant to convert.", - enum_values=set(_VARIANTS.keys()), -) - -_VERBOSE = flags.DEFINE_bool( - name="verbose", - default=False, - help="If true, log the path, shape, and dtype of every converted layer.", -) - -_VISION_DTYPE = flags.DEFINE_enum( - name="vision_dtype", - default="float32", - help="The floating point precision (aka dtype) of the model.", - enum_values=_DTYPES, -) - - -def convert_siglip_weight( - config: SiglipVisionConfig, - paths: Sequence[str], - weights: np.ndarray, -) -> tuple[str, np.ndarray]: - path, prop = paths - normalized_path: str = "" - updated_weights: np.ndarray = None - - if path == _SIGLIP_BASE: - normalized_path = "vision_tower.vision_model.embeddings.position_embedding.weight" - updated_weights = weights.reshape(-1, config.hidden_size) - elif path == _SIGLIP_EMBEDDING: - if prop == "kernel": - normalized_path = "vision_tower.vision_model.embeddings.patch_embedding.weight" - updated_weights = weights.transpose(3, 2, 0, 1) - elif prop == "bias": - normalized_path = "vision_tower.vision_model.embeddings.patch_embedding.bias" - updated_weights = weights - else: - raise ValueError(f"Unexpected member, `{prop}`, for path `{path}`. 
Should be `bias` or `kernel`.") - elif path.startswith(_SIGLIP_TRANSFORMER_ENCODER_BLOCK): - encoder_block_path = path[_SIGLIP_TRANSFORMER_ENCODER_BLOCK_LEN:] - next_path_separator_idx = encoder_block_path.find("/") - layer_idx = encoder_block_path[:next_path_separator_idx] - encoder_block_path = encoder_block_path[next_path_separator_idx:] - normalized_path = f"vision_tower.vision_model.encoder.layers.{layer_idx}" - - if encoder_block_path.startswith("/LayerNorm"): - normalized_path += ".layer_norm1" if path.endswith("_0") else ".layer_norm2" - - if prop == "scale": - normalized_path += ".weight" - updated_weights = weights.transpose() - elif prop == "bias": - normalized_path += ".bias" - updated_weights = weights - else: - raise ValueError(f"Unexpected member, `{prop}`, for path `{path}`. Should be `bias` or `scale`.") - elif encoder_block_path.startswith("/MlpBlock_0"): - normalized_path += ".mlp.fc1" if "/Dense_0" in encoder_block_path else ".mlp.fc2" - - if prop == "kernel": - normalized_path += ".weight" - updated_weights = weights.transpose() - elif prop == "bias": - normalized_path += ".bias" - updated_weights = weights - else: - raise ValueError(f"Unexpected member, `{prop}`, for path `{path}`. Should be `bias` or `kernel`.") - elif encoder_block_path.startswith("/MultiHeadDotProductAttention_0"): - if encoder_block_path.endswith("/key"): - normalized_path += ".self_attn.k_proj" - elif encoder_block_path.endswith("/out"): - normalized_path += ".self_attn.out_proj" - elif encoder_block_path.endswith("/query"): - normalized_path += ".self_attn.q_proj" - elif encoder_block_path.endswith("/value"): - normalized_path += ".self_attn.v_proj" - else: - raise ValueError(f"Unexpected path `{path}` in SigLIP Transformer MultiHeadDotProductAttention_0.") - - if prop == "bias": - normalized_path += ".bias" - updated_weights = weights.reshape(-1, config.hidden_size).reshape(-1) - elif prop == "kernel": - normalized_path += ".weight" - updated_weights = weights.reshape(-1, config.hidden_size).transpose() - else: - raise ValueError(f"Unexpected member, `{prop}`, for path `{path}`. Should be `bias` or `kernel`.") - else: - raise ValueError(f"Unexpected path `{path}` in SigLIP Transformer Encoder Block.") - elif path == _SIGLIP_TRANSFORMER_ENCODER_NORM: - if prop == "scale": - normalized_path = "vision_tower.vision_model.post_layernorm.weight" - updated_weights = weights.transpose() - elif prop == "bias": - normalized_path = "vision_tower.vision_model.post_layernorm.bias" - updated_weights = weights - else: - raise ValueError(f"Unexpected member, `{prop}`, for path `{path}`. Should be `bias` or `scale`.") - else: - raise ValueError(f"Unexpected path `{path}`.") - - return normalized_path, updated_weights - - -def convert_transformer_weights( - config: Gemma3TextConfig, - paths: Sequence[str], - weights: np.ndarray, -) -> Iterator[tuple[str, np.ndarray]]: - path, prop = paths - - if path.startswith(_TRANSFORMER_POST_TRAINING_PREFIX): - path = path[_TRANSFORMER_POST_TRAINING_PREFIX_LEN:] - - converted_paths: list[str] = [] - converted_weights: list[Any] = [] - - attn_head_dim = config.num_attention_heads * config.head_dim - kv_head_dim = config.num_key_value_heads * config.head_dim - - if path.endswith(_TRANSFORMER_EMBEDDER): - if prop == "input_embedding": - # Tied to language_model.lm_head.weight, assigned at the end. 
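For context on why the SigLIP branch above transposes nearly every tensor: Flax/Orbax stores dense kernels as (in_features, out_features) and conv kernels as (H, W, C_in, C_out), while PyTorch nn.Linear and nn.Conv2d expect (out_features, in_features) and (C_out, C_in, H, W). A minimal sketch of the layout change, with illustrative shapes rather than values read from a checkpoint:

import numpy as np
import torch

dense_kernel = np.random.randn(1152, 4304)      # Flax layout: (in_features, out_features)
conv_kernel = np.random.randn(14, 14, 3, 1152)  # Flax layout: (H, W, C_in, C_out)

# nn.Linear.weight is (out_features, in_features) -> plain transpose.
linear_weight = torch.from_numpy(dense_kernel.transpose())

# nn.Conv2d.weight is (C_out, C_in, H, W) -> transpose(3, 2, 0, 1),
# the same permutation applied to the patch embedding above.
conv_weight = torch.from_numpy(conv_kernel.transpose(3, 2, 0, 1))

assert linear_weight.shape == (4304, 1152)
assert conv_weight.shape == (1152, 3, 14, 14)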
- converted_paths = ["language_model.model.embed_tokens.weight"] - - if _VARIANT.value not in _TEXT_ONLY_VARIANTS: - # Gemma3 model doesn't have image soft token in input and output embeddings, resize to avoid bugs we had with Mllama - pre_expansion_embeddings = weights - mu = np.mean(pre_expansion_embeddings, axis=0) - sigma = np.cov(pre_expansion_embeddings, rowvar=False, bias=True) - new_embeddings = np.random.multivariate_normal(mu, 1e-5 * sigma, size=64) - weights = np.vstack([pre_expansion_embeddings, new_embeddings]) - - converted_weights = [weights] - elif _VARIANT.value in _TEXT_ONLY_VARIANTS or prop in ("mm_output_embedding", "mm_input_embedding_extra"): - return zip([], []) - else: - raise ValueError(f"Unexpected member, {prop}, in Embedder.") - elif path.startswith(f"{_TRANSFORMER_EMBEDDER}/mm"): - if _VARIANT.value in _TEXT_ONLY_VARIANTS: - return zip([], []) - - if path.endswith("/mm_input_projection"): - converted_paths = ["multi_modal_projector.mm_input_projection_weight"] - converted_weights = [weights] - elif path.endswith("/mm_soft_embedding_norm"): - converted_paths = ["multi_modal_projector.mm_soft_emb_norm.weight"] - converted_weights = [weights] - else: - raise ValueError(f"Unexpected subpath, `{path}`, in Embedder.") - elif path.endswith(_TRANSFORMER_FINAL_NORM): - converted_paths = ["language_model.model.norm.weight"] - converted_weights = [weights] - elif _TRANSFORMER_DECODER_BLOCK in path: - decoder_block_start = path.find(_TRANSFORMER_DECODER_BLOCK) - decoder_block_offset = decoder_block_start + _TRANSFORMER_DECODER_BLOCK_LEN - decoder_block_path = path[decoder_block_offset:] - next_path_seperator_idx = decoder_block_path.find("/") - layer_idx = decoder_block_path[:next_path_seperator_idx] - decoder_block_path = decoder_block_path[next_path_seperator_idx:] - - base_path = f"language_model.model.layers.{layer_idx}" - - if path.endswith("attn/attn_vec_einsum"): - converted_paths = [f"{base_path}.self_attn.o_proj.weight"] - converted_weights = [weights.transpose(2, 0, 1).reshape(config.hidden_size, attn_head_dim)] - elif path.endswith("attn/_key_norm"): - converted_paths = [f"{base_path}.self_attn.k_norm.weight"] - converted_weights = [weights] - elif path.endswith("attn/kv_einsum"): - converted_paths = [ - f"{base_path}.self_attn.k_proj.weight", - f"{base_path}.self_attn.v_proj.weight", - ] - k_proj_weights, v_proj_weights = weights - converted_weights = [ - k_proj_weights.transpose(0, 2, 1).reshape(kv_head_dim, config.hidden_size), - v_proj_weights.transpose(0, 2, 1).reshape(kv_head_dim, config.hidden_size), - ] - elif path.endswith("attn/q_einsum"): - converted_paths = [f"{base_path}.self_attn.q_proj.weight"] - converted_weights = [weights.transpose(0, 2, 1).reshape(attn_head_dim, config.hidden_size)] - elif path.endswith("attn/_query_norm"): - converted_paths = [f"{base_path}.self_attn.q_norm.weight"] - converted_weights = [weights] - elif path.endswith("mlp/gating_einsum"): - converted_paths = [ - f"{base_path}.mlp.gate_proj.weight", - f"{base_path}.mlp.up_proj.weight", - ] - gate_proj_weight, up_proj_weight = weights - converted_weights = [gate_proj_weight, up_proj_weight] - elif path.endswith("mlp/linear"): - converted_paths = [f"{base_path}.mlp.down_proj.weight"] - converted_weights = [weights.transpose()] - elif path.endswith("post_attention_norm"): - converted_paths = [f"{base_path}.post_attention_layernorm.weight"] - converted_weights = [weights] - elif path.endswith("post_ffw_norm"): - converted_paths = [f"{base_path}.post_feedforward_layernorm.weight"] 
- converted_weights = [weights] - elif path.endswith("pre_attention_norm"): - converted_paths = [f"{base_path}.input_layernorm.weight"] - converted_weights = [weights] - elif path.endswith("pre_ffw_norm"): - converted_paths = [f"{base_path}.pre_feedforward_layernorm.weight"] - converted_weights = [weights] - else: - raise ValueError(f"Unexpected path `{path}` in Decoder Block.") - - if (cpl := len(converted_paths)) != (cwl := len(converted_weights)): - raise ValueError( - "The `converted_paths` and `converted_weights` should be the same " - f"length. Got {cpl} and {cwl}, respectively, for {path}." - ) - - return zip(converted_paths, converted_weights) - - -def convert( - checkpoint_path: str, config: Gemma3Config, variant: str -) -> tuple[dict[str, torch.Tensor], Optional[Sequence[np.ndarray]]]: - """Loads Orbax checkpoint from `input_path` and converts it to HF tree.""" - checkpointer = obc.PyTreeCheckpointer() - ckpt = checkpointer.restore(checkpoint_path) - hf_tree: dict[str, torch.Tensor] = {} - orbax_tree_flat = tree.flatten_with_path(ckpt) - - def update_tree(path: str, weights: np.ndarray, target_dtype: torch.dtype) -> None: - hf_tree[path] = torch.from_numpy(weights.astype("float32")).type(target_dtype) - if _VERBOSE.value: - logging.info( - "%s converted shape=%s with dtype=%s", - path, - weights.shape, - target_dtype, - ) - - for paths, value in orbax_tree_flat: - if paths[0].startswith("SigLiPFromPatches_"): - if config.vision_config is None: - continue - - path, weights = convert_siglip_weight(config=config.vision_config, paths=paths, weights=value) - update_tree(path, weights, config.vision_config.dtype) - else: - for path, weights in convert_transformer_weights(config=config.text_config, paths=paths, weights=value): - if variant in _TEXT_ONLY_VARIANTS: - path = path[len("language_model.") :] - if variant == _VARIANT_EMBEDDINGGEMMA: - path = path[len("model.") :] - - update_tree(path, weights, config.text_config.dtype) - - if variant == _VARIANT_EMBEDDINGGEMMA: - return hf_tree, [weight[1].T for weight in orbax_tree_flat[: _NUM_LINEAR_LAYERS.value]] - elif config.vision_config is None: - hf_tree["lm_head.weight"] = hf_tree["model.embed_tokens.weight"] - else: - hf_tree["language_model.lm_head.weight"] = hf_tree["language_model.model.embed_tokens.weight"] - - return hf_tree, None - - -def main(*args): - del args - - output_path = _OUTPUT_PATH.value - variant = _VARIANT.value - - config = _VARIANTS[variant] - config.text_config.dtype = getattr(torch, _TRANSFORMER_DTYPE.value) - - if variant in _TEXT_ONLY_VARIANTS: - config.vision_config = None - else: - config.vision_config.dtype = getattr(torch, _VISION_DTYPE.value) - - if _INCLUDE_CHAT_TEMPLATE.value: - # Chat template is included for instruction tuned models, which treat - # both "" and "" as generation stoppers. 
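The decoder-block branch above unpacks fused einsum parameters into per-projection nn.Linear weights. A rough illustration of the q_einsum case, using the gemma3_4b dimensions from the config above and a random tensor purely as a stand-in for the checkpoint array:

import numpy as np

hidden_size, num_heads, head_dim = 2560, 8, 256  # gemma3_4b values
q_einsum = np.random.randn(num_heads, hidden_size, head_dim)

# (num_heads, hidden, head_dim) -> (num_heads, head_dim, hidden) -> (num_heads * head_dim, hidden),
# which is the (out_features, in_features) layout expected for q_proj.weight.
q_proj_weight = q_einsum.transpose(0, 2, 1).reshape(num_heads * head_dim, hidden_size)
assert q_proj_weight.shape == (2048, 2560)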
- config.eos_token_id = [1, 106] - - logging.info( - "Converting Gemma 3 (%s) @ %s (language) and %s (vision)", - variant, - _TRANSFORMER_DTYPE.value, - _VISION_DTYPE.value, - ) - state_tree, st_linears = convert(_CHECKPOINT_PATH.value, config, variant) - logging.info("Converted Gemma 3 (%s) state tree from Orbax to Hugging Face.", variant) - - with accelerate.init_empty_weights(): - if variant == _VARIANT_EMBEDDINGGEMMA: - model = Gemma3TextModel(config=config.text_config) - elif variant in _TEXT_ONLY_VARIANTS: - model = Gemma3ForCausalLM(config=config.text_config) - else: - model = Gemma3ForConditionalGeneration(config) - - model.load_state_dict(state_tree, assign=True, strict=True) - logging.info( - "Loaded Gemma 3 (%s) in Hugging Face Transformers as a %s instance.", - variant, - type(model).__name__, - ) - model.save_pretrained(output_path, safe_serialization=True) - logging.info( - "Saved Gemma 3 (%s) to SafeTensors in %s using %s", - variant, - output_path, - type(model).__name__, - ) - del model - del state_tree - - tokenizer = GemmaTokenizerFast( - _TOKENIZER_PATH.value, - add_bos_token=True, - add_eos_token=variant == _VARIANT_EMBEDDINGGEMMA, - padding_side="right" if variant == _VARIANT_EMBEDDINGGEMMA else "left", - extra_special_tokens={ - "image_token": "", # Should be ID=262_144 - "boi_token": "", # Should be ID=255_999 - "eoi_token": "", # Should be ID=256_000 - }, - chat_template=_CHAT_TEMPLATE if _INCLUDE_CHAT_TEMPLATE.value else None, - ) - tokenizer.save_pretrained(output_path) - logging.info("Saved GemmaTokenizer for %s to %s", variant, output_path) - - if variant not in _TEXT_ONLY_VARIANTS: - image_processor = Gemma3ImageProcessor( - image_seq_length=256, - image_mean=(0.5,) * 3, - image_std=(0.5,) * 3, - size={"height": 896, "width": 896}, - resample=PILImageResampling.BILINEAR, - ) - processor = Gemma3Processor( - image_processor=image_processor, - tokenizer=tokenizer, - chat_template=tokenizer.chat_template, - ) - processor.save_pretrained(output_path) - logging.info("Saved Gemma3Processor for %s to %s", variant, output_path) - del processor - - del tokenizer - - generation_config = GenerationConfig( - pad_token_id=config.pad_token_id, - bos_token_id=config.bos_token_id, - eos_token_id=config.eos_token_id, - cache_implementation="hybrid", - temperature=1.0, - do_sample=True, - top_k=64, - top_p=0.95, - ) - generation_config.save_pretrained(output_path) - - if variant == _VARIANT_EMBEDDINGGEMMA: - from sentence_transformers import SentenceTransformer, models - - # TODO: Support Retrieval tasks where we use `"title: {title} | text: {passage}"` interally and construct this - # from split-records cached data, but externally these come through as a single string with components - # separated by a newline. This should be used for `passage` for SentenceTransformers and the relevant MTEB - # Retrieval tasks. 
- # https://github.com/embeddings-benchmark/mteb/blob/main/docs/usage/usage.md#running-sentencetransformer-model-with-prompts - task_prompts = { - "query": "task: search result | query: ", - "document": "title: none | text: ", - "BitextMining": "task: search result | query: ", - "Clustering": "task: clustering | query: ", - "Classification": "task: classification | query: ", - "InstructionRetrieval": "task: code retrieval | query: ", - "MultilabelClassification": "task: classification | query: ", - "PairClassification": "task: sentence similarity | query: ", - "Reranking": "task: search result | query: ", - "Retrieval": "task: search result | query: ", - "Retrieval-query": "task: search result | query: ", - "Retrieval-document": "title: none | text: ", - "STS": "task: sentence similarity | query: ", - "Summarization": "task: summarization | query: ", - } - - transformer = models.Transformer(output_path) - pooling = models.Pooling(config.text_config.hidden_size, pooling_mode="mean") - normalize = models.Normalize() - linears = [] - - for linear_weight in st_linears: - out_size, in_size = linear_weight.shape[:2] - dense = models.Dense(in_size, out_size, bias=False, activation_function=None) - dense.linear.weight.data = torch.from_numpy(linear_weight.astype("float32")) - linears.append(dense) - - model = SentenceTransformer(modules=[transformer, pooling, *linears, normalize], prompts=task_prompts) - model = model.to(getattr(torch, _TRANSFORMER_DTYPE.value)) - model.save_pretrained(output_path) - - -if __name__ == "__main__": - app.run(main) diff --git a/src/transformers/models/gemma3/image_processing_gemma3_fast.py b/src/transformers/models/gemma3/image_processing_gemma3_fast.py index eb828a89643d..c61152bc6b22 100644 --- a/src/transformers/models/gemma3/image_processing_gemma3_fast.py +++ b/src/transformers/models/gemma3/image_processing_gemma3_fast.py @@ -19,6 +19,7 @@ from typing import Optional, Union import torch +from torchvision.transforms.v2 import functional as F from ...image_processing_utils_fast import ( BaseImageProcessorFast, @@ -32,16 +33,10 @@ from ...utils import ( TensorType, auto_docstring, - is_torchvision_v2_available, logging, ) -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - logger = logging.get_logger(__name__) diff --git a/src/transformers/models/gemma3/modeling_gemma3.py b/src/transformers/models/gemma3/modeling_gemma3.py index 7a91db1905f7..4536ec7f69f7 100644 --- a/src/transformers/models/gemma3/modeling_gemma3.py +++ b/src/transformers/models/gemma3/modeling_gemma3.py @@ -434,6 +434,9 @@ def _init_weights(self, module): super()._init_weights(module) if isinstance(module, Gemma3MultiModalProjector): module.mm_input_projection_weight.data.zero_() + # We initialize with 0s to be 1 centered as the RMSNorm here does (1 + weight) + elif "RMSNorm" in module.__class__.__name__: + module.weight.data.zero_() def _bidirectional_window_overlay(sliding_window: int) -> Callable[[int, int, int, int], bool]: diff --git a/src/transformers/models/gemma3/modular_gemma3.py b/src/transformers/models/gemma3/modular_gemma3.py index d10d01f55759..22a10f0c8dec 100644 --- a/src/transformers/models/gemma3/modular_gemma3.py +++ b/src/transformers/models/gemma3/modular_gemma3.py @@ -526,6 +526,9 @@ def _init_weights(self, module): PreTrainedModel._init_weights(self, module) if isinstance(module, Gemma3MultiModalProjector): module.mm_input_projection_weight.data.zero_() + # We initialize 
with 0s to be 1 centered as the RMSNorm here does (1 + weight) + elif "RMSNorm" in module.__class__.__name__: + module.weight.data.zero_() def _bidirectional_window_overlay(sliding_window: int) -> Callable[[int, int, int, int], bool]: @@ -1208,7 +1211,7 @@ class Gemma3TextForSequenceClassification(GenericForSequenceClassification, Gemm __all__ = [ "Gemma3Config", "Gemma3TextConfig", - "Gemma3PreTrainedModel", # noqa: F822 + "Gemma3PreTrainedModel", "Gemma3TextModel", "Gemma3ForCausalLM", "Gemma3ForConditionalGeneration", diff --git a/src/transformers/models/gemma3n/configuration_gemma3n.py b/src/transformers/models/gemma3n/configuration_gemma3n.py index 3502d2a423c9..47b5b47d3630 100644 --- a/src/transformers/models/gemma3n/configuration_gemma3n.py +++ b/src/transformers/models/gemma3n/configuration_gemma3n.py @@ -291,9 +291,7 @@ def __init__( if activation_sparsity_pattern is None: num_sparse_layers = 10 if num_hidden_layers > 10 else 0 - activation_sparsity_pattern = (0.95,) * num_sparse_layers + (0.0,) * ( - num_hidden_layers - num_sparse_layers - ) + activation_sparsity_pattern = [0.95] * num_sparse_layers + [0.0] * (num_hidden_layers - num_sparse_layers) if (len_asp := len(activation_sparsity_pattern)) != num_hidden_layers: raise ValueError( @@ -502,10 +500,10 @@ def __init__( **kwargs, ): super().__init__(**kwargs) + self.architecture = architecture self.initializer_range = initializer_range self.do_pooling = do_pooling self.model_args = model_args # named "model_args" for BC with timm - self.architecture = architecture self.hidden_size = hidden_size self.vocab_size = vocab_size self.vocab_offset = vocab_offset @@ -553,8 +551,8 @@ def from_dict(cls, config_dict: dict[str, Any], **kwargs): def to_dict(self) -> dict[str, Any]: output = super().to_dict() - output["num_classes"] = self.num_labels - output["label_names"] = list(self.id2label.values()) + output.setdefault("num_classes", self.num_labels) + output.setdefault("label_names", list(self.id2label.values())) output.pop("id2label", None) output.pop("label2id", None) return output diff --git a/src/transformers/models/gemma3n/convert_gemma3n_weights.py b/src/transformers/models/gemma3n/convert_gemma3n_weights.py deleted file mode 100644 index 6b77bbf766c1..000000000000 --- a/src/transformers/models/gemma3n/convert_gemma3n_weights.py +++ /dev/null @@ -1,809 +0,0 @@ -# coding=utf-8 -# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved. -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -r"""Utility to convert Gemma models from Orbax to HF Transformers checkpoint. 
- -python src/transformers/models/gemma3n/convert_gemma3n_weights.py \ - --variant='gemma3n_e4b' \ - --tokenizer_path="$HOME/tokenizers/gemma-3n-tokenizer.model" \ - --checkpoint_path="$HOME/checkpoints/gemma-3n-orbax/" \ - --output_path="$HOME/checkpoints/gemma-3n-safetensors/" -""" - -import json -import os -import re -from collections.abc import Iterable, Mapping -from typing import Any - -import accelerate -import numpy as np -import torch -import tree -from absl import app, flags, logging -from orbax import checkpoint as obc - -from transformers import ( - Gemma3nAudioConfig, - Gemma3nAudioFeatureExtractor, - Gemma3nConfig, - Gemma3nForConditionalGeneration, - Gemma3nProcessor, - Gemma3nTextConfig, - Gemma3nVisionConfig, - GemmaTokenizerFast, - GenerationConfig, - SiglipImageProcessorFast, -) -from transformers.image_utils import PILImageResampling - - -# ==== Internal Constants and Classes ==== - - -_CHAT_TEMPLATE = """{{ bos_token }} -{%- if messages[0]['role'] == 'system' -%} - {%- if messages[0]['content'] is string -%} - {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%} - {%- else -%} - {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%} - {%- endif -%} - {%- set loop_messages = messages[1:] -%} -{%- else -%} - {%- set first_user_prefix = "" -%} - {%- set loop_messages = messages -%} -{%- endif -%} -{%- for message in loop_messages -%} - {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%} - {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }} - {%- endif -%} - {%- if (message['role'] == 'assistant') -%} - {%- set role = "model" -%} - {%- else -%} - {%- set role = message['role'] -%} - {%- endif -%} - {{ '' + role + '\n' + (first_user_prefix if loop.first else "") }} - {%- if message['content'] is string -%} - {{ message['content'] | trim }} - {%- elif message['content'] is iterable -%} - {%- for item in message['content'] -%} - {%- if item['type'] == 'audio' -%} - {{ '' }} - {%- elif item['type'] == 'image' -%} - {{ '' }} - {%- elif item['type'] == 'text' -%} - {{ item['text'] | trim }} - {%- endif -%} - {%- endfor -%} - {%- else -%} - {{ raise_exception("Invalid content type") }} - {%- endif -%} - {{ '\n' }} -{%- endfor -%} -{%- if add_generation_prompt -%} - {{'model\n'}} -{%- endif -%} -""" - -_DTYPES = {"float32", "bfloat16", "float16"} - -_SLIDING_WINDOW_PATTERN = 5 - -_AUDIO_ENCODER_PARAMETER = "AudioEncoder/encoder" -_AUDIO_ENCODER_CONFORMER = f"{_AUDIO_ENCODER_PARAMETER}/conformer/stacked_layers" -_AUDIO_ENCODER_SSCP = f"{_AUDIO_ENCODER_PARAMETER}/feature" - -_TRANSFORMER_PARAMETER = "transformer" -_TRANSFORMER_ALTUP_PROJ = f"{_TRANSFORMER_PARAMETER}/altup_projection_" -_TRANSFORMER_ALTUP_UNEMB = f"{_TRANSFORMER_PARAMETER}/altup_unembed_projection_" -_TRANSFORMER_DECODER_BLOCK = f"{_TRANSFORMER_PARAMETER}/stacked_layers/attention_type_" -_TRANSFORMER_DECODER_BLOCK_LEN = len(_TRANSFORMER_DECODER_BLOCK) -_TRANSFORMER_EMBEDDER = f"{_TRANSFORMER_PARAMETER}/embedder" -_TRANSFORMER_FINAL_NORM = "transformer/final_norm" -_TRANSFORMER_POST_TRAINING_PREFIX = "rlx_networks/policy_network/" -_TRANSFORMER_POST_TRAINING_PREFIX_LEN = len(_TRANSFORMER_POST_TRAINING_PREFIX) - -# _MOBILE_NET_CONFIG = Gemma3nVisionConfig.from_pretrained("") - -_MOBILE_NET_PREFIX = "mobilenet" -_MOBILE_NET_TIMM_SUMMED_BLOCK_SIZES = [3, 8, 45, 84] -_MOBILE_NET_CONV = "block_group_conv2d_" -_MOBILE_NET_FIB = "block_group_fused_ib_" -_MOBILE_NET_MQA = "block_group_mmqa_" -_MOBILE_NET_MSFA = "block_adapter_" 
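The stacked_layers/attention_type_ constants defined here come into play later in convert_transformer_weights: each Orbax array stacks every fifth decoder layer of one attention type, and the converter re-interleaves them via layer_idx = _SLIDING_WINDOW_PATTERN * i + attention_type_index. A tiny sketch of that indexing, assuming 30 layers and the 5-layer pattern as in the e2b config:

SLIDING_WINDOW_PATTERN = 5
num_hidden_layers = 30

for attention_type_index in range(SLIDING_WINDOW_PATTERN):
    # stand-in for iterating over the stacked weight array of this attention type
    for i in range(num_hidden_layers // SLIDING_WINDOW_PATTERN):
        layer_idx = SLIDING_WINDOW_PATTERN * i + attention_type_index
        # type 0 fills layers 0, 5, 10, ...; type 1 fills 1, 6, 11, ...; and so on.
        assert 0 <= layer_idx < num_hidden_layers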
-_MOBILE_NET_UIB = "block_group_uib_" -_MOBILE_NET_UIB_HAS_DW_START = { - (1, 0), - (1, 1), - (1, 2), - (1, 3), - (1, 4), - (2, 0), - (2, 1), - (2, 2), - (2, 3), - (2, 4), - (2, 5), - (2, 6), - (2, 7), - (3, 0), -} -_MOBILE_NET_UIB_HAS_DW_MID = { - (1, 0), - (2, 0), - (3, 0), -} - -_VARIANT_GEMMA_3_2B = "gemma3n_e2b" -_VARIANT_GEMMA_3_4B = "gemma3n_e4b" -_VARIANTS: Mapping[str, Gemma3nConfig] = { - _VARIANT_GEMMA_3_2B: Gemma3nConfig( - text_config=Gemma3nTextConfig( - intermediate_size=2048 * 4, - num_hidden_layers=30, - activation_sparsity_pattern=(0.95,) * 10 + (0.0,) * 20, - num_kv_shared_layers=10, - ), - vision_config=Gemma3nVisionConfig(), - audio_config=Gemma3nAudioConfig(), - ), - _VARIANT_GEMMA_3_4B: Gemma3nConfig( - text_config=Gemma3nTextConfig(), - vision_config=Gemma3nVisionConfig(), - audio_config=Gemma3nAudioConfig(), - ), -} - - -# ==== Flags ==== - -_AUDIO_DTYPE = flags.DEFINE_enum( - name="audio_dtype", - default="bfloat16", - help="The floating point precision (aka dtype) of the model.", - enum_values=_DTYPES, -) - -_CHECKPOINT_PATH = flags.DEFINE_string( - name="checkpoint_path", - default=None, - help="Path to the Orbax checkpoint.", - required=True, -) - -_INCLUDE_CHAT_TEMPLATE = flags.DEFINE_bool( - name="include_chat_template", default=False, help="If true, will save the default chat template with the tokenizer" -) - -_OUTPUT_PATH = flags.DEFINE_string( - name="output_path", - default=None, - help="Path to store the HF checkpoint.", - required=True, -) - -_TRANSFORMER_DTYPE = flags.DEFINE_enum( - name="text_dtype", - default="bfloat16", - help="The floating point precision (aka dtype) of the model.", - enum_values=_DTYPES, -) - -_TOKENIZER_PATH = flags.DEFINE_string( - name="tokenizer_path", - default=None, - help="Path to the SentencePiece model file.", - required=True, -) - -_VARIANT = flags.DEFINE_enum( - name="variant", - default=_VARIANT_GEMMA_3_4B, - help="The model variant to convert.", - enum_values=set(_VARIANTS.keys()), -) - -_VERBOSE = flags.DEFINE_bool( - name="verbose", - default=False, - help="If true, log the path, shape, and dtype of every converted layer.", -) - -_VISION_DTYPE = flags.DEFINE_enum( - name="vision_dtype", - default="bfloat16", - help="The floating point precision (aka dtype) of the model.", - enum_values=_DTYPES, -) - - -def convert_audio_encoder_weights( - config: Gemma3nAudioConfig, - path: str, - param: str, - weights: np.ndarray, -) -> Iterable[tuple[str, np.ndarray]]: - converted_paths: list[str] = [] - converted_weights: list[Any] = [] - - if path.startswith(_AUDIO_ENCODER_CONFORMER): - assert weights.shape[0] == config.conf_num_hidden_layers - - for i, matrix in enumerate(weights): - if "fflayer_end" in path: - base = f"conformer.{i}.ffw_layer_end" - - if path.endswith("ffn_layer1"): - converted_paths.append(f"{base}.ffw_layer_1.weight") - converted_weights.append(matrix.transpose()) - elif path.endswith("ffn_layer2"): - converted_paths.append(f"{base}.ffw_layer_2.weight") - converted_weights.append(matrix.transpose()) - elif path.endswith("post_layer_norm"): - converted_paths.append(f"{base}.post_layer_norm.weight") - converted_weights.append(matrix) - elif path.endswith("pre_layer_norm"): - converted_paths.append(f"{base}.pre_layer_norm.weight") - converted_weights.append(matrix) - elif "fflayer_start" in path: - base = f"conformer.{i}.ffw_layer_start" - - if path.endswith("ffn_layer1"): - converted_paths.append(f"{base}.ffw_layer_1.weight") - converted_weights.append(matrix.transpose()) - elif path.endswith("ffn_layer2"): - 
converted_paths.append(f"{base}.ffw_layer_2.weight") - converted_weights.append(matrix.transpose()) - elif path.endswith("post_layer_norm"): - converted_paths.append(f"{base}.post_layer_norm.weight") - converted_weights.append(matrix) - elif path.endswith("pre_layer_norm"): - converted_paths.append(f"{base}.pre_layer_norm.weight") - converted_weights.append(matrix) - elif path.endswith("final_ln"): - converted_paths.append(f"conformer.{i}.norm.weight") - converted_weights.append(matrix) - elif "lconv" in path: - base = f"conformer.{i}.lconv1d" - - if path.endswith("conv_norm"): - converted_paths.append(f"{base}.conv_norm.weight") - converted_weights.append(matrix) - elif path.endswith("depthwise_conv1d"): - converted_paths.append(f"{base}.depthwise_conv1d.weight") - converted_weights.append(matrix.transpose()) - elif path.endswith("linear_end"): - converted_paths.append(f"{base}.linear_end.weight") - converted_weights.append(matrix.transpose()) - elif path.endswith("linear_start"): - converted_paths.append(f"{base}.linear_start.weight") - converted_weights.append(matrix.transpose()) - elif path.endswith("ln"): - converted_paths.append(f"{base}.pre_layer_norm.weight") - converted_weights.append(matrix) - elif "trans_atten" in path: - base = f"conformer.{i}.attention" - - if param == "per_dim_scale": - converted_paths.append(f"{base}.attn.per_dim_scale") - converted_weights.append(matrix) - - if path.endswith("query_key_value_projection"): - converted_paths.extend( - [f"{base}.attn.q_proj.weight", f"{base}.attn.k_proj.weight", f"{base}.attn.v_proj.weight"] - ) - converted_weights.extend( - [ - m.reshape(config.hidden_size, config.hidden_size).transpose() - for m in matrix.transpose(1, 0, 2, 3) - ] - ) - elif path.endswith("pos_proj"): - converted_paths.append(f"{base}.attn.relative_position_embedding.pos_proj.weight") - converted_weights.append(matrix.reshape(config.hidden_size, config.hidden_size).transpose()) - elif path.endswith("post"): - converted_paths.append(f"{base}.post.weight") - converted_weights.append(matrix.transpose(2, 0, 1).reshape(config.hidden_size, config.hidden_size)) - elif path.endswith("post_norm"): - converted_paths.append(f"{base}.post_norm.weight") - converted_weights.append(matrix) - elif path.endswith("pre_norm"): - converted_paths.append(f"{base}.pre_attn_norm.weight") - converted_weights.append(matrix) - elif path.startswith(_AUDIO_ENCODER_SSCP): - if path.endswith("input_proj"): - converted_paths.append("subsample_conv_projection.input_proj_linear.weight") - converted_weights.append( - weights.transpose(2, 0, 1).reshape(config.hidden_size, config.sscp_conv_channel_size[1] ** 2) - ) - elif "norm_" in path: - index = int(path[-1]) - converted_paths.append(f"subsample_conv_projection.conv_{index}.norm.weight") - converted_weights.append(weights) - elif "subsampling_" in path: - index = int(path[-1]) - converted_paths.append(f"subsample_conv_projection.conv_{index}.conv.weight") - converted_weights.append(weights.transpose(3, 2, 0, 1)) - - if (cpl := len(converted_paths)) != (cwl := len(converted_weights)): - raise ValueError( - "The `converted_paths` and `converted_weights` should be the same " - f"length. Got {cpl} and {cwl}, respectively, for {path}." 
- ) - - return zip(converted_paths, converted_weights) - - -def convert_transformer_weights( - config: Gemma3nTextConfig, - path: str, - param: str, - weights: np.ndarray, -) -> Iterable[tuple[str, np.ndarray]]: - if path.startswith(_TRANSFORMER_POST_TRAINING_PREFIX): - path = path[_TRANSFORMER_POST_TRAINING_PREFIX_LEN:] - - converted_paths: list[str] = [] - converted_weights: list[Any] = [] - - if path.startswith(_TRANSFORMER_ALTUP_PROJ): - index = int(path[-1]) - converted_paths.append(f"altup_projections.{index}.weight") - converted_weights.append(weights.transpose()) - elif path.startswith(_TRANSFORMER_ALTUP_UNEMB): - index = int(path[-1]) - converted_paths.append(f"altup_unembed_projections.{index}.weight") - converted_weights.append(weights.transpose()) - elif path.startswith(_TRANSFORMER_DECODER_BLOCK): - attention_type_index = int(path[_TRANSFORMER_DECODER_BLOCK_LEN]) - assert weights.shape[0] == config.num_hidden_layers / _SLIDING_WINDOW_PATTERN - - for i, matrix in enumerate(weights): - layer_idx = _SLIDING_WINDOW_PATTERN * i + attention_type_index - base_path = f"layers.{layer_idx}" - - if "altup" in path: - altup_path = f"{base_path}.altup" - - if param == "correct_output_scale": - converted_paths.append(f"{altup_path}.correct_output_scale") - converted_weights.append(matrix) - elif param == "correction_coefs": - converted_paths.append(f"{altup_path}.correction_coefs.weight") - converted_weights.append(matrix.transpose()) - elif param == "prediction_coefs": - converted_paths.append(f"{altup_path}.prediction_coefs.weight") - converted_weights.append( - np.clip( - matrix.reshape(config.altup_num_inputs, config.altup_num_inputs**2).transpose(), - -config.altup_coef_clip, - config.altup_coef_clip, - ) - ) - - if path.endswith("modality_router"): - converted_paths.append(f"{altup_path}.modality_router.weight") - converted_weights.append(matrix.transpose()) - elif path.endswith("router_norm_layer"): - converted_paths.append(f"{altup_path}.router_norm.weight") - converted_weights.append(matrix) - elif path.endswith("attn/attn_vec_einsum"): - converted_paths.append(f"{base_path}.self_attn.o_proj.weight") - converted_weights.append( - matrix.transpose(2, 0, 1).reshape(config.hidden_size, config.num_attention_heads * config.head_dim) - ) - elif path.endswith("attn/kv_einsum"): - converted_paths.extend( - [ - f"{base_path}.self_attn.k_proj.weight", - f"{base_path}.self_attn.v_proj.weight", - ] - ) - k_proj_weights, v_proj_weights = matrix.transpose(0, 2, 1, 3) - kv_proj_shape = (config.hidden_size, config.num_key_value_heads * config.head_dim) - converted_weights.extend( - [ - k_proj_weights.reshape(kv_proj_shape).transpose(), - v_proj_weights.reshape(kv_proj_shape).transpose(), - ] - ) - elif path.endswith("attn/q_einsum"): - converted_paths.append(f"{base_path}.self_attn.q_proj.weight") - converted_weights.append( - matrix.transpose(1, 0, 2) - .reshape(config.hidden_size, config.num_attention_heads * config.head_dim) - .transpose() - ) - elif path.endswith("attn/query_norm"): - converted_paths.append(f"{base_path}.self_attn.q_norm.weight") - converted_weights.append(matrix) - elif path.endswith("attn/key_norm"): - converted_paths.append(f"{base_path}.self_attn.k_norm.weight") - converted_weights.append(matrix) - elif path.endswith("laurel_block/linear_left"): - converted_paths.append(f"{base_path}.laurel.linear_left.weight") - converted_weights.append(matrix.transpose()) - elif path.endswith("laurel_block/linear_right"): - 
converted_paths.append(f"{base_path}.laurel.linear_right.weight") - converted_weights.append(matrix.transpose()) - elif path.endswith("mlp/gating_einsum"): - converted_paths.extend([f"{base_path}.mlp.gate_proj.weight", f"{base_path}.mlp.up_proj.weight"]) - gate_proj_weight, up_proj_weight = matrix - converted_weights.extend([gate_proj_weight, up_proj_weight]) - elif path.endswith("mlp/linear"): - converted_paths.append(f"{base_path}.mlp.down_proj.weight") - converted_weights.append(matrix.transpose()) - elif path.endswith("per_layer_input_gate"): - converted_paths.append(f"{base_path}.per_layer_input_gate.weight") - converted_weights.append(matrix.transpose()) - elif path.endswith("per_layer_projection"): - converted_paths.append(f"{base_path}.per_layer_projection.weight") - converted_weights.append(matrix.transpose()) - elif path.endswith("post_attention_norm"): - converted_paths.append(f"{base_path}.post_attention_layernorm.weight") - converted_weights.append(matrix) - elif path.endswith("post_ffw_norm"): - converted_paths.append(f"{base_path}.post_feedforward_layernorm.weight") - converted_weights.append(matrix) - elif path.endswith("post_laurel_norm"): - converted_paths.append(f"{base_path}.laurel.post_laurel_norm.weight") - converted_weights.append(matrix) - elif path.endswith("post_per_layer_input_norm"): - converted_paths.append(f"{base_path}.post_per_layer_input_norm.weight") - converted_weights.append(matrix) - elif path.endswith("pre_attention_norm"): - converted_paths.append(f"{base_path}.input_layernorm.weight") - converted_weights.append(matrix) - elif path.endswith("pre_ffw_norm"): - converted_paths.append(f"{base_path}.pre_feedforward_layernorm.weight") - converted_weights.append(matrix) - elif path == _TRANSFORMER_EMBEDDER: - if param == "input_embedding": - converted_paths.append("embed_tokens.weight") - # Gemma 3n model doesn't have soft tokens or "end of" tokens for images and audio in its input and output - # embeddings, so we resize to avoid bugs observed with Mllama - pre_expansion_embeddings = weights - pad_token_slice = slice(config.pad_token_id, config.pad_token_id + 1) - new_embeddings = np.repeat(pre_expansion_embeddings[pad_token_slice], 256, axis=0) - weights = np.vstack([pre_expansion_embeddings, new_embeddings]) - converted_weights.append(weights) - elif param == "per_layer_embeddings": - converted_paths.append("embed_tokens_per_layer.weight") - converted_weights.append( - weights.reshape( - config.vocab_size_per_layer_input, config.num_hidden_layers * config.hidden_size_per_layer_input - ) - ) - elif path.startswith(_TRANSFORMER_EMBEDDER): - # TODO: ryanmullins - support multimodal norms and projections - if path.endswith("per_layer_model_projection"): - converted_paths.append("per_layer_model_projection.weight") - converted_weights.append( - weights.reshape( - config.hidden_size, config.num_hidden_layers * config.hidden_size_per_layer_input - ).transpose() - ) - elif path.endswith("per_layer_projection_norm"): - converted_paths.append("per_layer_projection_norm.weight") - converted_weights.append(weights) - elif path == _TRANSFORMER_FINAL_NORM: - converted_paths = ["norm.weight"] - converted_weights = [weights] - - if (cpl := len(converted_paths)) != (cwl := len(converted_weights)): - raise ValueError( - "The `converted_paths` and `converted_weights` should be the same " - f"length. Got {cpl} and {cwl}, respectively, for {path}." 
- ) - - return zip(converted_paths, converted_weights) - - -def convert_vision_weights( - config: Gemma3nVisionConfig, - path: str, - param: str, - weights: np.ndarray, -) -> Iterable[tuple[str, np.ndarray]]: - def generate_base_path(path: str, block_type: str) -> tuple[str, tuple[int, int]]: - re_str = rf"{block_type}(\d+)/" - re_pattern = re.compile(re_str) - match = re.search(re_pattern, path).group(1) - idx = abs(int(match)) - 1 - - for block_idx, v in enumerate(_MOBILE_NET_TIMM_SUMMED_BLOCK_SIZES): - if v > idx: - offset = _MOBILE_NET_TIMM_SUMMED_BLOCK_SIZES[block_idx - 1] if block_idx > 0 else 0 - layer_idx = idx - offset - return f"blocks.{block_idx}.{layer_idx}", (block_idx, layer_idx) - - raise ValueError(f"could not extract a base path from {path}") - - if _MOBILE_NET_MSFA in path: - converted_path = "msfa" - - if "ffn/Normalize_0" in path: - converted_path += ".ffn.pw_exp.bn.weight" - converted_weight = weights - elif "ffn/Normalize_1" in path: - converted_path += ".ffn.pw_proj.bn.weight" - converted_weight = weights - elif "ffn/expand" in path: - converted_path += ".ffn.pw_exp.conv.weight" - converted_weight = weights.transpose()[:, :, None, None] - elif "ffn/project" in path: - converted_path += ".ffn.pw_proj.conv.weight" - converted_weight = weights.transpose()[:, :, None, None] - elif "Normalize_0" in path: - converted_path += ".norm.weight" - converted_weight = weights - elif _MOBILE_NET_CONV in path: - if "Conv_0" in path: - converted_path = ("conv_stem.conv.weight", "conv_stem.conv.bias") - converted_weight = weights.transpose(3, 2, 0, 1) - converted_weight = (converted_weight, np.zeros(converted_weight.shape[0])) - elif "Normalize_0" in path: - converted_path = "conv_stem.bn.weight" - converted_weight = weights - elif _MOBILE_NET_FIB in path: - converted_path, _ = generate_base_path(path, _MOBILE_NET_FIB) - if "Normalize_0" in path: - converted_path += ".bn1.weight" - converted_weight = weights - elif "Normalize_1" in path: - converted_path += ".bn2.weight" - converted_weight = weights - elif "expand_conv" in path: - converted_path += ".conv_exp.weight" - converted_weight = weights.transpose(3, 2, 0, 1) - else: - converted_path += ".conv_pwl.weight" - converted_weight = weights.transpose()[:, :, None, None] - elif _MOBILE_NET_MQA in path: - converted_path, _ = generate_base_path(path, _MOBILE_NET_MQA) - - if "LayerScale_0" in path: - converted_path += ".layer_scale.gamma" - converted_weight = weights - elif "Normalize_0" in path: - converted_path += ".norm.weight" - converted_weight = weights - elif "Normalize_1" in path: - converted_path += ".attn.key.norm.weight" - converted_weight = weights - elif "Normalize_2" in path: - converted_path += ".attn.value.norm.weight" - converted_weight = weights - elif "key_dwconv" in path: - converted_path += ".attn.key.down_conv.weight" - converted_weight = weights.transpose(3, 2, 0, 1) - elif "key_proj" in path: - converted_path += ".attn.key.proj.weight" - converted_weight = weights.transpose()[:, :, None, None] - elif "output_proj" in path: - converted_path += ".attn.output.proj.weight" - converted_weight = weights.transpose()[:, :, None, None] - elif "query_proj" in path: - converted_path += ".attn.query.proj.weight" - converted_weight = weights.transpose()[:, :, None, None] - elif "value_dwconv" in path: - converted_path += ".attn.value.down_conv.weight" - converted_weight = weights.transpose(3, 2, 0, 1) - elif "value_proj" in path: - converted_path += ".attn.value.proj.weight" - converted_weight = weights.transpose()[:, :, None, 
None] - elif _MOBILE_NET_UIB in path: - converted_path, idx_key = generate_base_path(path, _MOBILE_NET_UIB) - - has_dw_start = idx_key in _MOBILE_NET_UIB_HAS_DW_START - has_dw_mid = idx_key in _MOBILE_NET_UIB_HAS_DW_MID - - if "LayerScale_0" in path: - converted_path += ".layer_scale.gamma" - converted_weight = weights - elif "Normalize_0" in path: - converted_path += ".dw_start.bn.weight" if has_dw_start else ".pw_exp.bn.weight" - converted_weight = weights - elif "Normalize_1" in path: - converted_path += ".pw_exp.bn.weight" if has_dw_start else ".pw_proj.bn.weight" - converted_weight = weights - elif "Normalize_2" in path: - converted_path += ".dw_mid.bn.weight" if has_dw_mid else ".pw_proj.bn.weight" - converted_weight = weights - elif "Normalize_3" in path: - converted_path += ".pw_proj.bn.weight" - converted_weight = weights - elif "expand" in path: - converted_path += ".pw_exp.conv.weight" - converted_weight = weights.transpose()[:, :, None, None] - elif "middle_dwconv" in path: - converted_path += ".dw_mid.conv.weight" - converted_weight = weights.transpose(3, 2, 0, 1) - elif "project" in path: - converted_path += ".pw_proj.conv.weight" - converted_weight = weights.transpose()[:, :, None, None] - elif "start_dwconv" in path: - converted_path += ".dw_start.conv.weight" - converted_weight = weights.transpose(3, 2, 0, 1) - - if isinstance(converted_path, (tuple, list)): - return zip(converted_path, converted_weight) - else: - return [(converted_path, converted_weight)] - - -def convert(checkpoint_path: str, config: Gemma3nConfig) -> dict[str, torch.Tensor]: - """Loads Orbax checkpoint from `input_path` and converts it to HF tree.""" - checkpointer = obc.PyTreeCheckpointer() - ckpt = checkpointer.restore(checkpoint_path) - hf_tree: dict[str, torch.Tensor] = {} - - def update_tree(path: str, weights: np.ndarray, target_dtype: torch.dtype) -> None: - hf_tree[path] = torch.from_numpy(weights.astype("float32")).type(target_dtype) - if _VERBOSE.value: - logging.info( - "%s converted shape=%s with dtype=%s", - path, - weights.shape, - target_dtype, - ) - - for (path, param), value in tree.flatten_with_path(ckpt): - if param == "audio_input_embedding_extra": - update_tree("model.embed_audio.embedding.weight", value, config.audio_config.dtype) - elif path.endswith("audio_embedding_norm"): - update_tree("model.embed_audio.hard_embedding_norm.weight", value, config.audio_config.dtype) - elif path.endswith("audio_input_projection"): - update_tree("model.embed_audio.embedding_projection.weight", value.transpose(), config.audio_config.dtype) - elif path.endswith("audio_soft_embedding_norm"): - update_tree("model.embed_audio.soft_embedding_norm.weight", value, config.audio_config.dtype) - elif param == "mm_input_embedding_extra": - update_tree("model.embed_vision.embedding.weight", value, config.vision_config.dtype) - elif path.endswith("mm_hard_embedding_norm"): - update_tree("model.embed_vision.hard_embedding_norm.weight", value, config.vision_config.dtype) - elif path.endswith("mm_input_projection"): - update_tree( - "model.embed_vision.embedding_projection.weight", value.transpose(), config.vision_config.dtype - ) - elif path.endswith("mm_soft_embedding_norm"): - update_tree("model.embed_vision.soft_embedding_norm.weight", value, config.vision_config.dtype) - elif path.startswith(_TRANSFORMER_PARAMETER): - for path, weights in convert_transformer_weights(config.text_config, path, param, value): - update_tree(f"model.language_model.{path}", weights, config.text_config.dtype) - elif 
_MOBILE_NET_PREFIX in path: - mobilenet_prefix_idx = path.index(_MOBILE_NET_PREFIX) - path = path[mobilenet_prefix_idx:] - for path, weights in convert_vision_weights(config.vision_config, path, param, value): - update_tree(f"model.vision_tower.timm_model.{path}", weights, config.vision_config.dtype) - elif path.startswith(_AUDIO_ENCODER_PARAMETER): - for path, weights in convert_audio_encoder_weights(config.audio_config, path, param, value): - update_tree(f"model.audio_tower.{path}", weights, config.audio_config.dtype) - - hf_tree["lm_head.weight"] = hf_tree["model.language_model.embed_tokens.weight"] - - return hf_tree - - -def main(*args): - del args - - output_path = _OUTPUT_PATH.value - variant = _VARIANT.value - - config = _VARIANTS[variant] - config.audio_config.dtype = getattr(torch, _AUDIO_DTYPE.value) - config.text_config.dtype = getattr(torch, _TRANSFORMER_DTYPE.value) - config.vision_config.dtype = getattr(torch, _VISION_DTYPE.value) - if _INCLUDE_CHAT_TEMPLATE.value: - # Chat template is included for instruction tuned models, which treat - # both "" and "" as generation stoppers. - config.eos_token_id = [1, 106] - - logging.info( - "Converting Gemma 3 (%s) @ %s (language) and %s (vision)", - variant, - _TRANSFORMER_DTYPE.value, - _VISION_DTYPE.value, - ) - state_tree = convert(_CHECKPOINT_PATH.value, config) - logging.info("Converted Gemma 3 (%s) state tree from Orbax to Hugging Face.", variant) - - with accelerate.init_empty_weights(): - model = Gemma3nForConditionalGeneration(config=config) - - model.load_state_dict(state_tree, assign=True, strict=True) - logging.info( - "Loaded Gemma 3 (%s) in Hugging Face Transformers as a %s instance.", - variant, - type(model).__name__, - ) - model.save_pretrained(output_path, state_dict=state_tree, safe_serialization=True) - logging.info( - "Saved Gemma 3 (%s) to SafeTensors in %s using %s", - variant, - output_path, - type(model).__name__, - ) - del model - del state_tree - - chat_template_kwargs = {"chat_template": _CHAT_TEMPLATE} if _INCLUDE_CHAT_TEMPLATE.value else {} - - tokenizer = GemmaTokenizerFast( - _TOKENIZER_PATH.value, - add_bos_token=True, - extra_special_tokens={ - "image_token": "", # Should be ID=262_145 - "boi_token": "", # Should be ID=255_999 - "eoi_token": "", # Should be ID=262_144 - "audio_token": "", # Should be ID=262_273 - "boa_token": "", # Should be ID=256_000 - "eoa_token": "", # Should be ID=262_272 - }, - **chat_template_kwargs, - ) - tokenizer.save_pretrained(output_path) - logging.info("Saved GemmaTokenizer for %s to %s", variant, output_path) - - feature_extractor = Gemma3nAudioFeatureExtractor() - image_processor = SiglipImageProcessorFast( - image_seq_length=256, - image_mean=(0.5,) * 3, - image_std=(0.5,) * 3, - size={"height": 768, "width": 768}, - resample=PILImageResampling.BILINEAR, - do_normalize=False, - ) - processor = Gemma3nProcessor( - feature_extractor=feature_extractor, - image_processor=image_processor, - tokenizer=tokenizer, - **chat_template_kwargs, - ) - processor.save_pretrained(output_path) - - logging.info("Saved Gemma3nProcessor for %s to %s", variant, output_path) - - # NOTE: feature_extractor and image_processor both use the same filename, preprocessor_config.json, when saved to - # disk, but the files are overwritten by processor.save_pretrained(). However, the configs can be unioned, saved, - # and loaded from the same preprocessor_config.json file, so we do that explicitly here. 
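The note above about unioning the two preprocessor configs comes down to ordinary dict-merge semantics: when the same key appears in both, the value unpacked last (the image processor's) wins. The deleted script's own code just below does exactly this with the real configs; the toy dicts here only spell out the merge behaviour and are made up:

import json

feature_extractor_config = {"feature_size": 128, "padding_value": 0.0}      # made-up values
image_processor_config = {"image_mean": [0.5, 0.5, 0.5], "do_normalize": False}

# Later unpacking wins on duplicate keys, so image-processor settings take precedence.
preprocessor_config = {**feature_extractor_config, **image_processor_config}
print(json.dumps(preprocessor_config, indent=2, sort_keys=True))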
- feature_extractor_config = json.loads(feature_extractor.to_json_string()) - image_processor_config = json.loads(image_processor.to_json_string()) - preprocessor_config = {**feature_extractor_config, **image_processor_config} - with open(os.path.join(output_path, "preprocessor_config.json"), "w", encoding="utf-8") as writer: - writer.write(json.dumps(preprocessor_config, indent=2, sort_keys=True) + "\n") - - logging.info("Saved joint preprocessor_config.json for %s to %s", variant, output_path) - - del feature_extractor, image_processor, processor, tokenizer - - generation_config = GenerationConfig( - pad_token_id=config.text_config.pad_token_id, - bos_token_id=config.text_config.bos_token_id, - eos_token_id=( - [config.text_config.eos_token_id, 106] if _INCLUDE_CHAT_TEMPLATE.value else config.text_config.eos_token_id - ), - cache_implementation="hybrid", - temperature=1.0, - do_sample=True, - top_k=64, - top_p=0.95, - ) - generation_config.save_pretrained(output_path) - - -if __name__ == "__main__": - app.run(main) diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py index 48de2bb27f7f..7ea50b7572cf 100644 --- a/src/transformers/models/gemma3n/modular_gemma3n.py +++ b/src/transformers/models/gemma3n/modular_gemma3n.py @@ -304,9 +304,7 @@ def __init__( if activation_sparsity_pattern is None: num_sparse_layers = 10 if num_hidden_layers > 10 else 0 - activation_sparsity_pattern = (0.95,) * num_sparse_layers + (0.0,) * ( - num_hidden_layers - num_sparse_layers - ) + activation_sparsity_pattern = [0.95] * num_sparse_layers + [0.0] * (num_hidden_layers - num_sparse_layers) if (len_asp := len(activation_sparsity_pattern)) != num_hidden_layers: raise ValueError( @@ -2679,7 +2677,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position(self, **super_kwargs): "Gemma3nForCausalLM", "Gemma3nForConditionalGeneration", "Gemma3nModel", - "Gemma3nPreTrainedModel", # noqa: F822 + "Gemma3nPreTrainedModel", "Gemma3nTextConfig", "Gemma3nTextModel", "Gemma3nVisionConfig", diff --git a/src/transformers/models/git/convert_git_to_pytorch.py b/src/transformers/models/git/convert_git_to_pytorch.py deleted file mode 100644 index 34dc58299bc7..000000000000 --- a/src/transformers/models/git/convert_git_to_pytorch.py +++ /dev/null @@ -1,448 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert GIT checkpoints from the original repository. 
- -URL: https://github.com/microsoft/GenerativeImage2Text/tree/main""" - -import argparse -from pathlib import Path - -import av -import numpy as np -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor - -from transformers import ( - AutoTokenizer, - CLIPImageProcessor, - GitConfig, - GitForCausalLM, - GitProcessor, - GitVisionConfig, - VideoMAEImageProcessor, -) -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_git_config(model_name): - if "base" in model_name and "vqa" in model_name: - image_size = 480 - elif "large" in model_name and "vqa" in model_name: - image_size = 420 - else: - image_size = 224 - - vision_config = GitVisionConfig(image_size=image_size) - - if "large" in model_name: - vision_config.patch_size = 14 - vision_config.hidden_size = 1024 - vision_config.intermediate_size = 4096 - vision_config.num_hidden_layers = 24 - vision_config.num_attention_heads = 16 - - is_video = "vatex" in model_name or "msrvtt" in model_name - num_image_with_embedding = 6 if is_video else None - config = GitConfig(vision_config=vision_config.to_dict(), num_image_with_embedding=num_image_with_embedding) - - return config, image_size, is_video - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config, prefix=""): - rename_keys = [] - - # image encoder - # ftm: off - rename_keys.append( - (f"{prefix}image_encoder.class_embedding", "git.image_encoder.vision_model.embeddings.class_embedding") - ) - rename_keys.append( - ( - f"{prefix}image_encoder.positional_embedding", - "git.image_encoder.vision_model.embeddings.position_embedding.weight", - ) - ) - rename_keys.append( - (f"{prefix}image_encoder.conv1.weight", "git.image_encoder.vision_model.embeddings.patch_embedding.weight") - ) - rename_keys.append((f"{prefix}image_encoder.ln_pre.weight", "git.image_encoder.vision_model.pre_layrnorm.weight")) - rename_keys.append((f"{prefix}image_encoder.ln_pre.bias", "git.image_encoder.vision_model.pre_layrnorm.bias")) - rename_keys.append( - (f"{prefix}image_encoder.ln_post.weight", "git.image_encoder.vision_model.post_layernorm.weight") - ) - rename_keys.append((f"{prefix}image_encoder.ln_post.bias", "git.image_encoder.vision_model.post_layernorm.bias")) - # fmt: on - rename_keys.append((f"{prefix}image_encoder.proj", "git.image_encoder.visual_projection.weight")) - - # fmt: off - for i in range(config.vision_config.num_hidden_layers): - # image encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.attn.out_proj.weight", f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.out_proj.weight")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.attn.out_proj.bias", f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.out_proj.bias")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.ln_1.weight", f"git.image_encoder.vision_model.encoder.layers.{i}.layer_norm1.weight")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.ln_1.bias", f"git.image_encoder.vision_model.encoder.layers.{i}.layer_norm1.bias")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.mlp.c_fc.weight", f"git.image_encoder.vision_model.encoder.layers.{i}.mlp.fc1.weight")) - 
rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.mlp.c_fc.bias", f"git.image_encoder.vision_model.encoder.layers.{i}.mlp.fc1.bias")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.mlp.c_proj.weight", f"git.image_encoder.vision_model.encoder.layers.{i}.mlp.fc2.weight")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.mlp.c_proj.bias", f"git.image_encoder.vision_model.encoder.layers.{i}.mlp.fc2.bias")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.ln_2.weight", f"git.image_encoder.vision_model.encoder.layers.{i}.layer_norm2.weight")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.ln_2.bias", f"git.image_encoder.vision_model.encoder.layers.{i}.layer_norm2.bias")) - # fmt: on - - # text decoder - # fmt: off - rename_keys.append((f"{prefix}textual.embedding.words.weight", "git.embeddings.word_embeddings.weight")) - rename_keys.append((f"{prefix}textual.embedding.positions.weight", "git.embeddings.position_embeddings.weight")) - rename_keys.append((f"{prefix}textual.visual_projection.0.weight", "git.visual_projection.visual_projection.0.weight")) - rename_keys.append((f"{prefix}textual.visual_projection.0.bias", "git.visual_projection.visual_projection.0.bias")) - rename_keys.append((f"{prefix}textual.visual_projection.1.weight", "git.visual_projection.visual_projection.1.weight")) - rename_keys.append((f"{prefix}textual.visual_projection.1.bias", "git.visual_projection.visual_projection.1.bias")) - - rename_keys.append((f"{prefix}textual.embedding.layer_norm.weight", "git.embeddings.LayerNorm.weight")) - rename_keys.append((f"{prefix}textual.embedding.layer_norm.bias", "git.embeddings.LayerNorm.bias")) - rename_keys.append((f"{prefix}textual.output.weight", "output.weight")) - rename_keys.append((f"{prefix}textual.output.bias", "output.bias")) - for i in range(config.num_hidden_layers): - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.query.weight", f"git.encoder.layer.{i}.attention.self.query.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.query.bias", f"git.encoder.layer.{i}.attention.self.query.bias")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.key.weight", f"git.encoder.layer.{i}.attention.self.key.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.key.bias", f"git.encoder.layer.{i}.attention.self.key.bias")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.value.weight", f"git.encoder.layer.{i}.attention.self.value.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.value.bias", f"git.encoder.layer.{i}.attention.self.value.bias")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.output.dense.weight", f"git.encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.output.dense.bias", f"git.encoder.layer.{i}.attention.output.dense.bias")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.output.LayerNorm.weight", f"git.encoder.layer.{i}.attention.output.LayerNorm.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.output.LayerNorm.bias", f"git.encoder.layer.{i}.attention.output.LayerNorm.bias")) - 
rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.intermediate.dense.weight", f"git.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.intermediate.dense.bias", f"git.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.output.dense.weight", f"git.encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.output.dense.bias", f"git.encoder.layer.{i}.output.dense.bias")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.output.LayerNorm.weight", f"git.encoder.layer.{i}.output.LayerNorm.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.output.LayerNorm.bias", f"git.encoder.layer.{i}.output.LayerNorm.bias")) - # fmt: on - - if config.num_image_with_embedding is not None: - rename_keys.append(("img_temperal_embedding.0", "git.img_temperal_embedding.0")) - rename_keys.append(("img_temperal_embedding.1", "git.img_temperal_embedding.1")) - rename_keys.append(("img_temperal_embedding.2", "git.img_temperal_embedding.2")) - rename_keys.append(("img_temperal_embedding.3", "git.img_temperal_embedding.3")) - rename_keys.append(("img_temperal_embedding.4", "git.img_temperal_embedding.4")) - rename_keys.append(("img_temperal_embedding.5", "git.img_temperal_embedding.5")) - - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val.T if "image_encoder.visual_projection" in new else val - - -# we split up the matrix of each CLIP encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config, prefix=""): - dim = config.vision_config.hidden_size - for i in range(config.vision_config.num_hidden_layers): - # read in weights + bias of input projection layer (in the original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"{prefix}image_encoder.transformer.resblocks.{i}.attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"{prefix}image_encoder.transformer.resblocks.{i}.attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[ - :dim, : - ] - state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:dim] - state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[ - dim : dim * 2, : - ] - state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[ - dim : dim * 2 - ] - state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[ - -dim:, : - ] - state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-dim:] - - -# We will verify our results on an image -def prepare_img(model_name): - if "textvqa" in model_name: - filepath = hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset") - image = Image.open(filepath).convert("RGB") - else: - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - return image - - -def prepare_video(): - def read_video_pyav(container, indices): - """ - Decode the video with PyAV decoder. - - Args: - container (`av.container.input.InputContainer`): PyAV container. 
- indices (`list[int]`): List of frame indices to decode. - - Returns: - result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3). - """ - frames = [] - container.seek(0) - start_index = indices[0] - end_index = indices[-1] - for i, frame in enumerate(container.decode(video=0)): - if i > end_index: - break - if i >= start_index and i in indices: - frames.append(frame) - return np.stack([x.to_ndarray(format="rgb24") for x in frames]) - - def sample_frame_indices(clip_len, frame_sample_rate, seg_len): - """ - Sample a given number of frame indices from the video. - - Args: - clip_len (`int`): Total number of frames to sample. - frame_sample_rate (`int`): Sample every n-th frame. - seg_len (`int`): Maximum allowed index of sample's last frame. - - Returns: - indices (`list[int]`): List of sampled frame indices - """ - converted_len = int(clip_len * frame_sample_rate) - end_idx = np.random.randint(converted_len, seg_len) - start_idx = end_idx - converted_len - indices = np.linspace(start_idx, end_idx, num=clip_len) - indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64) - return indices - - # set seed for reproducibility - np.random.seed(0) - - file_path = hf_hub_download(repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset") - with av.open(file_path) as container: - # sample 6 frames - num_frames = 6 - indices = sample_frame_indices( - clip_len=num_frames, frame_sample_rate=4, seg_len=container.streams.video[0].frames - ) - frames = read_video_pyav(container, indices) - - return frames - - -@torch.no_grad() -def convert_git_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our GIT structure. - """ - - model_name_to_url = { - "git-base": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE/snapshot/model.pt", - "git-base-coco": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_COCO/snapshot/model.pt", - "git-base-textcaps": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_TEXTCAPS/snapshot/model.pt", - "git-base-vqav2": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_VQAv2/snapshot/model.pt", - "git-base-textvqa": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_TEXTVQA/snapshot/model.pt", # todo - "git-base-vatex": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_VATEX/snapshot/model.pt", - "git-base-msrvtt-qa": ( - "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_MSRVTT_QA/snapshot/model.pt" - ), - "git-large": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE/snapshot/model.pt", - "git-large-coco": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_COCO/snapshot/model.pt", - "git-large-textcaps": ( - "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_TEXTCAPS/snapshot/model.pt" - ), - "git-large-vqav2": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_VQAv2/snapshot/model.pt", - "git-large-textvqa": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_TEXTVQA/snapshot/model.pt", - "git-large-vatex": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_VATEX/snapshot/model.pt", - "git-large-msrvtt-qa": ( - "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_MSRVTT_QA/snapshot/model.pt" - ), - "git-large-r": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_R/snapshot/model.pt", - "git-large-r-coco": 
"https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_R_COCO/snapshot/model.pt", - "git-large-r-textcaps": ( - "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_R_TEXTCAPS/snapshot/model.pt" - ), - } - - model_name_to_path = { - "git-large": "/Users/nielsrogge/Documents/GIT/git_large_model.pt", - "git-large-coco": "/Users/nielsrogge/Documents/GIT/git_large_coco_model.pt", - "git-large-textcaps": "/Users/nielsrogge/Documents/GIT/git_large_textcaps_model.pt", - "git-large-vqav2": "/Users/nielsrogge/Documents/GIT/git_large_vqav2_model.pt", - "git-large-textvqa": "/Users/nielsrogge/Documents/GIT/git_large_textvqa_model.pt", - } - - # define GIT configuration based on model name - config, image_size, is_video = get_git_config(model_name) - if "large" in model_name and not is_video and "large-r" not in model_name: - # large checkpoints take way too long to download - checkpoint_path = model_name_to_path[model_name] - state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["model"] - else: - checkpoint_url = model_name_to_url[model_name] - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", file_name=model_name)[ - "model" - ] - # rename keys - prefix = "module." if model_name == "git-base" else "" - rename_keys = create_rename_keys(config, prefix=prefix) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config, prefix=prefix) - - # load HuggingFace model - model = GitForCausalLM(config) - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - model.eval() - - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) - - assert missing_keys == ["git.embeddings.position_ids", "git.image_encoder.vision_model.embeddings.position_ids"] - assert unexpected_keys == ["git.image_encoder.visual_projection.weight"] - - # verify results - image_processor = ( - VideoMAEImageProcessor( - size={"shortest_edge": image_size}, crop_size={"height": image_size, "width": image_size} - ) - if is_video - else CLIPImageProcessor( - size={"shortest_edge": image_size}, crop_size={"height": image_size, "width": image_size} - ) - ) - tokenizer = AutoTokenizer.from_pretrained( - "google-bert/bert-base-uncased", model_input_names=["input_ids", "attention_mask"] - ) - processor = GitProcessor(tokenizer=tokenizer, image_processor=image_processor) - - if is_video: - video = prepare_video() - pixel_values = processor(images=list(video), return_tensors="pt").pixel_values - else: - image = prepare_img(model_name) - image_transforms = Compose( - [ - Resize(image_size, interpolation=Image.BICUBIC), - CenterCrop(image_size), - ToTensor(), - Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), - ] - ) - original_pixel_values = image_transforms(image).unsqueeze(0) - pixel_values = processor(images=image, return_tensors="pt").pixel_values - - assert torch.allclose(pixel_values, original_pixel_values) - - input_ids = torch.tensor([[101]]) - outputs = model(input_ids, pixel_values=pixel_values) - logits = outputs.logits - print("Logits:", logits[0, -1, :3]) - - if model_name == "git-base": - expected_slice_logits = torch.tensor([-1.2832, -1.2835, -1.2840]) - elif model_name == "git-base-coco": - expected_slice_logits = torch.tensor([-0.9925, -0.9930, -0.9935]) - elif model_name == "git-base-textcaps": - expected_slice_logits = torch.tensor([-1.2980, -1.2983, -1.2985]) - elif model_name == "git-base-vqav2": - expected_slice_logits = 
torch.tensor([-0.8570, -0.8568, -0.8561]) - elif model_name == "git-base-textvqa": - expected_slice_logits = torch.tensor([-1.4085, -1.4083, -1.4082]) - elif model_name == "git-base-vatex": - expected_slice_logits = torch.tensor([-1.3451, -1.3447, -1.3447]) - elif model_name == "git-base-msrvtt-qa": - expected_slice_logits = torch.tensor([-0.8554, -0.8550, -0.8540]) - elif model_name == "git-large": - expected_slice_logits = torch.tensor([-1.1708, -1.1707, -1.1705]) - elif model_name == "git-large-coco": - expected_slice_logits = torch.tensor([-1.0425, -1.0423, -1.0422]) - elif model_name == "git-large-textcaps": - expected_slice_logits = torch.tensor([-1.2705, -1.2708, -1.2706]) - elif model_name == "git-large-vqav2": - expected_slice_logits = torch.tensor([-0.7042, -0.7043, -0.7043]) - elif model_name == "git-large-textvqa": - expected_slice_logits = torch.tensor([-0.8590, -0.8592, -0.8590]) - elif model_name == "git-large-vatex": - expected_slice_logits = torch.tensor([-1.0113, -1.0114, -1.0113]) - elif model_name == "git-large-msrvtt-qa": - expected_slice_logits = torch.tensor([0.0130, 0.0134, 0.0131]) - elif model_name == "git-large-r": - expected_slice_logits = torch.tensor([-1.1283, -1.1285, -1.1286]) - elif model_name == "git-large-r-coco": - expected_slice_logits = torch.tensor([-0.9641, -0.9641, -0.9641]) - elif model_name == "git-large-r-textcaps": - expected_slice_logits = torch.tensor([-1.1121, -1.1120, -1.1124]) - - assert torch.allclose(logits[0, -1, :3], expected_slice_logits, atol=1e-4) - print("Looks ok!") - - prompt = "" - if "textvqa" in model_name: - prompt = "what does the front of the bus say at the top?" - elif "msrvtt-qa" in model_name: - prompt = "what does the woman eat?" - elif "vqa" in model_name: - prompt = "what are the cats doing?" 
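As a side note to the verification logic above: once a checkpoint has been converted and pushed, the intended end-to-end usage is the standard captioning/VQA flow. The following is a minimal, hedged sketch using the public transformers API, not part of this diff; the checkpoint name and image URL are illustrative (any of the converted `microsoft/git-*` repos would do).

# Minimal inference sketch for a converted GIT checkpoint (illustrative repo id).
import requests
import torch
from PIL import Image

from transformers import AutoProcessor, GitForCausalLM

processor = AutoProcessor.from_pretrained("microsoft/git-base-coco")
model = GitForCausalLM.from_pretrained("microsoft/git-base-coco")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
pixel_values = processor(images=image, return_tensors="pt").pixel_values

# For plain captioning no text prompt is needed; for VQA, prepend the question
# via the tokenizer, as the conversion script does right below.
with torch.no_grad():
    generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
print(processor.batch_decode(generated_ids, skip_special_tokens=True))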
- input_ids = tokenizer(prompt, add_special_tokens=False).input_ids - input_ids = [processor.tokenizer.cls_token_id] + input_ids - input_ids = torch.tensor(input_ids).unsqueeze(0) - print("Generating caption...") - generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50) - print("Generated caption:", processor.batch_decode(generated_ids, skip_special_tokens=True)) - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model and processor of {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print(f"Pushing model and processor of {model_name} to the hub...") - model.push_to_hub(f"microsoft/{model_name}") - processor.push_to_hub(f"microsoft/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="git-base", - type=str, - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model to the hub.", - ) - - args = parser.parse_args() - convert_git_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index 4122b7a0df79..bc037912c5c5 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -954,7 +954,7 @@ def __init__(self, config): self.visual_projection = GitProjection(config) if config.num_image_with_embedding is not None: - self.img_temperal_embedding = nn.ParameterList( + self.img_temporal_embedding = nn.ParameterList( nn.Parameter(torch.zeros(1, 1, config.vision_config.hidden_size)) for _ in range(config.num_image_with_embedding) ) @@ -1119,7 +1119,7 @@ def forward( visual_features_frame = self.image_encoder( pixel_values[:, frame_idx, :, :], interpolate_pos_encoding=interpolate_pos_encoding ).last_hidden_state - visual_features_frame += self.img_temperal_embedding[frame_idx] + visual_features_frame += self.img_temporal_embedding[frame_idx] visual_features.append(visual_features_frame) # finally, concatenate all features along sequence dimension diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py b/src/transformers/models/glm/convert_glm_weights_to_hf.py deleted file mode 100644 index df1fd7537f4c..000000000000 --- a/src/transformers/models/glm/convert_glm_weights_to_hf.py +++ /dev/null @@ -1,195 +0,0 @@ -import argparse -import json -import os -import re - -import torch -from safetensors.torch import load_file -from tokenizers import processors - -from transformers import GlmConfig, GlmForCausalLM, PreTrainedTokenizerFast - - -# fmt: off -# `None` means we drop the key -STATE_DICT_MAPPING = { - # CausalLM keys - r"transformer.output_layer.weight": r"lm_head.weight", - - # Model keys - r"transformer.embedding.word_embeddings.weight": r"model.embed_tokens.weight", - r"transformer.rotary_pos_emb.inv_freq": None, - r"transformer.encoder.final_layernorm.weight": r"model.norm.weight", - - # Layers keys - r"transformer.encoder.layers.(\d+).input_layernorm.weight": r"model.layers.\1.input_layernorm.weight", - 
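The GLM conversion script removed here drives its renaming entirely from the STATE_DICT_MAPPING regex table. As a standalone sketch of that mechanism, using one pattern from the table with a made-up layer index:

# Illustration of the regex-driven key renaming used by map_old_key_to_new:
# the first pattern whose substitution fires determines the new key.
import re

pattern = r"transformer.encoder.layers.(\d+).input_layernorm.weight"
replacement = r"model.layers.\1.input_layernorm.weight"

old_key = "transformer.encoder.layers.7.input_layernorm.weight"  # hypothetical key
new_key, n_replace = re.subn(pattern, replacement, old_key)
assert n_replace == 1
print(new_key)  # -> model.layers.7.input_layernorm.weight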
r"transformer.encoder.layers.(\d+).post_attention_layernorm.weight": r"model.layers.\1.post_attention_layernorm.weight", - - # Attention keys - r"transformer.encoder.layers.(\d+).self_attention.dense.weight": r"model.layers.\1.self_attn.o_proj.weight", - # qkv_proj will later be split in q|k|v|_proj - r"transformer.encoder.layers.(\d+).self_attention.query_key_value.(weight|bias)": r"model.layers.\1.self_attn.qkv_proj.\2", - - # MLP keys - r"transformer.encoder.layers.(\d+).mlp.dense_h_to_4h.weight": r"model.layers.\1.mlp.gate_up_proj.weight", - r"transformer.encoder.layers.(\d+).mlp.dense_4h_to_h.weight": r"model.layers.\1.mlp.down_proj.weight", -} -# fmt: on - - -def load_weights(input_dir: str): - safetensor_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".safetensors")] - bin_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".bin")] - - all_weights = {} - - if safetensor_files: - safetensor_files = sorted(safetensor_files, key=lambda x: int(x.rsplit("-", 3)[1])) - for file in safetensor_files: - tensors = load_file(file) - all_weights.update(tensors) - return all_weights - - elif bin_files: - bin_files = sorted(bin_files, key=lambda x: int(x.rsplit("-", 3)[1])) - for file in bin_files: - tensors = torch.load(file, map_location="cpu", weights_only=True) - all_weights.update(tensors) - return all_weights - - else: - raise ValueError("No .safetensors or .bin files found in the specified directory.") - - -def map_old_key_to_new(old_key): - for pattern, replacement in STATE_DICT_MAPPING.items(): - if replacement is None: - if re.fullmatch(pattern, old_key): - return None - else: - new_key, n_replace = re.subn(pattern, replacement, old_key) - # Early exit of the loop - if n_replace > 0: - return new_key - - raise ValueError(f"Key: {old_key} could not be mapped (check the mapping).") - - -def convert_state_dict(original_state_dict: dict, config: GlmConfig): - new_dict = {} - - head_dim = config.hidden_size // config.num_attention_heads - query_size = config.num_attention_heads * head_dim - kv_size = config.num_key_value_heads * head_dim - - for old_key, value in original_state_dict.items(): - new_key = map_old_key_to_new(old_key) - if new_key is None: - continue - - if "qkv_proj." 
in new_key: - q_proj, k_proj, v_proj = ( - value[:query_size, ...], - value[query_size : query_size + kv_size, ...], - value[query_size + kv_size :, ...], - ) - new_dict[new_key.replace("qkv_proj.", "q_proj.")] = q_proj - new_dict[new_key.replace("qkv_proj.", "k_proj.")] = k_proj - new_dict[new_key.replace("qkv_proj.", "v_proj.")] = v_proj - else: - new_dict[new_key] = value - return new_dict - - -def convert_config(original_config: dict): - key_mapping = { - "vocab_size": "padded_vocab_size", - "intermediate_size": "ffn_hidden_size", - "num_hidden_layers": "num_layers", - "max_position_embeddings": "seq_length", - "rms_norm_eps": "layernorm_epsilon", - "head_dim": "kv_channels", - "attention_bias": "add_qkv_bias", - } - similar_keys_to_keep = [ - "num_attention_heads", - "hidden_size", - "attention_dropout", - "use_cache", - "eos_token_id", - "pad_token_id", - "tie_word_embeddings", - ] - new_config_kwargs = {k: original_config[v] for k, v in key_mapping.items()} - new_config_kwargs.update({k: v for k, v in original_config.items() if k in similar_keys_to_keep}) - new_config_kwargs["num_key_value_heads"] = ( - new_config_kwargs["num_attention_heads"] - if not original_config["multi_query_attention"] - else original_config["multi_query_group_num"] - ) - new_config_kwargs["rope_theta"] = 10000.0 * getattr(original_config, "rope_ratio", 1) - - new_config = GlmConfig(**new_config_kwargs) - return new_config - - -def convert_glm_tokenizer(input_dir, use_post_processor=False): - fast_tok = PreTrainedTokenizerFast.from_pretrained(input_dir, model_input_names=["input_ids", "attention_mask"]) - if use_post_processor: - fast_tok._tokenizer.post_processor = processors.Sequence( - [ - processors.ByteLevel(trim_offsets=False), - processors.TemplateProcessing( - single="[gMASK]:0 :0 $A:0", - pair="[gMASK]:0 :0 $A:0 $B:1", - special_tokens=[("[gMASK]", 151331), ("", 151333)], - ), - ], - ) - else: - fast_tok._tokenizer.post_processor = processors.Sequence( - [processors.ByteLevel(trim_offsets=False)], - ) - return fast_tok - - -def convert_glm_model(input_dir, output_dir, use_post_processor=False): - # Load and convert config - with open(os.path.join(input_dir, "config.json")) as f: - original_config = json.load(f) - config = convert_config(original_config) - config.save_pretrained(output_dir) - - # Load and convert weights - original_state_dict = load_weights(input_dir) - new_dict = convert_state_dict(original_state_dict, config) - with torch.device("meta"): - model = GlmForCausalLM(config) - model.load_state_dict(new_dict, strict=True, assign=True) - model.save_pretrained(output_dir) - - # Load and convert tokenizer - tokenizer = convert_glm_tokenizer(input_dir, use_post_processor) - tokenizer.save_pretrained(output_dir) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "input_dir", - type=str, - help="Location of the local folder copied from the Hub.", - ) - parser.add_argument( - "output_dir", - type=str, - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--use_post_processor", - action="store_true", - help="Whether to apply post processor with special tokens", - ) - - args = parser.parse_args() - convert_glm_model(args.input_dir, args.output_dir, args.use_post_processor) diff --git a/src/transformers/models/glm4/convert_glm4_weights_to_hf.py b/src/transformers/models/glm4/convert_glm4_weights_to_hf.py deleted file mode 100644 index 01ad00f517ad..000000000000 --- a/src/transformers/models/glm4/convert_glm4_weights_to_hf.py +++ 
/dev/null @@ -1,199 +0,0 @@ -import argparse -import json -import os -import re - -import torch -from safetensors.torch import load_file -from tokenizers import processors - -from transformers import Glm4Config, Glm4ForCausalLM, PreTrainedTokenizerFast - - -# fmt: off -# `None` means we drop the key -STATE_DICT_MAPPING = { - # CausalLM keys - r"transformer.output_layer.weight": r"lm_head.weight", - - # Model keys - r"transformer.embedding.word_embeddings.weight": r"model.embed_tokens.weight", - r"transformer.rotary_pos_emb.inv_freq": None, - r"transformer.encoder.final_layernorm.weight": r"model.norm.weight", - - # Layers keys - r"transformer.encoder.layers.(\d+).input_layernorm.weight": r"model.layers.\1.input_layernorm.weight", - - # Sandwich keys - r"transformer.encoder.layers.(\d+).post_mlp_layernorm.weight": r"model.layers.\1.post_mlp_layernorm.weight", - r"transformer.encoder.layers.(\d+).post_self_attn_layernorm.weight": r"model.layers.\1.post_self_attn_layernorm.weight", - - r"transformer.encoder.layers.(\d+).post_attention_layernorm.weight": r"model.layers.\1.post_attention_layernorm.weight", - - # Attention keys - r"transformer.encoder.layers.(\d+).self_attention.dense.weight": r"model.layers.\1.self_attn.o_proj.weight", - # qkv_proj will later be split in q|k|v|_proj - r"transformer.encoder.layers.(\d+).self_attention.query_key_value.(weight|bias)": r"model.layers.\1.self_attn.qkv_proj.\2", - - # MLP keys - r"transformer.encoder.layers.(\d+).mlp.dense_h_to_4h.weight": r"model.layers.\1.mlp.gate_up_proj.weight", - r"transformer.encoder.layers.(\d+).mlp.dense_4h_to_h.weight": r"model.layers.\1.mlp.down_proj.weight", -} -# fmt: on - - -def load_weights(input_dir: str): - safetensor_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".safetensors")] - bin_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".bin")] - - all_weights = {} - - if safetensor_files: - safetensor_files = sorted(safetensor_files, key=lambda x: int(x.rsplit("-", 3)[1])) - for file in safetensor_files: - tensors = load_file(file) - all_weights.update(tensors) - return all_weights - - elif bin_files: - bin_files = sorted(bin_files, key=lambda x: int(x.rsplit("-", 3)[1])) - for file in bin_files: - tensors = torch.load(file, map_location="cpu") - all_weights.update(tensors) - return all_weights - - else: - raise ValueError("No .safetensors or .bin files found in the specified directory.") - - -def map_old_key_to_new(old_key): - for pattern, replacement in STATE_DICT_MAPPING.items(): - if replacement is None: - if re.fullmatch(pattern, old_key): - return None - else: - new_key, n_replace = re.subn(pattern, replacement, old_key) - # Early exit of the loop - if n_replace > 0: - return new_key - - raise ValueError(f"Key: {old_key} could not be mapped (check the mapping).") - - -def convert_state_dict(original_state_dict: dict, config: Glm4Config): - new_dict = {} - - head_dim = config.hidden_size // config.num_attention_heads - query_size = config.num_attention_heads * head_dim - kv_size = config.num_key_value_heads * head_dim - - for old_key, value in original_state_dict.items(): - new_key = map_old_key_to_new(old_key) - if new_key is None: - continue - - if "qkv_proj." 
in new_key: - q_proj, k_proj, v_proj = ( - value[:query_size, ...], - value[query_size : query_size + kv_size, ...], - value[query_size + kv_size :, ...], - ) - new_dict[new_key.replace("qkv_proj.", "q_proj.")] = q_proj - new_dict[new_key.replace("qkv_proj.", "k_proj.")] = k_proj - new_dict[new_key.replace("qkv_proj.", "v_proj.")] = v_proj - else: - new_dict[new_key] = value - return new_dict - - -def convert_config(original_config: dict): - key_mapping = { - "vocab_size": "padded_vocab_size", - "intermediate_size": "ffn_hidden_size", - "num_hidden_layers": "num_layers", - "max_position_embeddings": "seq_length", - "rms_norm_eps": "layernorm_epsilon", - "head_dim": "kv_channels", - "attention_bias": "add_qkv_bias", - } - similar_keys_to_keep = [ - "num_attention_heads", - "hidden_size", - "attention_dropout", - "use_cache", - "eos_token_id", - "pad_token_id", - "tie_word_embeddings", - ] - new_config_kwargs = {k: original_config[v] for k, v in key_mapping.items()} - new_config_kwargs.update({k: v for k, v in original_config.items() if k in similar_keys_to_keep}) - new_config_kwargs["num_key_value_heads"] = ( - new_config_kwargs["num_attention_heads"] - if not original_config["multi_query_attention"] - else original_config["multi_query_group_num"] - ) - new_config_kwargs["rope_theta"] = 10000.0 * getattr(original_config, "rope_ratio", 1) - - new_config = Glm4Config(**new_config_kwargs) - return new_config - - -def convert_glm4_tokenizer(input_dir, use_post_processor=False): - fast_tok = PreTrainedTokenizerFast.from_pretrained(input_dir, model_input_names=["input_ids", "attention_mask"]) - if use_post_processor: - fast_tok._tokenizer.post_processor = processors.Sequence( - [ - processors.ByteLevel(trim_offsets=False), - processors.TemplateProcessing( - single="[gMASK]:0 :0 $A:0", - pair="[gMASK]:0 :0 $A:0 $B:1", - special_tokens=[("[gMASK]", 151331), ("", 151333)], - ), - ], - ) - else: - fast_tok._tokenizer.post_processor = processors.Sequence( - [processors.ByteLevel(trim_offsets=False)], - ) - return fast_tok - - -def convert_glm4_model(input_dir, output_dir, use_post_processor=False): - # Load and convert config - with open(os.path.join(input_dir, "config.json")) as f: - original_config = json.load(f) - config = convert_config(original_config) - config.save_pretrained(output_dir) - - # Load and convert weights - original_state_dict = load_weights(input_dir) - new_dict = convert_state_dict(original_state_dict, config) - with torch.device("meta"): - model = Glm4ForCausalLM(config) - model.load_state_dict(new_dict, strict=True, assign=True) - model.save_pretrained(output_dir) - - # Load and convert tokenizer - tokenizer = convert_glm4_tokenizer(input_dir, use_post_processor) - tokenizer.save_pretrained(output_dir) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "input_dir", - type=str, - help="Location of the local folder copied from the Hub.", - ) - parser.add_argument( - "output_dir", - type=str, - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--use_post_processor", - action="store_true", - help="Whether to apply post processor with special tokens", - ) - args = parser.parse_args() - convert_glm4_model(args.input_dir, args.output_dir, args.use_post_processor) diff --git a/src/transformers/models/glm4v/configuration_glm4v.py b/src/transformers/models/glm4v/configuration_glm4v.py index e311cd246c8e..4c417020fa84 100644 --- a/src/transformers/models/glm4v/configuration_glm4v.py +++ 
b/src/transformers/models/glm4v/configuration_glm4v.py @@ -330,7 +330,6 @@ def __init__( video_end_token_id=151342, **kwargs, ): - super().__init__(**kwargs) if isinstance(vision_config, dict): self.vision_config = self.sub_configs["vision_config"](**vision_config) elif vision_config is None: @@ -339,7 +338,6 @@ def __init__( if isinstance(text_config, dict): self.text_config = self.sub_configs["text_config"](**text_config) elif text_config is None: - # For BC use all kwargs to init `TextConfig` self.text_config = self.sub_configs["text_config"](**kwargs) self.image_token_id = image_token_id @@ -349,5 +347,7 @@ def __init__( self.image_start_token_id = image_start_token_id self.image_end_token_id = image_end_token_id + super().__init__(**kwargs) + __all__ = ["Glm4vConfig", "Glm4vTextConfig"] diff --git a/src/transformers/models/glm4v/convert_glm4v_mgt_weights_to_hf.py b/src/transformers/models/glm4v/convert_glm4v_mgt_weights_to_hf.py deleted file mode 100644 index ec1abec38172..000000000000 --- a/src/transformers/models/glm4v/convert_glm4v_mgt_weights_to_hf.py +++ /dev/null @@ -1,645 +0,0 @@ -# coding=utf-8 -# Copyright 2025 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json -import os -import pickle -import re -from pathlib import Path -from typing import Callable, Optional - -import torch -from safetensors.torch import save_file - - -# Avoid Using Megatron Lib -class UnpicklerWrapper(pickle.Unpickler): - def find_class(self, mod_name, name): - class DummyClass: - def __init__(self, *args, **kwargs): - pass - - if mod_name.startswith("megatron") or mod_name.startswith("glm") or mod_name.startswith("__main__"): - return DummyClass - return super().find_class(mod_name, name) - - -pickle.Unpickler = UnpicklerWrapper - - -def dict_access_multi(a_dict, keys): - if len(keys) == 0: - return a_dict - return dict_access_multi(a_dict[keys[0]], keys[1:]) - - -def merge_qkv( - sd_list, - original_tp, - num_attention_heads, - multi_query_group_num, - attention_dim, - multi_query_attention, - interleaved_qkv, -): - if not multi_query_attention and interleaved_qkv: - return torch.cat(sd_list, dim=0) - q, k, v = [], [], [] - for sd in sd_list: - if multi_query_attention: - q_, k_, v_ = sd.split( - [ - num_attention_heads * attention_dim // original_tp, - multi_query_group_num * attention_dim // original_tp, - multi_query_group_num * attention_dim // original_tp, - ], - dim=0, - ) - else: - q_, k_, v_ = sd.chunk(dim=0, chunks=3) - q.append(q_.clone()) - k.append(k_.clone()) - v.append(v_.clone()) - q = torch.cat(q, dim=0) - k = torch.cat(k, dim=0) - v = torch.cat(v, dim=0) - if not interleaved_qkv: - rotary_dim = attention_dim // 2 - half_rot = rotary_dim // 2 - perm_rot = torch.empty(rotary_dim, dtype=torch.long) - perm_rot[0::2] = torch.arange(0, half_rot) - perm_rot[1::2] = torch.arange(half_rot, rotary_dim) - if q.dim() == 2: - qh = q.view(num_attention_heads, attention_dim, -1) - kh = k.view(multi_query_group_num, attention_dim, -1) - qh[:, 
:rotary_dim, :] = qh[:, perm_rot, :] - kh[:, :rotary_dim, :] = kh[:, perm_rot, :] - q = qh.reshape(-1, q.size(-1)) - k = kh.reshape(-1, k.size(-1)) - else: - qh = q.view(num_attention_heads, attention_dim) - kh = k.view(multi_query_group_num, attention_dim) - qh[:, :rotary_dim] = qh[:, perm_rot] - kh[:, :rotary_dim] = kh[:, perm_rot] - q = qh.reshape(-1) - k = kh.reshape(-1) - return q, k, v - - -def merge_glu(sd_list): - return torch.cat( - [sd.chunk(dim=0, chunks=2)[0].clone() for sd in sd_list] - + [sd.chunk(dim=0, chunks=2)[1].clone() for sd in sd_list], - dim=0, - ) - - -def merge_glu_vit(sd_list, original_tp=None): - gate_proj = torch.cat([sd.chunk(dim=0, chunks=2)[0].clone() for sd in sd_list], dim=0) - up_proj = torch.cat([sd.chunk(dim=0, chunks=2)[1].clone() for sd in sd_list], dim=0) - return gate_proj, up_proj - - -def split_glu(sd, cnt, idx): - return torch.cat( - ( - sd.chunk(dim=0, chunks=2)[0].chunk(cnt, dim=0)[idx].clone(), - sd.chunk(dim=0, chunks=2)[1].chunk(cnt, dim=0)[idx].clone(), - ), - dim=0, - ) - - -def merge_qkv_vit(sd_list, original_tp=None): - q, k, v = [], [], [] - for sd in sd_list: - q_, k_, v_ = sd.chunk(dim=0, chunks=3) - q.append(q_.clone().contiguous()) - k.append(k_.clone().contiguous()) - v.append(v_.clone().contiguous()) - q = torch.cat(q, dim=0) - k = torch.cat(k, dim=0) - v = torch.cat(v, dim=0) - combined = torch.cat([q, k, v], dim=0) - return combined - - -def merge_tensors_vit( - tp_sd: list[dict], - keys: list[str], - original_tp: int, - target_tp: int, - slice_dim: Optional[int] = None, - merge_fn: Optional[Callable] = None, -): - cnt = original_tp // target_tp - sd_list = [dict_access_multi(tp_sd[i], keys) for i in range(cnt)] - if slice_dim is not None: - return torch.cat(sd_list, dim=slice_dim) - assert merge_fn is not None - return merge_fn(sd_list, original_tp) - - -def merge_tensors( - tp_sd, - keys, - original_tp, - target_tp, - current_tp, - slice_dim=None, - merge_fn=None, -): - cnt = original_tp // target_tp - offset = cnt * current_tp - sd_list = [dict_access_multi(tp_sd[i + offset], keys) for i in range(cnt)] - if slice_dim is not None: - return torch.cat(sd_list, dim=slice_dim) - assert merge_fn is not None - return merge_fn(sd_list) - - -def save_sharded_model(state_dict, output_path, max_shard_size_gb=5, num_layers=40, vision_num_layers=24): - os.makedirs(output_path, exist_ok=True) - - layered_dict = {} - for layer_idx in range(num_layers): - layer_key = f"layer_{layer_idx}" - layered_dict[layer_key] = {} - - for key, value in state_dict.items(): - if f"model.language_model.layers.{layer_idx}." in key: - layered_dict[layer_key][key] = value - - for layer_idx in range(vision_num_layers): - layer_key = f"visual_layer_{layer_idx}" - layered_dict[layer_key] = {} - - for key, value in state_dict.items(): - if f"model.visual.blocks.{layer_idx}." in key: - layered_dict[layer_key][key] = value - - layered_dict["others"] = {} - for key, value in state_dict.items(): - if not any(f"model.language_model.layers.{i}." in key for i in range(num_layers)) and not any( - f"model.visual.blocks.{i}." 
in key for i in range(vision_num_layers) - ): - layered_dict["others"][key] = value - - # Determine layer ordering - layer_order = [] - for i in range(40): - layer_order.append(f"layer_{i}") - for i in range(24): - layer_order.append(f"visual_layer_{i}") - layer_order.append("others") - - # Calculate sizes and create shards by layer - param_sizes = {} - shards = [] - current_shard = {} - current_shard_size = 0 - max_shard_size_bytes = max_shard_size_gb * 1024 * 1024 * 1024 - - for layer_key in layer_order: - layer_weights = layered_dict[layer_key] - layer_size = sum(param.numel() * param.element_size() for param in layer_weights.values()) - if current_shard_size + layer_size > max_shard_size_bytes and current_shard: - shards.append(current_shard) - current_shard = {} - current_shard_size = 0 - for param_name, param in layer_weights.items(): - current_shard[param_name] = param - current_shard_size += param.numel() * param.element_size() - param_sizes[param_name] = param.numel() * param.element_size() - if current_shard: - shards.append(current_shard) - index_dict = {"metadata": {"total_size": sum(param_sizes.values())}, "weight_map": {}} - - for i, shard in enumerate(shards): - shard_filename = f"model-{i + 1:05d}-of-{len(shards):05d}.safetensors" - shard_path = os.path.join(output_path, shard_filename) - - for param_name in shard: - index_dict["weight_map"][param_name] = shard_filename - - save_file(shard, shard_path, metadata={"format": "pt"}) - print(f"Saved shard {i + 1}/{len(shards)}: {shard_filename}") - print(f" Shard size: {sum(p.numel() * p.element_size() for p in shard.values()) / (1024**3):.2f} GB") - print(f" Keys in shard: {len(shard)}") - - index_path = os.path.join(output_path, "model.safetensors.index.json") - with open(index_path, "w") as f: - json.dump(index_dict, f, indent=2) - - return len(shards) - - -def merge_tp_weights(model_path, output_path, vllm_config_path=None): - tp_size = 0 - for item in Path(model_path).iterdir(): - if item.is_dir(): - match = re.match(r"mp_rank_(\d{2})", item.name) - if match: - tp = int(match.group(1)) - tp_size = max(tp_size, tp + 1) - - print(f"Detected tensor parallel degree TP={tp_size}") - - if tp_size <= 1: - print("Model is already at TP=1, no need to merge") - return - - print(f"Loading vLLM configuration file: {vllm_config_path}") - with open(vllm_config_path, "r") as f: - model_config = json.load(f) - num_layers = model_config.get("num_layers", 40) - vision_num_layers = model_config.get("vision_config", {}).get("num_hidden_layers", 24) - num_heads = model_config.get("num_attention_heads", 32) - num_kv_heads = model_config.get("num_query_groups", 2) - hidden_size = model_config.get("hidden_size", 4096) - head_dim = model_config.get("attention_dim", hidden_size // num_heads) - - print( - f"Model parameters: num_layers={num_layers}, vision_num_layers={vision_num_layers}, " - f"num_heads={num_heads}, multi_query_group_num={num_kv_heads}, hidden_size={hidden_size}" - ) - - weights = [] - for tp_rank in range(tp_size): - print(f"Loading TP shard {tp_rank}...") - weight_path = Path(model_path) / f"mp_rank_{tp_rank:02d}" / "model_optim_rng.pt" - sd = torch.load(weight_path, map_location="cpu", pickle_module=pickle) - - for k in list(sd.keys()): - if "_extra_state" in k or "dummy_parameter" in k: - sd.pop(k) - - if "model" in sd: - weights.append(sd["model"]) - else: - raise ValueError(f"'model' key not found in {weight_path}") - - if not weights: - raise ValueError("No valid weight files found") - - print("Merging tensor parallel 
weights...") - original_pp_enabled = os.path.exists(Path(model_path) / "mp_rank_00_000") - original_tp, original_pp = tp_size, 1 - target_tp = 1 - print(f"TP and PP INFO: original_tp: {original_tp}, original_pp:{original_pp}, target_tp: {target_tp}") - mgt_sd = [ - [ - torch.load( - Path(model_path) - / (f"mp_rank_{j:02d}_{i:03d}" if original_pp_enabled else f"mp_rank_{j:02d}") - / "model_optim_rng.pt", - map_location="cpu", - pickle_module=pickle, - ) - for j in range(original_tp) - ] - for i in range(original_pp) - ] - - interleaved_qkv = False - multi_query_attention = True - num_attention_heads = num_heads - multi_query_group_num = num_kv_heads - attention_dim = head_dim - complete_state_dict = {} - keys = ["model"] - rank = 0 - - # LLM - for pp in range(original_pp): - layer_i = 0 - mgt_encoder_tp_0 = dict_access_multi(mgt_sd[pp][rank], keys) - - while f"decoder.layers.{layer_i}.self_attention.linear_qkv.layer_norm_weight" in mgt_encoder_tp_0: - complete_state_dict.update( - { - f"model.language_model.layers.{layer_i}.input_layernorm.weight": mgt_encoder_tp_0[ - f"decoder.layers.{layer_i}.self_attention.linear_qkv.layer_norm_weight" - ], - f"model.language_model.layers.{layer_i}.post_attention_layernorm.weight": mgt_encoder_tp_0[ - f"decoder.layers.{layer_i}.mlp.linear_fc1.layer_norm_weight" - ], - f"model.language_model.layers.{layer_i}.post_self_attn_layernorm.weight": mgt_encoder_tp_0[ - f"decoder.layers.{layer_i}.post_self_attn_layernorm.weight" - ], - f"model.language_model.layers.{layer_i}.post_mlp_layernorm.weight": mgt_encoder_tp_0[ - f"decoder.layers.{layer_i}.post_mlp_layernorm.weight" - ], - } - ) - - q, k, v = merge_tensors( - tp_sd=mgt_sd[pp], - keys=keys + [f"decoder.layers.{layer_i}.self_attention.linear_qkv.weight"], - original_tp=original_tp, - target_tp=target_tp, - current_tp=0, - merge_fn=lambda sd_list: merge_qkv( - sd_list, - original_tp, - num_attention_heads, - multi_query_group_num, - attention_dim, - multi_query_attention, - interleaved_qkv, - ), - ) - - complete_state_dict[f"model.language_model.layers.{layer_i}.self_attn.q_proj.weight"] = q.clone() - complete_state_dict[f"model.language_model.layers.{layer_i}.self_attn.k_proj.weight"] = k.clone() - complete_state_dict[f"model.language_model.layers.{layer_i}.self_attn.v_proj.weight"] = v.clone() - - if f"decoder.layers.{layer_i}.self_attention.linear_qkv.bias" in mgt_encoder_tp_0: - q_bias, k_bias, v_bias = merge_tensors( - tp_sd=mgt_sd[pp], - keys=keys + [f"decoder.layers.{layer_i}.self_attention.linear_qkv.bias"], - original_tp=original_tp, - target_tp=target_tp, - current_tp=0, - merge_fn=lambda sd_list: merge_qkv( - sd_list, - original_tp, - num_attention_heads, - multi_query_group_num, - attention_dim, - multi_query_attention, - interleaved_qkv, - ), - ) - complete_state_dict[f"model.language_model.layers.{layer_i}.self_attn.q_proj.bias"] = q_bias.clone() - complete_state_dict[f"model.language_model.layers.{layer_i}.self_attn.k_proj.bias"] = k_bias.clone() - complete_state_dict[f"model.language_model.layers.{layer_i}.self_attn.v_proj.bias"] = v_bias.clone() - - o_proj = merge_tensors( - tp_sd=mgt_sd[pp], - keys=keys + [f"decoder.layers.{layer_i}.self_attention.linear_proj.weight"], - original_tp=original_tp, - target_tp=target_tp, - current_tp=0, - slice_dim=1, - ) - complete_state_dict[f"model.language_model.layers.{layer_i}.self_attn.o_proj.weight"] = o_proj.clone() - - # MLP - Use gate_up_proj - complete_state_dict[f"model.language_model.layers.{layer_i}.mlp.gate_up_proj.weight"] = merge_tensors( - 
tp_sd=mgt_sd[pp], - keys=keys + [f"decoder.layers.{layer_i}.mlp.linear_fc1.weight"], - original_tp=original_tp, - target_tp=target_tp, - current_tp=0, - merge_fn=merge_glu, - ).clone() - complete_state_dict[f"model.language_model.layers.{layer_i}.mlp.down_proj.weight"] = merge_tensors( - tp_sd=mgt_sd[pp], - keys=keys + [f"decoder.layers.{layer_i}.mlp.linear_fc2.weight"], - original_tp=original_tp, - target_tp=target_tp, - current_tp=0, - slice_dim=1, - ) - layer_i += 1 - - # Embedded Model, LM Head, and Norm - embed_tokens = merge_tensors( - tp_sd=mgt_sd[0], - keys=["model", "embedding.word_embeddings.weight"], - original_tp=original_tp, - target_tp=target_tp, - current_tp=0, - slice_dim=0, - ) - complete_state_dict["model.language_model.embed_tokens.weight"] = embed_tokens.clone() - lm_head = merge_tensors( - tp_sd=mgt_sd[-1], - keys=["model", "output_layer.weight"], - original_tp=original_tp, - target_tp=target_tp, - current_tp=0, - slice_dim=0, - ) - complete_state_dict["lm_head.weight"] = lm_head.clone() - complete_state_dict["model.language_model.norm.weight"] = mgt_sd[-1][rank]["model"][ - "decoder.final_layernorm.weight" - ].clone() - mgt_encoder_tp_0 = dict_access_multi(mgt_sd[0][0], keys) - - # VLM - for layer_i in range(vision_num_layers): - complete_state_dict[f"model.visual.blocks.{layer_i}.norm1.weight"] = mgt_encoder_tp_0[ - f"vision_model.transformer.layers.{layer_i}.input_layernorm.weight" - ] - complete_state_dict[f"model.visual.blocks.{layer_i}.norm2.weight"] = mgt_encoder_tp_0[ - f"vision_model.transformer.layers.{layer_i}.pre_mlp_layernorm.weight" - ] - - qkv_weight = merge_tensors_vit( - tp_sd=mgt_sd[0], - keys=keys + [f"vision_model.transformer.layers.{layer_i}.self_attention.linear_qkv.weight"], - original_tp=original_tp, - target_tp=target_tp, - merge_fn=merge_qkv_vit, - ) - complete_state_dict[f"model.visual.blocks.{layer_i}.attn.qkv.weight"] = qkv_weight.clone() - - proj_weight = merge_tensors_vit( - tp_sd=mgt_sd[0], - keys=keys + [f"vision_model.transformer.layers.{layer_i}.self_attention.linear_proj.weight"], - original_tp=original_tp, - target_tp=target_tp, - slice_dim=1, - ) - complete_state_dict[f"model.visual.blocks.{layer_i}.attn.proj.weight"] = proj_weight.clone() - - gate_proj_weight, up_proj_weight = merge_tensors_vit( - tp_sd=mgt_sd[0], - keys=keys + [f"vision_model.transformer.layers.{layer_i}.mlp.linear_fc1.weight"], - original_tp=original_tp, - target_tp=target_tp, - merge_fn=lambda sd_list, original_tp: merge_glu_vit(sd_list, original_tp), - ) - complete_state_dict[f"model.visual.blocks.{layer_i}.mlp.gate_proj.weight"] = gate_proj_weight.clone() - complete_state_dict[f"model.visual.blocks.{layer_i}.mlp.up_proj.weight"] = up_proj_weight.clone() - - down_proj_weight = merge_tensors_vit( - tp_sd=mgt_sd[0], - keys=keys + [f"vision_model.transformer.layers.{layer_i}.mlp.linear_fc2.weight"], - original_tp=original_tp, - target_tp=target_tp, - slice_dim=1, - ) - complete_state_dict[f"model.visual.blocks.{layer_i}.mlp.down_proj.weight"] = down_proj_weight.clone() - - complete_state_dict["model.visual.downsample.weight"] = ( - mgt_sd[0][0]["model"]["vision_model.downsample.weight"].clone().contiguous() - ) - complete_state_dict["model.visual.downsample.bias"] = ( - mgt_sd[0][0]["model"]["vision_model.downsample.bias"].clone().contiguous() - ) - - # Merger - gate_proj, up_proj = merge_tensors_vit( - tp_sd=mgt_sd[0], - keys=keys + ["vision_projection.encoder.linear_fc1.weight"], - original_tp=original_tp, - target_tp=target_tp, - merge_fn=merge_glu_vit, - ) - 
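To make the merging above easier to follow: layers sharded along the output dimension (column-parallel in Megatron terms) are merged by concatenating along dim 0, and layers sharded along the input dimension (row-parallel) along dim 1, which is what the `slice_dim` argument of `merge_tensors` selects. A toy sketch with made-up sizes, not taken from the script:

# Toy illustration of merging TP=2 shards back into a full weight matrix.
import torch

out_features, in_features = 6, 4  # hypothetical sizes
full_weight = torch.randn(out_features, in_features)

col_shards = full_weight.chunk(2, dim=0)  # output dim sharded across ranks
row_shards = full_weight.chunk(2, dim=1)  # input dim sharded across ranks

assert torch.equal(torch.cat(col_shards, dim=0), full_weight)  # slice_dim=0 case
assert torch.equal(torch.cat(row_shards, dim=1), full_weight)  # slice_dim=1 case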
- down_proj = merge_tensors_vit( - tp_sd=mgt_sd[0], - keys=keys + ["vision_projection.encoder.linear_fc2.weight"], - original_tp=original_tp, - target_tp=target_tp, - slice_dim=1, - ) - proj = merge_tensors_vit( - tp_sd=mgt_sd[0], - keys=keys + ["vision_projection.encoder.linear_fc_extra.weight"], - original_tp=original_tp, - target_tp=target_tp, - slice_dim=0, - ) - - complete_state_dict["model.visual.merger.gate_proj.weight"] = gate_proj.clone().contiguous() - complete_state_dict["model.visual.merger.up_proj.weight"] = up_proj.clone().contiguous() - complete_state_dict["model.visual.merger.down_proj.weight"] = down_proj.clone().contiguous() - complete_state_dict["model.visual.merger.proj.weight"] = proj.clone().contiguous() - - complete_state_dict["model.visual.merger.post_projection_norm.weight"] = ( - mgt_sd[0][0]["model"]["vision_projection.encoder.layer_norm.weight"].clone().contiguous() - ) - complete_state_dict["model.visual.merger.post_projection_norm.bias"] = ( - mgt_sd[0][0]["model"]["vision_projection.encoder.layer_norm.bias"].clone().contiguous() - ) - complete_state_dict["model.visual.embeddings.position_embedding.weight"] = ( - mgt_sd[0][0]["model"]["vision_model.position_embeddings.weight"].clone().contiguous() - ) - complete_state_dict["model.visual.patch_embed.proj.weight"] = ( - mgt_sd[0][0]["model"]["vision_model.conv3d.weight"].clone().contiguous() - ) - complete_state_dict["model.visual.patch_embed.proj.bias"] = ( - mgt_sd[0][0]["model"]["vision_model.conv3d.bias"].clone().contiguous() - ) - - # Check for additional vision model norm layers mentioned in the expected output - if "vision_model.post_conv_layernorm.weight" in mgt_encoder_tp_0: - complete_state_dict["model.visual.post_conv_layernorm.weight"] = ( - mgt_sd[0][0]["model"]["vision_model.post_conv_layernorm.weight"].clone().contiguous() - ) - - if "vision_model.post_layernorm.weight" in mgt_encoder_tp_0: - complete_state_dict["model.visual.post_layernorm.weight"] = ( - mgt_sd[0][0]["model"]["vision_model.post_layernorm.weight"].clone().contiguous() - ) - - print(f"Total keys in state dict: {len(complete_state_dict)}") - - for key, value in complete_state_dict.items(): - if isinstance(value, torch.Tensor): - complete_state_dict[key] = value.to(torch.bfloat16) - print("Converted all tensors to bfloat16") - # Save Model weight - save_sharded_model( - complete_state_dict, - output_path=output_path, - max_shard_size_gb=5, - num_layers=num_layers, - vision_num_layers=vision_num_layers, - ) - - hf_config = { - "architectures": ["Glm4vForConditionalGeneration"], - "model_type": "glm4v", - "attention_bias": model_config.get("add_qkv_bias", True), - "attention_dropout": 0.0, - "pad_token_id": model_config.get("pad_token_id", 151329), - "eos_token_id": model_config.get("eos_token_id", [151329, 151336, 151338]), - "image_start_token_id": model_config.get("image_start_token_id", 151339), - "image_end_token_id": model_config.get("image_end_token_id", 151340), - "video_start_token_id": model_config.get("video_start_token_id", 151341), - "video_end_token_id": model_config.get("video_end_token_id", 151342), - "image_token_id": model_config.get("image_token_id", 151343), - "video_token_id": model_config.get("video_token_id", 151344), - "hidden_act": model_config.get("hidden_act", "silu"), - "hidden_size": model_config.get("hidden_size", 4096), - "initializer_range": 0.02, - "intermediate_size": model_config.get("ffn_hidden_size", 13696), - "max_position_embeddings": model_config.get("seq_length", 32768), - "num_attention_heads": 
model_config.get("num_attention_heads", 32), - "num_hidden_layers": model_config.get("num_layers", 40), - "num_key_value_heads": model_config.get("multi_query_group_num", 2), - "rms_norm_eps": model_config.get("layernorm_epsilon", 1e-05), - "rope_theta": model_config.get("rotary_base", 10000.0), - "tie_word_embeddings": False, - "dtype": model_config.get("dtype", "bfloat16"), - "transformers_version": "4.53.0dev", - "use_cache": model_config.get("use_cache", True), - "vocab_size": model_config.get("vocab_size", 151552), - "partial_rotary_factor": 0.5, - } - - if "vision_config" in model_config: - vision_config = { - "hidden_size": model_config["vision_config"].get("hidden_size", 1536), - "depth": model_config["vision_config"].get("num_layers", 24), - "num_heads": model_config["vision_config"].get("num_attention_heads", 12), - "attention_bias": model_config["vision_config"].get("attention_bias", False), - "intermediate_size": model_config.get("ffn_hidden_size", 13696), - "hidden_act": model_config["vision_config"].get("hidden_act", "silu"), - "hidden_dropout_prob": model_config["vision_config"].get("hidden_dropout_prob", 0.0), - "initializer_range": 0.02, - "image_size": model_config["vision_config"].get("image_size", 336), - "patch_size": model_config["vision_config"].get("patch_size", 14), - "out_hidden_size": model_config.get("hidden_size", 4096), - "rms_norm_eps": model_config["vision_config"].get("layernorm_epsilon", 1e-05), - "spatial_merge_size": model_config["vision_config"].get("downsample_ratio", 2), - "temporal_patch_size": model_config["vision_config"].get("t_patch", 2), - } - hf_config["vision_config"] = vision_config - - if "rope_scaling" in model_config: - hf_config["rope_scaling"] = model_config["rope_scaling"] - - config_path = os.path.join(output_path, "config.json") - with open(config_path, "w") as f: - json.dump(hf_config, f, indent=2) - - print(f"Conversion complete! 
Model saved to {output_path}") - - -def parse_args(): - parser = argparse.ArgumentParser(description="Convert Megatron model to HuggingFace format") - parser.add_argument( - "--model_path", - type=str, - required=True, - help="Path to Megatron model directory", - ) - parser.add_argument("--output_path", type=str, required=True, help="Output path for HuggingFace model directory") - parser.add_argument( - "--config_path", type=str, help="Path to vLLM configuration file for creating HuggingFace config" - ) - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - merge_tp_weights(args.model_path, args.output_path, args.config_path) diff --git a/src/transformers/models/glm4v/image_processing_glm4v_fast.py b/src/transformers/models/glm4v/image_processing_glm4v_fast.py index fbf4aebaac6a..8cdf31a437ae 100644 --- a/src/transformers/models/glm4v/image_processing_glm4v_fast.py +++ b/src/transformers/models/glm4v/image_processing_glm4v_fast.py @@ -17,6 +17,7 @@ from typing import Optional, Union import torch +from torchvision.transforms.v2 import functional as F from ...image_processing_utils import ( BatchFeature, @@ -38,17 +39,11 @@ from ...utils import ( TensorType, auto_docstring, - is_torchvision_v2_available, logging, ) from .image_processing_glm4v import smart_resize -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - logger = logging.get_logger(__name__) diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py index 7c400edc51c3..3f870db9db05 100644 --- a/src/transformers/models/glm4v/modular_glm4v.py +++ b/src/transformers/models/glm4v/modular_glm4v.py @@ -38,7 +38,6 @@ from ...utils.generic import check_model_inputs from ...video_utils import VideoInput from ..glm4.modeling_glm4 import Glm4MLP, Glm4RMSNorm, eager_attention_forward -from ..qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLConfig from ..qwen2_5_vl.modeling_qwen2_5_vl import ( Qwen2_5_VisionPatchEmbed, Qwen2_5_VisionRotaryEmbedding, @@ -313,7 +312,7 @@ def __init__( super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) -class Glm4vConfig(Qwen2_5_VLConfig): +class Glm4vConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`Glm4vModel`]. It is used to instantiate a GLM-4.1V model according to the specified arguments, defining the model architecture. 
Instantiating a @@ -355,6 +354,10 @@ class Glm4vConfig(Qwen2_5_VLConfig): >>> configuration = model.config ```""" + model_type = "glm4v" + sub_configs = {"vision_config": Glm4vVisionConfig, "text_config": Glm4vTextConfig} + keys_to_ignore_at_inference = ["past_key_values"] + def __init__( self, text_config=None, @@ -367,12 +370,25 @@ def __init__( video_end_token_id=151342, **kwargs, ): - super().__init__() + if isinstance(vision_config, dict): + self.vision_config = self.sub_configs["vision_config"](**vision_config) + elif vision_config is None: + self.vision_config = self.sub_configs["vision_config"]() + + if isinstance(text_config, dict): + self.text_config = self.sub_configs["text_config"](**text_config) + elif text_config is None: + self.text_config = self.sub_configs["text_config"](**kwargs) + + self.image_token_id = image_token_id + self.video_token_id = video_token_id self.video_start_token_id = video_start_token_id self.video_end_token_id = video_end_token_id self.image_start_token_id = image_start_token_id self.image_end_token_id = image_end_token_id + super().__init__(**kwargs) + # Will be used for both Text and Vision modalities class Glm4vRMSNorm(Glm4RMSNorm): @@ -1625,7 +1641,7 @@ def __call__( num_frames = video_grid_thw[video_index][0] video_structure = "" - metadata = video_metadata[i] + metadata = video_metadata[video_index] if metadata.fps is None: logger.warning_once( "SmolVLM requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. " diff --git a/src/transformers/models/glm4v/processing_glm4v.py b/src/transformers/models/glm4v/processing_glm4v.py index 817da3630d52..a8ebb4d41b49 100644 --- a/src/transformers/models/glm4v/processing_glm4v.py +++ b/src/transformers/models/glm4v/processing_glm4v.py @@ -180,7 +180,7 @@ def __call__( num_frames = video_grid_thw[video_index][0] video_structure = "" - metadata = video_metadata[i] + metadata = video_metadata[video_index] if metadata.fps is None: logger.warning_once( "SmolVLM requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. " diff --git a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py index 52004b560da7..b06642e250bc 100644 --- a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py @@ -371,7 +371,6 @@ def __init__( if isinstance(text_config, dict): self.text_config = self.sub_configs["text_config"](**text_config) elif text_config is None: - # For BC use all kwargs to init `TextConfig` self.text_config = self.sub_configs["text_config"](**kwargs) self.image_token_id = image_token_id diff --git a/src/transformers/models/glpn/convert_glpn_to_pytorch.py b/src/transformers/models/glpn/convert_glpn_to_pytorch.py deleted file mode 100644 index 51088fb72443..000000000000 --- a/src/transformers/models/glpn/convert_glpn_to_pytorch.py +++ /dev/null @@ -1,218 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
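Regarding the `video_metadata[i]` -> `video_metadata[video_index]` fix in the GLM-4V processors above: `video_metadata` holds one entry per video, while the loop counter runs over all placeholder items, so indexing by the loop counter is only correct when every item is a video. A toy illustration of the difference (the data layout here is simplified and illustrative, not the processor's real structure):

# Simplified sketch of why the per-video counter must be used for metadata lookup.
video_metadata = ["meta for video 0", "meta for video 1"]  # one entry per video
prompt_items = ["image", "video", "image", "video"]  # interleaved modalities

video_index = 0
for i, item in enumerate(prompt_items):
    if item == "video":
        # video_metadata[i] would use i == 1 and i == 3 (the latter is out of range);
        # video_metadata[video_index] correctly uses 0, then 1.
        metadata = video_metadata[video_index]
        video_index += 1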
-# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert GLPN checkpoints.""" - -import argparse -from collections import OrderedDict -from pathlib import Path - -import requests -import torch -from PIL import Image - -from transformers import GLPNConfig, GLPNForDepthEstimation, GLPNImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def rename_keys(state_dict): - new_state_dict = OrderedDict() - for key, value in state_dict.items(): - if key.startswith("module.encoder"): - key = key.replace("module.encoder", "glpn.encoder") - if key.startswith("module.decoder"): - key = key.replace("module.decoder", "decoder.stages") - if "patch_embed" in key: - # replace for example patch_embed1 by patch_embeddings.0 - idx = key[key.find("patch_embed") + len("patch_embed")] - key = key.replace(f"patch_embed{idx}", f"patch_embeddings.{int(idx) - 1}") - if "norm" in key: - key = key.replace("norm", "layer_norm") - if "glpn.encoder.layer_norm" in key: - # replace for example layer_norm1 by layer_norm.0 - idx = key[key.find("glpn.encoder.layer_norm") + len("glpn.encoder.layer_norm")] - key = key.replace(f"layer_norm{idx}", f"layer_norm.{int(idx) - 1}") - if "layer_norm1" in key: - key = key.replace("layer_norm1", "layer_norm_1") - if "layer_norm2" in key: - key = key.replace("layer_norm2", "layer_norm_2") - if "block" in key: - # replace for example block1 by block.0 - idx = key[key.find("block") + len("block")] - key = key.replace(f"block{idx}", f"block.{int(idx) - 1}") - if "attn.q" in key: - key = key.replace("attn.q", "attention.self.query") - if "attn.proj" in key: - key = key.replace("attn.proj", "attention.output.dense") - if "attn" in key: - key = key.replace("attn", "attention.self") - if "fc1" in key: - key = key.replace("fc1", "dense1") - if "fc2" in key: - key = key.replace("fc2", "dense2") - if "linear_pred" in key: - key = key.replace("linear_pred", "classifier") - if "linear_fuse" in key: - key = key.replace("linear_fuse.conv", "linear_fuse") - key = key.replace("linear_fuse.bn", "batch_norm") - if "linear_c" in key: - # replace for example linear_c4 by linear_c.3 - idx = key[key.find("linear_c") + len("linear_c")] - key = key.replace(f"linear_c{idx}", f"linear_c.{int(idx) - 1}") - if "bot_conv" in key: - key = key.replace("bot_conv", "0.convolution") - if "skip_conv1" in key: - key = key.replace("skip_conv1", "1.convolution") - if "skip_conv2" in key: - key = key.replace("skip_conv2", "2.convolution") - if "fusion1" in key: - key = key.replace("fusion1", "1.fusion") - if "fusion2" in key: - key = key.replace("fusion2", "2.fusion") - if "fusion3" in key: - key = key.replace("fusion3", "3.fusion") - if "fusion" in key and "conv" in key: - key = key.replace("conv", "convolutional_layer") - if key.startswith("module.last_layer_depth"): - key = key.replace("module.last_layer_depth", "head.head") - new_state_dict[key] = value - - return new_state_dict - - -def read_in_k_v(state_dict, config): - # for each of the encoder blocks: - for i in range(config.num_encoder_blocks): - for j in range(config.depths[i]): - # read in weights + bias of keys and values (which is a single matrix in the original implementation) - kv_weight = state_dict.pop(f"glpn.encoder.block.{i}.{j}.attention.self.kv.weight") - kv_bias = state_dict.pop(f"glpn.encoder.block.{i}.{j}.attention.self.kv.bias") - # next, add keys and values (in that order) to the state dict - 
state_dict[f"glpn.encoder.block.{i}.{j}.attention.self.key.weight"] = kv_weight[ - : config.hidden_sizes[i], : - ] - state_dict[f"glpn.encoder.block.{i}.{j}.attention.self.key.bias"] = kv_bias[: config.hidden_sizes[i]] - state_dict[f"glpn.encoder.block.{i}.{j}.attention.self.value.weight"] = kv_weight[ - config.hidden_sizes[i] :, : - ] - state_dict[f"glpn.encoder.block.{i}.{j}.attention.self.value.bias"] = kv_bias[config.hidden_sizes[i] :] - - -# We will verify our results on a COCO image -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - return image - - -@torch.no_grad() -def convert_glpn_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_to_hub=False, model_name=None): - """ - Copy/paste/tweak model's weights to our GLPN structure. - """ - - # load GLPN configuration (Segformer-B4 size) - config = GLPNConfig(hidden_sizes=[64, 128, 320, 512], decoder_hidden_size=64, depths=[3, 8, 27, 3]) - - # load image processor (only resize + rescale) - image_processor = GLPNImageProcessor() - - # prepare image - image = prepare_img() - pixel_values = image_processor(images=image, return_tensors="pt").pixel_values - - logger.info("Converting model...") - - # load original state dict - state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu"), weights_only=True) - - # rename keys - state_dict = rename_keys(state_dict) - - # key and value matrices need special treatment - read_in_k_v(state_dict, config) - - # create HuggingFace model and load state dict - model = GLPNForDepthEstimation(config) - model.load_state_dict(state_dict) - model.eval() - - # forward pass - outputs = model(pixel_values) - predicted_depth = outputs.predicted_depth - - # verify output - if model_name is not None: - if "nyu" in model_name: - expected_slice = torch.tensor( - [[4.4147, 4.0873, 4.0673], [3.7890, 3.2881, 3.1525], [3.7674, 3.5423, 3.4913]] - ) - elif "kitti" in model_name: - expected_slice = torch.tensor( - [[3.4291, 2.7865, 2.5151], [3.2841, 2.7021, 2.3502], [3.1147, 2.4625, 2.2481]] - ) - else: - raise ValueError(f"Unknown model name: {model_name}") - - expected_shape = torch.Size([1, 480, 640]) - - assert predicted_depth.shape == expected_shape - assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-4) - print("Looks ok!") - - # finally, push to hub if required - if push_to_hub: - logger.info("Pushing model and image processor to the hub...") - model.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add model", - use_temp_dir=True, - ) - image_processor.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add image processor", - use_temp_dir=True, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_path", - default=None, - type=str, - help="Path to the original PyTorch checkpoint (.pth file).", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether to upload the model to the HuggingFace hub." 
- ) - parser.add_argument( - "--model_name", - default="glpn-kitti", - type=str, - help="Name of the model in case you're pushing to the hub.", - ) - args = parser.parse_args() - convert_glpn_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub, args.model_name) diff --git a/src/transformers/models/got_ocr2/convert_got_ocr2_weights_to_hf.py b/src/transformers/models/got_ocr2/convert_got_ocr2_weights_to_hf.py deleted file mode 100644 index 9cf873a27567..000000000000 --- a/src/transformers/models/got_ocr2/convert_got_ocr2_weights_to_hf.py +++ /dev/null @@ -1,274 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import gc -import glob -import os -from typing import Optional - -import regex as re -import torch -from huggingface_hub import snapshot_download -from safetensors import safe_open - -from transformers import ( - GotOcr2Config, - GotOcr2ForConditionalGeneration, - GotOcr2ImageProcessor, - GotOcr2Processor, - PreTrainedTokenizerFast, - is_vision_available, -) -from transformers.convert_slow_tokenizer import TikTokenConverter -from transformers.tokenization_utils import AddedToken - - -if is_vision_available(): - from transformers.image_utils import load_image - - -# fmt: off -ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - # Vision encoder mapping - r"model.vision_tower_high.pos_embed": r"vision_tower.pos_embed", - r"model.vision_tower_high.patch_embed.proj": r"vision_tower.patch_embed.projection", - r"model.vision_tower_high.blocks.(\d+).norm": r"vision_tower.layers.\1.layer_norm", - r"model.vision_tower_high.blocks.(\d+).attn": r"vision_tower.layers.\1.attn", - r"model.vision_tower_high.blocks.(\d+).mlp": r"vision_tower.layers.\1.mlp", - r"model.vision_tower_high.neck.0": r"vision_tower.neck.conv1", - r"model.vision_tower_high.neck.1": r"vision_tower.neck.layer_norm1", - r"model.vision_tower_high.neck.2": r"vision_tower.neck.conv2", - r"model.vision_tower_high.neck.3": r"vision_tower.neck.layer_norm2", - r"model.vision_tower_high.net_(\d+)": lambda m: f"multi_modal_projector.conv_upsampler{int(m.group(1)) - 1}", - r"model.mm_projector_vary" : r"multi_modal_projector.multimodal_projector", - r"model.": r"language_model.model.", - r"lm_head": r"language_model.lm_head", -} -# fmt: on - -CONTEXT_LENGTH = 8000 - - -def convert_old_keys_to_new_keys(state_dict_keys: Optional[dict] = None): - """ - This function should be applied only once, on the concatenated keys to efficiently rename using - the key mappings. 
- """ - output_dict = {} - if state_dict_keys is not None: - old_text = "\n".join(state_dict_keys) - new_text = old_text - for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): - new_text = re.sub(pattern, replacement, new_text) - output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) - return output_dict - - -def load_original_state_dict(model_id): - directory_path = snapshot_download(repo_id=model_id, allow_patterns=["*.safetensors"]) - - original_state_dict = {} - for path in glob.glob(f"{directory_path}/*"): - if path.endswith(".safetensors"): - with safe_open(path, framework="pt", device="cpu") as f: - for key in f.keys(): - original_state_dict[key] = f.get_tensor(key) - - return original_state_dict - - -def get_got_ocr2_config(): - config = GotOcr2Config() - - return config - - -def write_model( - model_path, - input_base_path, - push_to_hub=False, -): - os.makedirs(model_path, exist_ok=True) - - config = get_got_ocr2_config() - config.architectures = ["GotOcr2ForConditionalGeneration"] - config.save_pretrained(model_path) - print("Model config saved successfully...") - - # ------------------------------------------------------------ - # Convert weights - # ------------------------------------------------------------ - - print(f"Fetching all parameters from the checkpoint at {input_base_path}...") - state_dict_old = load_original_state_dict(input_base_path) - print("Converting model...") - all_keys = list(state_dict_old.keys()) - new_keys = convert_old_keys_to_new_keys(all_keys) - state_dict = {} - for key in all_keys: - new_key = new_keys[key] - state_dict[new_key] = state_dict_old[key] - - del state_dict_old - gc.collect() - - print("Loading the checkpoint in a GotOcr2ForConditionalGeneration model.") - model = GotOcr2ForConditionalGeneration(config) - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - model = model.to(torch.bfloat16) - print("model dtype:", model.dtype) - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) - - print("Saving the model.") - model.save_pretrained(model_path) - if push_to_hub: - model.push_to_hub("stepfun-ai/GOT-OCR-2.0-hf", use_temp_dir=True) - del state_dict, model - - # Safety check: reload the converted model - gc.collect() - print("Reloading the model to check if it's saved correctly.") - model = GotOcr2ForConditionalGeneration.from_pretrained(model_path, device_map="auto") - processor = GotOcr2Processor.from_pretrained(model_path) - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/image_ocr.jpg" - ) - - inputs = processor(image, return_tensors="pt", format=True).to(model.device, dtype=model.dtype) - generate_ids = model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=4) - decoded_output = processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True) - expected_output = "\\title{\nR" - print("Decoded output:", decoded_output) - assert decoded_output == expected_output - print("Model reloaded successfully.") - del model - - -class GotOcr2Converter(TikTokenConverter): - def __init__( - self, - vocab_file, - special_tokens: list[str], - pattern: str, - model_max_length: int, - chat_template: Optional[str] = None, - **kwargs, - ): - super().__init__(vocab_file, pattern=pattern) - self.additional_special_tokens = special_tokens - tokenizer = self.converted() - if chat_template is not None: - kwargs["chat_template"] = chat_template - self.tokenizer = 
PreTrainedTokenizerFast( - tokenizer_object=tokenizer, - model_input_names=["input_ids", "attention_mask"], - model_max_length=model_max_length, - **kwargs, - ) - - -def write_tokenizer(tokenizer_path: str, save_dir: str, push_to_hub: bool = False): - model_max_length = CONTEXT_LENGTH - pattern = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+" # noqa: W605 - # Special tokens - special_tokens = ( - ["<|endoftext|>", "<|im_start|>", "<|im_end|>"] - + [f"<|extra_{i}|>" for i in range(205)] - + [ - "", - "", - "", - "", - "", - "", - "", - "", - "", - ] - ) - - pad_token = "<|endoftext|>" - pad_token = AddedToken(pad_token, lstrip=False, rstrip=False, normalized=False, single_word=False) - - converter = GotOcr2Converter( - vocab_file=tokenizer_path, - pattern=pattern, - special_tokens=special_tokens, - model_max_length=model_max_length, - pad_token=pad_token, - bos_token="<|endoftext|>", - eos_token="<|endoftext|>", - clean_up_tokenization_spaces=True, - ) - tokenizer = converter.tokenizer - tokenizer.save_pretrained(save_dir) - - if push_to_hub: - tokenizer.push_to_hub("stepfun-ai/GOT-OCR-2.0-hf", use_temp_dir=True) - - -def write_image_processor(save_dir: str, push_to_hub: bool = False): - image_processor = GotOcr2ImageProcessor( - do_resize=True, - size={"height": 1024, "width": 1024}, - do_rescale=True, - rescale_factor=1 / 255, - do_normalize=True, - image_mean=[0.48145466, 0.4578275, 0.40821073], - image_std=[0.26862954, 0.26130258, 0.27577711], - ) - - image_processor.save_pretrained(save_dir) - if push_to_hub: - image_processor.push_to_hub("stepfun-ai/GOT-OCR-2.0-hf", use_temp_dir=True) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_dir", - default="stepfun-ai/GOT-OCR2_0", - help="Location of LLaMA weights, which contains tokenizer.model and model folders", - ) - parser.add_argument( - "--output_dir", - default="GotOcr2", - help="Location to write HF model and tokenizer", - ) - - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." 
- ) - args = parser.parse_args() - write_tokenizer( - tokenizer_path="qwen.tiktoken", - save_dir=args.output_dir, - push_to_hub=args.push_to_hub, - ) - - write_image_processor( - save_dir=args.output_dir, - push_to_hub=args.push_to_hub, - ) - write_model( - model_path=args.output_dir, - input_base_path=args.input_dir, - push_to_hub=args.push_to_hub, - ) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py b/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py index 5277f1c4e13b..a47a1422a5dc 100644 --- a/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +++ b/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py @@ -17,6 +17,7 @@ from typing import Optional, Union import torch +from torchvision.transforms.v2 import functional as F from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( @@ -30,17 +31,10 @@ from ...utils import ( TensorType, auto_docstring, - is_torchvision_v2_available, ) from .image_processing_got_ocr2 import get_optimal_tiled_canvas -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - - class GotOcr2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): """ crop_to_patches (`bool`, *optional*, defaults to `False`): diff --git a/src/transformers/models/gpt2/convert_gpt2_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/gpt2/convert_gpt2_original_tf_checkpoint_to_pytorch.py deleted file mode 100755 index 33f9dabed07f..000000000000 --- a/src/transformers/models/gpt2/convert_gpt2_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,68 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Convert OpenAI GPT checkpoint.""" - -import argparse - -import torch - -from transformers import GPT2Config, GPT2Model, load_tf_weights_in_gpt2 -from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging - - -logging.set_verbosity_info() - - -def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): - # Construct model - if gpt2_config_file == "": - config = GPT2Config() - else: - config = GPT2Config.from_json_file(gpt2_config_file) - model = GPT2Model(config) - - # Load weights from numpy - load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path) - - # Save pytorch-model - pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME - pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME - print(f"Save PyTorch model to {pytorch_weights_dump_path}") - torch.save(model.state_dict(), pytorch_weights_dump_path) - print(f"Save configuration file to {pytorch_config_dump_path}") - with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: - f.write(config.to_json_string()) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--gpt2_config_file", - default="", - type=str, - help=( - "An optional config json file corresponding to the pre-trained OpenAI model. \n" - "This specifies the model architecture." - ), - ) - args = parser.parse_args() - convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py b/src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py deleted file mode 100644 index 3db22857293c..000000000000 --- a/src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py +++ /dev/null @@ -1,71 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Eleuther AI and HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Convert GPT Neo checkpoint.""" - -import argparse -import json - -from transformers import GPTNeoConfig, GPTNeoForCausalLM, load_tf_weights_in_gpt_neo -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): - # Initialise PyTorch model - config_json = json.load(open(config_file, "r")) - config = GPTNeoConfig( - hidden_size=config_json["n_embd"], - num_layers=config_json["n_layer"], - num_heads=config_json["n_head"], - attention_types=config_json["attention_types"], - max_position_embeddings=config_json["n_positions"], - resid_dropout=config_json["res_dropout"], - embed_dropout=config_json["embed_dropout"], - attention_dropout=config_json["attn_dropout"], - ) - print(f"Building PyTorch model from configuration: {config}") - model = GPTNeoForCausalLM(config) - - # Load weights from tf checkpoint - load_tf_weights_in_gpt_neo(model, config, tf_checkpoint_path) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - model.save_pretrained(pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained mesh-tf model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py index 891f77ece304..584e74a8123e 100644 --- a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py @@ -318,7 +318,7 @@ def checku2e(x): candidates.append((self.vocab[wd], wd, e)) if len(candidates) > 0: # the smallest token_id is adopted - _, wd, e = sorted(candidates, key=lambda x: x[0])[0] + _, wd, e = min(candidates, key=lambda x: x[0]) result.append(wd) pos = e else: diff --git a/src/transformers/models/gpt_oss/convert_gpt_oss_weights_to_hf.py b/src/transformers/models/gpt_oss/convert_gpt_oss_weights_to_hf.py deleted file mode 100644 index 736a95247dfb..000000000000 --- a/src/transformers/models/gpt_oss/convert_gpt_oss_weights_to_hf.py +++ /dev/null @@ -1,831 +0,0 @@ -# Copyright 2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import gc -import json -import os -from pathlib import Path -from typing import Optional - -import regex as re -import tiktoken -import torch -from safetensors.torch import load_file as safe_load - -from transformers import ( - GenerationConfig, - GptOssConfig, - GptOssForCausalLM, - PreTrainedTokenizerFast, -) -from transformers.convert_slow_tokenizer import TikTokenConverter - - -# fmt: off -# If a weight needs to be split in two or more keys, use `|` to indicate it. ex: -# r"layers.(\d+).attention.wqkv.weight": r"layers.\1.self_attn.q|k|v|_proj.weight" -ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - r"norm.weight": r"norm.weight", - r"\nnorm.scale": r"\nnorm.weight", - r"unembedding.weight": r"lm_head.weight", - r"embedding": r"embed_tokens", - # special key, wqkv needs to be split afterwards - r"block.(\d+).attn.qkv": r"layers.\1.self_attn.qkv_proj", - r"block.(\d+).attn.out": r"layers.\1.self_attn.o_proj", - r"block.(\d+).attn.sinks": r"layers.\1.self_attn.sinks", - r"block.(\d+).attn.norm.scale": r"layers.\1.input_layernorm.weight", - - r"block.(\d+).mlp.mlp1_weight": r"layers.\1.mlp.experts.gate_up_proj", - r"block.(\d+).mlp.mlp1_bias": r"layers.\1.mlp.experts.gate_up_proj_bias", - r"block.(\d+).mlp.mlp2_weight": r"layers.\1.mlp.experts.down_proj", - r"block.(\d+).mlp.mlp2_bias": r"layers.\1.mlp.experts.down_proj_bias", - r"block.(\d+).mlp.norm.scale": r"layers.\1.post_attention_layernorm.weight", - r"block.(\d+).mlp.gate": r"layers.\1.mlp.router", -} -# fmt: on - - -def convert_old_keys_to_new_keys(state_dict_keys: Optional[dict] = None): - """ - This function should be applied only once, on the concatenated keys to efficiently rename using - the key mappings. - """ - output_dict = {} - if state_dict_keys is not None: - old_text = "\n".join(state_dict_keys) - new_text = old_text - for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): - if replacement is None: - new_text = re.sub(pattern, "", new_text) # an empty line - continue - new_text = re.sub(pattern, replacement, new_text) - output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) - return output_dict - - -FP4_VALUES = [ - +0.0, - +0.5, - +1.0, - +1.5, - +2.0, - +3.0, - +4.0, - +6.0, - -0.0, - -0.5, - -1.0, - -1.5, - -2.0, - -3.0, - -4.0, - -6.0, -] - - -def convert_moe_packed_tensors( - blocks, - scales, - *, - dtype: torch.dtype = torch.bfloat16, - rows_per_chunk: int = 32768 * 1024, -) -> torch.Tensor: - """ - TODO this needs to be documented - """ - import math - - scales = scales.to(torch.int32) - 127 - - assert blocks.shape[:-1] == scales.shape, f"{blocks.shape=} does not match {scales.shape=}" - - lut = torch.tensor(FP4_VALUES, dtype=dtype, device=blocks.device) - - *prefix_shape, G, B = blocks.shape - rows_total = math.prod(prefix_shape) * G - - blocks = blocks.reshape(rows_total, B) - scales = scales.reshape(rows_total, 1) - - out = torch.empty(rows_total, B * 2, dtype=dtype, device=blocks.device) - - for r0 in range(0, rows_total, rows_per_chunk): - r1 = min(r0 + rows_per_chunk, rows_total) - - blk = blocks[r0:r1] - exp = scales[r0:r1] - - # nibble indices -> int64 - idx_lo = (blk & 0x0F).to(torch.long) - idx_hi = (blk >> 4).to(torch.long) - - sub = out[r0:r1] - sub[:, 0::2] = lut[idx_lo] - sub[:, 1::2] = lut[idx_hi] - - torch.ldexp(sub, exp, out=sub) - del idx_lo, idx_hi, blk, exp - - out = out.reshape(*prefix_shape, G, B * 2).view(*prefix_shape, G * B * 2) - out = out.to(torch.float8_e5m2).permute(0, 2, 1).contiguous() - return out - - -def write_model( - model_path, - 
input_base_path, - safe_serialization=True, - instruct=False, - mxfp4=False, -): - os.makedirs(model_path, exist_ok=True) - eos_token_id = 199999 if not instruct else 200002 - pad_token_id = 199999 - - original_config = json.loads((Path(input_base_path) / "config.json").read_text()) - - num_local_experts = original_config.pop("num_experts") - rope_scaling = { - "beta_fast": float(original_config.pop("rope_ntk_beta")), - "beta_slow": float(original_config.pop("rope_ntk_alpha")), - "factor": float(original_config.pop("rope_scaling_factor")), - "rope_type": "yarn", - "truncate": False, - "original_max_position_embeddings": 4096, - } - - config = GptOssConfig( - num_local_experts=num_local_experts, - rope_scaling=rope_scaling, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - **original_config, - ) - - print(f"Fetching all parameters from the checkpoint at {input_base_path}...") - final_ = {} - for file in list(os.listdir(input_base_path)): - if file.endswith(".safetensors"): - final_.update(safe_load(os.path.join(input_base_path, file))) - - print("Converting ..") - all_keys = final_.keys() - new_keys = convert_old_keys_to_new_keys(all_keys) - - state_dict = {} - for key in all_keys: - # Post-process the current_parameter. - new_key = new_keys.get(key, key) - if "lm_head" not in new_key: - new_key = "model." + new_key - print(f"Processing key: {key} -> {new_key}") - if re.search("qkv_proj", new_key): - q_len = config.head_dim * config.num_attention_heads - k_len = config.head_dim * config.num_key_value_heads - q, k, v = ( - final_[key][:q_len, ...], - final_[key][q_len : k_len + q_len, ...], - final_[key][k_len + q_len :, ...], - ) - q_key = re.sub(r"qkv_proj", "q_proj", new_key) - k_key = re.sub(r"qkv_proj", "k_proj", new_key) - v_key = re.sub(r"qkv_proj", "v_proj", new_key) - state_dict[q_key] = q.contiguous().to(torch.bfloat16) - state_dict[k_key] = k.contiguous().to(torch.bfloat16) - state_dict[v_key] = v.contiguous().to(torch.bfloat16) - elif re.search("gate_up_proj|down_proj", new_key) and "bias" not in new_key: - if not mxfp4: - if "scales" in new_key: - continue - elif "blocks" in new_key: - # deal with packed weights - blocks = final_[key] - scales = final_[key.replace("blocks", "scales")] - new_key = new_key.replace(".blocks", "") - unpacked_tensors = convert_moe_packed_tensors(blocks, scales, dtype=torch.bfloat16) - state_dict[new_key] = unpacked_tensors - else: - raise (f"Unidentified {key}, please double check the state dict") - else: - if "scales" in new_key: - new_key = new_key.replace(".scales", "_scales") - state_dict[new_key] = final_[key].contiguous() - elif "blocks" in new_key: - new_key = new_key.replace(".blocks", "_blocks") - state_dict[new_key] = final_[key].contiguous() - else: - raise (f"Unidentified {key}, please double check the state dict") - else: - weight = final_[key] - if not re.search("norm", new_key): - weight = weight.to(torch.bfloat16) # norms are the only ones in float32 - state_dict[new_key] = weight - - del final_ - gc.collect() - - if not mxfp4: - print("Loading the checkpoint in a GptOss model for unpacked format") - with torch.device("meta"): - model = GptOssForCausalLM(config) - model.load_state_dict(state_dict, strict=True, assign=True) - print("Checkpoint loaded successfully.") - del config._name_or_path - - print("Saving the model") - model.save_pretrained(model_path, safe_serialization=safe_serialization) - del state_dict, model - - else: - print("Saving the checkpoint in mxfp4 format") - config.quantization_config = { - 
"quant_method": "mxfp4", - "modules_to_not_convert": [ - "model.layers.*.self_attn", - "model.layers.*.mlp.router", - "model.embed_tokens", - "lm_head", - ], - } - # required as we don't save the model with save_pretrained - config.architectures = ["GptOssForCausalLM"] - config.save_pretrained(model_path) - save_sharded_model(state_dict, model_path) - del state_dict - - gc.collect() - print("Reloading the model to check if it's saved correctly.") - GptOssForCausalLM.from_pretrained(model_path, dtype=torch.bfloat16, device_map="auto") - print("Model reloaded successfully.") - - # generation config - if instruct: - print("Saving generation config...") - generation_config = GenerationConfig( - bos_token_id=199998, # <|startoftext|> - do_sample=True, - eos_token_id=[200002, 199999], # <|return|>, <|endoftext|> - pad_token_id=199999, # <|endoftext|> - temperature=1.0, - top_p=1.0, - ) - generation_config.save_pretrained(model_path) - - -def save_sharded_model(state_dict, model_path): - from safetensors.torch import save_file - - max_shard_size = 4800000000 # 4.8 GB - os.makedirs(model_path, exist_ok=True) - shard_size_counter = 0 - shard_id = 0 - shard_state_dict = {} - total_sharded_dict = {} - safetensors_index = {} - safetensors_index["metadata"] = {"total_size": 0} - safetensors_index["weight_map"] = {} - for key in state_dict.keys(): - size = state_dict[key].numel() * state_dict[key].element_size() - if shard_size_counter + size > max_shard_size: - total_sharded_dict[shard_id] = shard_state_dict - shard_id += 1 - shard_size_counter = 0 - shard_state_dict = {} - shard_state_dict[key] = state_dict[key] - shard_size_counter += size - safetensors_index["metadata"]["total_size"] += size - safetensors_index["weight_map"][key] = shard_id - total_sharded_dict[shard_id] = shard_state_dict - num_shards = len(total_sharded_dict) - 1 - for shard_id, shard_state_dict in total_sharded_dict.items(): - save_file(shard_state_dict, os.path.join(model_path, f"model-{shard_id:05d}-of-{num_shards:05d}.safetensors")) - create_safetensors_index(safetensors_index, num_shards, model_path) - - -def create_safetensors_index(safetensors_index, num_shards, model_path): - for key in safetensors_index["weight_map"].keys(): - shard_id = safetensors_index["weight_map"][key] - safetensors_index["weight_map"][key] = f"model-{shard_id:05d}-of-{num_shards:05d}.safetensors" - with open(os.path.join(model_path, "model.safetensors.index.json"), "w") as f: - json.dump(safetensors_index, f) - - -# Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode -def bytes_to_unicode(): - """ - Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control - characters the bpe code barfs on. - - The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab - if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for - decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup - tables between utf-8 bytes and unicode strings. 
- """ - bs = ( - list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) - ) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8 + n) - n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) - - -class GptOssConverter(TikTokenConverter): - def extract_vocab_merges_from_model(self, tiktoken_url: str): - tokenizer = tiktoken.get_encoding(tiktoken_url) - self.pattern = tokenizer._pat_str - bpe_ranks = tokenizer._mergeable_ranks - byte_encoder = bytes_to_unicode() - - def token_bytes_to_string(b): - return "".join([byte_encoder[ord(char)] for char in b.decode("latin-1")]) - - merges = [] - vocab = {} - for token, rank in bpe_ranks.items(): - vocab[token_bytes_to_string(token)] = rank - if len(token) == 1: - continue - local = [] - for index in range(1, len(token)): - piece_l, piece_r = token[:index], token[index:] - if piece_l in bpe_ranks and piece_r in bpe_ranks and (piece_l + piece_r) in bpe_ranks: - local.append((piece_l, piece_r, rank)) - local = sorted(local, key=lambda x: (bpe_ranks[x[0]], bpe_ranks[x[1]]), reverse=False) - merges.extend(local) - merges = sorted(merges, key=lambda val: val[2], reverse=False) - merges = [(token_bytes_to_string(val[0]), token_bytes_to_string(val[1])) for val in merges] - return vocab, merges - - def __init__( - self, - vocab_file, - model_max_length: int, - chat_template: Optional[str] = None, - **kwargs, - ): - super().__init__(vocab_file, pattern=None) - - # TODO 1st download the vocabfile!!! - tokenizer = tiktoken.get_encoding(vocab_file) - self.additional_special_tokens = {} - # Complete list of Harmony special tokens as per o200k_harmony spec - special_tokens_map = { - "<|startoftext|>": 199998, - "<|endoftext|>": 199999, - "<|return|>": 200002, - "<|constrain|>": 200003, - "<|channel|>": 200005, - "<|start|>": 200006, - "<|end|>": 200007, - "<|message|>": 200008, - "<|call|>": 200012, - "<|endofprompt|>": 200018, - } - - # Add the remaining reserved slots while skipping IDs already present above. - used_ids = set(special_tokens_map.values()) - for k in range(199999, 200018): - if k in used_ids: - continue - special_tokens_map.setdefault(f"<|reserved_{k}|>", k) - - # Keep only token strings (sorted by ID) for TikTokenConverter. - self.additional_special_tokens = [tok for tok, _ in sorted(special_tokens_map.items(), key=lambda x: x[1])] - tokenizer = self.converted() - if chat_template is not None: - kwargs["chat_template"] = chat_template - self.tokenizer = PreTrainedTokenizerFast( - tokenizer_object=tokenizer, - bos_token="<|startoftext|>", - eos_token="<|return|>" if chat_template else "<|endoftext|>", - pad_token="<|endoftext|>", - model_input_names=["input_ids", "attention_mask"], - model_max_length=model_max_length, - **kwargs, - ) - - -def write_tokenizer(tokenizer_path: str, save_dir: str, instruct: bool = False): - # Updated Harmony chat template - chat_template = """{#- - In addition to the normal inputs of `messages` and `tools`, this template also accepts the - following kwargs: - - "builtin_tools": A list, can contain "browser" and/or "python". - - "model_identity": A string that optionally describes the model identity. - - "reasoning_effort": A string that describes the reasoning effort, defaults to "medium". 
- #} - -{#- Tool Definition Rendering ============================================== #} -{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%} - {%- if param_spec.type == "array" -%} - {%- if param_spec['items'] -%} - {%- if param_spec['items']['type'] == "string" -%} - {{- "string[]" }} - {%- elif param_spec['items']['type'] == "number" -%} - {{- "number[]" }} - {%- elif param_spec['items']['type'] == "integer" -%} - {{- "number[]" }} - {%- elif param_spec['items']['type'] == "boolean" -%} - {{- "boolean[]" }} - {%- else -%} - {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%} - {%- if inner_type == "object | object" or inner_type|length > 50 -%} - {{- "any[]" }} - {%- else -%} - {{- inner_type + "[]" }} - {%- endif -%} - {%- endif -%} - {%- if param_spec.nullable -%} - {{- " | null" }} - {%- endif -%} - {%- else -%} - {{- "any[]" }} - {%- if param_spec.nullable -%} - {{- " | null" }} - {%- endif -%} - {%- endif -%} - {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%} - {#- Handle array of types like ["object", "object"] from Union[dict, list] #} - {%- if param_spec.type | length > 1 -%} - {{- param_spec.type | join(" | ") }} - {%- else -%} - {{- param_spec.type[0] }} - {%- endif -%} - {%- elif param_spec.oneOf -%} - {#- Handle oneOf schemas - check for complex unions and fallback to any #} - {%- set has_object_variants = false -%} - {%- for variant in param_spec.oneOf -%} - {%- if variant.type == "object" -%} - {%- set has_object_variants = true -%} - {%- endif -%} - {%- endfor -%} - {%- if has_object_variants and param_spec.oneOf|length > 1 -%} - {{- "any" }} - {%- else -%} - {%- for variant in param_spec.oneOf -%} - {{- render_typescript_type(variant, required_params) -}} - {%- if variant.description %} - {{- "// " + variant.description }} - {%- endif -%} - {%- if variant.default is defined %} - {{ "// default: " + variant.default|tojson }} - {%- endif -%} - {%- if not loop.last %} - {{- " | " }} - {% endif -%} - {%- endfor -%} - {%- endif -%} - {%- elif param_spec.type == "string" -%} - {%- if param_spec.enum -%} - {{- '"' + param_spec.enum|join('" | "') + '"' -}} - {%- else -%} - {{- "string" }} - {%- if param_spec.nullable %} - {{- " | null" }} - {%- endif -%} - {%- endif -%} - {%- elif param_spec.type == "number" -%} - {{- "number" }} - {%- elif param_spec.type == "integer" -%} - {{- "number" }} - {%- elif param_spec.type == "boolean" -%} - {{- "boolean" }} - - {%- elif param_spec.type == "object" -%} - {%- if param_spec.properties -%} - {{- "{\n" }} - {%- for prop_name, prop_spec in param_spec.properties.items() -%} - {{- prop_name -}} - {%- if prop_name not in (param_spec.required or []) -%} - {{- "?" 
}} - {%- endif -%} - {{- ": " }} - {{ render_typescript_type(prop_spec, param_spec.required or []) }} - {%- if not loop.last -%} - {{-", " }} - {%- endif -%} - {%- endfor -%} - {{- "}" }} - {%- else -%} - {{- "object" }} - {%- endif -%} - {%- else -%} - {{- "any" }} - {%- endif -%} -{%- endmacro -%} - -{%- macro render_tool_namespace(namespace_name, tools) -%} - {{- "## " + namespace_name + "\n\n" }} - {{- "namespace " + namespace_name + " {\n\n" }} - {%- for tool in tools %} - {%- set tool = tool.function %} - {{- "// " + tool.description + "\n" }} - {{- "type "+ tool.name + " = " }} - {%- if tool.parameters and tool.parameters.properties %} - {{- "(_: {\n" }} - {%- for param_name, param_spec in tool.parameters.properties.items() %} - {%- if param_spec.description %} - {{- "// " + param_spec.description + "\n" }} - {%- endif %} - {{- param_name }} - {%- if param_name not in (tool.parameters.required or []) -%} - {{- "?" }} - {%- endif -%} - {{- ": " }} - {{- render_typescript_type(param_spec, tool.parameters.required or []) }} - {%- if param_spec.default is defined -%} - {%- if param_spec.enum %} - {{- ", // default: " + param_spec.default }} - {%- elif param_spec.oneOf %} - {{- "// default: " + param_spec.default }} - {%- else %} - {{- ", // default: " + param_spec.default|tojson }} - {%- endif -%} - {%- endif -%} - {%- if not loop.last %} - {{- ",\n" }} - {%- else %} - {{- ",\n" }} - {%- endif -%} - {%- endfor %} - {{- "}) => any;\n\n" }} - {%- else -%} - {{- "() => any;\n\n" }} - {%- endif -%} - {%- endfor %} - {{- "} // namespace " + namespace_name }} -{%- endmacro -%} - -{%- macro render_builtin_tools(browser_tool, python_tool) -%} - {%- if browser_tool %} - {{- "## browser\n\n" }} - {{- "// Tool for browsing.\n" }} - {{- "// The `cursor` appears in brackets before each browsing display: `[{cursor}]`.\n" }} - {{- "// Cite information from the tool using the following format:\n" }} - {{- "// `【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`.\n" }} - {{- "// Do not quote more than 10 words directly from the tool output.\n" }} - {{- "// sources=web (default: web)\n" }} - {{- "namespace browser {\n\n" }} - {{- "// Searches for information related to `query` and displays `topn` results.\n" }} - {{- "type search = (_: {\n" }} - {{- "query: string,\n" }} - {{- "topn?: number, // default: 10\n" }} - {{- "source?: string,\n" }} - {{- "}) => any;\n\n" }} - {{- "// Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines.\n" }} - {{- "// Valid link ids are displayed with the formatting: `【{id}†.*】`.\n" }} - {{- "// If `cursor` is not provided, the most recent page is implied.\n" }} - {{- "// If `id` is a string, it is treated as a fully qualified URL associated with `source`.\n" }} - {{- "// If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the most relevant passage, if available.\n" }} - {{- "// Use this function without `id` to scroll to a new location of an opened page.\n" }} - {{- "type open = (_: {\n" }} - {{- "id?: number | string, // default: -1\n" }} - {{- "cursor?: number, // default: -1\n" }} - {{- "loc?: number, // default: -1\n" }} - {{- "num_lines?: number, // default: -1\n" }} - {{- "view_source?: boolean, // default: false\n" }} - {{- "source?: string,\n" }} - {{- "}) => any;\n\n" }} - {{- "// Finds exact matches of `pattern` in the current page, or the page given by `cursor`.\n" }} - {{- "type find = (_: {\n" }} - {{- "pattern: string,\n" 
}} - {{- "cursor?: number, // default: -1\n" }} - {{- "}) => any;\n\n" }} - {{- "} // namespace browser\n\n" }} - {%- endif -%} - - {%- if python_tool %} - {{- "## python\n\n" }} - {{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n\n" }} - {{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. Depends on the cluster.\n\n" }} - {%- endif -%} -{%- endmacro -%} - -{#- System Message Construction ============================================ #} -{%- macro build_system_message() -%} - {%- if model_identity is not defined %} - {%- set model_identity = "You are ChatGPT, a large language model trained by OpenAI." %} - {%- endif %} - {{- model_identity + "\n" }} - {{- "Knowledge cutoff: 2024-06\n" }} - {{- "Current date: " + strftime_now("%Y-%m-%d") + "\n\n" }} - {%- if reasoning_effort is not defined %} - {%- set reasoning_effort = "medium" %} - {%- endif %} - {{- "Reasoning: " + reasoning_effort + "\n\n" }} - {%- if builtin_tools %} - {{- "# Tools\n\n" }} - {%- set available_builtin_tools = namespace(browser=false, python=false) %} - {%- for tool in builtin_tools %} - {%- if tool == "browser" %} - {%- set available_builtin_tools.browser = true %} - {%- elif tool == "python" %} - {%- set available_builtin_tools.python = true %} - {%- endif %} - {%- endfor %} - {{- render_builtin_tools(available_builtin_tools.browser, available_builtin_tools.python) }} - {%- endif -%} - {{- "# Valid channels: analysis, commentary, final. Channel must be included for every message." }} - {%- if tools -%} - {{- "\nCalls to these tools must go to the commentary channel: 'functions'." 
}} - {%- endif -%} -{%- endmacro -%} - -{#- Main Template Logic ================================================= #} -{#- Set defaults #} - -{#- Render system message #} -{{- "<|start|>system<|message|>" }} -{{- build_system_message() }} -{{- "<|end|>" }} - -{#- Extract developer message #} -{%- if messages[0].role == "developer" or messages[0].role == "system" %} - {%- set developer_message = messages[0].content %} - {%- set loop_messages = messages[1:] %} -{%- else %} - {%- set developer_message = "" %} - {%- set loop_messages = messages %} -{%- endif %} - -{#- Render developer message #} -{%- if developer_message or tools %} - {{- "<|start|>developer<|message|>" }} - {%- if developer_message %} - {{- "# Instructions\n\n" }} - {{- developer_message }} - {%- endif %} - {%- if tools -%} - {{- "\n\n" }} - {{- "# Tools\n\n" }} - {{- render_tool_namespace("functions", tools) }} - {%- endif -%} - {{- "<|end|>" }} -{%- endif %} - -{#- Render messages #} -{%- set last_tool_call = namespace(name=none) %} -{%- for message in loop_messages -%} - {#- At this point only assistant/user/tool messages should remain #} - {%- if message.role == 'assistant' -%} - {#- Checks to ensure the messages are being passed in the format we expect #} - {%- if "content" in message %} - {%- if "<|channel|>analysis<|message|>" in message.content or "<|channel|>final<|message|>" in message.content %} - {{- raise_exception("You have passed a message containing <|channel|> tags in the content field. Instead of doing this, you should pass analysis messages (the string between '<|message|>' and '<|end|>') in the 'thinking' field, and final messages (the string between '<|message|>' and '<|end|>') in the 'content' field.") }} - {%- endif %} - {%- endif %} - {%- if "thinking" in message %} - {%- if "<|channel|>analysis<|message|>" in message.thinking or "<|channel|>final<|message|>" in message.thinking %} - {{- raise_exception("You have passed a message containing <|channel|> tags in the thinking field. Instead of doing this, you should pass analysis messages (the string between '<|message|>' and '<|end|>') in the 'thinking' field, and final messages (the string between '<|message|>' and '<|end|>') in the 'content' field.") }} - {%- endif %} - {%- endif %} - {%- if "tool_calls" in message %} - {#- We need very careful handling here - we want to drop the tool call analysis message if the model #} - {#- has output a later <|final|> message, but otherwise we want to retain it. This is the only case #} - {#- when we render CoT/analysis messages in inference. #} - {%- set future_final_message = namespace(found=false) %} - {%- for future_message in loop_messages[loop.index:] %} - {%- if future_message.role == 'assistant' and "tool_calls" not in future_message %} - {%- set future_final_message.found = true %} - {%- endif %} - {%- endfor %} - {#- We assume max 1 tool call per message, and so we infer the tool call name #} - {#- in "tool" messages from the most recent assistant tool call name #} - {%- set tool_call = message.tool_calls[0] %} - {%- if tool_call.function %} - {%- set tool_call = tool_call.function %} - {%- endif %} - {%- if message.content and message.thinking %} - {{- raise_exception("Cannot pass both content and thinking in an assistant message with tool calls! 
Put the analysis message in one or the other, but not both.") }} - {%- elif message.content and not future_final_message.found %} - {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }} - {%- elif message.thinking and not future_final_message.found %} - {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }} - {%- endif %} - {{- "<|start|>assistant to=" }} - {{- "functions." + tool_call.name + "<|channel|>commentary " }} - {{- (tool_call.content_type if tool_call.content_type is defined else "json") + "<|message|>" }} - {{- tool_call.arguments|tojson }} - {{- "<|call|>" }} - {%- set last_tool_call.name = tool_call.name %} - {%- elif loop.last and not add_generation_prompt %} - {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #} - {#- This is a situation that should only occur in training, never in inference. #} - {%- if "thinking" in message %} - {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }} - {%- endif %} - {#- <|return|> indicates the end of generation, but <|end|> does not #} - {#- <|return|> should never be an input to the model, but we include it as the final token #} - {#- when training, so the model learns to emit it. #} - {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }} - {%- else %} - {#- CoT is dropped during all previous turns, so we never render it for inference #} - {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }} - {%- set last_tool_call.name = none %} - {%- endif %} - {%- elif message.role == 'tool' -%} - {%- if last_tool_call.name is none %} - {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }} - {%- endif %} - {{- "<|start|>functions." + last_tool_call.name }} - {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }} - {%- elif message.role == 'user' -%} - {{- "<|start|>user<|message|>" + message.content + "<|end|>" }} - {%- endif -%} -{%- endfor -%} - -{#- Generation prompt #} -{%- if add_generation_prompt -%} -<|start|>assistant -{%- endif -%}""" - - converter = GptOssConverter( - vocab_file=tokenizer_path, - model_max_length=None, - chat_template=chat_template if instruct else None, - ) - tokenizer = converter.tokenizer - tokenizer.save_pretrained(save_dir) - - if instruct: - print("Saving chat template...") - chat_template_path = os.path.join(save_dir, "chat_template.json") - with open(chat_template_path, "w") as f: - json.dump({"chat_template": chat_template}, f, indent=2) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_dir", - default="/fsx/mohamed/oai-hf/tests/120b", - help="Location of LLaMA weights, which contains tokenizer.model and model folders", - ) - parser.add_argument( - "--output_dir", - default="/fsx/mohamed/oai-hf/tests/120b_converted_packed", - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--safe_serialization", default=True, type=bool, help="Whether or not to save using `safetensors`." 
- ) - parser.add_argument( - "--special_tokens", - default=None, - type=list[str], - help="The list of special tokens that should be added to the ", - ) - - parser.add_argument( - "--instruct", - action="store_true", - help="Whether the model is an instruct model", - ) - - # Only specify this if you want to use the model with mxfp4 quantization - # It means the model will be unpacked, and quantized using mxfp4 during inference if all the triton requirements are satisfied (triton >= 3.4.0) - # Else we have a fallback to the full precision model (bfloat16) - # If not specified, the model will be unpacked during conversion, and will be in fp8/bfloat16 during inference - # Note: mxfp4 should bring an important speedup in inference time with blackwell gpus - parser.add_argument( - "--mxfp4", - action="store_true", - help="Whether to use the original model with mxfp4 quantization or default to the full precision model.", - ) - - args = parser.parse_args() - write_model( - model_path=args.output_dir, - input_base_path=args.input_dir, - safe_serialization=args.safe_serialization, - instruct=args.instruct, - mxfp4=args.mxfp4, - ) - - write_tokenizer( - tokenizer_path="o200k_base", - save_dir=args.output_dir, - instruct=args.instruct, - ) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/gpt_sw3/convert_megatron_to_pytorch.py b/src/transformers/models/gpt_sw3/convert_megatron_to_pytorch.py deleted file mode 100644 index 27ec2f20d89f..000000000000 --- a/src/transformers/models/gpt_sw3/convert_megatron_to_pytorch.py +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright 2022 The HuggingFace Inc. team and the AI-Sweden team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert GPT-SW3 megatron checkpoints to pytorch""" - -import argparse -import os -from os.path import isfile - -import torch - -from transformers import GPT2Config - - -def recursive_print(name, val, spaces=0): - # Format the message. - if name is None: - msg = None - else: - fmt = "." * max(0, spaces - 2) + "# {:" + str(50 - spaces) + "s}" - msg = fmt.format(name) - - # Print and recurse (if needed). - if isinstance(val, dict): - if msg is not None: - print(msg) - for k in val: - recursive_print(k, val[k], spaces + 2) - elif isinstance(val, torch.Tensor): - print(msg, ":", val.size()) - else: - print(msg, ":", val) - - -def fix_query_key_value_ordering(param, num_splits, num_heads, hidden_size): - # Permutes layout of param tensor to [num_splits * num_heads * hidden_size, :] - # for compatibility with later versions of NVIDIA Megatron-LM. - # The inverse operation is performed inside Megatron-LM to read checkpoints: - # https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/checkpointing.py#L209 - # If param is the weight tensor of the self-attention block, the returned tensor - # will have to be transposed one more time to be read by HuggingFace GPT2. 
- input_shape = param.size() - # other versions store [num_heads * num_splits * hidden_size, :] - saved_shape = (num_heads, num_splits, hidden_size) + input_shape[1:] - param = param.view(*saved_shape) - param = param.transpose(0, 1).contiguous() - param = param.view(*input_shape) - return param - - -def convert_megatron_checkpoint(sd_megatron, config): - """ - Converts a Megatron checkpoint to a HuggingFace GPT-SW3 checkpoint. - """ - n_positions = config.n_positions - layers = config.n_layer - vocab_size = config.vocab_size - heads = config.n_head - hidden_size_per_head = config.n_embd // config.n_head - - word_embeddings = sd_megatron["model.language_model.embedding.word_embeddings.weight"][:vocab_size, :] - sd_hf = { - "transformer.wte.weight": word_embeddings, - "transformer.wpe.weight": sd_megatron["model.language_model.embedding.position_embeddings.weight"], - "transformer.ln_f.weight": sd_megatron["model.language_model.encoder.final_layernorm.weight"], - "transformer.ln_f.bias": sd_megatron["model.language_model.encoder.final_layernorm.bias"], - } - - pf = "model.language_model.encoder.layers." - for i in range(layers): - causal_mask = torch.tril(torch.ones((n_positions, n_positions), dtype=torch.bool)) - causal_mask = causal_mask.view(1, 1, n_positions, n_positions) - sd_hf[f"transformer.h.{i}.attn.bias"] = causal_mask - sd_hf[f"transformer.h.{i}.attn.masked_bias"] = torch.tensor(-1e4, dtype=torch.bfloat16) - - sd_hf[f"transformer.h.{i}.ln_1.weight"] = sd_megatron[f"{pf}{i}.input_layernorm.weight"] - sd_hf[f"transformer.h.{i}.ln_1.bias"] = sd_megatron[f"{pf}{i}.input_layernorm.bias"] - - val1 = sd_megatron[f"{pf}{i}.self_attention.query_key_value.weight"] - val1 = fix_query_key_value_ordering(val1, 3, heads, hidden_size_per_head) - sd_hf[f"transformer.h.{i}.attn.c_attn.weight"] = val1.transpose(0, 1).contiguous() - - val2 = sd_megatron[f"{pf}{i}.self_attention.query_key_value.bias"] - val2 = fix_query_key_value_ordering(val2, 3, heads, hidden_size_per_head) - sd_hf[f"transformer.h.{i}.attn.c_attn.bias"] = val2 - - sd_hf[f"transformer.h.{i}.attn.c_proj.weight"] = sd_megatron[f"{pf}{i}.self_attention.dense.weight"].transpose( - 0, 1 - ) - sd_hf[f"transformer.h.{i}.attn.c_proj.bias"] = sd_megatron[f"{pf}{i}.self_attention.dense.bias"] - sd_hf[f"transformer.h.{i}.ln_2.weight"] = sd_megatron[f"{pf}{i}.post_attention_layernorm.weight"] - sd_hf[f"transformer.h.{i}.ln_2.bias"] = sd_megatron[f"{pf}{i}.post_attention_layernorm.bias"] - sd_hf[f"transformer.h.{i}.mlp.c_fc.weight"] = sd_megatron[f"{pf}{i}.mlp.dense_h_to_4h.weight"].transpose(0, 1) - sd_hf[f"transformer.h.{i}.mlp.c_fc.bias"] = sd_megatron[f"{pf}{i}.mlp.dense_h_to_4h.bias"] - sd_hf[f"transformer.h.{i}.mlp.c_proj.weight"] = sd_megatron[f"{pf}{i}.mlp.dense_4h_to_h.weight"].transpose( - 0, 1 - ) - sd_hf[f"transformer.h.{i}.mlp.c_proj.bias"] = sd_megatron[f"{pf}{i}.mlp.dense_4h_to_h.bias"] - - # For LM head, transformers' wants the matrix to weight embeddings. 
- sd_hf["lm_head.weight"] = word_embeddings - - return sd_hf - - -def copy_config(config_hf, config_megatron): - """Copy the config from Megatron to hf.""" - config_hf.vocab_size = 64000 - config_hf.n_positions = config_megatron["encoder_seq_length"] - config_hf.n_embd = config_megatron["hidden_size"] - config_hf.n_layer = config_megatron["num_layers"] - config_hf.n_head = config_megatron["num_attention_heads"] - config_hf.n_inner = config_megatron["ffn_hidden_size"] - config_hf.activation_function = "gelu" - config_hf.resid_pdrop = 0.1 - config_hf.embd_pdrop = 0.1 - config_hf.attn_pdrop = 0.1 - config_hf.layer_norm_epsilon = config_megatron["layernorm_epsilon"] # 1e-5 - config_hf.initializer_range = config_megatron["init_method_std"] # 0.02 - config_hf.apply_query_key_layer_scaling = config_megatron["apply_query_key_layer_scaling"] # True - config_hf.normalize_attention_scores = True - config_hf.use_cache = True - - # This identifies the 6.7B (7B) model which uses a different tokenizer - if config_megatron["hidden_size"] == 4096: - config_hf.bos_token_id = 1 # <|endoftext|> - config_hf.eos_token_id = 1 # <|endoftext|> - config_hf.pad_token_id = 0 # - else: - config_hf.bos_token_id = 2 # - config_hf.eos_token_id = 3 # <|endoftext|> - config_hf.pad_token_id = 0 # - - return config_hf - - -def main(args): - print(args) - - checkpoint_path = args.checkpoint_path - save_path = args.save_path - if isfile(checkpoint_path): - raise FileNotFoundError(f"ERROR! could not find file {checkpoint_path}") - - # Load the model. - checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=True) - - # Load the config. - config_megatron = checkpoint["hyper_parameters"]["cfg"] - config_hf = GPT2Config() - config_hf = copy_config(config_hf=config_hf, config_megatron=config_megatron) - config_hf.architectures = ["GPT2LMHeadModel"] - - sd_megatron = checkpoint["state_dict"] - - # Convert. - print("Converting") - sd_hf = convert_megatron_checkpoint(sd_megatron, config_hf) - - # Print the structure of converted state dict. - if args.print_checkpoint_structure: - recursive_print(None, sd_hf) - - config_hf.tokenizer_class = "GPTSw3Tokenizer" - - # Store the config to file. - print("Saving config") - config_hf.save_pretrained(save_path) - - # Store the state_dict to file. - output_checkpoint_file = os.path.join(save_path, "pytorch_model.bin") - print(f'Saving checkpoint to "{output_checkpoint_file}"') - torch.save(sd_hf, output_checkpoint_file) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--checkpoint_path", - type=str, - required=True, - help="e.g. megatron_gpt--val_loss=2.42-step=38000-consumed_samples=54720000", - ) - parser.add_argument("--save_path", type=str, required=True, help="e.g. 
/home/user/gpt-sw3/hf") - parser.add_argument("--print-checkpoint-structure", action="store_true") - _args = parser.parse_args() - main(_args) diff --git a/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py index 7f9883779c43..8f6059720b04 100644 --- a/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py @@ -458,7 +458,7 @@ def __init__(self, config: GraniteMoeHybridConfig, layer_idx: int): if not is_fast_path_available: logger.warning_once( - "The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`" + "The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`" " is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and" " https://github.com/Dao-AILab/causal-conv1d" ) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py deleted file mode 100644 index b7358e2a015f..000000000000 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ /dev/null @@ -1,491 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Grounding DINO checkpoints from the original repository. 
- -URL: https://github.com/IDEA-Research/GroundingDINO""" - -import argparse - -import requests -import torch -from PIL import Image -from torchvision import transforms as T - -from transformers import ( - AutoTokenizer, - GroundingDinoConfig, - GroundingDinoForObjectDetection, - GroundingDinoImageProcessor, - GroundingDinoProcessor, - SwinConfig, -) - - -IMAGENET_MEAN = [0.485, 0.456, 0.406] -IMAGENET_STD = [0.229, 0.224, 0.225] - - -def get_grounding_dino_config(model_name): - if "tiny" in model_name: - window_size = 7 - embed_dim = 96 - depths = (2, 2, 6, 2) - num_heads = (3, 6, 12, 24) - image_size = 224 - elif "base" in model_name: - window_size = 12 - embed_dim = 128 - depths = (2, 2, 18, 2) - num_heads = (4, 8, 16, 32) - image_size = 384 - else: - raise ValueError("Model not supported, only supports base and large variants") - - backbone_config = SwinConfig( - window_size=window_size, - image_size=image_size, - embed_dim=embed_dim, - depths=depths, - num_heads=num_heads, - out_indices=[2, 3, 4], - ) - - config = GroundingDinoConfig(backbone_config=backbone_config) - - return config - - -def create_rename_keys(state_dict, config): - rename_keys = [] - # fmt: off - ########################################## VISION BACKBONE - START - # patch embedding layer - rename_keys.append(("backbone.0.patch_embed.proj.weight", - "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("backbone.0.patch_embed.proj.bias", - "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.bias")) - rename_keys.append(("backbone.0.patch_embed.norm.weight", - "model.backbone.conv_encoder.model.embeddings.norm.weight")) - rename_keys.append(("backbone.0.patch_embed.norm.bias", - "model.backbone.conv_encoder.model.embeddings.norm.bias")) - - for layer, depth in enumerate(config.backbone_config.depths): - for block in range(depth): - # layernorms - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.bias", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.bias")) - - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.bias", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.bias")) - # attention - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.relative_position_bias_table")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.bias")) - # intermediate - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", - 
f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.bias")) - - # output - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.bias", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.bias")) - - # downsample - if layer!=len(config.backbone_config.depths)-1: - rename_keys.append((f"backbone.0.layers.{layer}.downsample.reduction.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.reduction.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.bias", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.bias")) - - for out_indice in config.backbone_config.out_indices: - # Grounding DINO implementation of out_indices isn't aligned with transformers - rename_keys.append((f"backbone.0.norm{out_indice-1}.weight", - f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.weight")) - rename_keys.append((f"backbone.0.norm{out_indice-1}.bias", - f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.bias")) - - ########################################## VISION BACKBONE - END - - ########################################## ENCODER - START - deformable_key_mappings = { - 'self_attn.sampling_offsets.weight': 'deformable_layer.self_attn.sampling_offsets.weight', - 'self_attn.sampling_offsets.bias': 'deformable_layer.self_attn.sampling_offsets.bias', - 'self_attn.attention_weights.weight': 'deformable_layer.self_attn.attention_weights.weight', - 'self_attn.attention_weights.bias': 'deformable_layer.self_attn.attention_weights.bias', - 'self_attn.value_proj.weight': 'deformable_layer.self_attn.value_proj.weight', - 'self_attn.value_proj.bias': 'deformable_layer.self_attn.value_proj.bias', - 'self_attn.output_proj.weight': 'deformable_layer.self_attn.output_proj.weight', - 'self_attn.output_proj.bias': 'deformable_layer.self_attn.output_proj.bias', - 'norm1.weight': 'deformable_layer.self_attn_layer_norm.weight', - 'norm1.bias': 'deformable_layer.self_attn_layer_norm.bias', - 'linear1.weight': 'deformable_layer.fc1.weight', - 'linear1.bias': 'deformable_layer.fc1.bias', - 'linear2.weight': 'deformable_layer.fc2.weight', - 'linear2.bias': 'deformable_layer.fc2.bias', - 'norm2.weight': 'deformable_layer.final_layer_norm.weight', - 'norm2.bias': 'deformable_layer.final_layer_norm.bias', - } - text_enhancer_key_mappings = { - 'self_attn.in_proj_weight': 'text_enhancer_layer.self_attn.in_proj_weight', - 'self_attn.in_proj_bias': 'text_enhancer_layer.self_attn.in_proj_bias', - 'self_attn.out_proj.weight': 'text_enhancer_layer.self_attn.out_proj.weight', - 'self_attn.out_proj.bias': 'text_enhancer_layer.self_attn.out_proj.bias', - 'linear1.weight': 'text_enhancer_layer.fc1.weight', - 'linear1.bias': 'text_enhancer_layer.fc1.bias', - 'linear2.weight': 'text_enhancer_layer.fc2.weight', - 'linear2.bias': 'text_enhancer_layer.fc2.bias', - 'norm1.weight': 'text_enhancer_layer.layer_norm_before.weight', - 'norm1.bias': 'text_enhancer_layer.layer_norm_before.bias', - 'norm2.weight': 'text_enhancer_layer.layer_norm_after.weight', - 'norm2.bias': 
'text_enhancer_layer.layer_norm_after.bias', - } - fusion_key_mappings = { - 'gamma_v': 'fusion_layer.vision_param', - 'gamma_l': 'fusion_layer.text_param', - 'layer_norm_v.weight': 'fusion_layer.layer_norm_vision.weight', - 'layer_norm_v.bias': 'fusion_layer.layer_norm_vision.bias', - 'layer_norm_l.weight': 'fusion_layer.layer_norm_text.weight', - 'layer_norm_l.bias': 'fusion_layer.layer_norm_text.bias', - 'attn.v_proj.weight': 'fusion_layer.attn.vision_proj.weight', - 'attn.v_proj.bias': 'fusion_layer.attn.vision_proj.bias', - 'attn.l_proj.weight': 'fusion_layer.attn.text_proj.weight', - 'attn.l_proj.bias': 'fusion_layer.attn.text_proj.bias', - 'attn.values_v_proj.weight': 'fusion_layer.attn.values_vision_proj.weight', - 'attn.values_v_proj.bias': 'fusion_layer.attn.values_vision_proj.bias', - 'attn.values_l_proj.weight': 'fusion_layer.attn.values_text_proj.weight', - 'attn.values_l_proj.bias': 'fusion_layer.attn.values_text_proj.bias', - 'attn.out_v_proj.weight': 'fusion_layer.attn.out_vision_proj.weight', - 'attn.out_v_proj.bias': 'fusion_layer.attn.out_vision_proj.bias', - 'attn.out_l_proj.weight': 'fusion_layer.attn.out_text_proj.weight', - 'attn.out_l_proj.bias': 'fusion_layer.attn.out_text_proj.bias', - } - for layer in range(config.encoder_layers): - # deformable - for src, dest in deformable_key_mappings.items(): - rename_keys.append((f"transformer.encoder.layers.{layer}.{src}", - f"model.encoder.layers.{layer}.{dest}")) - # text enhance - for src, dest in text_enhancer_key_mappings.items(): - rename_keys.append((f"transformer.encoder.text_layers.{layer}.{src}", - f"model.encoder.layers.{layer}.{dest}")) - # fusion layers - for src, dest in fusion_key_mappings.items(): - rename_keys.append((f"transformer.encoder.fusion_layers.{layer}.{src}", - f"model.encoder.layers.{layer}.{dest}")) - ########################################## ENCODER - END - - ########################################## DECODER - START - key_mappings_decoder = { - 'cross_attn.sampling_offsets.weight': 'encoder_attn.sampling_offsets.weight', - 'cross_attn.sampling_offsets.bias': 'encoder_attn.sampling_offsets.bias', - 'cross_attn.attention_weights.weight': 'encoder_attn.attention_weights.weight', - 'cross_attn.attention_weights.bias': 'encoder_attn.attention_weights.bias', - 'cross_attn.value_proj.weight': 'encoder_attn.value_proj.weight', - 'cross_attn.value_proj.bias': 'encoder_attn.value_proj.bias', - 'cross_attn.output_proj.weight': 'encoder_attn.output_proj.weight', - 'cross_attn.output_proj.bias': 'encoder_attn.output_proj.bias', - 'norm1.weight': 'encoder_attn_layer_norm.weight', - 'norm1.bias': 'encoder_attn_layer_norm.bias', - 'ca_text.in_proj_weight': 'encoder_attn_text.in_proj_weight', - 'ca_text.in_proj_bias': 'encoder_attn_text.in_proj_bias', - 'ca_text.out_proj.weight': 'encoder_attn_text.out_proj.weight', - 'ca_text.out_proj.bias': 'encoder_attn_text.out_proj.bias', - 'catext_norm.weight': 'encoder_attn_text_layer_norm.weight', - 'catext_norm.bias': 'encoder_attn_text_layer_norm.bias', - 'self_attn.in_proj_weight': 'self_attn.in_proj_weight', - 'self_attn.in_proj_bias': 'self_attn.in_proj_bias', - 'self_attn.out_proj.weight': 'self_attn.out_proj.weight', - 'self_attn.out_proj.bias': 'self_attn.out_proj.bias', - 'norm2.weight': 'self_attn_layer_norm.weight', - 'norm2.bias': 'self_attn_layer_norm.bias', - 'linear1.weight': 'fc1.weight', - 'linear1.bias': 'fc1.bias', - 'linear2.weight': 'fc2.weight', - 'linear2.bias': 'fc2.bias', - 'norm3.weight': 'final_layer_norm.weight', - 'norm3.bias': 
'final_layer_norm.bias', - } - for layer_num in range(config.decoder_layers): - source_prefix_decoder = f'transformer.decoder.layers.{layer_num}.' - target_prefix_decoder = f'model.decoder.layers.{layer_num}.' - - for source_name, target_name in key_mappings_decoder.items(): - rename_keys.append((source_prefix_decoder + source_name, - target_prefix_decoder + target_name)) - ########################################## DECODER - END - - ########################################## Additional - START - for layer_name in state_dict: - #### TEXT BACKBONE - if "bert" in layer_name: - rename_keys.append((layer_name, layer_name.replace("bert", "model.text_backbone"))) - #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM VISION BACKBONE - if "input_proj" in layer_name: - rename_keys.append((layer_name, layer_name.replace("input_proj", "model.input_proj_vision"))) - #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM TEXT BACKBONE - if "feat_map" in layer_name: - rename_keys.append((layer_name, layer_name.replace("feat_map", "model.text_projection"))) - #### DECODER REFERENCE POINT HEAD - if "transformer.decoder.ref_point_head" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer.decoder.ref_point_head", - "model.decoder.reference_points_head"))) - #### DECODER BBOX EMBED - if "transformer.decoder.bbox_embed" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer.decoder.bbox_embed", - "model.decoder.bbox_embed"))) - if "transformer.enc_output" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer", "model"))) - - if "transformer.enc_out_bbox_embed" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer.enc_out_bbox_embed", - "model.encoder_output_bbox_embed"))) - - rename_keys.append(("transformer.level_embed", "model.level_embed")) - rename_keys.append(("transformer.decoder.norm.weight", "model.decoder.layer_norm.weight")) - rename_keys.append(("transformer.decoder.norm.bias", "model.decoder.layer_norm.bias")) - rename_keys.append(("transformer.tgt_embed.weight", "model.query_position_embeddings.weight")) - ########################################## Additional - END - - # fmt: on - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v_encoder(state_dict, config): - ########################################## VISION BACKBONE - START - embed_dim = config.backbone_config.embed_dim - for layer, depth in enumerate(config.backbone_config.depths): - hidden_size = embed_dim * 2**layer - for block in range(depth): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"backbone.0.layers.{layer}.blocks.{block}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"backbone.0.layers.{layer}.blocks.{block}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[ - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.weight" - ] = in_proj_weight[:hidden_size, :] - state_dict[ - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.bias" - ] = in_proj_bias[:hidden_size] - - state_dict[ - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.weight" - ] = in_proj_weight[hidden_size : hidden_size * 2, :] - state_dict[ - 
f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.bias" - ] = in_proj_bias[hidden_size : hidden_size * 2] - - state_dict[ - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.weight" - ] = in_proj_weight[-hidden_size:, :] - state_dict[ - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.bias" - ] = in_proj_bias[-hidden_size:] - ########################################## VISION BACKBONE - END - - -def read_in_q_k_v_text_enhancer(state_dict, config): - hidden_size = config.hidden_size - for idx in range(config.encoder_layers): - # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.query.weight"] = in_proj_weight[ - :hidden_size, : - ] - state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.query.bias"] = in_proj_bias[:hidden_size] - - state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.key.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.key.bias"] = in_proj_bias[ - hidden_size : hidden_size * 2 - ] - - state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.value.weight"] = in_proj_weight[ - -hidden_size:, : - ] - state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.value.bias"] = in_proj_bias[ - -hidden_size: - ] - - -def read_in_q_k_v_decoder(state_dict, config): - hidden_size = config.hidden_size - for idx in range(config.decoder_layers): - # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"model.decoder.layers.{idx}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"model.decoder.layers.{idx}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.decoder.layers.{idx}.self_attn.query.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"model.decoder.layers.{idx}.self_attn.query.bias"] = in_proj_bias[:hidden_size] - - state_dict[f"model.decoder.layers.{idx}.self_attn.key.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"model.decoder.layers.{idx}.self_attn.key.bias"] = in_proj_bias[hidden_size : hidden_size * 2] - - state_dict[f"model.decoder.layers.{idx}.self_attn.value.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"model.decoder.layers.{idx}.self_attn.value.bias"] = in_proj_bias[-hidden_size:] - - # read in weights + bias of cross-attention - in_proj_weight = state_dict.pop(f"model.decoder.layers.{idx}.encoder_attn_text.in_proj_weight") - in_proj_bias = state_dict.pop(f"model.decoder.layers.{idx}.encoder_attn_text.in_proj_bias") - - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.query.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.query.bias"] = in_proj_bias[:hidden_size] - - state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.key.weight"] = 
in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.key.bias"] = in_proj_bias[ - hidden_size : hidden_size * 2 - ] - - state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.value.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.value.bias"] = in_proj_bias[-hidden_size:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - return image - - -def preprocess_caption(caption: str) -> str: - result = caption.lower().strip() - if result.endswith("."): - return result - return result + "." - - -@torch.no_grad() -def convert_grounding_dino_checkpoint(args): - model_name = args.model_name - pytorch_dump_folder_path = args.pytorch_dump_folder_path - push_to_hub = args.push_to_hub - verify_logits = args.verify_logits - - checkpoint_mapping = { - "grounding-dino-tiny": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swint_ogc.pth", - "grounding-dino-base": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swinb_cogcoor.pth", - } - # Define default GroundingDino configuration - config = get_grounding_dino_config(model_name) - - # Load original checkpoint - checkpoint_url = checkpoint_mapping[model_name] - original_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"] - original_state_dict = {k.replace("module.", ""): v for k, v in original_state_dict.items()} - - for name, param in original_state_dict.items(): - print(name, param.shape) - - # Rename keys - new_state_dict = original_state_dict.copy() - rename_keys = create_rename_keys(original_state_dict, config) - - for src, dest in rename_keys: - rename_key(new_state_dict, src, dest) - read_in_q_k_v_encoder(new_state_dict, config) - read_in_q_k_v_text_enhancer(new_state_dict, config) - read_in_q_k_v_decoder(new_state_dict, config) - - # Load HF model - model = GroundingDinoForObjectDetection(config) - model.eval() - missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) - - # Load and process test image - image = prepare_img() - transforms = T.Compose([T.Resize(size=800, max_size=1333), T.ToTensor(), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)]) - original_pixel_values = transforms(image).unsqueeze(0) - - image_processor = GroundingDinoImageProcessor() - tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") - processor = GroundingDinoProcessor(image_processor=image_processor, tokenizer=tokenizer) - - text = "a cat" - inputs = processor(images=image, text=preprocess_caption(text), return_tensors="pt") - - assert torch.allclose(original_pixel_values, inputs.pixel_values, atol=1e-4) - - if verify_logits: - # Running forward - with torch.no_grad(): - outputs = model(**inputs) - - print(outputs.logits[0, :3, :3]) - - expected_slice = torch.tensor( - [[-4.8913, -0.1900, -0.2161], [-4.9653, -0.3719, -0.3950], [-5.9599, -3.3765, -3.3104]] - ) - - assert torch.allclose(outputs.logits[0, :3, :3], expected_slice, atol=1e-4) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model.push_to_hub(f"EduardoPacheco/{model_name}") - 
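The `read_in_q_k_v_*` helpers above all rely on the same slicing of a fused attention projection; a minimal sketch of that split, with a made-up `hidden_size` and only `torch` assumed, looks like this:

import torch

hidden_size = 8  # illustrative only; the helpers use config.hidden_size
in_proj_weight = torch.randn(3 * hidden_size, hidden_size)
in_proj_bias = torch.randn(3 * hidden_size)

# The fused projection stacks the query, key and value blocks in that order,
# so each one is recovered by slicing along the first dimension.
q_w = in_proj_weight[:hidden_size, :]
k_w = in_proj_weight[hidden_size : hidden_size * 2, :]
v_w = in_proj_weight[-hidden_size:, :]
q_b, k_b, v_b = in_proj_bias[:hidden_size], in_proj_bias[hidden_size : hidden_size * 2], in_proj_bias[-hidden_size:]

# Concatenating the slices back reproduces the original fused tensor.
assert torch.equal(torch.cat([q_w, k_w, v_w], dim=0), in_proj_weight)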
processor.push_to_hub(f"EduardoPacheco/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="grounding-dino-tiny", - type=str, - choices=["grounding-dino-tiny", "grounding-dino-base"], - help="Name of the GroundingDino model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." - ) - parser.add_argument( - "--verify_logits", action="store_false", help="Whether or not to verify logits after conversion." - ) - - args = parser.parse_args() - convert_grounding_dino_checkpoint(args) diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino_fast.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino_fast.py index 66528519eef8..744cb5f92923 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino_fast.py @@ -9,6 +9,7 @@ import torch from torchvision.io import read_image +from torchvision.transforms.v2 import functional as F from ...image_processing_utils import BatchFeature, get_size_dict from ...image_processing_utils_fast import ( @@ -32,7 +33,7 @@ validate_annotations, ) from ...processing_utils import Unpack -from ...utils import TensorType, auto_docstring, is_torchvision_v2_available, logging +from ...utils import TensorType, auto_docstring, logging from ...utils.import_utils import requires from .image_processing_grounding_dino import get_size_with_aspect_ratio @@ -41,12 +42,6 @@ from .modeling_grounding_dino import GroundingDinoObjectDetectionOutput -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - - logger = logging.get_logger(__name__) @@ -459,13 +454,7 @@ def resize_annotation( resample (`InterpolationMode`, defaults to `F.InterpolationMode.NEAREST_EXACT`): The resampling filter to use when resizing the masks. """ - interpolation = ( - interpolation - if interpolation is not None - else F.InterpolationMode.NEAREST_EXACT - if is_torchvision_v2_available() - else F.InterpolationMode.NEAREST - ) + interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST_EXACT ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)] new_annotation = {} diff --git a/src/transformers/models/groupvit/configuration_groupvit.py b/src/transformers/models/groupvit/configuration_groupvit.py index d17288ede723..662447e7e984 100644 --- a/src/transformers/models/groupvit/configuration_groupvit.py +++ b/src/transformers/models/groupvit/configuration_groupvit.py @@ -289,7 +289,7 @@ def __init__( # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different. for key, value in _text_config_dict.items(): - if key in text_config and value != text_config[key] and key not in ["transformers_version"]: + if key in text_config and value != text_config[key] and key != "transformers_version": # If specified in `text_config_dict` if key in text_config_dict: message = ( @@ -321,7 +321,7 @@ def __init__( # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different. 
for key, value in _vision_config_dict.items(): - if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]: + if key in vision_config and value != vision_config[key] and key != "transformers_version": # If specified in `vision_config_dict` if key in vision_config_dict: message = ( diff --git a/src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py b/src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py deleted file mode 100644 index ac6844bd34c6..000000000000 --- a/src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py +++ /dev/null @@ -1,217 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Convert GroupViT checkpoints from the original repository. - -URL: https://github.com/NVlabs/GroupViT -""" - -import argparse - -import requests -import torch -from PIL import Image - -from transformers import CLIPProcessor, GroupViTConfig, GroupViTModel - - -def rename_key(name): - # vision encoder - if "img_encoder.pos_embed" in name: - name = name.replace("img_encoder.pos_embed", "vision_model.embeddings.position_embeddings") - if "img_encoder.patch_embed.proj" in name: - name = name.replace("img_encoder.patch_embed.proj", "vision_model.embeddings.patch_embeddings.projection") - if "img_encoder.patch_embed.norm" in name: - name = name.replace("img_encoder.patch_embed.norm", "vision_model.embeddings.layernorm") - if "img_encoder.layers" in name: - name = name.replace("img_encoder.layers", "vision_model.encoder.stages") - if "blocks" in name and "res" not in name: - name = name.replace("blocks", "layers") - if "attn" in name and "pre_assign" not in name: - name = name.replace("attn", "self_attn") - if "proj" in name and "self_attn" in name and "text" not in name: - name = name.replace("proj", "out_proj") - if "pre_assign_attn.attn.proj" in name: - name = name.replace("pre_assign_attn.attn.proj", "pre_assign_attn.attn.out_proj") - if "norm1" in name: - name = name.replace("norm1", "layer_norm1") - if "norm2" in name and "pre_assign" not in name: - name = name.replace("norm2", "layer_norm2") - if "img_encoder.norm" in name: - name = name.replace("img_encoder.norm", "vision_model.layernorm") - # text encoder - if "text_encoder.token_embedding" in name: - name = name.replace("text_encoder.token_embedding", "text_model.embeddings.token_embedding") - if "text_encoder.positional_embedding" in name: - name = name.replace("text_encoder.positional_embedding", "text_model.embeddings.position_embedding.weight") - if "text_encoder.transformer.resblocks." 
in name: - name = name.replace("text_encoder.transformer.resblocks.", "text_model.encoder.layers.") - if "ln_1" in name: - name = name.replace("ln_1", "layer_norm1") - if "ln_2" in name: - name = name.replace("ln_2", "layer_norm2") - if "c_fc" in name: - name = name.replace("c_fc", "fc1") - if "c_proj" in name: - name = name.replace("c_proj", "fc2") - if "text_encoder" in name: - name = name.replace("text_encoder", "text_model") - if "ln_final" in name: - name = name.replace("ln_final", "final_layer_norm") - # projection layers - if "img_projector.linear_hidden." in name: - name = name.replace("img_projector.linear_hidden.", "visual_projection.") - if "img_projector.linear_out." in name: - name = name.replace("img_projector.linear_out.", "visual_projection.3.") - if "text_projector.linear_hidden" in name: - name = name.replace("text_projector.linear_hidden", "text_projection") - if "text_projector.linear_out" in name: - name = name.replace("text_projector.linear_out", "text_projection.3") - - return name - - -def convert_state_dict(orig_state_dict, config): - for key in orig_state_dict.copy(): - val = orig_state_dict.pop(key) - - if "qkv" in key: - # weights and biases of the key, value and query projections of vision encoder's attention layers require special treatment: - # we need to split them up into separate matrices/vectors - key_split = key.split(".") - stage_num, layer_num = int(key_split[2]), int(key_split[4]) - dim = config.vision_config.hidden_size - if "weight" in key: - orig_state_dict[ - f"vision_model.encoder.stages.{stage_num}.layers.{layer_num}.self_attn.q_proj.weight" - ] = val[:dim, :] - orig_state_dict[ - f"vision_model.encoder.stages.{stage_num}.layers.{layer_num}.self_attn.k_proj.weight" - ] = val[dim : dim * 2, :] - orig_state_dict[ - f"vision_model.encoder.stages.{stage_num}.layers.{layer_num}.self_attn.v_proj.weight" - ] = val[-dim:, :] - else: - orig_state_dict[ - f"vision_model.encoder.stages.{stage_num}.layers.{layer_num}.self_attn.q_proj.bias" - ] = val[:dim] - orig_state_dict[ - f"vision_model.encoder.stages.{stage_num}.layers.{layer_num}.self_attn.k_proj.bias" - ] = val[dim : dim * 2] - orig_state_dict[ - f"vision_model.encoder.stages.{stage_num}.layers.{layer_num}.self_attn.v_proj.bias" - ] = val[-dim:] - elif "in_proj" in key: - # weights and biases of the key, value and query projections of text encoder's attention layers require special treatment: - # we need to split them up into separate matrices/vectors - key_split = key.split(".") - layer_num = int(key_split[3]) - dim = config.text_config.hidden_size - if "weight" in key: - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.q_proj.weight"] = val[:dim, :] - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.k_proj.weight"] = val[ - dim : dim * 2, : - ] - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.v_proj.weight"] = val[-dim:, :] - else: - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.q_proj.bias"] = val[:dim] - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.k_proj.bias"] = val[dim : dim * 2] - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] - else: - new_name = rename_key(key) - # squeeze if necessary - if ( - "text_projection.0" in new_name - or "text_projection.3" in new_name - or "visual_projection.0" in new_name - or "visual_projection.3" in new_name - ): - orig_state_dict[new_name] = val.squeeze_() - else: - orig_state_dict[new_name] = val - - return 
orig_state_dict - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_groupvit_checkpoint( - checkpoint_path, pytorch_dump_folder_path, model_name="groupvit-gcc-yfcc", push_to_hub=False -): - """ - Copy/paste/tweak model's weights to the Transformers design. - """ - config = GroupViTConfig() - model = GroupViTModel(config).eval() - - state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["model"] - new_state_dict = convert_state_dict(state_dict, config) - missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) - assert missing_keys == ["text_model.embeddings.position_ids"] - assert (unexpected_keys == ["multi_label_logit_scale"]) or (len(unexpected_keys) == 0) - - # verify result - processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") - image = prepare_img() - inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt") - - with torch.no_grad(): - outputs = model(**inputs) - - if model_name == "groupvit-gcc-yfcc": - expected_logits = torch.tensor([[13.3523, 6.3629]]) - elif model_name == "groupvit-gcc-redcaps": - expected_logits = torch.tensor([[16.1873, 8.6230]]) - else: - raise ValueError(f"Model name {model_name} not supported.") - assert torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3) - - processor.save_pretrained(pytorch_dump_folder_path) - model.save_pretrained(pytorch_dump_folder_path) - print("Successfully saved processor and model to", pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing to the hub...") - processor.push_to_hub(model_name, organization="nielsr") - model.push_to_hub(model_name, organization="nielsr") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to dump the processor and PyTorch model." - ) - parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to GroupViT checkpoint") - parser.add_argument( - "--model_name", - default="groupvit-gccy-fcc", - type=str, - help="Name of the model. Expecting either 'groupvit-gcc-yfcc' or 'groupvit-gcc-redcaps'", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether or not to push the converted model and processor to the 🤗 hub using the provided `model_name`.", - ) - args = parser.parse_args() - - convert_groupvit_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.model_name, args.push_to_hub) diff --git a/src/transformers/models/groupvit/modeling_groupvit.py b/src/transformers/models/groupvit/modeling_groupvit.py index 775ebd286f0a..3335df375da9 100644 --- a/src/transformers/models/groupvit/modeling_groupvit.py +++ b/src/transformers/models/groupvit/modeling_groupvit.py @@ -74,7 +74,7 @@ def gumbel_softmax(logits: torch.Tensor, tau: float = 1, hard: bool = False, dim y_hard = torch.zeros_like(logits, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0) ret = y_hard - y_soft.detach() + y_soft else: - # Reparametrization trick. + # Reparameterization trick. 
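The straight-through estimator used in the `gumbel_softmax` context above can be sketched on its own as follows (temperature and shapes are illustrative; only `torch` is assumed):

import torch

tau, dim = 1.0, -1
logits = torch.randn(2, 5, requires_grad=True)
gumbels = -torch.empty_like(logits).exponential_().log()    # Gumbel(0, 1) noise
y_soft = ((logits + gumbels) / tau).softmax(dim)             # differentiable relaxed sample
index = y_soft.argmax(dim, keepdim=True)
y_hard = torch.zeros_like(logits).scatter_(dim, index, 1.0)  # one-hot used in the forward pass
ret = y_hard - y_soft.detach() + y_soft                      # backward pass sees only y_soft
ret.sum().backward()
assert logits.grad is not None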
ret = y_soft return ret @@ -662,7 +662,7 @@ def forward( attn_weights = nn.functional.softmax(attn_weights, dim=-1) if output_attentions: - # this operation is a bit akward, but it's required to + # this operation is a bit awkward, but it's required to # make sure that attn_weights keeps its gradient. # In order to do so, attn_weights have to reshaped # twice and have to be reused in the following diff --git a/src/transformers/models/hiera/convert_hiera_to_hf.py b/src/transformers/models/hiera/convert_hiera_to_hf.py deleted file mode 100644 index fb23803c65f5..000000000000 --- a/src/transformers/models/hiera/convert_hiera_to_hf.py +++ /dev/null @@ -1,368 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Hiera checkpoints from the original repository. - -URL: https://github.com/facebookresearch/hiera -""" - -import argparse -import json -import math - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from torchvision import transforms - -from transformers import BitImageProcessor, HieraConfig, HieraForImageClassification, HieraForPreTraining, HieraModel -from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config: HieraConfig, base_model: bool, mae_model: bool): - rename_keys = [] - # fmt: off - num_stages = len(config.depths) - # embedding dimensions for input and stages - dims = [config.embed_dim] + [int(config.embed_dim * config.embed_dim_multiplier**i) for i in range(num_stages)] - - global_layer_idx = 0 - for stage_idx in range(num_stages): - dim_in = dims[stage_idx] - dim_out = dims[stage_idx + 1] - for layer_idx in range(config.depths[stage_idx]): - rename_keys.append((f"blocks.{global_layer_idx}.norm1.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.layernorm_before.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.norm1.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.layernorm_before.bias")) - rename_keys.append((f"blocks.{global_layer_idx}.attn.qkv.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.attn.qkv.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.attn.qkv.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.attn.qkv.bias")) - rename_keys.append((f"blocks.{global_layer_idx}.attn.proj.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.attn.proj.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.attn.proj.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.attn.proj.bias")) - rename_keys.append((f"blocks.{global_layer_idx}.norm2.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.layernorm_after.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.norm2.bias", 
f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.layernorm_after.bias")) - rename_keys.append((f"blocks.{global_layer_idx}.mlp.fc1.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.mlp.fc1.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.mlp.fc1.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.mlp.fc1.bias")) - rename_keys.append((f"blocks.{global_layer_idx}.mlp.fc2.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.mlp.fc2.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.mlp.fc2.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.mlp.fc2.bias")) - - # projection layer only for the first layer of each stage boundary (except the first stage) - if dim_out != dim_in and layer_idx == 0: - rename_keys.append((f"blocks.{global_layer_idx}.proj.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.proj.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.proj.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.proj.bias")) - - global_layer_idx += 1 - - # projection layer + position embeddings - rename_keys.extend( - [ - ("patch_embed.proj.weight", "hiera.embeddings.patch_embeddings.projection.weight"), - ("patch_embed.proj.bias", "hiera.embeddings.patch_embeddings.projection.bias") - ] - ) - - rename_keys.append(("pos_embed", "hiera.embeddings.position_embeddings")) - - if base_model: - # layernorm + pooler - rename_keys.extend([("norm.weight", "pooler.layernorm.weight"), ("norm.bias", "pooler.layernorm.bias")]) - # if just the base model, we should remove "hiera" from all keys that start with "hiera" - rename_keys = [(pair[0], pair[1][6:]) if pair[1].startswith("hiera") else pair for pair in rename_keys] - elif mae_model: - rename_keys.extend( - [ - ("encoder_norm.weight", "encoder_norm.weight"), - ("encoder_norm.bias", "encoder_norm.bias"), - ("mask_token", "decoder.mask_token"), - ("decoder_pos_embed", "decoder.decoder_position_embeddings"), - ("decoder_norm.weight", "decoder.decoder_norm.weight"), - ("decoder_norm.bias", "decoder.decoder_norm.bias"), - ("decoder_pred.weight", "decoder.decoder_pred.weight"), - ("decoder_pred.bias", "decoder.decoder_pred.bias"), - ("decoder_embed.weight", "decoder.decoder_embeddings.weight"), - ("decoder_embed.bias", "decoder.decoder_embeddings.bias") - ] - ) - for i in range(config.decoder_depth): - rename_keys.extend( - [ - (f"decoder_blocks.{i}.norm1.weight", f"decoder.decoder_block.layers.{i}.layernorm_before.weight"), - (f"decoder_blocks.{i}.norm1.bias", f"decoder.decoder_block.layers.{i}.layernorm_before.bias"), - (f"decoder_blocks.{i}.attn.qkv.weight", f"decoder.decoder_block.layers.{i}.attn.qkv.weight"), - (f"decoder_blocks.{i}.attn.qkv.bias", f"decoder.decoder_block.layers.{i}.attn.qkv.bias"), - (f"decoder_blocks.{i}.attn.proj.weight", f"decoder.decoder_block.layers.{i}.attn.proj.weight"), - (f"decoder_blocks.{i}.attn.proj.bias", f"decoder.decoder_block.layers.{i}.attn.proj.bias"), - (f"decoder_blocks.{i}.norm2.weight", f"decoder.decoder_block.layers.{i}.layernorm_after.weight"), - (f"decoder_blocks.{i}.norm2.bias", f"decoder.decoder_block.layers.{i}.layernorm_after.bias"), - (f"decoder_blocks.{i}.mlp.fc1.weight", f"decoder.decoder_block.layers.{i}.mlp.fc1.weight"), - (f"decoder_blocks.{i}.mlp.fc1.bias", f"decoder.decoder_block.layers.{i}.mlp.fc1.bias"), - (f"decoder_blocks.{i}.mlp.fc2.weight", f"decoder.decoder_block.layers.{i}.mlp.fc2.weight"), - (f"decoder_blocks.{i}.mlp.fc2.bias", f"decoder.decoder_block.layers.{i}.mlp.fc2.bias"), - ] - ) - for i in 
range(config.num_query_pool): - rename_keys.extend( - [ - (f"multi_scale_fusion_heads.{i}.weight", f"multiscale_fusion.multi_scale_fusion_heads.{i}.weight"), - (f"multi_scale_fusion_heads.{i}.bias", f"multiscale_fusion.multi_scale_fusion_heads.{i}.bias") - ] - ) - else: - # layernorm + classification head - rename_keys.extend( - [ - ("norm.weight", "hiera.pooler.layernorm.weight"), - ("norm.bias", "hiera.pooler.layernorm.bias"), - ("head.projection.weight", "classifier.weight"), - ("head.projection.bias", "classifier.bias"), - ] - ) - # fmt: on - return rename_keys - - -def remove_classification_head_(state_dict): - ignore_keys = ["head.projection.weight", "head.projection.bias"] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -def get_labels_for_classifier(model_name: str) -> tuple[dict[int, str], dict[str, int], int]: - repo_id = "huggingface/label-files" - - filename = "imagenet-1k-id2label.json" - - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - label2id = {v: k for k, v in id2label.items()} - num_labels = len(id2label) - - return id2label, label2id, num_labels - - -def get_hiera_config(model_name: str, base_model: bool, mae_model: bool) -> HieraConfig: - if model_name == "hiera-tiny-224": - config = HieraConfig(depths=[1, 2, 7, 2]) - elif model_name == "hiera-small-224": - config = HieraConfig(depths=[1, 2, 11, 2]) - elif model_name == "hiera-base-224": - config = HieraConfig() - elif model_name == "hiera-base-plus-224": - config = HieraConfig(embed_dim=112, num_heads=[2, 4, 8, 16]) - elif model_name == "hiera-large-224": - config = HieraConfig(embed_dim=144, num_heads=[2, 4, 8, 16], depths=[2, 6, 36, 4]) - elif model_name == "hiera-huge-224": - config = HieraConfig(embed_dim=256, num_heads=[4, 8, 16, 32], depths=[2, 6, 36, 4]) - else: - raise ValueError(f"Unrecognized model name: {model_name}") - - if base_model: - pass - elif mae_model: - config.num_query_pool = 2 - config.decoder_hidden_size = 512 - config.decoder_depth = 8 - config.decoder_num_heads = 16 - # Table 3b from Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles - config.mask_ratio = 0.6 - else: - id2label, label2id, num_labels = get_labels_for_classifier(model_name) - config.id2label = id2label - config.label2id = label2id - config.num_labels = num_labels - - return config - - -@torch.no_grad() -def convert_hiera_checkpoint(args): - model_name = args.model_name - base_model = args.base_model - pytorch_dump_folder_path = args.pytorch_dump_folder_path - push_to_hub = args.push_to_hub - mae_model = args.mae_model - - config = get_hiera_config(model_name, base_model, mae_model) - - # Load original hiera model - original_model_name = model_name.replace("-", "_") - original_model_name = f"mae_{original_model_name}" if mae_model else original_model_name - - original_checkpoint_name = "mae_in1k_ft_in1k" if not (base_model or mae_model) else "mae_in1k" - - original_model = torch.hub.load( - "facebookresearch/hiera", - model=original_model_name, - pretrained=True, - checkpoint=original_checkpoint_name, - ) - - original_model.eval() - original_state_dict = original_model.state_dict() - # Don't need to remove head for MAE because 
original implementation doesn't have it on MAE - if base_model: - remove_classification_head_(original_state_dict) - - # # Rename keys - new_state_dict = original_state_dict.copy() - rename_keys = create_rename_keys(config, base_model, mae_model) - - for src, dest in rename_keys: - rename_key(new_state_dict, src, dest) - - # Load HF hiera model - if base_model: - model = HieraModel(config) - elif mae_model: - model = HieraForPreTraining(config) - else: - model = HieraForImageClassification(config) - - model.eval() - - missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) - - input_image = prepare_img() - - original_image_preprocessor = transforms.Compose( - [ - transforms.Resize(int((256 / 224) * 224), interpolation=transforms.functional.InterpolationMode.BICUBIC), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), - ] - ) - - image_processor = BitImageProcessor( - image_mean=IMAGENET_DEFAULT_MEAN, image_std=IMAGENET_DEFAULT_STD, size={"shortest_edge": 256} - ) - inputs = image_processor(images=input_image, return_tensors="pt") - - expected_pixel_values = original_image_preprocessor(input_image).unsqueeze(0) - - input_image = prepare_img() - - inputs = image_processor(images=input_image, return_tensors="pt") - expected_pixel_values = original_image_preprocessor(input_image).unsqueeze(0) - assert torch.allclose(inputs.pixel_values, expected_pixel_values, atol=1e-4) - print("Pixel values look good!") - print(f"{inputs.pixel_values[0, :3, :3, :3]=}") - - # If is MAE we pass a noise to generate a random mask - mask_spatial_shape = [ - i // s // ms for i, s, ms in zip(config.image_size, config.patch_stride, config.masked_unit_size) - ] - num_windows = math.prod(mask_spatial_shape) - torch.manual_seed(2) - noise = torch.rand(1, num_windows) - outputs = model(**inputs) if not mae_model else model(noise=noise, **inputs) - # original implementation returns logits.softmax(dim=-1) - - if base_model: - expected_prob, expected_intermediates = original_model(expected_pixel_values, return_intermediates=True) - expected_last_hidden = expected_intermediates[-1] - batch_size, _, _, hidden_dim = expected_last_hidden.shape - expected_last_hidden = expected_last_hidden.reshape(batch_size, -1, hidden_dim) - assert torch.allclose(outputs.last_hidden_state, expected_last_hidden, atol=1e-3) - print("Base Model looks good as hidden states match original implementation!") - print(f"{outputs.last_hidden_state[0, :3, :3]=}") - elif mae_model: - # get mask from noise to be able to compare outputs - mask, _ = model.hiera.embeddings.patch_embeddings.random_masking(expected_pixel_values, noise) - expected_loss, _, _, _ = original_model(expected_pixel_values, mask=mask.bool()) - assert torch.allclose(outputs.loss, expected_loss, atol=1e-3) - print("MAE Model looks good as loss matches original implementation!") - else: - expected_prob = original_model(expected_pixel_values) - assert torch.allclose(outputs.logits.softmax(dim=-1), expected_prob, atol=1e-3) - print("Classifier looks good as probs match original implementation") - print(f"{outputs.logits[:, :5]=}") - - if pytorch_dump_folder_path is not None: - print(f"Saving model and processor for {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - image_processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - hub_name = 
model_name - if base_model: - hub_name = model_name - elif mae_model: - hub_name = f"{model_name}-mae" - else: - hub_name = f"{model_name}-in1k" - repo_id = f"EduardoPacheco/{hub_name}" - print(f"Pushing model and processor for {model_name} to hub at {repo_id}") - model.push_to_hub(repo_id) - image_processor.push_to_hub(repo_id) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model-name", - default="hiera-tiny-224", - type=str, - choices=[ - "hiera-tiny-224", - "hiera-small-224", - "hiera-base-224", - "hiera-base-plus-224", - "hiera-large-224", - "hiera-huge-224", - ], - help="Name of the Hiera model you'd like to convert.", - ) - parser.add_argument( - "--pytorch-dump-folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--verify-logits", - action="store_true", - help="Whether or not to verify the logits against the original implementation.", - ) - parser.add_argument( - "--push-to-hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." - ) - parser.add_argument( - "--base-model", - action="store_true", - help="Whether to only convert the base model (no projection head weights).", - ) - parser.add_argument( - "--mae-model", action="store_true", help="Whether to convert to MAE checkpoint to HieraForPreTraining." - ) - - args = parser.parse_args() - convert_hiera_checkpoint(args) diff --git a/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py b/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py deleted file mode 100644 index f5914f35c546..000000000000 --- a/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py +++ /dev/null @@ -1,222 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Convert Hubert checkpoint.""" - -import argparse - -import torch -from s3prl.hub import distilhubert - -from transformers import HubertConfig, HubertModel, Wav2Vec2FeatureExtractor, logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -MAPPING = { - "post_extract_proj": "feature_projection.projection", - "encoder.pos_conv.0": "encoder.pos_conv_embed.conv", - "self_attn.k_proj": "encoder.layers.*.attention.k_proj", - "self_attn.v_proj": "encoder.layers.*.attention.v_proj", - "self_attn.q_proj": "encoder.layers.*.attention.q_proj", - "self_attn.out_proj": "encoder.layers.*.attention.out_proj", - "self_attn_layer_norm": "encoder.layers.*.layer_norm", - "fc1": "encoder.layers.*.feed_forward.intermediate_dense", - "fc2": "encoder.layers.*.feed_forward.output_dense", - "final_layer_norm": "encoder.layers.*.final_layer_norm", - "encoder.layer_norm": "encoder.layer_norm", - "mask_emb": "masked_spec_embed", -} - - -def set_recursively(hf_pointer, key, value, full_name, weight_type): - for attribute in key.split("."): - hf_pointer = getattr(hf_pointer, attribute) - - if weight_type is not None: - hf_shape = getattr(hf_pointer, weight_type).shape - else: - hf_shape = hf_pointer.shape - - assert hf_shape == value.shape, ( - f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be" - f" {value.shape} for {full_name}" - ) - - if weight_type == "weight": - hf_pointer.weight.data = value - elif weight_type == "weight_g": - hf_pointer.weight_g.data = value - elif weight_type == "weight_v": - hf_pointer.weight_v.data = value - elif weight_type == "bias": - hf_pointer.bias.data = value - else: - hf_pointer.data = value - - logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.") - - -def recursively_load_weights(fairseq_model, hf_model): - unused_weights = [] - fairseq_dict = fairseq_model.state_dict() - - feature_extractor = hf_model.feature_extractor - - for name, value in fairseq_dict.items(): - is_used = False - if "conv_layers" in name: - load_conv_layer( - name, - value, - feature_extractor, - unused_weights, - hf_model.config.feat_extract_norm == "group", - ) - is_used = True - else: - for key, mapped_key in MAPPING.items(): - mapped_key = mapped_key - - if key in name: - is_used = True - if "*" in mapped_key: - layer_index = name.split(key)[0].split(".")[-2] - mapped_key = mapped_key.replace("*", layer_index) - if "weight_g" in name: - weight_type = "weight_g" - elif "weight_v" in name: - weight_type = "weight_v" - elif "weight" in name: - weight_type = "weight" - elif "bias" in name: - weight_type = "bias" - else: - weight_type = None - set_recursively(hf_model, mapped_key, value, name, weight_type) - continue - if not is_used: - unused_weights.append(name) - - logger.warning(f"Unused weights: {unused_weights}") - - -def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm): - name = full_name.split("conv_layers.")[-1] - items = name.split(".") - layer_id = int(items[0]) - type_id = int(items[1]) - - if type_id == 0: - if "bias" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape, ( - f"{full_name} has size {value.shape}, but" - f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found." 
- ) - feature_extractor.conv_layers[layer_id].conv.bias.data = value - logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.") - elif "weight" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape, ( - f"{full_name} has size {value.shape}, but" - f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found." - ) - feature_extractor.conv_layers[layer_id].conv.weight.data = value - logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.") - elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm): - if "bias" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape, ( - f"{full_name} has size {value.shape}, but {feature_extractor[layer_id].layer_norm.bias.data.shape} was" - " found." - ) - feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value - logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.") - elif "weight" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape, ( - f"{full_name} has size {value.shape}, but" - f" {feature_extractor[layer_id].layer_norm.weight.data.shape} was found." - ) - feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value - logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.") - else: - unused_weights.append(full_name) - - -def convert_config(model): - config = HubertConfig() - fs_config = model.config - - config.activation_dropout = fs_config.activation_dropout - config.apply_spec_augment = False - config.attention_dropout = fs_config.attention_dropout - config.conv_bias = False - conv_layers = eval(fs_config.extractor_conv_feature_layers) - config.conv_dim = [x[0] for x in conv_layers] - config.conv_kernel = [x[1] for x in conv_layers] - config.conv_stride = [x[2] for x in conv_layers] - config.feat_extract_activation = "gelu" - config.feat_extract_norm = "layer" if fs_config.extractor_mode == "layer_norm" else "group" - config.feat_proj_layer_norm = False - config.feat_proj_dropout = 0.0 - config.final_dropout = 0.0 - config.hidden_act = fs_config.activation_fn - config.hidden_dropout = fs_config.dropout - config.hidden_size = fs_config.encoder_embed_dim - config.initializer_range = 0.02 - config.intermediate_size = fs_config.encoder_ffn_embed_dim - config.layer_norm_eps = 1e-5 - config.layerdrop = 0.0 - config.num_attention_heads = fs_config.encoder_attention_heads - config.num_conv_pos_embedding_groups = fs_config.conv_pos_groups - config.num_conv_pos_embeddings = fs_config.conv_pos - config.num_feat_extract_layers = len(conv_layers) - config.num_hidden_layers = fs_config.encoder_layers - - return config - - -@torch.no_grad() -def convert_hubert_checkpoint(pytorch_dump_folder_path, config_path=None): - """ - Copy/paste/tweak model's weights to transformers design. 
- """ - model = distilhubert().model.model - - if config_path is not None: - config = HubertConfig.from_pretrained(config_path) - else: - config = convert_config(model) - model = model.eval() - - feature_extractor = Wav2Vec2FeatureExtractor( - feature_size=1, - sampling_rate=16000, - padding_value=0, - do_normalize=False, - return_attention_mask=False, - ) - hf_model = HubertModel(config) - - recursively_load_weights(model, hf_model) - - feature_extractor.save_pretrained(pytorch_dump_folder_path) - hf_model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - args = parser.parse_args() - convert_hubert_checkpoint(args.pytorch_dump_folder_path, args.config_path) diff --git a/src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index a0e0b5cd566b..000000000000 --- a/src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,261 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Hubert checkpoint.""" - -import argparse -import json -import os - -import fairseq -import torch -from fairseq.data import Dictionary - -from transformers import ( - HubertConfig, - HubertForCTC, - HubertModel, - Wav2Vec2CTCTokenizer, - Wav2Vec2FeatureExtractor, - Wav2Vec2Processor, - logging, -) - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -MAPPING = { - "post_extract_proj": "feature_projection.projection", - "encoder.pos_conv.0": "encoder.pos_conv_embed.batch_norm", - "encoder.pos_conv.1": "encoder.pos_conv_embed.conv", - "self_attn.k_proj": "encoder.layers.*.attention.k_proj", - "self_attn.v_proj": "encoder.layers.*.attention.v_proj", - "self_attn.q_proj": "encoder.layers.*.attention.q_proj", - "self_attn.out_proj": "encoder.layers.*.attention.out_proj", - "self_attn_layer_norm": "encoder.layers.*.layer_norm", - "fc1": "encoder.layers.*.feed_forward.intermediate_dense", - "fc2": "encoder.layers.*.feed_forward.output_dense", - "final_layer_norm": "encoder.layers.*.final_layer_norm", - "encoder.layer_norm": "encoder.layer_norm", - "w2v_model.layer_norm": "feature_projection.layer_norm", - "w2v_encoder.proj": "lm_head", - "mask_emb": "masked_spec_embed", -} - - -def set_recursively(hf_pointer, key, value, full_name, weight_type): - for attribute in key.split("."): - hf_pointer = getattr(hf_pointer, attribute) - - if weight_type is not None: - hf_shape = getattr(hf_pointer, weight_type).shape - else: - hf_shape = hf_pointer.shape - - assert hf_shape == value.shape, ( - f"Shape of hf {key + '.' 
+ weight_type if weight_type is not None else ''} is {hf_shape}, but should be" - f" {value.shape} for {full_name}" - ) - - if weight_type == "weight": - hf_pointer.weight.data = value - elif weight_type == "weight_g": - hf_pointer.weight_g.data = value - elif weight_type == "weight_v": - hf_pointer.weight_v.data = value - elif weight_type == "bias": - hf_pointer.bias.data = value - elif weight_type == "running_mean": - hf_pointer.running_mean.data = value - elif weight_type == "running_var": - hf_pointer.running_var.data = value - elif weight_type == "num_batches_tracked": - hf_pointer.num_batches_tracked.data = value - else: - hf_pointer.data = value - - logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.") - - -def recursively_load_weights(fairseq_model, hf_model, is_finetuned): - unused_weights = [] - fairseq_dict = fairseq_model.state_dict() - - feature_extractor = hf_model.hubert.feature_extractor if is_finetuned else hf_model.feature_extractor - - for name, value in fairseq_dict.items(): - is_used = False - if "conv_layers" in name: - load_conv_layer( - name, - value, - feature_extractor, - unused_weights, - hf_model.config.feat_extract_norm == "group", - ) - is_used = True - else: - for key, mapped_key in MAPPING.items(): - mapped_key = "hubert." + mapped_key if (is_finetuned and mapped_key != "lm_head") else mapped_key - - if key in name or (key.split("w2v_model.")[-1] == name.split(".")[0] and not is_finetuned): - is_used = True - if "*" in mapped_key: - layer_index = name.split(key)[0].split(".")[-2] - mapped_key = mapped_key.replace("*", layer_index) - if "weight_g" in name: - weight_type = "weight_g" - elif "weight_v" in name: - weight_type = "weight_v" - elif "weight" in name: - weight_type = "weight" - elif "bias" in name: - weight_type = "bias" - elif "running_mean" in name: - weight_type = "running_mean" - elif "running_var" in name: - weight_type = "running_var" - elif "num_batches_tracked" in name: - weight_type = "num_batches_tracked" - else: - weight_type = None - set_recursively(hf_model, mapped_key, value, name, weight_type) - continue - if not is_used: - unused_weights.append(name) - - logger.warning(f"Unused weights: {unused_weights}") - - -def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm): - name = full_name.split("conv_layers.")[-1] - items = name.split(".") - layer_id = int(items[0]) - type_id = int(items[1]) - - if type_id == 0: - if "bias" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape, ( - f"{full_name} has size {value.shape}, but" - f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found." - ) - feature_extractor.conv_layers[layer_id].conv.bias.data = value - logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.") - elif "weight" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape, ( - f"{full_name} has size {value.shape}, but" - f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found." 
- ) - feature_extractor.conv_layers[layer_id].conv.weight.data = value - logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.") - elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm): - if "bias" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape, ( - f"{full_name} has size {value.shape}, but {feature_extractor[layer_id].layer_norm.bias.data.shape} was" - " found." - ) - feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value - logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.") - elif "weight" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape, ( - f"{full_name} has size {value.shape}, but" - f" {feature_extractor[layer_id].layer_norm.weight.data.shape} was found." - ) - feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value - logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.") - else: - unused_weights.append(full_name) - - -@torch.no_grad() -def convert_hubert_checkpoint( - checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True -): - """ - Copy/paste/tweak model's weights to transformers design. - """ - if config_path is not None: - config = HubertConfig.from_pretrained(config_path) - else: - config = HubertConfig() - - if is_finetuned: - if dict_path: - target_dict = Dictionary.load(dict_path) - - # important change bos & pad token id since CTC symbol is <pad> and - # not <s> as in fairseq - config.bos_token_id = target_dict.pad_index - config.pad_token_id = target_dict.bos_index - config.eos_token_id = target_dict.eos_index - config.vocab_size = len(target_dict.symbols) - vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json") - if not os.path.isdir(pytorch_dump_folder_path): - logger.error(f"--pytorch_dump_folder_path ({pytorch_dump_folder_path}) should be a directory") - return - os.makedirs(pytorch_dump_folder_path, exist_ok=True) - with open(vocab_path, "w", encoding="utf-8") as vocab_handle: - json.dump(target_dict.indices, vocab_handle) - tokenizer = Wav2Vec2CTCTokenizer( - vocab_path, - unk_token=target_dict.unk_word, - pad_token=target_dict.pad_word, - bos_token=target_dict.bos_word, - eos_token=target_dict.eos_word, - word_delimiter_token="|", - do_lower_case=False, - ) - return_attention_mask = config.feat_extract_norm == "layer" - feature_extractor = Wav2Vec2FeatureExtractor( - feature_size=1, - sampling_rate=16000, - padding_value=0, - do_normalize=True, - return_attention_mask=return_attention_mask, - ) - processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) - processor.save_pretrained(pytorch_dump_folder_path) - - hf_wav2vec = HubertForCTC(config) - else: - hf_wav2vec = HubertModel(config) - - if is_finetuned: - model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task( - [checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1])} - ) - else: - model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path]) - - model = model[0].eval() - - recursively_load_weights(model, hf_wav2vec, is_finetuned) - - hf_wav2vec.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") -
parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint") - parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - parser.add_argument( - "--not_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not" - ) - args = parser.parse_args() - convert_hubert_checkpoint( - args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, not args.not_finetuned - ) diff --git a/src/transformers/models/hubert/convert_hubert_original_s3prl_checkpoint_to_pytorch.py b/src/transformers/models/hubert/convert_hubert_original_s3prl_checkpoint_to_pytorch.py deleted file mode 100644 index c66c41ce36b5..000000000000 --- a/src/transformers/models/hubert/convert_hubert_original_s3prl_checkpoint_to_pytorch.py +++ /dev/null @@ -1,68 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Hubert checkpoint.""" - -import argparse - -import torch - -from transformers import HubertConfig, HubertForSequenceClassification, Wav2Vec2FeatureExtractor, logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -SUPPORTED_MODELS = ["UtteranceLevel"] - - -@torch.no_grad() -def convert_s3prl_checkpoint(base_model_name, config_path, checkpoint_path, model_dump_path): - """ - Copy/paste/tweak model's weights to transformers design. - """ - checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=True) - if checkpoint["Config"]["downstream_expert"]["modelrc"]["select"] not in SUPPORTED_MODELS: - raise NotImplementedError(f"The supported s3prl models are {SUPPORTED_MODELS}") - - downstream_dict = checkpoint["Downstream"] - - hf_congfig = HubertConfig.from_pretrained(config_path) - hf_model = HubertForSequenceClassification.from_pretrained(base_model_name, config=hf_congfig) - hf_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( - base_model_name, return_attention_mask=True, do_normalize=False - ) - - if hf_congfig.use_weighted_layer_sum: - hf_model.layer_weights.data = checkpoint["Featurizer"]["weights"] - - hf_model.projector.weight.data = downstream_dict["projector.weight"] - hf_model.projector.bias.data = downstream_dict["projector.bias"] - hf_model.classifier.weight.data = downstream_dict["model.post_net.linear.weight"] - hf_model.classifier.bias.data = downstream_dict["model.post_net.linear.bias"] - - hf_feature_extractor.save_pretrained(model_dump_path) - hf_model.save_pretrained(model_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--base_model_name", default=None, type=str, help="Name of the huggingface pretrained base model." 
- ) - parser.add_argument("--config_path", default=None, type=str, help="Path to the huggingface classifier config.") - parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to the s3prl checkpoint.") - parser.add_argument("--model_dump_path", default=None, type=str, help="Path to the final converted model.") - args = parser.parse_args() - convert_s3prl_checkpoint(args.base_model_name, args.config_path, args.checkpoint_path, args.model_dump_path) diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index f2fb135a4f4e..d2d5db61f739 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -485,7 +485,7 @@ def __init__( num_heads: int, dropout: float = 0.0, is_cross_attention: bool = False, - config: PretrainedConfig = None, + config: Optional[PretrainedConfig] = None, qk_layer_norms: bool = False, layer_idx: Optional[int] = None, ): @@ -997,7 +997,7 @@ def forward( elif position_ids is None: position_ids = cache_position.unsqueeze(0) - if sum([x is None for x in [pixel_values, image_encoder_embeddings, perceiver_embeddings]]) != 2: + if sum(x is None for x in [pixel_values, image_encoder_embeddings, perceiver_embeddings]) != 2: raise ValueError( "Exactly 1 of pixel_values, image_encoder_embeddings or perceiver_embeddings has to be not-None." ) diff --git a/src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py b/src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py deleted file mode 100644 index ea44ee11e58c..000000000000 --- a/src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import copy - -import torch -from accelerate import init_empty_weights - -from transformers import ( - AutoConfig, - AutoModelForCausalLM, - AutoTokenizer, - Idefics2Config, - Idefics2ForConditionalGeneration, - Idefics2ImageProcessor, - Idefics2Processor, - MistralConfig, -) - - -EPILOG_TXT = """Example: - python transformers/src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py --original_model_id HuggingFaceM4/idefics2-8b --output_hub_path org/idefics2 -""" - - -KEYS_TO_MODIFY_MAPPING = { - "lm_head.weight": "lm_head.linear.weight", - "model.layers": "model.text_model.layers", - "model.norm": "model.text_model.norm", - "model.perceiver_resampler": "model.connector.perceiver_resampler", - "model.modality_projection": "model.connector.modality_projection", -} - - -WEIGHTS_TO_MERGE_MAPPING = ( - # (weights to merge in merging order), (new weight name) - ( - ("model.embed_tokens.weight", "model.embed_tokens.additional_embedding.weight"), - "model.text_model.embed_tokens.weight", - ), - (("lm_head.linear.weight", "additional_fc.weight"), "lm_head.weight"), -) - - -def convert_state_dict_to_hf(state_dict): - new_state_dict = {} - for key, value in state_dict.items(): - if key.endswith(".inv_freq"): - continue - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - new_state_dict[key] = value - return new_state_dict - - -def merge_weights(state_dict): - new_state_dict = copy.deepcopy(state_dict) - - # Merge the weights - for weights_to_merge, new_weight_name in WEIGHTS_TO_MERGE_MAPPING: - for weight in weights_to_merge: - assert weight in state_dict, f"Weight {weight} is missing in the state dict" - if new_weight_name not in new_state_dict: - new_state_dict[new_weight_name] = [state_dict[weight]] - else: - new_state_dict[new_weight_name].append(state_dict[weight]) - new_state_dict[new_weight_name] = torch.cat(new_state_dict[new_weight_name], dim=0) - - # Remove the weights that were merged - for weights_to_merge, new_weight_name in WEIGHTS_TO_MERGE_MAPPING: - for weight in weights_to_merge: - if weight in new_state_dict and weight != new_weight_name: - new_state_dict.pop(weight) - - return new_state_dict - - -def get_config(checkpoint): - if checkpoint == "HuggingFaceM4/idefics2": - # We load the config then recreate to use the text_config - config = AutoConfig.from_pretrained(checkpoint) - text_config = MistralConfig( - vocab_size=config.vocab_size + config.additional_vocab_size, - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - num_hidden_layers=config.num_hidden_layers, - num_attention_heads=config.num_attention_heads, - num_key_value_heads=config.num_key_value_heads, - hidden_act=config.hidden_act, - max_position_embeddings=config.max_position_embeddings, - initializer_range=config.initializer_range, - rms_norm_eps=config.rms_norm_eps, - tie_word_embeddings=config.tie_word_embeddings, - rope_theta=config.rope_theta, - sliding_window=config.sliding_window, - attention_dropout=config.attention_dropout, - pad_token_id=config.pad_token_id, - bos_token_id=config.bos_token_id, - eos_token_id=config.eos_token_id, - ) - perceiver_config = config.perceiver_config.to_dict() - config = Idefics2Config( - text_config=text_config.to_dict(), - vision_config=config.vision_config, - perceiver_config=perceiver_config, - use_cache=config.use_cache, - image_token_id=config.image_token_id, - tie_word_embeddings=config.tie_word_embeddings, - ) - return config - - 
return AutoConfig.from_pretrained(checkpoint) - - -def convert_idefics2_hub_to_hf(original_model_id, output_hub_path, push_to_hub): - # The original model maps to AutoModelForCausalLM, converted we map to Idefics2ForConditionalGeneration - original_model = AutoModelForCausalLM.from_pretrained(original_model_id, trust_remote_code=True) - # The original model doesn't use the idefics2 processing objects - image_seq_len = original_model.config.perceiver_config.resampler_n_latents - image_processor = Idefics2ImageProcessor() - tokenizer = AutoTokenizer.from_pretrained(original_model_id) - processor = Idefics2Processor( - image_processor=image_processor, - tokenizer=tokenizer, - image_seq_len=image_seq_len, - ) - state_dict = original_model.state_dict() - state_dict = convert_state_dict_to_hf(state_dict) - - # Merge weights - state_dict = merge_weights(state_dict) - - config = get_config(original_model_id) - - with init_empty_weights(): - model = Idefics2ForConditionalGeneration(config) - - model.load_state_dict(state_dict, strict=True, assign=True) - - model.save_pretrained(output_hub_path) - processor.save_pretrained(output_hub_path) - - if push_to_hub: - model.push_to_hub(output_hub_path, private=True) - processor.push_to_hub(output_hub_path, private=True) - - -def main(): - parser = argparse.ArgumentParser( - epilog=EPILOG_TXT, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - parser.add_argument( - "--original_model_id", - help="Hub location of the text model", - ) - parser.add_argument( - "--output_hub_path", - help="Location on the hub of the converted model", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="If set, the model will be pushed to the hub after conversion.", - ) - args = parser.parse_args() - convert_idefics2_hub_to_hf(args.original_model_id, args.output_hub_path, args.push_to_hub) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py b/src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py deleted file mode 100644 index 591a7dbd757a..000000000000 --- a/src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py +++ /dev/null @@ -1,214 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import json - -import torch -from accelerate import init_empty_weights -from huggingface_hub import hf_hub_download - -from transformers import ( - AutoModelForCausalLM, - AutoTokenizer, - Idefics3Config, - Idefics3ForConditionalGeneration, - Idefics3ImageProcessor, - Idefics3Processor, - LlamaConfig, -) - - -EPILOG_TXT = """Example: - python transformers/src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py --original_model_id HuggingFaceM4/Idefics3-8B-Llama3 --output_hub_path org/idefics3 -""" - - -KEYS_TO_MODIFY_MAPPING = { - "lm_head.weight": "lm_head.linear.weight", - "model.layers": "model.text_model.layers", - "model.norm": "model.text_model.norm", - "model.modality_projection": "model.connector.modality_projection", -} - - -WEIGHTS_TO_MERGE_MAPPING = ( - # (weights to merge in merging order), (new weight name) - ( - ("model.embed_tokens.weight", "model.embed_tokens.additional_embedding.weight"), - "model.text_model.embed_tokens.weight", - ), - (("lm_head.linear.weight", "additional_fc.weight"), "lm_head.weight"), -) - -WEIGHTS_TO_DROP = ( - # The original model had a vision head, but this is never used - "model.vision_model.head", -) - - -def convert_state_dict_to_hf(state_dict): - new_state_dict = {} - old_state_dict_keys = set(state_dict.keys()) - - # Flattened list of weights to merge. We keep these in the original state dict to merge them later - original_weights_to_merge = [w for weights in WEIGHTS_TO_MERGE_MAPPING for w in weights[0]] - - # for key, value in state_dict.items(): - for old_key in old_state_dict_keys: - if old_key.endswith(".inv_freq") or any(w in old_key for w in WEIGHTS_TO_DROP): - state_dict.pop(old_key) - continue - - key = old_key - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - weight = state_dict.pop(old_key) - if key in original_weights_to_merge: - new_state_dict[key] = weight - # Bit of a hack - we need to keep the original weights to merge them later - state_dict[key] = weight - else: - new_state_dict[key] = weight - - return new_state_dict - - -def merge_weights(state_dict, new_state_dict): - old_weight_names = set(state_dict.keys()) - - # Merge the weights - for weights_to_merge, new_weight_name in WEIGHTS_TO_MERGE_MAPPING: - for weight_to_merge in weights_to_merge: - print(weight_to_merge) - assert weight_to_merge in state_dict, f"Weight {weight_to_merge} is missing in the state dict" - - weight = state_dict.pop(weight_to_merge) - if new_weight_name not in new_state_dict: - new_state_dict[new_weight_name] = [weight] - else: - new_state_dict[new_weight_name].append(weight) - - old_weight_names.remove(weight_to_merge) - - new_state_dict[new_weight_name] = torch.cat(new_state_dict[new_weight_name], dim=0) - - # Remove the weights that were merged - for weights_to_merge, new_weight_name in WEIGHTS_TO_MERGE_MAPPING: - for weight in weights_to_merge: - if weight in new_state_dict and weight != new_weight_name: - new_state_dict.pop(weight) - - return new_state_dict - - -def get_config(checkpoint): - # We load the config then recreate to use the text_config - - # download the config file - filepath = hf_hub_download(repo_id=checkpoint, filename="config.json") - with open(filepath, "r") as f: - config_json = json.load(f) - - # Setup the vision config - vision_config = config_json.pop("vision_config") - vision_config.pop("vision_model_name", None) - if "embed_dim" in vision_config: - vision_config["hidden_size"] = 
vision_config.pop("embed_dim") - - config_json["vocab_size"] = config_json.pop("vocab_size") + config_json.pop("additional_vocab_size") - - image_token_id = config_json.pop("image_token_id", config_json["vocab_size"] - 2) - use_cache = config_json.pop("use_cache", True) - tie_word_embeddings = config_json.pop("tie_word_embeddings", True) - scale_factor = config_json.pop("scale_factor", 2) - vocab_size = config_json.pop("vocab_size", 100000) - - # Remove "freeze" params from the config - config_json = {k: v for k, v in config_json.items() if not k.startswith("freeze_")} - text_config = LlamaConfig(**config_json) - - config = Idefics3Config( - text_config=text_config, - vision_config=vision_config, - use_cache=use_cache, - image_token_id=image_token_id, - tie_word_embeddings=tie_word_embeddings, - scale_factor=scale_factor, - vocab_size=vocab_size, - ) - return config - - -def convert_idefics3_hub_to_hf(original_model_id, output_hub_path, push_to_hub): - # The original model maps to AutoModelForCausalLM, converted we map to Idefics3ForConditionalGeneration - original_model = AutoModelForCausalLM.from_pretrained( - original_model_id, trust_remote_code=True, dtype=torch.bfloat16 - ) - # The original model doesn't use the Idefics3 processing objects - image_processor = Idefics3ImageProcessor() - tokenizer = AutoTokenizer.from_pretrained(original_model_id) - processor = Idefics3Processor( - image_processor=image_processor, - tokenizer=tokenizer, - ) - state_dict = original_model.state_dict() - new_state_dict = convert_state_dict_to_hf(state_dict) - - # Merge weights - new_state_dict = merge_weights(state_dict, new_state_dict) - del state_dict - - config = get_config(original_model_id) - print(config) - - with init_empty_weights(): - model = Idefics3ForConditionalGeneration(config) - - model.load_state_dict(new_state_dict, strict=True, assign=True) - - model.save_pretrained(output_hub_path) - processor.save_pretrained(output_hub_path) - - if push_to_hub: - model.push_to_hub(output_hub_path, private=True) - processor.push_to_hub(output_hub_path, private=True) - - -def main(): - parser = argparse.ArgumentParser( - epilog=EPILOG_TXT, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - parser.add_argument( - "--original_model_id", - help="Hub location of the text model", - ) - parser.add_argument( - "--output_hub_path", - help="Location on the hub of the converted model", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="If set, the model will be pushed to the hub after conversion.", - ) - args = parser.parse_args() - convert_idefics3_hub_to_hf(args.original_model_id, args.output_hub_path, args.push_to_hub) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py index ab9eaac8e8b2..00ee8df6d414 100644 --- a/src/transformers/models/idefics3/processing_idefics3.py +++ b/src/transformers/models/idefics3/processing_idefics3.py @@ -108,9 +108,6 @@ class Idefics3ProcessorKwargs(ProcessingKwargs, total=False): } -Idefics3ProcessorKwargs.__annotations__["images_kwargs"] = Idefics3ImagesKwargs # python 3.8 compatibility - - class Idefics3Processor(ProcessorMixin): r""" Constructs a Idefics3 processor which wraps a LLama tokenizer and Idefics3 image processor into a single processor. 
diff --git a/src/transformers/models/ijepa/convert_ijepa_to_hf.py b/src/transformers/models/ijepa/convert_ijepa_to_hf.py deleted file mode 100644 index 25d97df6ce8f..000000000000 --- a/src/transformers/models/ijepa/convert_ijepa_to_hf.py +++ /dev/null @@ -1,268 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert IJEPA checkpoints from the original repository. - -URL: https://github.com/facebookresearch/ijepa -""" - -import argparse -import gc -import re -from pathlib import Path -from typing import Optional - -import requests -import torch -from PIL import Image - -from transformers import ( - IJepaConfig, - IJepaModel, - ViTImageProcessor, -) -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -# fmt: off -ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - # Projection layer + position embeddings - r"pos_embed": r"embeddings.position_embeddings", - r"patch_embed.proj.weight": r"embeddings.patch_embeddings.projection.weight", - r"patch_embed.proj.bias": r"embeddings.patch_embeddings.projection.bias", - - # Encoder layers: Layernorms, Attention, Feedforward layers - r"blocks.(\d+).norm1.weight": r"encoder.layer.\1.layernorm_before.weight", - r"blocks.(\d+).norm1.bias": r"encoder.layer.\1.layernorm_before.bias", - r"blocks.(\d+).attn.proj.weight": r"encoder.layer.\1.attention.output.dense.weight", - r"blocks.(\d+).attn.proj.bias": r"encoder.layer.\1.attention.output.dense.bias", - r"blocks.(\d+).norm2.weight": r"encoder.layer.\1.layernorm_after.weight", - r"blocks.(\d+).norm2.bias": r"encoder.layer.\1.layernorm_after.bias", - r"blocks.(\d+).mlp.fc1.weight": r"encoder.layer.\1.intermediate.dense.weight", - r"blocks.(\d+).mlp.fc1.bias": r"encoder.layer.\1.intermediate.dense.bias", - r"blocks.(\d+).mlp.fc2.weight": r"encoder.layer.\1.output.dense.weight", - r"blocks.(\d+).mlp.fc2.bias": r"encoder.layer.\1.output.dense.bias", - - # Layernorm + pooler - r"norm.weight": r"layernorm.weight", - r"norm.bias": r"layernorm.bias", -} -# fmt: on - - -def convert_old_keys_to_new_keys(state_dict_keys: Optional[dict] = None): - """ - Converts old keys to new keys using the mapping and dynamically removes the 'ijepa.' prefix if necessary. - - Args: - state_dict_keys (dict): The keys from the state_dict to convert. - - Returns: - dict: A mapping from old keys to new keys. 
- """ - output_dict = {} - if state_dict_keys is not None: - old_text = "\n".join(state_dict_keys) - new_text = old_text - - # Apply regex-based mapping - for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): - if replacement is None: - new_text = re.sub(pattern, "", new_text) # Skip the key - continue - new_text = re.sub(pattern, replacement, new_text) - - output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) - - return output_dict - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - for i in range(config.num_hidden_layers): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :] - state_dict[f"encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-config.hidden_size :, :] - state_dict[f"encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -def get_ijepa_config(model_name): - patch_size = int(model_name.split("_")[1][4:]) - config = IJepaConfig(patch_size=patch_size) - if "vith" in model_name: - config.hidden_size = 1280 - config.num_hidden_layers = 32 - config.num_attention_heads = 16 - config.layer_norm_eps = 1e-6 - config.mlp_ratio = 4 - config.intermediate_size = 5120 - if model_name == "ijepa_vith16_1k": - config.image_size = 448 - elif "vitg" in model_name: - config.hidden_size = 1408 - config.num_hidden_layers = 40 - config.num_attention_heads = 16 - config.layer_norm_eps = 1e-6 - config.mlp_ratio = 48 / 11 - config.intermediate_size = 6144 - else: - raise ValueError("Model not supported, only supports huge and giant models.") - return config - - -@torch.no_grad() -def write_model(model_name, output_dir, safe_serialization, push_to_hub, verify_logits): - """ - Copy/paste/tweak model's weights to our IJEPA structure. 
- """ - - # define default IJEPA configuration - config = get_ijepa_config(model_name) - - checkpoint_mapping = { - "ijepa_vith14_1k": "https://dl.fbaipublicfiles.com/ijepa/IN1K-vit.h.14-300e.pth.tar", - "ijepa_vith14_22k": "https://dl.fbaipublicfiles.com/ijepa/IN22K-vit.h.14-900e.pth.tar", - "ijepa_vith16_1k": "https://dl.fbaipublicfiles.com/ijepa/IN1K-vit.h.16-448px-300e.pth.tar", - "ijepa_vitg16_22k": "https://dl.fbaipublicfiles.com/ijepa/IN22K-vit.g.16-600e.pth.tar", - } - - # Load original checkpoint - checkpoint_url = checkpoint_mapping[model_name] - original_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["encoder"] - original_state_dict = {k.replace("module.", ""): v for k, v in original_state_dict.items()} - - # Rename keys - state_dict = original_state_dict.copy() - new_keys = convert_old_keys_to_new_keys(state_dict.keys()) - for old_key, new_key in new_keys.items(): - rename_key(state_dict, old_key, new_key) - read_in_q_k_v(state_dict, config) - - # load HuggingFace model - model = IJepaModel(config, add_pooling_layer=False).eval() - model.load_state_dict(state_dict) - size = {"height": config.image_size, "width": config.image_size} - image_processor = ViTImageProcessor(size=size) - - if verify_logits: - # Check outputs on an image, prepared by ViTImageProcessor - encoding = image_processor(images=prepare_img(), return_tensors="pt") - pixel_values = encoding["pixel_values"] - with torch.no_grad(): - outputs = model(pixel_values) - - expected_slices = { - "ijepa_vith14_1k": torch.Tensor( - [[-0.0621, -0.0054, -2.7513], [-0.1952, 0.0909, -3.9536], [0.0942, -0.0331, -1.2833]] - ), - "ijepa_vith14_22k": torch.Tensor( - [[0.0358, -0.0045, -0.2154], [0.0418, -0.0246, 0.0108], [0.2529, -0.0345, -0.0246]] - ), - "ijepa_vith16_1k": torch.Tensor( - [[0.5145, -0.1259, 0.0615], [0.1132, 0.0028, -0.0496], [1.1586, -0.0056, -0.0387]] - ), - "ijepa_vitg16_22k": torch.Tensor( - [[0.0512, -0.0510, -0.0649], [0.1972, 0.0380, -0.0790], [0.1667, -0.0834, -0.1240]] - ), - } - - assert torch.allclose( - expected_slices[model_name], - outputs.last_hidden_state[0, :3, :3], - atol=1e-4, - ) - - if output_dir: - Path(output_dir).mkdir(exist_ok=True) - print(f"Saving model {model_name} to {output_dir}") - image_processor.save_pretrained(output_dir, safe_serialization=safe_serialization) - model.save_pretrained(output_dir, safe_serialization=safe_serialization) - - if push_to_hub: - image_processor.push_to_hub(repo_id=f"jmtzt/{model_name}", safe_serialization=safe_serialization) - model.push_to_hub(repo_id=f"jmtzt/{model_name}", safe_serialization=safe_serialization) - - if output_dir: - del model, state_dict - gc.collect() - print("Reloading the model to check if it's saved correctly.") - IJepaModel.from_pretrained(output_dir, device_map="auto") - print("Model reloaded successfully.") - - -def main(): - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="ijepa_vith14_1k", - type=str, - choices=[ - "ijepa_vith14_1k", - "ijepa_vith14_22k", - "ijepa_vith16_1k", - "ijepa_vitg16_22k", - ], - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--output_dir", - default=None, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--safe_serialization", default=True, type=bool, help="Whether or not to save using `safetensors`." 
- ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether or not to push the model to the 🤗 Hub.", - ) - parser.add_argument( - "--verify_logits", action="store_false", help="Whether or not to verify logits after conversion." - ) - - parser.set_defaults() - args = parser.parse_args() - write_model(args.model_name, args.output_dir, args.safe_serialization, args.push_to_hub, args.verify_logits) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/imagegpt/convert_imagegpt_original_tf2_to_pytorch.py b/src/transformers/models/imagegpt/convert_imagegpt_original_tf2_to_pytorch.py deleted file mode 100644 index 182d66b9af28..000000000000 --- a/src/transformers/models/imagegpt/convert_imagegpt_original_tf2_to_pytorch.py +++ /dev/null @@ -1,71 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert OpenAI Image GPT checkpoints.""" - -import argparse - -import torch - -from transformers import ImageGPTConfig, ImageGPTForCausalLM, load_tf_weights_in_imagegpt -from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging - - -logging.set_verbosity_info() - - -def convert_imagegpt_checkpoint_to_pytorch(imagegpt_checkpoint_path, model_size, pytorch_dump_folder_path): - # Construct configuration depending on size - MODELS = {"small": (512, 8, 24), "medium": (1024, 8, 36), "large": (1536, 16, 48)} - n_embd, n_head, n_layer = MODELS[model_size] # set model hyperparameters - config = ImageGPTConfig(n_embd=n_embd, n_layer=n_layer, n_head=n_head) - model = ImageGPTForCausalLM(config) - - # Load weights from numpy - load_tf_weights_in_imagegpt(model, config, imagegpt_checkpoint_path) - - # Save pytorch-model - pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME - pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME - print(f"Save PyTorch model to {pytorch_weights_dump_path}") - torch.save(model.state_dict(), pytorch_weights_dump_path) - print(f"Save configuration file to {pytorch_config_dump_path}") - with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: - f.write(config.to_json_string()) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--imagegpt_checkpoint_path", - default=None, - type=str, - required=True, - help="Path to the TensorFlow checkpoint path.", - ) - parser.add_argument( - "--model_size", - default=None, - type=str, - required=True, - help="Size of the model (can be either 'small', 'medium' or 'large').", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 
- ) - args = parser.parse_args() - convert_imagegpt_checkpoint_to_pytorch( - args.imagegpt_checkpoint_path, args.model_size, args.pytorch_dump_folder_path - ) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt.py b/src/transformers/models/imagegpt/image_processing_imagegpt.py index 9168ecaceff2..aa2114509f70 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt.py @@ -247,7 +247,7 @@ def preprocess( ) # Here, normalize() is using a constant factor to divide pixel values. - # hence, the method does not need iamge_mean and image_std. + # hence, the method does not need image_mean and image_std. validate_preprocess_arguments( do_resize=do_resize, size=size, diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index ddfee7c757fe..7a6bcc53ae1a 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -18,6 +18,7 @@ import numpy as np import torch +from torchvision.transforms.v2 import functional as F from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( @@ -30,16 +31,9 @@ from ...utils import ( TensorType, auto_docstring, - is_torchvision_v2_available, ) -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - - def squared_euclidean_distance_torch(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: """ Compute squared Euclidean distances between all pixels and clusters. diff --git a/src/transformers/models/instructblip/convert_instructblip_original_to_pytorch.py b/src/transformers/models/instructblip/convert_instructblip_original_to_pytorch.py deleted file mode 100644 index f8b9c86cfddc..000000000000 --- a/src/transformers/models/instructblip/convert_instructblip_original_to_pytorch.py +++ /dev/null @@ -1,303 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Convert InstructBLIP checkpoints from the original repository. 
- -URL: https://github.com/salesforce/LAVIS/tree/main/projects/instructblip -""" - -import argparse - -import requests -import torch - -# pip3 install salesforce-lavis -# I'm actually installing a slightly modified version: pip3 install git+https://github.com/nielsrogge/LAVIS.git@fix_lavis_float32 (there's also the fix_lavis branch) -# also note: to convert Vicuna checkpoints, we had to include /home/niels/python_projects/checkpoints/FastChat/vicuna-7b in lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml -# same for Vicuna-13b -from lavis.models import load_model_and_preprocess -from PIL import Image - -from transformers import ( - AutoTokenizer, - BlipImageProcessor, - InstructBlipConfig, - InstructBlipForConditionalGeneration, - InstructBlipProcessor, - InstructBlipQFormerConfig, - InstructBlipVisionConfig, - LlamaConfig, - LlamaTokenizerFast, - T5Config, - T5TokenizerFast, -) -from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD - - -def load_demo_image(): - url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - - return image - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config): - rename_keys = [] - # fmt: off - - # vision encoder - rename_keys.append(("visual_encoder.cls_token", "vision_model.embeddings.class_embedding")) - rename_keys.append(("visual_encoder.pos_embed", "vision_model.embeddings.position_embedding")) - rename_keys.append(("visual_encoder.patch_embed.proj.weight", "vision_model.embeddings.patch_embedding.weight")) - rename_keys.append(("visual_encoder.patch_embed.proj.bias", "vision_model.embeddings.patch_embedding.bias")) - rename_keys.append(("ln_vision.weight", "vision_model.post_layernorm.weight")) - rename_keys.append(("ln_vision.bias", "vision_model.post_layernorm.bias")) - - for i in range(config.vision_config.num_hidden_layers): - rename_keys.append((f"visual_encoder.blocks.{i}.norm1.weight", f"vision_model.encoder.layers.{i}.layer_norm1.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm1.bias", f"vision_model.encoder.layers.{i}.layer_norm1.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm2.weight", f"vision_model.encoder.layers.{i}.layer_norm2.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm2.bias", f"vision_model.encoder.layers.{i}.layer_norm2.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.qkv.weight", f"vision_model.encoder.layers.{i}.self_attn.qkv.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.weight", f"vision_model.encoder.layers.{i}.self_attn.projection.weight",)) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.bias", f"vision_model.encoder.layers.{i}.self_attn.projection.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.weight", f"vision_model.encoder.layers.{i}.mlp.fc1.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.bias", f"vision_model.encoder.layers.{i}.mlp.fc1.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.weight", f"vision_model.encoder.layers.{i}.mlp.fc2.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.bias", f"vision_model.encoder.layers.{i}.mlp.fc2.bias")) - - # QFormer - rename_keys.append(("Qformer.bert.embeddings.LayerNorm.weight", "qformer.embeddings.layernorm.weight")) - rename_keys.append(("Qformer.bert.embeddings.LayerNorm.bias", 
"qformer.embeddings.layernorm.bias")) - - # fmt: on - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -def read_in_q_v_bias(state_dict, config): - for i in range(config.vision_config.num_hidden_layers): - # read in original q and v biases - q_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.v_bias") - - # next, set bias in the state dict - qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias)) - state_dict[f"vision_model.encoder.layers.{i}.self_attn.qkv.bias"] = qkv_bias - - -def get_blip2_config(model_name): - image_size = 364 if "coco" in model_name else 224 - vision_config = InstructBlipVisionConfig(image_size=image_size).to_dict() - - # make sure the models have proper bos_token_id and eos_token_id set (important for generation) - # seems like flan-T5 models don't have bos_token_id properly set? - if "t5-xl" in model_name: - text_config = T5Config.from_pretrained("google/flan-t5-xl", dense_act_fn="gelu", bos_token_id=1).to_dict() - elif "t5-xxl" in model_name: - text_config = T5Config.from_pretrained("google/flan-t5-xxl", dense_act_fn="gelu", bos_token_id=1).to_dict() - elif "vicuna-7b" in model_name: - text_config = LlamaConfig.from_pretrained("decapoda-research/llama-7b-hf", vocab_size=32001).to_dict() - elif "vicuna-13b" in model_name: - text_config = LlamaConfig.from_pretrained("decapoda-research/llama-13b-hf", vocab_size=32001).to_dict() - else: - raise ValueError("Model name not supported") - - # the authors add one special "[DEC]" token to the vocab of Q-Former, hence vocab size = 30522 + 1 - qformer_config = InstructBlipQFormerConfig(vocab_size=30523).to_dict() - config = InstructBlipConfig(vision_config=vision_config, text_config=text_config, qformer_config=qformer_config) - - return config, image_size - - -@torch.no_grad() -def convert_blip2_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): - """ - Copy/paste/tweak model's weights to Transformers design. 
- """ - qformer_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased", truncation_side="left") - qformer_tokenizer.add_special_tokens({"bos_token": "[DEC]"}) - - if "t5" in model_name: - tokenizer = T5TokenizerFast.from_pretrained("google/flan-t5-xl", truncation_side="left") - elif "vicuna" in model_name: - # the following was used in the original implementation: - # tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", use_fast=False, truncation_side="left") - # tokenizer.add_special_tokens({"pad_token": "[PAD]"}) - # tokenizer.add_special_tokens({"bos_token": "</s>"}) - # tokenizer.add_special_tokens({"eos_token": "</s>"}) - # tokenizer.add_special_tokens({"unk_token": "</s>"}) - tokenizer = LlamaTokenizerFast.from_pretrained( - "huggyllama/llama-7b", truncation_side="left", bos_token="</s>", unk_token="</s>" - ) - tokenizer.add_special_tokens({"pad_token": "[PAD]"}) - - config, image_size = get_blip2_config(model_name) - hf_model = InstructBlipForConditionalGeneration(config).eval() - - model_name_to_original = { - "instructblip-vicuna-7b": ("blip2_vicuna_instruct", "vicuna7b"), - "instructblip-vicuna-13b": ("blip2_vicuna_instruct", "vicuna13b"), - "instructblip-flan-t5-xl": ("blip2_t5_instruct", "flant5xl"), - "instructblip-flan-t5-xxl": ("blip2_t5_instruct", "flant5xxl"), - } - - name, type = model_name_to_original[model_name] - - # load original model - print("Loading original model...") - hf_model_device = "cuda:1" if torch.cuda.is_available() else "cpu" - lavis_device = "cuda:2" if torch.cuda.is_available() else "cpu" - original_model, vis_processors, _ = load_model_and_preprocess( - name=name, model_type=type, is_eval=True, device=lavis_device - ) - original_model.eval() - print("Done!") - - # update state dict keys - state_dict = original_model.state_dict() - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - - # some keys can be renamed efficiently - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if key.startswith("Qformer.bert"): - key = key.replace("Qformer.bert", "qformer") - if "attention.self" in key: - key = key.replace("self", "attention") - if "llm_proj" in key: - key = key.replace("llm_proj", "language_projection") - if "t5_proj" in key: - key = key.replace("t5_proj", "language_projection") - if key.startswith("llm_model"): - key = key.replace("llm_model", "language_model") - if key.startswith("t5"): - key = key.replace("t5", "language") - state_dict[key] = val - - # read in qv biases - read_in_q_v_bias(state_dict, config) - - # note: weights get loaded in torch.float32 by default - hf_model.load_state_dict(state_dict, strict=True) - - image = load_demo_image() - prompt = "What is unusual about this image?"
- - # create processor - image_processor = BlipImageProcessor( - size={"height": image_size, "width": image_size}, image_mean=OPENAI_CLIP_MEAN, image_std=OPENAI_CLIP_STD - ) - processor = InstructBlipProcessor( - image_processor=image_processor, - tokenizer=tokenizer, - qformer_tokenizer=qformer_tokenizer, - ) - inputs = processor(images=image, text=prompt, return_tensors="pt").to(hf_model_device) - - # make sure processor creates exact same pixel values - original_pixel_values = vis_processors["eval"](image).unsqueeze(0).to(lavis_device) - pixel_values = inputs.pixel_values - assert torch.allclose(original_pixel_values.to(pixel_values.device), pixel_values) - - original_model.to(lavis_device) - hf_model.to(hf_model_device) - with torch.no_grad(): - if "vicuna" in model_name: - original_logits = original_model({"image": original_pixel_values, "text_input": [prompt]}).logits - logits = hf_model(**inputs).logits - else: - original_logits = original_model( - {"image": original_pixel_values, "text_input": [prompt], "text_output": ["\n"]} - ).logits - label_input_ids = tokenizer("\n", return_tensors="pt").input_ids.to(hf_model_device) - labels = label_input_ids.masked_fill(label_input_ids == tokenizer.pad_token_id, -100) - logits = hf_model(**inputs, labels=labels).logits - - print("First values of original logits:", original_logits[0, :3, :3]) - print("First values of HF logits:", logits[0, :3, :3]) - - # assert values - assert original_logits.shape == logits.shape - atol = 1e-4 if "vicuna" in model_name else 1e-5 - assert torch.allclose(original_logits.to(logits.device), logits, atol=atol) - print("Looks ok!") - - print("Generating with original model...") - original_outputs = original_model.generate({"image": original_pixel_values, "prompt": prompt}, num_beams=5) - - # important: we need to cast the weights of the HF model to the appropriate type - print("Generating with HF model...") - outputs = hf_model.generate( - **inputs, - do_sample=False, - num_beams=5, - max_length=256, - min_length=1, - top_p=0.9, - repetition_penalty=1.5, - length_penalty=1.0, - temperature=1, - ) - if "vicuna" in model_name: - # convert output id 0 to 2 (eos_token_id) - # TODO add this in the generate method? 
- outputs[outputs == 0] = 2 - print("Original generation:", original_outputs) - output_text = processor.batch_decode(outputs, skip_special_tokens=True) - output_text = [text.strip() for text in output_text] - print("HF generation:", output_text) - - if pytorch_dump_folder_path is not None: - processor.save_pretrained(pytorch_dump_folder_path) - hf_model.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - processor.push_to_hub(f"Salesforce/{model_name}") - hf_model.push_to_hub(f"Salesforce/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - choices = [ - "instructblip-vicuna-7b", - "instructblip-vicuna-13b", - "instructblip-flan-t5-xl", - "instructblip-flan-t5-xxl", - ] - parser.add_argument( - "--model_name", - default="instructblip-flan-t5-xl", - choices=choices, - type=str, - help="Path to hf config.json of model to convert", - ) - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model and processor to the hub after converting", - ) - - args = parser.parse_args() - - convert_blip2_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/instructblipvideo/convert_instructblipvideo_original_to_pytorch.py b/src/transformers/models/instructblipvideo/convert_instructblipvideo_original_to_pytorch.py deleted file mode 100644 index 9b3d508db6ff..000000000000 --- a/src/transformers/models/instructblipvideo/convert_instructblipvideo_original_to_pytorch.py +++ /dev/null @@ -1,305 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Convert InstructBlipVideo checkpoints from the original repository. 
- -URL: https://github.com/salesforce/LAVIS/tree/main/projects/instructblipvideo -""" - -import argparse - -import requests -import torch - -# pip3 install salesforce-lavis -# I'm actually installing a slightly modified version: pip3 install git+https://github.com/nielsrogge/LAVIS.git@fix_lavis_float32 (there's also the fix_lavis branch) -# also note: to convert Vicuna checkpoints, we had to include /home/niels/python_projects/checkpoints/FastChat/vicuna-7b in lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml -# same for Vicuna-13b -from lavis.models import load_model_and_preprocess -from PIL import Image - -from transformers import ( - AutoTokenizer, - BlipImageProcessor, - InstructBlipProcessor, - InstructBlipVideoConfig, - InstructBlipVideoForConditionalGeneration, - InstructBlipVideoQFormerConfig, - InstructBlipVideoVisionConfig, - LlamaConfig, - LlamaTokenizerFast, - T5Config, - T5TokenizerFast, -) -from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD - - -def load_demo_image(): - url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - - return image - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config): - rename_keys = [] - # fmt: off - - # vision encoder - rename_keys.append(("visual_encoder.cls_token", "vision_model.embeddings.class_embedding")) - rename_keys.append(("visual_encoder.pos_embed", "vision_model.embeddings.position_embedding")) - rename_keys.append(("visual_encoder.patch_embed.proj.weight", "vision_model.embeddings.patch_embedding.weight")) - rename_keys.append(("visual_encoder.patch_embed.proj.bias", "vision_model.embeddings.patch_embedding.bias")) - rename_keys.append(("ln_vision.weight", "vision_model.post_layernorm.weight")) - rename_keys.append(("ln_vision.bias", "vision_model.post_layernorm.bias")) - - for i in range(config.vision_config.num_hidden_layers): - rename_keys.append((f"visual_encoder.blocks.{i}.norm1.weight", f"vision_model.encoder.layers.{i}.layer_norm1.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm1.bias", f"vision_model.encoder.layers.{i}.layer_norm1.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm2.weight", f"vision_model.encoder.layers.{i}.layer_norm2.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm2.bias", f"vision_model.encoder.layers.{i}.layer_norm2.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.qkv.weight", f"vision_model.encoder.layers.{i}.self_attn.qkv.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.weight", f"vision_model.encoder.layers.{i}.self_attn.projection.weight",)) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.bias", f"vision_model.encoder.layers.{i}.self_attn.projection.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.weight", f"vision_model.encoder.layers.{i}.mlp.fc1.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.bias", f"vision_model.encoder.layers.{i}.mlp.fc1.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.weight", f"vision_model.encoder.layers.{i}.mlp.fc2.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.bias", f"vision_model.encoder.layers.{i}.mlp.fc2.bias")) - - # QFormer - rename_keys.append(("Qformer.bert.embeddings.LayerNorm.weight", "qformer.embeddings.layernorm.weight")) - rename_keys.append(("Qformer.bert.embeddings.LayerNorm.bias", 
"qformer.embeddings.layernorm.bias")) - - # fmt: on - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -def read_in_q_v_bias(state_dict, config): - for i in range(config.vision_config.num_hidden_layers): - # read in original q and v biases - q_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.v_bias") - - # next, set bias in the state dict - qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias)) - state_dict[f"vision_model.encoder.layers.{i}.self_attn.qkv.bias"] = qkv_bias - - -def get_blip2_config(model_name): - image_size = 364 if "coco" in model_name else 224 - vision_config = InstructBlipVideoVisionConfig(image_size=image_size).to_dict() - - # make sure the models have proper bos_token_id and eos_token_id set (important for generation) - # seems like flan-T5 models don't have bos_token_id properly set? - if "t5-xl" in model_name: - text_config = T5Config.from_pretrained("google/flan-t5-xl", dense_act_fn="gelu", bos_token_id=1).to_dict() - elif "t5-xxl" in model_name: - text_config = T5Config.from_pretrained("google/flan-t5-xxl", dense_act_fn="gelu", bos_token_id=1).to_dict() - elif "vicuna-7b" in model_name: - text_config = LlamaConfig.from_pretrained("decapoda-research/llama-7b-hf", vocab_size=32001).to_dict() - elif "vicuna-13b" in model_name: - text_config = LlamaConfig.from_pretrained("decapoda-research/llama-13b-hf", vocab_size=32001).to_dict() - else: - raise ValueError("Model name not supported") - - # the authors add one special "[DEC]" token to the vocab of Q-Former, hence vocab size = 30522 + 1 - qformer_config = InstructBlipVideoQFormerConfig(vocab_size=30523).to_dict() - config = InstructBlipVideoConfig( - vision_config=vision_config, text_config=text_config, qformer_config=qformer_config - ) - - return config, image_size - - -@torch.no_grad() -def convert_blip2_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): - """ - Copy/paste/tweak model's weights to Transformers design. 
- """ - qformer_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased", truncation_side="left") - qformer_tokenizer.add_special_tokens({"bos_token": "[DEC]"}) - - if "t5" in model_name: - tokenizer = T5TokenizerFast.from_pretrained("google/flan-t5-xl", truncation_side="left") - elif "vicuna" in model_name: - # the following was used in the original implementation: - # tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", use_fast=False, truncation_side="left") - # tokenizer.add_special_tokens({"pad_token": "[PAD]"}) - # tokenizer.add_special_tokens({"bos_token": ""}) - # tokenizer.add_special_tokens({"eos_token": ""}) - # tokenizer.add_special_tokens({"unk_token": ""}) - tokenizer = LlamaTokenizerFast.from_pretrained( - "huggyllama/llama-7b", truncation_side="left", bos_token="", unk_token="" - ) - tokenizer.add_special_tokens({"pad_token": "[PAD]"}) - - config, image_size = get_blip2_config(model_name) - hf_model = InstructBlipVideoForConditionalGeneration(config).eval() - - model_name_to_original = { - "instructblipvideo-vicuna-7b": ("blip2_vicuna_instruct", "vicuna7b"), - "instructblipvideo-vicuna-13b": ("blip2_vicuna_instruct", "vicuna13b"), - "instructblipvideo-flan-t5-xl": ("blip2_t5_instruct", "flant5xl"), - "instructblipvideo-flan-t5-xxl": ("blip2_t5_instruct", "flant5xxl"), - } - - name, type = model_name_to_original[model_name] - - # load original model - print("Loading original model...") - hf_model_device = "cuda:1" if torch.cuda.is_available() else "cpu" - lavis_device = "cuda:2" if torch.cuda.is_available() else "cpu" - original_model, vis_processors, _ = load_model_and_preprocess( - name=name, model_type=type, is_eval=True, device=lavis_device - ) - original_model.eval() - print("Done!") - - # update state dict keys - state_dict = original_model.state_dict() - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - - # some keys can be renamed efficiently - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if key.startswith("Qformer.bert"): - key = key.replace("Qformer.bert", "qformer") - if "attention.self" in key: - key = key.replace("self", "attention") - if "llm_proj" in key: - key = key.replace("llm_proj", "language_projection") - if "t5_proj" in key: - key = key.replace("t5_proj", "language_projection") - if key.startswith("llm_model"): - key = key.replace("llm_model", "language_model") - if key.startswith("t5"): - key = key.replace("t5", "language") - state_dict[key] = val - - # read in qv biases - read_in_q_v_bias(state_dict, config) - - # note: weights get loaded in torch.float32 by default - hf_model.load_state_dict(state_dict, strict=True) - - image = load_demo_image() - prompt = "What is unusual about this image?" 
- - # create processor - image_processor = BlipImageProcessor( - size={"height": image_size, "width": image_size}, image_mean=OPENAI_CLIP_MEAN, image_std=OPENAI_CLIP_STD - ) - processor = InstructBlipProcessor( - image_processor=image_processor, - tokenizer=tokenizer, - qformer_tokenizer=qformer_tokenizer, - ) - inputs = processor(images=image, text=prompt, return_tensors="pt").to(hf_model_device) - - # make sure processor creates exact same pixel values - original_pixel_values = vis_processors["eval"](image).unsqueeze(0).to(lavis_device) - pixel_values = inputs.pixel_values - assert torch.allclose(original_pixel_values.to(pixel_values.device), pixel_values) - - original_model.to(lavis_device) - hf_model.to(hf_model_device) - with torch.no_grad(): - if "vicuna" in model_name: - original_logits = original_model({"image": original_pixel_values, "text_input": [prompt]}).logits - logits = hf_model(**inputs).logits - else: - original_logits = original_model( - {"image": original_pixel_values, "text_input": [prompt], "text_output": ["\n"]} - ).logits - label_input_ids = tokenizer("\n", return_tensors="pt").input_ids.to(hf_model_device) - labels = label_input_ids.masked_fill(label_input_ids == tokenizer.pad_token_id, -100) - logits = hf_model(**inputs, labels=labels).logits - - print("First values of original logits:", original_logits[0, :3, :3]) - print("First values of HF logits:", logits[0, :3, :3]) - - # assert values - assert original_logits.shape == logits.shape - atol = 1e-4 if "vicuna" in model_name else 1e-5 - assert torch.allclose(original_logits.to(logits.device), logits, atol=atol) - print("Looks ok!") - - print("Generating with original model...") - original_outputs = original_model.generate({"image": original_pixel_values, "prompt": prompt}, num_beams=5) - - # important: we need to cast the weights of the HF model to the appropriate type - print("Generating with HF model...") - outputs = hf_model.generate( - **inputs, - do_sample=False, - num_beams=5, - max_length=256, - min_length=1, - top_p=0.9, - repetition_penalty=1.5, - length_penalty=1.0, - temperature=1, - ) - if "vicuna" in model_name: - # convert output id 0 to 2 (eos_token_id) - # TODO add this in the generate method? 
- outputs[outputs == 0] = 2 - print("Original generation:", original_outputs) - output_text = processor.batch_decode(outputs, skip_special_tokens=True) - output_text = [text.strip() for text in output_text] - print("HF generation:", output_text) - - if pytorch_dump_folder_path is not None: - processor.save_pretrained(pytorch_dump_folder_path) - hf_model.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - processor.push_to_hub(f"Salesforce/{model_name}") - hf_model.push_to_hub(f"Salesforce/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - choices = [ - "instructblipvideo-vicuna-7b", - "instructblipvideo-vicuna-13b", - "instructblipvideo-flan-t5-xl", - "instructblipvideo-flan-t5-xxl", - ] - parser.add_argument( - "--model_name", - default="instructblipvideo-flan-t5-xl", - choices=choices, - type=str, - help="Path to hf config.json of model to convert", - ) - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model and processor to the hub after converting", - ) - - args = parser.parse_args() - - convert_blip2_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/instructblipvideo/video_processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/video_processing_instructblipvideo.py index a2cd3cf351d2..d2fe3cc7f343 100644 --- a/src/transformers/models/instructblipvideo/video_processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/video_processing_instructblipvideo.py @@ -20,21 +20,16 @@ from typing import Optional, Union import torch +from torchvision.transforms.v2 import functional as F from ...image_processing_utils import BatchFeature from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling, SizeDict from ...processing_utils import Unpack, VideosKwargs -from ...utils import TensorType, is_torchvision_v2_available +from ...utils import TensorType from ...video_processing_utils import BaseVideoProcessor from ...video_utils import group_videos_by_shape, reorder_videos -if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F -else: - from torchvision.transforms import functional as F - - class InstructBlipVideoVideoProcessorInitKwargs(VideosKwargs): ... diff --git a/src/transformers/models/internvl/convert_internvl_weights_to_hf.py b/src/transformers/models/internvl/convert_internvl_weights_to_hf.py deleted file mode 100644 index 35318c8a5f77..000000000000 --- a/src/transformers/models/internvl/convert_internvl_weights_to_hf.py +++ /dev/null @@ -1,460 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. team. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import argparse -import gc -import os -import re -from typing import Literal, Optional - -import torch -from einops import rearrange - -from transformers import ( - AutoModel, - AutoTokenizer, - GenerationConfig, - GotOcr2ImageProcessorFast, - InternVLConfig, - InternVLForConditionalGeneration, - InternVLProcessor, - InternVLVideoProcessor, - InternVLVisionConfig, - LlamaConfig, - Qwen2Config, -) - - -LM_TYPE_CORRESPONDENCE = { - "OpenGVLab/InternVL2_5-1B-MPO": "qwen2", - "OpenGVLab/InternVL2_5-2B-MPO": "llama", - "OpenGVLab/InternVL2_5-4B-MPO": "qwen2", - "OpenGVLab/InternVL2_5-8B-MPO": "llama", - "OpenGVLab/InternVL2_5-26B-MPO": "llama", - "OpenGVLab/InternVL2_5-38B-MPO": "qwen2", - "OpenGVLab/InternVL2_5-78B-MPO": "qwen2", - "OpenGVLab/InternVL3-1B": "qwen2", - "OpenGVLab/InternVL3-2B": "qwen2", - "OpenGVLab/InternVL3-8B": "qwen2", - "OpenGVLab/InternVL3-9B": "llama", - "OpenGVLab/InternVL3-14B": "qwen2", - "OpenGVLab/InternVL3-38B": "qwen2", - "OpenGVLab/InternVL3-78B": "qwen2", -} - -UNNECESSARY_CONFIG_KEYS = [ "_name_or_path", "_attn_implementation_autoset", "auto_map", "use_bfloat16", "use_flash_attn", "bias", "laux_allreduce", "moe_coeff_ratio", "moe_intermediate_size", "moe_output_scale", "noisy_gate_policy", "shared_expert_intermediate_size", "use_residual", "use_moe", "use_rts", "use_weighted_residual", "moe_config", "num_experts", "num_routed_experts", "num_shared_experts", "capacity_factor", "eval_capacity_factor", "drop_path_rate"] # fmt: skip - -# fmt: off -ORIGINAL_TO_CONVERTED_KEY_MAPPING_VISION = { - # Vision encoder mapping - r"vision_model": r"model.vision_tower", - r"layers": r"layer", - r"class_embedding": r"cls_token", - r"position_embedding": r"position_embeddings", - r"patch_embedding": r"patch_embeddings.projection", - r"ls(\d+)": r"lambda_\1", - r"attn.proj": r"attention.projection_layer", - r"attn.dropout": r"attention.projection_dropout", - r"attn": r"attention", - r"norm1": r"layernorm_before", - r"norm2": r"layernorm_after", - -} - -ORIGINAL_TO_CONVERTED_KEY_MAPPING_TEXT_LLAMA = { - r"language_model.model.": r"model.language_model.", - r"tok_embeddings": r"embed_tokens", - r"attention.wo": r"self_attn.o_proj", - r"feed_forward.w1": r"mlp.gate_proj", - r"feed_forward.w2": r"mlp.down_proj", - r"feed_forward.w3": r"mlp.up_proj", - r"attention_norm": r"input_layernorm", - r"ffn_norm": r"post_attention_layernorm", - r"language_model.output": r"lm_head", -} - -ORIGINAL_TO_CONVERTED_KEY_MAPPING_TEXT_QWEN2 = { - # Vision encoder mapping - r"language_model.model.": r"model.language_model.", - r"language_model.lm_head": r"lm_head", -} - -ORIGINAL_TO_CONVERTED_KEY_MAPPING_MULTI = { - # Vision encoder mapping - r"mlp1.0": r"model.multi_modal_projector.layer_norm", - r"mlp1.1": r"model.multi_modal_projector.linear_1", - r"mlp1.3": r"model.multi_modal_projector.linear_2", -} - - -chat_template = ( - "{% for message in messages %}" - "{{'<|im_start|>' + message['role'] + '\n'}}" - "{% if message['content'] is string %}" - "{{ message['content'] }}" - "{% else %}" - "{% for content in message['content'] %}" - "{% if content['type'] == 'image' %}" - "{{ '\n' }}" - "{% elif content['type'] == 'video' %}" - "{{ '