mindspore-lab · vigo999 · Oct 18, 2025 · Aug 15, 2025 · Aug 15, 2025 · Aug 18, 2025
@@ -285,6 +285,8 @@
         title: PixArtTransformer2DModel
       - local: api/models/prior_transformer
         title: PriorTransformer
+      - local: api/models/qwenimage_transformer2d
+        title: QwenImageTransformer2DModel
       - local: api/models/sana_transformer2d
         title: SanaTransformer2DModel
       - local: api/models/sd3_transformer2d
@@ -337,6 +339,8 @@
         title: AutoencoderKLMagvit
       - local: api/models/autoencoderkl_mochi
         title: AutoencoderKLMochi
+      - local: api/models/autoencoderkl_qwenimage
+        title: AutoencoderKLQwenImage
       - local: api/models/autoencoder_kl_wan
         title: AutoencoderKLWan
       - local: api/models/consistency_decoder_vae
@@ -471,6 +475,8 @@
       title: PixArt-α
     - local: api/pipelines/pixart_sigma
       title: PixArt-Σ
+    - local: api/pipelines/qwenimage
+      title: QwenImage
     - local: api/pipelines/sana
       title: Sana
     - local: api/pipelines/sana_sprint

@@ -27,6 +27,7 @@ LoRA is a fast and lightweight training method that inserts and trains a signifi
 - `WanLoraLoaderMixin` provides similar functions for [Wan](../../api/pipelines/wan.md).
 - `SkyReelsV2LoraLoaderMixin` provides similar functions for [SkyReels-V2](../../api/pipelines/skyreels_v2.md).
 - `AmusedLoraLoaderMixin` is for the [AmusedPipeline](../../api/pipelines/amused.md).
+- `QwenImageLoraLoaderMixin` provides similar functions for [QwenImage](../../api/pipelines/qwenimage.md).
 - `LoraBaseMixin` provides a base class with several utility methods to fuse, unfuse, unload, LoRAs and more.
 
 !!! tip
@@ -60,4 +61,6 @@ LoRA is a fast and lightweight training method that inserts and trains a signifi
 
 ::: mindone.diffusers.loaders.lora_pipeline.AmusedLoraLoaderMixin
 
+::: mindone.diffusers.loaders.lora_pipeline.QwenImageLoraLoaderMixin
+
 ::: mindone.diffusers.loaders.lora_base.LoraBaseMixin
@@ -0,0 +1,26 @@
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# AutoencoderKLQwenImage
+
+The model can be loaded with the following code snippet.
+
+```python
+from mindone.diffusers import AutoencoderKLQwenImage
+
+vae = AutoencoderKLQwenImage.from_pretrained("Qwen/Qwen-Image", subfolder="vae")
+```
+
+::: mindone.diffusers.AutoencoderKLQwenImage
+
+::: mindone.diffusers.models.autoencoders.autoencoder_kl.AutoencoderKLOutput
+
+::: mindone.diffusers.models.autoencoders.vae.DecoderOutput
@@ -0,0 +1,24 @@
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# QwenImageTransformer2DModel
+
+The model can be loaded with the following code snippet.
+
+```python
+import mindspore
+from mindone.diffusers import QwenImageTransformer2DModel
+
+transformer = QwenImageTransformer2DModel.from_pretrained("Qwen/Qwen-Image", subfolder="transformer", mindspore_dtype=mindspore.bfloat16)
+```
+
+::: mindone.diffusers.QwenImageTransformer2DModel
+
+::: mindone.diffusers.models.modeling_outputs.Transformer2DModelOutput
@@ -0,0 +1,42 @@
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. -->
+
+# QwenImage
+
+<div class="flex flex-wrap space-x-1">
+  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
+</div>
+
+Qwen-Image from the Qwen team is an image generation foundation model in the Qwen series that achieves significant advances in complex text rendering and precise image editing. Experiments show strong general capabilities in both image generation and editing, with exceptional performance in text rendering, especially for Chinese.
+
+Qwen-Image comes in the following variants:
+
+| model type | model id |
+|:----------:|:--------:|
+| Qwen-Image | [`Qwen/Qwen-Image`](https://huggingface.co/Qwen/Qwen-Image) |
+| Qwen-Image-Edit | [`Qwen/Qwen-Image-Edit`](https://huggingface.co/Qwen/Qwen-Image-Edit) |
+
+!!! tip
+
+    Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
+    In addition, the default version of `transformers` installed with `mindone` is `4.50.0`, but Qwen-Image requires `transformers==4.52.1`. Please run `pip install transformers==4.52.1` to upgrade if you want to try the Qwen-Image pipelines.
+
+
+::: mindone.diffusers.QwenImagePipeline
+
+::: mindone.diffusers.pipelines.qwenimage.pipeline_output.QwenImagePipelineOutput
+
+::: mindone.diffusers.QwenImageImg2ImgPipeline
+
+::: mindone.diffusers.QwenImageInpaintPipeline
@@ -65,6 +65,7 @@
         "AutoencoderKLLTXVideo",
         "AutoencoderKLMagvit",
         "AutoencoderKLMochi",
+        "AutoencoderKLQwenImage",
         "AutoencoderKLTemporalDecoder",
         "AutoencoderKLWan",
         "AutoencoderOobleck",
@@ -104,6 +105,7 @@
         "OmniGenTransformer2DModel",
         "PixArtTransformer2DModel",
         "PriorTransformer",
+        "QwenImageTransformer2DModel",
         "SanaControlNetModel",
         "SanaTransformer2DModel",
         "SD3ControlNetModel",
@@ -259,6 +261,11 @@
         "PixArtAlphaPipeline",
         "PixArtSigmaPAGPipeline",
         "PixArtSigmaPipeline",
+        "QwenImageImg2ImgPipeline",
+        "QwenImageInpaintPipeline",
+        "QwenImagePipeline",
+        "QwenImageEditPipeline",
+        "QwenImageEditInpaintPipeline",
         "ReduxImageEncoder",
         "SanaControlNetPipeline",
         "SanaPAGPipeline",
@@ -437,6 +444,7 @@
         AutoencoderKLLTXVideo,
         AutoencoderKLMagvit,
         AutoencoderKLMochi,
+        AutoencoderKLQwenImage,
         AutoencoderKLTemporalDecoder,
         AutoencoderKLWan,
         AutoencoderOobleck,
@@ -476,6 +484,7 @@
         OmniGenTransformer2DModel,
         PixArtTransformer2DModel,
         PriorTransformer,
+        QwenImageTransformer2DModel,
         SanaControlNetModel,
         SanaTransformer2DModel,
         SD3ControlNetModel,
@@ -642,6 +651,11 @@
         PixArtAlphaPipeline,
         PixArtSigmaPAGPipeline,
         PixArtSigmaPipeline,
+        QwenImageEditInpaintPipeline,
+        QwenImageEditPipeline,
+        QwenImageImg2ImgPipeline,
+        QwenImageInpaintPipeline,
+        QwenImagePipeline,
         ReduxImageEncoder,
         SanaControlNetPipeline,
         SanaPAGPipeline,

@@ -73,6 +73,7 @@ def text_encoder_attn_modules(text_encoder):
         "CogView4LoraLoaderMixin",
         "Mochi1LoraLoaderMixin",
         "HunyuanVideoLoraLoaderMixin",
+        "QwenImageLoraLoaderMixin",
         "SanaLoraLoaderMixin",
         "Lumina2LoraLoaderMixin",
         "WanLoraLoaderMixin",
@@ -100,6 +101,7 @@ def text_encoder_attn_modules(text_encoder):
         LTXVideoLoraLoaderMixin,
         Lumina2LoraLoaderMixin,
         Mochi1LoraLoaderMixin,
+        QwenImageLoraLoaderMixin,
         SanaLoraLoaderMixin,
         SD3LoraLoaderMixin,
         SkyReelsV2LoraLoaderMixin,

@@ -2351,8 +2351,8 @@ def get_alpha_scales(down_weight, alpha_key):
             down_weight = state_dict.pop(k)
             up_weight = state_dict.pop(k.replace(down_key, up_key))
             scale_down, scale_up = get_alpha_scales(down_weight, alpha_key)
-            converted_state_dict[diffusers_down_key] = down_weight * scale_down
-            converted_state_dict[diffusers_up_key] = up_weight * scale_up
+            converted_state_dict[diffusers_down_key] = Parameter(down_weight * scale_down)
+            converted_state_dict[diffusers_up_key] = Parameter(up_weight * scale_up)
 
     if len(state_dict) > 0:
         raise ValueError(f"`state_dict` should be empty at this point but has {state_dict.keys()=}")