
Commit 1876abf

Merge branch 'main' of https://github.com/huggingface/diffusers into main

2 parents: 617841b + 026507c

File tree: 52 files changed (+3916 / -1033 lines)


docs/source/en/_toctree.yml

Lines changed: 30 additions & 30 deletions
@@ -290,12 +290,12 @@
     title: AuraFlowTransformer2DModel
   - local: api/models/cogvideox_transformer3d
     title: CogVideoXTransformer3DModel
-  - local: api/models/consisid_transformer3d
-    title: ConsisIDTransformer3DModel
   - local: api/models/cogview3plus_transformer2d
     title: CogView3PlusTransformer2DModel
   - local: api/models/cogview4_transformer2d
     title: CogView4Transformer2DModel
+  - local: api/models/consisid_transformer3d
+    title: ConsisIDTransformer3DModel
   - local: api/models/dit_transformer2d
     title: DiTTransformer2DModel
   - local: api/models/easyanimate_transformer3d
@@ -310,12 +310,12 @@
     title: HunyuanVideoTransformer3DModel
   - local: api/models/latte_transformer3d
     title: LatteTransformer3DModel
-  - local: api/models/lumina_nextdit2d
-    title: LuminaNextDiT2DModel
-  - local: api/models/lumina2_transformer2d
-    title: Lumina2Transformer2DModel
   - local: api/models/ltx_video_transformer3d
     title: LTXVideoTransformer3DModel
+  - local: api/models/lumina2_transformer2d
+    title: Lumina2Transformer2DModel
+  - local: api/models/lumina_nextdit2d
+    title: LuminaNextDiT2DModel
   - local: api/models/mochi_transformer3d
     title: MochiTransformer3DModel
   - local: api/models/omnigen_transformer
@@ -324,10 +324,10 @@
     title: PixArtTransformer2DModel
   - local: api/models/prior_transformer
     title: PriorTransformer
-  - local: api/models/sd3_transformer2d
-    title: SD3Transformer2DModel
   - local: api/models/sana_transformer2d
     title: SanaTransformer2DModel
+  - local: api/models/sd3_transformer2d
+    title: SD3Transformer2DModel
   - local: api/models/stable_audio_transformer
     title: StableAudioDiTModel
   - local: api/models/transformer2d
@@ -342,10 +342,10 @@
     title: StableCascadeUNet
   - local: api/models/unet
     title: UNet1DModel
-  - local: api/models/unet2d
-    title: UNet2DModel
   - local: api/models/unet2d-cond
     title: UNet2DConditionModel
+  - local: api/models/unet2d
+    title: UNet2DModel
   - local: api/models/unet3d-cond
     title: UNet3DConditionModel
   - local: api/models/unet-motion
@@ -354,6 +354,10 @@
     title: UViT2DModel
   title: UNets
 - sections:
+  - local: api/models/asymmetricautoencoderkl
+    title: AsymmetricAutoencoderKL
+  - local: api/models/autoencoder_dc
+    title: AutoencoderDC
   - local: api/models/autoencoderkl
     title: AutoencoderKL
   - local: api/models/autoencoderkl_allegro
@@ -370,10 +374,6 @@
     title: AutoencoderKLMochi
   - local: api/models/autoencoder_kl_wan
     title: AutoencoderKLWan
-  - local: api/models/asymmetricautoencoderkl
-    title: AsymmetricAutoencoderKL
-  - local: api/models/autoencoder_dc
-    title: AutoencoderDC
   - local: api/models/consistency_decoder_vae
     title: ConsistencyDecoderVAE
   - local: api/models/autoencoder_oobleck
@@ -521,40 +521,40 @@
 - sections:
   - local: api/pipelines/stable_diffusion/overview
     title: Overview
-  - local: api/pipelines/stable_diffusion/text2img
-    title: Text-to-image
+  - local: api/pipelines/stable_diffusion/depth2img
+    title: Depth-to-image
+  - local: api/pipelines/stable_diffusion/gligen
+    title: GLIGEN (Grounded Language-to-Image Generation)
+  - local: api/pipelines/stable_diffusion/image_variation
+    title: Image variation
   - local: api/pipelines/stable_diffusion/img2img
     title: Image-to-image
   - local: api/pipelines/stable_diffusion/svd
     title: Image-to-video
   - local: api/pipelines/stable_diffusion/inpaint
     title: Inpainting
-  - local: api/pipelines/stable_diffusion/depth2img
-    title: Depth-to-image
-  - local: api/pipelines/stable_diffusion/image_variation
-    title: Image variation
+  - local: api/pipelines/stable_diffusion/k_diffusion
+    title: K-Diffusion
+  - local: api/pipelines/stable_diffusion/latent_upscale
+    title: Latent upscaler
+  - local: api/pipelines/stable_diffusion/ldm3d_diffusion
+    title: LDM3D Text-to-(RGB, Depth), Text-to-(RGB-pano, Depth-pano), LDM3D Upscaler
   - local: api/pipelines/stable_diffusion/stable_diffusion_safe
     title: Safe Stable Diffusion
+  - local: api/pipelines/stable_diffusion/sdxl_turbo
+    title: SDXL Turbo
   - local: api/pipelines/stable_diffusion/stable_diffusion_2
     title: Stable Diffusion 2
   - local: api/pipelines/stable_diffusion/stable_diffusion_3
     title: Stable Diffusion 3
   - local: api/pipelines/stable_diffusion/stable_diffusion_xl
     title: Stable Diffusion XL
-  - local: api/pipelines/stable_diffusion/sdxl_turbo
-    title: SDXL Turbo
-  - local: api/pipelines/stable_diffusion/latent_upscale
-    title: Latent upscaler
   - local: api/pipelines/stable_diffusion/upscale
     title: Super-resolution
-  - local: api/pipelines/stable_diffusion/k_diffusion
-    title: K-Diffusion
-  - local: api/pipelines/stable_diffusion/ldm3d_diffusion
-    title: LDM3D Text-to-(RGB, Depth), Text-to-(RGB-pano, Depth-pano), LDM3D Upscaler
   - local: api/pipelines/stable_diffusion/adapter
     title: T2I-Adapter
-  - local: api/pipelines/stable_diffusion/gligen
-    title: GLIGEN (Grounded Language-to-Image Generation)
+  - local: api/pipelines/stable_diffusion/text2img
+    title: Text-to-image
   title: Stable Diffusion
 - local: api/pipelines/stable_unclip
   title: Stable unCLIP

docs/source/en/api/loaders/lora.md

Lines changed: 15 additions & 0 deletions
@@ -25,7 +25,10 @@ LoRA is a fast and lightweight training method that inserts and trains a signifi
 - [`SanaLoraLoaderMixin`] provides similar functions for [Sana](https://huggingface.co/docs/diffusers/main/en/api/pipelines/sana).
 - [`HunyuanVideoLoraLoaderMixin`] provides similar functions for [HunyuanVideo](https://huggingface.co/docs/diffusers/main/en/api/pipelines/hunyuan_video).
 - [`Lumina2LoraLoaderMixin`] provides similar functions for [Lumina2](https://huggingface.co/docs/diffusers/main/en/api/pipelines/lumina2).
+- [`WanLoraLoaderMixin`] provides similar functions for [Wan](https://huggingface.co/docs/diffusers/main/en/api/pipelines/wan).
+- [`CogView4LoraLoaderMixin`] provides similar functions for [CogView4](https://huggingface.co/docs/diffusers/main/en/api/pipelines/cogview4).
 - [`AmusedLoraLoaderMixin`] is for the [`AmusedPipeline`].
+- [`HiDreamImageLoraLoaderMixin`] provides similar functions for [HiDream Image](https://huggingface.co/docs/diffusers/main/en/api/pipelines/hidream)
 - [`LoraBaseMixin`] provides a base class with several utility methods to fuse, unfuse, unload, LoRAs and more.
 
 <Tip>
@@ -77,10 +80,22 @@ To learn more about how to load LoRA weights, see the [LoRA](../../using-diffuse
 
 [[autodoc]] loaders.lora_pipeline.Lumina2LoraLoaderMixin
 
+## CogView4LoraLoaderMixin
+
+[[autodoc]] loaders.lora_pipeline.CogView4LoraLoaderMixin
+
+## WanLoraLoaderMixin
+
+[[autodoc]] loaders.lora_pipeline.WanLoraLoaderMixin
+
 ## AmusedLoraLoaderMixin
 
 [[autodoc]] loaders.lora_pipeline.AmusedLoraLoaderMixin
 
+## HiDreamImageLoraLoaderMixin
+
+[[autodoc]] loaders.lora_pipeline.HiDreamImageLoraLoaderMixin
+
 ## LoraBaseMixin
 
 [[autodoc]] loaders.lora_base.LoraBaseMixin
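For orientation, a minimal sketch of how one of the newly documented mixins is exercised in practice. The pipeline class and `load_lora_weights`/`set_adapters` methods exist in diffusers; the checkpoint and LoRA repository ids below are illustrative placeholders, not anything referenced by this commit.

```python
import torch
from diffusers import WanPipeline

# WanPipeline mixes in WanLoraLoaderMixin, so LoRA adapters attach through
# the standard loader methods documented above.
pipe = WanPipeline.from_pretrained(
    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",  # assumed checkpoint id
    torch_dtype=torch.bfloat16,
).to("cuda")

# Hypothetical LoRA repository, shown only to illustrate the call signature.
pipe.load_lora_weights("your-username/wan-style-lora", adapter_name="style")
pipe.set_adapters(["style"], adapter_weights=[0.8])

video = pipe(prompt="a paper boat drifting down a rainy street, cinematic").frames[0]
```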

docs/source/en/api/pipelines/aura_flow.md

Lines changed: 2 additions & 0 deletions
@@ -100,6 +100,8 @@ AuraFlow can be compiled with `torch.compile()` to speed up inference latency ev
 )
 ```
 
+Specifying `use_duck_shape` to be `False` instructs the compiler if it should use the same symbolic variable to represent input sizes that are the same. For more details, check out [this comment](https://github.com/huggingface/diffusers/pull/11327#discussion_r2047659790).
+
 This enables from 100% (on low resolutions) to a 30% (on 1536x1536 resolution) speed improvements.
 
 Thanks to [AstraliteHeart](https://github.com/huggingface/diffusers/pull/11297/) who helped us rewrite the [`AuraFlowTransformer2DModel`] class so that the above works for different resolutions ([PR](https://github.com/huggingface/diffusers/pull/11297/)).
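To make the new paragraph concrete, here is a minimal sketch of the compile setup it refers to. The `use_duck_shape` toggle is assumed to live at `torch.fx.experimental._config` (per the linked PR discussion) and its location may differ across PyTorch versions; the checkpoint id is the public AuraFlow repository.

```python
import torch
from diffusers import AuraFlowPipeline

# Assumed flag location: give each dynamic input size its own symbolic
# variable instead of "duck-sizing" equal sizes together.
torch.fx.experimental._config.use_duck_shape = False

pipeline = AuraFlowPipeline.from_pretrained(
    "fal/AuraFlow", torch_dtype=torch.bfloat16
).to("cuda")

# dynamic=True lets the compiled transformer serve multiple resolutions
# without retracing for each one.
pipeline.transformer = torch.compile(pipeline.transformer, fullgraph=True, dynamic=True)

image = pipeline(prompt="a watercolor fox in a misty forest").images[0]
```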

docs/source/en/api/pipelines/wan.md

Lines changed: 54 additions & 0 deletions
@@ -133,6 +133,60 @@ output = pipe(
 export_to_video(output, "wan-i2v.mp4", fps=16)
 ```
 
+### First and Last Frame Interpolation
+
+```python
+import numpy as np
+import torch
+import torchvision.transforms.functional as TF
+from diffusers import AutoencoderKLWan, WanImageToVideoPipeline
+from diffusers.utils import export_to_video, load_image
+from transformers import CLIPVisionModel
+
+
+model_id = "Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers"
+image_encoder = CLIPVisionModel.from_pretrained(model_id, subfolder="image_encoder", torch_dtype=torch.float32)
+vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
+pipe = WanImageToVideoPipeline.from_pretrained(
+    model_id, vae=vae, image_encoder=image_encoder, torch_dtype=torch.bfloat16
+)
+pipe.to("cuda")
+
+first_frame = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png")
+last_frame = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png")
+
+def aspect_ratio_resize(image, pipe, max_area=720 * 1280):
+    aspect_ratio = image.height / image.width
+    mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
+    height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
+    width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
+    image = image.resize((width, height))
+    return image, height, width
+
+def center_crop_resize(image, height, width):
+    # Calculate resize ratio to match first frame dimensions
+    resize_ratio = max(width / image.width, height / image.height)
+
+    # Resize the image
+    width = round(image.width * resize_ratio)
+    height = round(image.height * resize_ratio)
+    size = [width, height]
+    image = TF.center_crop(image, size)
+
+    return image, height, width
+
+first_frame, height, width = aspect_ratio_resize(first_frame, pipe)
+if last_frame.size != first_frame.size:
+    last_frame, _, _ = center_crop_resize(last_frame, height, width)
+
+prompt = "CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, low-angle perspective."
+
+output = pipe(
+    image=first_frame, last_image=last_frame, prompt=prompt, height=height, width=width, guidance_scale=5.5
+).frames[0]
+export_to_video(output, "output.mp4", fps=16)
+```
+
 ### Video to Video Generation
 
 ```python
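A quick worked check of the resizing arithmetic in `aspect_ratio_resize` above. The frame size is an illustrative assumption, and `mod_value = 16` assumes a spatial VAE scale factor of 8 and a patch size of 2; in practice these are read from the loaded pipeline, as the snippet does.

```python
import math

# Hypothetical 768x1024 (width x height) portrait first frame.
max_area = 720 * 1280                    # 921600 pixels
aspect_ratio = 1024 / 768                # ~1.333
mod_value = 8 * 2                        # vae_scale_factor_spatial * patch_size[1] (assumed)

height = round(math.sqrt(max_area * aspect_ratio)) // mod_value * mod_value  # 1109 -> 1104
width = round(math.sqrt(max_area / aspect_ratio)) // mod_value * mod_value   # 831 -> 816

# Both sides end up as multiples of 16 and the area (816 * 1104 = 900864)
# stays just under max_area, so the video latents tile cleanly.
```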

examples/advanced_diffusion_training/train_dreambooth_lora_flux_advanced.py

Lines changed: 18 additions & 8 deletions
@@ -1915,17 +1915,22 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers):
     free_memory()
 
     # Scheduler and math around the number of training steps.
-    overrode_max_train_steps = False
-    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    # Check the PR https://github.com/huggingface/diffusers/pull/8312 for detailed explanation.
+    num_warmup_steps_for_scheduler = args.lr_warmup_steps * accelerator.num_processes
     if args.max_train_steps is None:
-        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-        overrode_max_train_steps = True
+        len_train_dataloader_after_sharding = math.ceil(len(train_dataloader) / accelerator.num_processes)
+        num_update_steps_per_epoch = math.ceil(len_train_dataloader_after_sharding / args.gradient_accumulation_steps)
+        num_training_steps_for_scheduler = (
+            args.num_train_epochs * accelerator.num_processes * num_update_steps_per_epoch
+        )
+    else:
+        num_training_steps_for_scheduler = args.max_train_steps * accelerator.num_processes
 
     lr_scheduler = get_scheduler(
         args.lr_scheduler,
         optimizer=optimizer,
-        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
-        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_warmup_steps=num_warmup_steps_for_scheduler,
+        num_training_steps=num_training_steps_for_scheduler,
         num_cycles=args.lr_num_cycles,
         power=args.lr_power,
     )
@@ -1949,7 +1954,6 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers):
             lr_scheduler,
         )
     else:
-        print("I SHOULD BE HERE")
         transformer, text_encoder_one, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
             transformer, text_encoder_one, optimizer, train_dataloader, lr_scheduler
         )
@@ -1961,8 +1965,14 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers):
 
     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    if overrode_max_train_steps:
+    if args.max_train_steps is None:
         args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    if num_training_steps_for_scheduler != args.max_train_steps:
+        logger.warning(
+            f"The length of the 'train_dataloader' after 'accelerator.prepare' ({len(train_dataloader)}) does not match "
+            f"the expected length ({len_train_dataloader_after_sharding}) when the learning rate scheduler was created. "
+            f"This inconsistency may result in the learning rate scheduler not functioning properly."
+        )
     # Afterwards we recalculate our number of training epochs
     args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
 
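The scheduler bookkeeping above is easier to follow outside the diff, so here is the same computation isolated with illustrative numbers (dataset size, batch size, process count, and epoch count are assumptions, not values from the script):

```python
import math

# Assumed setup: 1000 samples, batch size 4, 2 processes,
# gradient accumulation 2, 10 epochs, 500 warmup steps.
dataset_len, batch_size = 1000, 4
num_processes, grad_accum = 2, 2
num_train_epochs, lr_warmup_steps = 10, 500

len_train_dataloader = math.ceil(dataset_len / batch_size)               # 250 batches total
len_after_sharding = math.ceil(len_train_dataloader / num_processes)     # 125 batches per process
num_update_steps_per_epoch = math.ceil(len_after_sharding / grad_accum)  # 63 optimizer updates

# accelerate steps the scheduler once per process per update, so both the
# warmup and the total step budget are scaled by num_processes.
num_warmup_steps_for_scheduler = lr_warmup_steps * num_processes         # 1000
num_training_steps_for_scheduler = (
    num_train_epochs * num_processes * num_update_steps_per_epoch        # 1260
)
```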

examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py

Lines changed: 1 addition & 2 deletions
@@ -33,7 +33,6 @@
 from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
 from diffusers.loaders import (
     FromSingleFileMixin,
-    StableDiffusionLoraLoaderMixin,
     StableDiffusionXLLoraLoaderMixin,
     TextualInversionLoaderMixin,
 )
@@ -300,7 +299,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
 
 
 class StableDiffusionXLControlNetAdapterInpaintPipeline(
-    DiffusionPipeline, StableDiffusionMixin, FromSingleFileMixin, StableDiffusionLoraLoaderMixin
+    DiffusionPipeline, StableDiffusionMixin, FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin
 ):
     r"""
     Pipeline for text-to-image generation using Stable Diffusion augmented with T2I-Adapter

examples/controlnet/README_flux.md

Lines changed: 15 additions & 2 deletions
@@ -6,7 +6,19 @@ Training script provided by LibAI, which is an institution dedicated to the prog
 > [!NOTE]
 > **Memory consumption**
 >
-> Flux can be quite expensive to run on consumer hardware devices and as a result, ControlNet training of it comes with higher memory requirements than usual.
+> Flux can be quite expensive to run on consumer hardware devices and as a result, ControlNet training of it comes with higher memory requirements than usual.
+
+Here is a gpu memory consumption for reference, tested on a single A100 with 80G.
+
+| period | GPU |
+| - | - |
+| load as float32 | ~70G |
+| mv transformer and vae to bf16 | ~48G |
+| pre compute txt embeddings | ~62G |
+| **offload te to cpu** | ~30G |
+| training | ~58G |
+| validation | ~71G |
+
 
 > **Gated access**
 >
@@ -98,8 +110,9 @@ accelerate launch train_controlnet_flux.py \
   --validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \
   --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \
   --train_batch_size=1 \
-  --gradient_accumulation_steps=4 \
+  --gradient_accumulation_steps=16 \
   --report_to="wandb" \
+  --lr_scheduler="cosine" \
   --num_double_layers=4 \
   --num_single_layers=0 \
   --seed=42 \

examples/controlnet/train_controlnet_flux.py

Lines changed: 3 additions & 4 deletions
@@ -148,7 +148,7 @@ def log_validation(
             pooled_prompt_embeds=pooled_prompt_embeds,
             control_image=validation_image,
             num_inference_steps=28,
-            controlnet_conditioning_scale=0.7,
+            controlnet_conditioning_scale=1,
             guidance_scale=3.5,
             generator=generator,
         ).images[0]
@@ -1085,8 +1085,6 @@ def compute_embeddings(batch, proportion_empty_prompts, flux_controlnet_pipeline
         return {"prompt_embeds": prompt_embeds, "pooled_prompt_embeds": pooled_prompt_embeds, "text_ids": text_ids}
 
     train_dataset = get_train_dataset(args, accelerator)
-    text_encoders = [text_encoder_one, text_encoder_two]
-    tokenizers = [tokenizer_one, tokenizer_two]
     compute_embeddings_fn = functools.partial(
         compute_embeddings,
         flux_controlnet_pipeline=flux_controlnet_pipeline,
@@ -1103,7 +1101,8 @@ def compute_embeddings(batch, proportion_empty_prompts, flux_controlnet_pipeline
         compute_embeddings_fn, batched=True, new_fingerprint=new_fingerprint, batch_size=50
     )
 
-    del text_encoders, tokenizers, text_encoder_one, text_encoder_two, tokenizer_one, tokenizer_two
+    text_encoder_one.to("cpu")
+    text_encoder_two.to("cpu")
     free_memory()
 
     # Then get the training dataset ready to be passed to the dataloader.
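The last hunk swaps deletion of the text encoders for a CPU offload, so they stay usable later while their GPU memory is reclaimed (this corresponds to the "offload te to cpu" row in the README table above). A minimal sketch of the pattern, with the cleanup written out explicitly as an assumption about what a `free_memory()` helper does:

```python
import gc
import torch

def offload_to_cpu(*modules):
    # Keep the modules alive for later reuse (e.g. validation) instead of
    # deleting them; only their parameters leave GPU memory.
    for module in modules:
        module.to("cpu")
    # Roughly what a free_memory() helper is assumed to do.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# offload_to_cpu(text_encoder_one, text_encoder_two)
```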
