@@ -3,6 +3,7 @@
 import itertools
 import random
 import json
+import logging
 import math
 import os
 from contextlib import nullcontext
@@ -19,6 +20,7 @@
 from accelerate.utils import set_seed
 from diffusers import AutoencoderKL, DDIMScheduler, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel
 from diffusers.optimization import get_scheduler
+from diffusers.utils.import_utils import is_xformers_available
 from huggingface_hub import HfFolder, Repository, whoami
 from PIL import Image
 from torchvision import transforms
@@ -111,7 +113,7 @@ def parse_args(input_args=None):
     parser.add_argument(
         "--save_infer_steps",
         type=int,
-        default=50,
+        default=20,
         help="The number of inference steps used when saving sample images.",
     )
     parser.add_argument(
@@ -252,6 +254,11 @@ def parse_args(input_args=None):
         default=None,
         help="Path to json containing multiple concepts, will overwrite parameters like instance_prompt, class_prompt, etc.",
     )
+    parser.add_argument(
+        "--read_prompts_from_txts",
+        action="store_true",
+        help="Use a separate prompt per image. Put prompt files next to the images, e.g. for image.png create image.png.txt.",
+    )
 
     if input_args is not None:
         args = parser.parse_args(input_args)
@@ -280,19 +287,26 @@ def __init__(
         center_crop=False,
         num_class_images=None,
         pad_tokens=False,
-        hflip=False
+        hflip=False,
+        read_prompts_from_txts=False,
     ):
         self.size = size
         self.center_crop = center_crop
         self.tokenizer = tokenizer
         self.with_prior_preservation = with_prior_preservation
         self.pad_tokens = pad_tokens
+        self.read_prompts_from_txts = read_prompts_from_txts
 
         self.instance_images_path = []
         self.class_images_path = []
 
         for concept in concepts_list:
-            inst_img_path = [(x, concept["instance_prompt"]) for x in Path(concept["instance_data_dir"]).iterdir() if x.is_file()]
+            inst_img_path = [
+                (x, concept["instance_prompt"])
+                for x in Path(concept["instance_data_dir"]).iterdir()
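+                # exclude the companion .txt prompt files so they are not loaded as images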
+                if x.is_file() and not str(x).endswith(".txt")
+            ]
             self.instance_images_path.extend(inst_img_path)
 
         if with_prior_preservation:
@@ -320,9 +333,16 @@ def __len__(self):
     def __getitem__(self, index):
         example = {}
         instance_path, instance_prompt = self.instance_images_path[index % self.num_instance_images]
+
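+        # with --read_prompts_from_txts, each image's prompt lives in a companion file (image.png -> image.png.txt)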
+        if self.read_prompts_from_txts:
+            with open(str(instance_path) + ".txt") as f:
+                instance_prompt = f.read().strip()
+
         instance_image = Image.open(instance_path)
         if not instance_image.mode == "RGB":
             instance_image = instance_image.convert("RGB")
+
         example["instance_images"] = self.image_transforms(instance_image)
         example["instance_prompt_ids"] = self.tokenizer(
             instance_prompt,
@@ -410,6 +429,13 @@ def main(args):
         logging_dir=logging_dir,
     )
 
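+    # timestamped log records make interleaved accelerate/diffusers output easier to follow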
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+
     # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
     # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models.
     # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate.
@@ -457,6 +482,11 @@ def main(args):
             safety_checker=None,
             revision=args.revision
         )
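+        # DDIM produces usable class images in far fewer steps than the pipeline's stock
+        # scheduler, which is presumably why --save_infer_steps now defaults to 20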
+        pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
+        if is_xformers_available():
+            pipeline.enable_xformers_memory_efficient_attention()
         pipeline.set_progress_bar_config(disable=True)
         pipeline.to(accelerator.device)
 
@@ -472,7 +500,11 @@ def main(args):
             for example in tqdm(
                 sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
             ):
-                images = pipeline(example["prompt"]).images
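+                # pass --save_infer_steps through; otherwise the pipeline falls back to its 50-step default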
+                images = pipeline(
+                    example["prompt"],
+                    num_inference_steps=args.save_infer_steps
+                ).images
 
                 for i, image in enumerate(images):
                     hash_image = hashlib.sha1(image.tobytes()).hexdigest()
@@ -518,6 +549,13 @@ def main(args):
     if not args.train_text_encoder:
         text_encoder.requires_grad_(False)
 
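+    # memory-efficient attention is a large VRAM saving during training when xformers is installed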
+    if is_xformers_available():
+        vae.enable_xformers_memory_efficient_attention()
+        unet.enable_xformers_memory_efficient_attention()
+    else:
+        logger.warning("xformers is not available. Make sure it is installed correctly")
+
     if args.gradient_checkpointing:
         unet.enable_gradient_checkpointing()
         if args.train_text_encoder:
@@ -562,7 +599,8 @@ def main(args):
         center_crop=args.center_crop,
         num_class_images=args.num_class_images,
         pad_tokens=args.pad_tokens,
-        hflip=args.hflip
+        hflip=args.hflip,
+        read_prompts_from_txts=args.read_prompts_from_txts,
     )
 
     def collate_fn(examples):
@@ -679,24 +717,28 @@ def save_weights(step):
         # Create the pipeline using the trained modules and save it.
         if accelerator.is_main_process:
             if args.train_text_encoder:
-                text_enc_model = accelerator.unwrap_model(text_encoder)
+                text_enc_model = accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=True)
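+                # keep_fp32_wrapper=True preserves accelerate's mixed-precision forward wrapper,
+                # so saving a checkpoint mid-run does not disturb the models still being trained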
             else:
                 text_enc_model = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision)
-            scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
             pipeline = StableDiffusionPipeline.from_pretrained(
                 args.pretrained_model_name_or_path,
-                unet=accelerator.unwrap_model(unet),
+                unet=accelerator.unwrap_model(unet, keep_fp32_wrapper=True),
                 text_encoder=text_enc_model,
                 vae=AutoencoderKL.from_pretrained(
                     args.pretrained_vae_name_or_path or args.pretrained_model_name_or_path,
                     subfolder=None if args.pretrained_vae_name_or_path else "vae",
                     revision=None if args.pretrained_vae_name_or_path else args.revision,
                 ),
                 safety_checker=None,
-                scheduler=scheduler,
                 torch_dtype=torch.float16,
                 revision=args.revision,
             )
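+            # derive DDIM from the model's own scheduler config instead of hard-coded betas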
+            pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
+            if is_xformers_available():
+                pipeline.enable_xformers_memory_efficient_attention()
             save_dir = os.path.join(args.output_dir, f"{step}")
             pipeline.save_pretrained(save_dir)
             with open(os.path.join(save_dir, "args.json"), "w") as f:
@@ -765,23 +804,33 @@ def save_weights(step):
                 encoder_hidden_states = text_encoder(batch["input_ids"])[0]
 
                 # Predict the noise residual
-                noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
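+                # v-prediction checkpoints (e.g. Stable Diffusion 2.x 768 models) predict velocity rather than noise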
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
 
                 if args.with_prior_preservation:
-                    # Chunk the noise and noise_pred into two parts and compute the loss on each part separately.
-                    noise_pred, noise_pred_prior = torch.chunk(noise_pred, 2, dim=0)
-                    noise, noise_prior = torch.chunk(noise, 2, dim=0)
+                    # Chunk the target and model_pred into two parts and compute the loss on each part separately.
+                    model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
+                    target, target_prior = torch.chunk(target, 2, dim=0)
 
                     # Compute instance loss
-                    loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="none").mean([1, 2, 3]).mean()
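+                    # "mean" reduction equals the old per-sample mean([1, 2, 3]).mean() when all samples share a shape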
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
 
                     # Compute prior loss
-                    prior_loss = F.mse_loss(noise_pred_prior.float(), noise_prior.float(), reduction="mean")
+                    prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
 
                     # Add the prior loss to the instance loss.
                     loss = loss + args.prior_loss_weight * prior_loss
                 else:
-                    loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="mean")
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
 
                 accelerator.backward(loss)
                 # if accelerator.sync_gradients: