fffiloni committed
Commit 3650c12 · verified · 1 parent: 9b15738

Migrated from GitHub

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +3 -0
  2. LICENSE +201 -0
  3. ORIGINAL_README.md +158 -0
  4. assets/demo1_audio.wav +0 -0
  5. assets/demo1_video.mp4 +3 -0
  6. assets/demo2_audio.wav +0 -0
  7. assets/demo2_video.mp4 +3 -0
  8. assets/demo3_audio.wav +0 -0
  9. assets/demo3_video.mp4 +3 -0
  10. assets/framework.png +0 -0
  11. configs/audio.yaml +23 -0
  12. configs/scheduler_config.json +13 -0
  13. configs/syncnet/syncnet_16_latent.yaml +46 -0
  14. configs/syncnet/syncnet_16_pixel.yaml +45 -0
  15. configs/syncnet/syncnet_25_pixel.yaml +45 -0
  16. configs/unet/first_stage.yaml +103 -0
  17. configs/unet/second_stage.yaml +103 -0
  18. data_processing_pipeline.sh +9 -0
  19. eval/detectors/README.md +3 -0
  20. eval/detectors/__init__.py +1 -0
  21. eval/detectors/s3fd/__init__.py +61 -0
  22. eval/detectors/s3fd/box_utils.py +221 -0
  23. eval/detectors/s3fd/nets.py +174 -0
  24. eval/draw_syncnet_lines.py +70 -0
  25. eval/eval_fvd.py +96 -0
  26. eval/eval_sync_conf.py +77 -0
  27. eval/eval_sync_conf.sh +2 -0
  28. eval/eval_syncnet_acc.py +118 -0
  29. eval/eval_syncnet_acc.sh +3 -0
  30. eval/fvd.py +56 -0
  31. eval/hyper_iqa.py +343 -0
  32. eval/inference_videos.py +37 -0
  33. eval/syncnet/__init__.py +1 -0
  34. eval/syncnet/syncnet.py +113 -0
  35. eval/syncnet/syncnet_eval.py +220 -0
  36. eval/syncnet_detect.py +251 -0
  37. inference.sh +9 -0
  38. latentsync/data/syncnet_dataset.py +153 -0
  39. latentsync/data/unet_dataset.py +164 -0
  40. latentsync/models/attention.py +492 -0
  41. latentsync/models/motion_module.py +332 -0
  42. latentsync/models/resnet.py +234 -0
  43. latentsync/models/syncnet.py +233 -0
  44. latentsync/models/syncnet_wav2lip.py +90 -0
  45. latentsync/models/unet.py +528 -0
  46. latentsync/models/unet_blocks.py +903 -0
  47. latentsync/models/utils.py +19 -0
  48. latentsync/pipelines/lipsync_pipeline.py +470 -0
  49. latentsync/trepa/__init__.py +64 -0
  50. latentsync/trepa/third_party/VideoMAEv2/__init__.py +0 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/demo1_video.mp4 filter=lfs diff=lfs merge=lfs -text
+ assets/demo2_video.mp4 filter=lfs diff=lfs merge=lfs -text
+ assets/demo3_video.mp4 filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
ORIGINAL_README.md ADDED
@@ -0,0 +1,158 @@
+ # LatentSync: Audio Conditioned Latent Diffusion Models for Lip Sync
+
+ <div align="center">
+
+ [![arXiv](https://img.shields.io/badge/arXiv_paper-2412.09262-b31b1b)](https://arxiv.org/abs/2412.09262)
+
+ </div>
+
+ ## 📖 Abstract
+
+ We present *LatentSync*, an end-to-end lip sync framework based on audio-conditioned latent diffusion models, without any intermediate motion representation, diverging from previous diffusion-based lip sync methods based on pixel-space diffusion or two-stage generation. Our framework can leverage the powerful capabilities of Stable Diffusion to directly model complex audio-visual correlations. Additionally, we found that diffusion-based lip sync methods exhibit inferior temporal consistency due to the inconsistency of the diffusion process across different frames. We propose *Temporal REPresentation Alignment (TREPA)* to enhance temporal consistency while preserving lip-sync accuracy. TREPA uses temporal representations extracted by large-scale self-supervised video models to align the generated frames with the ground-truth frames.
+
+ ## 🏗️ Framework
+
+ <p align="center">
+ <img src="assets/framework.png" width=100%>
+ </p>
+
+ LatentSync uses Whisper to convert the mel spectrogram into audio embeddings, which are then integrated into the U-Net via cross-attention layers. The reference and masked frames are channel-wise concatenated with the noised latents as the input to the U-Net. During training, we use a one-step method to obtain estimated clean latents from the predicted noise, which are then decoded to obtain the estimated clean frames. The TREPA, LPIPS, and SyncNet losses are applied in pixel space.
+
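As a rough sketch of the one-step clean-latent estimate described above (not the repository's exact code; the tensor names and the (b, c, f, h, w) layout are assumptions), an epsilon-prediction diffusion model recovers the clean latents as:

```python
import torch

def estimate_clean_latents(noisy_latents, pred_noise, alphas_cumprod, timesteps):
    # One-step x0 estimate for an epsilon-prediction model:
    #   x0_hat = (x_t - sqrt(1 - alpha_bar_t) * eps_hat) / sqrt(alpha_bar_t)
    alpha_bar = alphas_cumprod[timesteps].view(-1, 1, 1, 1, 1)  # broadcast over (b, c, f, h, w)
    return (noisy_latents - (1.0 - alpha_bar).sqrt() * pred_noise) / alpha_bar.sqrt()
```

The estimated latents would then be decoded by the VAE so that the TREPA, LPIPS, and SyncNet losses can be computed on the resulting frames.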
+ ## 🎬 Demo
+
+ <table class="center">
+ <tr style="font-weight: bolder;text-align:center;">
+ <td width="50%"><b>Original video</b></td>
+ <td width="50%"><b>Lip-synced video</b></td>
+ </tr>
+ <tr>
+ <td>
+ <video src=https://github.com/user-attachments/assets/ff3a84da-dc9b-498a-950f-5c54f58dd5c5 controls preload></video>
+ </td>
+ <td>
+ <video src=https://github.com/user-attachments/assets/150e00fd-381e-4421-a478-a9ea3d1212a8 controls preload></video>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <video src=https://github.com/user-attachments/assets/32c830a9-4d7d-4044-9b33-b184d8e11010 controls preload></video>
+ </td>
+ <td>
+ <video src=https://github.com/user-attachments/assets/84e4fe9d-b108-44a4-8712-13a012348145 controls preload></video>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <video src=https://github.com/user-attachments/assets/7510a448-255a-44ee-b093-a1b98bd3961d controls preload></video>
+ </td>
+ <td>
+ <video src=https://github.com/user-attachments/assets/6150c453-c559-4ae0-bb00-c565f135ff41 controls preload></video>
+ </td>
+ </tr>
+ <tr>
+ <td width=300px>
+ <video src=https://github.com/user-attachments/assets/0f7f9845-68b2-4165-bd08-c7bbe01a0e52 controls preload></video>
+ </td>
+ <td width=300px>
+ <video src=https://github.com/user-attachments/assets/c34fe89d-0c09-4de3-8601-3d01229a69e3 controls preload></video>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <video src=https://github.com/user-attachments/assets/7ce04d50-d39f-4154-932a-ec3a590a8f64 controls preload></video>
+ </td>
+ <td>
+ <video src=https://github.com/user-attachments/assets/70bde520-42fa-4a0e-b66c-d3040ae5e065 controls preload></video>
+ </td>
+ </tr>
+ </table>
+
+ (Photorealistic videos are filmed by contracted models; anime videos are from [VASA-1](https://www.microsoft.com/en-us/research/project/vasa-1/) and [EMO](https://humanaigc.github.io/emote-portrait-alive/).)
+
+ ## 📑 Open-source Plan
+
+ - [x] Inference code and checkpoints
+ - [x] Data processing pipeline
+ - [x] Training code
+
+ ## 🔧 Setting up the Environment
+
+ Install the required packages and download the checkpoints via:
+
+ ```bash
+ source setup_env.sh
+ ```
+
+ If the download is successful, the checkpoints should appear as follows:
+
+ ```
+ ./checkpoints/
+ |-- latentsync_unet.pt
+ |-- latentsync_syncnet.pt
+ |-- whisper
+ |   `-- tiny.pt
+ |-- auxiliary
+ |   |-- 2DFAN4-cd938726ad.zip
+ |   |-- i3d_torchscript.pt
+ |   |-- koniq_pretrained.pkl
+ |   |-- s3fd-619a316812.pth
+ |   |-- sfd_face.pth
+ |   |-- syncnet_v2.model
+ |   |-- vgg16-397923af.pth
+ |   `-- vit_g_hybrid_pt_1200e_ssv2_ft.pth
+ ```
+
+ These include all the checkpoints required for LatentSync training and inference. If you just want to try inference, you only need to download `latentsync_unet.pt` and `tiny.pt` from our [HuggingFace repo](https://huggingface.co/chunyu-li/LatentSync), for example with `huggingface_hub` as sketched below.
+
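A minimal way to fetch just those two files (a sketch; it assumes the files are stored in the HuggingFace repo under the same names as in the tree above):

```python
from huggingface_hub import hf_hub_download

for filename in ["latentsync_unet.pt", "whisper/tiny.pt"]:
    # Downloads each file into ./checkpoints, preserving its relative path.
    hf_hub_download(repo_id="chunyu-li/LatentSync", filename=filename, local_dir="checkpoints")
```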
+ ## 🚀 Inference
+
+ Run the following script for inference; it requires about 6.5 GB of GPU memory.
+
+ ```bash
+ ./inference.sh
+ ```
+
+ You can set the parameter `guidance_scale` to 1.5 to improve lip-sync accuracy.
+
+ ## 🔄 Data Processing Pipeline
+
+ The complete data processing pipeline includes the following steps:
+
+ 1. Remove broken video files.
+ 2. Resample the video to 25 FPS and the audio to 16000 Hz (a minimal example of this step is sketched after this list).
+ 3. Detect scene boundaries via [PySceneDetect](https://github.com/Breakthrough/PySceneDetect).
+ 4. Split each video into 5-10 second segments.
+ 5. Remove videos where the face is smaller than 256 $\times$ 256, as well as videos with more than one face.
+ 6. Affine transform the faces according to the landmarks detected by [face-alignment](https://github.com/1adrianb/face-alignment), then resize to 256 $\times$ 256.
+ 7. Remove videos with a [sync confidence score](https://www.robots.ox.ac.uk/~vgg/publications/2016/Chung16a/chung16a.pdf) lower than 3, and adjust the audio-visual offset to 0.
+ 8. Calculate the [hyperIQA](https://openaccess.thecvf.com/content_CVPR_2020/papers/Su_Blindly_Assess_Image_Quality_in_the_Wild_Guided_by_a_CVPR_2020_paper.pdf) score, and remove videos with scores lower than 40.
+
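A minimal sketch of step 2 (not the pipeline's actual implementation; the paths are placeholders), calling ffmpeg from Python:

```python
import subprocess

def resample(src: str, dst: str) -> None:
    # Re-encode to 25 fps video and 16 kHz audio, as required by the later steps.
    subprocess.run(["ffmpeg", "-y", "-i", src, "-r", "25", "-ar", "16000", dst], check=True)

resample("raw/clip.mp4", "resampled/clip.mp4")
```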
+ Run the script to execute the data processing pipeline:
+
+ ```bash
+ ./data_processing_pipeline.sh
+ ```
+
+ You can change the parameter `input_dir` in the script to specify the data directory to be processed. The processed data will be saved in the same directory. Each step generates a new directory, so you do not need to redo the entire pipeline if the process is interrupted by an unexpected error.
+
+ ## 🏋️‍♂️ Training U-Net
+
+ Before training, you must process the data as described above and download all the checkpoints. We released a pretrained SyncNet with 94% accuracy on the VoxCeleb2 dataset to supervise U-Net training. Note that this SyncNet is trained on affine-transformed videos, so when using or evaluating it, you need to apply the affine transformation to the video first (the affine transformation code is included in the data processing pipeline). A minimal sketch of this kind of supervision is shown below.
+
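As an illustration only (not the released model's architecture or loss code), SyncNet-style supervision typically scores audio and visual embeddings with cosine similarity followed by a binary cross-entropy loss:

```python
import torch.nn.functional as F

def sync_loss(audio_emb, visual_emb, labels):
    # audio_emb, visual_emb: (batch, dim) embeddings; labels: 1 for in-sync pairs, 0 for shifted pairs.
    sim = F.cosine_similarity(audio_emb, visual_emb, dim=1)    # in [-1, 1]
    prob = ((sim + 1.0) / 2.0).clamp(1e-7, 1.0 - 1e-7)         # map to (0, 1)
    return F.binary_cross_entropy(prob, labels.float())
```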
+ If all the preparations are complete, you can train the U-Net with the following script:
+
+ ```bash
+ ./train_unet.sh
+ ```
+
+ You should change the parameters in the U-Net config file to specify the data directory, checkpoint save path, and other training hyperparameters.
+
+ ## 🏋️‍♂️ Training SyncNet
+
+ In case you want to train SyncNet on your own datasets, you can run the following script. The data processing pipeline for SyncNet is the same as for the U-Net.
+
+ ```bash
+ ./train_syncnet.sh
+ ```
+
+ After every `validation_steps` training steps, the loss charts are saved in `train_output_dir`. They contain both the training and validation losses.
assets/demo1_audio.wav ADDED
Binary file (307 kB).
assets/demo1_video.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ed2dd1e2001aa605c3f2d77672a8af4ed55e427a85c55d408adfc3d5076bc872
+ size 1240008
assets/demo2_audio.wav ADDED
Binary file (635 kB).
assets/demo2_video.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8c3f10288e0642e587a95c0040e6966f8f6b7e003c3a17b572f72472b896d8ff
+ size 1772492
assets/demo3_audio.wav ADDED
Binary file (594 kB).
assets/demo3_video.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfa177b2a44f7809f606285c120e270d526caa50d708ec95e0f614d220970e0f
+ size 2112370
assets/framework.png ADDED
configs/audio.yaml ADDED
@@ -0,0 +1,23 @@
+ audio:
+   num_mels: 80 # Number of mel-spectrogram channels and local conditioning dimensionality
+   rescale: true # Whether to rescale audio prior to preprocessing
+   rescaling_max: 0.9 # Rescaling value
+   use_lws:
+     false # Use LWS (https://github.com/Jonathan-LeRoux/lws) for STFT and phase reconstruction
+     # It's preferred to set this to true for use with https://github.com/r9y9/wavenet_vocoder
+     # Does not work if n_fft is not a multiple of hop_size!!
+   n_fft: 800 # Extra window size is filled with 0 paddings to match this parameter
+   hop_size: 200 # For 16000 Hz, 200 = 12.5 ms (0.0125 * sample_rate)
+   win_size: 800 # For 16000 Hz, 800 = 50 ms (if None, win_size = n_fft) (0.05 * sample_rate)
+   sample_rate: 16000 # 16000 Hz (corresponding to LibriSpeech) (sox --i <filename>)
+   frame_shift_ms: null
+   signal_normalization: true
+   allow_clipping_in_normalization: true
+   symmetric_mels: true
+   max_abs_value: 4.0
+   preemphasize: true # whether to apply the preemphasis filter
+   preemphasis: 0.97 # filter coefficient
+   min_level_db: -100
+   ref_level_db: 20
+   fmin: 55
+   fmax: 7600
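A quick, hedged sanity check (simple arithmetic, not code from the repository): these values give 80 mel frames per second, so the 16-frame, 25-fps training window used in the SyncNet and U-Net configs corresponds roughly to the 52-frame mel input listed there.

```python
sample_rate = 16000
hop_size = 200
video_fps = 25
num_frames = 16

mel_fps = sample_rate / hop_size        # 80 mel frames per second
chunk_seconds = num_frames / video_fps  # 0.64 s of video per training window
print(mel_fps * chunk_seconds)          # 51.2, padded/rounded to the (1, 80, 52) audio encoder input
```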
configs/scheduler_config.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "_class_name": "DDIMScheduler",
+   "_diffusers_version": "0.6.0.dev0",
+   "beta_end": 0.012,
+   "beta_schedule": "scaled_linear",
+   "beta_start": 0.00085,
+   "clip_sample": false,
+   "num_train_timesteps": 1000,
+   "set_alpha_to_one": false,
+   "steps_offset": 1,
+   "trained_betas": null,
+   "skip_prk_steps": true
+ }
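One way to instantiate this scheduler with `diffusers` (a sketch; the pipeline may construct it differently):

```python
import json
from diffusers import DDIMScheduler

with open("configs/scheduler_config.json") as f:
    config = json.load(f)

# from_config ignores keys the scheduler does not use (e.g. skip_prk_steps).
scheduler = DDIMScheduler.from_config(config)
```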
configs/syncnet/syncnet_16_latent.yaml ADDED
@@ -0,0 +1,46 @@
+ model:
+   audio_encoder: # input (1, 80, 52)
+     in_channels: 1
+     block_out_channels: [32, 64, 128, 256, 512, 1024]
+     downsample_factors: [[2, 1], 2, 2, 2, 2, [2, 3]]
+     attn_blocks: [0, 0, 0, 0, 0, 0]
+     dropout: 0.0
+   visual_encoder: # input (64, 32, 32)
+     in_channels: 64
+     block_out_channels: [64, 128, 256, 256, 512, 1024]
+     downsample_factors: [2, 2, 2, 1, 2, 2]
+     attn_blocks: [0, 0, 0, 0, 0, 0]
+     dropout: 0.0
+
+ ckpt:
+   resume_ckpt_path: ""
+   inference_ckpt_path: ""
+   save_ckpt_steps: 2500
+
+ data:
+   train_output_dir: output/syncnet
+   num_val_samples: 1200
+   batch_size: 120 # 40
+   num_workers: 11 # 11
+   latent_space: true
+   num_frames: 16
+   resolution: 256
+   train_fileslist: ""
+   train_data_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/VoxCeleb2/high_visual_quality/train
+   val_fileslist: ""
+   val_data_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/VoxCeleb2/high_visual_quality/val
+   audio_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/mel_new
+   lower_half: false
+   pretrained_audio_model_path: facebook/wav2vec2-large-xlsr-53
+   audio_sample_rate: 16000
+   video_fps: 25
+
+ optimizer:
+   lr: 1e-5
+   max_grad_norm: 1.0
+
+ run:
+   max_train_steps: 10000000
+   validation_steps: 2500
+   mixed_precision_training: true
+   seed: 42
configs/syncnet/syncnet_16_pixel.yaml ADDED
@@ -0,0 +1,45 @@
+ model:
+   audio_encoder: # input (1, 80, 52)
+     in_channels: 1
+     block_out_channels: [32, 64, 128, 256, 512, 1024, 2048]
+     downsample_factors: [[2, 1], 2, 2, 1, 2, 2, [2, 3]]
+     attn_blocks: [0, 0, 0, 0, 0, 0, 0]
+     dropout: 0.0
+   visual_encoder: # input (48, 128, 256)
+     in_channels: 48
+     block_out_channels: [64, 128, 256, 256, 512, 1024, 2048, 2048]
+     downsample_factors: [[1, 2], 2, 2, 2, 2, 2, 2, 2]
+     attn_blocks: [0, 0, 0, 0, 0, 0, 0, 0]
+     dropout: 0.0
+
+ ckpt:
+   resume_ckpt_path: ""
+   inference_ckpt_path: checkpoints/latentsync_syncnet.pt
+   save_ckpt_steps: 2500
+
+ data:
+   train_output_dir: debug/syncnet
+   num_val_samples: 2048
+   batch_size: 128 # 128
+   num_workers: 11 # 11
+   latent_space: false
+   num_frames: 16
+   resolution: 256
+   train_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/all_data_v6.txt
+   train_data_dir: ""
+   val_fileslist: ""
+   val_data_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/VoxCeleb2/high_visual_quality/val
+   audio_mel_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/mel_new
+   lower_half: true
+   audio_sample_rate: 16000
+   video_fps: 25
+
+ optimizer:
+   lr: 1e-5
+   max_grad_norm: 1.0
+
+ run:
+   max_train_steps: 10000000
+   validation_steps: 2500
+   mixed_precision_training: true
+   seed: 42
configs/syncnet/syncnet_25_pixel.yaml ADDED
@@ -0,0 +1,45 @@
+ model:
+   audio_encoder: # input (1, 80, 80)
+     in_channels: 1
+     block_out_channels: [64, 128, 256, 256, 512, 1024]
+     downsample_factors: [2, 2, 2, 2, 2, 2]
+     dropout: 0.0
+   visual_encoder: # input (75, 128, 256)
+     in_channels: 75
+     block_out_channels: [128, 128, 256, 256, 512, 512, 1024, 1024]
+     downsample_factors: [[1, 2], 2, 2, 2, 2, 2, 2, 2]
+     dropout: 0.0
+
+ ckpt:
+   resume_ckpt_path: ""
+   inference_ckpt_path: ""
+   save_ckpt_steps: 2500
+
+ data:
+   train_output_dir: debug/syncnet
+   num_val_samples: 2048
+   batch_size: 64 # 64
+   num_workers: 11 # 11
+   latent_space: false
+   num_frames: 25
+   resolution: 256
+   train_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/hdtf_vox_avatars_ads_affine.txt
+   # /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/hdtf_voxceleb_avatars_affine.txt
+   train_data_dir: ""
+   val_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/vox_affine_val.txt
+   # /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/voxceleb_val.txt
+   val_data_dir: ""
+   audio_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/mel
+   lower_half: true
+   pretrained_audio_model_path: facebook/wav2vec2-large-xlsr-53
+   audio_sample_rate: 16000
+   video_fps: 25
+
+ optimizer:
+   lr: 1e-5
+   max_grad_norm: 1.0
+
+ run:
+   max_train_steps: 10000000
+   mixed_precision_training: true
+   seed: 42
configs/unet/first_stage.yaml ADDED
@@ -0,0 +1,103 @@
+ data:
+   syncnet_config_path: configs/syncnet/syncnet_16_pixel.yaml
+   train_output_dir: debug/unet
+   train_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/all_data_v6.txt
+   train_data_dir: ""
+   audio_embeds_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/whisper_new
+   audio_mel_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/mel_new
+
+   val_video_path: assets/demo1_video.mp4
+   val_audio_path: assets/demo1_audio.wav
+   batch_size: 8 # 8
+   num_workers: 11 # 11
+   num_frames: 16
+   resolution: 256
+   mask: fix_mask
+   audio_sample_rate: 16000
+   video_fps: 25
+
+ ckpt:
+   resume_ckpt_path: checkpoints/latentsync_unet.pt
+   save_ckpt_steps: 5000
+
+ run:
+   pixel_space_supervise: false
+   use_syncnet: false
+   sync_loss_weight: 0.05 # 1/283
+   perceptual_loss_weight: 0.1 # 0.1
+   recon_loss_weight: 1 # 1
+   guidance_scale: 1.0 # 1.5 or 1.0
+   trepa_loss_weight: 10
+   inference_steps: 20
+   seed: 1247
+   use_mixed_noise: true
+   mixed_noise_alpha: 1 # 1
+   mixed_precision_training: true
+   enable_gradient_checkpointing: false
+   enable_xformers_memory_efficient_attention: true
+   max_train_steps: 10000000
+   max_train_epochs: -1
+
+ optimizer:
+   lr: 1e-5
+   scale_lr: false
+   max_grad_norm: 1.0
+   lr_scheduler: constant
+   lr_warmup_steps: 0
+
+ model:
+   act_fn: silu
+   add_audio_layer: true
+   custom_audio_layer: false
+   audio_condition_method: cross_attn # Choose between [cross_attn, group_norm]
+   attention_head_dim: 8
+   block_out_channels: [320, 640, 1280, 1280]
+   center_input_sample: false
+   cross_attention_dim: 384
+   down_block_types:
+     [
+       "CrossAttnDownBlock3D",
+       "CrossAttnDownBlock3D",
+       "CrossAttnDownBlock3D",
+       "DownBlock3D",
+     ]
+   mid_block_type: UNetMidBlock3DCrossAttn
+   up_block_types:
+     [
+       "UpBlock3D",
+       "CrossAttnUpBlock3D",
+       "CrossAttnUpBlock3D",
+       "CrossAttnUpBlock3D",
+     ]
+   downsample_padding: 1
+   flip_sin_to_cos: true
+   freq_shift: 0
+   in_channels: 13 # 49
+   layers_per_block: 2
+   mid_block_scale_factor: 1
+   norm_eps: 1e-5
+   norm_num_groups: 32
+   out_channels: 4 # 16
+   sample_size: 64
+   resnet_time_scale_shift: default # Choose between [default, scale_shift]
+   unet_use_cross_frame_attention: false
+   unet_use_temporal_attention: false
+
+   # We don't actually use the motion module in the final version of LatentSync.
+   # When we started the project, we used the AnimateDiff codebase and tried the motion module, but the results were poor.
+   # We decided to leave the code here for possible future use.
+   use_motion_module: false
+   motion_module_resolutions: [1, 2, 4, 8]
+   motion_module_mid_block: false
+   motion_module_decoder_only: false
+   motion_module_type: Vanilla
+   motion_module_kwargs:
+     num_attention_heads: 8
+     num_transformer_block: 1
+     attention_block_types:
+       - Temporal_Self
+       - Temporal_Self
+     temporal_position_encoding: true
+     temporal_position_encoding_max_len: 16
+     temporal_attention_dim_div: 1
+     zero_initialize: true
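A hedged reading of `in_channels: 13` above (and in `second_stage.yaml`), consistent with the README's description of the U-Net input but assuming 4-channel VAE latents plus a 1-channel mask:

```python
latent_channels = 4              # Stable Diffusion VAE latent channels (assumed)
unet_in_channels = (
    latent_channels              # noised latents
    + latent_channels            # masked-frame latents
    + latent_channels            # reference-frame latents
    + 1                          # binary mask channel (assumed)
)
assert unet_in_channels == 13
```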
configs/unet/second_stage.yaml ADDED
@@ -0,0 +1,103 @@
+ data:
+   syncnet_config_path: configs/syncnet/syncnet_16_pixel.yaml
+   train_output_dir: debug/unet
+   train_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/all_data_v6.txt
+   train_data_dir: ""
+   audio_embeds_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/whisper_new
+   audio_mel_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/mel_new
+
+   val_video_path: assets/demo1_video.mp4
+   val_audio_path: assets/demo1_audio.wav
+   batch_size: 2 # 8
+   num_workers: 11 # 11
+   num_frames: 16
+   resolution: 256
+   mask: fix_mask
+   audio_sample_rate: 16000
+   video_fps: 25
+
+ ckpt:
+   resume_ckpt_path: checkpoints/latentsync_unet.pt
+   save_ckpt_steps: 5000
+
+ run:
+   pixel_space_supervise: true
+   use_syncnet: true
+   sync_loss_weight: 0.05 # 1/283
+   perceptual_loss_weight: 0.1 # 0.1
+   recon_loss_weight: 1 # 1
+   guidance_scale: 1.0 # 1.5 or 1.0
+   trepa_loss_weight: 10
+   inference_steps: 20
+   seed: 1247
+   use_mixed_noise: true
+   mixed_noise_alpha: 1 # 1
+   mixed_precision_training: true
+   enable_gradient_checkpointing: false
+   enable_xformers_memory_efficient_attention: true
+   max_train_steps: 10000000
+   max_train_epochs: -1
+
+ optimizer:
+   lr: 1e-5
+   scale_lr: false
+   max_grad_norm: 1.0
+   lr_scheduler: constant
+   lr_warmup_steps: 0
+
+ model:
+   act_fn: silu
+   add_audio_layer: true
+   custom_audio_layer: false
+   audio_condition_method: cross_attn # Choose between [cross_attn, group_norm]
+   attention_head_dim: 8
+   block_out_channels: [320, 640, 1280, 1280]
+   center_input_sample: false
+   cross_attention_dim: 384
+   down_block_types:
+     [
+       "CrossAttnDownBlock3D",
+       "CrossAttnDownBlock3D",
+       "CrossAttnDownBlock3D",
+       "DownBlock3D",
+     ]
+   mid_block_type: UNetMidBlock3DCrossAttn
+   up_block_types:
+     [
+       "UpBlock3D",
+       "CrossAttnUpBlock3D",
+       "CrossAttnUpBlock3D",
+       "CrossAttnUpBlock3D",
+     ]
+   downsample_padding: 1
+   flip_sin_to_cos: true
+   freq_shift: 0
+   in_channels: 13 # 49
+   layers_per_block: 2
+   mid_block_scale_factor: 1
+   norm_eps: 1e-5
+   norm_num_groups: 32
+   out_channels: 4 # 16
+   sample_size: 64
+   resnet_time_scale_shift: default # Choose between [default, scale_shift]
+   unet_use_cross_frame_attention: false
+   unet_use_temporal_attention: false
+
+   # We don't actually use the motion module in the final version of LatentSync.
+   # When we started the project, we used the AnimateDiff codebase and tried the motion module, but the results were poor.
+   # We decided to leave the code here for possible future use.
+   use_motion_module: false
+   motion_module_resolutions: [1, 2, 4, 8]
+   motion_module_mid_block: false
+   motion_module_decoder_only: false
+   motion_module_type: Vanilla
+   motion_module_kwargs:
+     num_attention_heads: 8
+     num_transformer_block: 1
+     attention_block_types:
+       - Temporal_Self
+       - Temporal_Self
+     temporal_position_encoding: true
+     temporal_position_encoding_max_len: 16
+     temporal_attention_dim_div: 1
+     zero_initialize: true
data_processing_pipeline.sh ADDED
@@ -0,0 +1,9 @@
+ #!/bin/bash
+
+ python -m preprocess.data_processing_pipeline \
+     --total_num_workers 20 \
+     --per_gpu_num_workers 20 \
+     --resolution 256 \
+     --sync_conf_threshold 3 \
+     --temp_dir temp \
+     --input_dir /mnt/bn/maliva-gen-ai-v2/chunyu.li/VoxCeleb2/raw
eval/detectors/README.md ADDED
@@ -0,0 +1,3 @@
+ # Face detector
+
+ This face detector is adapted from `https://github.com/cs-giung/face-detection-pytorch`.
eval/detectors/__init__.py ADDED
@@ -0,0 +1 @@
+ from .s3fd import S3FD
eval/detectors/s3fd/__init__.py ADDED
@@ -0,0 +1,61 @@
1
+ import time
2
+ import numpy as np
3
+ import cv2
4
+ import torch
5
+ from torchvision import transforms
6
+ from .nets import S3FDNet
7
+ from .box_utils import nms_
8
+
9
+ PATH_WEIGHT = 'checkpoints/auxiliary/sfd_face.pth'
10
+ img_mean = np.array([104., 117., 123.])[:, np.newaxis, np.newaxis].astype('float32')
11
+
12
+
13
+ class S3FD():
14
+
15
+ def __init__(self, device='cuda'):
16
+
17
+ tstamp = time.time()
18
+ self.device = device
19
+
20
+ print('[S3FD] loading with', self.device)
21
+ self.net = S3FDNet(device=self.device).to(self.device)
22
+ state_dict = torch.load(PATH_WEIGHT, map_location=self.device)
23
+ self.net.load_state_dict(state_dict)
24
+ self.net.eval()
25
+ print('[S3FD] finished loading (%.4f sec)' % (time.time() - tstamp))
26
+
27
+ def detect_faces(self, image, conf_th=0.8, scales=[1]):
28
+
29
+ w, h = image.shape[1], image.shape[0]
30
+
31
+ bboxes = np.empty(shape=(0, 5))
32
+
33
+ with torch.no_grad():
34
+ for s in scales:
35
+ scaled_img = cv2.resize(image, dsize=(0, 0), fx=s, fy=s, interpolation=cv2.INTER_LINEAR)
36
+
37
+ scaled_img = np.swapaxes(scaled_img, 1, 2)
38
+ scaled_img = np.swapaxes(scaled_img, 1, 0)
39
+ scaled_img = scaled_img[[2, 1, 0], :, :]
40
+ scaled_img = scaled_img.astype('float32')
41
+ scaled_img -= img_mean
42
+ scaled_img = scaled_img[[2, 1, 0], :, :]
43
+ x = torch.from_numpy(scaled_img).unsqueeze(0).to(self.device)
44
+ y = self.net(x)
45
+
46
+ detections = y.data
47
+ scale = torch.Tensor([w, h, w, h])
48
+
49
+ for i in range(detections.size(1)):
50
+ j = 0
51
+ while detections[0, i, j, 0] > conf_th:
52
+ score = detections[0, i, j, 0]
53
+ pt = (detections[0, i, j, 1:] * scale).cpu().numpy()
54
+ bbox = (pt[0], pt[1], pt[2], pt[3], score)
55
+ bboxes = np.vstack((bboxes, bbox))
56
+ j += 1
57
+
58
+ keep = nms_(bboxes, 0.1)
59
+ bboxes = bboxes[keep]
60
+
61
+ return bboxes
eval/detectors/s3fd/box_utils.py ADDED
@@ -0,0 +1,221 @@
1
+ import numpy as np
2
+ from itertools import product as product
3
+ import torch
4
+ from torch.autograd import Function
5
+ import warnings
6
+
7
+
8
+ def nms_(dets, thresh):
9
+ """
10
+ Courtesy of Ross Girshick
11
+ [https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/nms/py_cpu_nms.py]
12
+ """
13
+ x1 = dets[:, 0]
14
+ y1 = dets[:, 1]
15
+ x2 = dets[:, 2]
16
+ y2 = dets[:, 3]
17
+ scores = dets[:, 4]
18
+
19
+ areas = (x2 - x1) * (y2 - y1)
20
+ order = scores.argsort()[::-1]
21
+
22
+ keep = []
23
+ while order.size > 0:
24
+ i = order[0]
25
+ keep.append(int(i))
26
+ xx1 = np.maximum(x1[i], x1[order[1:]])
27
+ yy1 = np.maximum(y1[i], y1[order[1:]])
28
+ xx2 = np.minimum(x2[i], x2[order[1:]])
29
+ yy2 = np.minimum(y2[i], y2[order[1:]])
30
+
31
+ w = np.maximum(0.0, xx2 - xx1)
32
+ h = np.maximum(0.0, yy2 - yy1)
33
+ inter = w * h
34
+ ovr = inter / (areas[i] + areas[order[1:]] - inter)
35
+
36
+ inds = np.where(ovr <= thresh)[0]
37
+ order = order[inds + 1]
38
+
39
+ return np.array(keep).astype(np.int32)
40
+
41
+
42
+ def decode(loc, priors, variances):
43
+ """Decode locations from predictions using priors to undo
44
+ the encoding we did for offset regression at train time.
45
+ Args:
46
+ loc (tensor): location predictions for loc layers,
47
+ Shape: [num_priors,4]
48
+ priors (tensor): Prior boxes in center-offset form.
49
+ Shape: [num_priors,4].
50
+ variances: (list[float]) Variances of priorboxes
51
+ Return:
52
+ decoded bounding box predictions
53
+ """
54
+
55
+ boxes = torch.cat((
56
+ priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:],
57
+ priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1)
58
+ boxes[:, :2] -= boxes[:, 2:] / 2
59
+ boxes[:, 2:] += boxes[:, :2]
60
+ return boxes
61
+
62
+
63
+ def nms(boxes, scores, overlap=0.5, top_k=200):
64
+ """Apply non-maximum suppression at test time to avoid detecting too many
65
+ overlapping bounding boxes for a given object.
66
+ Args:
67
+ boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
68
+ scores: (tensor) The class predscores for the img, Shape:[num_priors].
69
+ overlap: (float) The overlap thresh for suppressing unnecessary boxes.
70
+ top_k: (int) The Maximum number of box preds to consider.
71
+ Return:
72
+ The indices of the kept boxes with respect to num_priors.
73
+ """
74
+
75
+ keep = scores.new(scores.size(0)).zero_().long()
76
+ if boxes.numel() == 0:
77
+ return keep, 0
78
+ x1 = boxes[:, 0]
79
+ y1 = boxes[:, 1]
80
+ x2 = boxes[:, 2]
81
+ y2 = boxes[:, 3]
82
+ area = torch.mul(x2 - x1, y2 - y1)
83
+ v, idx = scores.sort(0) # sort in ascending order
84
+ # I = I[v >= 0.01]
85
+ idx = idx[-top_k:] # indices of the top-k largest vals
86
+ xx1 = boxes.new()
87
+ yy1 = boxes.new()
88
+ xx2 = boxes.new()
89
+ yy2 = boxes.new()
90
+ w = boxes.new()
91
+ h = boxes.new()
92
+
93
+ # keep = torch.Tensor()
94
+ count = 0
95
+ while idx.numel() > 0:
96
+ i = idx[-1] # index of current largest val
97
+ # keep.append(i)
98
+ keep[count] = i
99
+ count += 1
100
+ if idx.size(0) == 1:
101
+ break
102
+ idx = idx[:-1] # remove kept element from view
103
+ # load bboxes of next highest vals
104
+ with warnings.catch_warnings():
105
+ # Ignore UserWarning within this block
106
+ warnings.simplefilter("ignore", category=UserWarning)
107
+ torch.index_select(x1, 0, idx, out=xx1)
108
+ torch.index_select(y1, 0, idx, out=yy1)
109
+ torch.index_select(x2, 0, idx, out=xx2)
110
+ torch.index_select(y2, 0, idx, out=yy2)
111
+ # store element-wise max with next highest score
112
+ xx1 = torch.clamp(xx1, min=x1[i])
113
+ yy1 = torch.clamp(yy1, min=y1[i])
114
+ xx2 = torch.clamp(xx2, max=x2[i])
115
+ yy2 = torch.clamp(yy2, max=y2[i])
116
+ w.resize_as_(xx2)
117
+ h.resize_as_(yy2)
118
+ w = xx2 - xx1
119
+ h = yy2 - yy1
120
+ # check sizes of xx1 and xx2.. after each iteration
121
+ w = torch.clamp(w, min=0.0)
122
+ h = torch.clamp(h, min=0.0)
123
+ inter = w * h
124
+ # IoU = i / (area(a) + area(b) - i)
125
+ rem_areas = torch.index_select(area, 0, idx) # load remaining areas)
126
+ union = (rem_areas - inter) + area[i]
127
+ IoU = inter / union # store result in iou
128
+ # keep only elements with an IoU <= overlap
129
+ idx = idx[IoU.le(overlap)]
130
+ return keep, count
131
+
132
+
133
+ class Detect(object):
134
+
135
+ def __init__(self, num_classes=2,
136
+ top_k=750, nms_thresh=0.3, conf_thresh=0.05,
137
+ variance=[0.1, 0.2], nms_top_k=5000):
138
+
139
+ self.num_classes = num_classes
140
+ self.top_k = top_k
141
+ self.nms_thresh = nms_thresh
142
+ self.conf_thresh = conf_thresh
143
+ self.variance = variance
144
+ self.nms_top_k = nms_top_k
145
+
146
+ def forward(self, loc_data, conf_data, prior_data):
147
+
148
+ num = loc_data.size(0)
149
+ num_priors = prior_data.size(0)
150
+
151
+ conf_preds = conf_data.view(num, num_priors, self.num_classes).transpose(2, 1)
152
+ batch_priors = prior_data.view(-1, num_priors, 4).expand(num, num_priors, 4)
153
+ batch_priors = batch_priors.contiguous().view(-1, 4)
154
+
155
+ decoded_boxes = decode(loc_data.view(-1, 4), batch_priors, self.variance)
156
+ decoded_boxes = decoded_boxes.view(num, num_priors, 4)
157
+
158
+ output = torch.zeros(num, self.num_classes, self.top_k, 5)
159
+
160
+ for i in range(num):
161
+ boxes = decoded_boxes[i].clone()
162
+ conf_scores = conf_preds[i].clone()
163
+
164
+ for cl in range(1, self.num_classes):
165
+ c_mask = conf_scores[cl].gt(self.conf_thresh)
166
+ scores = conf_scores[cl][c_mask]
167
+
168
+ if scores.dim() == 0:
169
+ continue
170
+ l_mask = c_mask.unsqueeze(1).expand_as(boxes)
171
+ boxes_ = boxes[l_mask].view(-1, 4)
172
+ ids, count = nms(boxes_, scores, self.nms_thresh, self.nms_top_k)
173
+ count = count if count < self.top_k else self.top_k
174
+
175
+ output[i, cl, :count] = torch.cat((scores[ids[:count]].unsqueeze(1), boxes_[ids[:count]]), 1)
176
+
177
+ return output
178
+
179
+
180
+ class PriorBox(object):
181
+
182
+ def __init__(self, input_size, feature_maps,
183
+ variance=[0.1, 0.2],
184
+ min_sizes=[16, 32, 64, 128, 256, 512],
185
+ steps=[4, 8, 16, 32, 64, 128],
186
+ clip=False):
187
+
188
+ super(PriorBox, self).__init__()
189
+
190
+ self.imh = input_size[0]
191
+ self.imw = input_size[1]
192
+ self.feature_maps = feature_maps
193
+
194
+ self.variance = variance
195
+ self.min_sizes = min_sizes
196
+ self.steps = steps
197
+ self.clip = clip
198
+
199
+ def forward(self):
200
+ mean = []
201
+ for k, fmap in enumerate(self.feature_maps):
202
+ feath = fmap[0]
203
+ featw = fmap[1]
204
+ for i, j in product(range(feath), range(featw)):
205
+ f_kw = self.imw / self.steps[k]
206
+ f_kh = self.imh / self.steps[k]
207
+
208
+ cx = (j + 0.5) / f_kw
209
+ cy = (i + 0.5) / f_kh
210
+
211
+ s_kw = self.min_sizes[k] / self.imw
212
+ s_kh = self.min_sizes[k] / self.imh
213
+
214
+ mean += [cx, cy, s_kw, s_kh]
215
+
216
+ output = torch.FloatTensor(mean).view(-1, 4)
217
+
218
+ if self.clip:
219
+ output.clamp_(max=1, min=0)
220
+
221
+ return output
eval/detectors/s3fd/nets.py ADDED
@@ -0,0 +1,174 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import torch.nn.init as init
5
+ from .box_utils import Detect, PriorBox
6
+
7
+
8
+ class L2Norm(nn.Module):
9
+
10
+ def __init__(self, n_channels, scale):
11
+ super(L2Norm, self).__init__()
12
+ self.n_channels = n_channels
13
+ self.gamma = scale or None
14
+ self.eps = 1e-10
15
+ self.weight = nn.Parameter(torch.Tensor(self.n_channels))
16
+ self.reset_parameters()
17
+
18
+ def reset_parameters(self):
19
+ init.constant_(self.weight, self.gamma)
20
+
21
+ def forward(self, x):
22
+ norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps
23
+ x = torch.div(x, norm)
24
+ out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x
25
+ return out
26
+
27
+
28
+ class S3FDNet(nn.Module):
29
+
30
+ def __init__(self, device='cuda'):
31
+ super(S3FDNet, self).__init__()
32
+ self.device = device
33
+
34
+ self.vgg = nn.ModuleList([
35
+ nn.Conv2d(3, 64, 3, 1, padding=1),
36
+ nn.ReLU(inplace=True),
37
+ nn.Conv2d(64, 64, 3, 1, padding=1),
38
+ nn.ReLU(inplace=True),
39
+ nn.MaxPool2d(2, 2),
40
+
41
+ nn.Conv2d(64, 128, 3, 1, padding=1),
42
+ nn.ReLU(inplace=True),
43
+ nn.Conv2d(128, 128, 3, 1, padding=1),
44
+ nn.ReLU(inplace=True),
45
+ nn.MaxPool2d(2, 2),
46
+
47
+ nn.Conv2d(128, 256, 3, 1, padding=1),
48
+ nn.ReLU(inplace=True),
49
+ nn.Conv2d(256, 256, 3, 1, padding=1),
50
+ nn.ReLU(inplace=True),
51
+ nn.Conv2d(256, 256, 3, 1, padding=1),
52
+ nn.ReLU(inplace=True),
53
+ nn.MaxPool2d(2, 2, ceil_mode=True),
54
+
55
+ nn.Conv2d(256, 512, 3, 1, padding=1),
56
+ nn.ReLU(inplace=True),
57
+ nn.Conv2d(512, 512, 3, 1, padding=1),
58
+ nn.ReLU(inplace=True),
59
+ nn.Conv2d(512, 512, 3, 1, padding=1),
60
+ nn.ReLU(inplace=True),
61
+ nn.MaxPool2d(2, 2),
62
+
63
+ nn.Conv2d(512, 512, 3, 1, padding=1),
64
+ nn.ReLU(inplace=True),
65
+ nn.Conv2d(512, 512, 3, 1, padding=1),
66
+ nn.ReLU(inplace=True),
67
+ nn.Conv2d(512, 512, 3, 1, padding=1),
68
+ nn.ReLU(inplace=True),
69
+ nn.MaxPool2d(2, 2),
70
+
71
+ nn.Conv2d(512, 1024, 3, 1, padding=6, dilation=6),
72
+ nn.ReLU(inplace=True),
73
+ nn.Conv2d(1024, 1024, 1, 1),
74
+ nn.ReLU(inplace=True),
75
+ ])
76
+
77
+ self.L2Norm3_3 = L2Norm(256, 10)
78
+ self.L2Norm4_3 = L2Norm(512, 8)
79
+ self.L2Norm5_3 = L2Norm(512, 5)
80
+
81
+ self.extras = nn.ModuleList([
82
+ nn.Conv2d(1024, 256, 1, 1),
83
+ nn.Conv2d(256, 512, 3, 2, padding=1),
84
+ nn.Conv2d(512, 128, 1, 1),
85
+ nn.Conv2d(128, 256, 3, 2, padding=1),
86
+ ])
87
+
88
+ self.loc = nn.ModuleList([
89
+ nn.Conv2d(256, 4, 3, 1, padding=1),
90
+ nn.Conv2d(512, 4, 3, 1, padding=1),
91
+ nn.Conv2d(512, 4, 3, 1, padding=1),
92
+ nn.Conv2d(1024, 4, 3, 1, padding=1),
93
+ nn.Conv2d(512, 4, 3, 1, padding=1),
94
+ nn.Conv2d(256, 4, 3, 1, padding=1),
95
+ ])
96
+
97
+ self.conf = nn.ModuleList([
98
+ nn.Conv2d(256, 4, 3, 1, padding=1),
99
+ nn.Conv2d(512, 2, 3, 1, padding=1),
100
+ nn.Conv2d(512, 2, 3, 1, padding=1),
101
+ nn.Conv2d(1024, 2, 3, 1, padding=1),
102
+ nn.Conv2d(512, 2, 3, 1, padding=1),
103
+ nn.Conv2d(256, 2, 3, 1, padding=1),
104
+ ])
105
+
106
+ self.softmax = nn.Softmax(dim=-1)
107
+ self.detect = Detect()
108
+
109
+ def forward(self, x):
110
+ size = x.size()[2:]
111
+ sources = list()
112
+ loc = list()
113
+ conf = list()
114
+
115
+ for k in range(16):
116
+ x = self.vgg[k](x)
117
+ s = self.L2Norm3_3(x)
118
+ sources.append(s)
119
+
120
+ for k in range(16, 23):
121
+ x = self.vgg[k](x)
122
+ s = self.L2Norm4_3(x)
123
+ sources.append(s)
124
+
125
+ for k in range(23, 30):
126
+ x = self.vgg[k](x)
127
+ s = self.L2Norm5_3(x)
128
+ sources.append(s)
129
+
130
+ for k in range(30, len(self.vgg)):
131
+ x = self.vgg[k](x)
132
+ sources.append(x)
133
+
134
+ # apply extra layers and cache source layer outputs
135
+ for k, v in enumerate(self.extras):
136
+ x = F.relu(v(x), inplace=True)
137
+ if k % 2 == 1:
138
+ sources.append(x)
139
+
140
+ # apply multibox head to source layers
141
+ loc_x = self.loc[0](sources[0])
142
+ conf_x = self.conf[0](sources[0])
143
+
144
+ max_conf, _ = torch.max(conf_x[:, 0:3, :, :], dim=1, keepdim=True)
145
+ conf_x = torch.cat((max_conf, conf_x[:, 3:, :, :]), dim=1)
146
+
147
+ loc.append(loc_x.permute(0, 2, 3, 1).contiguous())
148
+ conf.append(conf_x.permute(0, 2, 3, 1).contiguous())
149
+
150
+ for i in range(1, len(sources)):
151
+ x = sources[i]
152
+ conf.append(self.conf[i](x).permute(0, 2, 3, 1).contiguous())
153
+ loc.append(self.loc[i](x).permute(0, 2, 3, 1).contiguous())
154
+
155
+ features_maps = []
156
+ for i in range(len(loc)):
157
+ feat = []
158
+ feat += [loc[i].size(1), loc[i].size(2)]
159
+ features_maps += [feat]
160
+
161
+ loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
162
+ conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)
163
+
164
+ with torch.no_grad():
165
+ self.priorbox = PriorBox(size, features_maps)
166
+ self.priors = self.priorbox.forward()
167
+
168
+ output = self.detect.forward(
169
+ loc.view(loc.size(0), -1, 4),
170
+ self.softmax(conf.view(conf.size(0), -1, 2)),
171
+ self.priors.type(type(x.data)).to(self.device)
172
+ )
173
+
174
+ return output
eval/draw_syncnet_lines.py ADDED
@@ -0,0 +1,70 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ import matplotlib.pyplot as plt
17
+
18
+
19
+ class Chart:
20
+ def __init__(self):
21
+ self.loss_list = []
22
+
23
+ def add_ckpt(self, ckpt_path, line_name):
24
+ ckpt = torch.load(ckpt_path, map_location="cpu")
25
+ train_step_list = ckpt["train_step_list"]
26
+ train_loss_list = ckpt["train_loss_list"]
27
+ val_step_list = ckpt["val_step_list"]
28
+ val_loss_list = ckpt["val_loss_list"]
29
+ val_step_list = [val_step_list[0]] + val_step_list[4::5]
30
+ val_loss_list = [val_loss_list[0]] + val_loss_list[4::5]
31
+ self.loss_list.append((line_name, train_step_list, train_loss_list, val_step_list, val_loss_list))
32
+
33
+ def draw(self, save_path, plot_val=True):
34
+ # Global settings
35
+ plt.rcParams["font.size"] = 14
36
+ plt.rcParams["font.family"] = "serif"
37
+ plt.rcParams["font.sans-serif"] = ["Arial", "DejaVu Sans", "Lucida Grande"]
38
+ plt.rcParams["font.serif"] = ["Times New Roman", "DejaVu Serif"]
39
+
40
+ # Creating the plot
41
+ plt.figure(figsize=(7.766, 4.8)) # Golden ratio
42
+ for loss in self.loss_list:
43
+ if plot_val:
44
+ (line,) = plt.plot(loss[1], loss[2], label=loss[0], linewidth=0.5, alpha=0.5)
45
+ line_color = line.get_color()
46
+ plt.plot(loss[3], loss[4], linewidth=1.5, color=line_color)
47
+ else:
48
+ plt.plot(loss[1], loss[2], label=loss[0], linewidth=1)
49
+ plt.xlabel("Step")
50
+ plt.ylabel("Loss")
51
+ legend = plt.legend()
52
+ # legend = plt.legend(loc='upper right', bbox_to_anchor=(1, 0.82))
53
+
54
+ # Adjust the linewidth of legend
55
+ for line in legend.get_lines():
56
+ line.set_linewidth(2)
57
+
58
+ plt.savefig(save_path, transparent=True)
59
+ plt.close()
60
+
61
+
62
+ if __name__ == "__main__":
63
+ chart = Chart()
64
+ # chart.add_ckpt("output/syncnet/train-2024_10_25-18:14:43/checkpoints/checkpoint-10000.pt", "w/ self-attn")
65
+ # chart.add_ckpt("output/syncnet/train-2024_10_25-18:21:59/checkpoints/checkpoint-10000.pt", "w/o self-attn")
66
+ chart.add_ckpt("output/syncnet/train-2024_10_24-21:03:11/checkpoints/checkpoint-10000.pt", "Dim 512")
67
+ chart.add_ckpt("output/syncnet/train-2024_10_25-18:21:59/checkpoints/checkpoint-10000.pt", "Dim 2048")
68
+ chart.add_ckpt("output/syncnet/train-2024_10_24-22:37:04/checkpoints/checkpoint-10000.pt", "Dim 4096")
69
+ chart.add_ckpt("output/syncnet/train-2024_10_25-02:30:17/checkpoints/checkpoint-10000.pt", "Dim 6144")
70
+ chart.draw("ablation.pdf", plot_val=True)
eval/eval_fvd.py ADDED
@@ -0,0 +1,96 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import mediapipe as mp
16
+ import cv2
17
+ from decord import VideoReader
18
+ from einops import rearrange
19
+ import os
20
+ import numpy as np
21
+ import torch
22
+ import tqdm
23
+ from eval.fvd import compute_our_fvd
24
+
25
+
26
+ class FVD:
27
+ def __init__(self, resolution=(224, 224)):
28
+ self.face_detector = mp.solutions.face_detection.FaceDetection(model_selection=0, min_detection_confidence=0.5)
29
+ self.resolution = resolution
30
+
31
+ def detect_face(self, image):
32
+ height, width = image.shape[:2]
33
+ # Process the image and detect faces.
34
+ results = self.face_detector.process(image)
35
+
36
+ if not results.detections: # Face not detected
37
+ raise Exception("Face not detected")
38
+
39
+ detection = results.detections[0] # Only use the first face in the image
40
+ bounding_box = detection.location_data.relative_bounding_box
41
+ xmin = int(bounding_box.xmin * width)
42
+ ymin = int(bounding_box.ymin * height)
43
+ face_width = int(bounding_box.width * width)
44
+ face_height = int(bounding_box.height * height)
45
+
46
+ # Crop the image to the bounding box.
47
+ xmin = max(0, xmin)
48
+ ymin = max(0, ymin)
49
+ xmax = min(width, xmin + face_width)
50
+ ymax = min(height, ymin + face_height)
51
+ image = image[ymin:ymax, xmin:xmax]
52
+
53
+ return image
54
+
55
+ def detect_video(self, video_path, real: bool = True):
56
+ vr = VideoReader(video_path)
57
+ video_frames = vr[20:36].asnumpy()  # take 16 consecutive frames (indices 20-35)
58
+ vr.seek(0) # avoid memory leak
59
+ faces = []
60
+ for frame in video_frames:
61
+ face = self.detect_face(frame)
62
+ face = cv2.resize(face, (self.resolution[1], self.resolution[0]), interpolation=cv2.INTER_AREA)
63
+ faces.append(face)
64
+
65
+ if len(faces) != 16:
66
+ return None
67
+ faces = np.stack(faces, axis=0) # (f, h, w, c)
68
+ faces = torch.from_numpy(faces)
69
+ return faces
70
+
71
+
72
+ def eval_fvd(real_videos_dir, fake_videos_dir):
73
+ fvd = FVD()
74
+ real_features_list = []
75
+ fake_features_list = []
76
+ for file in tqdm.tqdm(os.listdir(fake_videos_dir)):
77
+ if file.endswith(".mp4"):
78
+ real_video_path = os.path.join(real_videos_dir, file.replace("_out.mp4", ".mp4"))
79
+ fake_video_path = os.path.join(fake_videos_dir, file)
80
+ real_features = fvd.detect_video(real_video_path, real=True)
81
+ fake_features = fvd.detect_video(fake_video_path, real=False)
82
+ if real_features is None or fake_features is None:
83
+ continue
84
+ real_features_list.append(real_features)
85
+ fake_features_list.append(fake_features)
86
+
87
+ real_features = torch.stack(real_features_list) / 255.0
88
+ fake_features = torch.stack(fake_features_list) / 255.0
89
+ print(compute_our_fvd(real_features, fake_features, device="cpu"))
90
+
91
+
92
+ if __name__ == "__main__":
93
+ real_videos_dir = "/mnt/bn/maliva-gen-ai-v2/chunyu.li/VoxCeleb2/segmented/cross"
94
+ fake_videos_dir = "/mnt/bn/maliva-gen-ai-v2/chunyu.li/VoxCeleb2/segmented/latentsync_cross"
95
+
96
+ eval_fvd(real_videos_dir, fake_videos_dir)
eval/eval_sync_conf.py ADDED
@@ -0,0 +1,77 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import os
17
+ import tqdm
18
+ from statistics import fmean
19
+ from eval.syncnet import SyncNetEval
20
+ from eval.syncnet_detect import SyncNetDetector
21
+ from latentsync.utils.util import red_text
22
+ import torch
23
+
24
+
25
+ def syncnet_eval(syncnet, syncnet_detector, video_path, temp_dir, detect_results_dir="detect_results"):
26
+ syncnet_detector(video_path=video_path, min_track=50)
27
+ crop_videos = os.listdir(os.path.join(detect_results_dir, "crop"))
28
+ if crop_videos == []:
29
+ raise Exception(red_text(f"Face not detected in {video_path}"))
30
+ av_offset_list = []
31
+ conf_list = []
32
+ for video in crop_videos:
33
+ av_offset, _, conf = syncnet.evaluate(
34
+ video_path=os.path.join(detect_results_dir, "crop", video), temp_dir=temp_dir
35
+ )
36
+ av_offset_list.append(av_offset)
37
+ conf_list.append(conf)
38
+ av_offset = int(fmean(av_offset_list))
39
+ conf = fmean(conf_list)
40
+ print(f"Input video: {video_path}\nSyncNet confidence: {conf:.2f}\nAV offset: {av_offset}")
41
+ return av_offset, conf
42
+
43
+
44
+ def main():
45
+ parser = argparse.ArgumentParser(description="SyncNet")
46
+ parser.add_argument("--initial_model", type=str, default="checkpoints/auxiliary/syncnet_v2.model", help="")
47
+ parser.add_argument("--video_path", type=str, default=None, help="")
48
+ parser.add_argument("--videos_dir", type=str, default="/root/processed")
49
+ parser.add_argument("--temp_dir", type=str, default="temp", help="")
50
+
51
+ args = parser.parse_args()
52
+
53
+ device = "cuda" if torch.cuda.is_available() else "cpu"
54
+
55
+ syncnet = SyncNetEval(device=device)
56
+ syncnet.loadParameters(args.initial_model)
57
+
58
+ syncnet_detector = SyncNetDetector(device=device, detect_results_dir="detect_results")
59
+
60
+ if args.video_path is not None:
61
+ syncnet_eval(syncnet, syncnet_detector, args.video_path, args.temp_dir)
62
+ else:
63
+ sync_conf_list = []
64
+ video_names = sorted([f for f in os.listdir(args.videos_dir) if f.endswith(".mp4")])
65
+ for video_name in tqdm.tqdm(video_names):
66
+ try:
67
+ _, conf = syncnet_eval(
68
+ syncnet, syncnet_detector, os.path.join(args.videos_dir, video_name), args.temp_dir
69
+ )
70
+ sync_conf_list.append(conf)
71
+ except Exception as e:
72
+ print(e)
73
+ print(f"The average sync confidence is {fmean(sync_conf_list):.02f}")
74
+
75
+
76
+ if __name__ == "__main__":
77
+ main()
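
Note on the aggregation in syncnet_eval above: when the detector finds several face tracks in one video, the per-track confidences are averaged with fmean and the AV offset is the integer mean of the per-track offsets. A minimal sketch of that step, using made-up per-track values:

    from statistics import fmean

    # Hypothetical per-track results for one video
    av_offset_list = [2, 3]      # AV offset in frames for each face track
    conf_list = [7.8, 6.4]       # SyncNet confidence for each face track

    av_offset = int(fmean(av_offset_list))  # -> 2
    conf = fmean(conf_list)                 # -> 7.1, reported as the video's sync confidence
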
eval/eval_sync_conf.sh ADDED
@@ -0,0 +1,2 @@
1
+ #!/bin/bash
2
+ python -m eval.eval_sync_conf --video_path "RD_Radio1_000_006_out.mp4"
eval/eval_syncnet_acc.py ADDED
@@ -0,0 +1,118 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ from tqdm.auto import tqdm
17
+ import torch
18
+ import torch.nn as nn
19
+ from einops import rearrange
20
+ from latentsync.models.syncnet import SyncNet
21
+ from latentsync.data.syncnet_dataset import SyncNetDataset
22
+ from diffusers import AutoencoderKL
23
+ from omegaconf import OmegaConf
24
+ from accelerate.utils import set_seed
25
+
26
+
27
+ def main(config):
28
+ set_seed(config.run.seed)
29
+
30
+ device = "cuda" if torch.cuda.is_available() else "cpu"
31
+
32
+ if config.data.latent_space:
33
+ vae = AutoencoderKL.from_pretrained(
34
+ "runwayml/stable-diffusion-inpainting", subfolder="vae", revision="fp16", torch_dtype=torch.float16
35
+ )
36
+ vae.requires_grad_(False)
37
+ vae.to(device)
38
+
39
+ # Dataset and Dataloader setup
40
+ dataset = SyncNetDataset(config.data.val_data_dir, config.data.val_fileslist, config)
41
+
42
+ test_dataloader = torch.utils.data.DataLoader(
43
+ dataset,
44
+ batch_size=config.data.batch_size,
45
+ shuffle=False,
46
+ num_workers=config.data.num_workers,
47
+ drop_last=False,
48
+ worker_init_fn=dataset.worker_init_fn,
49
+ )
50
+
51
+ # Model
52
+ syncnet = SyncNet(OmegaConf.to_container(config.model)).to(device)
53
+
54
+ print(f"Load checkpoint from: {config.ckpt.inference_ckpt_path}")
55
+ checkpoint = torch.load(config.ckpt.inference_ckpt_path, map_location=device)
56
+
57
+ syncnet.load_state_dict(checkpoint["state_dict"])
58
+ syncnet.to(dtype=torch.float16)
59
+ syncnet.requires_grad_(False)
60
+ syncnet.eval()
61
+
62
+ global_step = 0
63
+ num_val_batches = config.data.num_val_samples // config.data.batch_size
64
+ progress_bar = tqdm(range(0, num_val_batches), initial=0, desc="Testing accuracy")
65
+
66
+ num_correct_preds = 0
67
+ num_total_preds = 0
68
+
69
+ while True:
70
+ for step, batch in enumerate(test_dataloader):
71
+ ### >>>> Test >>>> ###
72
+
73
+ frames = batch["frames"].to(device, dtype=torch.float16)
74
+ audio_samples = batch["audio_samples"].to(device, dtype=torch.float16)
75
+ y = batch["y"].to(device, dtype=torch.float16).squeeze(1)
76
+
77
+ if config.data.latent_space:
78
+ frames = rearrange(frames, "b f c h w -> (b f) c h w")
79
+
80
+ with torch.no_grad():
81
+ frames = vae.encode(frames).latent_dist.sample() * 0.18215
82
+
83
+ frames = rearrange(frames, "(b f) c h w -> b (f c) h w", f=config.data.num_frames)
84
+ else:
85
+ frames = rearrange(frames, "b f c h w -> b (f c) h w")
86
+
87
+ if config.data.lower_half:
88
+ height = frames.shape[2]
89
+ frames = frames[:, :, height // 2 :, :]
90
+
91
+ with torch.no_grad():
92
+ vision_embeds, audio_embeds = syncnet(frames, audio_samples)
93
+
94
+ sims = nn.functional.cosine_similarity(vision_embeds, audio_embeds)
95
+
96
+ preds = (sims > 0.5).to(dtype=torch.float16)
97
+ num_correct_preds += (preds == y).sum().item()
98
+ num_total_preds += len(sims)
99
+
100
+ progress_bar.update(1)
101
+ global_step += 1
102
+
103
+ if global_step >= num_val_batches:
104
+ progress_bar.close()
105
+ print(f"Accuracy score: {num_correct_preds / num_total_preds * 100:.2f}%")
106
+ return
107
+
108
+
109
+ if __name__ == "__main__":
110
+ parser = argparse.ArgumentParser(description="Code to test the accuracy of expert lip-sync discriminator")
111
+
112
+ parser.add_argument("--config_path", type=str, default="configs/syncnet/syncnet_16_latent.yaml")
113
+ args = parser.parse_args()
114
+
115
+ # Load a configuration file
116
+ config = OmegaConf.load(args.config_path)
117
+
118
+ main(config)
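
The accuracy computed above treats a cosine similarity greater than 0.5 between the vision and audio embeddings as an "in sync" prediction and compares it with the binary label y. A toy sketch of that thresholding step, with random embeddings standing in for real SyncNet outputs (the 512-dim size is an assumption for illustration):

    import torch
    import torch.nn as nn

    # Hypothetical embeddings for a batch of 3 clips
    vision_embeds = torch.randn(3, 512)
    audio_embeds = torch.randn(3, 512)
    y = torch.tensor([1.0, 0.0, 1.0])  # 1 = matching audio/video pair, 0 = mismatched pair

    sims = nn.functional.cosine_similarity(vision_embeds, audio_embeds)  # shape (3,)
    preds = (sims > 0.5).float()
    accuracy = (preds == y).float().mean().item()
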
eval/eval_syncnet_acc.sh ADDED
@@ -0,0 +1,3 @@
1
+ #!/bin/bash
2
+
3
+ python -m eval.eval_syncnet_acc --config_path "configs/syncnet/syncnet_16_pixel.yaml"
eval/fvd.py ADDED
@@ -0,0 +1,56 @@
1
+ # Adapted from https://github.com/universome/fvd-comparison/blob/master/our_fvd.py
2
+
3
+ from typing import Tuple
4
+ import scipy
5
+ import numpy as np
6
+ import torch
7
+
8
+
9
+ def compute_fvd(feats_fake: np.ndarray, feats_real: np.ndarray) -> float:
10
+ mu_gen, sigma_gen = compute_stats(feats_fake)
11
+ mu_real, sigma_real = compute_stats(feats_real)
12
+
13
+ m = np.square(mu_gen - mu_real).sum()
14
+ s, _ = scipy.linalg.sqrtm(np.dot(sigma_gen, sigma_real), disp=False) # pylint: disable=no-member
15
+ fid = np.real(m + np.trace(sigma_gen + sigma_real - s * 2))
16
+
17
+ return float(fid)
18
+
19
+
20
+ def compute_stats(feats: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
21
+ mu = feats.mean(axis=0) # [d]
22
+ sigma = np.cov(feats, rowvar=False) # [d, d]
23
+
24
+ return mu, sigma
25
+
26
+
27
+ @torch.no_grad()
28
+ def compute_our_fvd(videos_fake: np.ndarray, videos_real: np.ndarray, device: str = "cuda") -> float:
29
+ i3d_path = "checkpoints/auxiliary/i3d_torchscript.pt"
30
+ i3d_kwargs = dict(
31
+ rescale=False, resize=False, return_features=True
32
+ ) # Return raw features before the softmax layer.
33
+
34
+ with open(i3d_path, "rb") as f:
35
+ i3d_model = torch.jit.load(f).eval().to(device)
36
+
37
+ videos_fake = videos_fake.permute(0, 4, 1, 2, 3).to(device)
38
+ videos_real = videos_real.permute(0, 4, 1, 2, 3).to(device)
39
+
40
+ feats_fake = i3d_model(videos_fake, **i3d_kwargs).cpu().numpy()
41
+ feats_real = i3d_model(videos_real, **i3d_kwargs).cpu().numpy()
42
+
43
+ return compute_fvd(feats_fake, feats_real)
44
+
45
+
46
+ def main():
47
+ # input shape: (b, f, h, w, c)
48
+ videos_fake = torch.rand(10, 16, 224, 224, 3)
49
+ videos_real = torch.rand(10, 16, 224, 224, 3)
50
+
51
+ our_fvd_result = compute_our_fvd(videos_fake, videos_real)
52
+ print(f"[FVD scores] Ours: {our_fvd_result}")
53
+
54
+
55
+ if __name__ == "__main__":
56
+ main()
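
For reference, compute_fvd above is the Fréchet distance between two Gaussians fitted to the I3D features by compute_stats, i.e. it evaluates

    FVD = \| \mu_{gen} - \mu_{real} \|_2^2 + \mathrm{Tr}\big( \Sigma_{gen} + \Sigma_{real} - 2 (\Sigma_{gen} \Sigma_{real})^{1/2} \big)

where the means and covariances are computed over the fake and real feature sets respectively.
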
eval/hyper_iqa.py ADDED
@@ -0,0 +1,343 @@
1
+ # Adapted from https://github.com/SSL92/hyperIQA/blob/master/models.py
2
+
3
+ import torch as torch
4
+ import torch.nn as nn
5
+ from torch.nn import functional as F
6
+ from torch.nn import init
7
+ import math
8
+ import torch.utils.model_zoo as model_zoo
9
+
10
+ model_urls = {
11
+ 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
12
+ 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
13
+ 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
14
+ 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
15
+ 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
16
+ }
17
+
18
+
19
+ class HyperNet(nn.Module):
20
+ """
21
+ Hyper network for learning perceptual rules.
22
+
23
+ Args:
24
+ lda_out_channels: local distortion aware module output size.
25
+ hyper_in_channels: input feature channels for hyper network.
26
+ target_in_size: input vector size for target network.
27
+ target_fc(i)_size: fully connection layer size of target network.
28
+ feature_size: input feature map width/height for hyper network.
29
+
30
+ Note:
31
+ For size match, input args must satisfy: 'target_fc(i)_size * target_fc(i+1)_size' is divisible by 'feature_size ^ 2'.
32
+
33
+ """
34
+ def __init__(self, lda_out_channels, hyper_in_channels, target_in_size, target_fc1_size, target_fc2_size, target_fc3_size, target_fc4_size, feature_size):
35
+ super(HyperNet, self).__init__()
36
+
37
+ self.hyperInChn = hyper_in_channels
38
+ self.target_in_size = target_in_size
39
+ self.f1 = target_fc1_size
40
+ self.f2 = target_fc2_size
41
+ self.f3 = target_fc3_size
42
+ self.f4 = target_fc4_size
43
+ self.feature_size = feature_size
44
+
45
+ self.res = resnet50_backbone(lda_out_channels, target_in_size, pretrained=True)
46
+
47
+ self.pool = nn.AdaptiveAvgPool2d((1, 1))
48
+
49
+ # Conv layers for resnet output features
50
+ self.conv1 = nn.Sequential(
51
+ nn.Conv2d(2048, 1024, 1, padding=(0, 0)),
52
+ nn.ReLU(inplace=True),
53
+ nn.Conv2d(1024, 512, 1, padding=(0, 0)),
54
+ nn.ReLU(inplace=True),
55
+ nn.Conv2d(512, self.hyperInChn, 1, padding=(0, 0)),
56
+ nn.ReLU(inplace=True)
57
+ )
58
+
59
+ # Hyper network part, conv for generating target fc weights, fc for generating target fc biases
60
+ self.fc1w_conv = nn.Conv2d(self.hyperInChn, int(self.target_in_size * self.f1 / feature_size ** 2), 3, padding=(1, 1))
61
+ self.fc1b_fc = nn.Linear(self.hyperInChn, self.f1)
62
+
63
+ self.fc2w_conv = nn.Conv2d(self.hyperInChn, int(self.f1 * self.f2 / feature_size ** 2), 3, padding=(1, 1))
64
+ self.fc2b_fc = nn.Linear(self.hyperInChn, self.f2)
65
+
66
+ self.fc3w_conv = nn.Conv2d(self.hyperInChn, int(self.f2 * self.f3 / feature_size ** 2), 3, padding=(1, 1))
67
+ self.fc3b_fc = nn.Linear(self.hyperInChn, self.f3)
68
+
69
+ self.fc4w_conv = nn.Conv2d(self.hyperInChn, int(self.f3 * self.f4 / feature_size ** 2), 3, padding=(1, 1))
70
+ self.fc4b_fc = nn.Linear(self.hyperInChn, self.f4)
71
+
72
+ self.fc5w_fc = nn.Linear(self.hyperInChn, self.f4)
73
+ self.fc5b_fc = nn.Linear(self.hyperInChn, 1)
74
+
75
+ # initialize
76
+ for i, m_name in enumerate(self._modules):
77
+ if i > 2:
78
+ nn.init.kaiming_normal_(self._modules[m_name].weight.data)
79
+
80
+ def forward(self, img):
81
+ feature_size = self.feature_size
82
+
83
+ res_out = self.res(img)
84
+
85
+ # input vector for target net
86
+ target_in_vec = res_out['target_in_vec'].reshape(-1, self.target_in_size, 1, 1)
87
+
88
+ # input features for hyper net
89
+ hyper_in_feat = self.conv1(res_out['hyper_in_feat']).reshape(-1, self.hyperInChn, feature_size, feature_size)
90
+
91
+ # generating target net weights & biases
92
+ target_fc1w = self.fc1w_conv(hyper_in_feat).reshape(-1, self.f1, self.target_in_size, 1, 1)
93
+ target_fc1b = self.fc1b_fc(self.pool(hyper_in_feat).squeeze()).reshape(-1, self.f1)
94
+
95
+ target_fc2w = self.fc2w_conv(hyper_in_feat).reshape(-1, self.f2, self.f1, 1, 1)
96
+ target_fc2b = self.fc2b_fc(self.pool(hyper_in_feat).squeeze()).reshape(-1, self.f2)
97
+
98
+ target_fc3w = self.fc3w_conv(hyper_in_feat).reshape(-1, self.f3, self.f2, 1, 1)
99
+ target_fc3b = self.fc3b_fc(self.pool(hyper_in_feat).squeeze()).reshape(-1, self.f3)
100
+
101
+ target_fc4w = self.fc4w_conv(hyper_in_feat).reshape(-1, self.f4, self.f3, 1, 1)
102
+ target_fc4b = self.fc4b_fc(self.pool(hyper_in_feat).squeeze()).reshape(-1, self.f4)
103
+
104
+ target_fc5w = self.fc5w_fc(self.pool(hyper_in_feat).squeeze()).reshape(-1, 1, self.f4, 1, 1)
105
+ target_fc5b = self.fc5b_fc(self.pool(hyper_in_feat).squeeze()).reshape(-1, 1)
106
+
107
+ out = {}
108
+ out['target_in_vec'] = target_in_vec
109
+ out['target_fc1w'] = target_fc1w
110
+ out['target_fc1b'] = target_fc1b
111
+ out['target_fc2w'] = target_fc2w
112
+ out['target_fc2b'] = target_fc2b
113
+ out['target_fc3w'] = target_fc3w
114
+ out['target_fc3b'] = target_fc3b
115
+ out['target_fc4w'] = target_fc4w
116
+ out['target_fc4b'] = target_fc4b
117
+ out['target_fc5w'] = target_fc5w
118
+ out['target_fc5b'] = target_fc5b
119
+
120
+ return out
121
+
122
+
123
+ class TargetNet(nn.Module):
124
+ """
125
+ Target network for quality prediction.
126
+ """
127
+ def __init__(self, paras):
128
+ super(TargetNet, self).__init__()
129
+ self.l1 = nn.Sequential(
130
+ TargetFC(paras['target_fc1w'], paras['target_fc1b']),
131
+ nn.Sigmoid(),
132
+ )
133
+ self.l2 = nn.Sequential(
134
+ TargetFC(paras['target_fc2w'], paras['target_fc2b']),
135
+ nn.Sigmoid(),
136
+ )
137
+
138
+ self.l3 = nn.Sequential(
139
+ TargetFC(paras['target_fc3w'], paras['target_fc3b']),
140
+ nn.Sigmoid(),
141
+ )
142
+
143
+ self.l4 = nn.Sequential(
144
+ TargetFC(paras['target_fc4w'], paras['target_fc4b']),
145
+ nn.Sigmoid(),
146
+ TargetFC(paras['target_fc5w'], paras['target_fc5b']),
147
+ )
148
+
149
+ def forward(self, x):
150
+ q = self.l1(x)
151
+ # q = F.dropout(q)
152
+ q = self.l2(q)
153
+ q = self.l3(q)
154
+ q = self.l4(q).squeeze()
155
+ return q
156
+
157
+
158
+ class TargetFC(nn.Module):
159
+ """
160
+ Fully connection operations for target net
161
+
162
+ Note:
163
+ Weights & biases are different for different images in a batch,
164
+ thus here we use group convolution for calculating images in a batch with individual weights & biases.
165
+ """
166
+ def __init__(self, weight, bias):
167
+ super(TargetFC, self).__init__()
168
+ self.weight = weight
169
+ self.bias = bias
170
+
171
+ def forward(self, input_):
172
+
173
+ input_re = input_.reshape(-1, input_.shape[0] * input_.shape[1], input_.shape[2], input_.shape[3])
174
+ weight_re = self.weight.reshape(self.weight.shape[0] * self.weight.shape[1], self.weight.shape[2], self.weight.shape[3], self.weight.shape[4])
175
+ bias_re = self.bias.reshape(self.bias.shape[0] * self.bias.shape[1])
176
+ out = F.conv2d(input=input_re, weight=weight_re, bias=bias_re, groups=self.weight.shape[0])
177
+
178
+ return out.reshape(input_.shape[0], self.weight.shape[1], input_.shape[2], input_.shape[3])
179
+
180
+
181
+ class Bottleneck(nn.Module):
182
+ expansion = 4
183
+
184
+ def __init__(self, inplanes, planes, stride=1, downsample=None):
185
+ super(Bottleneck, self).__init__()
186
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
187
+ self.bn1 = nn.BatchNorm2d(planes)
188
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
189
+ padding=1, bias=False)
190
+ self.bn2 = nn.BatchNorm2d(planes)
191
+ self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
192
+ self.bn3 = nn.BatchNorm2d(planes * 4)
193
+ self.relu = nn.ReLU(inplace=True)
194
+ self.downsample = downsample
195
+ self.stride = stride
196
+
197
+ def forward(self, x):
198
+ residual = x
199
+
200
+ out = self.conv1(x)
201
+ out = self.bn1(out)
202
+ out = self.relu(out)
203
+
204
+ out = self.conv2(out)
205
+ out = self.bn2(out)
206
+ out = self.relu(out)
207
+
208
+ out = self.conv3(out)
209
+ out = self.bn3(out)
210
+
211
+ if self.downsample is not None:
212
+ residual = self.downsample(x)
213
+
214
+ out += residual
215
+ out = self.relu(out)
216
+
217
+ return out
218
+
219
+
220
+ class ResNetBackbone(nn.Module):
221
+
222
+ def __init__(self, lda_out_channels, in_chn, block, layers, num_classes=1000):
223
+ super(ResNetBackbone, self).__init__()
224
+ self.inplanes = 64
225
+ self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
226
+ self.bn1 = nn.BatchNorm2d(64)
227
+ self.relu = nn.ReLU(inplace=True)
228
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
229
+ self.layer1 = self._make_layer(block, 64, layers[0])
230
+ self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
231
+ self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
232
+ self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
233
+
234
+ # local distortion aware module
235
+ self.lda1_pool = nn.Sequential(
236
+ nn.Conv2d(256, 16, kernel_size=1, stride=1, padding=0, bias=False),
237
+ nn.AvgPool2d(7, stride=7),
238
+ )
239
+ self.lda1_fc = nn.Linear(16 * 64, lda_out_channels)
240
+
241
+ self.lda2_pool = nn.Sequential(
242
+ nn.Conv2d(512, 32, kernel_size=1, stride=1, padding=0, bias=False),
243
+ nn.AvgPool2d(7, stride=7),
244
+ )
245
+ self.lda2_fc = nn.Linear(32 * 16, lda_out_channels)
246
+
247
+ self.lda3_pool = nn.Sequential(
248
+ nn.Conv2d(1024, 64, kernel_size=1, stride=1, padding=0, bias=False),
249
+ nn.AvgPool2d(7, stride=7),
250
+ )
251
+ self.lda3_fc = nn.Linear(64 * 4, lda_out_channels)
252
+
253
+ self.lda4_pool = nn.AvgPool2d(7, stride=7)
254
+ self.lda4_fc = nn.Linear(2048, in_chn - lda_out_channels * 3)
255
+
256
+ for m in self.modules():
257
+ if isinstance(m, nn.Conv2d):
258
+ n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
259
+ m.weight.data.normal_(0, math.sqrt(2. / n))
260
+ elif isinstance(m, nn.BatchNorm2d):
261
+ m.weight.data.fill_(1)
262
+ m.bias.data.zero_()
263
+
264
+ # initialize
265
+ nn.init.kaiming_normal_(self.lda1_pool._modules['0'].weight.data)
266
+ nn.init.kaiming_normal_(self.lda2_pool._modules['0'].weight.data)
267
+ nn.init.kaiming_normal_(self.lda3_pool._modules['0'].weight.data)
268
+ nn.init.kaiming_normal_(self.lda1_fc.weight.data)
269
+ nn.init.kaiming_normal_(self.lda2_fc.weight.data)
270
+ nn.init.kaiming_normal_(self.lda3_fc.weight.data)
271
+ nn.init.kaiming_normal_(self.lda4_fc.weight.data)
272
+
273
+ def _make_layer(self, block, planes, blocks, stride=1):
274
+ downsample = None
275
+ if stride != 1 or self.inplanes != planes * block.expansion:
276
+ downsample = nn.Sequential(
277
+ nn.Conv2d(self.inplanes, planes * block.expansion,
278
+ kernel_size=1, stride=stride, bias=False),
279
+ nn.BatchNorm2d(planes * block.expansion),
280
+ )
281
+
282
+ layers = []
283
+ layers.append(block(self.inplanes, planes, stride, downsample))
284
+ self.inplanes = planes * block.expansion
285
+ for i in range(1, blocks):
286
+ layers.append(block(self.inplanes, planes))
287
+
288
+ return nn.Sequential(*layers)
289
+
290
+ def forward(self, x):
291
+ x = self.conv1(x)
292
+ x = self.bn1(x)
293
+ x = self.relu(x)
294
+ x = self.maxpool(x)
295
+ x = self.layer1(x)
296
+
297
+ # the same effect as lda operation in the paper, but save much more memory
298
+ lda_1 = self.lda1_fc(self.lda1_pool(x).reshape(x.size(0), -1))
299
+ x = self.layer2(x)
300
+ lda_2 = self.lda2_fc(self.lda2_pool(x).reshape(x.size(0), -1))
301
+ x = self.layer3(x)
302
+ lda_3 = self.lda3_fc(self.lda3_pool(x).reshape(x.size(0), -1))
303
+ x = self.layer4(x)
304
+ lda_4 = self.lda4_fc(self.lda4_pool(x).reshape(x.size(0), -1))
305
+
306
+ vec = torch.cat((lda_1, lda_2, lda_3, lda_4), 1)
307
+
308
+ out = {}
309
+ out['hyper_in_feat'] = x
310
+ out['target_in_vec'] = vec
311
+
312
+ return out
313
+
314
+
315
+ def resnet50_backbone(lda_out_channels, in_chn, pretrained=False, **kwargs):
316
+ """Constructs a ResNet-50 model_hyper.
317
+
318
+ Args:
319
+ pretrained (bool): If True, returns a model_hyper pre-trained on ImageNet
320
+ """
321
+ model = ResNetBackbone(lda_out_channels, in_chn, Bottleneck, [3, 4, 6, 3], **kwargs)
322
+ if pretrained:
323
+ save_model = model_zoo.load_url(model_urls['resnet50'])
324
+ model_dict = model.state_dict()
325
+ state_dict = {k: v for k, v in save_model.items() if k in model_dict.keys()}
326
+ model_dict.update(state_dict)
327
+ model.load_state_dict(model_dict)
328
+ else:
329
+ model.apply(weights_init_xavier)
330
+ return model
331
+
332
+
333
+ def weights_init_xavier(m):
334
+ classname = m.__class__.__name__
335
+ # print(classname)
336
+ # if isinstance(m, nn.Conv2d):
337
+ if classname.find('Conv') != -1:
338
+ init.kaiming_normal_(m.weight.data)
339
+ elif classname.find('Linear') != -1:
340
+ init.kaiming_normal_(m.weight.data)
341
+ elif classname.find('BatchNorm2d') != -1:
342
+ init.uniform_(m.weight.data, 1.0, 0.02)
343
+ init.constant_(m.bias.data, 0.0)
eval/inference_videos.py ADDED
@@ -0,0 +1,37 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import subprocess
17
+ from tqdm import tqdm
18
+
19
+
20
+ def inference_video_from_dir(input_dir, output_dir, unet_config_path, ckpt_path):
21
+ os.makedirs(output_dir, exist_ok=True)
22
+ video_names = sorted([f for f in os.listdir(input_dir) if f.endswith(".mp4")])
23
+ for video_name in tqdm(video_names):
24
+ video_path = os.path.join(input_dir, video_name)
25
+ audio_path = os.path.join(input_dir, video_name.replace(".mp4", "_audio.wav"))
26
+ video_out_path = os.path.join(output_dir, video_name.replace(".mp4", "_out.mp4"))
27
+ inference_command = f"python inference.py --unet_config_path {unet_config_path} --video_path {video_path} --audio_path {audio_path} --video_out_path {video_out_path} --inference_ckpt_path {ckpt_path} --seed 1247"
28
+ subprocess.run(inference_command, shell=True)
29
+
30
+
31
+ if __name__ == "__main__":
32
+ input_dir = "/mnt/bn/maliva-gen-ai-v2/chunyu.li/HDTF/segmented/cross"
33
+ output_dir = "/mnt/bn/maliva-gen-ai-v2/chunyu.li/HDTF/segmented/latentsync_cross"
34
+ unet_config_path = "configs/unet/unet_latent_16_diffusion.yaml"
35
+ ckpt_path = "output/unet/train-2024_10_08-16:23:43/checkpoints/checkpoint-1920000.pt"
36
+
37
+ inference_video_from_dir(input_dir, output_dir, unet_config_path, ckpt_path)
eval/syncnet/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .syncnet_eval import SyncNetEval
eval/syncnet/syncnet.py ADDED
@@ -0,0 +1,113 @@
1
+ # https://github.com/joonson/syncnet_python/blob/master/SyncNetModel.py
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+
7
+ def save(model, filename):
8
+ with open(filename, "wb") as f:
9
+ torch.save(model, f)
10
+ print("%s saved." % filename)
11
+
12
+
13
+ def load(filename):
14
+ net = torch.load(filename)
15
+ return net
16
+
17
+
18
+ class S(nn.Module):
19
+ def __init__(self, num_layers_in_fc_layers=1024):
20
+ super(S, self).__init__()
21
+
22
+ self.__nFeatures__ = 24
23
+ self.__nChs__ = 32
24
+ self.__midChs__ = 32
25
+
26
+ self.netcnnaud = nn.Sequential(
27
+ nn.Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
28
+ nn.BatchNorm2d(64),
29
+ nn.ReLU(inplace=True),
30
+ nn.MaxPool2d(kernel_size=(1, 1), stride=(1, 1)),
31
+ nn.Conv2d(64, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
32
+ nn.BatchNorm2d(192),
33
+ nn.ReLU(inplace=True),
34
+ nn.MaxPool2d(kernel_size=(3, 3), stride=(1, 2)),
35
+ nn.Conv2d(192, 384, kernel_size=(3, 3), padding=(1, 1)),
36
+ nn.BatchNorm2d(384),
37
+ nn.ReLU(inplace=True),
38
+ nn.Conv2d(384, 256, kernel_size=(3, 3), padding=(1, 1)),
39
+ nn.BatchNorm2d(256),
40
+ nn.ReLU(inplace=True),
41
+ nn.Conv2d(256, 256, kernel_size=(3, 3), padding=(1, 1)),
42
+ nn.BatchNorm2d(256),
43
+ nn.ReLU(inplace=True),
44
+ nn.MaxPool2d(kernel_size=(3, 3), stride=(2, 2)),
45
+ nn.Conv2d(256, 512, kernel_size=(5, 4), padding=(0, 0)),
46
+ nn.BatchNorm2d(512),
47
+ nn.ReLU(),
48
+ )
49
+
50
+ self.netfcaud = nn.Sequential(
51
+ nn.Linear(512, 512),
52
+ nn.BatchNorm1d(512),
53
+ nn.ReLU(),
54
+ nn.Linear(512, num_layers_in_fc_layers),
55
+ )
56
+
57
+ self.netfclip = nn.Sequential(
58
+ nn.Linear(512, 512),
59
+ nn.BatchNorm1d(512),
60
+ nn.ReLU(),
61
+ nn.Linear(512, num_layers_in_fc_layers),
62
+ )
63
+
64
+ self.netcnnlip = nn.Sequential(
65
+ nn.Conv3d(3, 96, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=0),
66
+ nn.BatchNorm3d(96),
67
+ nn.ReLU(inplace=True),
68
+ nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2)),
69
+ nn.Conv3d(96, 256, kernel_size=(1, 5, 5), stride=(1, 2, 2), padding=(0, 1, 1)),
70
+ nn.BatchNorm3d(256),
71
+ nn.ReLU(inplace=True),
72
+ nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)),
73
+ nn.Conv3d(256, 256, kernel_size=(1, 3, 3), padding=(0, 1, 1)),
74
+ nn.BatchNorm3d(256),
75
+ nn.ReLU(inplace=True),
76
+ nn.Conv3d(256, 256, kernel_size=(1, 3, 3), padding=(0, 1, 1)),
77
+ nn.BatchNorm3d(256),
78
+ nn.ReLU(inplace=True),
79
+ nn.Conv3d(256, 256, kernel_size=(1, 3, 3), padding=(0, 1, 1)),
80
+ nn.BatchNorm3d(256),
81
+ nn.ReLU(inplace=True),
82
+ nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2)),
83
+ nn.Conv3d(256, 512, kernel_size=(1, 6, 6), padding=0),
84
+ nn.BatchNorm3d(512),
85
+ nn.ReLU(inplace=True),
86
+ )
87
+
88
+ def forward_aud(self, x):
89
+
90
+ mid = self.netcnnaud(x)
91
+ # N x ch x 24 x M
92
+ mid = mid.view((mid.size()[0], -1))
93
+ # N x (ch x 24)
94
+ out = self.netfcaud(mid)
95
+
96
+ return out
97
+
98
+ def forward_lip(self, x):
99
+
100
+ mid = self.netcnnlip(x)
101
+ mid = mid.view((mid.size()[0], -1))
102
+ # N x (ch x 24)
103
+ out = self.netfclip(mid)
104
+
105
+ return out
106
+
107
+ def forward_lipfeat(self, x):
108
+
109
+ mid = self.netcnnlip(x)
110
+ out = mid.view((mid.size()[0], -1))
111
+ # N x (ch x 24)
112
+
113
+ return out
eval/syncnet/syncnet_eval.py ADDED
@@ -0,0 +1,220 @@
1
+ # Adapted from https://github.com/joonson/syncnet_python/blob/master/SyncNetInstance.py
2
+
3
+ import torch
4
+ import numpy
5
+ import time, pdb, argparse, subprocess, os, math, glob
6
+ import cv2
7
+ import python_speech_features
8
+
9
+ from scipy import signal
10
+ from scipy.io import wavfile
11
+ from .syncnet import S
12
+ from shutil import rmtree
13
+
14
+
15
+ # ==================== Get OFFSET ====================
16
+
17
+ # Video 25 FPS, Audio 16000HZ
18
+
19
+
20
+ def calc_pdist(feat1, feat2, vshift=10):
21
+ win_size = vshift * 2 + 1
22
+
23
+ feat2p = torch.nn.functional.pad(feat2, (0, 0, vshift, vshift))
24
+
25
+ dists = []
26
+
27
+ for i in range(0, len(feat1)):
28
+
29
+ dists.append(
30
+ torch.nn.functional.pairwise_distance(feat1[[i], :].repeat(win_size, 1), feat2p[i : i + win_size, :])
31
+ )
32
+
33
+ return dists
34
+
35
+
36
+ # ==================== MAIN DEF ====================
37
+
38
+
39
+ class SyncNetEval(torch.nn.Module):
40
+ def __init__(self, dropout=0, num_layers_in_fc_layers=1024, device="cpu"):
41
+ super().__init__()
42
+
43
+ self.__S__ = S(num_layers_in_fc_layers=num_layers_in_fc_layers).to(device)
44
+ self.device = device
45
+
46
+ def evaluate(self, video_path, temp_dir="temp", batch_size=20, vshift=15):
47
+
48
+ self.__S__.eval()
49
+
50
+ # ========== ==========
51
+ # Convert files
52
+ # ========== ==========
53
+
54
+ if os.path.exists(temp_dir):
55
+ rmtree(temp_dir)
56
+
57
+ os.makedirs(temp_dir)
58
+
59
+ # temp_video_path = os.path.join(temp_dir, "temp.mp4")
60
+ # command = f"ffmpeg -loglevel error -nostdin -y -i {video_path} -vf scale='224:224' {temp_video_path}"
61
+ # subprocess.call(command, shell=True)
62
+
63
+ command = (
64
+ f"ffmpeg -loglevel error -nostdin -y -i {video_path} -f image2 {os.path.join(temp_dir, '%06d.jpg')}"
65
+ )
66
+ subprocess.call(command, shell=True, stdout=None)
67
+
68
+ command = f"ffmpeg -loglevel error -nostdin -y -i {video_path} -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 {os.path.join(temp_dir, 'audio.wav')}"
69
+ subprocess.call(command, shell=True, stdout=None)
70
+
71
+ # ========== ==========
72
+ # Load video
73
+ # ========== ==========
74
+
75
+ images = []
76
+
77
+ flist = glob.glob(os.path.join(temp_dir, "*.jpg"))
78
+ flist.sort()
79
+
80
+ for fname in flist:
81
+ img_input = cv2.imread(fname)
82
+ img_input = cv2.resize(img_input, (224, 224)) # HARD CODED, CHANGE BEFORE RELEASE
83
+ images.append(img_input)
84
+
85
+ im = numpy.stack(images, axis=3)
86
+ im = numpy.expand_dims(im, axis=0)
87
+ im = numpy.transpose(im, (0, 3, 4, 1, 2))
88
+
89
+ imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float())
90
+
91
+ # ========== ==========
92
+ # Load audio
93
+ # ========== ==========
94
+
95
+ sample_rate, audio = wavfile.read(os.path.join(temp_dir, "audio.wav"))
96
+ mfcc = zip(*python_speech_features.mfcc(audio, sample_rate))
97
+ mfcc = numpy.stack([numpy.array(i) for i in mfcc])
98
+
99
+ cc = numpy.expand_dims(numpy.expand_dims(mfcc, axis=0), axis=0)
100
+ cct = torch.autograd.Variable(torch.from_numpy(cc.astype(float)).float())
101
+
102
+ # ========== ==========
103
+ # Check audio and video input length
104
+ # ========== ==========
105
+
106
+ # if (float(len(audio)) / 16000) != (float(len(images)) / 25):
107
+ # print(
108
+ # "WARNING: Audio (%.4fs) and video (%.4fs) lengths are different."
109
+ # % (float(len(audio)) / 16000, float(len(images)) / 25)
110
+ # )
111
+
112
+ min_length = min(len(images), math.floor(len(audio) / 640))
113
+
114
+ # ========== ==========
115
+ # Generate video and audio feats
116
+ # ========== ==========
117
+
118
+ lastframe = min_length - 5
119
+ im_feat = []
120
+ cc_feat = []
121
+
122
+ tS = time.time()
123
+ for i in range(0, lastframe, batch_size):
124
+
125
+ im_batch = [imtv[:, :, vframe : vframe + 5, :, :] for vframe in range(i, min(lastframe, i + batch_size))]
126
+ im_in = torch.cat(im_batch, 0)
127
+ im_out = self.__S__.forward_lip(im_in.to(self.device))
128
+ im_feat.append(im_out.data.cpu())
129
+
130
+ cc_batch = [
131
+ cct[:, :, :, vframe * 4 : vframe * 4 + 20] for vframe in range(i, min(lastframe, i + batch_size))
132
+ ]
133
+ cc_in = torch.cat(cc_batch, 0)
134
+ cc_out = self.__S__.forward_aud(cc_in.to(self.device))
135
+ cc_feat.append(cc_out.data.cpu())
136
+
137
+ im_feat = torch.cat(im_feat, 0)
138
+ cc_feat = torch.cat(cc_feat, 0)
139
+
140
+ # ========== ==========
141
+ # Compute offset
142
+ # ========== ==========
143
+
144
+ dists = calc_pdist(im_feat, cc_feat, vshift=vshift)
145
+ mean_dists = torch.mean(torch.stack(dists, 1), 1)
146
+
147
+ min_dist, minidx = torch.min(mean_dists, 0)
148
+
149
+ av_offset = vshift - minidx
150
+ conf = torch.median(mean_dists) - min_dist
151
+
152
+ fdist = numpy.stack([dist[minidx].numpy() for dist in dists])
153
+ # fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=15)
154
+ fconf = torch.median(mean_dists).numpy() - fdist
155
+ framewise_conf = signal.medfilt(fconf, kernel_size=9)
156
+
157
+ # numpy.set_printoptions(formatter={"float": "{: 0.3f}".format})
158
+ rmtree(temp_dir)
159
+ return av_offset.item(), min_dist.item(), conf.item()
160
+
161
+ def extract_feature(self, opt, videofile):
162
+
163
+ self.__S__.eval()
164
+
165
+ # ========== ==========
166
+ # Load video
167
+ # ========== ==========
168
+ cap = cv2.VideoCapture(videofile)
169
+
170
+ frame_num = 1
171
+ images = []
172
+ while frame_num:
173
+ frame_num += 1
174
+ ret, image = cap.read()
175
+ if ret == 0:
176
+ break
177
+
178
+ images.append(image)
179
+
180
+ im = numpy.stack(images, axis=3)
181
+ im = numpy.expand_dims(im, axis=0)
182
+ im = numpy.transpose(im, (0, 3, 4, 1, 2))
183
+
184
+ imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float())
185
+
186
+ # ========== ==========
187
+ # Generate video feats
188
+ # ========== ==========
189
+
190
+ lastframe = len(images) - 4
191
+ im_feat = []
192
+
193
+ tS = time.time()
194
+ for i in range(0, lastframe, opt.batch_size):
195
+
196
+ im_batch = [
197
+ imtv[:, :, vframe : vframe + 5, :, :] for vframe in range(i, min(lastframe, i + opt.batch_size))
198
+ ]
199
+ im_in = torch.cat(im_batch, 0)
200
+ im_out = self.__S__.forward_lipfeat(im_in.to(self.device))
201
+ im_feat.append(im_out.data.cpu())
202
+
203
+ im_feat = torch.cat(im_feat, 0)
204
+
205
+ # ========== ==========
206
+ # Compute offset
207
+ # ========== ==========
208
+
209
+ print("Compute time %.3f sec." % (time.time() - tS))
210
+
211
+ return im_feat
212
+
213
+ def loadParameters(self, path):
214
+ loaded_state = torch.load(path, map_location=lambda storage, loc: storage)
215
+
216
+ self_state = self.__S__.state_dict()
217
+
218
+ for name, param in loaded_state.items():
219
+
220
+ self_state[name].copy_(param)
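
Note on the values returned by evaluate above: after averaging the framewise distance curves, the AV offset is vshift minus the index of the smallest mean distance, and the confidence is the median of the mean distances minus that minimum (a larger gap means a more confident sync estimate). A toy sketch of this final step, using a made-up distance curve:

    import torch

    vshift = 15
    # Hypothetical mean audio-visual distances over the 2 * vshift + 1 candidate shifts
    mean_dists = torch.rand(2 * vshift + 1) + 5.0
    mean_dists[13] = 1.0  # pretend the best alignment is at index 13

    min_dist, minidx = torch.min(mean_dists, 0)
    av_offset = vshift - minidx                 # -> tensor(2) for this made-up curve
    conf = torch.median(mean_dists) - min_dist  # roughly 4.5 here; larger = more confident
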
eval/syncnet_detect.py ADDED
@@ -0,0 +1,251 @@
1
+ # Adapted from https://github.com/joonson/syncnet_python/blob/master/run_pipeline.py
2
+
3
+ import os, pdb, subprocess, glob, cv2
4
+ import numpy as np
5
+ from shutil import rmtree
6
+ import torch
7
+
8
+ from scenedetect.video_manager import VideoManager
9
+ from scenedetect.scene_manager import SceneManager
10
+ from scenedetect.stats_manager import StatsManager
11
+ from scenedetect.detectors import ContentDetector
12
+
13
+ from scipy.interpolate import interp1d
14
+ from scipy.io import wavfile
15
+ from scipy import signal
16
+
17
+ from eval.detectors import S3FD
18
+
19
+
20
+ class SyncNetDetector:
21
+ def __init__(self, device, detect_results_dir="detect_results"):
22
+ self.s3f_detector = S3FD(device=device)
23
+ self.detect_results_dir = detect_results_dir
24
+
25
+ def __call__(self, video_path: str, min_track=50, scale=False):
26
+ crop_dir = os.path.join(self.detect_results_dir, "crop")
27
+ video_dir = os.path.join(self.detect_results_dir, "video")
28
+ frames_dir = os.path.join(self.detect_results_dir, "frames")
29
+ temp_dir = os.path.join(self.detect_results_dir, "temp")
30
+
31
+ # ========== DELETE EXISTING DIRECTORIES ==========
32
+ if os.path.exists(crop_dir):
33
+ rmtree(crop_dir)
34
+
35
+ if os.path.exists(video_dir):
36
+ rmtree(video_dir)
37
+
38
+ if os.path.exists(frames_dir):
39
+ rmtree(frames_dir)
40
+
41
+ if os.path.exists(temp_dir):
42
+ rmtree(temp_dir)
43
+
44
+ # ========== MAKE NEW DIRECTORIES ==========
45
+
46
+ os.makedirs(crop_dir)
47
+ os.makedirs(video_dir)
48
+ os.makedirs(frames_dir)
49
+ os.makedirs(temp_dir)
50
+
51
+ # ========== CONVERT VIDEO AND EXTRACT FRAMES ==========
52
+
53
+ if scale:
54
+ scaled_video_path = os.path.join(video_dir, "scaled.mp4")
55
+ command = f"ffmpeg -loglevel error -y -nostdin -i {video_path} -vf scale='224:224' {scaled_video_path}"
56
+ subprocess.run(command, shell=True)
57
+ video_path = scaled_video_path
58
+
59
+ command = f"ffmpeg -y -nostdin -loglevel error -i {video_path} -qscale:v 2 -async 1 -r 25 {os.path.join(video_dir, 'video.mp4')}"
60
+ subprocess.run(command, shell=True, stdout=None)
61
+
62
+ command = f"ffmpeg -y -nostdin -loglevel error -i {os.path.join(video_dir, 'video.mp4')} -qscale:v 2 -f image2 {os.path.join(frames_dir, '%06d.jpg')}"
63
+ subprocess.run(command, shell=True, stdout=None)
64
+
65
+ command = f"ffmpeg -y -nostdin -loglevel error -i {os.path.join(video_dir, 'video.mp4')} -ac 1 -vn -acodec pcm_s16le -ar 16000 {os.path.join(video_dir, 'audio.wav')}"
66
+ subprocess.run(command, shell=True, stdout=None)
67
+
68
+ faces = self.detect_face(frames_dir)
69
+
70
+ scene = self.scene_detect(video_dir)
71
+
72
+ # Face tracking
73
+ alltracks = []
74
+
75
+ for shot in scene:
76
+ if shot[1].frame_num - shot[0].frame_num >= min_track:
77
+ alltracks.extend(self.track_face(faces[shot[0].frame_num : shot[1].frame_num], min_track=min_track))
78
+
79
+ # Face crop
80
+ for ii, track in enumerate(alltracks):
81
+ self.crop_video(track, os.path.join(crop_dir, "%05d" % ii), frames_dir, 25, temp_dir, video_dir)
82
+
83
+ rmtree(temp_dir)
84
+
85
+ def scene_detect(self, video_dir):
86
+ video_manager = VideoManager([os.path.join(video_dir, "video.mp4")])
87
+ stats_manager = StatsManager()
88
+ scene_manager = SceneManager(stats_manager)
89
+ # Add ContentDetector algorithm (constructor takes detector options like threshold).
90
+ scene_manager.add_detector(ContentDetector())
91
+ base_timecode = video_manager.get_base_timecode()
92
+
93
+ video_manager.set_downscale_factor()
94
+
95
+ video_manager.start()
96
+
97
+ scene_manager.detect_scenes(frame_source=video_manager)
98
+
99
+ scene_list = scene_manager.get_scene_list(base_timecode)
100
+
101
+ if scene_list == []:
102
+ scene_list = [(video_manager.get_base_timecode(), video_manager.get_current_timecode())]
103
+
104
+ return scene_list
105
+
106
+ def track_face(self, scenefaces, num_failed_det=25, min_track=50, min_face_size=100):
107
+
108
+ iouThres = 0.5 # Minimum IOU between consecutive face detections
109
+ tracks = []
110
+
111
+ while True:
112
+ track = []
113
+ for framefaces in scenefaces:
114
+ for face in framefaces:
115
+ if track == []:
116
+ track.append(face)
117
+ framefaces.remove(face)
118
+ elif face["frame"] - track[-1]["frame"] <= num_failed_det:
119
+ iou = bounding_box_iou(face["bbox"], track[-1]["bbox"])
120
+ if iou > iouThres:
121
+ track.append(face)
122
+ framefaces.remove(face)
123
+ continue
124
+ else:
125
+ break
126
+
127
+ if track == []:
128
+ break
129
+ elif len(track) > min_track:
130
+
131
+ framenum = np.array([f["frame"] for f in track])
132
+ bboxes = np.array([np.array(f["bbox"]) for f in track])
133
+
134
+ frame_i = np.arange(framenum[0], framenum[-1] + 1)
135
+
136
+ bboxes_i = []
137
+ for ij in range(0, 4):
138
+ interpfn = interp1d(framenum, bboxes[:, ij])
139
+ bboxes_i.append(interpfn(frame_i))
140
+ bboxes_i = np.stack(bboxes_i, axis=1)
141
+
142
+ if (
143
+ max(np.mean(bboxes_i[:, 2] - bboxes_i[:, 0]), np.mean(bboxes_i[:, 3] - bboxes_i[:, 1]))
144
+ > min_face_size
145
+ ):
146
+ tracks.append({"frame": frame_i, "bbox": bboxes_i})
147
+
148
+ return tracks
149
+
150
+ def detect_face(self, frames_dir, facedet_scale=0.25):
151
+ flist = glob.glob(os.path.join(frames_dir, "*.jpg"))
152
+ flist.sort()
153
+
154
+ dets = []
155
+
156
+ for fidx, fname in enumerate(flist):
157
+ image = cv2.imread(fname)
158
+
159
+ image_np = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
160
+ bboxes = self.s3f_detector.detect_faces(image_np, conf_th=0.9, scales=[facedet_scale])
161
+
162
+ dets.append([])
163
+ for bbox in bboxes:
164
+ dets[-1].append({"frame": fidx, "bbox": (bbox[:-1]).tolist(), "conf": bbox[-1]})
165
+
166
+ return dets
167
+
168
+ def crop_video(self, track, cropfile, frames_dir, frame_rate, temp_dir, video_dir, crop_scale=0.4):
169
+
170
+ flist = glob.glob(os.path.join(frames_dir, "*.jpg"))
171
+ flist.sort()
172
+
173
+ fourcc = cv2.VideoWriter_fourcc(*"mp4v")
174
+ vOut = cv2.VideoWriter(cropfile + "t.mp4", fourcc, frame_rate, (224, 224))
175
+
176
+ dets = {"x": [], "y": [], "s": []}
177
+
178
+ for det in track["bbox"]:
179
+
180
+ dets["s"].append(max((det[3] - det[1]), (det[2] - det[0])) / 2)
181
+ dets["y"].append((det[1] + det[3]) / 2)  # crop center y
182
+ dets["x"].append((det[0] + det[2]) / 2)  # crop center x
183
+
184
+ # Smooth detections
185
+ dets["s"] = signal.medfilt(dets["s"], kernel_size=13)
186
+ dets["x"] = signal.medfilt(dets["x"], kernel_size=13)
187
+ dets["y"] = signal.medfilt(dets["y"], kernel_size=13)
188
+
189
+ for fidx, frame in enumerate(track["frame"]):
190
+
191
+ cs = crop_scale
192
+
193
+ bs = dets["s"][fidx] # Detection box size
194
+ bsi = int(bs * (1 + 2 * cs)) # Pad videos by this amount
195
+
196
+ image = cv2.imread(flist[frame])
197
+
198
+ frame = np.pad(image, ((bsi, bsi), (bsi, bsi), (0, 0)), "constant", constant_values=(110, 110))
199
+ my = dets["y"][fidx] + bsi # BBox center Y
200
+ mx = dets["x"][fidx] + bsi # BBox center X
201
+
202
+ face = frame[int(my - bs) : int(my + bs * (1 + 2 * cs)), int(mx - bs * (1 + cs)) : int(mx + bs * (1 + cs))]
203
+
204
+ vOut.write(cv2.resize(face, (224, 224)))
205
+
206
+ audiotmp = os.path.join(temp_dir, "audio.wav")
207
+ audiostart = (track["frame"][0]) / frame_rate
208
+ audioend = (track["frame"][-1] + 1) / frame_rate
209
+
210
+ vOut.release()
211
+
212
+ # ========== CROP AUDIO FILE ==========
213
+
214
+ command = "ffmpeg -y -nostdin -loglevel error -i %s -ss %.3f -to %.3f %s" % (
215
+ os.path.join(video_dir, "audio.wav"),
216
+ audiostart,
217
+ audioend,
218
+ audiotmp,
219
+ )
220
+ output = subprocess.run(command, shell=True, stdout=None)
221
+
222
+ sample_rate, audio = wavfile.read(audiotmp)
223
+
224
+ # ========== COMBINE AUDIO AND VIDEO FILES ==========
225
+
226
+ command = "ffmpeg -y -nostdin -loglevel error -i %st.mp4 -i %s -c:v copy -c:a aac %s.mp4" % (
227
+ cropfile,
228
+ audiotmp,
229
+ cropfile,
230
+ )
231
+ output = subprocess.run(command, shell=True, stdout=None)
232
+
233
+ os.remove(cropfile + "t.mp4")
234
+
235
+ return {"track": track, "proc_track": dets}
236
+
237
+
238
+ def bounding_box_iou(boxA, boxB):
239
+ xA = max(boxA[0], boxB[0])
240
+ yA = max(boxA[1], boxB[1])
241
+ xB = min(boxA[2], boxB[2])
242
+ yB = min(boxA[3], boxB[3])
243
+
244
+ interArea = max(0, xB - xA) * max(0, yB - yA)
245
+
246
+ boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
247
+ boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
248
+
249
+ iou = interArea / float(boxAArea + boxBArea - interArea)
250
+
251
+ return iou
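
A quick worked example of bounding_box_iou above, with two hypothetical boxes in (x1, y1, x2, y2) format:

    boxA = [0, 0, 10, 10]
    boxB = [5, 5, 15, 15]
    # Intersection is the 5 x 5 square from (5, 5) to (10, 10): area 25
    # Union is 100 + 100 - 25 = 175, so IoU = 25 / 175
    print(bounding_box_iou(boxA, boxB))  # ~0.1429

Since 0.1429 is below the iouThres of 0.5 used in track_face, two such detections would not be linked into the same face track.
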
inference.sh ADDED
@@ -0,0 +1,9 @@
1
+ #!/bin/bash
2
+
3
+ python -m scripts.inference \
4
+ --unet_config_path "configs/unet/second_stage.yaml" \
5
+ --inference_ckpt_path "checkpoints/latentsync_unet.pt" \
6
+ --guidance_scale 1.0 \
7
+ --video_path "assets/demo1_video.mp4" \
8
+ --audio_path "assets/demo1_audio.wav" \
9
+ --video_out_path "video_out.mp4"
latentsync/data/syncnet_dataset.py ADDED
@@ -0,0 +1,153 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import numpy as np
17
+ from torch.utils.data import Dataset
18
+ import torch
19
+ import random
20
+ from ..utils.util import gather_video_paths_recursively
21
+ from ..utils.image_processor import ImageProcessor
22
+ from ..utils.audio import melspectrogram
23
+ import math
24
+
25
+ from decord import AudioReader, VideoReader, cpu
26
+
27
+
28
+ class SyncNetDataset(Dataset):
29
+ def __init__(self, data_dir: str, fileslist: str, config):
30
+ if fileslist != "":
31
+ with open(fileslist) as file:
32
+ self.video_paths = [line.rstrip() for line in file]
33
+ elif data_dir != "":
34
+ self.video_paths = gather_video_paths_recursively(data_dir)
35
+ else:
36
+ raise ValueError("data_dir and fileslist cannot both be empty")
37
+
38
+ self.resolution = config.data.resolution
39
+ self.num_frames = config.data.num_frames
40
+
41
+ self.mel_window_length = math.ceil(self.num_frames / 5 * 16)
42
+
43
+ self.audio_sample_rate = config.data.audio_sample_rate
44
+ self.video_fps = config.data.video_fps
45
+ self.audio_samples_length = int(
46
+ config.data.audio_sample_rate // config.data.video_fps * config.data.num_frames
47
+ )
48
+ self.image_processor = ImageProcessor(resolution=config.data.resolution, mask="half")
49
+ self.audio_mel_cache_dir = config.data.audio_mel_cache_dir
50
+ os.makedirs(self.audio_mel_cache_dir, exist_ok=True)
51
+
52
+ def __len__(self):
53
+ return len(self.video_paths)
54
+
55
+ def read_audio(self, video_path: str):
56
+ ar = AudioReader(video_path, ctx=cpu(self.worker_id), sample_rate=self.audio_sample_rate)
57
+ original_mel = melspectrogram(ar[:].asnumpy().squeeze(0))
58
+ return torch.from_numpy(original_mel)
59
+
60
+ def crop_audio_window(self, original_mel, start_index):
61
+ start_idx = int(80.0 * (start_index / float(self.video_fps)))
62
+ end_idx = start_idx + self.mel_window_length
63
+ return original_mel[:, start_idx:end_idx].unsqueeze(0)
64
+
65
+ def get_frames(self, video_reader: VideoReader):
66
+ total_num_frames = len(video_reader)
67
+
68
+ start_idx = random.randint(0, total_num_frames - self.num_frames)
69
+ frames_index = np.arange(start_idx, start_idx + self.num_frames, dtype=int)
70
+
71
+ while True:
72
+ wrong_start_idx = random.randint(0, total_num_frames - self.num_frames)
73
+ # wrong_start_idx = random.randint(
74
+ # max(0, start_idx - 25), min(total_num_frames - self.num_frames, start_idx + 25)
75
+ # )
76
+ if wrong_start_idx == start_idx:
77
+ continue
78
+ # if wrong_start_idx >= start_idx - self.num_frames and wrong_start_idx <= start_idx + self.num_frames:
79
+ # continue
80
+ wrong_frames_index = np.arange(wrong_start_idx, wrong_start_idx + self.num_frames, dtype=int)
81
+ break
82
+
83
+ frames = video_reader.get_batch(frames_index).asnumpy()
84
+ wrong_frames = video_reader.get_batch(wrong_frames_index).asnumpy()
85
+
86
+ return frames, wrong_frames, start_idx
87
+
88
+ def worker_init_fn(self, worker_id):
89
+ # Initialize the face mesh object in each worker process,
90
+ # because the face mesh object cannot be called in subprocesses
91
+ self.worker_id = worker_id
92
+ # setattr(self, f"image_processor_{worker_id}", ImageProcessor(self.resolution, self.mask))
93
+
94
+ def __getitem__(self, idx):
95
+ # image_processor = getattr(self, f"image_processor_{self.worker_id}")
96
+ while True:
97
+ try:
98
+ idx = random.randint(0, len(self) - 1)
99
+
100
+ # Get video file path
101
+ video_path = self.video_paths[idx]
102
+
103
+ vr = VideoReader(video_path, ctx=cpu(self.worker_id))
104
+
105
+ if len(vr) < 2 * self.num_frames:
106
+ continue
107
+
108
+ frames, wrong_frames, start_idx = self.get_frames(vr)
109
+
110
+ mel_cache_path = os.path.join(
111
+ self.audio_mel_cache_dir, os.path.basename(video_path).replace(".mp4", "_mel.pt")
112
+ )
113
+
114
+ if os.path.isfile(mel_cache_path):
115
+ try:
116
+ original_mel = torch.load(mel_cache_path)
117
+ except Exception as e:
118
+ print(f"{type(e).__name__} - {e} - {mel_cache_path}")
119
+ os.remove(mel_cache_path)
120
+ original_mel = self.read_audio(video_path)
121
+ torch.save(original_mel, mel_cache_path)
122
+ else:
123
+ original_mel = self.read_audio(video_path)
124
+ torch.save(original_mel, mel_cache_path)
125
+
126
+ mel = self.crop_audio_window(original_mel, start_idx)
127
+
128
+ if mel.shape[-1] != self.mel_window_length:
129
+ continue
130
+
131
+ if random.choice([True, False]):
132
+ y = torch.ones(1).float()
133
+ chosen_frames = frames
134
+ else:
135
+ y = torch.zeros(1).float()
136
+ chosen_frames = wrong_frames
137
+
138
+ chosen_frames = self.image_processor.process_images(chosen_frames)
139
+ # chosen_frames, _, _ = image_processor.prepare_masks_and_masked_images(
140
+ # chosen_frames, affine_transform=True
141
+ # )
142
+
143
+ vr.seek(0) # avoid memory leak
144
+ break
145
+
146
+ except Exception as e:  # Handle the exception when a face is not detected
147
+ print(f"{type(e).__name__} - {e} - {video_path}")
148
+ if "vr" in locals():
149
+ vr.seek(0) # avoid memory leak
150
+
151
+ sample = dict(frames=chosen_frames, audio_samples=mel, y=y)
152
+
153
+ return sample
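
Note on crop_audio_window above: the indexing assumes 80 mel-spectrogram frames per second of audio, so a clip starting at video frame start_index maps to mel frame 80 * start_index / video_fps, and the window spans mel_window_length = ceil(num_frames / 5 * 16) mel frames. A small worked example under the defaults assumed here (25 fps video, 16-frame clips):

    import math

    video_fps = 25
    num_frames = 16
    start_index = 100  # hypothetical first video frame of the sampled clip

    mel_window_length = math.ceil(num_frames / 5 * 16)        # -> 52 mel frames
    start_idx = int(80.0 * (start_index / float(video_fps)))  # -> 320
    end_idx = start_idx + mel_window_length                   # -> 372
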
latentsync/data/unet_dataset.py ADDED
@@ -0,0 +1,164 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import numpy as np
17
+ from torch.utils.data import Dataset
18
+ import torch
19
+ import random
20
+ import cv2
21
+ from ..utils.image_processor import ImageProcessor, load_fixed_mask
22
+ from ..utils.audio import melspectrogram
23
+ from decord import AudioReader, VideoReader, cpu
24
+
25
+
26
+ class UNetDataset(Dataset):
27
+ def __init__(self, train_data_dir: str, config):
28
+ if config.data.train_fileslist != "":
29
+ with open(config.data.train_fileslist) as file:
30
+ self.video_paths = [line.rstrip() for line in file]
31
+ elif train_data_dir != "":
32
+ self.video_paths = []
33
+ for file in os.listdir(train_data_dir):
34
+ if file.endswith(".mp4"):
35
+ self.video_paths.append(os.path.join(train_data_dir, file))
36
+ else:
37
+ raise ValueError("data_dir and fileslist cannot be both empty")
38
+
39
+ self.resolution = config.data.resolution
40
+ self.num_frames = config.data.num_frames
41
+
42
+ if self.num_frames == 16:
43
+ self.mel_window_length = 52
44
+ elif self.num_frames == 5:
45
+ self.mel_window_length = 16
46
+ else:
47
+ raise NotImplementedError("Only support 16 and 5 frames now")
48
+
49
+ self.audio_sample_rate = config.data.audio_sample_rate
50
+ self.video_fps = config.data.video_fps
51
+ self.mask = config.data.mask
52
+ self.mask_image = load_fixed_mask(self.resolution)
53
+ self.load_audio_data = config.model.add_audio_layer and config.run.use_syncnet
54
+ self.audio_mel_cache_dir = config.data.audio_mel_cache_dir
55
+ os.makedirs(self.audio_mel_cache_dir, exist_ok=True)
56
+
57
+ def __len__(self):
58
+ return len(self.video_paths)
59
+
60
+ def read_audio(self, video_path: str):
61
+ ar = AudioReader(video_path, ctx=cpu(self.worker_id), sample_rate=self.audio_sample_rate)
62
+ original_mel = melspectrogram(ar[:].asnumpy().squeeze(0))
63
+ return torch.from_numpy(original_mel)
64
+
65
+ def crop_audio_window(self, original_mel, start_index):
66
+ start_idx = int(80.0 * (start_index / float(self.video_fps)))
67
+ end_idx = start_idx + self.mel_window_length
68
+ return original_mel[:, start_idx:end_idx].unsqueeze(0)
69
+
70
+ def get_frames(self, video_reader: VideoReader):
71
+ total_num_frames = len(video_reader)
72
+
73
+ start_idx = random.randint(self.num_frames // 2, total_num_frames - self.num_frames - self.num_frames // 2)
74
+ frames_index = np.arange(start_idx, start_idx + self.num_frames, dtype=int)
75
+
76
+ while True:
77
+ wrong_start_idx = random.randint(0, total_num_frames - self.num_frames)
78
+ if wrong_start_idx > start_idx - self.num_frames and wrong_start_idx < start_idx + self.num_frames:
79
+ continue
80
+ wrong_frames_index = np.arange(wrong_start_idx, wrong_start_idx + self.num_frames, dtype=int)
81
+ break
82
+
83
+ frames = video_reader.get_batch(frames_index).asnumpy()
84
+ wrong_frames = video_reader.get_batch(wrong_frames_index).asnumpy()
85
+
86
+ return frames, wrong_frames, start_idx
87
+
88
+ def worker_init_fn(self, worker_id):
89
+ # Initialize the face mesh object in each worker process,
90
+ # because the face mesh object cannot be called in subprocesses
91
+ self.worker_id = worker_id
92
+ setattr(
93
+ self,
94
+ f"image_processor_{worker_id}",
95
+ ImageProcessor(self.resolution, self.mask, mask_image=self.mask_image),
96
+ )
97
+
98
+ def __getitem__(self, idx):
99
+ image_processor = getattr(self, f"image_processor_{self.worker_id}")
100
+ while True:
101
+ try:
102
+ idx = random.randint(0, len(self) - 1)
103
+
104
+ # Get video file path
105
+ video_path = self.video_paths[idx]
106
+
107
+ vr = VideoReader(video_path, ctx=cpu(self.worker_id))
108
+
109
+ if len(vr) < 3 * self.num_frames:
110
+ continue
111
+
112
+ continuous_frames, ref_frames, start_idx = self.get_frames(vr)
113
+
114
+ if self.load_audio_data:
115
+ mel_cache_path = os.path.join(
116
+ self.audio_mel_cache_dir, os.path.basename(video_path).replace(".mp4", "_mel.pt")
117
+ )
118
+
119
+ if os.path.isfile(mel_cache_path):
120
+ try:
121
+ original_mel = torch.load(mel_cache_path)
122
+ except Exception as e:
123
+ print(f"{type(e).__name__} - {e} - {mel_cache_path}")
124
+ os.remove(mel_cache_path)
125
+ original_mel = self.read_audio(video_path)
126
+ torch.save(original_mel, mel_cache_path)
127
+ else:
128
+ original_mel = self.read_audio(video_path)
129
+ torch.save(original_mel, mel_cache_path)
130
+
131
+ mel = self.crop_audio_window(original_mel, start_idx)
132
+
133
+ if mel.shape[-1] != self.mel_window_length:
134
+ continue
135
+ else:
136
+ mel = []
137
+
138
+ gt, masked_gt, mask = image_processor.prepare_masks_and_masked_images(
139
+ continuous_frames, affine_transform=False
140
+ )
141
+
142
+ if self.mask == "fix_mask":
143
+ ref, _, _ = image_processor.prepare_masks_and_masked_images(ref_frames, affine_transform=False)
144
+ else:
145
+ ref = image_processor.process_images(ref_frames)
146
+ vr.seek(0) # avoid memory leak
147
+ break
148
+
149
+ except Exception as e: # Handle exceptions such as no face detected
150
+ print(f"{type(e).__name__} - {e} - {video_path}")
151
+ if "vr" in locals():
152
+ vr.seek(0) # avoid memory leak
153
+
154
+ sample = dict(
155
+ gt=gt,
156
+ masked_gt=masked_gt,
157
+ ref=ref,
158
+ mel=mel,
159
+ mask=mask,
160
+ video_path=video_path,
161
+ start_idx=start_idx,
162
+ )
163
+
164
+ return sample
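
A minimal usage sketch for the dataset above: worker_init_fn must be passed to the DataLoader so that each worker builds its own ImageProcessor and records its worker_id for decord. The config path, data directory, and batch size below are placeholders, not values from this repo, and the config is assumed to be an OmegaConf YAML exposing the fields read in __init__.

# Hypothetical usage of UNetDataset (placeholder paths and sizes).
from omegaconf import OmegaConf
from torch.utils.data import DataLoader

config = OmegaConf.load("unet_config.yaml")                              # placeholder config path
dataset = UNetDataset(train_data_dir="/path/to/videos", config=config)   # placeholder directory

dataloader = DataLoader(
    dataset,
    batch_size=2,                          # arbitrary
    num_workers=4,                         # one ImageProcessor per worker
    worker_init_fn=dataset.worker_init_fn,
)

batch = next(iter(dataloader))
print(batch["gt"].shape, batch["ref"].shape, batch["mask"].shape)
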
latentsync/models/attention.py ADDED
@@ -0,0 +1,492 @@
1
+ # Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py
2
+
3
+ from dataclasses import dataclass
4
+ from turtle import forward
5
+ from typing import Optional
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from torch import nn
10
+
11
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
12
+ from diffusers.modeling_utils import ModelMixin
13
+ from diffusers.utils import BaseOutput
14
+ from diffusers.utils.import_utils import is_xformers_available
15
+ from diffusers.models.attention import CrossAttention, FeedForward, AdaLayerNorm
16
+
17
+ from einops import rearrange, repeat
18
+ from .utils import zero_module
19
+
20
+
21
+ @dataclass
22
+ class Transformer3DModelOutput(BaseOutput):
23
+ sample: torch.FloatTensor
24
+
25
+
26
+ if is_xformers_available():
27
+ import xformers
28
+ import xformers.ops
29
+ else:
30
+ xformers = None
31
+
32
+
33
+ class Transformer3DModel(ModelMixin, ConfigMixin):
34
+ @register_to_config
35
+ def __init__(
36
+ self,
37
+ num_attention_heads: int = 16,
38
+ attention_head_dim: int = 88,
39
+ in_channels: Optional[int] = None,
40
+ num_layers: int = 1,
41
+ dropout: float = 0.0,
42
+ norm_num_groups: int = 32,
43
+ cross_attention_dim: Optional[int] = None,
44
+ attention_bias: bool = False,
45
+ activation_fn: str = "geglu",
46
+ num_embeds_ada_norm: Optional[int] = None,
47
+ use_linear_projection: bool = False,
48
+ only_cross_attention: bool = False,
49
+ upcast_attention: bool = False,
50
+ use_motion_module: bool = False,
51
+ unet_use_cross_frame_attention=None,
52
+ unet_use_temporal_attention=None,
53
+ add_audio_layer=False,
54
+ audio_condition_method="cross_attn",
55
+ custom_audio_layer: bool = False,
56
+ ):
57
+ super().__init__()
58
+ self.use_linear_projection = use_linear_projection
59
+ self.num_attention_heads = num_attention_heads
60
+ self.attention_head_dim = attention_head_dim
61
+ inner_dim = num_attention_heads * attention_head_dim
62
+
63
+ # Define input layers
64
+ self.in_channels = in_channels
65
+
66
+ self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
67
+ if use_linear_projection:
68
+ self.proj_in = nn.Linear(in_channels, inner_dim)
69
+ else:
70
+ self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
71
+
72
+ if not custom_audio_layer:
73
+ # Define transformers blocks
74
+ self.transformer_blocks = nn.ModuleList(
75
+ [
76
+ BasicTransformerBlock(
77
+ inner_dim,
78
+ num_attention_heads,
79
+ attention_head_dim,
80
+ dropout=dropout,
81
+ cross_attention_dim=cross_attention_dim,
82
+ activation_fn=activation_fn,
83
+ num_embeds_ada_norm=num_embeds_ada_norm,
84
+ attention_bias=attention_bias,
85
+ only_cross_attention=only_cross_attention,
86
+ upcast_attention=upcast_attention,
87
+ use_motion_module=use_motion_module,
88
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
89
+ unet_use_temporal_attention=unet_use_temporal_attention,
90
+ add_audio_layer=add_audio_layer,
91
+ custom_audio_layer=custom_audio_layer,
92
+ audio_condition_method=audio_condition_method,
93
+ )
94
+ for d in range(num_layers)
95
+ ]
96
+ )
97
+ else:
98
+ self.transformer_blocks = nn.ModuleList(
99
+ [
100
+ AudioTransformerBlock(
101
+ inner_dim,
102
+ num_attention_heads,
103
+ attention_head_dim,
104
+ dropout=dropout,
105
+ cross_attention_dim=cross_attention_dim,
106
+ activation_fn=activation_fn,
107
+ num_embeds_ada_norm=num_embeds_ada_norm,
108
+ attention_bias=attention_bias,
109
+ only_cross_attention=only_cross_attention,
110
+ upcast_attention=upcast_attention,
111
+ use_motion_module=use_motion_module,
112
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
113
+ unet_use_temporal_attention=unet_use_temporal_attention,
114
+ add_audio_layer=add_audio_layer,
115
+ )
116
+ for d in range(num_layers)
117
+ ]
118
+ )
119
+
120
+ # 4. Define output layers
121
+ if use_linear_projection:
122
+ self.proj_out = nn.Linear(in_channels, inner_dim)
123
+ else:
124
+ self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
125
+
126
+ if custom_audio_layer:
127
+ self.proj_out = zero_module(self.proj_out)
128
+
129
+ def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, return_dict: bool = True):
130
+ # Input
131
+ assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
132
+ video_length = hidden_states.shape[2]
133
+ hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
134
+
135
+ # No need to do this for audio input, because different audio samples are independent
136
+ # encoder_hidden_states = repeat(encoder_hidden_states, 'b n c -> (b f) n c', f=video_length)
137
+
138
+ batch, channel, height, weight = hidden_states.shape
139
+ residual = hidden_states
140
+
141
+ hidden_states = self.norm(hidden_states)
142
+ if not self.use_linear_projection:
143
+ hidden_states = self.proj_in(hidden_states)
144
+ inner_dim = hidden_states.shape[1]
145
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
146
+ else:
147
+ inner_dim = hidden_states.shape[1]
148
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
149
+ hidden_states = self.proj_in(hidden_states)
150
+
151
+ # Blocks
152
+ for block in self.transformer_blocks:
153
+ hidden_states = block(
154
+ hidden_states,
155
+ encoder_hidden_states=encoder_hidden_states,
156
+ timestep=timestep,
157
+ video_length=video_length,
158
+ )
159
+
160
+ # Output
161
+ if not self.use_linear_projection:
162
+ hidden_states = hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()
163
+ hidden_states = self.proj_out(hidden_states)
164
+ else:
165
+ hidden_states = self.proj_out(hidden_states)
166
+ hidden_states = hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()
167
+
168
+ output = hidden_states + residual
169
+
170
+ output = rearrange(output, "(b f) c h w -> b c f h w", f=video_length)
171
+ if not return_dict:
172
+ return (output,)
173
+
174
+ return Transformer3DModelOutput(sample=output)
175
+
176
+
177
+ class BasicTransformerBlock(nn.Module):
178
+ def __init__(
179
+ self,
180
+ dim: int,
181
+ num_attention_heads: int,
182
+ attention_head_dim: int,
183
+ dropout=0.0,
184
+ cross_attention_dim: Optional[int] = None,
185
+ activation_fn: str = "geglu",
186
+ num_embeds_ada_norm: Optional[int] = None,
187
+ attention_bias: bool = False,
188
+ only_cross_attention: bool = False,
189
+ upcast_attention: bool = False,
190
+ use_motion_module: bool = False,
191
+ unet_use_cross_frame_attention=None,
192
+ unet_use_temporal_attention=None,
193
+ add_audio_layer=False,
194
+ custom_audio_layer=False,
195
+ audio_condition_method="cross_attn",
196
+ ):
197
+ super().__init__()
198
+ self.only_cross_attention = only_cross_attention
199
+ self.use_ada_layer_norm = num_embeds_ada_norm is not None
200
+ self.unet_use_cross_frame_attention = unet_use_cross_frame_attention
201
+ self.unet_use_temporal_attention = unet_use_temporal_attention
202
+ self.use_motion_module = use_motion_module
203
+ self.add_audio_layer = add_audio_layer
204
+
205
+ # SC-Attn
206
+ assert unet_use_cross_frame_attention is not None
207
+ if unet_use_cross_frame_attention:
208
+ raise NotImplementedError("SparseCausalAttention2D not implemented yet.")
209
+ else:
210
+ self.attn1 = CrossAttention(
211
+ query_dim=dim,
212
+ heads=num_attention_heads,
213
+ dim_head=attention_head_dim,
214
+ dropout=dropout,
215
+ bias=attention_bias,
216
+ upcast_attention=upcast_attention,
217
+ )
218
+ self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)
219
+
220
+ # Cross-Attn
221
+ if add_audio_layer and audio_condition_method == "cross_attn" and not custom_audio_layer:
222
+ self.audio_cross_attn = AudioCrossAttn(
223
+ dim=dim,
224
+ cross_attention_dim=cross_attention_dim,
225
+ num_attention_heads=num_attention_heads,
226
+ attention_head_dim=attention_head_dim,
227
+ dropout=dropout,
228
+ attention_bias=attention_bias,
229
+ upcast_attention=upcast_attention,
230
+ num_embeds_ada_norm=num_embeds_ada_norm,
231
+ use_ada_layer_norm=self.use_ada_layer_norm,
232
+ zero_proj_out=False,
233
+ )
234
+ else:
235
+ self.audio_cross_attn = None
236
+
237
+ # Feed-forward
238
+ self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
239
+ self.norm3 = nn.LayerNorm(dim)
240
+
241
+ # Temp-Attn
242
+ assert unet_use_temporal_attention is not None
243
+ if unet_use_temporal_attention:
244
+ self.attn_temp = CrossAttention(
245
+ query_dim=dim,
246
+ heads=num_attention_heads,
247
+ dim_head=attention_head_dim,
248
+ dropout=dropout,
249
+ bias=attention_bias,
250
+ upcast_attention=upcast_attention,
251
+ )
252
+ nn.init.zeros_(self.attn_temp.to_out[0].weight.data)
253
+ self.norm_temp = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)
254
+
255
+ def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
256
+ if not is_xformers_available():
257
+ print("Here is how to install it")
258
+ raise ModuleNotFoundError(
259
+ "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
260
+ " xformers",
261
+ name="xformers",
262
+ )
263
+ elif not torch.cuda.is_available():
264
+ raise ValueError(
265
+ "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is only"
266
+ " available for GPU "
267
+ )
268
+ else:
269
+ try:
270
+ # Make sure we can run the memory efficient attention
271
+ _ = xformers.ops.memory_efficient_attention(
272
+ torch.randn((1, 2, 40), device="cuda"),
273
+ torch.randn((1, 2, 40), device="cuda"),
274
+ torch.randn((1, 2, 40), device="cuda"),
275
+ )
276
+ except Exception as e:
277
+ raise e
278
+ self.attn1._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
279
+ if self.audio_cross_attn is not None:
280
+ self.audio_cross_attn.attn._use_memory_efficient_attention_xformers = (
281
+ use_memory_efficient_attention_xformers
282
+ )
283
+ # self.attn_temp._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
284
+
285
+ def forward(
286
+ self, hidden_states, encoder_hidden_states=None, timestep=None, attention_mask=None, video_length=None
287
+ ):
288
+ # SparseCausal-Attention
289
+ norm_hidden_states = (
290
+ self.norm1(hidden_states, timestep) if self.use_ada_layer_norm else self.norm1(hidden_states)
291
+ )
292
+
293
+ # if self.only_cross_attention:
294
+ # hidden_states = (
295
+ # self.attn1(norm_hidden_states, encoder_hidden_states, attention_mask=attention_mask) + hidden_states
296
+ # )
297
+ # else:
298
+ # hidden_states = self.attn1(norm_hidden_states, attention_mask=attention_mask, video_length=video_length) + hidden_states
299
+
300
+ # pdb.set_trace()
301
+ if self.unet_use_cross_frame_attention:
302
+ hidden_states = (
303
+ self.attn1(norm_hidden_states, attention_mask=attention_mask, video_length=video_length)
304
+ + hidden_states
305
+ )
306
+ else:
307
+ hidden_states = self.attn1(norm_hidden_states, attention_mask=attention_mask) + hidden_states
308
+
309
+ if self.audio_cross_attn is not None and encoder_hidden_states is not None:
310
+ hidden_states = self.audio_cross_attn(
311
+ hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask
312
+ )
313
+
314
+ # Feed-forward
315
+ hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states
316
+
317
+ # Temporal-Attention
318
+ if self.unet_use_temporal_attention:
319
+ d = hidden_states.shape[1]
320
+ hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length)
321
+ norm_hidden_states = (
322
+ self.norm_temp(hidden_states, timestep) if self.use_ada_layer_norm else self.norm_temp(hidden_states)
323
+ )
324
+ hidden_states = self.attn_temp(norm_hidden_states) + hidden_states
325
+ hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d)
326
+
327
+ return hidden_states
328
+
329
+
330
+ class AudioTransformerBlock(nn.Module):
331
+ def __init__(
332
+ self,
333
+ dim: int,
334
+ num_attention_heads: int,
335
+ attention_head_dim: int,
336
+ dropout=0.0,
337
+ cross_attention_dim: Optional[int] = None,
338
+ activation_fn: str = "geglu",
339
+ num_embeds_ada_norm: Optional[int] = None,
340
+ attention_bias: bool = False,
341
+ only_cross_attention: bool = False,
342
+ upcast_attention: bool = False,
343
+ use_motion_module: bool = False,
344
+ unet_use_cross_frame_attention=None,
345
+ unet_use_temporal_attention=None,
346
+ add_audio_layer=False,
347
+ ):
348
+ super().__init__()
349
+ self.only_cross_attention = only_cross_attention
350
+ self.use_ada_layer_norm = num_embeds_ada_norm is not None
351
+ self.unet_use_cross_frame_attention = unet_use_cross_frame_attention
352
+ self.unet_use_temporal_attention = unet_use_temporal_attention
353
+ self.use_motion_module = use_motion_module
354
+ self.add_audio_layer = add_audio_layer
355
+
356
+ # SC-Attn
357
+ assert unet_use_cross_frame_attention is not None
358
+ if unet_use_cross_frame_attention:
359
+ raise NotImplementedError("SparseCausalAttention2D not implemented yet.")
360
+ else:
361
+ self.attn1 = CrossAttention(
362
+ query_dim=dim,
363
+ heads=num_attention_heads,
364
+ dim_head=attention_head_dim,
365
+ dropout=dropout,
366
+ bias=attention_bias,
367
+ upcast_attention=upcast_attention,
368
+ )
369
+ self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)
370
+
371
+ self.audio_cross_attn = AudioCrossAttn(
372
+ dim=dim,
373
+ cross_attention_dim=cross_attention_dim,
374
+ num_attention_heads=num_attention_heads,
375
+ attention_head_dim=attention_head_dim,
376
+ dropout=dropout,
377
+ attention_bias=attention_bias,
378
+ upcast_attention=upcast_attention,
379
+ num_embeds_ada_norm=num_embeds_ada_norm,
380
+ use_ada_layer_norm=self.use_ada_layer_norm,
381
+ zero_proj_out=False,
382
+ )
383
+
384
+ # Feed-forward
385
+ self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
386
+ self.norm3 = nn.LayerNorm(dim)
387
+
388
+ def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
389
+ if not is_xformers_available():
390
+ print("Here is how to install it")
391
+ raise ModuleNotFoundError(
392
+ "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
393
+ " xformers",
394
+ name="xformers",
395
+ )
396
+ elif not torch.cuda.is_available():
397
+ raise ValueError(
398
+ "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is only"
399
+ " available for GPU "
400
+ )
401
+ else:
402
+ try:
403
+ # Make sure we can run the memory efficient attention
404
+ _ = xformers.ops.memory_efficient_attention(
405
+ torch.randn((1, 2, 40), device="cuda"),
406
+ torch.randn((1, 2, 40), device="cuda"),
407
+ torch.randn((1, 2, 40), device="cuda"),
408
+ )
409
+ except Exception as e:
410
+ raise e
411
+ self.attn1._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
412
+ if self.audio_cross_attn is not None:
413
+ self.audio_cross_attn.attn._use_memory_efficient_attention_xformers = (
414
+ use_memory_efficient_attention_xformers
415
+ )
416
+ # self.attn_temp._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
417
+
418
+ def forward(
419
+ self, hidden_states, encoder_hidden_states=None, timestep=None, attention_mask=None, video_length=None
420
+ ):
421
+ # SparseCausal-Attention
422
+ norm_hidden_states = (
423
+ self.norm1(hidden_states, timestep) if self.use_ada_layer_norm else self.norm1(hidden_states)
424
+ )
425
+
426
+ # pdb.set_trace()
427
+ if self.unet_use_cross_frame_attention:
428
+ hidden_states = (
429
+ self.attn1(norm_hidden_states, attention_mask=attention_mask, video_length=video_length)
430
+ + hidden_states
431
+ )
432
+ else:
433
+ hidden_states = self.attn1(norm_hidden_states, attention_mask=attention_mask) + hidden_states
434
+
435
+ if self.audio_cross_attn is not None and encoder_hidden_states is not None:
436
+ hidden_states = self.audio_cross_attn(
437
+ hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask
438
+ )
439
+
440
+ # Feed-forward
441
+ hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states
442
+
443
+ return hidden_states
444
+
445
+
446
+ class AudioCrossAttn(nn.Module):
447
+ def __init__(
448
+ self,
449
+ dim,
450
+ cross_attention_dim,
451
+ num_attention_heads,
452
+ attention_head_dim,
453
+ dropout,
454
+ attention_bias,
455
+ upcast_attention,
456
+ num_embeds_ada_norm,
457
+ use_ada_layer_norm,
458
+ zero_proj_out=False,
459
+ ):
460
+ super().__init__()
461
+
462
+ self.norm = AdaLayerNorm(dim, num_embeds_ada_norm) if use_ada_layer_norm else nn.LayerNorm(dim)
463
+ self.attn = CrossAttention(
464
+ query_dim=dim,
465
+ cross_attention_dim=cross_attention_dim,
466
+ heads=num_attention_heads,
467
+ dim_head=attention_head_dim,
468
+ dropout=dropout,
469
+ bias=attention_bias,
470
+ upcast_attention=upcast_attention,
471
+ )
472
+
473
+ if zero_proj_out:
474
+ self.proj_out = zero_module(nn.Linear(dim, dim))
475
+
476
+ self.zero_proj_out = zero_proj_out
477
+ self.use_ada_layer_norm = use_ada_layer_norm
478
+
479
+ def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, attention_mask=None):
480
+ previous_hidden_states = hidden_states
481
+ hidden_states = self.norm(hidden_states, timestep) if self.use_ada_layer_norm else self.norm(hidden_states)
482
+
483
+ if encoder_hidden_states.dim() == 4:
484
+ encoder_hidden_states = rearrange(encoder_hidden_states, "b f n d -> (b f) n d")
485
+
486
+ hidden_states = self.attn(
487
+ hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask
488
+ )
489
+
490
+ if self.zero_proj_out:
491
+ hidden_states = self.proj_out(hidden_states)
492
+ return hidden_states + previous_hidden_states
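
As a quick sanity check of the shapes Transformer3DModel expects, here is an illustrative forward pass. All sizes (channels, heads, audio embedding dimension, sequence length) are arbitrary choices for the sketch, not values prescribed by this repo.

# Illustrative shape check for Transformer3DModel; all sizes are arbitrary.
import torch

model = Transformer3DModel(
    num_attention_heads=8,
    attention_head_dim=40,               # inner_dim = 8 * 40 = 320
    in_channels=320,
    cross_attention_dim=384,             # assumed audio embedding dimension
    add_audio_layer=True,
    audio_condition_method="cross_attn",
    unet_use_cross_frame_attention=False,
    unet_use_temporal_attention=False,
)

b, c, f, h, w = 1, 320, 16, 32, 32
latents = torch.randn(b, c, f, h, w)
# Audio features: one sequence per frame; a 4-D (b, f, n, d) tensor is also accepted
# and flattened inside AudioCrossAttn.
audio_embeds = torch.randn(b * f, 50, 384)

out = model(latents, encoder_hidden_states=audio_embeds).sample
print(out.shape)  # torch.Size([1, 320, 16, 32, 32])
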
latentsync/models/motion_module.py ADDED
@@ -0,0 +1,332 @@
1
+ # Adapted from https://github.com/guoyww/AnimateDiff/blob/main/animatediff/models/motion_module.py
2
+
3
+ # Actually we don't use the motion module in the final version of LatentSync.
4
+ # When we started the project, we used the AnimateDiff codebase and tried the motion module,
5
+ # but the results were poor, so we decided to leave the code here for possible future use.
6
+
7
+ from dataclasses import dataclass
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from torch import nn
12
+
13
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
14
+ from diffusers.modeling_utils import ModelMixin
15
+ from diffusers.utils import BaseOutput
16
+ from diffusers.utils.import_utils import is_xformers_available
17
+ from diffusers.models.attention import CrossAttention, FeedForward
18
+
19
+ from einops import rearrange, repeat
20
+ import math
21
+ from .utils import zero_module
22
+
23
+
24
+ @dataclass
25
+ class TemporalTransformer3DModelOutput(BaseOutput):
26
+ sample: torch.FloatTensor
27
+
28
+
29
+ if is_xformers_available():
30
+ import xformers
31
+ import xformers.ops
32
+ else:
33
+ xformers = None
34
+
35
+
36
+ def get_motion_module(in_channels, motion_module_type: str, motion_module_kwargs: dict):
37
+ if motion_module_type == "Vanilla":
38
+ return VanillaTemporalModule(
39
+ in_channels=in_channels,
40
+ **motion_module_kwargs,
41
+ )
42
+ else:
43
+ raise ValueError
44
+
45
+
46
+ class VanillaTemporalModule(nn.Module):
47
+ def __init__(
48
+ self,
49
+ in_channels,
50
+ num_attention_heads=8,
51
+ num_transformer_block=2,
52
+ attention_block_types=("Temporal_Self", "Temporal_Self"),
53
+ cross_frame_attention_mode=None,
54
+ temporal_position_encoding=False,
55
+ temporal_position_encoding_max_len=24,
56
+ temporal_attention_dim_div=1,
57
+ zero_initialize=True,
58
+ ):
59
+ super().__init__()
60
+
61
+ self.temporal_transformer = TemporalTransformer3DModel(
62
+ in_channels=in_channels,
63
+ num_attention_heads=num_attention_heads,
64
+ attention_head_dim=in_channels // num_attention_heads // temporal_attention_dim_div,
65
+ num_layers=num_transformer_block,
66
+ attention_block_types=attention_block_types,
67
+ cross_frame_attention_mode=cross_frame_attention_mode,
68
+ temporal_position_encoding=temporal_position_encoding,
69
+ temporal_position_encoding_max_len=temporal_position_encoding_max_len,
70
+ )
71
+
72
+ if zero_initialize:
73
+ self.temporal_transformer.proj_out = zero_module(self.temporal_transformer.proj_out)
74
+
75
+ def forward(self, input_tensor, temb, encoder_hidden_states, attention_mask=None, anchor_frame_idx=None):
76
+ hidden_states = input_tensor
77
+ hidden_states = self.temporal_transformer(hidden_states, encoder_hidden_states, attention_mask)
78
+
79
+ output = hidden_states
80
+ return output
81
+
82
+
83
+ class TemporalTransformer3DModel(nn.Module):
84
+ def __init__(
85
+ self,
86
+ in_channels,
87
+ num_attention_heads,
88
+ attention_head_dim,
89
+ num_layers,
90
+ attention_block_types=(
91
+ "Temporal_Self",
92
+ "Temporal_Self",
93
+ ),
94
+ dropout=0.0,
95
+ norm_num_groups=32,
96
+ cross_attention_dim=768,
97
+ activation_fn="geglu",
98
+ attention_bias=False,
99
+ upcast_attention=False,
100
+ cross_frame_attention_mode=None,
101
+ temporal_position_encoding=False,
102
+ temporal_position_encoding_max_len=24,
103
+ ):
104
+ super().__init__()
105
+
106
+ inner_dim = num_attention_heads * attention_head_dim
107
+
108
+ self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
109
+ self.proj_in = nn.Linear(in_channels, inner_dim)
110
+
111
+ self.transformer_blocks = nn.ModuleList(
112
+ [
113
+ TemporalTransformerBlock(
114
+ dim=inner_dim,
115
+ num_attention_heads=num_attention_heads,
116
+ attention_head_dim=attention_head_dim,
117
+ attention_block_types=attention_block_types,
118
+ dropout=dropout,
119
+ norm_num_groups=norm_num_groups,
120
+ cross_attention_dim=cross_attention_dim,
121
+ activation_fn=activation_fn,
122
+ attention_bias=attention_bias,
123
+ upcast_attention=upcast_attention,
124
+ cross_frame_attention_mode=cross_frame_attention_mode,
125
+ temporal_position_encoding=temporal_position_encoding,
126
+ temporal_position_encoding_max_len=temporal_position_encoding_max_len,
127
+ )
128
+ for d in range(num_layers)
129
+ ]
130
+ )
131
+ self.proj_out = nn.Linear(inner_dim, in_channels)
132
+
133
+ def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None):
134
+ assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
135
+ video_length = hidden_states.shape[2]
136
+ hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
137
+
138
+ batch, channel, height, weight = hidden_states.shape
139
+ residual = hidden_states
140
+
141
+ hidden_states = self.norm(hidden_states)
142
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, channel)
143
+ hidden_states = self.proj_in(hidden_states)
144
+
145
+ # Transformer Blocks
146
+ for block in self.transformer_blocks:
147
+ hidden_states = block(
148
+ hidden_states, encoder_hidden_states=encoder_hidden_states, video_length=video_length
149
+ )
150
+
151
+ # output
152
+ hidden_states = self.proj_out(hidden_states)
153
+ hidden_states = hidden_states.reshape(batch, height, weight, channel).permute(0, 3, 1, 2).contiguous()
154
+
155
+ output = hidden_states + residual
156
+ output = rearrange(output, "(b f) c h w -> b c f h w", f=video_length)
157
+
158
+ return output
159
+
160
+
161
+ class TemporalTransformerBlock(nn.Module):
162
+ def __init__(
163
+ self,
164
+ dim,
165
+ num_attention_heads,
166
+ attention_head_dim,
167
+ attention_block_types=(
168
+ "Temporal_Self",
169
+ "Temporal_Self",
170
+ ),
171
+ dropout=0.0,
172
+ norm_num_groups=32,
173
+ cross_attention_dim=768,
174
+ activation_fn="geglu",
175
+ attention_bias=False,
176
+ upcast_attention=False,
177
+ cross_frame_attention_mode=None,
178
+ temporal_position_encoding=False,
179
+ temporal_position_encoding_max_len=24,
180
+ ):
181
+ super().__init__()
182
+
183
+ attention_blocks = []
184
+ norms = []
185
+
186
+ for block_name in attention_block_types:
187
+ attention_blocks.append(
188
+ VersatileAttention(
189
+ attention_mode=block_name.split("_")[0],
190
+ cross_attention_dim=cross_attention_dim if block_name.endswith("_Cross") else None,
191
+ query_dim=dim,
192
+ heads=num_attention_heads,
193
+ dim_head=attention_head_dim,
194
+ dropout=dropout,
195
+ bias=attention_bias,
196
+ upcast_attention=upcast_attention,
197
+ cross_frame_attention_mode=cross_frame_attention_mode,
198
+ temporal_position_encoding=temporal_position_encoding,
199
+ temporal_position_encoding_max_len=temporal_position_encoding_max_len,
200
+ )
201
+ )
202
+ norms.append(nn.LayerNorm(dim))
203
+
204
+ self.attention_blocks = nn.ModuleList(attention_blocks)
205
+ self.norms = nn.ModuleList(norms)
206
+
207
+ self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
208
+ self.ff_norm = nn.LayerNorm(dim)
209
+
210
+ def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None):
211
+ for attention_block, norm in zip(self.attention_blocks, self.norms):
212
+ norm_hidden_states = norm(hidden_states)
213
+ hidden_states = (
214
+ attention_block(
215
+ norm_hidden_states,
216
+ encoder_hidden_states=encoder_hidden_states if attention_block.is_cross_attention else None,
217
+ video_length=video_length,
218
+ )
219
+ + hidden_states
220
+ )
221
+
222
+ hidden_states = self.ff(self.ff_norm(hidden_states)) + hidden_states
223
+
224
+ output = hidden_states
225
+ return output
226
+
227
+
228
+ class PositionalEncoding(nn.Module):
229
+ def __init__(self, d_model, dropout=0.0, max_len=24):
230
+ super().__init__()
231
+ self.dropout = nn.Dropout(p=dropout)
232
+ position = torch.arange(max_len).unsqueeze(1)
233
+ div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
234
+ pe = torch.zeros(1, max_len, d_model)
235
+ pe[0, :, 0::2] = torch.sin(position * div_term)
236
+ pe[0, :, 1::2] = torch.cos(position * div_term)
237
+ self.register_buffer("pe", pe)
238
+
239
+ def forward(self, x):
240
+ x = x + self.pe[:, : x.size(1)]
241
+ return self.dropout(x)
242
+
243
+
244
+ class VersatileAttention(CrossAttention):
245
+ def __init__(
246
+ self,
247
+ attention_mode=None,
248
+ cross_frame_attention_mode=None,
249
+ temporal_position_encoding=False,
250
+ temporal_position_encoding_max_len=24,
251
+ *args,
252
+ **kwargs,
253
+ ):
254
+ super().__init__(*args, **kwargs)
255
+ assert attention_mode == "Temporal"
256
+
257
+ self.attention_mode = attention_mode
258
+ self.is_cross_attention = kwargs["cross_attention_dim"] is not None
259
+
260
+ self.pos_encoder = (
261
+ PositionalEncoding(kwargs["query_dim"], dropout=0.0, max_len=temporal_position_encoding_max_len)
262
+ if (temporal_position_encoding and attention_mode == "Temporal")
263
+ else None
264
+ )
265
+
266
+ def extra_repr(self):
267
+ return f"(Module Info) Attention_Mode: {self.attention_mode}, Is_Cross_Attention: {self.is_cross_attention}"
268
+
269
+ def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None):
270
+ batch_size, sequence_length, _ = hidden_states.shape
271
+
272
+ if self.attention_mode == "Temporal":
273
+ d = hidden_states.shape[1]
274
+ hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length)
275
+
276
+ if self.pos_encoder is not None:
277
+ hidden_states = self.pos_encoder(hidden_states)
278
+
279
+ encoder_hidden_states = (
280
+ repeat(encoder_hidden_states, "b n c -> (b d) n c", d=d)
281
+ if encoder_hidden_states is not None
282
+ else encoder_hidden_states
283
+ )
284
+ else:
285
+ raise NotImplementedError
286
+
287
+ # encoder_hidden_states = encoder_hidden_states
288
+
289
+ if self.group_norm is not None:
290
+ hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
291
+
292
+ query = self.to_q(hidden_states)
293
+ dim = query.shape[-1]
294
+ query = self.reshape_heads_to_batch_dim(query)
295
+
296
+ if self.added_kv_proj_dim is not None:
297
+ raise NotImplementedError
298
+
299
+ encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
300
+ key = self.to_k(encoder_hidden_states)
301
+ value = self.to_v(encoder_hidden_states)
302
+
303
+ key = self.reshape_heads_to_batch_dim(key)
304
+ value = self.reshape_heads_to_batch_dim(value)
305
+
306
+ if attention_mask is not None:
307
+ if attention_mask.shape[-1] != query.shape[1]:
308
+ target_length = query.shape[1]
309
+ attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
310
+ attention_mask = attention_mask.repeat_interleave(self.heads, dim=0)
311
+
312
+ # attention, what we cannot get enough of
313
+ if self._use_memory_efficient_attention_xformers:
314
+ hidden_states = self._memory_efficient_attention_xformers(query, key, value, attention_mask)
315
+ # Some versions of xformers return output in fp32, cast it back to the dtype of the input
316
+ hidden_states = hidden_states.to(query.dtype)
317
+ else:
318
+ if self._slice_size is None or query.shape[0] // self._slice_size == 1:
319
+ hidden_states = self._attention(query, key, value, attention_mask)
320
+ else:
321
+ hidden_states = self._sliced_attention(query, key, value, sequence_length, dim, attention_mask)
322
+
323
+ # linear proj
324
+ hidden_states = self.to_out[0](hidden_states)
325
+
326
+ # dropout
327
+ hidden_states = self.to_out[1](hidden_states)
328
+
329
+ if self.attention_mode == "Temporal":
330
+ hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d)
331
+
332
+ return hidden_states
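
Even though the motion module is unused in the final model, a small shape check clarifies its interface: it takes a 5-D latent, runs temporal self-attention over the frame axis, and returns a tensor of the same shape (initially equal to its input, since proj_out is zero-initialized). The sizes below are arbitrary.

# Illustrative forward pass through the (unused) motion module; sizes are arbitrary.
import torch

motion_module = get_motion_module(
    in_channels=320,
    motion_module_type="Vanilla",
    motion_module_kwargs={"num_attention_heads": 8, "num_transformer_block": 1},
)

x = torch.randn(1, 320, 16, 32, 32)   # (batch, channels, frames, height, width)
out = motion_module(x, temb=None, encoder_hidden_states=None)
print(out.shape)                      # same shape as the input
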
latentsync/models/resnet.py ADDED
@@ -0,0 +1,234 @@
1
+ # Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/resnet.py
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+ from einops import rearrange
8
+
9
+
10
+ class InflatedConv3d(nn.Conv2d):
11
+ def forward(self, x):
12
+ video_length = x.shape[2]
13
+
14
+ x = rearrange(x, "b c f h w -> (b f) c h w")
15
+ x = super().forward(x)
16
+ x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length)
17
+
18
+ return x
19
+
20
+
21
+ class InflatedGroupNorm(nn.GroupNorm):
22
+ def forward(self, x):
23
+ video_length = x.shape[2]
24
+
25
+ x = rearrange(x, "b c f h w -> (b f) c h w")
26
+ x = super().forward(x)
27
+ x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length)
28
+
29
+ return x
30
+
31
+
32
+ class Upsample3D(nn.Module):
33
+ def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_channels=None, name="conv"):
34
+ super().__init__()
35
+ self.channels = channels
36
+ self.out_channels = out_channels or channels
37
+ self.use_conv = use_conv
38
+ self.use_conv_transpose = use_conv_transpose
39
+ self.name = name
40
+
41
+ conv = None
42
+ if use_conv_transpose:
43
+ raise NotImplementedError
44
+ elif use_conv:
45
+ self.conv = InflatedConv3d(self.channels, self.out_channels, 3, padding=1)
46
+
47
+ def forward(self, hidden_states, output_size=None):
48
+ assert hidden_states.shape[1] == self.channels
49
+
50
+ if self.use_conv_transpose:
51
+ raise NotImplementedError
52
+
53
+ # Cast to float32, as the 'upsample_nearest2d_out_frame' op does not support bfloat16
54
+ dtype = hidden_states.dtype
55
+ if dtype == torch.bfloat16:
56
+ hidden_states = hidden_states.to(torch.float32)
57
+
58
+ # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
59
+ if hidden_states.shape[0] >= 64:
60
+ hidden_states = hidden_states.contiguous()
61
+
62
+ # if `output_size` is passed we force the interpolation output
63
+ # size and do not make use of `scale_factor=2`
64
+ if output_size is None:
65
+ hidden_states = F.interpolate(hidden_states, scale_factor=[1.0, 2.0, 2.0], mode="nearest")
66
+ else:
67
+ hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest")
68
+
69
+ # If the input is bfloat16, we cast back to bfloat16
70
+ if dtype == torch.bfloat16:
71
+ hidden_states = hidden_states.to(dtype)
72
+
73
+ # if self.use_conv:
74
+ # if self.name == "conv":
75
+ # hidden_states = self.conv(hidden_states)
76
+ # else:
77
+ # hidden_states = self.Conv2d_0(hidden_states)
78
+ hidden_states = self.conv(hidden_states)
79
+
80
+ return hidden_states
81
+
82
+
83
+ class Downsample3D(nn.Module):
84
+ def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"):
85
+ super().__init__()
86
+ self.channels = channels
87
+ self.out_channels = out_channels or channels
88
+ self.use_conv = use_conv
89
+ self.padding = padding
90
+ stride = 2
91
+ self.name = name
92
+
93
+ if use_conv:
94
+ self.conv = InflatedConv3d(self.channels, self.out_channels, 3, stride=stride, padding=padding)
95
+ else:
96
+ raise NotImplementedError
97
+
98
+ def forward(self, hidden_states):
99
+ assert hidden_states.shape[1] == self.channels
100
+ if self.use_conv and self.padding == 0:
101
+ raise NotImplementedError
102
+
103
+ assert hidden_states.shape[1] == self.channels
104
+ hidden_states = self.conv(hidden_states)
105
+
106
+ return hidden_states
107
+
108
+
109
+ class ResnetBlock3D(nn.Module):
110
+ def __init__(
111
+ self,
112
+ *,
113
+ in_channels,
114
+ out_channels=None,
115
+ conv_shortcut=False,
116
+ dropout=0.0,
117
+ temb_channels=512,
118
+ groups=32,
119
+ groups_out=None,
120
+ pre_norm=True,
121
+ eps=1e-6,
122
+ non_linearity="swish",
123
+ time_embedding_norm="default",
124
+ output_scale_factor=1.0,
125
+ use_in_shortcut=None,
126
+ use_inflated_groupnorm=False,
127
+ ):
128
+ super().__init__()
129
+ self.pre_norm = pre_norm
130
+ self.pre_norm = True
131
+ self.in_channels = in_channels
132
+ out_channels = in_channels if out_channels is None else out_channels
133
+ self.out_channels = out_channels
134
+ self.use_conv_shortcut = conv_shortcut
135
+ self.time_embedding_norm = time_embedding_norm
136
+ self.output_scale_factor = output_scale_factor
137
+
138
+ if groups_out is None:
139
+ groups_out = groups
140
+
141
+ assert use_inflated_groupnorm != None
142
+ if use_inflated_groupnorm:
143
+ self.norm1 = InflatedGroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
144
+ else:
145
+ self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
146
+
147
+ self.conv1 = InflatedConv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
148
+
149
+ if temb_channels is not None:
150
+ time_emb_proj_out_channels = out_channels
151
+ # if self.time_embedding_norm == "default":
152
+ # time_emb_proj_out_channels = out_channels
153
+ # elif self.time_embedding_norm == "scale_shift":
154
+ # time_emb_proj_out_channels = out_channels * 2
155
+ # else:
156
+ # raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ")
157
+
158
+ self.time_emb_proj = torch.nn.Linear(temb_channels, time_emb_proj_out_channels)
159
+ else:
160
+ self.time_emb_proj = None
161
+
162
+ if self.time_embedding_norm == "scale_shift":
163
+ self.double_len_linear = torch.nn.Linear(time_emb_proj_out_channels, 2 * time_emb_proj_out_channels)
164
+ else:
165
+ self.double_len_linear = None
166
+
167
+ if use_inflated_groupnorm:
168
+ self.norm2 = InflatedGroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
169
+ else:
170
+ self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
171
+
172
+ self.dropout = torch.nn.Dropout(dropout)
173
+ self.conv2 = InflatedConv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
174
+
175
+ if non_linearity == "swish":
176
+ self.nonlinearity = lambda x: F.silu(x)
177
+ elif non_linearity == "mish":
178
+ self.nonlinearity = Mish()
179
+ elif non_linearity == "silu":
180
+ self.nonlinearity = nn.SiLU()
181
+
182
+ self.use_in_shortcut = self.in_channels != self.out_channels if use_in_shortcut is None else use_in_shortcut
183
+
184
+ self.conv_shortcut = None
185
+ if self.use_in_shortcut:
186
+ self.conv_shortcut = InflatedConv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
187
+
188
+ def forward(self, input_tensor, temb):
189
+ hidden_states = input_tensor
190
+
191
+ hidden_states = self.norm1(hidden_states)
192
+ hidden_states = self.nonlinearity(hidden_states)
193
+
194
+ hidden_states = self.conv1(hidden_states)
195
+
196
+ if temb is not None:
197
+ if temb.dim() == 2:
198
+ # input (1, 1280)
199
+ temb = self.time_emb_proj(self.nonlinearity(temb))
200
+ temb = temb[:, :, None, None, None] # unsqueeze
201
+ else:
202
+ # input (1, 1280, 16)
203
+ temb = temb.permute(0, 2, 1)
204
+ temb = self.time_emb_proj(self.nonlinearity(temb))
205
+ if self.double_len_linear is not None:
206
+ temb = self.double_len_linear(self.nonlinearity(temb))
207
+ temb = temb.permute(0, 2, 1)
208
+ temb = temb[:, :, :, None, None]
209
+
210
+ if temb is not None and self.time_embedding_norm == "default":
211
+ hidden_states = hidden_states + temb
212
+
213
+ hidden_states = self.norm2(hidden_states)
214
+
215
+ if temb is not None and self.time_embedding_norm == "scale_shift":
216
+ scale, shift = torch.chunk(temb, 2, dim=1)
217
+ hidden_states = hidden_states * (1 + scale) + shift
218
+
219
+ hidden_states = self.nonlinearity(hidden_states)
220
+
221
+ hidden_states = self.dropout(hidden_states)
222
+ hidden_states = self.conv2(hidden_states)
223
+
224
+ if self.conv_shortcut is not None:
225
+ input_tensor = self.conv_shortcut(input_tensor)
226
+
227
+ output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
228
+
229
+ return output_tensor
230
+
231
+
232
+ class Mish(torch.nn.Module):
233
+ def forward(self, hidden_states):
234
+ return hidden_states * torch.tanh(torch.nn.functional.softplus(hidden_states))
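
The inflated layers above let 2-D convolutional weights operate on video tensors by folding the frame axis into the batch axis. A minimal illustration:

# InflatedConv3d applies a 2-D convolution independently to every frame of a 5-D tensor.
import torch

conv = InflatedConv3d(in_channels=4, out_channels=8, kernel_size=3, padding=1)
video = torch.randn(2, 4, 16, 64, 64)  # (batch, channels, frames, height, width)
out = conv(video)
print(out.shape)                        # torch.Size([2, 8, 16, 64, 64])
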
latentsync/models/syncnet.py ADDED
@@ -0,0 +1,233 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ from torch import nn
17
+ from einops import rearrange
18
+ from torch.nn import functional as F
19
+ from ..utils.util import cosine_loss
20
+
21
+ import torch.nn as nn
22
+ import torch.nn.functional as F
23
+
24
+ from diffusers.models.attention import CrossAttention, FeedForward
25
+ from diffusers.utils.import_utils import is_xformers_available
26
+ from einops import rearrange
27
+
28
+
29
+ class SyncNet(nn.Module):
30
+ def __init__(self, config):
31
+ super().__init__()
32
+ self.audio_encoder = DownEncoder2D(
33
+ in_channels=config["audio_encoder"]["in_channels"],
34
+ block_out_channels=config["audio_encoder"]["block_out_channels"],
35
+ downsample_factors=config["audio_encoder"]["downsample_factors"],
36
+ dropout=config["audio_encoder"]["dropout"],
37
+ attn_blocks=config["audio_encoder"]["attn_blocks"],
38
+ )
39
+
40
+ self.visual_encoder = DownEncoder2D(
41
+ in_channels=config["visual_encoder"]["in_channels"],
42
+ block_out_channels=config["visual_encoder"]["block_out_channels"],
43
+ downsample_factors=config["visual_encoder"]["downsample_factors"],
44
+ dropout=config["visual_encoder"]["dropout"],
45
+ attn_blocks=config["visual_encoder"]["attn_blocks"],
46
+ )
47
+
48
+ self.eval()
49
+
50
+ def forward(self, image_sequences, audio_sequences):
51
+ vision_embeds = self.visual_encoder(image_sequences) # (b, c, 1, 1)
52
+ audio_embeds = self.audio_encoder(audio_sequences) # (b, c, 1, 1)
53
+
54
+ vision_embeds = vision_embeds.reshape(vision_embeds.shape[0], -1) # (b, c)
55
+ audio_embeds = audio_embeds.reshape(audio_embeds.shape[0], -1) # (b, c)
56
+
57
+ # Make them unit vectors
58
+ vision_embeds = F.normalize(vision_embeds, p=2, dim=1)
59
+ audio_embeds = F.normalize(audio_embeds, p=2, dim=1)
60
+
61
+ return vision_embeds, audio_embeds
62
+
63
+
64
+ class ResnetBlock2D(nn.Module):
65
+ def __init__(
66
+ self,
67
+ in_channels: int,
68
+ out_channels: int,
69
+ dropout: float = 0.0,
70
+ norm_num_groups: int = 32,
71
+ eps: float = 1e-6,
72
+ act_fn: str = "silu",
73
+ downsample_factor=2,
74
+ ):
75
+ super().__init__()
76
+
77
+ self.norm1 = nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=eps, affine=True)
78
+ self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
79
+
80
+ self.norm2 = nn.GroupNorm(num_groups=norm_num_groups, num_channels=out_channels, eps=eps, affine=True)
81
+ self.dropout = nn.Dropout(dropout)
82
+ self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
83
+
84
+ if act_fn == "relu":
85
+ self.act_fn = nn.ReLU()
86
+ elif act_fn == "silu":
87
+ self.act_fn = nn.SiLU()
88
+
89
+ if in_channels != out_channels:
90
+ self.conv_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
91
+ else:
92
+ self.conv_shortcut = None
93
+
94
+ if isinstance(downsample_factor, list):
95
+ downsample_factor = tuple(downsample_factor)
96
+
97
+ if downsample_factor == 1:
98
+ self.downsample_conv = None
99
+ else:
100
+ self.downsample_conv = nn.Conv2d(
101
+ out_channels, out_channels, kernel_size=3, stride=downsample_factor, padding=0
102
+ )
103
+ self.pad = (0, 1, 0, 1)
104
+ if isinstance(downsample_factor, tuple):
105
+ if downsample_factor[0] == 1:
106
+ self.pad = (0, 1, 1, 1) # The padding order is from back to front
107
+ elif downsample_factor[1] == 1:
108
+ self.pad = (1, 1, 0, 1)
109
+
110
+ def forward(self, input_tensor):
111
+ hidden_states = input_tensor
112
+
113
+ hidden_states = self.norm1(hidden_states)
114
+ hidden_states = self.act_fn(hidden_states)
115
+
116
+ hidden_states = self.conv1(hidden_states)
117
+ hidden_states = self.norm2(hidden_states)
118
+ hidden_states = self.act_fn(hidden_states)
119
+
120
+ hidden_states = self.dropout(hidden_states)
121
+ hidden_states = self.conv2(hidden_states)
122
+
123
+ if self.conv_shortcut is not None:
124
+ input_tensor = self.conv_shortcut(input_tensor)
125
+
126
+ hidden_states += input_tensor
127
+
128
+ if self.downsample_conv is not None:
129
+ hidden_states = F.pad(hidden_states, self.pad, mode="constant", value=0)
130
+ hidden_states = self.downsample_conv(hidden_states)
131
+
132
+ return hidden_states
133
+
134
+
135
+ class AttentionBlock2D(nn.Module):
136
+ def __init__(self, query_dim, norm_num_groups=32, dropout=0.0):
137
+ super().__init__()
138
+ if not is_xformers_available():
139
+ raise ModuleNotFoundError(
140
+ "You have to install xformers to enable memory efficient attetion", name="xformers"
141
+ )
142
+ # inner_dim = dim_head * heads
143
+ self.norm1 = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=query_dim, eps=1e-6, affine=True)
144
+ self.norm2 = nn.LayerNorm(query_dim)
145
+ self.norm3 = nn.LayerNorm(query_dim)
146
+
147
+ self.ff = FeedForward(query_dim, dropout=dropout, activation_fn="geglu")
148
+
149
+ self.conv_in = nn.Conv2d(query_dim, query_dim, kernel_size=1, stride=1, padding=0)
150
+ self.conv_out = nn.Conv2d(query_dim, query_dim, kernel_size=1, stride=1, padding=0)
151
+
152
+ self.attn = CrossAttention(query_dim=query_dim, heads=8, dim_head=query_dim // 8, dropout=dropout, bias=True)
153
+ self.attn._use_memory_efficient_attention_xformers = True
154
+
155
+ def forward(self, hidden_states):
156
+ assert hidden_states.dim() == 4, f"Expected hidden_states to have ndim=4, but got ndim={hidden_states.dim()}."
157
+
158
+ batch, channel, height, width = hidden_states.shape
159
+ residual = hidden_states
160
+
161
+ hidden_states = self.norm1(hidden_states)
162
+ hidden_states = self.conv_in(hidden_states)
163
+ hidden_states = rearrange(hidden_states, "b c h w -> b (h w) c")
164
+
165
+ norm_hidden_states = self.norm2(hidden_states)
166
+ hidden_states = self.attn(norm_hidden_states, attention_mask=None) + hidden_states
167
+ hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states
168
+
169
+ hidden_states = rearrange(hidden_states, "b (h w) c -> b c h w", h=height, w=width)
170
+ hidden_states = self.conv_out(hidden_states)
171
+
172
+ hidden_states = hidden_states + residual
173
+ return hidden_states
174
+
175
+
176
+ class DownEncoder2D(nn.Module):
177
+ def __init__(
178
+ self,
179
+ in_channels=4 * 16,
180
+ block_out_channels=[64, 128, 256, 256],
181
+ downsample_factors=[2, 2, 2, 2],
182
+ layers_per_block=2,
183
+ norm_num_groups=32,
184
+ attn_blocks=[1, 1, 1, 1],
185
+ dropout: float = 0.0,
186
+ act_fn="silu",
187
+ ):
188
+ super().__init__()
189
+ self.layers_per_block = layers_per_block
190
+
191
+ # in
192
+ self.conv_in = nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, stride=1, padding=1)
193
+
194
+ # down
195
+ self.down_blocks = nn.ModuleList([])
196
+
197
+ output_channels = block_out_channels[0]
198
+ for i, block_out_channel in enumerate(block_out_channels):
199
+ input_channels = output_channels
200
+ output_channels = block_out_channel
201
+ # is_final_block = i == len(block_out_channels) - 1
202
+
203
+ down_block = ResnetBlock2D(
204
+ in_channels=input_channels,
205
+ out_channels=output_channels,
206
+ downsample_factor=downsample_factors[i],
207
+ norm_num_groups=norm_num_groups,
208
+ dropout=dropout,
209
+ act_fn=act_fn,
210
+ )
211
+
212
+ self.down_blocks.append(down_block)
213
+
214
+ if attn_blocks[i] == 1:
215
+ attention_block = AttentionBlock2D(query_dim=output_channels, dropout=dropout)
216
+ self.down_blocks.append(attention_block)
217
+
218
+ # out
219
+ self.norm_out = nn.GroupNorm(num_channels=block_out_channels[-1], num_groups=norm_num_groups, eps=1e-6)
220
+ self.act_fn_out = nn.ReLU()
221
+
222
+ def forward(self, hidden_states):
223
+ hidden_states = self.conv_in(hidden_states)
224
+
225
+ # down
226
+ for down_block in self.down_blocks:
227
+ hidden_states = down_block(hidden_states)
228
+
229
+ # post-process
230
+ hidden_states = self.norm_out(hidden_states)
231
+ hidden_states = self.act_fn_out(hidden_states)
232
+
233
+ return hidden_states
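
Because both encoders return L2-normalized embeddings, audio-visual sync can be scored with a plain cosine similarity, which during training pairs with the 0/1 labels produced by the SyncNet dataset above. A hedged sketch follows; random tensors stand in for real embeddings, and mapping the similarity into [0, 1] is just one common convention.

# Sketch: scoring sync from the unit-norm embeddings returned by SyncNet.forward.
import torch
import torch.nn.functional as F

vision_embeds = F.normalize(torch.randn(4, 2048), dim=1)  # stand-in for real visual embeddings
audio_embeds = F.normalize(torch.randn(4, 2048), dim=1)   # stand-in for real audio embeddings

cos_sim = (vision_embeds * audio_embeds).sum(dim=1)       # in [-1, 1]
sync_score = (cos_sim + 1) / 2                            # mapped to [0, 1]
print(sync_score)
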
latentsync/models/syncnet_wav2lip.py ADDED
@@ -0,0 +1,90 @@
1
+ # Adapted from https://github.com/primepake/wav2lip_288x288/blob/master/models/syncnetv2.py
2
+ # The code here is only used for the ablation study.
3
+
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+
7
+
8
+ class SyncNetWav2Lip(nn.Module):
9
+ def __init__(self, act_fn="leaky"):
10
+ super().__init__()
11
+
12
+ # input image sequences: (15, 128, 256)
13
+ self.visual_encoder = nn.Sequential(
14
+ Conv2d(15, 32, kernel_size=(7, 7), stride=1, padding=3, act_fn=act_fn), # (128, 256)
15
+ Conv2d(32, 64, kernel_size=5, stride=(1, 2), padding=1, act_fn=act_fn), # (126, 127)
16
+ Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True, act_fn=act_fn),
17
+ Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True, act_fn=act_fn),
18
+ Conv2d(64, 128, kernel_size=3, stride=2, padding=1, act_fn=act_fn), # (63, 64)
19
+ Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True, act_fn=act_fn),
20
+ Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True, act_fn=act_fn),
21
+ Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True, act_fn=act_fn),
22
+ Conv2d(128, 256, kernel_size=3, stride=3, padding=1, act_fn=act_fn), # (21, 22)
23
+ Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True, act_fn=act_fn),
24
+ Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True, act_fn=act_fn),
25
+ Conv2d(256, 512, kernel_size=3, stride=2, padding=1, act_fn=act_fn), # (11, 11)
26
+ Conv2d(512, 512, kernel_size=3, stride=1, padding=1, residual=True, act_fn=act_fn),
27
+ Conv2d(512, 512, kernel_size=3, stride=1, padding=1, residual=True, act_fn=act_fn),
28
+ Conv2d(512, 1024, kernel_size=3, stride=2, padding=1, act_fn=act_fn), # (6, 6)
29
+ Conv2d(1024, 1024, kernel_size=3, stride=1, padding=1, residual=True, act_fn=act_fn),
30
+ Conv2d(1024, 1024, kernel_size=3, stride=1, padding=1, residual=True, act_fn=act_fn),
31
+ Conv2d(1024, 1024, kernel_size=3, stride=2, padding=1, act_fn="relu"), # (3, 3)
32
+ Conv2d(1024, 1024, kernel_size=3, stride=1, padding=0, act_fn="relu"), # (1, 1)
33
+ Conv2d(1024, 1024, kernel_size=1, stride=1, padding=0, act_fn="relu"),
34
+ )
35
+
36
+ # input audio sequences: (1, 80, 16)
37
+ self.audio_encoder = nn.Sequential(
38
+ Conv2d(1, 32, kernel_size=3, stride=1, padding=1, act_fn=act_fn),
39
+ Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True, act_fn=act_fn),
40
+ Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True, act_fn=act_fn),
41
+ Conv2d(32, 64, kernel_size=3, stride=(3, 1), padding=1, act_fn=act_fn), # (27, 16)
42
+ Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True, act_fn=act_fn),
43
+ Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True, act_fn=act_fn),
44
+ Conv2d(64, 128, kernel_size=3, stride=3, padding=1, act_fn=act_fn), # (9, 6)
45
+ Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True, act_fn=act_fn),
46
+ Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True, act_fn=act_fn),
47
+ Conv2d(128, 256, kernel_size=3, stride=(3, 2), padding=1, act_fn=act_fn), # (3, 3)
48
+ Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True, act_fn=act_fn),
49
+ Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True, act_fn=act_fn),
50
+ Conv2d(256, 512, kernel_size=3, stride=1, padding=1, act_fn=act_fn),
51
+ Conv2d(512, 512, kernel_size=3, stride=1, padding=1, residual=True, act_fn=act_fn),
52
+ Conv2d(512, 512, kernel_size=3, stride=1, padding=1, residual=True, act_fn=act_fn),
53
+ Conv2d(512, 1024, kernel_size=3, stride=1, padding=0, act_fn="relu"), # (1, 1)
54
+ Conv2d(1024, 1024, kernel_size=1, stride=1, padding=0, act_fn="relu"),
55
+ )
56
+
57
+ def forward(self, image_sequences, audio_sequences):
58
+ vision_embeds = self.visual_encoder(image_sequences) # (b, c, 1, 1)
59
+ audio_embeds = self.audio_encoder(audio_sequences) # (b, c, 1, 1)
60
+
61
+ vision_embeds = vision_embeds.reshape(vision_embeds.shape[0], -1) # (b, c)
62
+ audio_embeds = audio_embeds.reshape(audio_embeds.shape[0], -1) # (b, c)
63
+
64
+ # Make them unit vectors
65
+ vision_embeds = F.normalize(vision_embeds, p=2, dim=1)
66
+ audio_embeds = F.normalize(audio_embeds, p=2, dim=1)
67
+
68
+ return vision_embeds, audio_embeds
69
+
70
+
71
+ class Conv2d(nn.Module):
72
+ def __init__(self, cin, cout, kernel_size, stride, padding, residual=False, act_fn="relu", *args, **kwargs):
73
+ super().__init__(*args, **kwargs)
74
+ self.conv_block = nn.Sequential(nn.Conv2d(cin, cout, kernel_size, stride, padding), nn.BatchNorm2d(cout))
75
+ if act_fn == "relu":
76
+ self.act_fn = nn.ReLU()
77
+ elif act_fn == "tanh":
78
+ self.act_fn = nn.Tanh()
79
+ elif act_fn == "silu":
80
+ self.act_fn = nn.SiLU()
81
+ elif act_fn == "leaky":
82
+ self.act_fn = nn.LeakyReLU(0.2, inplace=True)
83
+
84
+ self.residual = residual
85
+
86
+ def forward(self, x):
87
+ out = self.conv_block(x)
88
+ if self.residual:
89
+ out += x
90
+ return self.act_fn(out)
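
A minimal usage sketch for the Wav2Lip-style SyncNet above (not part of the diff). It assumes the input shapes given in the code comments, (B, 15, 128, 256) for the stacked frames and (B, 1, 80, 16) for the mel window, and a cosine-similarity BCE loss in the spirit of the original Wav2Lip recipe; the clamp is an assumption to keep the BCE input in [0, 1].

```python
# Hypothetical sketch: turning SyncNetWav2Lip embeddings into a sync loss.
# Shapes follow the comments in the file above; the clamp is an assumption.
import torch
import torch.nn.functional as F

model = SyncNetWav2Lip(act_fn="leaky")
frames = torch.randn(2, 15, 128, 256)  # 5 stacked RGB frames per sample
mels = torch.randn(2, 1, 80, 16)       # mel-spectrogram window per sample
labels = torch.tensor([1.0, 0.0])      # 1 = in sync, 0 = out of sync

vision_embeds, audio_embeds = model(frames, mels)
# Both embeddings are L2-normalized, so their dot product is the cosine similarity.
cos_sim = (vision_embeds * audio_embeds).sum(dim=1)
loss = F.binary_cross_entropy(cos_sim.clamp(min=0.0, max=1.0), labels)
```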
latentsync/models/unet.py ADDED
@@ -0,0 +1,528 @@
1
+ # Adapted from https://github.com/guoyww/AnimateDiff/blob/main/animatediff/models/unet.py
2
+
3
+ from dataclasses import dataclass
4
+ from typing import List, Optional, Tuple, Union
5
+ import copy
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.utils.checkpoint
10
+
11
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
12
+ from diffusers.modeling_utils import ModelMixin
13
+ from diffusers import UNet2DConditionModel
14
+ from diffusers.utils import BaseOutput, logging
15
+ from diffusers.models.embeddings import TimestepEmbedding, Timesteps
16
+ from .unet_blocks import (
17
+ CrossAttnDownBlock3D,
18
+ CrossAttnUpBlock3D,
19
+ DownBlock3D,
20
+ UNetMidBlock3DCrossAttn,
21
+ UpBlock3D,
22
+ get_down_block,
23
+ get_up_block,
24
+ )
25
+ from .resnet import InflatedConv3d, InflatedGroupNorm
26
+
27
+ from ..utils.util import zero_rank_log
28
+ from einops import rearrange
29
+ from .utils import zero_module
30
+
31
+
32
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
33
+
34
+
35
+ @dataclass
36
+ class UNet3DConditionOutput(BaseOutput):
37
+ sample: torch.FloatTensor
38
+
39
+
40
+ class UNet3DConditionModel(ModelMixin, ConfigMixin):
41
+ _supports_gradient_checkpointing = True
42
+
43
+ @register_to_config
44
+ def __init__(
45
+ self,
46
+ sample_size: Optional[int] = None,
47
+ in_channels: int = 4,
48
+ out_channels: int = 4,
49
+ center_input_sample: bool = False,
50
+ flip_sin_to_cos: bool = True,
51
+ freq_shift: int = 0,
52
+ down_block_types: Tuple[str] = (
53
+ "CrossAttnDownBlock3D",
54
+ "CrossAttnDownBlock3D",
55
+ "CrossAttnDownBlock3D",
56
+ "DownBlock3D",
57
+ ),
58
+ mid_block_type: str = "UNetMidBlock3DCrossAttn",
59
+ up_block_types: Tuple[str] = ("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D"),
60
+ only_cross_attention: Union[bool, Tuple[bool]] = False,
61
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
62
+ layers_per_block: int = 2,
63
+ downsample_padding: int = 1,
64
+ mid_block_scale_factor: float = 1,
65
+ act_fn: str = "silu",
66
+ norm_num_groups: int = 32,
67
+ norm_eps: float = 1e-5,
68
+ cross_attention_dim: int = 1280,
69
+ attention_head_dim: Union[int, Tuple[int]] = 8,
70
+ dual_cross_attention: bool = False,
71
+ use_linear_projection: bool = False,
72
+ class_embed_type: Optional[str] = None,
73
+ num_class_embeds: Optional[int] = None,
74
+ upcast_attention: bool = False,
75
+ resnet_time_scale_shift: str = "default",
76
+ use_inflated_groupnorm=False,
77
+ # Additional
78
+ use_motion_module=False,
79
+ motion_module_resolutions=(1, 2, 4, 8),
80
+ motion_module_mid_block=False,
81
+ motion_module_decoder_only=False,
82
+ motion_module_type=None,
83
+ motion_module_kwargs={},
84
+ unet_use_cross_frame_attention=False,
85
+ unet_use_temporal_attention=False,
86
+ add_audio_layer=False,
87
+ audio_condition_method: str = "cross_attn",
88
+ custom_audio_layer=False,
89
+ ):
90
+ super().__init__()
91
+
92
+ self.sample_size = sample_size
93
+ time_embed_dim = block_out_channels[0] * 4
94
+ self.use_motion_module = use_motion_module
95
+ self.add_audio_layer = add_audio_layer
96
+
97
+ self.conv_in = zero_module(InflatedConv3d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1)))
98
+
99
+ # time
100
+ self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
101
+ timestep_input_dim = block_out_channels[0]
102
+
103
+ self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
104
+
105
+ # class embedding
106
+ if class_embed_type is None and num_class_embeds is not None:
107
+ self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
108
+ elif class_embed_type == "timestep":
109
+ self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
110
+ elif class_embed_type == "identity":
111
+ self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
112
+ else:
113
+ self.class_embedding = None
114
+
115
+ self.down_blocks = nn.ModuleList([])
116
+ self.mid_block = None
117
+ self.up_blocks = nn.ModuleList([])
118
+
119
+ if isinstance(only_cross_attention, bool):
120
+ only_cross_attention = [only_cross_attention] * len(down_block_types)
121
+
122
+ if isinstance(attention_head_dim, int):
123
+ attention_head_dim = (attention_head_dim,) * len(down_block_types)
124
+
125
+ # down
126
+ output_channel = block_out_channels[0]
127
+ for i, down_block_type in enumerate(down_block_types):
128
+ res = 2**i
129
+ input_channel = output_channel
130
+ output_channel = block_out_channels[i]
131
+ is_final_block = i == len(block_out_channels) - 1
132
+
133
+ down_block = get_down_block(
134
+ down_block_type,
135
+ num_layers=layers_per_block,
136
+ in_channels=input_channel,
137
+ out_channels=output_channel,
138
+ temb_channels=time_embed_dim,
139
+ add_downsample=not is_final_block,
140
+ resnet_eps=norm_eps,
141
+ resnet_act_fn=act_fn,
142
+ resnet_groups=norm_num_groups,
143
+ cross_attention_dim=cross_attention_dim,
144
+ attn_num_head_channels=attention_head_dim[i],
145
+ downsample_padding=downsample_padding,
146
+ dual_cross_attention=dual_cross_attention,
147
+ use_linear_projection=use_linear_projection,
148
+ only_cross_attention=only_cross_attention[i],
149
+ upcast_attention=upcast_attention,
150
+ resnet_time_scale_shift=resnet_time_scale_shift,
151
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
152
+ unet_use_temporal_attention=unet_use_temporal_attention,
153
+ use_inflated_groupnorm=use_inflated_groupnorm,
154
+ use_motion_module=use_motion_module
155
+ and (res in motion_module_resolutions)
156
+ and (not motion_module_decoder_only),
157
+ motion_module_type=motion_module_type,
158
+ motion_module_kwargs=motion_module_kwargs,
159
+ add_audio_layer=add_audio_layer,
160
+ audio_condition_method=audio_condition_method,
161
+ custom_audio_layer=custom_audio_layer,
162
+ )
163
+ self.down_blocks.append(down_block)
164
+
165
+ # mid
166
+ if mid_block_type == "UNetMidBlock3DCrossAttn":
167
+ self.mid_block = UNetMidBlock3DCrossAttn(
168
+ in_channels=block_out_channels[-1],
169
+ temb_channels=time_embed_dim,
170
+ resnet_eps=norm_eps,
171
+ resnet_act_fn=act_fn,
172
+ output_scale_factor=mid_block_scale_factor,
173
+ resnet_time_scale_shift=resnet_time_scale_shift,
174
+ cross_attention_dim=cross_attention_dim,
175
+ attn_num_head_channels=attention_head_dim[-1],
176
+ resnet_groups=norm_num_groups,
177
+ dual_cross_attention=dual_cross_attention,
178
+ use_linear_projection=use_linear_projection,
179
+ upcast_attention=upcast_attention,
180
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
181
+ unet_use_temporal_attention=unet_use_temporal_attention,
182
+ use_inflated_groupnorm=use_inflated_groupnorm,
183
+ use_motion_module=use_motion_module and motion_module_mid_block,
184
+ motion_module_type=motion_module_type,
185
+ motion_module_kwargs=motion_module_kwargs,
186
+ add_audio_layer=add_audio_layer,
187
+ audio_condition_method=audio_condition_method,
188
+ custom_audio_layer=custom_audio_layer,
189
+ )
190
+ else:
191
+ raise ValueError(f"unknown mid_block_type : {mid_block_type}")
192
+
193
+ # count how many layers upsample the videos
194
+ self.num_upsamplers = 0
195
+
196
+ # up
197
+ reversed_block_out_channels = list(reversed(block_out_channels))
198
+ reversed_attention_head_dim = list(reversed(attention_head_dim))
199
+ only_cross_attention = list(reversed(only_cross_attention))
200
+ output_channel = reversed_block_out_channels[0]
201
+ for i, up_block_type in enumerate(up_block_types):
202
+ res = 2 ** (3 - i)
203
+ is_final_block = i == len(block_out_channels) - 1
204
+
205
+ prev_output_channel = output_channel
206
+ output_channel = reversed_block_out_channels[i]
207
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
208
+
209
+ # add upsample block for all BUT final layer
210
+ if not is_final_block:
211
+ add_upsample = True
212
+ self.num_upsamplers += 1
213
+ else:
214
+ add_upsample = False
215
+
216
+ up_block = get_up_block(
217
+ up_block_type,
218
+ num_layers=layers_per_block + 1,
219
+ in_channels=input_channel,
220
+ out_channels=output_channel,
221
+ prev_output_channel=prev_output_channel,
222
+ temb_channels=time_embed_dim,
223
+ add_upsample=add_upsample,
224
+ resnet_eps=norm_eps,
225
+ resnet_act_fn=act_fn,
226
+ resnet_groups=norm_num_groups,
227
+ cross_attention_dim=cross_attention_dim,
228
+ attn_num_head_channels=reversed_attention_head_dim[i],
229
+ dual_cross_attention=dual_cross_attention,
230
+ use_linear_projection=use_linear_projection,
231
+ only_cross_attention=only_cross_attention[i],
232
+ upcast_attention=upcast_attention,
233
+ resnet_time_scale_shift=resnet_time_scale_shift,
234
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
235
+ unet_use_temporal_attention=unet_use_temporal_attention,
236
+ use_inflated_groupnorm=use_inflated_groupnorm,
237
+ use_motion_module=use_motion_module and (res in motion_module_resolutions),
238
+ motion_module_type=motion_module_type,
239
+ motion_module_kwargs=motion_module_kwargs,
240
+ add_audio_layer=add_audio_layer,
241
+ audio_condition_method=audio_condition_method,
242
+ custom_audio_layer=custom_audio_layer,
243
+ )
244
+ self.up_blocks.append(up_block)
245
+ prev_output_channel = output_channel
246
+
247
+ # out
248
+ if use_inflated_groupnorm:
249
+ self.conv_norm_out = InflatedGroupNorm(
250
+ num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
251
+ )
252
+ else:
253
+ self.conv_norm_out = nn.GroupNorm(
254
+ num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
255
+ )
256
+ self.conv_act = nn.SiLU()
257
+
258
+ self.conv_out = zero_module(InflatedConv3d(block_out_channels[0], out_channels, kernel_size=3, padding=1))
259
+
260
+ def set_attention_slice(self, slice_size):
261
+ r"""
262
+ Enable sliced attention computation.
263
+
264
+ When this option is enabled, the attention module will split the input tensor in slices, to compute attention
265
+ in several steps. This is useful to save some memory in exchange for a small speed decrease.
266
+
267
+ Args:
268
+ slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
269
+ When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
270
+ `"max"`, maxium amount of memory will be saved by running only one slice at a time. If a number is
271
+ provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
272
+ must be a multiple of `slice_size`.
273
+ """
274
+ sliceable_head_dims = []
275
+
276
+ def fn_recursive_retrieve_slicable_dims(module: torch.nn.Module):
277
+ if hasattr(module, "set_attention_slice"):
278
+ sliceable_head_dims.append(module.sliceable_head_dim)
279
+
280
+ for child in module.children():
281
+ fn_recursive_retrieve_slicable_dims(child)
282
+
283
+ # retrieve number of attention layers
284
+ for module in self.children():
285
+ fn_recursive_retrieve_slicable_dims(module)
286
+
287
+ num_slicable_layers = len(sliceable_head_dims)
288
+
289
+ if slice_size == "auto":
290
+ # half the attention head size is usually a good trade-off between
291
+ # speed and memory
292
+ slice_size = [dim // 2 for dim in sliceable_head_dims]
293
+ elif slice_size == "max":
294
+ # make smallest slice possible
295
+ slice_size = num_slicable_layers * [1]
296
+
297
+ slice_size = num_slicable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
298
+
299
+ if len(slice_size) != len(sliceable_head_dims):
300
+ raise ValueError(
301
+ f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
302
+ f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
303
+ )
304
+
305
+ for i in range(len(slice_size)):
306
+ size = slice_size[i]
307
+ dim = sliceable_head_dims[i]
308
+ if size is not None and size > dim:
309
+ raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
310
+
311
+ # Recursively walk through all the children.
312
+ # Any children which exposes the set_attention_slice method
313
+ # gets the message
314
+ def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
315
+ if hasattr(module, "set_attention_slice"):
316
+ module.set_attention_slice(slice_size.pop())
317
+
318
+ for child in module.children():
319
+ fn_recursive_set_attention_slice(child, slice_size)
320
+
321
+ reversed_slice_size = list(reversed(slice_size))
322
+ for module in self.children():
323
+ fn_recursive_set_attention_slice(module, reversed_slice_size)
324
+
325
+ def _set_gradient_checkpointing(self, module, value=False):
326
+ if isinstance(module, (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)):
327
+ module.gradient_checkpointing = value
328
+
329
+ def forward(
330
+ self,
331
+ sample: torch.FloatTensor,
332
+ timestep: Union[torch.Tensor, float, int],
333
+ encoder_hidden_states: torch.Tensor,
334
+ class_labels: Optional[torch.Tensor] = None,
335
+ attention_mask: Optional[torch.Tensor] = None,
336
+ # support controlnet
337
+ down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
338
+ mid_block_additional_residual: Optional[torch.Tensor] = None,
339
+ return_dict: bool = True,
340
+ ) -> Union[UNet3DConditionOutput, Tuple]:
341
+ r"""
342
+ Args:
343
+ sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
344
+ timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
345
+ encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states
346
+ return_dict (`bool`, *optional*, defaults to `True`):
347
+ Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
348
+
349
+ Returns:
350
+ [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
351
+ [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
352
+ returning a tuple, the first element is the sample tensor.
353
+ """
354
+ # By default samples have to be AT least a multiple of the overall upsampling factor.
355
+ # The overall upsampling factor is equal to 2 ** (# num of upsampling layears).
356
+ # However, the upsampling interpolation output size can be forced to fit any upsampling size
357
+ # on the fly if necessary.
358
+ default_overall_up_factor = 2**self.num_upsamplers
359
+
360
+ # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
361
+ forward_upsample_size = False
362
+ upsample_size = None
363
+
364
+ if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
365
+ logger.info("Forward upsample size to force interpolation output size.")
366
+ forward_upsample_size = True
367
+
368
+ # prepare attention_mask
369
+ if attention_mask is not None:
370
+ attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
371
+ attention_mask = attention_mask.unsqueeze(1)
372
+
373
+ # center input if necessary
374
+ if self.config.center_input_sample:
375
+ sample = 2 * sample - 1.0
376
+
377
+ # time
378
+ timesteps = timestep
379
+ if not torch.is_tensor(timesteps):
380
+ # This would be a good case for the `match` statement (Python 3.10+)
381
+ is_mps = sample.device.type == "mps"
382
+ if isinstance(timestep, float):
383
+ dtype = torch.float32 if is_mps else torch.float64
384
+ else:
385
+ dtype = torch.int32 if is_mps else torch.int64
386
+ timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
387
+ elif len(timesteps.shape) == 0:
388
+ timesteps = timesteps[None].to(sample.device)
389
+
390
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
391
+ timesteps = timesteps.expand(sample.shape[0])
392
+
393
+ t_emb = self.time_proj(timesteps)
394
+
395
+ # timesteps does not contain any weights and will always return f32 tensors
396
+ # but time_embedding might actually be running in fp16. so we need to cast here.
397
+ # there might be better ways to encapsulate this.
398
+ t_emb = t_emb.to(dtype=self.dtype)
399
+ emb = self.time_embedding(t_emb)
400
+
401
+ if self.class_embedding is not None:
402
+ if class_labels is None:
403
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
404
+
405
+ if self.config.class_embed_type == "timestep":
406
+ class_labels = self.time_proj(class_labels)
407
+
408
+ class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
409
+ emb = emb + class_emb
410
+
411
+ # pre-process
412
+ sample = self.conv_in(sample)
413
+
414
+ # down
415
+ down_block_res_samples = (sample,)
416
+ for downsample_block in self.down_blocks:
417
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
418
+ sample, res_samples = downsample_block(
419
+ hidden_states=sample,
420
+ temb=emb,
421
+ encoder_hidden_states=encoder_hidden_states,
422
+ attention_mask=attention_mask,
423
+ )
424
+ else:
425
+ sample, res_samples = downsample_block(
426
+ hidden_states=sample, temb=emb, encoder_hidden_states=encoder_hidden_states
427
+ )
428
+
429
+ down_block_res_samples += res_samples
430
+
431
+ # support controlnet
432
+ down_block_res_samples = list(down_block_res_samples)
433
+ if down_block_additional_residuals is not None:
434
+ for i, down_block_additional_residual in enumerate(down_block_additional_residuals):
435
+ if down_block_additional_residual.dim() == 4: # broadcast
436
+ down_block_additional_residual = down_block_additional_residual.unsqueeze(2)
437
+ down_block_res_samples[i] = down_block_res_samples[i] + down_block_additional_residual
438
+
439
+ # mid
440
+ sample = self.mid_block(
441
+ sample, emb, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask
442
+ )
443
+
444
+ # support controlnet
445
+ if mid_block_additional_residual is not None:
446
+ if mid_block_additional_residual.dim() == 4: # broadcast
447
+ mid_block_additional_residual = mid_block_additional_residual.unsqueeze(2)
448
+ sample = sample + mid_block_additional_residual
449
+
450
+ # up
451
+ for i, upsample_block in enumerate(self.up_blocks):
452
+ is_final_block = i == len(self.up_blocks) - 1
453
+
454
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
455
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
456
+
457
+ # if we have not reached the final block and need to forward the
458
+ # upsample size, we do it here
459
+ if not is_final_block and forward_upsample_size:
460
+ upsample_size = down_block_res_samples[-1].shape[2:]
461
+
462
+ if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
463
+ sample = upsample_block(
464
+ hidden_states=sample,
465
+ temb=emb,
466
+ res_hidden_states_tuple=res_samples,
467
+ encoder_hidden_states=encoder_hidden_states,
468
+ upsample_size=upsample_size,
469
+ attention_mask=attention_mask,
470
+ )
471
+ else:
472
+ sample = upsample_block(
473
+ hidden_states=sample,
474
+ temb=emb,
475
+ res_hidden_states_tuple=res_samples,
476
+ upsample_size=upsample_size,
477
+ encoder_hidden_states=encoder_hidden_states,
478
+ )
479
+
480
+ # post-process
481
+ sample = self.conv_norm_out(sample)
482
+ sample = self.conv_act(sample)
483
+ sample = self.conv_out(sample)
484
+
485
+ if not return_dict:
486
+ return (sample,)
487
+
488
+ return UNet3DConditionOutput(sample=sample)
489
+
490
+ def load_state_dict(self, state_dict, strict=True):
491
+ # If the loaded checkpoint's in_channels or out_channels are different from config
492
+ temp_state_dict = copy.deepcopy(state_dict)
493
+ if temp_state_dict["conv_in.weight"].shape[1] != self.config.in_channels:
494
+ del temp_state_dict["conv_in.weight"]
495
+ del temp_state_dict["conv_in.bias"]
496
+ if temp_state_dict["conv_out.weight"].shape[0] != self.config.out_channels:
497
+ del temp_state_dict["conv_out.weight"]
498
+ del temp_state_dict["conv_out.bias"]
499
+
500
+ # If the loaded checkpoint's cross_attention_dim is different from config
501
+ keys_to_remove = []
502
+ for key in temp_state_dict:
503
+ if "audio_cross_attn.attn.to_k." in key or "audio_cross_attn.attn.to_v." in key:
504
+ if temp_state_dict[key].shape[1] != self.config.cross_attention_dim:
505
+ keys_to_remove.append(key)
506
+
507
+ for key in keys_to_remove:
508
+ del temp_state_dict[key]
509
+
510
+ return super().load_state_dict(state_dict=temp_state_dict, strict=strict)
511
+
512
+ @classmethod
513
+ def from_pretrained(cls, model_config: dict, ckpt_path: str, device="cpu"):
514
+ unet = cls.from_config(model_config).to(device)
515
+ if ckpt_path != "":
516
+ zero_rank_log(logger, f"Load from checkpoint: {ckpt_path}")
517
+ ckpt = torch.load(ckpt_path, map_location=device)
518
+ if "global_step" in ckpt:
519
+ zero_rank_log(logger, f"resume from global_step: {ckpt['global_step']}")
520
+ resume_global_step = ckpt["global_step"]
521
+ else:
522
+ resume_global_step = 0
523
+ state_dict = ckpt["state_dict"] if "state_dict" in ckpt else ckpt
524
+ unet.load_state_dict(state_dict, strict=False)
525
+ else:
526
+ resume_global_step = 0
527
+
528
+ return unet, resume_global_step
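
A brief, hypothetical loading sketch for the `from_pretrained` classmethod above; every value in `model_config` is an illustrative placeholder rather than a setting taken from this repo's config files.

```python
# Hypothetical sketch of loading the 3D UNet via the custom from_pretrained above.
# All config values below are illustrative placeholders; unspecified keys fall back
# to the constructor defaults registered by @register_to_config.
model_config = {
    "in_channels": 4,
    "out_channels": 4,
    "add_audio_layer": True,      # enable the audio cross-attention layers
    "cross_attention_dim": 384,   # assumed audio feature dimension
}
unet, resume_global_step = UNet3DConditionModel.from_pretrained(
    model_config=model_config,
    ckpt_path="",   # an empty string skips checkpoint loading, as handled above
    device="cpu",
)
print(resume_global_step)  # 0 when no checkpoint is loaded
```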
latentsync/models/unet_blocks.py ADDED
@@ -0,0 +1,903 @@
1
+ # Adapted from https://github.com/guoyww/AnimateDiff/blob/main/animatediff/models/unet_blocks.py
2
+
3
+ import torch
4
+ from torch import nn
5
+
6
+ from .attention import Transformer3DModel
7
+ from .resnet import Downsample3D, ResnetBlock3D, Upsample3D
8
+ from .motion_module import get_motion_module
9
+
10
+
11
+ def get_down_block(
12
+ down_block_type,
13
+ num_layers,
14
+ in_channels,
15
+ out_channels,
16
+ temb_channels,
17
+ add_downsample,
18
+ resnet_eps,
19
+ resnet_act_fn,
20
+ attn_num_head_channels,
21
+ resnet_groups=None,
22
+ cross_attention_dim=None,
23
+ downsample_padding=None,
24
+ dual_cross_attention=False,
25
+ use_linear_projection=False,
26
+ only_cross_attention=False,
27
+ upcast_attention=False,
28
+ resnet_time_scale_shift="default",
29
+ unet_use_cross_frame_attention=False,
30
+ unet_use_temporal_attention=False,
31
+ use_inflated_groupnorm=False,
32
+ use_motion_module=None,
33
+ motion_module_type=None,
34
+ motion_module_kwargs=None,
35
+ add_audio_layer=False,
36
+ audio_condition_method="cross_attn",
37
+ custom_audio_layer=False,
38
+ ):
39
+ down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
40
+ if down_block_type == "DownBlock3D":
41
+ return DownBlock3D(
42
+ num_layers=num_layers,
43
+ in_channels=in_channels,
44
+ out_channels=out_channels,
45
+ temb_channels=temb_channels,
46
+ add_downsample=add_downsample,
47
+ resnet_eps=resnet_eps,
48
+ resnet_act_fn=resnet_act_fn,
49
+ resnet_groups=resnet_groups,
50
+ downsample_padding=downsample_padding,
51
+ resnet_time_scale_shift=resnet_time_scale_shift,
52
+ use_inflated_groupnorm=use_inflated_groupnorm,
53
+ use_motion_module=use_motion_module,
54
+ motion_module_type=motion_module_type,
55
+ motion_module_kwargs=motion_module_kwargs,
56
+ )
57
+ elif down_block_type == "CrossAttnDownBlock3D":
58
+ if cross_attention_dim is None:
59
+ raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock3D")
60
+ return CrossAttnDownBlock3D(
61
+ num_layers=num_layers,
62
+ in_channels=in_channels,
63
+ out_channels=out_channels,
64
+ temb_channels=temb_channels,
65
+ add_downsample=add_downsample,
66
+ resnet_eps=resnet_eps,
67
+ resnet_act_fn=resnet_act_fn,
68
+ resnet_groups=resnet_groups,
69
+ downsample_padding=downsample_padding,
70
+ cross_attention_dim=cross_attention_dim,
71
+ attn_num_head_channels=attn_num_head_channels,
72
+ dual_cross_attention=dual_cross_attention,
73
+ use_linear_projection=use_linear_projection,
74
+ only_cross_attention=only_cross_attention,
75
+ upcast_attention=upcast_attention,
76
+ resnet_time_scale_shift=resnet_time_scale_shift,
77
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
78
+ unet_use_temporal_attention=unet_use_temporal_attention,
79
+ use_inflated_groupnorm=use_inflated_groupnorm,
80
+ use_motion_module=use_motion_module,
81
+ motion_module_type=motion_module_type,
82
+ motion_module_kwargs=motion_module_kwargs,
83
+ add_audio_layer=add_audio_layer,
84
+ audio_condition_method=audio_condition_method,
85
+ custom_audio_layer=custom_audio_layer,
86
+ )
87
+ raise ValueError(f"{down_block_type} does not exist.")
88
+
89
+
90
+ def get_up_block(
91
+ up_block_type,
92
+ num_layers,
93
+ in_channels,
94
+ out_channels,
95
+ prev_output_channel,
96
+ temb_channels,
97
+ add_upsample,
98
+ resnet_eps,
99
+ resnet_act_fn,
100
+ attn_num_head_channels,
101
+ resnet_groups=None,
102
+ cross_attention_dim=None,
103
+ dual_cross_attention=False,
104
+ use_linear_projection=False,
105
+ only_cross_attention=False,
106
+ upcast_attention=False,
107
+ resnet_time_scale_shift="default",
108
+ unet_use_cross_frame_attention=False,
109
+ unet_use_temporal_attention=False,
110
+ use_inflated_groupnorm=False,
111
+ use_motion_module=None,
112
+ motion_module_type=None,
113
+ motion_module_kwargs=None,
114
+ add_audio_layer=False,
115
+ audio_condition_method="cross_attn",
116
+ custom_audio_layer=False,
117
+ ):
118
+ up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
119
+ if up_block_type == "UpBlock3D":
120
+ return UpBlock3D(
121
+ num_layers=num_layers,
122
+ in_channels=in_channels,
123
+ out_channels=out_channels,
124
+ prev_output_channel=prev_output_channel,
125
+ temb_channels=temb_channels,
126
+ add_upsample=add_upsample,
127
+ resnet_eps=resnet_eps,
128
+ resnet_act_fn=resnet_act_fn,
129
+ resnet_groups=resnet_groups,
130
+ resnet_time_scale_shift=resnet_time_scale_shift,
131
+ use_inflated_groupnorm=use_inflated_groupnorm,
132
+ use_motion_module=use_motion_module,
133
+ motion_module_type=motion_module_type,
134
+ motion_module_kwargs=motion_module_kwargs,
135
+ )
136
+ elif up_block_type == "CrossAttnUpBlock3D":
137
+ if cross_attention_dim is None:
138
+ raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock3D")
139
+ return CrossAttnUpBlock3D(
140
+ num_layers=num_layers,
141
+ in_channels=in_channels,
142
+ out_channels=out_channels,
143
+ prev_output_channel=prev_output_channel,
144
+ temb_channels=temb_channels,
145
+ add_upsample=add_upsample,
146
+ resnet_eps=resnet_eps,
147
+ resnet_act_fn=resnet_act_fn,
148
+ resnet_groups=resnet_groups,
149
+ cross_attention_dim=cross_attention_dim,
150
+ attn_num_head_channels=attn_num_head_channels,
151
+ dual_cross_attention=dual_cross_attention,
152
+ use_linear_projection=use_linear_projection,
153
+ only_cross_attention=only_cross_attention,
154
+ upcast_attention=upcast_attention,
155
+ resnet_time_scale_shift=resnet_time_scale_shift,
156
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
157
+ unet_use_temporal_attention=unet_use_temporal_attention,
158
+ use_inflated_groupnorm=use_inflated_groupnorm,
159
+ use_motion_module=use_motion_module,
160
+ motion_module_type=motion_module_type,
161
+ motion_module_kwargs=motion_module_kwargs,
162
+ add_audio_layer=add_audio_layer,
163
+ audio_condition_method=audio_condition_method,
164
+ custom_audio_layer=custom_audio_layer,
165
+ )
166
+ raise ValueError(f"{up_block_type} does not exist.")
167
+
168
+
169
+ class UNetMidBlock3DCrossAttn(nn.Module):
170
+ def __init__(
171
+ self,
172
+ in_channels: int,
173
+ temb_channels: int,
174
+ dropout: float = 0.0,
175
+ num_layers: int = 1,
176
+ resnet_eps: float = 1e-6,
177
+ resnet_time_scale_shift: str = "default",
178
+ resnet_act_fn: str = "swish",
179
+ resnet_groups: int = 32,
180
+ resnet_pre_norm: bool = True,
181
+ attn_num_head_channels=1,
182
+ output_scale_factor=1.0,
183
+ cross_attention_dim=1280,
184
+ dual_cross_attention=False,
185
+ use_linear_projection=False,
186
+ upcast_attention=False,
187
+ unet_use_cross_frame_attention=False,
188
+ unet_use_temporal_attention=False,
189
+ use_inflated_groupnorm=False,
190
+ use_motion_module=None,
191
+ motion_module_type=None,
192
+ motion_module_kwargs=None,
193
+ add_audio_layer=False,
194
+ audio_condition_method="cross_attn",
195
+ custom_audio_layer: bool = False,
196
+ ):
197
+ super().__init__()
198
+
199
+ self.has_cross_attention = True
200
+ self.attn_num_head_channels = attn_num_head_channels
201
+ resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
202
+
203
+ # there is always at least one resnet
204
+ resnets = [
205
+ ResnetBlock3D(
206
+ in_channels=in_channels,
207
+ out_channels=in_channels,
208
+ temb_channels=temb_channels,
209
+ eps=resnet_eps,
210
+ groups=resnet_groups,
211
+ dropout=dropout,
212
+ time_embedding_norm=resnet_time_scale_shift,
213
+ non_linearity=resnet_act_fn,
214
+ output_scale_factor=output_scale_factor,
215
+ pre_norm=resnet_pre_norm,
216
+ use_inflated_groupnorm=use_inflated_groupnorm,
217
+ )
218
+ ]
219
+ attentions = []
220
+ audio_attentions = []
221
+ motion_modules = []
222
+
223
+ for _ in range(num_layers):
224
+ if dual_cross_attention:
225
+ raise NotImplementedError
226
+ attentions.append(
227
+ Transformer3DModel(
228
+ attn_num_head_channels,
229
+ in_channels // attn_num_head_channels,
230
+ in_channels=in_channels,
231
+ num_layers=1,
232
+ cross_attention_dim=cross_attention_dim,
233
+ norm_num_groups=resnet_groups,
234
+ use_linear_projection=use_linear_projection,
235
+ upcast_attention=upcast_attention,
236
+ use_motion_module=use_motion_module,
237
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
238
+ unet_use_temporal_attention=unet_use_temporal_attention,
239
+ add_audio_layer=add_audio_layer,
240
+ audio_condition_method=audio_condition_method,
241
+ )
242
+ )
243
+ audio_attentions.append(
244
+ Transformer3DModel(
245
+ attn_num_head_channels,
246
+ in_channels // attn_num_head_channels,
247
+ in_channels=in_channels,
248
+ num_layers=1,
249
+ cross_attention_dim=cross_attention_dim,
250
+ norm_num_groups=resnet_groups,
251
+ use_linear_projection=use_linear_projection,
252
+ upcast_attention=upcast_attention,
253
+ use_motion_module=use_motion_module,
254
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
255
+ unet_use_temporal_attention=unet_use_temporal_attention,
256
+ add_audio_layer=add_audio_layer,
257
+ audio_condition_method=audio_condition_method,
258
+ custom_audio_layer=True,
259
+ )
260
+ if custom_audio_layer
261
+ else None
262
+ )
263
+ motion_modules.append(
264
+ get_motion_module(
265
+ in_channels=in_channels,
266
+ motion_module_type=motion_module_type,
267
+ motion_module_kwargs=motion_module_kwargs,
268
+ )
269
+ if use_motion_module
270
+ else None
271
+ )
272
+ resnets.append(
273
+ ResnetBlock3D(
274
+ in_channels=in_channels,
275
+ out_channels=in_channels,
276
+ temb_channels=temb_channels,
277
+ eps=resnet_eps,
278
+ groups=resnet_groups,
279
+ dropout=dropout,
280
+ time_embedding_norm=resnet_time_scale_shift,
281
+ non_linearity=resnet_act_fn,
282
+ output_scale_factor=output_scale_factor,
283
+ pre_norm=resnet_pre_norm,
284
+ use_inflated_groupnorm=use_inflated_groupnorm,
285
+ )
286
+ )
287
+
288
+ self.attentions = nn.ModuleList(attentions)
289
+ self.audio_attentions = nn.ModuleList(audio_attentions)
290
+ self.resnets = nn.ModuleList(resnets)
291
+ self.motion_modules = nn.ModuleList(motion_modules)
292
+
293
+ def forward(self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None):
294
+ hidden_states = self.resnets[0](hidden_states, temb)
295
+ for attn, audio_attn, resnet, motion_module in zip(
296
+ self.attentions, self.audio_attentions, self.resnets[1:], self.motion_modules
297
+ ):
298
+ hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
299
+ hidden_states = (
300
+ audio_attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
301
+ if audio_attn is not None
302
+ else hidden_states
303
+ )
304
+ hidden_states = (
305
+ motion_module(hidden_states, temb, encoder_hidden_states=encoder_hidden_states)
306
+ if motion_module is not None
307
+ else hidden_states
308
+ )
309
+ hidden_states = resnet(hidden_states, temb)
310
+
311
+ return hidden_states
312
+
313
+
314
+ class CrossAttnDownBlock3D(nn.Module):
315
+ def __init__(
316
+ self,
317
+ in_channels: int,
318
+ out_channels: int,
319
+ temb_channels: int,
320
+ dropout: float = 0.0,
321
+ num_layers: int = 1,
322
+ resnet_eps: float = 1e-6,
323
+ resnet_time_scale_shift: str = "default",
324
+ resnet_act_fn: str = "swish",
325
+ resnet_groups: int = 32,
326
+ resnet_pre_norm: bool = True,
327
+ attn_num_head_channels=1,
328
+ cross_attention_dim=1280,
329
+ output_scale_factor=1.0,
330
+ downsample_padding=1,
331
+ add_downsample=True,
332
+ dual_cross_attention=False,
333
+ use_linear_projection=False,
334
+ only_cross_attention=False,
335
+ upcast_attention=False,
336
+ unet_use_cross_frame_attention=False,
337
+ unet_use_temporal_attention=False,
338
+ use_inflated_groupnorm=False,
339
+ use_motion_module=None,
340
+ motion_module_type=None,
341
+ motion_module_kwargs=None,
342
+ add_audio_layer=False,
343
+ audio_condition_method="cross_attn",
344
+ custom_audio_layer: bool = False,
345
+ ):
346
+ super().__init__()
347
+ resnets = []
348
+ attentions = []
349
+ audio_attentions = []
350
+ motion_modules = []
351
+
352
+ self.has_cross_attention = True
353
+ self.attn_num_head_channels = attn_num_head_channels
354
+
355
+ for i in range(num_layers):
356
+ in_channels = in_channels if i == 0 else out_channels
357
+ resnets.append(
358
+ ResnetBlock3D(
359
+ in_channels=in_channels,
360
+ out_channels=out_channels,
361
+ temb_channels=temb_channels,
362
+ eps=resnet_eps,
363
+ groups=resnet_groups,
364
+ dropout=dropout,
365
+ time_embedding_norm=resnet_time_scale_shift,
366
+ non_linearity=resnet_act_fn,
367
+ output_scale_factor=output_scale_factor,
368
+ pre_norm=resnet_pre_norm,
369
+ use_inflated_groupnorm=use_inflated_groupnorm,
370
+ )
371
+ )
372
+ if dual_cross_attention:
373
+ raise NotImplementedError
374
+ attentions.append(
375
+ Transformer3DModel(
376
+ attn_num_head_channels,
377
+ out_channels // attn_num_head_channels,
378
+ in_channels=out_channels,
379
+ num_layers=1,
380
+ cross_attention_dim=cross_attention_dim,
381
+ norm_num_groups=resnet_groups,
382
+ use_linear_projection=use_linear_projection,
383
+ only_cross_attention=only_cross_attention,
384
+ upcast_attention=upcast_attention,
385
+ use_motion_module=use_motion_module,
386
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
387
+ unet_use_temporal_attention=unet_use_temporal_attention,
388
+ add_audio_layer=add_audio_layer,
389
+ audio_condition_method=audio_condition_method,
390
+ )
391
+ )
392
+ audio_attentions.append(
393
+ Transformer3DModel(
394
+ attn_num_head_channels,
395
+ out_channels // attn_num_head_channels,
396
+ in_channels=out_channels,
397
+ num_layers=1,
398
+ cross_attention_dim=cross_attention_dim,
399
+ norm_num_groups=resnet_groups,
400
+ use_linear_projection=use_linear_projection,
401
+ only_cross_attention=only_cross_attention,
402
+ upcast_attention=upcast_attention,
403
+ use_motion_module=use_motion_module,
404
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
405
+ unet_use_temporal_attention=unet_use_temporal_attention,
406
+ add_audio_layer=add_audio_layer,
407
+ audio_condition_method=audio_condition_method,
408
+ custom_audio_layer=True,
409
+ )
410
+ if custom_audio_layer
411
+ else None
412
+ )
413
+ motion_modules.append(
414
+ get_motion_module(
415
+ in_channels=out_channels,
416
+ motion_module_type=motion_module_type,
417
+ motion_module_kwargs=motion_module_kwargs,
418
+ )
419
+ if use_motion_module
420
+ else None
421
+ )
422
+
423
+ self.attentions = nn.ModuleList(attentions)
424
+ self.audio_attentions = nn.ModuleList(audio_attentions)
425
+ self.resnets = nn.ModuleList(resnets)
426
+ self.motion_modules = nn.ModuleList(motion_modules)
427
+
428
+ if add_downsample:
429
+ self.downsamplers = nn.ModuleList(
430
+ [
431
+ Downsample3D(
432
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
433
+ )
434
+ ]
435
+ )
436
+ else:
437
+ self.downsamplers = None
438
+
439
+ self.gradient_checkpointing = False
440
+
441
+ def forward(self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None):
442
+ output_states = ()
443
+
444
+ for resnet, attn, audio_attn, motion_module in zip(
445
+ self.resnets, self.attentions, self.audio_attentions, self.motion_modules
446
+ ):
447
+ if self.training and self.gradient_checkpointing:
448
+
449
+ def create_custom_forward(module, return_dict=None):
450
+ def custom_forward(*inputs):
451
+ if return_dict is not None:
452
+ return module(*inputs, return_dict=return_dict)
453
+ else:
454
+ return module(*inputs)
455
+
456
+ return custom_forward
457
+
458
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
459
+ hidden_states = torch.utils.checkpoint.checkpoint(
460
+ create_custom_forward(attn, return_dict=False),
461
+ hidden_states,
462
+ encoder_hidden_states,
463
+ )[0]
464
+ if motion_module is not None:
465
+ hidden_states = torch.utils.checkpoint.checkpoint(
466
+ create_custom_forward(motion_module),
467
+ hidden_states.requires_grad_(),
468
+ temb,
469
+ encoder_hidden_states,
470
+ )
471
+
472
+ else:
473
+ hidden_states = resnet(hidden_states, temb)
474
+ hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
475
+
476
+ hidden_states = (
477
+ audio_attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
478
+ if audio_attn is not None
479
+ else hidden_states
480
+ )
481
+
482
+ # add motion module
483
+ hidden_states = (
484
+ motion_module(hidden_states, temb, encoder_hidden_states=encoder_hidden_states)
485
+ if motion_module is not None
486
+ else hidden_states
487
+ )
488
+
489
+ output_states += (hidden_states,)
490
+
491
+ if self.downsamplers is not None:
492
+ for downsampler in self.downsamplers:
493
+ hidden_states = downsampler(hidden_states)
494
+
495
+ output_states += (hidden_states,)
496
+
497
+ return hidden_states, output_states
498
+
499
+
500
+ class DownBlock3D(nn.Module):
501
+ def __init__(
502
+ self,
503
+ in_channels: int,
504
+ out_channels: int,
505
+ temb_channels: int,
506
+ dropout: float = 0.0,
507
+ num_layers: int = 1,
508
+ resnet_eps: float = 1e-6,
509
+ resnet_time_scale_shift: str = "default",
510
+ resnet_act_fn: str = "swish",
511
+ resnet_groups: int = 32,
512
+ resnet_pre_norm: bool = True,
513
+ output_scale_factor=1.0,
514
+ add_downsample=True,
515
+ downsample_padding=1,
516
+ use_inflated_groupnorm=False,
517
+ use_motion_module=None,
518
+ motion_module_type=None,
519
+ motion_module_kwargs=None,
520
+ ):
521
+ super().__init__()
522
+ resnets = []
523
+ motion_modules = []
524
+
525
+ for i in range(num_layers):
526
+ in_channels = in_channels if i == 0 else out_channels
527
+ resnets.append(
528
+ ResnetBlock3D(
529
+ in_channels=in_channels,
530
+ out_channels=out_channels,
531
+ temb_channels=temb_channels,
532
+ eps=resnet_eps,
533
+ groups=resnet_groups,
534
+ dropout=dropout,
535
+ time_embedding_norm=resnet_time_scale_shift,
536
+ non_linearity=resnet_act_fn,
537
+ output_scale_factor=output_scale_factor,
538
+ pre_norm=resnet_pre_norm,
539
+ use_inflated_groupnorm=use_inflated_groupnorm,
540
+ )
541
+ )
542
+ motion_modules.append(
543
+ get_motion_module(
544
+ in_channels=out_channels,
545
+ motion_module_type=motion_module_type,
546
+ motion_module_kwargs=motion_module_kwargs,
547
+ )
548
+ if use_motion_module
549
+ else None
550
+ )
551
+
552
+ self.resnets = nn.ModuleList(resnets)
553
+ self.motion_modules = nn.ModuleList(motion_modules)
554
+
555
+ if add_downsample:
556
+ self.downsamplers = nn.ModuleList(
557
+ [
558
+ Downsample3D(
559
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
560
+ )
561
+ ]
562
+ )
563
+ else:
564
+ self.downsamplers = None
565
+
566
+ self.gradient_checkpointing = False
567
+
568
+ def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
569
+ output_states = ()
570
+
571
+ for resnet, motion_module in zip(self.resnets, self.motion_modules):
572
+ if self.training and self.gradient_checkpointing:
573
+
574
+ def create_custom_forward(module):
575
+ def custom_forward(*inputs):
576
+ return module(*inputs)
577
+
578
+ return custom_forward
579
+
580
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
581
+ if motion_module is not None:
582
+ hidden_states = torch.utils.checkpoint.checkpoint(
583
+ create_custom_forward(motion_module),
584
+ hidden_states.requires_grad_(),
585
+ temb,
586
+ encoder_hidden_states,
587
+ )
588
+ else:
589
+ hidden_states = resnet(hidden_states, temb)
590
+
591
+ # add motion module
592
+ hidden_states = (
593
+ motion_module(hidden_states, temb, encoder_hidden_states=encoder_hidden_states)
594
+ if motion_module is not None
595
+ else hidden_states
596
+ )
597
+
598
+ output_states += (hidden_states,)
599
+
600
+ if self.downsamplers is not None:
601
+ for downsampler in self.downsamplers:
602
+ hidden_states = downsampler(hidden_states)
603
+
604
+ output_states += (hidden_states,)
605
+
606
+ return hidden_states, output_states
607
+
608
+
609
+ class CrossAttnUpBlock3D(nn.Module):
610
+ def __init__(
611
+ self,
612
+ in_channels: int,
613
+ out_channels: int,
614
+ prev_output_channel: int,
615
+ temb_channels: int,
616
+ dropout: float = 0.0,
617
+ num_layers: int = 1,
618
+ resnet_eps: float = 1e-6,
619
+ resnet_time_scale_shift: str = "default",
620
+ resnet_act_fn: str = "swish",
621
+ resnet_groups: int = 32,
622
+ resnet_pre_norm: bool = True,
623
+ attn_num_head_channels=1,
624
+ cross_attention_dim=1280,
625
+ output_scale_factor=1.0,
626
+ add_upsample=True,
627
+ dual_cross_attention=False,
628
+ use_linear_projection=False,
629
+ only_cross_attention=False,
630
+ upcast_attention=False,
631
+ unet_use_cross_frame_attention=False,
632
+ unet_use_temporal_attention=False,
633
+ use_inflated_groupnorm=False,
634
+ use_motion_module=None,
635
+ motion_module_type=None,
636
+ motion_module_kwargs=None,
637
+ add_audio_layer=False,
638
+ audio_condition_method="cross_attn",
639
+ custom_audio_layer=False,
640
+ ):
641
+ super().__init__()
642
+ resnets = []
643
+ attentions = []
644
+ audio_attentions = []
645
+ motion_modules = []
646
+
647
+ self.has_cross_attention = True
648
+ self.attn_num_head_channels = attn_num_head_channels
649
+
650
+ for i in range(num_layers):
651
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
652
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
653
+
654
+ resnets.append(
655
+ ResnetBlock3D(
656
+ in_channels=resnet_in_channels + res_skip_channels,
657
+ out_channels=out_channels,
658
+ temb_channels=temb_channels,
659
+ eps=resnet_eps,
660
+ groups=resnet_groups,
661
+ dropout=dropout,
662
+ time_embedding_norm=resnet_time_scale_shift,
663
+ non_linearity=resnet_act_fn,
664
+ output_scale_factor=output_scale_factor,
665
+ pre_norm=resnet_pre_norm,
666
+ use_inflated_groupnorm=use_inflated_groupnorm,
667
+ )
668
+ )
669
+ if dual_cross_attention:
670
+ raise NotImplementedError
671
+ attentions.append(
672
+ Transformer3DModel(
673
+ attn_num_head_channels,
674
+ out_channels // attn_num_head_channels,
675
+ in_channels=out_channels,
676
+ num_layers=1,
677
+ cross_attention_dim=cross_attention_dim,
678
+ norm_num_groups=resnet_groups,
679
+ use_linear_projection=use_linear_projection,
680
+ only_cross_attention=only_cross_attention,
681
+ upcast_attention=upcast_attention,
682
+ use_motion_module=use_motion_module,
683
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
684
+ unet_use_temporal_attention=unet_use_temporal_attention,
685
+ add_audio_layer=add_audio_layer,
686
+ audio_condition_method=audio_condition_method,
687
+ )
688
+ )
689
+ audio_attentions.append(
690
+ Transformer3DModel(
691
+ attn_num_head_channels,
692
+ out_channels // attn_num_head_channels,
693
+ in_channels=out_channels,
694
+ num_layers=1,
695
+ cross_attention_dim=cross_attention_dim,
696
+ norm_num_groups=resnet_groups,
697
+ use_linear_projection=use_linear_projection,
698
+ only_cross_attention=only_cross_attention,
699
+ upcast_attention=upcast_attention,
700
+ use_motion_module=use_motion_module,
701
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
702
+ unet_use_temporal_attention=unet_use_temporal_attention,
703
+ add_audio_layer=add_audio_layer,
704
+ audio_condition_method=audio_condition_method,
705
+ custom_audio_layer=True,
706
+ )
707
+ if custom_audio_layer
708
+ else None
709
+ )
710
+ motion_modules.append(
711
+ get_motion_module(
712
+ in_channels=out_channels,
713
+ motion_module_type=motion_module_type,
714
+ motion_module_kwargs=motion_module_kwargs,
715
+ )
716
+ if use_motion_module
717
+ else None
718
+ )
719
+
720
+ self.attentions = nn.ModuleList(attentions)
721
+ self.audio_attentions = nn.ModuleList(audio_attentions)
722
+ self.resnets = nn.ModuleList(resnets)
723
+ self.motion_modules = nn.ModuleList(motion_modules)
724
+
725
+ if add_upsample:
726
+ self.upsamplers = nn.ModuleList([Upsample3D(out_channels, use_conv=True, out_channels=out_channels)])
727
+ else:
728
+ self.upsamplers = None
729
+
730
+ self.gradient_checkpointing = False
731
+
732
+ def forward(
733
+ self,
734
+ hidden_states,
735
+ res_hidden_states_tuple,
736
+ temb=None,
737
+ encoder_hidden_states=None,
738
+ upsample_size=None,
739
+ attention_mask=None,
740
+ ):
741
+ for resnet, attn, audio_attn, motion_module in zip(
742
+ self.resnets, self.attentions, self.audio_attentions, self.motion_modules
743
+ ):
744
+ # pop res hidden states
745
+ res_hidden_states = res_hidden_states_tuple[-1]
746
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
747
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
748
+
749
+ if self.training and self.gradient_checkpointing:
750
+
751
+ def create_custom_forward(module, return_dict=None):
752
+ def custom_forward(*inputs):
753
+ if return_dict is not None:
754
+ return module(*inputs, return_dict=return_dict)
755
+ else:
756
+ return module(*inputs)
757
+
758
+ return custom_forward
759
+
760
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
761
+ hidden_states = torch.utils.checkpoint.checkpoint(
762
+ create_custom_forward(attn, return_dict=False),
763
+ hidden_states,
764
+ encoder_hidden_states,
765
+ )[0]
766
+ if motion_module is not None:
767
+ hidden_states = torch.utils.checkpoint.checkpoint(
768
+ create_custom_forward(motion_module),
769
+ hidden_states.requires_grad_(),
770
+ temb,
771
+ encoder_hidden_states,
772
+ )
773
+
774
+ else:
775
+ hidden_states = resnet(hidden_states, temb)
776
+ hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
777
+ hidden_states = (
778
+ audio_attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
779
+ if audio_attn is not None
780
+ else hidden_states
781
+ )
782
+
783
+ # add motion module
784
+ hidden_states = (
785
+ motion_module(hidden_states, temb, encoder_hidden_states=encoder_hidden_states)
786
+ if motion_module is not None
787
+ else hidden_states
788
+ )
789
+
790
+ if self.upsamplers is not None:
791
+ for upsampler in self.upsamplers:
792
+ hidden_states = upsampler(hidden_states, upsample_size)
793
+
794
+ return hidden_states
795
+
796
+
797
+ class UpBlock3D(nn.Module):
798
+ def __init__(
799
+ self,
800
+ in_channels: int,
801
+ prev_output_channel: int,
802
+ out_channels: int,
803
+ temb_channels: int,
804
+ dropout: float = 0.0,
805
+ num_layers: int = 1,
806
+ resnet_eps: float = 1e-6,
807
+ resnet_time_scale_shift: str = "default",
808
+ resnet_act_fn: str = "swish",
809
+ resnet_groups: int = 32,
810
+ resnet_pre_norm: bool = True,
811
+ output_scale_factor=1.0,
812
+ add_upsample=True,
813
+ use_inflated_groupnorm=False,
814
+ use_motion_module=None,
815
+ motion_module_type=None,
816
+ motion_module_kwargs=None,
817
+ ):
818
+ super().__init__()
819
+ resnets = []
820
+ motion_modules = []
821
+
822
+ for i in range(num_layers):
823
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
824
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
825
+
826
+ resnets.append(
827
+ ResnetBlock3D(
828
+ in_channels=resnet_in_channels + res_skip_channels,
829
+ out_channels=out_channels,
830
+ temb_channels=temb_channels,
831
+ eps=resnet_eps,
832
+ groups=resnet_groups,
833
+ dropout=dropout,
834
+ time_embedding_norm=resnet_time_scale_shift,
835
+ non_linearity=resnet_act_fn,
836
+ output_scale_factor=output_scale_factor,
837
+ pre_norm=resnet_pre_norm,
838
+ use_inflated_groupnorm=use_inflated_groupnorm,
839
+ )
840
+ )
841
+ motion_modules.append(
842
+ get_motion_module(
843
+ in_channels=out_channels,
844
+ motion_module_type=motion_module_type,
845
+ motion_module_kwargs=motion_module_kwargs,
846
+ )
847
+ if use_motion_module
848
+ else None
849
+ )
850
+
851
+ self.resnets = nn.ModuleList(resnets)
852
+ self.motion_modules = nn.ModuleList(motion_modules)
853
+
854
+ if add_upsample:
855
+ self.upsamplers = nn.ModuleList([Upsample3D(out_channels, use_conv=True, out_channels=out_channels)])
856
+ else:
857
+ self.upsamplers = None
858
+
859
+ self.gradient_checkpointing = False
860
+
861
+ def forward(
862
+ self,
863
+ hidden_states,
864
+ res_hidden_states_tuple,
865
+ temb=None,
866
+ upsample_size=None,
867
+ encoder_hidden_states=None,
868
+ ):
869
+ for resnet, motion_module in zip(self.resnets, self.motion_modules):
870
+ # pop res hidden states
871
+ res_hidden_states = res_hidden_states_tuple[-1]
872
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
873
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
874
+
875
+ if self.training and self.gradient_checkpointing:
876
+
877
+ def create_custom_forward(module):
878
+ def custom_forward(*inputs):
879
+ return module(*inputs)
880
+
881
+ return custom_forward
882
+
883
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
884
+ if motion_module is not None:
885
+ hidden_states = torch.utils.checkpoint.checkpoint(
886
+ create_custom_forward(motion_module),
887
+ hidden_states.requires_grad_(),
888
+ temb,
889
+ encoder_hidden_states,
890
+ )
891
+ else:
892
+ hidden_states = resnet(hidden_states, temb)
893
+ hidden_states = (
894
+ motion_module(hidden_states, temb, encoder_hidden_states=encoder_hidden_states)
895
+ if motion_module is not None
896
+ else hidden_states
897
+ )
898
+
899
+ if self.upsamplers is not None:
900
+ for upsampler in self.upsamplers:
901
+ hidden_states = upsampler(hidden_states, upsample_size)
902
+
903
+ return hidden_states
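
A hypothetical construction sketch for the `get_down_block` factory above; the argument values are illustrative only, not taken from a LatentSync config file.

```python
# Hypothetical sketch: building one cross-attention down block via the factory above.
# All argument values are illustrative; real values come from the UNet config.
block = get_down_block(
    "CrossAttnDownBlock3D",
    num_layers=2,
    in_channels=320,
    out_channels=640,
    temb_channels=1280,
    add_downsample=True,
    resnet_eps=1e-5,
    resnet_act_fn="silu",
    attn_num_head_channels=8,
    resnet_groups=32,
    cross_attention_dim=384,
    downsample_padding=1,
)
print(sum(p.numel() for p in block.parameters()))  # parameter count of the block
```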
latentsync/models/utils.py ADDED
@@ -0,0 +1,19 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ def zero_module(module):
16
+ # Zero out the parameters of a module and return it.
17
+ for p in module.parameters():
18
+ p.detach().zero_()
19
+ return module
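`zero_module` is commonly used to zero-initialize the last projection of a newly added branch so that, at the start of training, the branch outputs zeros and the pretrained backbone's behavior is untouched. A small self-contained illustration (the `proj_out` layer here is a hypothetical example, not a layer from this repo):

    import torch
    from torch import nn

    def zero_module(module):
        # Zero out the parameters of a module and return it.
        for p in module.parameters():
            p.detach().zero_()
        return module

    proj_out = zero_module(nn.Conv2d(320, 4, kernel_size=1))  # hypothetical output projection
    x = torch.randn(1, 320, 8, 8)
    print(proj_out(x).abs().max())  # prints a zero tensor: the new branch contributes nothing initially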
latentsync/pipelines/lipsync_pipeline.py ADDED
@@ -0,0 +1,470 @@
1
+ # Adapted from https://github.com/guoyww/AnimateDiff/blob/main/animatediff/pipelines/pipeline_animation.py
2
+
3
+ import inspect
4
+ import os
5
+ import shutil
6
+ from typing import Callable, List, Optional, Union
7
+ import subprocess
8
+
9
+ import numpy as np
10
+ import torch
11
+ import torchvision
12
+
13
+ from diffusers.utils import is_accelerate_available
14
+ from packaging import version
15
+
16
+ from diffusers.configuration_utils import FrozenDict
17
+ from diffusers.models import AutoencoderKL
18
+ from diffusers.pipeline_utils import DiffusionPipeline
19
+ from diffusers.schedulers import (
20
+ DDIMScheduler,
21
+ DPMSolverMultistepScheduler,
22
+ EulerAncestralDiscreteScheduler,
23
+ EulerDiscreteScheduler,
24
+ LMSDiscreteScheduler,
25
+ PNDMScheduler,
26
+ )
27
+ from diffusers.utils import deprecate, logging
28
+
29
+ from einops import rearrange
30
+
31
+ from ..models.unet import UNet3DConditionModel
32
+ from ..utils.image_processor import ImageProcessor
33
+ from ..utils.util import read_video, read_audio, write_video
34
+ from ..whisper.audio2feature import Audio2Feature
35
+ import tqdm
36
+ import soundfile as sf
37
+
38
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
39
+
40
+
41
+ class LipsyncPipeline(DiffusionPipeline):
42
+ _optional_components = []
43
+
44
+ def __init__(
45
+ self,
46
+ vae: AutoencoderKL,
47
+ audio_encoder: Audio2Feature,
48
+ unet: UNet3DConditionModel,
49
+ scheduler: Union[
50
+ DDIMScheduler,
51
+ PNDMScheduler,
52
+ LMSDiscreteScheduler,
53
+ EulerDiscreteScheduler,
54
+ EulerAncestralDiscreteScheduler,
55
+ DPMSolverMultistepScheduler,
56
+ ],
57
+ ):
58
+ super().__init__()
59
+
60
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
61
+ deprecation_message = (
62
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
63
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
64
+ "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
65
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
66
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
67
+ " file"
68
+ )
69
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
70
+ new_config = dict(scheduler.config)
71
+ new_config["steps_offset"] = 1
72
+ scheduler._internal_dict = FrozenDict(new_config)
73
+
74
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
75
+ deprecation_message = (
76
+ f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
77
+ " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
78
+ " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
79
+ " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
80
+ " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
81
+ )
82
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
83
+ new_config = dict(scheduler.config)
84
+ new_config["clip_sample"] = False
85
+ scheduler._internal_dict = FrozenDict(new_config)
86
+
87
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
88
+ version.parse(unet.config._diffusers_version).base_version
89
+ ) < version.parse("0.9.0.dev0")
90
+ is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
91
+ if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
92
+ deprecation_message = (
93
+ "The configuration file of the unet has set the default `sample_size` to smaller than"
94
+ " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
95
+ " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
96
+ " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
97
+ " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
98
+ " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
99
+ " in the config might lead to incorrect results in future versions. If you have downloaded this"
100
+ " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
101
+ " the `unet/config.json` file"
102
+ )
103
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
104
+ new_config = dict(unet.config)
105
+ new_config["sample_size"] = 64
106
+ unet._internal_dict = FrozenDict(new_config)
107
+
108
+ self.register_modules(
109
+ vae=vae,
110
+ audio_encoder=audio_encoder,
111
+ unet=unet,
112
+ scheduler=scheduler,
113
+ )
114
+
115
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
116
+
117
+ self.set_progress_bar_config(desc="Steps")
118
+
119
+ def enable_vae_slicing(self):
120
+ self.vae.enable_slicing()
121
+
122
+ def disable_vae_slicing(self):
123
+ self.vae.disable_slicing()
124
+
125
+ def enable_sequential_cpu_offload(self, gpu_id=0):
126
+ if is_accelerate_available():
127
+ from accelerate import cpu_offload
128
+ else:
129
+ raise ImportError("Please install accelerate via `pip install accelerate`")
130
+
131
+ device = torch.device(f"cuda:{gpu_id}")
132
+
133
+ for cpu_offloaded_model in [self.unet, self.vae]:  # this pipeline registers no text encoder
134
+ if cpu_offloaded_model is not None:
135
+ cpu_offload(cpu_offloaded_model, device)
136
+
137
+ @property
138
+ def _execution_device(self):
139
+ if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
140
+ return self.device
141
+ for module in self.unet.modules():
142
+ if (
143
+ hasattr(module, "_hf_hook")
144
+ and hasattr(module._hf_hook, "execution_device")
145
+ and module._hf_hook.execution_device is not None
146
+ ):
147
+ return torch.device(module._hf_hook.execution_device)
148
+ return self.device
149
+
150
+ def decode_latents(self, latents):
151
+ latents = latents / self.vae.config.scaling_factor + self.vae.config.shift_factor
152
+ latents = rearrange(latents, "b c f h w -> (b f) c h w")
153
+ decoded_latents = self.vae.decode(latents).sample
154
+ return decoded_latents
155
+
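`decode_latents` undoes the affine mapping applied after encoding: encoding stores `(z - shift_factor) * scaling_factor`, and decoding recovers `z` with `latents / scaling_factor + shift_factor` before calling the VAE decoder. A quick sanity check of that inversion with illustrative scalars (read the real values from `vae.config`; these numbers are assumptions):

    import torch

    scaling_factor, shift_factor = 0.18215, 0.0   # example values only
    z = torch.randn(2, 4, 32, 32)                 # pretend VAE posterior sample

    stored = (z - shift_factor) * scaling_factor        # what prepare_*_latents keeps
    restored = stored / scaling_factor + shift_factor   # what decode_latents feeds to vae.decode
    print(torch.allclose(restored, z))                  # True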
156
+ def prepare_extra_step_kwargs(self, generator, eta):
157
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
158
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
159
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
160
+ # and should be between [0, 1]
161
+
162
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
163
+ extra_step_kwargs = {}
164
+ if accepts_eta:
165
+ extra_step_kwargs["eta"] = eta
166
+
167
+ # check if the scheduler accepts generator
168
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
169
+ if accepts_generator:
170
+ extra_step_kwargs["generator"] = generator
171
+ return extra_step_kwargs
172
+
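`prepare_extra_step_kwargs` forwards `eta` and `generator` only when the scheduler's `step` signature accepts them, which is what lets one pipeline drive schedulers with different APIs. The same introspection pattern in isolation, with a toy `step` function standing in for a scheduler:

    import inspect

    def step(model_output, timestep, sample, eta=0.0):  # toy stand-in for scheduler.step
        return sample

    accepted = set(inspect.signature(step).parameters.keys())
    extra_step_kwargs = {}
    if "eta" in accepted:
        extra_step_kwargs["eta"] = 0.0
    if "generator" in accepted:      # the toy step has no generator arg, so this is skipped
        extra_step_kwargs["generator"] = None
    print(extra_step_kwargs)         # {'eta': 0.0}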
173
+ def check_inputs(self, height, width, callback_steps):
174
+ assert height == width, "Height and width must be equal"
175
+
176
+ if height % 8 != 0 or width % 8 != 0:
177
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
178
+
179
+ if (callback_steps is None) or (
180
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
181
+ ):
182
+ raise ValueError(
183
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
184
+ f" {type(callback_steps)}."
185
+ )
186
+
187
+ def prepare_latents(self, batch_size, num_frames, num_channels_latents, height, width, dtype, device, generator):
188
+ shape = (
189
+ batch_size,
190
+ num_channels_latents,
191
+ 1,
192
+ height // self.vae_scale_factor,
193
+ width // self.vae_scale_factor,
194
+ )
195
+ rand_device = "cpu" if device.type == "mps" else device
196
+ latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype).to(device)
197
+ latents = latents.repeat(1, 1, num_frames, 1, 1)
198
+
199
+ # scale the initial noise by the standard deviation required by the scheduler
200
+ latents = latents * self.scheduler.init_noise_sigma
201
+ return latents
202
+
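`prepare_latents` samples noise for a single frame and repeats it along the frame axis, so every frame of a clip starts from the same noise map (which can help temporal consistency) before being scaled by `init_noise_sigma`. A stripped-down sketch of that repetition with example sizes:

    import torch

    num_frames, latent_h, latent_w = 16, 32, 32
    latents = torch.randn(1, 4, 1, latent_h, latent_w)   # noise for one frame
    latents = latents.repeat(1, 1, num_frames, 1, 1)     # copy it across the frame axis

    print(torch.equal(latents[:, :, 0], latents[:, :, -1]))  # True: all frames share the same noise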
203
+ def prepare_mask_latents(
204
+ self, mask, masked_image, height, width, dtype, device, generator, do_classifier_free_guidance
205
+ ):
206
+ # resize the mask to latents shape as we concatenate the mask to the latents
207
+ # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
208
+ # and half precision
209
+ mask = torch.nn.functional.interpolate(
210
+ mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
211
+ )
212
+ masked_image = masked_image.to(device=device, dtype=dtype)
213
+
214
+ # encode the mask image into latents space so we can concatenate it to the latents
215
+ masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator)
216
+ masked_image_latents = (masked_image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
217
+
218
+ # aligning device to prevent device errors when concatenating it with the latent model input
219
+ masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
220
+ mask = mask.to(device=device, dtype=dtype)
221
+
222
+ # assume batch size = 1
223
+ mask = rearrange(mask, "f c h w -> 1 c f h w")
224
+ masked_image_latents = rearrange(masked_image_latents, "f c h w -> 1 c f h w")
225
+
226
+ mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
227
+ masked_image_latents = (
228
+ torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
229
+ )
230
+ return mask, masked_image_latents
231
+
232
+ def prepare_image_latents(self, images, device, dtype, generator, do_classifier_free_guidance):
233
+ images = images.to(device=device, dtype=dtype)
234
+ image_latents = self.vae.encode(images).latent_dist.sample(generator=generator)
235
+ image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
236
+ image_latents = rearrange(image_latents, "f c h w -> 1 c f h w")
237
+ image_latents = torch.cat([image_latents] * 2) if do_classifier_free_guidance else image_latents
238
+
239
+ return image_latents
240
+
241
+ def set_progress_bar_config(self, **kwargs):
242
+ if not hasattr(self, "_progress_bar_config"):
243
+ self._progress_bar_config = {}
244
+ self._progress_bar_config.update(kwargs)
245
+
246
+ @staticmethod
247
+ def paste_surrounding_pixels_back(decoded_latents, pixel_values, masks, device, weight_dtype):
248
+ # Paste the surrounding pixels back, because we only want to change the mouth region
249
+ pixel_values = pixel_values.to(device=device, dtype=weight_dtype)
250
+ masks = masks.to(device=device, dtype=weight_dtype)
251
+ combined_pixel_values = decoded_latents * masks + pixel_values * (1 - masks)
252
+ return combined_pixel_values
253
+
254
+ @staticmethod
255
+ def pixel_values_to_images(pixel_values: torch.Tensor):
256
+ pixel_values = rearrange(pixel_values, "f c h w -> f h w c")
257
+ pixel_values = (pixel_values / 2 + 0.5).clamp(0, 1)
258
+ images = (pixel_values * 255).to(torch.uint8)
259
+ images = images.cpu().numpy()
260
+ return images
261
+
262
+ def affine_transform_video(self, video_path):
263
+ video_frames = read_video(video_path, use_decord=False)
264
+ faces = []
265
+ boxes = []
266
+ affine_matrices = []
267
+ print(f"Affine transforming {len(video_frames)} faces...")
268
+ for frame in tqdm.tqdm(video_frames):
269
+ face, box, affine_matrix = self.image_processor.affine_transform(frame)
270
+ faces.append(face)
271
+ boxes.append(box)
272
+ affine_matrices.append(affine_matrix)
273
+
274
+ faces = torch.stack(faces)
275
+ return faces, video_frames, boxes, affine_matrices
276
+
277
+ def restore_video(self, faces, video_frames, boxes, affine_matrices):
278
+ video_frames = video_frames[: faces.shape[0]]
279
+ out_frames = []
280
+ for index, face in enumerate(faces):
281
+ x1, y1, x2, y2 = boxes[index]
282
+ height = int(y2 - y1)
283
+ width = int(x2 - x1)
284
+ face = torchvision.transforms.functional.resize(face, size=(height, width), antialias=True)
285
+ face = rearrange(face, "c h w -> h w c")
286
+ face = (face / 2 + 0.5).clamp(0, 1)
287
+ face = (face * 255).to(torch.uint8).cpu().numpy()
288
+ out_frame = self.image_processor.restorer.restore_img(video_frames[index], face, affine_matrices[index])
289
+ out_frames.append(out_frame)
290
+ return np.stack(out_frames, axis=0)
291
+
292
+ @torch.no_grad()
293
+ def __call__(
294
+ self,
295
+ video_path: str,
296
+ audio_path: str,
297
+ video_out_path: str,
298
+ video_mask_path: str = None,
299
+ num_frames: int = 16,
300
+ video_fps: int = 25,
301
+ audio_sample_rate: int = 16000,
302
+ height: Optional[int] = None,
303
+ width: Optional[int] = None,
304
+ num_inference_steps: int = 20,
305
+ guidance_scale: float = 1.5,
306
+ weight_dtype: Optional[torch.dtype] = torch.float16,
307
+ eta: float = 0.0,
308
+ mask: str = "fix_mask",
309
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
310
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
311
+ callback_steps: Optional[int] = 1,
312
+ **kwargs,
313
+ ):
314
+ is_train = self.unet.training
315
+ self.unet.eval()
316
+
317
+ # 0. Define call parameters
318
+ batch_size = 1
319
+ device = self._execution_device
320
+ self.image_processor = ImageProcessor(height, mask=mask, device="cuda")
321
+ self.set_progress_bar_config(desc=f"Sample frames: {num_frames}")
322
+
323
+ video_frames, original_video_frames, boxes, affine_matrices = self.affine_transform_video(video_path)
324
+ audio_samples = read_audio(audio_path)
325
+
326
+ # 1. Default height and width to unet
327
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
328
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
329
+
330
+ # 2. Check inputs
331
+ self.check_inputs(height, width, callback_steps)
332
+
333
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
334
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
335
+ # corresponds to doing no classifier free guidance.
336
+ do_classifier_free_guidance = guidance_scale > 1.0
337
+
338
+ # 3. set timesteps
339
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
340
+ timesteps = self.scheduler.timesteps
341
+
342
+ # 4. Prepare extra step kwargs.
343
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
344
+
345
+ self.video_fps = video_fps
346
+
347
+ if self.unet.add_audio_layer:
348
+ whisper_feature = self.audio_encoder.audio2feat(audio_path)
349
+ whisper_chunks = self.audio_encoder.feature2chunks(feature_array=whisper_feature, fps=video_fps)
350
+
351
+ num_inferences = min(len(video_frames), len(whisper_chunks)) // num_frames
352
+ else:
353
+ num_inferences = len(video_frames) // num_frames
354
+
355
+ synced_video_frames = []
356
+ masked_video_frames = []
357
+
358
+ num_channels_latents = self.vae.config.latent_channels
359
+
360
+ # Prepare latent variables
361
+ all_latents = self.prepare_latents(
362
+ batch_size,
363
+ num_frames * num_inferences,
364
+ num_channels_latents,
365
+ height,
366
+ width,
367
+ weight_dtype,
368
+ device,
369
+ generator,
370
+ )
371
+
372
+ for i in tqdm.tqdm(range(num_inferences), desc="Doing inference..."):
373
+ if self.unet.add_audio_layer:
374
+ audio_embeds = torch.stack(whisper_chunks[i * num_frames : (i + 1) * num_frames])
375
+ audio_embeds = audio_embeds.to(device, dtype=weight_dtype)
376
+ if do_classifier_free_guidance:
377
+ empty_audio_embeds = torch.zeros_like(audio_embeds)
378
+ audio_embeds = torch.cat([empty_audio_embeds, audio_embeds])
379
+ else:
380
+ audio_embeds = None
381
+ inference_video_frames = video_frames[i * num_frames : (i + 1) * num_frames]
382
+ latents = all_latents[:, :, i * num_frames : (i + 1) * num_frames]
383
+ pixel_values, masked_pixel_values, masks = self.image_processor.prepare_masks_and_masked_images(
384
+ inference_video_frames, affine_transform=False
385
+ )
386
+
387
+ # 7. Prepare mask latent variables
388
+ mask_latents, masked_image_latents = self.prepare_mask_latents(
389
+ masks,
390
+ masked_pixel_values,
391
+ height,
392
+ width,
393
+ weight_dtype,
394
+ device,
395
+ generator,
396
+ do_classifier_free_guidance,
397
+ )
398
+
399
+ # 8. Prepare image latents
400
+ image_latents = self.prepare_image_latents(
401
+ pixel_values,
402
+ device,
403
+ weight_dtype,
404
+ generator,
405
+ do_classifier_free_guidance,
406
+ )
407
+
408
+ # 9. Denoising loop
409
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
410
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
411
+ for j, t in enumerate(timesteps):
412
+ # expand the latents if we are doing classifier free guidance
413
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
414
+
415
+ # concat latents, mask, masked_image_latents in the channel dimension
416
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
417
+ latent_model_input = torch.cat(
418
+ [latent_model_input, mask_latents, masked_image_latents, image_latents], dim=1
419
+ )
420
+
421
+ # predict the noise residual
422
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=audio_embeds).sample
423
+
424
+ # perform guidance
425
+ if do_classifier_free_guidance:
426
+ noise_pred_uncond, noise_pred_audio = noise_pred.chunk(2)
427
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_audio - noise_pred_uncond)
428
+
429
+ # compute the previous noisy sample x_t -> x_t-1
430
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
431
+
432
+ # call the callback, if provided
433
+ if j == len(timesteps) - 1 or ((j + 1) > num_warmup_steps and (j + 1) % self.scheduler.order == 0):
434
+ progress_bar.update()
435
+ if callback is not None and j % callback_steps == 0:
436
+ callback(j, t, latents)
437
+
438
+ # Recover the pixel values
439
+ decoded_latents = self.decode_latents(latents)
440
+ decoded_latents = self.paste_surrounding_pixels_back(
441
+ decoded_latents, pixel_values, 1 - masks, device, weight_dtype
442
+ )
443
+ synced_video_frames.append(decoded_latents)
444
+ masked_video_frames.append(masked_pixel_values)
445
+
446
+ synced_video_frames = self.restore_video(
447
+ torch.cat(synced_video_frames), original_video_frames, boxes, affine_matrices
448
+ )
449
+ masked_video_frames = self.restore_video(
450
+ torch.cat(masked_video_frames), original_video_frames, boxes, affine_matrices
451
+ )
452
+
453
+ audio_samples_remain_length = int(synced_video_frames.shape[0] / video_fps * audio_sample_rate)
454
+ audio_samples = audio_samples[:audio_samples_remain_length].cpu().numpy()
455
+
456
+ if is_train:
457
+ self.unet.train()
458
+
459
+ temp_dir = "temp"
460
+ if os.path.exists(temp_dir):
461
+ shutil.rmtree(temp_dir)
462
+ os.makedirs(temp_dir, exist_ok=True)
463
+
464
+ write_video(os.path.join(temp_dir, "video.mp4"), synced_video_frames, fps=video_fps)
465
+ # write_video(video_mask_path, masked_video_frames, fps=25)
466
+
467
+ sf.write(os.path.join(temp_dir, "audio.wav"), audio_samples, audio_sample_rate)
468
+
469
+ command = f"ffmpeg -y -loglevel error -nostdin -i {os.path.join(temp_dir, 'video.mp4')} -i {os.path.join(temp_dir, 'audio.wav')} -c:v libx264 -c:a aac -q:v 0 -q:a 0 {video_out_path}"
470
+ subprocess.run(command, shell=True)
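The final mux builds the ffmpeg command as one shell string. A slightly more defensive variant — an alternative sketch, not what the pipeline currently does — passes an argument list instead, so paths containing spaces need no quoting and no shell is spawned; `check=True` also surfaces ffmpeg failures as exceptions:

    import os
    import subprocess

    temp_dir, video_out_path = "temp", "video_out.mp4"   # mirrors the temp files written above
    command = [
        "ffmpeg", "-y", "-loglevel", "error", "-nostdin",
        "-i", os.path.join(temp_dir, "video.mp4"),
        "-i", os.path.join(temp_dir, "audio.wav"),
        "-c:v", "libx264", "-c:a", "aac", "-q:v", "0", "-q:a", "0",
        video_out_path,
    ]
    subprocess.run(command, check=True)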
latentsync/trepa/__init__.py ADDED
@@ -0,0 +1,64 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ import torch.nn.functional as F
17
+ import torch.nn as nn
18
+ from einops import rearrange
19
+ from .third_party.VideoMAEv2.utils import load_videomae_model
20
+
21
+
22
+ class TREPALoss:
23
+ def __init__(
24
+ self,
25
+ device="cuda",
26
+ ckpt_path="/mnt/bn/maliva-gen-ai-v2/chunyu.li/checkpoints/vit_g_hybrid_pt_1200e_ssv2_ft.pth",
27
+ ):
28
+ self.model = load_videomae_model(device, ckpt_path).eval().to(dtype=torch.float16)
29
+ self.model.requires_grad_(False)
30
+ self.bce_loss = nn.BCELoss()
31
+
32
+ def __call__(self, videos_fake, videos_real, loss_type="mse"):
33
+ batch_size = videos_fake.shape[0]
34
+ num_frames = videos_fake.shape[2]
35
+ videos_fake = rearrange(videos_fake.clone(), "b c f h w -> (b f) c h w")
36
+ videos_real = rearrange(videos_real.clone(), "b c f h w -> (b f) c h w")
37
+
38
+ videos_fake = F.interpolate(videos_fake, size=(224, 224), mode="bilinear")
39
+ videos_real = F.interpolate(videos_real, size=(224, 224), mode="bilinear")
40
+
41
+ videos_fake = rearrange(videos_fake, "(b f) c h w -> b c f h w", f=num_frames)
42
+ videos_real = rearrange(videos_real, "(b f) c h w -> b c f h w", f=num_frames)
43
+
44
+ # Because input pixel range is [-1, 1], and model expects pixel range to be [0, 1]
45
+ videos_fake = (videos_fake / 2 + 0.5).clamp(0, 1)
46
+ videos_real = (videos_real / 2 + 0.5).clamp(0, 1)
47
+
48
+ feats_fake = self.model.forward_features(videos_fake)
49
+ feats_real = self.model.forward_features(videos_real)
50
+
51
+ feats_fake = F.normalize(feats_fake, p=2, dim=1)
52
+ feats_real = F.normalize(feats_real, p=2, dim=1)
53
+
54
+ return F.mse_loss(feats_fake, feats_real)
55
+
56
+
57
+ if __name__ == "__main__":
58
+ # input shape: (b, c, f, h, w)
59
+ videos_fake = torch.randn(2, 3, 16, 256, 256, requires_grad=True).to(device="cuda", dtype=torch.float16)
60
+ videos_real = torch.randn(2, 3, 16, 256, 256, requires_grad=True).to(device="cuda", dtype=torch.float16)
61
+
62
+ trepa_loss = TREPALoss(device="cuda")
63
+ loss = trepa_loss(videos_fake, videos_real)
64
+ print(loss)
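`TREPALoss` resizes every frame to 224x224 by folding the frame axis into the batch axis, because `F.interpolate` with `mode="bilinear"` operates on 4D tensors. The rearrange-resize-rearrange pattern on its own, with a dummy tensor and no VideoMAE model involved:

    import torch
    import torch.nn.functional as F
    from einops import rearrange

    video = torch.randn(2, 3, 16, 256, 256)                 # (b, c, f, h, w)
    frames = rearrange(video, "b c f h w -> (b f) c h w")   # fold frames into the batch
    frames = F.interpolate(frames, size=(224, 224), mode="bilinear")
    video_224 = rearrange(frames, "(b f) c h w -> b c f h w", f=16)
    print(video_224.shape)                                  # torch.Size([2, 3, 16, 224, 224])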
latentsync/trepa/third_party/VideoMAEv2/__init__.py ADDED
File without changes