Still getting video embeddings

d31003 · d31003 · commit 62da752bd1d5 · 2024-10-18T14:08:44.000-05:00
diff --git a/.gitignore b/.gitignore
@@ -171,4 +171,4 @@ plots/*
 # lock files
 *.lock
 .checkpoints/
-ImageBind/
+.assets/
diff --git a/mmda/get_embeddings.py b/mmda/get_embeddings.py
@@ -6,6 +6,8 @@
 from pathlib import Path
 
 import numpy as np
+import torch
+import torchaudio
 from omegaconf import DictConfig
 from tqdm import tqdm
 
@@ -33,7 +35,7 @@
 )
 from mmda.utils.imagebind_utils import ImageBindInference
 
-BATCH_SIZE = 128
+BATCH_SIZE = 256
 
 
 def get_video_emb(
@@ -70,7 +72,7 @@ def get_video_emb(
         # video_ids from 7010 to 7990
         img_dir = Path(cfg_dataset.paths.dataset_path, "keyframes", video_ids)
         num_frames = len(os.listdir(img_dir))
-        for frame_id in range(num_frames, 2):
+        for frame_id in range(0, num_frames, 2):
             if frame_id + 1 >= num_frames:
                 break
             first_img_path = img_dir / f"{frame_id:04d}.jpg"
@@ -135,42 +137,75 @@ def main(cfg: DictConfig) -> None:  # noqa: PLR0915, C901, PLR0912
         id_order, first_img_paths, last_img_paths = get_video_emb(
             cfg_dataset, video_dict, use_kaggle=False
         )
-        first_img_emb = clip_imgs(first_img_paths, BATCH_SIZE)
-        last_img_emb = clip_imgs(last_img_paths, BATCH_SIZE)
-        video_id_emb = np.concatenate([first_img_emb, last_img_emb], axis=1)
-        with Path(cfg_dataset.paths.save_path, "MSRVTT_video_emb_clip.pkl").open(
-            "wb"
-        ) as f:
-            pickle.dump(video_id_emb, f)
-        print("CLIP embeddings saved")
-        with Path(cfg_dataset.paths.save_path, "MSRVTT_ref_video_ids.pkl").open(
-            "wb"
-        ) as f:
-            pickle.dump(id_order, f)
 
         # get audio embeddings
-        audio_paths = []
-        for video_id in id_order:
-            audio_path = str(
-                Path(cfg_dataset.paths.dataset_path, f"TestVideo/{video_id}.wav")
-            )
-            audio_paths.append(audio_path)
+        if not (  # rebuild the cached lists unless all three pickles exist
+            Path(cfg_dataset.paths.save_path, "MSRVTT_id_order.pkl").exists()
+            and Path(cfg_dataset.paths.save_path, "MSRVTT_null_audio.pkl").exists()
+            and Path(cfg_dataset.paths.save_path, "MSRVTT_audio_paths.pkl").exists()
+        ):
+            audio_paths, null_audio = [], []
+            for video_id in tqdm(id_order, desc="process id_order"):
+                audio_path = Path(
+                    cfg_dataset.paths.dataset_path, f"TestVideo/{video_id}.wav"
+                )
+                if (
+                    not audio_path.exists()
+                    or torch.sum(torchaudio.load(str(audio_path))[0]) == 0
+                ):
+                    null_audio.append(True)
+                    # missing or zero-sum audio: use a placeholder wav path
+                    audio_paths.append(".assets/bird_audio.wav")
+                else:
+                    null_audio.append(False)
+                    audio_paths.append(str(audio_path))
+            with Path(cfg_dataset.paths.save_path, "MSRVTT_id_order.pkl").open(
+                "wb"
+            ) as f:
+                pickle.dump(id_order, f)
+            with Path(cfg_dataset.paths.save_path, "MSRVTT_null_audio.pkl").open(
+                "wb"
+            ) as f:
+                pickle.dump(null_audio, f)
+            with Path(cfg_dataset.paths.save_path, "MSRVTT_audio_paths.pkl").open(
+                "wb"
+            ) as f:
+                pickle.dump(audio_paths, f)
+        else:
+            with Path(cfg_dataset.paths.save_path, "MSRVTT_id_order.pkl").open(
+                "rb"
+            ) as f:
+                id_order = pickle.load(f)  # noqa: S301
+            with Path(cfg_dataset.paths.save_path, "MSRVTT_null_audio.pkl").open(
+                "rb"
+            ) as f:
+                null_audio = pickle.load(f)  # noqa: S301
+            with Path(cfg_dataset.paths.save_path, "MSRVTT_audio_paths.pkl").open(
+                "rb"
+            ) as f:
+                audio_paths = pickle.load(f)  # noqa: S301
+
         # inference imagebind
-        imagebind_class = ImageBindInference(device=0)
+        imagebind_class = ImageBindInference()
         audio_np = []
         img_np = []
-        for i in range(len(id_order)), BATCH_SIZE:
+        print(len(id_order))
+        for i in tqdm(range(0, len(id_order), BATCH_SIZE), desc="imagebind"):
             audios = audio_paths[i : i + BATCH_SIZE]
             first_images = first_img_paths[i : i + BATCH_SIZE]
             last_images = last_img_paths[i : i + BATCH_SIZE]
-            audio_embs = imagebind_class.inference_audio(audios).cpu().numpy()
-            first_embs = (
-                imagebind_class.inference_image_only(first_images).cpu().numpy()
+            # audio_embs = imagebind_class.inference_audio(audios).cpu().numpy()
+            # first_embs = imagebind_class.inference_image(first_images).cpu().numpy()
+            first_embs, audio_embs = imagebind_class.inference_image_audio(
+                first_images, audios
             )
-            last_embs = imagebind_class.inference_image_only(last_images).cpu().numpy()
+            first_embs = first_embs.cpu().numpy()
+            audio_embs = audio_embs.cpu().numpy()
+            last_embs = imagebind_class.inference_image(last_images).cpu().numpy()
             img_embs = np.concatenate([first_embs, last_embs], axis=1)
             audio_np.append(audio_embs)
             img_np.append(img_embs)
+            # print(img_embs.shape, audio_embs.shape)
         audio_np = np.array(audio_np)
         img_np = np.array(img_np)
         with Path(cfg_dataset.paths.save_path, "MSRVTT_audio_emb_imagebind.pkl").open(
@@ -183,6 +218,7 @@ def main(cfg: DictConfig) -> None:  # noqa: PLR0915, C901, PLR0912
         ) as f:
             pickle.dump(img_np, f)
         print("imagebind embeddings saved")
+        return
 
         shape = video_info_sen_order[0]["audio_np"].shape
         audio_np = [
@@ -511,4 +547,4 @@ def main(cfg: DictConfig) -> None:  # noqa: PLR0915, C901, PLR0912
 
 if __name__ == "__main__":
     main()
-# CUDA_VISIBLE_DEVICES=0 poetry run python mmda/get_embeddings.py
+# CUDA_VISIBLE_DEVICES=5 poetry run python mmda/get_embeddings.py
diff --git a/mmda/utils/dataset_utils.py b/mmda/utils/dataset_utils.py
@@ -99,7 +99,7 @@ def load_msrvtt(
                 "category": category,
                 "url": url,
             }
-        num_processes = 32
+        num_processes = 64
         p = Pool(processes=num_processes)
         print("num_processes:", num_processes)
         data = p.map(
diff --git a/mmda/utils/imagebind_utils.py b/mmda/utils/imagebind_utils.py
@@ -10,13 +10,13 @@
 
 
 class ImageBindInference:
-    def __init__(self, device: int = 0):
-        self.device = f"cuda:{device}" if torch.cuda.is_available() else "cpu"
+    def __init__(self):  # picks CUDA when available, otherwise CPU
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model = imagebind_model.imagebind_huge(pretrained=True)
         self.model.eval()
         self.model.to(self.device)
 
-    def inference_audio(self, image_paths, audio_paths):
+    def inference_audio(self, audio_paths):
         inputs = {
             ModalityType.AUDIO: load_and_transform_audio_data(audio_paths, self.device),
         }
@@ -44,3 +44,15 @@ def inference_text(self, text_list):
         with torch.no_grad():
             embeddings = self.model(inputs)
             return embeddings[ModalityType.TEXT]
+
+    def inference_image_audio(self, image_paths, audio_paths):
+        """Embed images and audio in one forward pass; return (vision, audio)."""
+        vision = load_and_transform_vision_data(image_paths, self.device)
+        audio = load_and_transform_audio_data(audio_paths, self.device)
+        inputs = {ModalityType.VISION: vision, ModalityType.AUDIO: audio}
+
+        # Inference only, so gradient tracking is switched off for the call.
+        with torch.no_grad():
+            embeddings = self.model(inputs)
+
+        return embeddings[ModalityType.VISION], embeddings[ModalityType.AUDIO]
diff --git a/mmda/utils/mstvtt_ds_class.py b/mmda/utils/mstvtt_ds_class.py
@@ -105,6 +105,7 @@ def load_data(self) -> None:
         ]
 
         # get video idx which has no audio. 355 in total.
+        # TODO: video7010's wav is all zeros; the None check below may miss it.
         null_audio_idx = []
         for idx, video_info in enumerate(self.video_info_sen_order):
             if video_info["audio_np"] is None and idx % self.step_size == 0:
@@ -114,23 +115,23 @@ def load_data(self) -> None:
             self.cfg_dataset.paths.save_path
             + f"MSRVTT_text_emb_{self.img2txt_encoder}.pkl"
         ).open("rb") as file:
-            self.txt2img_emb = pickle.load(file)  # (59800, 1280) # noqa: S301
+            self.txt2img_emb = pickle.load(file)  # (59800,) # noqa: S301
         with Path(
             self.cfg_dataset.paths.save_path
             + f"MSRVTT_video_emb_{self.img2txt_encoder}.pkl"
         ).open("rb") as file:
-            self.img2txt_emb = pickle.load(file)  # noqa: S301
+            self.img2txt_emb = pickle.load(file)  # (47392,) # noqa: S301
         print(self.img2txt_emb.shape)
         with Path(
             self.cfg_dataset.paths.save_path
             + f"MSRVTT_text_emb_{self.audio2txt_encoder}.pkl"
         ).open("rb") as file:
-            self.txt2audio_emb = pickle.load(file)  # (59800, 512) # noqa: S301
+            self.txt2audio_emb = pickle.load(file)  # (59800,) # noqa: S301
         with Path(
             self.cfg_dataset.paths.save_path
             + f"MSRVTT_audio_emb_{self.audio2txt_encoder}.pkl"
         ).open("rb") as file:
-            self.audio2txt_emb = pickle.load(file)  # (???, 512) # noqa: S301
+            self.audio2txt_emb = pickle.load(file)  # (47392,) # noqa: S301
         print(self.audio2txt_emb.shape)
 
         # normalize all the embeddings to have unit norm using L2 normalization
diff --git a/unused/test_imgebind.py b/unused/test_imgebind.py

Original file line number	Diff line number	Diff line change
`@@ -99,7 +99,7 @@ def load_msrvtt(`
`99`	`99`	`"category": category,`
`100`	`100`	`"url": url,`
`101`	`101`	`}`
`102`		`- num_processes = 32`
	`102`	`+ num_processes = 64`
`103`	`103`	`p = Pool(processes=num_processes)`
`104`	`104`	`print("num_processes:", num_processes)`
`105`	`105`	`data = p.map(`