Skip to content

Commit f1f0e69

Browse files
committed
feat: Add support for imagebind embeddings with MSRVTT dataset
1 parent 62da752 commit f1f0e69

File tree

2 files changed

+29
-36
lines changed

2 files changed

+29
-36
lines changed

config/main.yaml

+2-1
Original file line numberDiff line numberDiff line change
@@ -169,4 +169,5 @@ asif: # Hyperparameters of asif baseline
169169
# gtr: 768
170170
# dino: 1536
171171
# clip: 1280 (1024 for msrvtt)
172-
# liploc: 256
172+
# liploc: 256
173+
# imagebind: 1024

mmda/get_embeddings.py

+27-35
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""Get feature embeddings for the datasets."""
22

3-
# ruff: noqa: ERA001
3+
# ruff: noqa: ERA001, PLR2004
44
import os
55
import pickle
66
from pathlib import Path
@@ -39,8 +39,8 @@
3939

4040

4141
def get_video_emb(
42-
cfg_dataset: DictConfig, video_dict: dict, use_kaggle: bool = False
43-
) -> dict[str, np.ndarray]:
42+
cfg_dataset: DictConfig, video_dict: dict
43+
) -> tuple[list[str], list[str], list[str]]:
4444
"""Get video embeddings for the videos in the video_dict.
4545
4646
Args:
@@ -49,35 +49,23 @@ def get_video_emb(
4949
use_kaggle: whether to use the kaggle dataset  # NOTE(review): stale — this commit removes the `use_kaggle` parameter from get_video_emb's signature and call site, so this Args entry should have been deleted too
5050
5151
Returns:
52-
video embeddings. dict: video_id -> video_embedding (if use_kaggle)
53-
img_paths. list: list of image paths (if not use_kaggle)
52+
id_order: list of video ids
53+
first_img_paths: list of first image paths
54+
last_img_paths: list of last image paths
5455
"""
55-
# skip image embeddings (CLIP is already done from the dataset)
56-
# load the existing embeddings
57-
if use_kaggle:
58-
video_emb = {}
59-
for video_ids in tqdm(video_dict, desc="Loading video embeddings"):
60-
video_np_path = Path(
61-
cfg_dataset.paths.dataset_path,
62-
f"clip-features-vit-h14/{video_ids}.npy",
63-
)
64-
# only sample the first and last frame
65-
video_np = np.load(video_np_path)[[0, -1], :].reshape(1, -1)
66-
video_emb[video_ids] = video_np
67-
return video_emb
6856
id_order = []
6957
first_img_paths = []
7058
last_img_paths = []
71-
for video_ids in tqdm(sorted(video_dict), desc="loading keyframe paths"):
59+
for video_id in tqdm(sorted(video_dict), desc="loading keyframe paths"):
7260
# video_ids from 7010 to 7990
73-
img_dir = Path(cfg_dataset.paths.dataset_path, "keyframes", video_ids)
61+
img_dir = Path(cfg_dataset.paths.dataset_path, "keyframes", video_id)
7462
num_frames = len(os.listdir(img_dir))
7563
for frame_id in range(0, num_frames, 2):
7664
if frame_id + 1 >= num_frames:
7765
break
7866
first_img_path = img_dir / f"{frame_id:04d}.jpg"
7967
last_img_path = img_dir / f"{frame_id + 1:04d}.jpg"
80-
id_order.append(video_ids)
68+
id_order.append(video_id)
8169
first_img_paths.append(str(first_img_path))
8270
last_img_paths.append(str(last_img_path))
8371
return id_order, first_img_paths, last_img_paths
@@ -135,7 +123,7 @@ def main(cfg: DictConfig) -> None: # noqa: PLR0915, C901, PLR0912
135123
_, captions, video_info_sen_order, video_dict = load_msrvtt(cfg_dataset)
136124

137125
id_order, first_img_paths, last_img_paths = get_video_emb(
138-
cfg_dataset, video_dict, use_kaggle=False
126+
cfg_dataset, video_dict
139127
)
140128

141129
# get audio embeddings
@@ -190,24 +178,20 @@ def main(cfg: DictConfig) -> None: # noqa: PLR0915, C901, PLR0912
190178
audio_np = []
191179
img_np = []
192180
print(len(id_order))
193-
for i in tqdm(range(0, len(id_order), BATCH_SIZE), desc="imagebind"):
181+
for i in tqdm(range(0, len(id_order), BATCH_SIZE), desc="imagebind inference"):
194182
audios = audio_paths[i : i + BATCH_SIZE]
195183
first_images = first_img_paths[i : i + BATCH_SIZE]
196184
last_images = last_img_paths[i : i + BATCH_SIZE]
197-
# audio_embs = imagebind_class.inference_audio(audios).cpu().numpy()
198-
# first_embs = imagebind_class.inference_image(first_images).cpu().numpy()
199-
first_embs, audio_embs = imagebind_class.inference_image_audio(
200-
first_images, audios
201-
)
202-
first_embs = first_embs.cpu().numpy()
203-
audio_embs = audio_embs.cpu().numpy()
185+
audio_embs = imagebind_class.inference_audio(audios).cpu().numpy()
186+
first_embs = imagebind_class.inference_image(first_images).cpu().numpy()
204187
last_embs = imagebind_class.inference_image(last_images).cpu().numpy()
205188
img_embs = np.concatenate([first_embs, last_embs], axis=1)
206189
audio_np.append(audio_embs)
207190
img_np.append(img_embs)
208-
# print(img_embs.shape, audio_embs.shape)
209-
audio_np = np.array(audio_np)
210-
img_np = np.array(img_np)
191+
assert img_embs.shape[1] == 2048, f"img.shape: {img_embs.shape}, {i}"
192+
assert audio_embs.shape[1] == 1024, f"audio.shape: {audio_embs.shape}, {i}"
193+
audio_np = np.concatenate(audio_np, axis=0)
194+
img_np = np.concatenate(img_np, axis=0)
211195
with Path(cfg_dataset.paths.save_path, "MSRVTT_audio_emb_imagebind.pkl").open(
212196
"wb"
213197
) as f:
@@ -218,7 +202,7 @@ def main(cfg: DictConfig) -> None: # noqa: PLR0915, C901, PLR0912
218202
) as f:
219203
pickle.dump(img_np, f)
220204
print("imagebind embeddings saved")
221-
return
205+
# return
222206

223207
shape = video_info_sen_order[0]["audio_np"].shape
224208
audio_np = [
@@ -237,7 +221,15 @@ def main(cfg: DictConfig) -> None: # noqa: PLR0915, C901, PLR0912
237221
print("CLAP embeddings saved")
238222

239223
# get text embeddings
240-
text_emb = imagebind_class.inference_text(captions)
224+
imagebind_class = ImageBindInference()
225+
text_emb = []
226+
for i in tqdm(range(0, len(captions), BATCH_SIZE), desc="imagebind txt"):
227+
text_emb.append(
228+
imagebind_class.inference_text(captions[i : i + BATCH_SIZE])
229+
.cpu()
230+
.numpy()
231+
)
232+
text_emb = np.concatenate(text_emb, axis=0)
241233
with Path(cfg_dataset.paths.save_path, "MSRVTT_text_emb_imagebind.pkl").open(
242234
"wb"
243235
) as f:

0 commit comments

Comments (0)