From 6bcdc96932328b7fad5541262ed1a8dbdbe3cb29 Mon Sep 17 00:00:00 2001 From: d31003 Date: Wed, 13 Nov 2024 14:51:52 -0600 Subject: [PATCH 01/10] Create branch --- bash_scripts/imagenet_script.sh | 18 +++++++++--------- config/main.yaml | 4 ++-- mmda/get_embeddings.py | 3 ++- mmda/utils/dataset_utils.py | 4 +++- mmda/utils/embed_data.py | 3 +++ 5 files changed, 19 insertions(+), 13 deletions(-) diff --git a/bash_scripts/imagenet_script.sh b/bash_scripts/imagenet_script.sh index c2f0561..a60f887 100644 --- a/bash_scripts/imagenet_script.sh +++ b/bash_scripts/imagenet_script.sh @@ -1,11 +1,11 @@ -# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=10 -# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=25 -# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=50 -# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=100 -# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=150 -# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=200 -# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=500 -# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=700 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=10 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=25 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=50 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=100 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=150 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=200 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=500 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=700 # CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet train_test_ratio=0.1 imagenet.sim_dim=10 # CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet train_test_ratio=0.1 imagenet.sim_dim=25 # CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet train_test_ratio=0.1 imagenet.sim_dim=50 @@ -17,4 +17,4 @@ # # classification # CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=imagenet -CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=imagenet imagenet.shuffle=True +# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=imagenet imagenet.shuffle=True diff --git a/config/main.yaml b/config/main.yaml index 80ce620..736da6c 100644 --- a/config/main.yaml +++ b/config/main.yaml @@ -92,8 +92,8 @@ musiccaps: imagenet: sim_dim: 700 # dimension of the similarity score and the CCA transformation equal_weights: False - img_encoder: "dino" - text_encoder: "gtr" + img_encoder: "clip" + text_encoder: "clip" train_test_ratios: [0.7] #, 0.3, 0.5, 0.7] 
shuffle_ratios: [0.1, 0.3, 0.5, 0.7, 1.0] shuffle: False diff --git a/mmda/get_embeddings.py b/mmda/get_embeddings.py index b960090..bfe7ea9 100644 --- a/mmda/get_embeddings.py +++ b/mmda/get_embeddings.py @@ -543,5 +543,6 @@ def main(cfg: DictConfig) -> None: # noqa: PLR0915, C901, PLR0912 if __name__ == "__main__": - main() + clip_imgs(["/Users/yunfan/Downloads/1.jpg"], noise=True) + # main() # CUDA_VISIBLE_DEVICES=5 poetry run python mmda/get_embeddings.py diff --git a/mmda/utils/dataset_utils.py b/mmda/utils/dataset_utils.py index f74d74d..ff45721 100644 --- a/mmda/utils/dataset_utils.py +++ b/mmda/utils/dataset_utils.py @@ -414,6 +414,8 @@ def load_imagenet( idx, label = int(idx.strip()), label.strip() label = label.replace("'", "") clsidx_to_labels[idx] = label + print(np.sum(orig_idx != mturks_idx)) + print(len(orig_idx)) return img_path, mturks_idx, orig_idx, clsidx_to_labels @@ -700,7 +702,7 @@ def shuffle_by_level( # noqa: PLR0912, C901, ANN201 @hydra.main(version_base=None, config_path="../../config", config_name="main") def main(cfg: DictConfig) -> None: # noqa: D103 - load_msrvtt(cfg.MSRVTT) + load_imagenet(cfg.imagenet) if __name__ == "__main__": diff --git a/mmda/utils/embed_data.py b/mmda/utils/embed_data.py index 7c39833..6e961ae 100644 --- a/mmda/utils/embed_data.py +++ b/mmda/utils/embed_data.py @@ -134,6 +134,9 @@ def clip_imgs( "hf-hub:laion/CLIP-ViT-bigG-14-laion2B-39B-b160k" ) model = model.cuda() + print("Loading CLIP model") + num_params = sum(p.numel() for p in model.parameters()) + print(f"Number of parameters in CLIP model: {num_params:,}") img_embeddings = [] with torch.no_grad(), torch.cuda.amp.autocast(): for i in tqdm(range(0, len(img_files), batch_size)): From e92e1b4a69f011a3f6098d5a4fb32968faabfb6d Mon Sep 17 00:00:00 2001 From: d31003 Date: Wed, 13 Nov 2024 15:49:07 -0600 Subject: [PATCH 02/10] bimodal results for rebuttal --- mmda/plot_single_modal.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/mmda/plot_single_modal.py b/mmda/plot_single_modal.py index 9dea26c..787bf68 100644 --- a/mmda/plot_single_modal.py +++ b/mmda/plot_single_modal.py @@ -46,6 +46,30 @@ def plot_single_modal_recall(cfg: DictConfig) -> None: / f"single_modal_recall5_{cfg_dataset.retrieval_dim}_{cfg_dataset.mask_ratio}.pdf" ) + dir_path = Path("plots/KITTI/") + single1_recalls = [[32.4], [32.8]] + single_recalls = np.array(single1_recalls).reshape(2, 1) + plt.figure(figsize=(6, 8)) + ax = sns.heatmap( + single_recalls, + fmt=".1f", + cmap="YlGnBu", + cbar=False, + square=True, + xticklabels=["LiDAR (Lip-loc)"], + yticklabels=["LiDAR (Lip-loc)", "Text (GTR)"], + annot=True, + annot_kws={"size": cell_size, "weight": "bold"}, + ) + ax.xaxis.tick_top() + plt.xlabel("Reference modality", fontsize=label_size) + plt.ylabel("Query modality", fontsize=label_size) + plt.xticks(fontsize=ticks_size) + plt.yticks(fontsize=ticks_size) + ax.xaxis.set_label_position("top") # Move the label to the top + plt.tight_layout() + plt.savefig(dir_path / f"bimodal_recall5_{cfg_dataset.retrieval_dim}.pdf") + cfg_dataset = cfg["MSRVTT"] dir_path = ( Path(cfg_dataset.paths.plots_path) From 612dc637baade2cc47a7085850e6f5a92f1102e5 Mon Sep 17 00:00:00 2001 From: d31003 Date: Wed, 13 Nov 2024 17:43:13 -0600 Subject: [PATCH 03/10] t-SNE visualizations --- config/main.yaml | 6 +- mmda/exps/mislabel_align.py | 2 + mmda/plot_single_modal.py | 2 +- mmda/tsne_csa.py | 107 ++++++++++++++++++++++++++++++++++++ mmda/utils/dataset_utils.py | 4 +- 5 files changed, 115 insertions(+), 6 
deletions(-) create mode 100644 mmda/tsne_csa.py diff --git a/config/main.yaml b/config/main.yaml index 736da6c..7282f1f 100644 --- a/config/main.yaml +++ b/config/main.yaml @@ -7,7 +7,7 @@ noisy_train_set: True repo_root: "/home/pl22767/Project/MMDA/" # repo_root: "/home/po-han/Desktop/Projects/MMDA/" -dataset: "MSRVTT" +dataset: "imagenet" dataset_level_datasets: [pitts, imagenet, cosmos, sop, tiil, musiccaps, flickr] class_level_datasets: [sop] object_level_datasets: [pitts, sop] @@ -92,8 +92,8 @@ musiccaps: imagenet: sim_dim: 700 # dimension of the similarity score and the CCA transformation equal_weights: False - img_encoder: "clip" - text_encoder: "clip" + img_encoder: "dino" + text_encoder: "gtr" train_test_ratios: [0.7] #, 0.3, 0.5, 0.7] shuffle_ratios: [0.1, 0.3, 0.5, 0.7, 1.0] shuffle: False diff --git a/mmda/exps/mislabel_align.py b/mmda/exps/mislabel_align.py index d3d21fe..8dc724b 100644 --- a/mmda/exps/mislabel_align.py +++ b/mmda/exps/mislabel_align.py @@ -118,6 +118,8 @@ def __init__(self, *args, **kwargs): # noqa: ANN204, ANN002, ANN003 "valdata2align": valdata2align, "valdata1unalign": valdata1unalign, "valdata2unalign": valdata2unalign, + "train_idx": train_idx, + "train_wrong_labels_bool": train_wrong_labels_bool, } ) return alldata diff --git a/mmda/plot_single_modal.py b/mmda/plot_single_modal.py index 787bf68..dab1c57 100644 --- a/mmda/plot_single_modal.py +++ b/mmda/plot_single_modal.py @@ -59,7 +59,7 @@ def plot_single_modal_recall(cfg: DictConfig) -> None: xticklabels=["LiDAR (Lip-loc)"], yticklabels=["LiDAR (Lip-loc)", "Text (GTR)"], annot=True, - annot_kws={"size": cell_size, "weight": "bold"}, + annot_kws={"size": cell_size + 10, "weight": "bold"}, ) ax.xaxis.tick_top() plt.xlabel("Reference modality", fontsize=label_size) diff --git a/mmda/tsne_csa.py b/mmda/tsne_csa.py new file mode 100644 index 0000000..765d04a --- /dev/null +++ b/mmda/tsne_csa.py @@ -0,0 +1,107 @@ +"""Plot the T-SNE of the CSA embeddings on ImageNet.""" + +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +from omegaconf import DictConfig +from sklearn.manifold import TSNE + +import hydra +from mmda.exps.mislabel_align import separate_data +from mmda.utils.cca_class import NormalizedCCA +from mmda.utils.data_utils import load_two_encoder_data +from mmda.utils.dataset_utils import load_imagenet + +cell_size = 30 +label_size = 30 +ticks_size = 26 + + +def plot_imagenet_tsne(cfg: DictConfig, save: bool = False) -> None: + """Plot the T-SNE of the CSA embeddings on ImageNet.""" + ### load embeddings ### + img_path, mturks_idx, orig_idx, clsidx_to_labels = load_imagenet(cfg.imagenet) + + np.random.seed(cfg.seed) + cfg_dataset, data1, data2 = load_two_encoder_data(cfg) + print(f"Loaded data1 shape: {data1.shape}, data2 shape: {data2.shape}") + alldata = separate_data(cfg, data1, data2) + + # we only consider the correctly labeled data + + # select training data based on the noisy_train_set + traindata1 = alldata.traindata1align + traindata2 = alldata.traindata2align + train_idx = alldata.train_idx[~alldata.train_wrong_labels_bool] + print( + f"img_data shape: {traindata1.shape}, mturks_idx[train_idx] shape: {mturks_idx[train_idx].shape}" + ) + class_20_idx = mturks_idx[train_idx] % 50 == 0 + print(f"val_idx shape: {class_20_idx.shape}") + + # transform the data using CCA + cca = NormalizedCCA() + cca_img_data, cca_text_data, _ = cca.fit_transform_train_data( + cfg_dataset, traindata1, traindata2 + ) + print(f"cca_img_data shape: {cca_img_data.shape}") + + # Compute 
t-SNE for original image embeddings + tsne_img = TSNE(n_components=2, random_state=cfg.seed).fit_transform( + traindata1[class_20_idx] + ) + + # Compute t-SNE for CCA-transformed embeddings + tsne_cca = TSNE(n_components=2, random_state=cfg.seed).fit_transform( + cca_img_data[class_20_idx] + ) + # Plot original image embeddings + fig1, ax1 = plt.subplots(figsize=(10, 8)) + print(f"idx: {mturks_idx[train_idx][class_20_idx].shape}") + print(f"embeddings: {tsne_img.shape}") + _ = ax1.scatter( + tsne_img[:, 0], + tsne_img[:, 1], + c=mturks_idx[train_idx][class_20_idx], + cmap="tab20", + alpha=0.8, + ) + ax1.set_xlabel("t-SNE dimension 1", fontsize=label_size) + ax1.set_ylabel("t-SNE dimension 2", fontsize=label_size) + ax1.tick_params(axis="both", labelsize=ticks_size) + plt.tight_layout() + + # Plot CCA-transformed embeddings + fig2, ax2 = plt.subplots(figsize=(10, 8)) + _ = ax2.scatter( + tsne_cca[:, 0], + tsne_cca[:, 1], + c=mturks_idx[train_idx][class_20_idx], + cmap="tab20", + alpha=0.8, + ) + ax2.set_xlabel("t-SNE dimension 1", fontsize=label_size) + ax2.set_ylabel("t-SNE dimension 2", fontsize=label_size) + ax2.tick_params(axis="both", labelsize=ticks_size) + plt.tight_layout() + + # Save plots if specified + if save: + plots_path = Path( + cfg_dataset.paths.plots_path, + f"tsne_{cfg_dataset.text_encoder}_{cfg_dataset.img_encoder}/", + ) + plots_path.mkdir(parents=True, exist_ok=True) + fig1.savefig(plots_path / "tsne_clip.png") + fig2.savefig(plots_path / "tsne_csa.png") + plt.close("all") + + +@hydra.main(version_base=None, config_path="../config", config_name="main") +def main(cfg: DictConfig) -> None: # noqa: D103 + plot_imagenet_tsne(cfg, save=True) + + +if __name__ == "__main__": + main() diff --git a/mmda/utils/dataset_utils.py b/mmda/utils/dataset_utils.py index ff45721..f1ba3cd 100644 --- a/mmda/utils/dataset_utils.py +++ b/mmda/utils/dataset_utils.py @@ -414,8 +414,8 @@ def load_imagenet( idx, label = int(idx.strip()), label.strip() label = label.replace("'", "") clsidx_to_labels[idx] = label - print(np.sum(orig_idx != mturks_idx)) - print(len(orig_idx)) + print("Mismatch: ", np.sum(orig_idx != mturks_idx)) + print("Total: ", len(orig_idx)) return img_path, mturks_idx, orig_idx, clsidx_to_labels From 2ce640cbb23986d0a5c62a778ad5f61701cdaabf Mon Sep 17 00:00:00 2001 From: d31003 Date: Wed, 13 Nov 2024 17:47:19 -0600 Subject: [PATCH 04/10] Fix ruff error --- mmda/plot_kitti_cross_retrieval.py | 50 ++++++++++++++++++++++++++++++ mmda/plot_single_modal.py | 26 +--------------- 2 files changed, 51 insertions(+), 25 deletions(-) create mode 100644 mmda/plot_kitti_cross_retrieval.py diff --git a/mmda/plot_kitti_cross_retrieval.py b/mmda/plot_kitti_cross_retrieval.py new file mode 100644 index 0000000..88b4575 --- /dev/null +++ b/mmda/plot_kitti_cross_retrieval.py @@ -0,0 +1,50 @@ +"""Plot functions.""" + +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import seaborn as sns +from omegaconf import DictConfig + +import hydra + + +@hydra.main(version_base=None, config_path="../config", config_name="main") +def plot_kitti_cross_retrieval(cfg: DictConfig) -> None: + """Plot cross-modal retrieval results for KITTI dataset. 
+ + Args: + cfg: Configuration object containing dataset parameters + """ + cell_size = 30 + label_size = 30 + ticks_size = 28 + cfg_dataset = cfg["KITTI"] + dir_path = Path("plots/KITTI/") + single1_recalls = [[32.4], [32.8]] + single_recalls = np.array(single1_recalls).reshape(2, 1) + plt.figure(figsize=(6, 8)) + ax = sns.heatmap( + single_recalls, + fmt=".1f", + cmap="YlGnBu", + cbar=False, + square=True, + xticklabels=["LiDAR (Lip-loc)"], + yticklabels=["LiDAR (Lip-loc)", "Text (GTR)"], + annot=True, + annot_kws={"size": cell_size + 10, "weight": "bold"}, + ) + ax.xaxis.tick_top() + plt.xlabel("Reference modality", fontsize=label_size) + plt.ylabel("Query modality", fontsize=label_size) + plt.xticks(fontsize=ticks_size) + plt.yticks(fontsize=ticks_size) + ax.xaxis.set_label_position("top") # Move the label to the top + plt.tight_layout() + plt.savefig(dir_path / f"bimodal_recall5_{cfg_dataset.retrieval_dim}.pdf") + + +if __name__ == "__main__": + plot_kitti_cross_retrieval() diff --git a/mmda/plot_single_modal.py b/mmda/plot_single_modal.py index dab1c57..74c4b77 100644 --- a/mmda/plot_single_modal.py +++ b/mmda/plot_single_modal.py @@ -1,4 +1,4 @@ -"""Plot functions.""" +"""Plot single-modal recall.""" from pathlib import Path @@ -46,30 +46,6 @@ def plot_single_modal_recall(cfg: DictConfig) -> None: / f"single_modal_recall5_{cfg_dataset.retrieval_dim}_{cfg_dataset.mask_ratio}.pdf" ) - dir_path = Path("plots/KITTI/") - single1_recalls = [[32.4], [32.8]] - single_recalls = np.array(single1_recalls).reshape(2, 1) - plt.figure(figsize=(6, 8)) - ax = sns.heatmap( - single_recalls, - fmt=".1f", - cmap="YlGnBu", - cbar=False, - square=True, - xticklabels=["LiDAR (Lip-loc)"], - yticklabels=["LiDAR (Lip-loc)", "Text (GTR)"], - annot=True, - annot_kws={"size": cell_size + 10, "weight": "bold"}, - ) - ax.xaxis.tick_top() - plt.xlabel("Reference modality", fontsize=label_size) - plt.ylabel("Query modality", fontsize=label_size) - plt.xticks(fontsize=ticks_size) - plt.yticks(fontsize=ticks_size) - ax.xaxis.set_label_position("top") # Move the label to the top - plt.tight_layout() - plt.savefig(dir_path / f"bimodal_recall5_{cfg_dataset.retrieval_dim}.pdf") - cfg_dataset = cfg["MSRVTT"] dir_path = ( Path(cfg_dataset.paths.plots_path) From 3b1eec81a57acb9d130c07731724ab967ac745bd Mon Sep 17 00:00:00 2001 From: d31003 Date: Fri, 15 Nov 2024 15:18:15 -0600 Subject: [PATCH 05/10] Linear classifier --- bash_scripts/imagenet_script.sh | 24 +++-- bash_scripts/leafy_script.sh | 7 +- mmda/linear_svm_clip.py | 167 ++++++++++++++++++++++++++++++++ mmda/utils/dataset_utils.py | 4 +- 4 files changed, 190 insertions(+), 12 deletions(-) create mode 100644 mmda/linear_svm_clip.py diff --git a/bash_scripts/imagenet_script.sh b/bash_scripts/imagenet_script.sh index a60f887..91b3d52 100644 --- a/bash_scripts/imagenet_script.sh +++ b/bash_scripts/imagenet_script.sh @@ -1,11 +1,11 @@ -CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=10 -CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=25 -CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=50 -CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=100 -CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=150 -CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet 
imagenet.sim_dim=200 -CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=500 -CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=700 +# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=10 +# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=25 +# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=50 +# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=100 +# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=150 +# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=200 +# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=500 +# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=700 # CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet train_test_ratio=0.1 imagenet.sim_dim=10 # CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet train_test_ratio=0.1 imagenet.sim_dim=25 # CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet train_test_ratio=0.1 imagenet.sim_dim=50 @@ -15,6 +15,12 @@ CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset # CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet train_test_ratio=0.1 imagenet.sim_dim=500 # CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet train_test_ratio=0.1 imagenet.sim_dim=700 -# # classification +# classification # CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=imagenet # CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=imagenet imagenet.shuffle=True + +# SVM +# poetry run python mmda/linear_svm_clip.py train_test_ratio=0.1 +poetry run python mmda/linear_svm_clip.py train_test_ratio=0.3 +poetry run python mmda/linear_svm_clip.py train_test_ratio=0.5 +poetry run python mmda/linear_svm_clip.py train_test_ratio=0.7 \ No newline at end of file diff --git a/bash_scripts/leafy_script.sh b/bash_scripts/leafy_script.sh index d7d9d6a..573a5b2 100644 --- a/bash_scripts/leafy_script.sh +++ b/bash_scripts/leafy_script.sh @@ -1,4 +1,9 @@ # CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=leafy_spurge leafy_spurge.sim_dim=10 # CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=leafy_spurge leafy_spurge.sim_dim=50 # CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=leafy_spurge leafy_spurge.sim_dim=100 -CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=leafy_spurge leafy_spurge.sim_dim=250 +# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=leafy_spurge leafy_spurge.sim_dim=250 + +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/linear_svm_clip.py dataset=leafy_spurge leafy_spurge.sim_dim=250 train_test_ratio=0.4 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/linear_svm_clip.py dataset=leafy_spurge leafy_spurge.sim_dim=250 train_test_ratio=0.6 +CUDA_VISIBLE_DEVICES=1 poetry run python 
mmda/linear_svm_clip.py dataset=leafy_spurge leafy_spurge.sim_dim=250 train_test_ratio=0.7 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/linear_svm_clip.py dataset=leafy_spurge leafy_spurge.sim_dim=250 train_test_ratio=0.888 diff --git a/mmda/linear_svm_clip.py b/mmda/linear_svm_clip.py new file mode 100644 index 0000000..66095d7 --- /dev/null +++ b/mmda/linear_svm_clip.py @@ -0,0 +1,167 @@ +"""Train a linear SVM on the ImageNet dataset.""" + +# ruff: noqa: ERA001, PLR2004, S301 + +import pickle +import time +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from omegaconf import DictConfig +from sklearn import svm + +import hydra +from mmda.utils.cca_class import NormalizedCCA +from mmda.utils.data_utils import load_two_encoder_data +from mmda.utils.dataset_utils import ( + get_train_test_split_index, + load_imagenet, + load_leafy_spurge, + train_test_split, +) + +BATCH_SIZE = 256 + + +@hydra.main(version_base=None, config_path="../config", config_name="main") +def train_linear_svm(cfg: DictConfig) -> None: + """Train a linear SVM on the ImageNet dataset.""" + np.random.seed(cfg.seed) + cfg_dataset = cfg[cfg.dataset] + if cfg.dataset == "imagenet": + _, mturks_idx, labels, _ = load_imagenet(cfg_dataset) + + with Path(cfg_dataset.paths.save_path, "ImageNet_img_emb_clip.pkl").open( + "rb" + ) as f: + img_emb = pickle.load(f) + elif cfg.dataset == "leafy_spurge": + _, labels, _ = load_leafy_spurge(cfg_dataset) + with Path(cfg_dataset.paths.save_path, "LeafySpurge_img_emb_clip.pkl").open( + "rb" + ) as f: + img_emb = pickle.load(f) + + # transform the data using CCA + cfg_dataset, data1, data2 = load_two_encoder_data(cfg) + + # Train linear SVM + start_time = time.time() + train_idx, val_idx = get_train_test_split_index( + cfg.train_test_ratio, img_emb.shape[0] + ) + labels_train, labels_test = train_test_split(labels, train_idx, val_idx) + print(labels_train.shape, labels_test.shape) + + # CSA case + csa_train_data1, csa_val_data1 = train_test_split(data1, train_idx, val_idx) + csa_train_data2, csa_val_data2 = train_test_split(data2, train_idx, val_idx) + cca = NormalizedCCA() + cca_img_train, cca_text_train, _ = cca.fit_transform_train_data( + cfg_dataset, csa_train_data1, csa_train_data2 + ) + clf = svm.SVC(kernel="linear") + clf.fit(cca_img_train, labels_train) + cca_img_val, cca_text_val = cca.transform_data(csa_val_data1, csa_val_data2) + y_pred = clf.predict(cca_img_val) + accuracy = np.mean(y_pred == labels_test) + print(f"CSA accuracy: {accuracy * 100:.2f}%") + return + + # CLIP case + x_train, x_test = train_test_split(img_emb, train_idx, val_idx) + print(x_train.shape, x_test.shape) + print(len(labels_train), len(labels_test)) + clf = svm.SVC(kernel="linear") + clf.fit(x_train, labels_train) + + end_time = time.time() + + print(f"Training time: {end_time - start_time:.2f} seconds") + y_pred = clf.predict(x_test) + accuracy = np.mean(y_pred == labels_test) + print(f"Split {cfg.train_test_ratio} accuracy: {accuracy * 100:.2f}%") + + +@hydra.main(version_base=None, config_path="../config", config_name="main") +def plot_accuracy(cfg: DictConfig) -> None: + """Plot the accuracy of the model.""" + cfg_dataset = cfg[cfg.dataset] + ds_size = 50_000 if cfg.dataset == "imagenet" else 900 + csv_save_path = ( + Path(cfg_dataset.paths.plots_path) + / f"classify_{cfg_dataset.text_encoder}_{cfg_dataset.img_encoder}/" + / f"accuracy_{cfg_dataset.sim_dim}_svm.csv" + ) + df = pd.read_csv(csv_save_path) + ratios = df["train_test_ratio"] * ds_size + 
cca_accs = df["cca_accs"] + asif_accs = df["asif_accs"] + fig, ax = plt.subplots() + ax.plot( + ratios, + cca_accs, + "o-", + ms=12, + label="CSA (ours)", + color="blue", + ) + clip_accs = df["clip_accs"] + ax.plot( + ratios, + clip_accs, + "^--", + ms=12, + label="CLIP", + color="red", + ) + clip_svm_accs = df["svm_accs"] + ax.plot( + ratios, + clip_svm_accs, + "v--", + ms=12, + label="CLIP + Linear SVM", + color="orange", + ) + csa_svm_accs = df["csa_svm_accs"] + ax.plot( + ratios, + csa_svm_accs, + "D-.", + ms=12, + label="CSA + Linear SVM", + color="purple", + ) + ax.plot( + ratios, + asif_accs, + "D-.", + ms=12, + label="ASIF", + color="green", + ) + ax.set_xlabel("Amount of training data", fontsize=20) + ax.set_ylabel("Classification accuracy", fontsize=20) + ax.xaxis.set_tick_params(labelsize=15) + ax.yaxis.set_tick_params(labelsize=15) + ax.set_ylim(-0.1, 1.1) if cfg.dataset == "imagenet" else ax.set_ylim(0.3, 0.7) + ax.legend(loc="lower right", fontsize=16) + ax.grid() + + plots_path = ( + Path(cfg_dataset.paths.plots_path) + / f"classify_{cfg_dataset.text_encoder}_{cfg_dataset.img_encoder}/" + ) + plots_path.mkdir(parents=True, exist_ok=True) + plt.tight_layout() + fig.savefig(plots_path / f"trainsize_vs_accuracy_svm{cfg_dataset.sim_dim}.png") + + +if __name__ == "__main__": + # train_linear_svm() + plot_accuracy() + +# CUDA_VISIBLE_DEVICES=5 poetry run python mmda/linear_svm_clip.py dataset=leafy_spurge leafy_spurge.sim_dim=250 diff --git a/mmda/utils/dataset_utils.py b/mmda/utils/dataset_utils.py index f1ba3cd..c91e06d 100644 --- a/mmda/utils/dataset_utils.py +++ b/mmda/utils/dataset_utils.py @@ -414,8 +414,8 @@ def load_imagenet( idx, label = int(idx.strip()), label.strip() label = label.replace("'", "") clsidx_to_labels[idx] = label - print("Mismatch: ", np.sum(orig_idx != mturks_idx)) - print("Total: ", len(orig_idx)) + print("ImageNet mislabel count: ", np.sum(orig_idx != mturks_idx)) + print("ImageNet total count: ", len(orig_idx)) return img_path, mturks_idx, orig_idx, clsidx_to_labels From 96d7fa697876ecd66a00aadda76eca6efa47d8d1 Mon Sep 17 00:00:00 2001 From: d31003 Date: Wed, 20 Nov 2024 16:50:47 -0600 Subject: [PATCH 06/10] optimize plots --- mmda/linear_svm_clip.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mmda/linear_svm_clip.py b/mmda/linear_svm_clip.py index 66095d7..6ee5983 100644 --- a/mmda/linear_svm_clip.py +++ b/mmda/linear_svm_clip.py @@ -98,7 +98,6 @@ def plot_accuracy(cfg: DictConfig) -> None: df = pd.read_csv(csv_save_path) ratios = df["train_test_ratio"] * ds_size cca_accs = df["cca_accs"] - asif_accs = df["asif_accs"] fig, ax = plt.subplots() ax.plot( ratios, @@ -117,37 +116,38 @@ def plot_accuracy(cfg: DictConfig) -> None: label="CLIP", color="red", ) - clip_svm_accs = df["svm_accs"] + asif_accs = df["asif_accs"] ax.plot( ratios, - clip_svm_accs, - "v--", + asif_accs, + "D-.", ms=12, - label="CLIP + Linear SVM", - color="orange", + label="ASIF", + color="green", ) csa_svm_accs = df["csa_svm_accs"] ax.plot( ratios, csa_svm_accs, - "D-.", + "D-", ms=12, label="CSA + Linear SVM", color="purple", ) + clip_svm_accs = df["svm_accs"] ax.plot( ratios, - asif_accs, - "D-.", + clip_svm_accs, + "v--", ms=12, - label="ASIF", - color="green", + label="CLIP + Linear SVM", + color="orange", ) ax.set_xlabel("Amount of training data", fontsize=20) ax.set_ylabel("Classification accuracy", fontsize=20) ax.xaxis.set_tick_params(labelsize=15) ax.yaxis.set_tick_params(labelsize=15) - ax.set_ylim(-0.1, 1.1) if cfg.dataset == 
"imagenet" else ax.set_ylim(0.3, 0.7) + ax.set_ylim(0.0, 1.1) if cfg.dataset == "imagenet" else ax.set_ylim(0.2, 0.8) ax.legend(loc="lower right", fontsize=16) ax.grid() From 0e07307707892fe7596a1c99cc71836f529c6f89 Mon Sep 17 00:00:00 2001 From: d31003 Date: Wed, 27 Nov 2024 15:59:15 -0600 Subject: [PATCH 07/10] Ablation study of encoder architectures --- bash_scripts/imagenet_script.sh | 7 ++-- config/main.yaml | 8 ++-- mmda/get_embeddings.py | 25 +++++++++-- mmda/utils/data_utils.py | 10 ++++- mmda/utils/embed_data.py | 73 +++++++++++++++++++++++++++++++++ 5 files changed, 111 insertions(+), 12 deletions(-) diff --git a/bash_scripts/imagenet_script.sh b/bash_scripts/imagenet_script.sh index 91b3d52..8b8fb21 100644 --- a/bash_scripts/imagenet_script.sh +++ b/bash_scripts/imagenet_script.sh @@ -1,3 +1,4 @@ +# mislabeled data # CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=10 # CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=25 # CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=50 @@ -21,6 +22,6 @@ # SVM # poetry run python mmda/linear_svm_clip.py train_test_ratio=0.1 -poetry run python mmda/linear_svm_clip.py train_test_ratio=0.3 -poetry run python mmda/linear_svm_clip.py train_test_ratio=0.5 -poetry run python mmda/linear_svm_clip.py train_test_ratio=0.7 \ No newline at end of file +# poetry run python mmda/linear_svm_clip.py train_test_ratio=0.3 +# poetry run python mmda/linear_svm_clip.py train_test_ratio=0.5 +# poetry run python mmda/linear_svm_clip.py train_test_ratio=0.7 \ No newline at end of file diff --git a/config/main.yaml b/config/main.yaml index 7282f1f..0420694 100644 --- a/config/main.yaml +++ b/config/main.yaml @@ -39,7 +39,6 @@ BTC: save_path: ${BTC.paths.dataset_path}/any2any/ plots_path: ${repo_root}plots/BTC/ - MSRVTT: img_encoder: "clip" audio_encoder: "clap" @@ -92,9 +91,10 @@ musiccaps: imagenet: sim_dim: 700 # dimension of the similarity score and the CCA transformation equal_weights: False - img_encoder: "dino" - text_encoder: "gtr" - train_test_ratios: [0.7] #, 0.3, 0.5, 0.7] + img_encoder: "clipopenai" + text_encoder: "clipdatacomp_xl_s13b_b90k" + model_name: "openai" + train_test_ratios: [0.3, 0.5, 0.7] shuffle_ratios: [0.1, 0.3, 0.5, 0.7, 1.0] shuffle: False paths: diff --git a/mmda/get_embeddings.py b/mmda/get_embeddings.py index bfe7ea9..09f757e 100644 --- a/mmda/get_embeddings.py +++ b/mmda/get_embeddings.py @@ -31,6 +31,8 @@ clip_text, cosplace_img, dinov2, + fair_clip_imgs, + fair_clip_text, gtr_text, ) from mmda.utils.imagebind_utils import ImageBindInference @@ -40,7 +42,7 @@ process_audio, ) -BATCH_SIZE = 256 +BATCH_SIZE = 758 @hydra.main(version_base=None, config_path="../config", config_name="main") @@ -372,6 +374,24 @@ def main(cfg: DictConfig) -> None: # noqa: PLR0915, C901, PLR0912 text_descriptions = ["An image of " + label + "." 
for label in orig_labels] # get text embeddings + model = "openai" + + img_emb = fair_clip_imgs(img_path, BATCH_SIZE, model_name=("ViT-L-14", model)) + with Path( + cfg_dataset.paths.save_path, f"ImageNet_img_emb_clip{model}.pkl" + ).open("wb") as f: + pickle.dump(img_emb, f) + print("FairCLIP embeddings saved") + + text_emb = fair_clip_text( + text_descriptions, BATCH_SIZE, model_name=("ViT-L-14", model) + ) + with Path( + cfg_dataset.paths.save_path, f"ImageNet_text_emb_clip{model}.pkl" + ).open("wb") as f: + pickle.dump(text_emb, f) + print("FairCLIP embeddings saved") + text_emb = clip_text(text_descriptions, BATCH_SIZE) with Path(cfg_dataset.paths.save_path, "ImageNet_text_emb_clip.pkl").open( "wb" @@ -543,6 +563,5 @@ def main(cfg: DictConfig) -> None: # noqa: PLR0915, C901, PLR0912 if __name__ == "__main__": - clip_imgs(["/Users/yunfan/Downloads/1.jpg"], noise=True) - # main() + main() # CUDA_VISIBLE_DEVICES=5 poetry run python mmda/get_embeddings.py diff --git a/mmda/utils/data_utils.py b/mmda/utils/data_utils.py index e58651a..487e655 100644 --- a/mmda/utils/data_utils.py +++ b/mmda/utils/data_utils.py @@ -204,10 +204,16 @@ def load_clip_like_data(cfg: DictConfig) -> tuple[DictConfig, np.ndarray, np.nda ) elif dataset == "imagenet": data1 = joblib.load( - Path(cfg_dataset.paths.save_path + "ImageNet_img_emb_clip.pkl") + Path( + cfg_dataset.paths.save_path + + f"ImageNet_img_emb_clip{cfg_dataset.model_name}.pkl" + ) ) data2 = joblib.load( - Path(cfg_dataset.paths.save_path + "ImageNet_text_emb_clip.pkl") + Path( + cfg_dataset.paths.save_path + + f"ImageNet_text_emb_clip{cfg_dataset.model_name}.pkl" + ) ) elif dataset == "tiil": data1 = joblib.load(Path(cfg_dataset.paths.save_path + "TIIL_img_emb_clip.pkl")) diff --git a/mmda/utils/embed_data.py b/mmda/utils/embed_data.py index 6e961ae..440f63a 100644 --- a/mmda/utils/embed_data.py +++ b/mmda/utils/embed_data.py @@ -161,6 +161,46 @@ def clip_imgs( return np.concatenate(img_embeddings, axis=0) +def fair_clip_imgs( + img_files: list[str], + batch_size: int = 32, + model_name: tuple[str, str] = ("ViT-L-14", "datacomp_xl_s13b_b90k"), +) -> np.ndarray: + """Extract image features using CLIP model. + + Args: + img_files: list of image files + batch_size: batch size + model_name: name of the CLIP model. 
(architecture, pretrained) + + Returns: + image features + """ + model, _, preprocess = open_clip.create_model_and_transforms( + model_name[0], pretrained=model_name[1] + ) + # commonpool_xl_clip_s13b_b90k, commonpool_xl_s13b_b90k, commonpool_xl_laion_s13b_b90k, openai + model = model.cuda() + img_embeddings = [] + with torch.no_grad(), torch.cuda.amp.autocast(): + for i in tqdm(range(0, len(img_files), batch_size)): + batch = [] + for img_file in img_files[i : i + batch_size]: + if isinstance(img_file, str): + image = preprocess(Image.open(img_file)).unsqueeze(0) + elif isinstance(img_file, Path): + image = preprocess(Image.open(str(img_file))).unsqueeze(0) + elif isinstance(img_file, Image.Image): + image = preprocess(img_file).unsqueeze(0) + batch.append(image) + batch = torch.cat(batch, dim=0) + batch = batch.cuda() + image_features = model.encode_image(batch) + image_features /= image_features.norm(dim=-1, keepdim=True) + img_embeddings.append(image_features.detach().cpu().numpy()) + return np.concatenate(img_embeddings, axis=0) + + # clip text in batch with gpu def clip_text( text: list[str], @@ -192,6 +232,39 @@ def clip_text( return np.concatenate(text_features, axis=0) +def fair_clip_text( + text: list[str], + batch_size: int = 32, + model_name: tuple[str, str] = ("ViT-L-14", "openai"), +) -> np.ndarray: + """Extract text features using CLIP model. + + Args: + text: list of text + batch_size: batch size + model_name: name of the CLIP model. (architecture, pretrained) + + Returns: + text features + """ + model, _, _ = open_clip.create_model_and_transforms( + model_name[0], pretrained=model_name[1] + ) + tokenizer = open_clip.get_tokenizer(model_name[0]) + model = model.cuda() + + text_features = [] + with torch.no_grad(), torch.cuda.amp.autocast(): + for i in tqdm(range(0, len(text), batch_size)): + batch = text[i : i + batch_size] + batch = tokenizer(batch) + batch = batch.cuda() + batch = model.encode_text(batch) + batch /= batch.norm(dim=-1, keepdim=True) + text_features.append(batch.detach().cpu().numpy()) + return np.concatenate(text_features, axis=0) + + def gtr_text(text: list[str]) -> np.ndarray: """Extract text features using GTR model. From 40b2fb61e70d6f50b2b5e0a06b0c109db4707136 Mon Sep 17 00:00:00 2001 From: d31003 Date: Wed, 27 Nov 2024 23:36:40 -0600 Subject: [PATCH 08/10] rewrite CCA... 
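ReNormalizedCCA computes CCA in closed form: zero-mean each view, whiten it, take the
SVD of the whitened cross-covariance, and keep the singular values as the canonical
correlations that later weight the per-dimension similarity scores (or are replaced by
ones when equal_weights is set). For reference, a minimal sketch of that textbook
construction, with illustrative names and zero-mean, full-rank inputs assumed (this is
not code from the repo):

    import numpy as np
    from scipy.linalg import sqrtm

    def cca_whiten_svd(x: np.ndarray, y: np.ndarray):
        """CCA for zero-mean x (n, d1) and y (n, d2) via whitening + SVD."""
        n = x.shape[0]
        # In practice a small ridge (e.g. 1e-6 * I) is added before inverting
        # when the sample count is close to or below the dimensionality.
        sx = sqrtm(np.linalg.inv(x.T @ x / n)).real  # Sigma_x^(-1/2)
        sy = sqrtm(np.linalg.inv(y.T @ y / n)).real  # Sigma_y^(-1/2)
        u, s, vh = np.linalg.svd(sx @ (x.T @ y / n) @ sy)
        a = u.T @ sx  # rows: canonical directions for x
        b = vh @ sy   # rows: canonical directions for y
        # (a @ x.T) and (b @ y.T) are componentwise correlated with
        # correlations s, the values kept as corr_coeff.
        return a, b, s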
--- .gitignore | 3 +- bash_scripts/handwriting_script.sh | 7 ++ config/main.yaml | 17 ++- mmda/bimodal_classification.py | 24 +++- mmda/exps/classification.py | 14 +-- mmda/get_embeddings.py | 57 ++++++++- mmda/handwriting_baseline.py | 29 +++++ mmda/utils/cca_class.py | 124 +++++++++++++++++- mmda/utils/classification_dataset_class.py | 138 ++++++++++++++++++++- mmda/utils/data_utils.py | 13 ++ mmda/utils/dataset_utils.py | 71 ++++++++++- mmda/utils/embed_data.py | 27 +++- pyproject.toml | 11 +- 13 files changed, 508 insertions(+), 27 deletions(-) create mode 100644 bash_scripts/handwriting_script.sh create mode 100644 mmda/handwriting_baseline.py diff --git a/.gitignore b/.gitignore index 4000a22..20dbc72 100644 --- a/.gitignore +++ b/.gitignore @@ -171,4 +171,5 @@ plots/* # lock files *.lock .checkpoints/ -.assets/ \ No newline at end of file +.assets/ +*.keras \ No newline at end of file diff --git a/bash_scripts/handwriting_script.sh b/bash_scripts/handwriting_script.sh new file mode 100644 index 0000000..7e20846 --- /dev/null +++ b/bash_scripts/handwriting_script.sh @@ -0,0 +1,7 @@ +# classification +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=10 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=25 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=50 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=100 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=200 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=700 diff --git a/config/main.yaml b/config/main.yaml index 0420694..dbda3a4 100644 --- a/config/main.yaml +++ b/config/main.yaml @@ -7,7 +7,7 @@ noisy_train_set: True repo_root: "/home/pl22767/Project/MMDA/" # repo_root: "/home/po-han/Desktop/Projects/MMDA/" -dataset: "imagenet" +dataset: "handwriting" dataset_level_datasets: [pitts, imagenet, cosmos, sop, tiil, musiccaps, flickr] class_level_datasets: [sop] object_level_datasets: [pitts, sop] @@ -16,7 +16,7 @@ retrieval_datasets: [flickr] any_retrieval_datasets: [KITTI, MSRVTT, BTC] shuffle_llava_datasets: [pitts, sop] # datasets whose plots contains llava mislabel_llava_datasets: [imagenet] -classification_datasets: [imagenet, leafy_spurge] +classification_datasets: [imagenet, leafy_spurge, handwriting] dataset_size: { sop: 56222, musiccaps: 5397, @@ -103,6 +103,19 @@ imagenet: plots_path: ${repo_root}plots/ImageNet/ label_embeddings: ${imagenet.paths.dataset_path}_${text_encoder}_label_embeddings.npy +handwriting: + sim_dim: 700 # dimension of the similarity score and the CCA transformation + equal_weights: False + img_encoder: "chronos" + text_encoder: "clip" + train_test_ratios: [0.9] + shuffle: False + paths: + dataset_path: "/nas/pohan/datasets/Handwriting/" + save_path: ${handwriting.paths.dataset_path}embeddings/ + plots_path: ${repo_root}plots/handwriting/ + label_embeddings: ${handwriting.paths.dataset_path}_${text_encoder}_label_embeddings.npy + leafy_spurge: sim_dim: 700 # dimension of the similarity score and the CCA transformation equal_weights: False diff --git a/mmda/bimodal_classification.py b/mmda/bimodal_classification.py index 83caf22..415018e 100644 --- a/mmda/bimodal_classification.py +++ b/mmda/bimodal_classification.py @@ -15,7 
+15,7 @@ @hydra.main(version_base=None, config_path="../config", config_name="main") -def main(cfg: DictConfig) -> None: +def main(cfg: DictConfig) -> None: # noqa: C901, PLR0915, PLR0912 """Main function to generate the classification results of the bimodal datasets. Args: @@ -27,7 +27,12 @@ def main(cfg: DictConfig) -> None: ), f"{cfg.dataset} is not for classification." cfg_dataset = cfg[cfg.dataset] shuffle_tag = "shuffled" if cfg_dataset.shuffle else "" - ds_size = 50_000 if cfg.dataset == "imagenet" else 900 + if cfg.dataset == "imagenet": + ds_size = 50_000 + elif cfg.dataset == "leafy_spurge": + ds_size = 900 + elif cfg.dataset == "handwriting": + ds_size = 1000 csv_save_path = ( Path(cfg_dataset.paths.plots_path) / f"classify_{cfg_dataset.text_encoder}_{cfg_dataset.img_encoder}/" @@ -49,7 +54,11 @@ def main(cfg: DictConfig) -> None: for train_test_ratio in cfg_dataset.train_test_ratios: asif_accs = asif_classification(cfg, train_test_ratio) cca_accs = cca_classification(cfg, train_test_ratio) - clip_accs = clip_like_classification(cfg, train_test_ratio) + clip_accs = ( + clip_like_classification(cfg, train_test_ratio) + if cfg.dataset != "handwriting" + else 0 + ) # write accuracy to file if not csv_save_path.exists(): # create the file and write the header @@ -77,7 +86,7 @@ def main(cfg: DictConfig) -> None: label="CSA (ours)", color="blue", ) - if not cfg_dataset.shuffle: + if not cfg_dataset.shuffle and cfg.dataset != "handwriting": clip_accs = df["clip_accs"] ax.plot( ratios, @@ -99,7 +108,12 @@ def main(cfg: DictConfig) -> None: ax.set_ylabel("Classification accuracy", fontsize=20) ax.xaxis.set_tick_params(labelsize=15) ax.yaxis.set_tick_params(labelsize=15) - ax.set_ylim(0, 1.03) if cfg.dataset == "imagenet" else ax.set_ylim(0.4, 0.65) + if cfg.dataset == "imagenet": + ax.set_ylim(0, 1.03) + elif cfg.dataset == "leafy_spurge": + ax.set_ylim(0.4, 0.65) + else: + ax.set_ylim(0, 1.03) ( ax.legend(loc="lower right", fontsize=18) if not cfg_dataset.shuffle diff --git a/mmda/exps/classification.py b/mmda/exps/classification.py index 7535323..26968fd 100644 --- a/mmda/exps/classification.py +++ b/mmda/exps/classification.py @@ -3,14 +3,14 @@ import numpy as np from omegaconf import DictConfig -from mmda.utils.cca_class import NormalizedCCA +from mmda.utils.cca_class import NormalizedCCA, ReNormalizedCCA from mmda.utils.classification_dataset_class import load_classification_dataset from mmda.utils.sim_utils import cosine_sim, weighted_corr_sim def cca_classification( cfg: DictConfig, train_test_ratio: float, shuffle_ratio: float = 0.0 -) -> tuple[dict[float:float], dict[float : dict[float:float]]]: +) -> float: """Retrieve data using the proposed CCA method. 
Args: @@ -20,13 +20,15 @@ def cca_classification( Returns: data_size2accuracy: {data_size: accuracy} """ + print("CCA") cfg_dataset = cfg[cfg.dataset] ds = load_classification_dataset(cfg) ds.load_data(train_test_ratio, clip_bool=False, shuffle_ratio=shuffle_ratio) - cca = NormalizedCCA() + cca = ReNormalizedCCA() if cfg.dataset == "handwriting" else NormalizedCCA() ds.train_img, ds.train_text, corr = cca.fit_transform_train_data( cfg_dataset, ds.train_img, ds.train_text ) + print("corr", corr) ds.test_img, ds.test_text = cca.transform_data(ds.test_img, ds.test_text) ds.get_labels_emb() @@ -39,9 +41,7 @@ def sim_fn(x: np.array, y: np.array, corr: np.array = corr) -> np.array: return ds.classification(sim_fn=sim_fn) -def clip_like_classification( - cfg: DictConfig, train_test_ratio: float -) -> tuple[dict[float:float], dict[float:float]]: +def clip_like_classification(cfg: DictConfig, train_test_ratio: float) -> float: """Retrieve data using the CLIP-like method. Args: @@ -58,7 +58,7 @@ def clip_like_classification( def asif_classification( cfg: DictConfig, train_test_ratio: float, shuffle_ratio: float = 0.0 -) -> tuple[dict[float:float], dict[float:float]]: +) -> float: """Retrieve data using the CLIP-like method. Args: diff --git a/mmda/get_embeddings.py b/mmda/get_embeddings.py index 09f757e..322577e 100644 --- a/mmda/get_embeddings.py +++ b/mmda/get_embeddings.py @@ -15,6 +15,7 @@ from mmda.utils.dataset_utils import ( load_cosmos, load_flickr, + load_handwriting, load_imagenet, load_kitti, load_leafy_spurge, @@ -25,6 +26,7 @@ load_tiil, ) from mmda.utils.embed_data import ( + chronos_ts, clap_audio, clap_text, clip_imgs, @@ -35,7 +37,6 @@ fair_clip_text, gtr_text, ) -from mmda.utils.imagebind_utils import ImageBindInference from mmda.utils.video_audio_utils import ( get_video_emb, prepare_audio_for_imagebind, @@ -94,6 +95,8 @@ def main(cfg: DictConfig) -> None: # noqa: PLR0915, C901, PLR0912 pickle.dump(clap_audio_features, f) elif dataset == "MSRVTT": + from mmda.utils.imagebind_utils import ImageBindInference + _, captions, video_info_sen_order, video_dict = load_msrvtt(cfg_dataset) id_order, img_paths, audio_start_secs, audio_num_secs = get_video_emb( cfg_dataset, video_dict @@ -556,6 +559,58 @@ def main(cfg: DictConfig) -> None: # noqa: PLR0915, C901, PLR0912 pickle.dump(img_emb, f) print("CLIP embeddings saved") + elif dataset == "handwriting": + # sentence_26 = { + # 1: "apple.", + # 2: "ball.", + # 3: "cat.", + # 4: "dog.", + # 5: "elephant.", + # 6: "fish.", + # 7: "giraffe.", + # 8: "hat.", + # 9: "ice cream.", + # 10: "jaguar.", + # 11: "kangaroo.", + # 12: "lion.", + # 13: "monkey.", + # 14: "nest.", + # 15: "owl.", + # 16: "penguin.", + # 17: "queen.", + # 18: "rabbit.", + # 19: "snake.", + # 20: "tiger.", + # 21: "umbrella.", + # 22: "vase.", + # 23: "whale.", + # 24: "x-ray.", + # 25: "yak.", + # 26: "zebra.", + # } + data, labels, num2alphabet, alphabets_hand = load_handwriting(cfg_dataset) + # sentences = [sentence_26[int(label.split(".")[0])] for label in labels] + # int_labels = [int(label.split(".")[0]) - 1 for label in labels] + + embeddings = chronos_ts(data) if False else data.reshape(data.shape[0], -1) + # check if embeddings has unique rows + assert embeddings.shape[0] == len( + np.unique(embeddings, axis=0) + ), f"Embeddings has repeated entries. 
{embeddings.shape[0]}!={len(np.unique(embeddings, axis=0))}" + print("Chronos shape:", embeddings.shape) + with Path(cfg_dataset.paths.save_path, "Handwriting_emb_chronos.pkl").open( + "wb" + ) as f: + pickle.dump(embeddings, f) + print("Chronos embeddings saved") + + embeddings = clip_imgs(alphabets_hand, 256) + print("text shape:", embeddings.shape) + with Path(cfg_dataset.paths.save_path, "Handwriting_text_emb_clip.pkl").open( + "wb" + ) as f: + pickle.dump(embeddings, f) + print("CLIP embeddings saved") # TODO: add more datasets else: msg = f"Dataset {dataset} not supported." diff --git a/mmda/handwriting_baseline.py b/mmda/handwriting_baseline.py new file mode 100644 index 0000000..9fdfbcb --- /dev/null +++ b/mmda/handwriting_baseline.py @@ -0,0 +1,29 @@ +"""This script is for the handwriting baseline.""" + +import numpy as np +from aeon.classification.deep_learning import InceptionTimeClassifier +from omegaconf import DictConfig +from sklearn.metrics import accuracy_score + +import hydra +from mmda.utils.dataset_utils import load_handwriting + + +@hydra.main(version_base=None, config_path="../config", config_name="main") +def main(cfg: DictConfig) -> None: + """Train the handwriting baseline.""" + x, labels, _ = load_handwriting(cfg_dataset=cfg.handwriting) + inception = InceptionTimeClassifier() + for train_test_ratio in cfg.handwriting.train_test_ratios: + np.random.seed(42) + train_size = int(train_test_ratio * x.shape[0]) + print(x.shape, labels.shape) + inception.fit(x[:train_size], labels[:train_size]) + y_pred = inception.predict(x[train_size:]) + accuracy = accuracy_score(labels[train_size:], y_pred) + print(f"train_test_ratio: {train_test_ratio}, accuracy: {accuracy}") + + +if __name__ == "__main__": + main() +# CUDA_VISIBLE_DEVICES="" poetry run python mmda/handwriting_baseline.py diff --git a/mmda/utils/cca_class.py b/mmda/utils/cca_class.py index 09565b2..bca76d1 100644 --- a/mmda/utils/cca_class.py +++ b/mmda/utils/cca_class.py @@ -7,6 +7,7 @@ import numpy as np from cca_zoo.linear import CCA from omegaconf import DictConfig +from scipy.linalg import sqrtm from mmda.utils.data_utils import origin_centered @@ -64,8 +65,10 @@ def fit_transform_train_data( ) # dim, assert ( corr_coeff >= 0 - ).any, f"Correlation should be non-negative. {corr_coeff}" - assert (corr_coeff <= 1).any, f"Correlation should be less than 1. {corr_coeff}" + ).all(), f"Correlation should be non-negative. {corr_coeff}" + assert ( + corr_coeff <= 1 + ).all(), f"Correlation should be less than 1. {corr_coeff}" self.corr_coeff = corr_coeff self.traindata1, self.traindata2 = traindata1, traindata2 return traindata1, traindata2, corr_coeff @@ -111,3 +114,120 @@ def load_model(self, path: str | Path) -> None: if isinstance(path, str): path = Path(path) self.__dict__ = joblib.load(path.open("rb")).__dict__ + + +class ReNormalizedCCA: + """Canonical Correlation Analysis (CCA) class which automatically zero-mean data.""" + + def __init__(self, sim_dim: int | None = None) -> None: + """Initialize the CCA model.""" + self.traindata1_mean = None + self.traindata2_mean = None + self.sim_dim = sim_dim + + def fit_transform_train_data( + self, cfg_dataset: DictConfig, traindata1: np.ndarray, traindata2: np.ndarray + ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """Fit the CCA model to the training data. + + Args: + cfg_dataset: the dataset configuration + traindata1: the first training data. shape: (num_samples, dim) + traindata2: the second training data. 
shape: (num_samples, dim) + + Returns: + traindata1: the first training data after CCA. shape: (num_samples, dim) + traindata2: the second training data after CCA. shape: (num_samples, dim) + corr_coeff: the correlation coefficient. shape: (dim,) + """ + # Check the shape of the training data + # zero mean data + traindata1, traindata1_mean = origin_centered(traindata1) + traindata2, traindata2_mean = origin_centered(traindata2) + self.traindata1_mean, self.traindata2_mean = traindata1_mean, traindata2_mean + + # check if training data is zero-mean + assert np.allclose( + traindata1.mean(axis=0), 0, atol=1e-3, rtol=1e-4 + ), f"traindata1align not zero mean: {max(abs(traindata1.mean(axis=0)))}" + assert np.allclose( + traindata2.mean(axis=0), 0, atol=1e-3, rtol=1e-4 + ), f"traindata2align not zero mean: {max(abs(traindata2.mean(axis=0)))}" + + # CCA dimensionality reduction + print((traindata1.T @ traindata1).shape) + sigma_z1_inv = np.linalg.inv(traindata1.T @ traindata1) + sigma_z1_inv_sqrt = sqrtm(sigma_z1_inv) + assert np.allclose( + sigma_z1_inv_sqrt @ sigma_z1_inv_sqrt, sigma_z1_inv + ), "sigma_z1_inv_sqrt is not the square root of sigma_z1_inv" + sigma_z2_inv = np.linalg.inv(traindata2.T @ traindata2) + sigma_z2_inv_sqrt = sqrtm(sigma_z2_inv) + assert np.allclose( + sigma_z2_inv_sqrt @ sigma_z2_inv_sqrt, sigma_z2_inv + ), "sigma_z2_inv_sqrt is not the square root of sigma_z2_inv" + + svd_mat = sigma_z1_inv_sqrt @ traindata1.T @ traindata2 @ sigma_z2_inv_sqrt + u, s, vh = np.linalg.svd(svd_mat) + assert np.allclose( + u @ np.diag(s) @ vh, svd_mat + ), "svd_mat is not the SVD of svd_mat" + + self.A = u @ sigma_z1_inv_sqrt + self.B = vh @ sigma_z2_inv_sqrt + + corr_coeff = np.ones((traindata2.shape[1],)) if cfg_dataset.equal_weights else s + assert ( + corr_coeff >= 0 + ).all(), f"Correlation should be non-negative. {corr_coeff}" + assert ( + corr_coeff <= 1 + ).all(), f"Correlation should be less than 1. {corr_coeff}" + self.corr_coeff = corr_coeff + self.traindata1, self.traindata2 = ( + (self.A @ traindata1.T).T, + (self.B @ traindata2.T).T, + ) + return self.traindata1, self.traindata2, corr_coeff + + def transform_data( + self, data1: tuple[np.ndarray, np.ndarray], data2: tuple[np.ndarray, np.ndarray] + ) -> tuple[np.ndarray, np.ndarray]: + """Transform the data using the fitted CCA model. + + Args: + data1: the first data. shape: (num_samples, dim) + data2: the second data. shape: (num_samples, dim) + + Returns: + data1: the first transformed data. shape: (num_samples, dim) + data2: the second transformed data. shape: (num_samples, dim) + """ + assert self.traindata1_mean is not None, "Please fit the cca model first." + assert self.traindata2_mean is not None, "Please fit the cca model first." + # zero mean data and transform + data1 = data1 - self.traindata1_mean + data2 = data2 - self.traindata2_mean + data1, data2 = (self.A @ data1.T).T, (self.B @ data2.T).T + return data1, data2 + + def save_model(self, path: str | Path) -> None: + """Save the CCA class. + + Args: + path: the path to save the class + """ + if isinstance(path, str): + path = Path(path) + with path.open("wb") as f: + pickle.dump(self, f) + + def load_model(self, path: str | Path) -> None: + """Load the CCA class. 
+ + Args: + path: the path to load the class + """ + if isinstance(path, str): + path = Path(path) + self.__dict__ = joblib.load(path.open("rb")).__dict__ diff --git a/mmda/utils/classification_dataset_class.py b/mmda/utils/classification_dataset_class.py index 00f67ca..93a948f 100644 --- a/mmda/utils/classification_dataset_class.py +++ b/mmda/utils/classification_dataset_class.py @@ -10,6 +10,7 @@ from mmda.utils.data_utils import load_clip_like_data, load_two_encoder_data from mmda.utils.dataset_utils import ( get_train_test_split_index, + load_handwriting, load_imagenet, load_leafy_spurge, shuffle_percentage_of_data, @@ -255,7 +256,140 @@ def classification(self, sim_fn: Union[callable, str]) -> float: # noqa: UP007 return np.mean(correct) -def load_classification_dataset(cfg: DictConfig) -> ImageNetDataset: +class HandwritingDataset(BaseClassificationDataset): + """Handwriting dataset class.""" + + def __init__(self, cfg: DictConfig) -> None: + """Initialize the dataset. + + Args: + cfg: configuration file + """ + super().__init__(cfg) + self.cfg = cfg + self.images, self.labels, self.num2alphabet, _ = load_handwriting( + cfg.handwriting + ) + # convert labels to int (str) + self.labels = [int(label.split(".")[0]) for label in self.labels] + self.labels = np.array(self.labels) - 1 + + def load_data( + self, + train_test_ratio: float, + clip_bool: bool = False, + shuffle_ratio: float = 0.0, + ) -> None: + """Load the data for ImageNet dataset. + + Args: + train_test_ratio: ratio of training data + clip_bool: whether to use CLIP-like method + shuffle_ratio: ratio of data to shuffle + """ + shuffle_ratio = shuffle_ratio + 1 # unused + self.train_test_ratio = train_test_ratio + if clip_bool: + _, self.img_emb, self.text_emb = load_clip_like_data(self.cfg) + else: + _, self.img_emb, self.text_emb = load_two_encoder_data(self.cfg) + train_size = int(self.train_test_ratio * self.img_emb.shape[0]) + self.train_img, self.test_img = ( + self.img_emb[:train_size], + self.img_emb[train_size:], + ) + self.train_text, self.test_text = ( + self.text_emb[:train_size], + self.text_emb[train_size:], + ) + self.train_idx, self.test_idx = ( + self.labels[:train_size], + self.labels[train_size:], + ) + print(self.train_img.shape, self.train_text.shape) + + def get_labels_emb(self) -> None: + """Get the text embeddings for all possible labels.""" + label_emb = [] + for num in range(len(self.num2alphabet)): + # find where the label is in the train_idx + assert ( + self.labels.shape[0] == self.text_emb.shape[0] + ), f"{self.labels.shape[0]}!={self.text_emb.shape[0]}" + label_idx_in_ds = np.where(self.labels == num)[0] + label_emb.append(self.text_emb[label_idx_in_ds[0]]) + self.labels_emb = np.array(label_emb) + assert self.labels_emb.shape[0] == len(self.num2alphabet) + + def classification(self, sim_fn: Union[callable, str]) -> float: # noqa: UP007 + """Classification task. 
+ + Args: + sim_fn: similarity function + Returns: + accuracy: classification accuracy + """ + assert np.allclose( + self.labels_emb[self.train_idx[0]], self.train_text[0], atol=1e-3, rtol=1e-4 + ), f"{self.labels_emb[self.train_idx[0]].shape}!={self.train_text[0].shape}" + + cfg = self.cfg + sim_scores = [] + if sim_fn == "asif": + # set parameters + non_zeros = min(cfg.asif.non_zeros, self.train_img.shape[0]) + range_anch = [ + 2**i + for i in range( + int(np.log2(non_zeros) + 1), + int(np.log2(len(self.train_img))) + 2, + ) + ] + range_anch = range_anch[-1:] # run just last anchor to be quick + val_labels = torch.zeros((1,), dtype=torch.float32) + # generate noise in the shape of the labels_emb + noise = np.random.rand( + self.test_img.shape[0] - self.labels_emb.shape[0], + self.labels_emb.shape[1], + ).astype(np.float32) + self.test_label = np.concatenate((self.labels_emb, noise), axis=0) + assert ( + self.test_img.shape[0] == self.test_label.shape[0] + ), f"{self.test_img.shape[0]}!={self.test_label.shape[0]}" + _anchors, scores, sim_score_matrix = zero_shot_classification( + torch.tensor(self.test_img, dtype=torch.float32), + torch.tensor(self.test_label, dtype=torch.float32), + torch.tensor(self.train_img, dtype=torch.float32), + torch.tensor(self.train_text, dtype=torch.float32), + val_labels, + non_zeros, + range_anch, + cfg.asif.val_exps, + max_gpu_mem_gb=cfg.asif.max_gpu_mem_gb, + ) + sim_score_matrix = sim_score_matrix.numpy().astype(np.float32)[:, :2] + sim_scores = sim_score_matrix.T # labels x test_img_size + else: + for label_idx in range(len(self.num2alphabet)): # 0 to 25 + label_emb = self.labels_emb[label_idx].reshape(1, -1) + label_emb = np.repeat(label_emb, self.test_text.shape[0], axis=0) + ################## + # sim_score_matrix = sim_fn(self.test_img, label_emb) + sim_score_matrix = sim_fn(self.test_text, label_emb) + # print(sim_score_matrix) + # input() + ################## + sim_scores.append(sim_score_matrix) + sim_scores = np.array(sim_scores) # labels x test_img_size + + most_similar_label_idx = np.argmax(sim_scores, axis=0) + correct = most_similar_label_idx == self.test_idx + return np.mean(correct) + + +def load_classification_dataset( + cfg: DictConfig, +) -> ImageNetDataset | LeafySpurgeDataset | HandwritingDataset: """Load the dataset for classification task. Args: @@ -267,6 +401,8 @@ def load_classification_dataset(cfg: DictConfig) -> ImageNetDataset: dataset = ImageNetDataset(cfg) elif cfg.dataset == "leafy_spurge": dataset = LeafySpurgeDataset(cfg) + elif cfg.dataset == "handwriting": + dataset = HandwritingDataset(cfg) else: msg = f"Dataset {cfg.dataset} not supported" raise ValueError(msg) diff --git a/mmda/utils/data_utils.py b/mmda/utils/data_utils.py index 487e655..fff6250 100644 --- a/mmda/utils/data_utils.py +++ b/mmda/utils/data_utils.py @@ -168,6 +168,19 @@ def load_two_encoder_data(cfg: DictConfig) -> tuple[DictConfig, np.ndarray, np.n + f"LeafySpurge_text_emb_{cfg_dataset.text_encoder}.pkl", ) ) + elif dataset == "handwriting": + data1 = joblib.load( + Path( + cfg_dataset.paths.save_path + + f"Handwriting_emb_{cfg_dataset.img_encoder}.pkl" + ) + ) + data2 = joblib.load( + Path( + cfg_dataset.paths.save_path + + f"Handwriting_text_emb_{cfg_dataset.text_encoder}.pkl" + ) + ) # TODO: add more datasets else: msg = f"Dataset {dataset} not supported." 
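Taken together, the pieces this patch adds compose into the handwriting pipeline
roughly as follows. An illustrative sketch using functions from this series (the
train/test split and config access are simplified here; the real flow goes through
get_embeddings.py and HandwritingDataset, and the embedding shapes are assumed):

    import numpy as np
    from omegaconf import DictConfig

    from mmda.utils.cca_class import ReNormalizedCCA
    from mmda.utils.dataset_utils import load_handwriting
    from mmda.utils.embed_data import chronos_ts, clip_imgs

    def handwriting_csa_features(cfg: DictConfig) -> tuple[np.ndarray, ...]:
        """Embed both views of the handwriting data and project them with CCA."""
        ts, labels, num2alphabet, alphabets_hand = load_handwriting(cfg.handwriting)
        ts_emb = chronos_ts(ts)                   # time-series view (per-sample embeddings)
        img_emb = clip_imgs(alphabets_hand, 256)  # handwritten-letter image view
        n_train = int(cfg.handwriting.train_test_ratios[0] * len(labels))
        cca = ReNormalizedCCA()
        z1, z2, corr = cca.fit_transform_train_data(
            cfg.handwriting, ts_emb[:n_train], img_emb[:n_train]
        )
        z1_test, z2_test = cca.transform_data(ts_emb[n_train:], img_emb[n_train:])
        return z1, z2, z1_test, z2_test, corr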
diff --git a/mmda/utils/dataset_utils.py b/mmda/utils/dataset_utils.py
index c91e06d..1cda708 100644
--- a/mmda/utils/dataset_utils.py
+++ b/mmda/utils/dataset_utils.py
@@ -10,7 +10,9 @@
 import joblib
 import numpy as np
 import pandas as pd
+from aeon.datasets import load_classification
 from omegaconf import DictConfig
+from PIL import Image
 
 import hydra
 from mmda.liploc.dataloaders.KittiBothDataset import KITTIBothDataset
@@ -18,6 +20,73 @@
 from mmda.utils.video_audio_utils import process_video_ids
 
+
+def load_handwriting(
+    cfg_dataset: DictConfig,
+) -> tuple[np.ndarray, np.ndarray, dict[str, tuple[str, str]], list[Image.Image]]:
+    """Load the Handwriting dataset (via the aeon toolkit, https://github.com/aeon-toolkit/aeon).
+
+    Args:
+        cfg_dataset: configuration file
+    Returns:
+        data: data. shape: (num_samples, 3, 152)
+        labels: labels. e.g. "1.0"
+        num2alphabet: a dict of index to alphabet
+        alphabets_hand: list of PIL images
+    """
+    assert cfg_dataset is not None, "cfg_dataset is None"
+    # train_x.shape: (150, 3, 152), test_x.shape: (850, 3, 152)
+    train_x, train_y = load_classification(
+        "Handwriting", split="train"
+    )  # np.ndarray, list[str]
+    test_x, test_y = load_classification("Handwriting", split="test")
+    # merge train and test
+    x = np.concatenate([train_x, test_x], axis=0)
+    y = np.concatenate([train_y, test_y], axis=0)
+    num2alphabet = {f"{i+1}.0": (chr(65 + i), chr(97 + i)) for i in range(26)}
+    np.random.seed(42)
+    idx = np.arange(x.shape[0])
+    np.random.shuffle(idx)
+    x = x[idx]
+    y = y[idx]
+
+    def load_alphabets_img() -> tuple[np.ndarray, np.ndarray]:
+        """Load the Kaggle A-Z handwritten alphabets dataset.
+
+        Returns:
+            data: flattened 28x28 grayscale images, one row per image
+            labels: integer labels in [0, 25]
+        """
+        import kagglehub
+
+        # Download latest version
+        path = kagglehub.dataset_download(
+            "sachinpatel21/az-handwritten-alphabets-in-csv-format"
+        )
+        df = pd.read_csv(path + "/A_Z Handwritten Data.csv")
+        labels = df.iloc[:, 0]
+        data = df.iloc[:, 1:]
+        return data, labels
+
+    alphabets_x, alphabets_y = load_alphabets_img()
+    alphabets_img = {}
+    for i in range(26):
+        alphabets_img[i + 1] = alphabets_x[alphabets_y == i][:100]
+
+    alphabets_hand = []
+    for i in range(x.shape[0]):
+        label = int(y[i].split(".")[0])
+        random_idx = np.random.choice(alphabets_img[label].shape[0])
+        random_df = alphabets_img[label].iloc[random_idx].to_numpy()
+        random_df = random_df.reshape(28, 28).astype(np.uint8)
+        alphabets_hand.append(Image.fromarray(random_df))
+    return (
+        x,
+        y,
+        num2alphabet,
+        alphabets_hand,
+    )
+
 
 def load_msrvtt(
     cfg_dataset: DictConfig,
 ) -> tuple[list[str], list[str], np.ndarray, list[str]]:
@@ -702,7 +771,7 @@ def shuffle_by_level(  # noqa: PLR0912, C901, ANN201
 
 @hydra.main(version_base=None, config_path="../../config", config_name="main")
 def main(cfg: DictConfig) -> None:  # noqa: D103
-    load_imagenet(cfg.imagenet)
+    load_handwriting(cfg.handwriting)
 
 
 if __name__ == "__main__":
diff --git a/mmda/utils/embed_data.py b/mmda/utils/embed_data.py
index 440f63a..49ae709 100644
--- a/mmda/utils/embed_data.py
+++ b/mmda/utils/embed_data.py
@@ -5,6 +5,7 @@
 import numpy as np
 import open_clip
 import torch
+from chronos import ChronosPipeline
 from PIL import Image, ImageFilter
 from sentence_transformers import SentenceTransformer
 from torchvision import transforms
@@ -19,6 +20,29 @@
 )
 
+
+def chronos_ts(ts: np.ndarray) -> np.ndarray:
+    """Extract time series features using Chronos model."""
+    num_data, channels, num_timestamps = ts.shape
+    pipeline = ChronosPipeline.from_pretrained(
+        "amazon/chronos-t5-large",
+        device_map="cuda",  # use "cpu" for CPU inference and "mps" for Apple Silicon
+ torch_dtype=torch.bfloat16, + ) + all_embeddings = [] + print("ts shape:", ts.shape) # (1000, 3, 152) + for channel in range(channels): + if channel > 0: + break + # context must be either a 1D tensor, a list of 1D tensors, + # or a left-padded 2D tensor with batch as the first dimension + context = torch.tensor(ts[:, channel, :]).reshape(num_data, num_timestamps) + embeddings, tokenizer_state = pipeline.embed(context) # (1000, 153, 1024) + all_embeddings.append( + embeddings[:, -1, :].detach().cpu().to(torch.float32).numpy() + ) + return np.concatenate(all_embeddings, axis=1) + + def cosplace_img(img_files: list, batch_size: int = 32) -> np.ndarray: """Extract image features using CosPlace model specifically trained for the Pittsburgh dataset. @@ -134,9 +158,6 @@ def clip_imgs( "hf-hub:laion/CLIP-ViT-bigG-14-laion2B-39B-b160k" ) model = model.cuda() - print("Loading CLIP model") - num_params = sum(p.numel() for p in model.parameters()) - print(f"Number of parameters in CLIP model: {num_params:,}") img_embeddings = [] with torch.no_grad(), torch.cuda.amp.autocast(): for i in tqdm(range(0, len(img_files), batch_size)): diff --git a/pyproject.toml b/pyproject.toml index 696a1a3..92fa172 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ authors = [ ] description = "Multimodal Feature Extraction" readme = "README.md" -requires-python = ">=3.10" +requires-python = ">=3.9,<3.13" classifiers = [ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", @@ -32,7 +32,7 @@ description = "Multimodal Data Alignment" readme = "README.md" [tool.poetry.dependencies] -python = "^3.10" +python = ">=3.9,<3.13" ruff = "*" numpy = "^1.19" pandas = "*" @@ -48,9 +48,9 @@ hydra-core = "1.3.2" hydra-joblib-launcher = "1.2.0" protobuf = "3.20.*" torchvision = "*" +torchaudio = "*" ipython = "^8.15.0" scipy = "^1.11.2" -swarm-visualizer = {git = "https://github.com/UTAustin-SwarmLab/Swarm-Visualization"} cca-zoo = "^2.5.0" ipykernel = "^6.29.3" jupyter = "^1.0.0" @@ -70,8 +70,11 @@ timm = "*" albumentations = "*" kaggle = "*" moviepy = "*" -imagebind = {git = "https://github.com/facebookresearch/ImageBind"} +# imagebind = {git = "https://github.com/facebookresearch/ImageBind"} # LLaVA = {git = "https://github.com/haotian-liu/LLaVA.git"} # contradicting with imagebind +chronos = {git = "https://github.com/amazon-science/chronos-forecasting.git"} +aeon = {git = "https://github.com/aeon-toolkit/aeon.git", branch = "main"} +kagglehub = "*" [project.optional-dependencies] dev = ["black", "ruff", "mypy"] From a19684279de5c612860fcd0171ef3cf7c60775d1 Mon Sep 17 00:00:00 2001 From: d31003 Date: Thu, 28 Nov 2024 15:57:52 -0600 Subject: [PATCH 09/10] multi-class roc --- bash_scripts/handwriting_script.sh | 13 ++- config/main.yaml | 4 +- mmda/bimodal_classification.py | 8 +- mmda/exps/classification.py | 7 +- mmda/get_embeddings.py | 65 +++++------ mmda/tsfresh_features.py | 119 +++++++++++++++++++++ mmda/utils/cca_class.py | 32 +++--- mmda/utils/classification_dataset_class.py | 19 ++-- mmda/utils/data_utils.py | 9 +- mmda/utils/dataset_utils.py | 11 +- pyproject.toml | 5 +- 11 files changed, 205 insertions(+), 87 deletions(-) create mode 100644 mmda/tsfresh_features.py diff --git a/bash_scripts/handwriting_script.sh b/bash_scripts/handwriting_script.sh index 7e20846..7cd0bbe 100644 --- a/bash_scripts/handwriting_script.sh +++ b/bash_scripts/handwriting_script.sh @@ -1,7 +1,10 @@ # classification -CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py 
dataset=handwriting handwriting.sim_dim=10 -CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=25 +# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=10 +# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=25 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=30 CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=50 -CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=100 -CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=200 -CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=700 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=60 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=70 +# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=100 +# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=200 +# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=500 diff --git a/config/main.yaml b/config/main.yaml index dbda3a4..a3f480a 100644 --- a/config/main.yaml +++ b/config/main.yaml @@ -106,9 +106,9 @@ imagenet: handwriting: sim_dim: 700 # dimension of the similarity score and the CCA transformation equal_weights: False - img_encoder: "chronos" + img_encoder: "tsfresh" text_encoder: "clip" - train_test_ratios: [0.9] + train_test_ratios: [0.85] shuffle: False paths: dataset_path: "/nas/pohan/datasets/Handwriting/" diff --git a/mmda/bimodal_classification.py b/mmda/bimodal_classification.py index 415018e..f8569b3 100644 --- a/mmda/bimodal_classification.py +++ b/mmda/bimodal_classification.py @@ -52,13 +52,9 @@ def main(cfg: DictConfig) -> None: # noqa: C901, PLR0915, PLR0912 f.write(f"{shuffle_ratio},{cca_accs},{asif_accs}\n") else: for train_test_ratio in cfg_dataset.train_test_ratios: - asif_accs = asif_classification(cfg, train_test_ratio) cca_accs = cca_classification(cfg, train_test_ratio) - clip_accs = ( - clip_like_classification(cfg, train_test_ratio) - if cfg.dataset != "handwriting" - else 0 - ) + asif_accs = 0 if True else asif_classification(cfg, train_test_ratio) + clip_accs = 0 if True else clip_like_classification(cfg, train_test_ratio) # write accuracy to file if not csv_save_path.exists(): # create the file and write the header diff --git a/mmda/exps/classification.py b/mmda/exps/classification.py index 26968fd..cb7bcbe 100644 --- a/mmda/exps/classification.py +++ b/mmda/exps/classification.py @@ -20,15 +20,14 @@ def cca_classification( Returns: data_size2accuracy: {data_size: accuracy} """ - print("CCA") cfg_dataset = cfg[cfg.dataset] + print(f"CCA {cfg_dataset.sim_dim}") ds = load_classification_dataset(cfg) ds.load_data(train_test_ratio, clip_bool=False, shuffle_ratio=shuffle_ratio) - cca = ReNormalizedCCA() if cfg.dataset == "handwriting" else NormalizedCCA() + cca = ReNormalizedCCA() if True else NormalizedCCA() ds.train_img, ds.train_text, corr = cca.fit_transform_train_data( cfg_dataset, 
ds.train_img, ds.train_text ) - print("corr", corr) ds.test_img, ds.test_text = cca.transform_data(ds.test_img, ds.test_text) ds.get_labels_emb() @@ -50,6 +49,7 @@ def clip_like_classification(cfg: DictConfig, train_test_ratio: float) -> float: Returns: data_size2accuracy: {data_size: accuracy} """ + print("CLIP-like") ds = load_classification_dataset(cfg) ds.load_data(train_test_ratio, clip_bool=True) ds.get_labels_emb() @@ -68,6 +68,7 @@ def asif_classification( Returns: data_size2accuracy: {data_size: accuracy} """ + print("ASIF") ds = load_classification_dataset(cfg) ds.load_data(train_test_ratio, clip_bool=False, shuffle_ratio=shuffle_ratio) ds.get_labels_emb() diff --git a/mmda/get_embeddings.py b/mmda/get_embeddings.py index 322577e..c74ec18 100644 --- a/mmda/get_embeddings.py +++ b/mmda/get_embeddings.py @@ -560,39 +560,35 @@ def main(cfg: DictConfig) -> None: # noqa: PLR0915, C901, PLR0912 print("CLIP embeddings saved") elif dataset == "handwriting": - # sentence_26 = { - # 1: "apple.", - # 2: "ball.", - # 3: "cat.", - # 4: "dog.", - # 5: "elephant.", - # 6: "fish.", - # 7: "giraffe.", - # 8: "hat.", - # 9: "ice cream.", - # 10: "jaguar.", - # 11: "kangaroo.", - # 12: "lion.", - # 13: "monkey.", - # 14: "nest.", - # 15: "owl.", - # 16: "penguin.", - # 17: "queen.", - # 18: "rabbit.", - # 19: "snake.", - # 20: "tiger.", - # 21: "umbrella.", - # 22: "vase.", - # 23: "whale.", - # 24: "x-ray.", - # 25: "yak.", - # 26: "zebra.", - # } data, labels, num2alphabet, alphabets_hand = load_handwriting(cfg_dataset) - # sentences = [sentence_26[int(label.split(".")[0])] for label in labels] - # int_labels = [int(label.split(".")[0]) - 1 for label in labels] + # save data + with Path(cfg_dataset.paths.save_path, "Handwriting_data.pkl").open("wb") as f: + pickle.dump(data, f) + print("Handwriting data saved") + return - embeddings = chronos_ts(data) if False else data.reshape(data.shape[0], -1) + embeddings = clip_imgs(alphabets_hand, 256) + print("text shape:", embeddings.shape) + with Path(cfg_dataset.paths.save_path, "Handwriting_emb_clip.pkl").open( + "wb" + ) as f: + pickle.dump(embeddings, f) + print("CLIP embeddings saved") + + sentences = [f"Alphabet {num2alphabet[label]}." for label in labels] + print(sentences[15:21]) + embeddings = gtr_text(sentences) + assert np.allclose( + embeddings[15], embeddings[20], atol=1e-3, rtol=1e-4 + ), f"{embeddings[15].shape}!={embeddings[20].shape}" + with Path(cfg_dataset.paths.save_path, "Handwriting_emb_gtr.pkl").open( + "wb" + ) as f: + pickle.dump(embeddings, f) + print("GTR shape:", embeddings.shape) + print("GTR embeddings saved") + + embeddings = chronos_ts(data) # check if embeddings has unique rows assert embeddings.shape[0] == len( np.unique(embeddings, axis=0) @@ -604,13 +600,6 @@ def main(cfg: DictConfig) -> None: # noqa: PLR0915, C901, PLR0912 pickle.dump(embeddings, f) print("Chronos embeddings saved") - embeddings = clip_imgs(alphabets_hand, 256) - print("text shape:", embeddings.shape) - with Path(cfg_dataset.paths.save_path, "Handwriting_text_emb_clip.pkl").open( - "wb" - ) as f: - pickle.dump(embeddings, f) - print("CLIP embeddings saved") # TODO: add more datasets else: msg = f"Dataset {dataset} not supported." 
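A note on the pickles written above: every Handwriting embedding is saved as Handwriting_emb_{encoder}.pkl under the dataset's save path, which is the naming the data_utils loaders are switched to later in this patch. A minimal sketch of the consumer side, with an assumed embeddings directory and illustrative encoder names (the config in this series uses "tsfresh" and, eventually, "gtr"):

import pickle
from pathlib import Path

save_path = Path("/nas/pohan/datasets/Handwriting/embeddings/")  # assumed embeddings dir
img_encoder, text_encoder = "tsfresh", "gtr"  # illustrative encoder names

with (save_path / f"Handwriting_emb_{img_encoder}.pkl").open("rb") as f:
    img_emb = pickle.load(f)
with (save_path / f"Handwriting_emb_{text_encoder}.pkl").open("rb") as f:
    text_emb = pickle.load(f)

# Both objects must keep one row per sample, in the same order produced by
# load_handwriting(), or the CCA training pairs will be misaligned.
print(img_emb.shape, text_emb.shape)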
diff --git a/mmda/tsfresh_features.py b/mmda/tsfresh_features.py
new file mode 100644
index 0000000..5f1beea
--- /dev/null
+++ b/mmda/tsfresh_features.py
@@ -0,0 +1,119 @@
+"""Extract tsfresh features from the Handwriting dataset."""
+
+import pickle
+from pathlib import Path
+
+import kagglehub
+import numpy as np
+import pandas as pd
+from aeon.datasets import load_classification
+from PIL import Image
+from tsfresh import extract_features
+
+PATH = "/nas/pohan/datasets/Handwriting/"
+PATH_SAVE = "/nas/pohan/datasets/Handwriting/embeddings/"
+
+
+def load_handwriting() -> tuple[np.ndarray, np.ndarray, dict[str, str], list[Path]]:
+    """Load the Handwriting dataset (via the aeon toolkit, https://github.com/aeon-toolkit/aeon).
+
+    Note:
+        This standalone copy takes no arguments; paths come from the module-level constants.
+    Returns:
+        data: data. shape: (num_samples, 3, 152)
+        labels: labels. e.g. "1.0"
+        num2alphabet: a dict of index to alphabet
+        alphabets_hand: list of paths to the rendered alphabet PNG images
+    """
+    # train_x.shape: (150, 3, 152), test_x.shape: (850, 3, 152)
+    train_x, train_y = load_classification(
+        "Handwriting", split="train"
+    )  # np.ndarray, list[str]
+    test_x, test_y = load_classification("Handwriting", split="test")
+    # merge train and test
+    x = np.concatenate([train_x, test_x], axis=0)
+    y = np.concatenate([train_y, test_y], axis=0)
+    num2alphabet = {f"{i+1}.0": chr(65 + i) for i in range(26)}
+    idx = np.arange(x.shape[0])
+    x = x[idx]
+    y = y[idx]
+
+    def load_alphabets_img() -> tuple[np.ndarray, np.ndarray]:
+        """Load the Kaggle A-Z handwritten alphabets dataset.
+
+        Returns:
+            data: flattened 28x28 grayscale images, one row per image
+            labels: integer labels in [0, 25]
+        """
+        # Download latest version
+        path = kagglehub.dataset_download(
+            "sachinpatel21/az-handwritten-alphabets-in-csv-format"
+        )
+        df = pd.read_csv(path + "/A_Z Handwritten Data.csv")
+        labels = df.iloc[:, 0]
+        data = df.iloc[:, 1:]
+        return data, labels
+
+    alphabets_x, alphabets_y = load_alphabets_img()
+    alphabets_img = {}
+    for i in range(26):
+        alphabets_img[i + 1] = alphabets_x[alphabets_y == i][:100]
+
+    alphabets_hand = []
+    for i in range(x.shape[0]):
+        label = int(y[i].split(".")[0])
+        random_idx = np.random.choice(alphabets_img[label].shape[0])
+        random_df = alphabets_img[label].iloc[random_idx].to_numpy()
+        random_df = random_df.reshape(28, 28).astype(np.uint8)
+        # save image to png
+        path = Path(PATH, f"alphabet_{label}_{random_idx}.png")
+        Image.fromarray(random_df, mode="L").save(path)
+        alphabets_hand.append(path)
+    return (
+        x,
+        y,
+        num2alphabet,
+        alphabets_hand,
+    )
+
+
+def tsfresh_features() -> None:
+    """Extract tsfresh features from the data. 
+
+    Saves the tsfresh feature matrix (one row per sample) to PATH_SAVE as
+    Handwriting_emb_tsfresh.pkl.
+    """
+    data, labels, num2alphabet, alphabets_hand = load_handwriting()
+
+    path = Path(PATH_SAVE, "Handwriting_tsfresh.csv")
+
+    if path.exists():
+        df = pd.read_csv(path)
+    else:
+        # convert data to a long-format df (one row per sample-timestamp pair)
+        # column_id: id, column_sort: time, values: 3 channels
+        df = pd.DataFrame(columns=["id", "time", "channel_1", "channel_2", "channel_3"])
+        for idx in range(data.shape[0]):
+            for time in range(data.shape[2]):  # 152
+                df.loc[idx * data.shape[2] + time, "id"] = idx
+                df.loc[idx * data.shape[2] + time, "time"] = time
+                df.loc[idx * data.shape[2] + time, "channel_1"] = data[idx, 0, time]
+                df.loc[idx * data.shape[2] + time, "channel_2"] = data[idx, 1, time]
+                df.loc[idx * data.shape[2] + time, "channel_3"] = data[idx, 2, time]
+        print(df.head())
+        print(df.tail())
+
+        df.to_csv(path, index=False)
+    ts_features = extract_features(df, column_id="id", column_sort="time")
+    ts_features = ts_features.dropna(axis=1)
+    print(type(ts_features))
+    print(ts_features.shape)
+    print(ts_features.head())
+    print("ts_features shape:", ts_features.shape)
+    with Path(PATH_SAVE, "Handwriting_emb_tsfresh.pkl").open("wb") as f:
+        pickle.dump(ts_features, f)
+    print("TSFresh features saved")
+
+
+if __name__ == "__main__":
+    tsfresh_features()
diff --git a/mmda/utils/cca_class.py b/mmda/utils/cca_class.py
index bca76d1..05bb8d5 100644
--- a/mmda/utils/cca_class.py
+++ b/mmda/utils/cca_class.py
@@ -67,7 +67,7 @@ def fit_transform_train_data(
             corr_coeff >= 0
         ).all(), f"Correlation should be non-negative. {corr_coeff}"
         assert (
-            corr_coeff <= 1
+            corr_coeff <= 1.05  # noqa: PLR2004
         ).all(), f"Correlation should be less than 1. {corr_coeff}"
         self.corr_coeff = corr_coeff
         self.traindata1, self.traindata2 = traindata1, traindata2
@@ -141,6 +141,8 @@ def fit_transform_train_data(
             corr_coeff: the correlation coefficient. shape: (dim,)
         """
         # Check the shape of the training data
+        traindata1 = traindata1.astype(np.float32)
+        traindata2 = traindata2.astype(np.float32)
         # zero mean data
         traindata1, traindata1_mean = origin_centered(traindata1)
         traindata2, traindata2_mean = origin_centered(traindata2)
@@ -155,23 +157,15 @@ def fit_transform_train_data(
         ), f"traindata2align not zero mean: {max(abs(traindata2.mean(axis=0)))}"
 
         # CCA dimensionality reduction
-        print((traindata1.T @ traindata1).shape)
-        sigma_z1_inv = np.linalg.inv(traindata1.T @ traindata1)
+        sigma_z1_inv = np.linalg.inv(
+            traindata1.T @ traindata1 + np.eye(traindata1.shape[1]) * 1e-5
+        )
         sigma_z1_inv_sqrt = sqrtm(sigma_z1_inv)
-        assert np.allclose(
-            sigma_z1_inv_sqrt @ sigma_z1_inv_sqrt, sigma_z1_inv
-        ), "sigma_z1_inv_sqrt is not the square root of sigma_z1_inv"
         sigma_z2_inv = np.linalg.inv(traindata2.T @ traindata2)
         sigma_z2_inv_sqrt = sqrtm(sigma_z2_inv)
-        assert np.allclose(
-            sigma_z2_inv_sqrt @ sigma_z2_inv_sqrt, sigma_z2_inv
-        ), "sigma_z2_inv_sqrt is not the square root of sigma_z2_inv"
 
         svd_mat = sigma_z1_inv_sqrt @ traindata1.T @ traindata2 @ sigma_z2_inv_sqrt
         u, s, vh = np.linalg.svd(svd_mat)
-        assert np.allclose(
-            u @ np.diag(s) @ vh, svd_mat
-        ), "svd_mat is not the SVD of svd_mat"
         self.A = u @ sigma_z1_inv_sqrt
         self.B = vh @ sigma_z2_inv_sqrt
 
@@ -180,13 +174,12 @@ def fit_transform_train_data(
         assert (
             corr_coeff >= 0
         ).all(), f"Correlation should be non-negative. {corr_coeff}"
-        assert (
-            corr_coeff <= 1
-        ).all(), f"Correlation should be less than 1. 
{corr_coeff}" self.corr_coeff = corr_coeff + if self.sim_dim is None: + self.sim_dim = cfg_dataset.sim_dim self.traindata1, self.traindata2 = ( - (self.A @ traindata1.T).T, - (self.B @ traindata2.T).T, + (self.A @ traindata1.T).T[:, : self.sim_dim], + (self.B @ traindata2.T).T[:, : self.sim_dim], ) return self.traindata1, self.traindata2, corr_coeff @@ -203,12 +196,15 @@ def transform_data( data1: the first transformed data. shape: (num_samples, dim) data2: the second transformed data. shape: (num_samples, dim) """ + data1 = data1.astype(np.float32) + data2 = data2.astype(np.float32) assert self.traindata1_mean is not None, "Please fit the cca model first." assert self.traindata2_mean is not None, "Please fit the cca model first." # zero mean data and transform data1 = data1 - self.traindata1_mean data2 = data2 - self.traindata2_mean - data1, data2 = (self.A @ data1.T).T, (self.B @ data2.T).T + data1 = (self.A @ data1.T).T[:, : self.sim_dim] + data2 = (self.B @ data2.T).T[:, : self.sim_dim] return data1, data2 def save_model(self, path: str | Path) -> None: diff --git a/mmda/utils/classification_dataset_class.py b/mmda/utils/classification_dataset_class.py index 93a948f..84ec87c 100644 --- a/mmda/utils/classification_dataset_class.py +++ b/mmda/utils/classification_dataset_class.py @@ -5,6 +5,7 @@ import numpy as np import torch from omegaconf import DictConfig +from sklearn.metrics import roc_auc_score from mmda.baselines.asif_core import zero_shot_classification from mmda.utils.data_utils import load_clip_like_data, load_two_encoder_data @@ -306,7 +307,6 @@ def load_data( self.labels[:train_size], self.labels[train_size:], ) - print(self.train_img.shape, self.train_text.shape) def get_labels_emb(self) -> None: """Get the text embeddings for all possible labels.""" @@ -332,7 +332,6 @@ def classification(self, sim_fn: Union[callable, str]) -> float: # noqa: UP007 assert np.allclose( self.labels_emb[self.train_idx[0]], self.train_text[0], atol=1e-3, rtol=1e-4 ), f"{self.labels_emb[self.train_idx[0]].shape}!={self.train_text[0].shape}" - cfg = self.cfg sim_scores = [] if sim_fn == "asif": @@ -373,15 +372,19 @@ def classification(self, sim_fn: Union[callable, str]) -> float: # noqa: UP007 for label_idx in range(len(self.num2alphabet)): # 0 to 25 label_emb = self.labels_emb[label_idx].reshape(1, -1) label_emb = np.repeat(label_emb, self.test_text.shape[0], axis=0) - ################## - # sim_score_matrix = sim_fn(self.test_img, label_emb) - sim_score_matrix = sim_fn(self.test_text, label_emb) - # print(sim_score_matrix) - # input() - ################## + sim_score_matrix = sim_fn(self.test_img, label_emb) sim_scores.append(sim_score_matrix) sim_scores = np.array(sim_scores) # labels x test_img_size + # ROC with scikit-learn + y = self.test_idx + sim_scores_t = sim_scores.T # test_img_size x labels + sim_scores_t = np.nan_to_num(sim_scores_t, nan=0.0) + pred_y = sim_scores_t / sim_scores_t.sum(axis=1, keepdims=True) + roc_auc = roc_auc_score(y, pred_y, multi_class="ovr") + print(f"ROC AUC: {roc_auc}") + + # accuracy most_similar_label_idx = np.argmax(sim_scores, axis=0) correct = most_similar_label_idx == self.test_idx return np.mean(correct) diff --git a/mmda/utils/data_utils.py b/mmda/utils/data_utils.py index fff6250..f3dde38 100644 --- a/mmda/utils/data_utils.py +++ b/mmda/utils/data_utils.py @@ -178,7 +178,7 @@ def load_two_encoder_data(cfg: DictConfig) -> tuple[DictConfig, np.ndarray, np.n data2 = joblib.load( Path( cfg_dataset.paths.save_path - + 
f"Handwriting_text_emb_{cfg_dataset.text_encoder}.pkl" + + f"Handwriting_emb_{cfg_dataset.text_encoder}.pkl" ) ) # TODO: add more datasets @@ -265,6 +265,13 @@ def load_clip_like_data(cfg: DictConfig) -> tuple[DictConfig, np.ndarray, np.nda cfg_dataset.paths.save_path + "LeafySpurge_text_emb_clip.pkl", ) ) + elif dataset == "handwriting": + data1 = joblib.load( + Path(cfg_dataset.paths.save_path + "Handwriting_emb_gtr.pkl") + ) + data2 = joblib.load( + Path(cfg_dataset.paths.save_path + "Handwriting_emb_gtr.pkl") + ) # TODO: add more datasets else: msg = f"Dataset {dataset} not supported." diff --git a/mmda/utils/dataset_utils.py b/mmda/utils/dataset_utils.py index 1cda708..9b7cc03 100644 --- a/mmda/utils/dataset_utils.py +++ b/mmda/utils/dataset_utils.py @@ -42,10 +42,8 @@ def load_handwriting( # merge train and test x = np.concatenate([train_x, test_x], axis=0) y = np.concatenate([train_y, test_y], axis=0) - num2alphabet = {f"{i+1}.0": (chr(65 + i), chr(97 + i)) for i in range(26)} - np.random.seed(42) + num2alphabet = {f"{i+1}.0": chr(65 + i) for i in range(26)} idx = np.arange(x.shape[0]) - np.random.shuffle(idx) x = x[idx] y = y[idx] @@ -78,7 +76,12 @@ def load_alphabets_img() -> tuple[np.ndarray, np.ndarray]: random_idx = np.random.choice(alphabets_img[label].shape[0]) random_df = alphabets_img[label].iloc[random_idx].to_numpy() random_df = random_df.reshape(28, 28).astype(np.uint8) - alphabets_hand.append(Image.fromarray(random_df)) + # save image to png + path = Path( + cfg_dataset.paths.dataset_path, f"alphabet_{label}_{random_idx}.png" + ) + Image.fromarray(random_df, mode="L").save(path) + alphabets_hand.append(path) return ( x, y, diff --git a/pyproject.toml b/pyproject.toml index 92fa172..878646c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ description = "Multimodal Data Alignment" readme = "README.md" [tool.poetry.dependencies] +matplotlib = "*" python = ">=3.9,<3.13" ruff = "*" numpy = "^1.19" @@ -50,7 +51,7 @@ protobuf = "3.20.*" torchvision = "*" torchaudio = "*" ipython = "^8.15.0" -scipy = "^1.11.2" +scipy = "*" cca-zoo = "^2.5.0" ipykernel = "^6.29.3" jupyter = "^1.0.0" @@ -72,7 +73,7 @@ kaggle = "*" moviepy = "*" # imagebind = {git = "https://github.com/facebookresearch/ImageBind"} # LLaVA = {git = "https://github.com/haotian-liu/LLaVA.git"} # contradicting with imagebind -chronos = {git = "https://github.com/amazon-science/chronos-forecasting.git"} +# chronos = {git = "https://github.com/amazon-science/chronos-forecasting.git"} aeon = {git = "https://github.com/aeon-toolkit/aeon.git", branch = "main"} kagglehub = "*" From 1e51fbd1bf0787aa31b63f78dae4771045149f31 Mon Sep 17 00:00:00 2001 From: d31003 Date: Thu, 23 Jan 2025 17:14:19 -0600 Subject: [PATCH 10/10] handwriting done --- bash_scripts/handwriting_script.sh | 13 +++++-------- config/main.yaml | 6 +++--- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/bash_scripts/handwriting_script.sh b/bash_scripts/handwriting_script.sh index 7cd0bbe..d224546 100644 --- a/bash_scripts/handwriting_script.sh +++ b/bash_scripts/handwriting_script.sh @@ -1,10 +1,7 @@ # classification -# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=10 -# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=25 -CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=30 +CUDA_VISIBLE_DEVICES=1 poetry run python 
mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=10 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=25 CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=50 -CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=60 -CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=70 -# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=100 -# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=200 -# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=500 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=100 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=200 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=500 diff --git a/config/main.yaml b/config/main.yaml index a3f480a..638897e 100644 --- a/config/main.yaml +++ b/config/main.yaml @@ -104,10 +104,10 @@ imagenet: label_embeddings: ${imagenet.paths.dataset_path}_${text_encoder}_label_embeddings.npy handwriting: - sim_dim: 700 # dimension of the similarity score and the CCA transformation - equal_weights: False + sim_dim: 50 # dimension of the similarity score and the CCA transformation + equal_weights: True img_encoder: "tsfresh" - text_encoder: "clip" + text_encoder: "gtr" train_test_ratios: [0.85] shuffle: False paths: