diff --git a/.gitignore b/.gitignore index 4000a22..20dbc72 100644 --- a/.gitignore +++ b/.gitignore @@ -171,4 +171,5 @@ plots/* # lock files *.lock .checkpoints/ -.assets/ \ No newline at end of file +.assets/ +*.keras \ No newline at end of file diff --git a/bash_scripts/handwriting_script.sh b/bash_scripts/handwriting_script.sh new file mode 100644 index 0000000..d224546 --- /dev/null +++ b/bash_scripts/handwriting_script.sh @@ -0,0 +1,7 @@ +# classification +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=10 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=25 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=50 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=100 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=200 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim=500 diff --git a/bash_scripts/imagenet_script.sh b/bash_scripts/imagenet_script.sh index c2f0561..8b8fb21 100644 --- a/bash_scripts/imagenet_script.sh +++ b/bash_scripts/imagenet_script.sh @@ -1,3 +1,4 @@ +# mislabeled data # CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=10 # CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=25 # CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=50 @@ -15,6 +16,12 @@ # CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet train_test_ratio=0.1 imagenet.sim_dim=500 # CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet train_test_ratio=0.1 imagenet.sim_dim=700 -# # classification +# classification # CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=imagenet -CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=imagenet imagenet.shuffle=True +# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=imagenet imagenet.shuffle=True + +# SVM +# poetry run python mmda/linear_svm_clip.py train_test_ratio=0.1 +# poetry run python mmda/linear_svm_clip.py train_test_ratio=0.3 +# poetry run python mmda/linear_svm_clip.py train_test_ratio=0.5 +# poetry run python mmda/linear_svm_clip.py train_test_ratio=0.7 \ No newline at end of file diff --git a/bash_scripts/leafy_script.sh b/bash_scripts/leafy_script.sh index d7d9d6a..573a5b2 100644 --- a/bash_scripts/leafy_script.sh +++ b/bash_scripts/leafy_script.sh @@ -1,4 +1,9 @@ # CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=leafy_spurge leafy_spurge.sim_dim=10 # CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=leafy_spurge leafy_spurge.sim_dim=50 # CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=leafy_spurge leafy_spurge.sim_dim=100 -CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=leafy_spurge leafy_spurge.sim_dim=250 +# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=leafy_spurge leafy_spurge.sim_dim=250 + +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/linear_svm_clip.py dataset=leafy_spurge 
leafy_spurge.sim_dim=250 train_test_ratio=0.4 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/linear_svm_clip.py dataset=leafy_spurge leafy_spurge.sim_dim=250 train_test_ratio=0.6 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/linear_svm_clip.py dataset=leafy_spurge leafy_spurge.sim_dim=250 train_test_ratio=0.7 +CUDA_VISIBLE_DEVICES=1 poetry run python mmda/linear_svm_clip.py dataset=leafy_spurge leafy_spurge.sim_dim=250 train_test_ratio=0.888 diff --git a/config/main.yaml b/config/main.yaml index 80ce620..638897e 100644 --- a/config/main.yaml +++ b/config/main.yaml @@ -7,7 +7,7 @@ noisy_train_set: True repo_root: "/home/pl22767/Project/MMDA/" # repo_root: "/home/po-han/Desktop/Projects/MMDA/" -dataset: "MSRVTT" +dataset: "handwriting" dataset_level_datasets: [pitts, imagenet, cosmos, sop, tiil, musiccaps, flickr] class_level_datasets: [sop] object_level_datasets: [pitts, sop] @@ -16,7 +16,7 @@ retrieval_datasets: [flickr] any_retrieval_datasets: [KITTI, MSRVTT, BTC] shuffle_llava_datasets: [pitts, sop] # datasets whose plots contains llava mislabel_llava_datasets: [imagenet] -classification_datasets: [imagenet, leafy_spurge] +classification_datasets: [imagenet, leafy_spurge, handwriting] dataset_size: { sop: 56222, musiccaps: 5397, @@ -39,7 +39,6 @@ BTC: save_path: ${BTC.paths.dataset_path}/any2any/ plots_path: ${repo_root}plots/BTC/ - MSRVTT: img_encoder: "clip" audio_encoder: "clap" @@ -92,9 +91,10 @@ musiccaps: imagenet: sim_dim: 700 # dimension of the similarity score and the CCA transformation equal_weights: False - img_encoder: "dino" - text_encoder: "gtr" - train_test_ratios: [0.7] #, 0.3, 0.5, 0.7] + img_encoder: "clipopenai" + text_encoder: "clipdatacomp_xl_s13b_b90k" + model_name: "openai" + train_test_ratios: [0.3, 0.5, 0.7] shuffle_ratios: [0.1, 0.3, 0.5, 0.7, 1.0] shuffle: False paths: @@ -103,6 +103,19 @@ imagenet: plots_path: ${repo_root}plots/ImageNet/ label_embeddings: ${imagenet.paths.dataset_path}_${text_encoder}_label_embeddings.npy +handwriting: + sim_dim: 50 # dimension of the similarity score and the CCA transformation + equal_weights: True + img_encoder: "tsfresh" + text_encoder: "gtr" + train_test_ratios: [0.85] + shuffle: False + paths: + dataset_path: "/nas/pohan/datasets/Handwriting/" + save_path: ${handwriting.paths.dataset_path}embeddings/ + plots_path: ${repo_root}plots/handwriting/ + label_embeddings: ${handwriting.paths.dataset_path}_${text_encoder}_label_embeddings.npy + leafy_spurge: sim_dim: 700 # dimension of the similarity score and the CCA transformation equal_weights: False diff --git a/mmda/bimodal_classification.py b/mmda/bimodal_classification.py index 83caf22..f8569b3 100644 --- a/mmda/bimodal_classification.py +++ b/mmda/bimodal_classification.py @@ -15,7 +15,7 @@ @hydra.main(version_base=None, config_path="../config", config_name="main") -def main(cfg: DictConfig) -> None: +def main(cfg: DictConfig) -> None: # noqa: C901, PLR0915, PLR0912 """Main function to generate the classification results of the bimodal datasets. Args: @@ -27,7 +27,12 @@ def main(cfg: DictConfig) -> None: ), f"{cfg.dataset} is not for classification." 
cfg_dataset = cfg[cfg.dataset] shuffle_tag = "shuffled" if cfg_dataset.shuffle else "" - ds_size = 50_000 if cfg.dataset == "imagenet" else 900 + if cfg.dataset == "imagenet": + ds_size = 50_000 + elif cfg.dataset == "leafy_spurge": + ds_size = 900 + elif cfg.dataset == "handwriting": + ds_size = 1000 csv_save_path = ( Path(cfg_dataset.paths.plots_path) / f"classify_{cfg_dataset.text_encoder}_{cfg_dataset.img_encoder}/" @@ -47,9 +52,9 @@ def main(cfg: DictConfig) -> None: f.write(f"{shuffle_ratio},{cca_accs},{asif_accs}\n") else: for train_test_ratio in cfg_dataset.train_test_ratios: - asif_accs = asif_classification(cfg, train_test_ratio) cca_accs = cca_classification(cfg, train_test_ratio) - clip_accs = clip_like_classification(cfg, train_test_ratio) + asif_accs = 0 if True else asif_classification(cfg, train_test_ratio) + clip_accs = 0 if True else clip_like_classification(cfg, train_test_ratio) # write accuracy to file if not csv_save_path.exists(): # create the file and write the header @@ -77,7 +82,7 @@ def main(cfg: DictConfig) -> None: label="CSA (ours)", color="blue", ) - if not cfg_dataset.shuffle: + if not cfg_dataset.shuffle and cfg.dataset != "handwriting": clip_accs = df["clip_accs"] ax.plot( ratios, @@ -99,7 +104,12 @@ def main(cfg: DictConfig) -> None: ax.set_ylabel("Classification accuracy", fontsize=20) ax.xaxis.set_tick_params(labelsize=15) ax.yaxis.set_tick_params(labelsize=15) - ax.set_ylim(0, 1.03) if cfg.dataset == "imagenet" else ax.set_ylim(0.4, 0.65) + if cfg.dataset == "imagenet": + ax.set_ylim(0, 1.03) + elif cfg.dataset == "leafy_spurge": + ax.set_ylim(0.4, 0.65) + else: + ax.set_ylim(0, 1.03) ( ax.legend(loc="lower right", fontsize=18) if not cfg_dataset.shuffle diff --git a/mmda/exps/classification.py b/mmda/exps/classification.py index 7535323..cb7bcbe 100644 --- a/mmda/exps/classification.py +++ b/mmda/exps/classification.py @@ -3,14 +3,14 @@ import numpy as np from omegaconf import DictConfig -from mmda.utils.cca_class import NormalizedCCA +from mmda.utils.cca_class import NormalizedCCA, ReNormalizedCCA from mmda.utils.classification_dataset_class import load_classification_dataset from mmda.utils.sim_utils import cosine_sim, weighted_corr_sim def cca_classification( cfg: DictConfig, train_test_ratio: float, shuffle_ratio: float = 0.0 -) -> tuple[dict[float:float], dict[float : dict[float:float]]]: +) -> float: """Retrieve data using the proposed CCA method. Args: @@ -21,9 +21,10 @@ def cca_classification( data_size2accuracy: {data_size: accuracy} """ cfg_dataset = cfg[cfg.dataset] + print(f"CCA {cfg_dataset.sim_dim}") ds = load_classification_dataset(cfg) ds.load_data(train_test_ratio, clip_bool=False, shuffle_ratio=shuffle_ratio) - cca = NormalizedCCA() + cca = ReNormalizedCCA() if True else NormalizedCCA() ds.train_img, ds.train_text, corr = cca.fit_transform_train_data( cfg_dataset, ds.train_img, ds.train_text ) @@ -39,9 +40,7 @@ def sim_fn(x: np.array, y: np.array, corr: np.array = corr) -> np.array: return ds.classification(sim_fn=sim_fn) -def clip_like_classification( - cfg: DictConfig, train_test_ratio: float -) -> tuple[dict[float:float], dict[float:float]]: +def clip_like_classification(cfg: DictConfig, train_test_ratio: float) -> float: """Retrieve data using the CLIP-like method. 
Args: @@ -50,6 +49,7 @@ def clip_like_classification( Returns: data_size2accuracy: {data_size: accuracy} """ + print("CLIP-like") ds = load_classification_dataset(cfg) ds.load_data(train_test_ratio, clip_bool=True) ds.get_labels_emb() @@ -58,7 +58,7 @@ def clip_like_classification( def asif_classification( cfg: DictConfig, train_test_ratio: float, shuffle_ratio: float = 0.0 -) -> tuple[dict[float:float], dict[float:float]]: +) -> float: """Retrieve data using the CLIP-like method. Args: @@ -68,6 +68,7 @@ def asif_classification( Returns: data_size2accuracy: {data_size: accuracy} """ + print("ASIF") ds = load_classification_dataset(cfg) ds.load_data(train_test_ratio, clip_bool=False, shuffle_ratio=shuffle_ratio) ds.get_labels_emb() diff --git a/mmda/exps/mislabel_align.py b/mmda/exps/mislabel_align.py index d3d21fe..8dc724b 100644 --- a/mmda/exps/mislabel_align.py +++ b/mmda/exps/mislabel_align.py @@ -118,6 +118,8 @@ def __init__(self, *args, **kwargs): # noqa: ANN204, ANN002, ANN003 "valdata2align": valdata2align, "valdata1unalign": valdata1unalign, "valdata2unalign": valdata2unalign, + "train_idx": train_idx, + "train_wrong_labels_bool": train_wrong_labels_bool, } ) return alldata diff --git a/mmda/get_embeddings.py b/mmda/get_embeddings.py index b960090..c74ec18 100644 --- a/mmda/get_embeddings.py +++ b/mmda/get_embeddings.py @@ -15,6 +15,7 @@ from mmda.utils.dataset_utils import ( load_cosmos, load_flickr, + load_handwriting, load_imagenet, load_kitti, load_leafy_spurge, @@ -25,22 +26,24 @@ load_tiil, ) from mmda.utils.embed_data import ( + chronos_ts, clap_audio, clap_text, clip_imgs, clip_text, cosplace_img, dinov2, + fair_clip_imgs, + fair_clip_text, gtr_text, ) -from mmda.utils.imagebind_utils import ImageBindInference from mmda.utils.video_audio_utils import ( get_video_emb, prepare_audio_for_imagebind, process_audio, ) -BATCH_SIZE = 256 +BATCH_SIZE = 758 @hydra.main(version_base=None, config_path="../config", config_name="main") @@ -92,6 +95,8 @@ def main(cfg: DictConfig) -> None: # noqa: PLR0915, C901, PLR0912 pickle.dump(clap_audio_features, f) elif dataset == "MSRVTT": + from mmda.utils.imagebind_utils import ImageBindInference + _, captions, video_info_sen_order, video_dict = load_msrvtt(cfg_dataset) id_order, img_paths, audio_start_secs, audio_num_secs = get_video_emb( cfg_dataset, video_dict @@ -372,6 +377,24 @@ def main(cfg: DictConfig) -> None: # noqa: PLR0915, C901, PLR0912 text_descriptions = ["An image of " + label + "." 
for label in orig_labels] # get text embeddings + model = "openai" + + img_emb = fair_clip_imgs(img_path, BATCH_SIZE, model_name=("ViT-L-14", model)) + with Path( + cfg_dataset.paths.save_path, f"ImageNet_img_emb_clip{model}.pkl" + ).open("wb") as f: + pickle.dump(img_emb, f) + print("FairCLIP embeddings saved") + + text_emb = fair_clip_text( + text_descriptions, BATCH_SIZE, model_name=("ViT-L-14", model) + ) + with Path( + cfg_dataset.paths.save_path, f"ImageNet_text_emb_clip{model}.pkl" + ).open("wb") as f: + pickle.dump(text_emb, f) + print("FairCLIP embeddings saved") + text_emb = clip_text(text_descriptions, BATCH_SIZE) with Path(cfg_dataset.paths.save_path, "ImageNet_text_emb_clip.pkl").open( "wb" @@ -536,6 +559,47 @@ def main(cfg: DictConfig) -> None: # noqa: PLR0915, C901, PLR0912 pickle.dump(img_emb, f) print("CLIP embeddings saved") + elif dataset == "handwriting": + data, labels, num2alphabet, alphabets_hand = load_handwriting(cfg_dataset) + # save data + with Path(cfg_dataset.paths.save_path, "Handwriting_data.pkl").open("wb") as f: + pickle.dump(data, f) + print("Handwriting data saved") + return + + embeddings = clip_imgs(alphabets_hand, 256) + print("text shape:", embeddings.shape) + with Path(cfg_dataset.paths.save_path, "Handwriting_emb_clip.pkl").open( + "wb" + ) as f: + pickle.dump(embeddings, f) + print("CLIP embeddings saved") + + sentences = [f"Alphabet {num2alphabet[label]}." for label in labels] + print(sentences[15:21]) + embeddings = gtr_text(sentences) + assert np.allclose( + embeddings[15], embeddings[20], atol=1e-3, rtol=1e-4 + ), f"{embeddings[15].shape}!={embeddings[20].shape}" + with Path(cfg_dataset.paths.save_path, "Handwriting_emb_gtr.pkl").open( + "wb" + ) as f: + pickle.dump(embeddings, f) + print("GTR shape:", embeddings.shape) + print("GTR embeddings saved") + + embeddings = chronos_ts(data) + # check if embeddings has unique rows + assert embeddings.shape[0] == len( + np.unique(embeddings, axis=0) + ), f"Embeddings has repeated entries. {embeddings.shape[0]}!={len(np.unique(embeddings, axis=0))}" + print("Chronos shape:", embeddings.shape) + with Path(cfg_dataset.paths.save_path, "Handwriting_emb_chronos.pkl").open( + "wb" + ) as f: + pickle.dump(embeddings, f) + print("Chronos embeddings saved") + # TODO: add more datasets else: msg = f"Dataset {dataset} not supported." 
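Editor's note: the hunk above writes three handwriting embedding pickles (CLIP on the alphabet images, GTR on the label sentences, Chronos on the time series) that downstream CCA/classification pairs row-by-row. A minimal, illustrative sanity check — not part of the diff — is sketched below; it assumes the `save_path` from the handwriting config added in this PR and the file names written by the new branch.

```python
# Hedged sketch: verify the saved handwriting embeddings are row-aligned
# across modalities before running mmda/bimodal_classification.py.
import pickle
from pathlib import Path

import numpy as np

save_path = Path("/nas/pohan/datasets/Handwriting/embeddings/")  # from config/main.yaml in this diff
names = [
    "Handwriting_emb_clip.pkl",     # CLIP embeddings of the alphabet images
    "Handwriting_emb_gtr.pkl",      # GTR embeddings of the label sentences
    "Handwriting_emb_chronos.pkl",  # Chronos embeddings of the time series
]
embs = {}
for name in names:
    with (save_path / name).open("rb") as f:
        embs[name] = np.asarray(pickle.load(f))
    print(name, embs[name].shape)

row_counts = {e.shape[0] for e in embs.values()}
assert len(row_counts) == 1, f"Row counts differ across modalities: {row_counts}"
```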
diff --git a/mmda/handwriting_baseline.py b/mmda/handwriting_baseline.py new file mode 100644 index 0000000..9fdfbcb --- /dev/null +++ b/mmda/handwriting_baseline.py @@ -0,0 +1,29 @@ +"""This script is for the handwriting baseline.""" + +import numpy as np +from aeon.classification.deep_learning import InceptionTimeClassifier +from omegaconf import DictConfig +from sklearn.metrics import accuracy_score + +import hydra +from mmda.utils.dataset_utils import load_handwriting + + +@hydra.main(version_base=None, config_path="../config", config_name="main") +def main(cfg: DictConfig) -> None: + """Train the handwriting baseline.""" + x, labels, _ = load_handwriting(cfg_dataset=cfg.handwriting) + inception = InceptionTimeClassifier() + for train_test_ratio in cfg.handwriting.train_test_ratios: + np.random.seed(42) + train_size = int(train_test_ratio * x.shape[0]) + print(x.shape, labels.shape) + inception.fit(x[:train_size], labels[:train_size]) + y_pred = inception.predict(x[train_size:]) + accuracy = accuracy_score(labels[train_size:], y_pred) + print(f"train_test_ratio: {train_test_ratio}, accuracy: {accuracy}") + + +if __name__ == "__main__": + main() +# CUDA_VISIBLE_DEVICES="" poetry run python mmda/handwriting_baseline.py diff --git a/mmda/linear_svm_clip.py b/mmda/linear_svm_clip.py new file mode 100644 index 0000000..6ee5983 --- /dev/null +++ b/mmda/linear_svm_clip.py @@ -0,0 +1,167 @@ +"""Train a linear SVM on the ImageNet dataset.""" + +# ruff: noqa: ERA001, PLR2004, S301 + +import pickle +import time +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from omegaconf import DictConfig +from sklearn import svm + +import hydra +from mmda.utils.cca_class import NormalizedCCA +from mmda.utils.data_utils import load_two_encoder_data +from mmda.utils.dataset_utils import ( + get_train_test_split_index, + load_imagenet, + load_leafy_spurge, + train_test_split, +) + +BATCH_SIZE = 256 + + +@hydra.main(version_base=None, config_path="../config", config_name="main") +def train_linear_svm(cfg: DictConfig) -> None: + """Train a linear SVM on the ImageNet dataset.""" + np.random.seed(cfg.seed) + cfg_dataset = cfg[cfg.dataset] + if cfg.dataset == "imagenet": + _, mturks_idx, labels, _ = load_imagenet(cfg_dataset) + + with Path(cfg_dataset.paths.save_path, "ImageNet_img_emb_clip.pkl").open( + "rb" + ) as f: + img_emb = pickle.load(f) + elif cfg.dataset == "leafy_spurge": + _, labels, _ = load_leafy_spurge(cfg_dataset) + with Path(cfg_dataset.paths.save_path, "LeafySpurge_img_emb_clip.pkl").open( + "rb" + ) as f: + img_emb = pickle.load(f) + + # transform the data using CCA + cfg_dataset, data1, data2 = load_two_encoder_data(cfg) + + # Train linear SVM + start_time = time.time() + train_idx, val_idx = get_train_test_split_index( + cfg.train_test_ratio, img_emb.shape[0] + ) + labels_train, labels_test = train_test_split(labels, train_idx, val_idx) + print(labels_train.shape, labels_test.shape) + + # CSA case + csa_train_data1, csa_val_data1 = train_test_split(data1, train_idx, val_idx) + csa_train_data2, csa_val_data2 = train_test_split(data2, train_idx, val_idx) + cca = NormalizedCCA() + cca_img_train, cca_text_train, _ = cca.fit_transform_train_data( + cfg_dataset, csa_train_data1, csa_train_data2 + ) + clf = svm.SVC(kernel="linear") + clf.fit(cca_img_train, labels_train) + cca_img_val, cca_text_val = cca.transform_data(csa_val_data1, csa_val_data2) + y_pred = clf.predict(cca_img_val) + accuracy = np.mean(y_pred == labels_test) + print(f"CSA accuracy: 
{accuracy * 100:.2f}%") + return + + # CLIP case + x_train, x_test = train_test_split(img_emb, train_idx, val_idx) + print(x_train.shape, x_test.shape) + print(len(labels_train), len(labels_test)) + clf = svm.SVC(kernel="linear") + clf.fit(x_train, labels_train) + + end_time = time.time() + + print(f"Training time: {end_time - start_time:.2f} seconds") + y_pred = clf.predict(x_test) + accuracy = np.mean(y_pred == labels_test) + print(f"Split {cfg.train_test_ratio} accuracy: {accuracy * 100:.2f}%") + + +@hydra.main(version_base=None, config_path="../config", config_name="main") +def plot_accuracy(cfg: DictConfig) -> None: + """Plot the accuracy of the model.""" + cfg_dataset = cfg[cfg.dataset] + ds_size = 50_000 if cfg.dataset == "imagenet" else 900 + csv_save_path = ( + Path(cfg_dataset.paths.plots_path) + / f"classify_{cfg_dataset.text_encoder}_{cfg_dataset.img_encoder}/" + / f"accuracy_{cfg_dataset.sim_dim}_svm.csv" + ) + df = pd.read_csv(csv_save_path) + ratios = df["train_test_ratio"] * ds_size + cca_accs = df["cca_accs"] + fig, ax = plt.subplots() + ax.plot( + ratios, + cca_accs, + "o-", + ms=12, + label="CSA (ours)", + color="blue", + ) + clip_accs = df["clip_accs"] + ax.plot( + ratios, + clip_accs, + "^--", + ms=12, + label="CLIP", + color="red", + ) + asif_accs = df["asif_accs"] + ax.plot( + ratios, + asif_accs, + "D-.", + ms=12, + label="ASIF", + color="green", + ) + csa_svm_accs = df["csa_svm_accs"] + ax.plot( + ratios, + csa_svm_accs, + "D-", + ms=12, + label="CSA + Linear SVM", + color="purple", + ) + clip_svm_accs = df["svm_accs"] + ax.plot( + ratios, + clip_svm_accs, + "v--", + ms=12, + label="CLIP + Linear SVM", + color="orange", + ) + ax.set_xlabel("Amount of training data", fontsize=20) + ax.set_ylabel("Classification accuracy", fontsize=20) + ax.xaxis.set_tick_params(labelsize=15) + ax.yaxis.set_tick_params(labelsize=15) + ax.set_ylim(0.0, 1.1) if cfg.dataset == "imagenet" else ax.set_ylim(0.2, 0.8) + ax.legend(loc="lower right", fontsize=16) + ax.grid() + + plots_path = ( + Path(cfg_dataset.paths.plots_path) + / f"classify_{cfg_dataset.text_encoder}_{cfg_dataset.img_encoder}/" + ) + plots_path.mkdir(parents=True, exist_ok=True) + plt.tight_layout() + fig.savefig(plots_path / f"trainsize_vs_accuracy_svm{cfg_dataset.sim_dim}.png") + + +if __name__ == "__main__": + # train_linear_svm() + plot_accuracy() + +# CUDA_VISIBLE_DEVICES=5 poetry run python mmda/linear_svm_clip.py dataset=leafy_spurge leafy_spurge.sim_dim=250 diff --git a/mmda/plot_kitti_cross_retrieval.py b/mmda/plot_kitti_cross_retrieval.py new file mode 100644 index 0000000..88b4575 --- /dev/null +++ b/mmda/plot_kitti_cross_retrieval.py @@ -0,0 +1,50 @@ +"""Plot functions.""" + +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import seaborn as sns +from omegaconf import DictConfig + +import hydra + + +@hydra.main(version_base=None, config_path="../config", config_name="main") +def plot_kitti_cross_retrieval(cfg: DictConfig) -> None: + """Plot cross-modal retrieval results for KITTI dataset. 
+ + Args: + cfg: Configuration object containing dataset parameters + """ + cell_size = 30 + label_size = 30 + ticks_size = 28 + cfg_dataset = cfg["KITTI"] + dir_path = Path("plots/KITTI/") + single1_recalls = [[32.4], [32.8]] + single_recalls = np.array(single1_recalls).reshape(2, 1) + plt.figure(figsize=(6, 8)) + ax = sns.heatmap( + single_recalls, + fmt=".1f", + cmap="YlGnBu", + cbar=False, + square=True, + xticklabels=["LiDAR (Lip-loc)"], + yticklabels=["LiDAR (Lip-loc)", "Text (GTR)"], + annot=True, + annot_kws={"size": cell_size + 10, "weight": "bold"}, + ) + ax.xaxis.tick_top() + plt.xlabel("Reference modality", fontsize=label_size) + plt.ylabel("Query modality", fontsize=label_size) + plt.xticks(fontsize=ticks_size) + plt.yticks(fontsize=ticks_size) + ax.xaxis.set_label_position("top") # Move the label to the top + plt.tight_layout() + plt.savefig(dir_path / f"bimodal_recall5_{cfg_dataset.retrieval_dim}.pdf") + + +if __name__ == "__main__": + plot_kitti_cross_retrieval() diff --git a/mmda/plot_single_modal.py b/mmda/plot_single_modal.py index 9dea26c..74c4b77 100644 --- a/mmda/plot_single_modal.py +++ b/mmda/plot_single_modal.py @@ -1,4 +1,4 @@ -"""Plot functions.""" +"""Plot single-modal recall.""" from pathlib import Path diff --git a/mmda/tsfresh_features.py b/mmda/tsfresh_features.py new file mode 100644 index 0000000..5f1beea --- /dev/null +++ b/mmda/tsfresh_features.py @@ -0,0 +1,119 @@ +"""Extract tsfresh features from the Handwriting dataset.""" + +import pickle +from pathlib import Path + +import kagglehub +import numpy as np +import pandas as pd +from aeon.datasets import load_classification +from PIL import Image +from tsfresh import extract_features + +PATH = "/nas/pohan/datasets/Handwriting/" +PATH_SAVE = "/nas/pohan/datasets/Handwriting/embeddings/" + + +def load_handwriting() -> tuple[np.ndarray, np.ndarray, dict[str, tuple[str, str]]]: + """Load the Handwriting dataset (https://github.com/amazon-science/aeon). + + Args: + cfg_dataset: configuration file + Returns: + data: data. shape: (num_samples, 3, 152) + labels: labels. e.g. "1.0" + num2alphabet: a dict of index to alphabet + alphabets_hand: list of PIL images + """ + # train_x.shape: (150, 3, 152), test_x.shape: (850, 3, 152) + train_x, train_y = load_classification( + "Handwriting", split="train" + ) # np.ndarray, list[str] + test_x, test_y = load_classification("Handwriting", split="test") + # merge train and test + x = np.concatenate([train_x, test_x], axis=0) + y = np.concatenate([train_y, test_y], axis=0) + num2alphabet = {f"{i+1}.0": chr(65 + i) for i in range(26)} + idx = np.arange(x.shape[0]) + x = x[idx] + y = y[idx] + + def load_alphabets_img() -> tuple[np.ndarray, np.ndarray]: + """Load the MNIST dataset. 
+ + Returns: + data: data + labels: labels + """ + # Download latest version + path = kagglehub.dataset_download( + "sachinpatel21/az-handwritten-alphabets-in-csv-format" + ) + df = pd.read_csv(path + "/A_Z Handwritten Data.csv") + labels = df.iloc[:, 0] + data = df.iloc[:, 1:] + return data, labels + + alphabets_x, alphabets_y = load_alphabets_img() + alphabets_img = {} + for i in range(26): + alphabets_img[i + 1] = alphabets_x[alphabets_y == i][:100] + + alphabets_hand = [] + for i in range(x.shape[0]): + label = int(y[i].split(".")[0]) + random_idx = np.random.choice(alphabets_img[label].shape[0]) + random_df = alphabets_img[label].iloc[random_idx].to_numpy() + random_df = random_df.reshape(28, 28).astype(np.uint8) + # save image to png + path = Path(PATH, f"alphabet_{label}_{random_idx}.png") + Image.fromarray(random_df, mode="L").save(path) + alphabets_hand.append(path) + return ( + x, + y, + num2alphabet, + alphabets_hand, + ) + + +def tsfresh_features() -> np.ndarray: + """Extract tsfresh features from the data. + + Returns: + features: features + """ + data, labels, num2alphabet, alphabets_hand = load_handwriting() + + path = Path(PATH_SAVE, "Handwriting_tsfresh.csv") + + if path.exists(): + df = pd.read_csv(path) + else: + # convert data to a df + # column_id: id, column_sort: time, values: 3 channels + df = pd.DataFrame(columns=["id", "time", "channel_1", "channel_2", "channel_3"]) + for idx in range(data.shape[0]): + for time in range(data.shape[2]): # 152 + df.loc[idx, "id"] = idx + df.loc[idx, "time"] = time + df.loc[idx, "channel_1"] = data[idx, 0, time] + df.loc[idx, "channel_2"] = data[idx, 1, time] + df.loc[idx, "channel_3"] = data[idx, 2, time] + print(df.head()) + print(df.tail()) + + df.to_csv(path, index=False) + ts_features = extract_features(df, column_id="id", column_sort="time") + ts_features = ts_features.dropna(axis=1) + print(type(ts_features)) + print(ts_features.shape) + print(ts_features.head()) + print("ts_features shape:", ts_features.shape) + with Path(PATH_SAVE, "Handwriting_emb_tsfresh.pkl.pkl").open("wb") as f: + pickle.dump(ts_features, f) + print("TSFresh features saved") + + +if __name__ == "__main__": + tsfresh_features() diff --git a/mmda/tsne_csa.py b/mmda/tsne_csa.py new file mode 100644 index 0000000..765d04a --- /dev/null +++ b/mmda/tsne_csa.py @@ -0,0 +1,107 @@ +"""Plot the T-SNE of the CSA embeddings on ImageNet.""" + +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +from omegaconf import DictConfig +from sklearn.manifold import TSNE + +import hydra +from mmda.exps.mislabel_align import separate_data +from mmda.utils.cca_class import NormalizedCCA +from mmda.utils.data_utils import load_two_encoder_data +from mmda.utils.dataset_utils import load_imagenet + +cell_size = 30 +label_size = 30 +ticks_size = 26 + + +def plot_imagenet_tsne(cfg: DictConfig, save: bool = False) -> None: + """Plot the T-SNE of the CSA embeddings on ImageNet.""" + ### load embeddings ### + img_path, mturks_idx, orig_idx, clsidx_to_labels = load_imagenet(cfg.imagenet) + + np.random.seed(cfg.seed) + cfg_dataset, data1, data2 = load_two_encoder_data(cfg) + print(f"Loaded data1 shape: {data1.shape}, data2 shape: {data2.shape}") + alldata = separate_data(cfg, data1, data2) + + # we only consider the correctly labeled data + + # select training data based on the noisy_train_set + traindata1 = alldata.traindata1align + traindata2 = alldata.traindata2align + train_idx = alldata.train_idx[~alldata.train_wrong_labels_bool] + print( + f"img_data 
shape: {traindata1.shape}, mturks_idx[train_idx] shape: {mturks_idx[train_idx].shape}" + ) + class_20_idx = mturks_idx[train_idx] % 50 == 0 + print(f"val_idx shape: {class_20_idx.shape}") + + # transform the data using CCA + cca = NormalizedCCA() + cca_img_data, cca_text_data, _ = cca.fit_transform_train_data( + cfg_dataset, traindata1, traindata2 + ) + print(f"cca_img_data shape: {cca_img_data.shape}") + + # Compute t-SNE for original image embeddings + tsne_img = TSNE(n_components=2, random_state=cfg.seed).fit_transform( + traindata1[class_20_idx] + ) + + # Compute t-SNE for CCA-transformed embeddings + tsne_cca = TSNE(n_components=2, random_state=cfg.seed).fit_transform( + cca_img_data[class_20_idx] + ) + # Plot original image embeddings + fig1, ax1 = plt.subplots(figsize=(10, 8)) + print(f"idx: {mturks_idx[train_idx][class_20_idx].shape}") + print(f"embeddings: {tsne_img.shape}") + _ = ax1.scatter( + tsne_img[:, 0], + tsne_img[:, 1], + c=mturks_idx[train_idx][class_20_idx], + cmap="tab20", + alpha=0.8, + ) + ax1.set_xlabel("t-SNE dimension 1", fontsize=label_size) + ax1.set_ylabel("t-SNE dimension 2", fontsize=label_size) + ax1.tick_params(axis="both", labelsize=ticks_size) + plt.tight_layout() + + # Plot CCA-transformed embeddings + fig2, ax2 = plt.subplots(figsize=(10, 8)) + _ = ax2.scatter( + tsne_cca[:, 0], + tsne_cca[:, 1], + c=mturks_idx[train_idx][class_20_idx], + cmap="tab20", + alpha=0.8, + ) + ax2.set_xlabel("t-SNE dimension 1", fontsize=label_size) + ax2.set_ylabel("t-SNE dimension 2", fontsize=label_size) + ax2.tick_params(axis="both", labelsize=ticks_size) + plt.tight_layout() + + # Save plots if specified + if save: + plots_path = Path( + cfg_dataset.paths.plots_path, + f"tsne_{cfg_dataset.text_encoder}_{cfg_dataset.img_encoder}/", + ) + plots_path.mkdir(parents=True, exist_ok=True) + fig1.savefig(plots_path / "tsne_clip.png") + fig2.savefig(plots_path / "tsne_csa.png") + plt.close("all") + + +@hydra.main(version_base=None, config_path="../config", config_name="main") +def main(cfg: DictConfig) -> None: # noqa: D103 + plot_imagenet_tsne(cfg, save=True) + + +if __name__ == "__main__": + main() diff --git a/mmda/utils/cca_class.py b/mmda/utils/cca_class.py index 09565b2..05bb8d5 100644 --- a/mmda/utils/cca_class.py +++ b/mmda/utils/cca_class.py @@ -7,6 +7,7 @@ import numpy as np from cca_zoo.linear import CCA from omegaconf import DictConfig +from scipy.linalg import sqrtm from mmda.utils.data_utils import origin_centered @@ -64,8 +65,10 @@ def fit_transform_train_data( ) # dim, assert ( corr_coeff >= 0 - ).any, f"Correlation should be non-negative. {corr_coeff}" - assert (corr_coeff <= 1).any, f"Correlation should be less than 1. {corr_coeff}" + ).all(), f"Correlation should be non-negative. {corr_coeff}" + assert ( + corr_coeff <= 1.05 # noqa: PLR2004 + ).all(), f"Correlation should be less than 1. 
{corr_coeff}" self.corr_coeff = corr_coeff self.traindata1, self.traindata2 = traindata1, traindata2 return traindata1, traindata2, corr_coeff @@ -111,3 +114,116 @@ def load_model(self, path: str | Path) -> None: if isinstance(path, str): path = Path(path) self.__dict__ = joblib.load(path.open("rb")).__dict__ + + +class ReNormalizedCCA: + """Canonical Correlation Analysis (CCA) class which automatically zero-mean data.""" + + def __init__(self, sim_dim: int | None = None) -> None: + """Initialize the CCA model.""" + self.traindata1_mean = None + self.traindata2_mean = None + self.sim_dim = sim_dim + + def fit_transform_train_data( + self, cfg_dataset: DictConfig, traindata1: np.ndarray, traindata2: np.ndarray + ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """Fit the CCA model to the training data. + + Args: + cfg_dataset: the dataset configuration + traindata1: the first training data. shape: (num_samples, dim) + traindata2: the second training data. shape: (num_samples, dim) + + Returns: + traindata1: the first training data after CCA. shape: (num_samples, dim) + traindata2: the second training data after CCA. shape: (num_samples, dim) + corr_coeff: the correlation coefficient. shape: (dim,) + """ + # Check the shape of the training data + traindata1 = traindata1.astype(np.float32) + traindata2 = traindata2.astype(np.float32) + # zero mean data + traindata1, traindata1_mean = origin_centered(traindata1) + traindata2, traindata2_mean = origin_centered(traindata2) + self.traindata1_mean, self.traindata2_mean = traindata1_mean, traindata2_mean + + # check if training data is zero-mean + assert np.allclose( + traindata1.mean(axis=0), 0, atol=1e-3, rtol=1e-4 + ), f"traindata1align not zero mean: {max(abs(traindata1.mean(axis=0)))}" + assert np.allclose( + traindata2.mean(axis=0), 0, atol=1e-3, rtol=1e-4 + ), f"traindata2align not zero mean: {max(abs(traindata2.mean(axis=0)))}" + + # CCA dimensionality reduction + sigma_z1_inv = np.linalg.inv( + traindata1.T @ traindata1 + np.eye(traindata1.shape[1]) * 1e-5 + ) + sigma_z1_inv_sqrt = sqrtm(sigma_z1_inv) + sigma_z2_inv = np.linalg.inv(traindata2.T @ traindata2) + sigma_z2_inv_sqrt = sqrtm(sigma_z2_inv) + + svd_mat = sigma_z1_inv_sqrt @ traindata1.T @ traindata2 @ sigma_z2_inv_sqrt + u, s, vh = np.linalg.svd(svd_mat) + + self.A = u @ sigma_z1_inv_sqrt + self.B = vh @ sigma_z2_inv_sqrt + + corr_coeff = np.ones((traindata2.shape[1],)) if cfg_dataset.equal_weights else s + assert ( + corr_coeff >= 0 + ).all(), f"Correlation should be non-negative. {corr_coeff}" + self.corr_coeff = corr_coeff + if self.sim_dim is None: + self.sim_dim = cfg_dataset.sim_dim + self.traindata1, self.traindata2 = ( + (self.A @ traindata1.T).T[:, : self.sim_dim], + (self.B @ traindata2.T).T[:, : self.sim_dim], + ) + return self.traindata1, self.traindata2, corr_coeff + + def transform_data( + self, data1: tuple[np.ndarray, np.ndarray], data2: tuple[np.ndarray, np.ndarray] + ) -> tuple[np.ndarray, np.ndarray]: + """Transform the data using the fitted CCA model. + + Args: + data1: the first data. shape: (num_samples, dim) + data2: the second data. shape: (num_samples, dim) + + Returns: + data1: the first transformed data. shape: (num_samples, dim) + data2: the second transformed data. shape: (num_samples, dim) + """ + data1 = data1.astype(np.float32) + data2 = data2.astype(np.float32) + assert self.traindata1_mean is not None, "Please fit the cca model first." + assert self.traindata2_mean is not None, "Please fit the cca model first." 
+ # zero mean data and transform + data1 = data1 - self.traindata1_mean + data2 = data2 - self.traindata2_mean + data1 = (self.A @ data1.T).T[:, : self.sim_dim] + data2 = (self.B @ data2.T).T[:, : self.sim_dim] + return data1, data2 + + def save_model(self, path: str | Path) -> None: + """Save the CCA class. + + Args: + path: the path to save the class + """ + if isinstance(path, str): + path = Path(path) + with path.open("wb") as f: + pickle.dump(self, f) + + def load_model(self, path: str | Path) -> None: + """Load the CCA class. + + Args: + path: the path to load the class + """ + if isinstance(path, str): + path = Path(path) + self.__dict__ = joblib.load(path.open("rb")).__dict__ diff --git a/mmda/utils/classification_dataset_class.py b/mmda/utils/classification_dataset_class.py index 00f67ca..84ec87c 100644 --- a/mmda/utils/classification_dataset_class.py +++ b/mmda/utils/classification_dataset_class.py @@ -5,11 +5,13 @@ import numpy as np import torch from omegaconf import DictConfig +from sklearn.metrics import roc_auc_score from mmda.baselines.asif_core import zero_shot_classification from mmda.utils.data_utils import load_clip_like_data, load_two_encoder_data from mmda.utils.dataset_utils import ( get_train_test_split_index, + load_handwriting, load_imagenet, load_leafy_spurge, shuffle_percentage_of_data, @@ -255,7 +257,142 @@ def classification(self, sim_fn: Union[callable, str]) -> float: # noqa: UP007 return np.mean(correct) -def load_classification_dataset(cfg: DictConfig) -> ImageNetDataset: +class HandwritingDataset(BaseClassificationDataset): + """Handwriting dataset class.""" + + def __init__(self, cfg: DictConfig) -> None: + """Initialize the dataset. + + Args: + cfg: configuration file + """ + super().__init__(cfg) + self.cfg = cfg + self.images, self.labels, self.num2alphabet, _ = load_handwriting( + cfg.handwriting + ) + # convert labels to int (str) + self.labels = [int(label.split(".")[0]) for label in self.labels] + self.labels = np.array(self.labels) - 1 + + def load_data( + self, + train_test_ratio: float, + clip_bool: bool = False, + shuffle_ratio: float = 0.0, + ) -> None: + """Load the data for ImageNet dataset. 
+ + Args: + train_test_ratio: ratio of training data + clip_bool: whether to use CLIP-like method + shuffle_ratio: ratio of data to shuffle + """ + shuffle_ratio = shuffle_ratio + 1 # unused + self.train_test_ratio = train_test_ratio + if clip_bool: + _, self.img_emb, self.text_emb = load_clip_like_data(self.cfg) + else: + _, self.img_emb, self.text_emb = load_two_encoder_data(self.cfg) + train_size = int(self.train_test_ratio * self.img_emb.shape[0]) + self.train_img, self.test_img = ( + self.img_emb[:train_size], + self.img_emb[train_size:], + ) + self.train_text, self.test_text = ( + self.text_emb[:train_size], + self.text_emb[train_size:], + ) + self.train_idx, self.test_idx = ( + self.labels[:train_size], + self.labels[train_size:], + ) + + def get_labels_emb(self) -> None: + """Get the text embeddings for all possible labels.""" + label_emb = [] + for num in range(len(self.num2alphabet)): + # find where the label is in the train_idx + assert ( + self.labels.shape[0] == self.text_emb.shape[0] + ), f"{self.labels.shape[0]}!={self.text_emb.shape[0]}" + label_idx_in_ds = np.where(self.labels == num)[0] + label_emb.append(self.text_emb[label_idx_in_ds[0]]) + self.labels_emb = np.array(label_emb) + assert self.labels_emb.shape[0] == len(self.num2alphabet) + + def classification(self, sim_fn: Union[callable, str]) -> float: # noqa: UP007 + """Classification task. + + Args: + sim_fn: similarity function + Returns: + accuracy: classification accuracy + """ + assert np.allclose( + self.labels_emb[self.train_idx[0]], self.train_text[0], atol=1e-3, rtol=1e-4 + ), f"{self.labels_emb[self.train_idx[0]].shape}!={self.train_text[0].shape}" + cfg = self.cfg + sim_scores = [] + if sim_fn == "asif": + # set parameters + non_zeros = min(cfg.asif.non_zeros, self.train_img.shape[0]) + range_anch = [ + 2**i + for i in range( + int(np.log2(non_zeros) + 1), + int(np.log2(len(self.train_img))) + 2, + ) + ] + range_anch = range_anch[-1:] # run just last anchor to be quick + val_labels = torch.zeros((1,), dtype=torch.float32) + # generate noise in the shape of the labels_emb + noise = np.random.rand( + self.test_img.shape[0] - self.labels_emb.shape[0], + self.labels_emb.shape[1], + ).astype(np.float32) + self.test_label = np.concatenate((self.labels_emb, noise), axis=0) + assert ( + self.test_img.shape[0] == self.test_label.shape[0] + ), f"{self.test_img.shape[0]}!={self.test_label.shape[0]}" + _anchors, scores, sim_score_matrix = zero_shot_classification( + torch.tensor(self.test_img, dtype=torch.float32), + torch.tensor(self.test_label, dtype=torch.float32), + torch.tensor(self.train_img, dtype=torch.float32), + torch.tensor(self.train_text, dtype=torch.float32), + val_labels, + non_zeros, + range_anch, + cfg.asif.val_exps, + max_gpu_mem_gb=cfg.asif.max_gpu_mem_gb, + ) + sim_score_matrix = sim_score_matrix.numpy().astype(np.float32)[:, :2] + sim_scores = sim_score_matrix.T # labels x test_img_size + else: + for label_idx in range(len(self.num2alphabet)): # 0 to 25 + label_emb = self.labels_emb[label_idx].reshape(1, -1) + label_emb = np.repeat(label_emb, self.test_text.shape[0], axis=0) + sim_score_matrix = sim_fn(self.test_img, label_emb) + sim_scores.append(sim_score_matrix) + sim_scores = np.array(sim_scores) # labels x test_img_size + + # ROC with scikit-learn + y = self.test_idx + sim_scores_t = sim_scores.T # test_img_size x labels + sim_scores_t = np.nan_to_num(sim_scores_t, nan=0.0) + pred_y = sim_scores_t / sim_scores_t.sum(axis=1, keepdims=True) + roc_auc = roc_auc_score(y, pred_y, multi_class="ovr") 
+ print(f"ROC AUC: {roc_auc}") + + # accuracy + most_similar_label_idx = np.argmax(sim_scores, axis=0) + correct = most_similar_label_idx == self.test_idx + return np.mean(correct) + + +def load_classification_dataset( + cfg: DictConfig, +) -> ImageNetDataset | LeafySpurgeDataset | HandwritingDataset: """Load the dataset for classification task. Args: @@ -267,6 +404,8 @@ def load_classification_dataset(cfg: DictConfig) -> ImageNetDataset: dataset = ImageNetDataset(cfg) elif cfg.dataset == "leafy_spurge": dataset = LeafySpurgeDataset(cfg) + elif cfg.dataset == "handwriting": + dataset = HandwritingDataset(cfg) else: msg = f"Dataset {cfg.dataset} not supported" raise ValueError(msg) diff --git a/mmda/utils/data_utils.py b/mmda/utils/data_utils.py index e58651a..f3dde38 100644 --- a/mmda/utils/data_utils.py +++ b/mmda/utils/data_utils.py @@ -168,6 +168,19 @@ def load_two_encoder_data(cfg: DictConfig) -> tuple[DictConfig, np.ndarray, np.n + f"LeafySpurge_text_emb_{cfg_dataset.text_encoder}.pkl", ) ) + elif dataset == "handwriting": + data1 = joblib.load( + Path( + cfg_dataset.paths.save_path + + f"Handwriting_emb_{cfg_dataset.img_encoder}.pkl" + ) + ) + data2 = joblib.load( + Path( + cfg_dataset.paths.save_path + + f"Handwriting_emb_{cfg_dataset.text_encoder}.pkl" + ) + ) # TODO: add more datasets else: msg = f"Dataset {dataset} not supported." @@ -204,10 +217,16 @@ def load_clip_like_data(cfg: DictConfig) -> tuple[DictConfig, np.ndarray, np.nda ) elif dataset == "imagenet": data1 = joblib.load( - Path(cfg_dataset.paths.save_path + "ImageNet_img_emb_clip.pkl") + Path( + cfg_dataset.paths.save_path + + f"ImageNet_img_emb_clip{cfg_dataset.model_name}.pkl" + ) ) data2 = joblib.load( - Path(cfg_dataset.paths.save_path + "ImageNet_text_emb_clip.pkl") + Path( + cfg_dataset.paths.save_path + + f"ImageNet_text_emb_clip{cfg_dataset.model_name}.pkl" + ) ) elif dataset == "tiil": data1 = joblib.load(Path(cfg_dataset.paths.save_path + "TIIL_img_emb_clip.pkl")) @@ -246,6 +265,13 @@ def load_clip_like_data(cfg: DictConfig) -> tuple[DictConfig, np.ndarray, np.nda cfg_dataset.paths.save_path + "LeafySpurge_text_emb_clip.pkl", ) ) + elif dataset == "handwriting": + data1 = joblib.load( + Path(cfg_dataset.paths.save_path + "Handwriting_emb_gtr.pkl") + ) + data2 = joblib.load( + Path(cfg_dataset.paths.save_path + "Handwriting_emb_gtr.pkl") + ) # TODO: add more datasets else: msg = f"Dataset {dataset} not supported." diff --git a/mmda/utils/dataset_utils.py b/mmda/utils/dataset_utils.py index f74d74d..9b7cc03 100644 --- a/mmda/utils/dataset_utils.py +++ b/mmda/utils/dataset_utils.py @@ -10,7 +10,9 @@ import joblib import numpy as np import pandas as pd +from aeon.datasets import load_classification from omegaconf import DictConfig +from PIL import Image import hydra from mmda.liploc.dataloaders.KittiBothDataset import KITTIBothDataset @@ -18,6 +20,76 @@ from mmda.utils.video_audio_utils import process_video_ids +def load_handwriting( + cfg_dataset: DictConfig, +) -> tuple[np.ndarray, np.ndarray, dict[str, tuple[str, str]]]: + """Load the Handwriting dataset (https://github.com/amazon-science/aeon). + + Args: + cfg_dataset: configuration file + Returns: + data: data. shape: (num_samples, 3, 152) + labels: labels. e.g. 
"1.0" + num2alphabet: a dict of index to alphabet + alphabets_hand: list of PIL images + """ + assert cfg_dataset is not None, "cfg_dataset is None" + # train_x.shape: (150, 3, 152), test_x.shape: (850, 3, 152) + train_x, train_y = load_classification( + "Handwriting", split="train" + ) # np.ndarray, list[str] + test_x, test_y = load_classification("Handwriting", split="test") + # merge train and test + x = np.concatenate([train_x, test_x], axis=0) + y = np.concatenate([train_y, test_y], axis=0) + num2alphabet = {f"{i+1}.0": chr(65 + i) for i in range(26)} + idx = np.arange(x.shape[0]) + x = x[idx] + y = y[idx] + + def load_alphabets_img() -> tuple[np.ndarray, np.ndarray]: + """Load the MNIST dataset. + + Returns: + data: data + labels: labels + """ + import kagglehub + + # Download latest version + path = kagglehub.dataset_download( + "sachinpatel21/az-handwritten-alphabets-in-csv-format" + ) + df = pd.read_csv(path + "/A_Z Handwritten Data.csv") + labels = df.iloc[:, 0] + data = df.iloc[:, 1:] + return data, labels + + alphabets_x, alphabets_y = load_alphabets_img() + alphabets_img = {} + for i in range(26): + alphabets_img[i + 1] = alphabets_x[alphabets_y == i][:100] + + alphabets_hand = [] + for i in range(x.shape[0]): + label = int(y[i].split(".")[0]) + random_idx = np.random.choice(alphabets_img[label].shape[0]) + random_df = alphabets_img[label].iloc[random_idx].to_numpy() + random_df = random_df.reshape(28, 28).astype(np.uint8) + # save image to png + path = Path( + cfg_dataset.paths.dataset_path, f"alphabet_{label}_{random_idx}.png" + ) + Image.fromarray(random_df, mode="L").save(path) + alphabets_hand.append(path) + return ( + x, + y, + num2alphabet, + alphabets_hand, + ) + + def load_msrvtt( cfg_dataset: DictConfig, ) -> tuple[list[str], list[str], np.ndarray, list[str]]: @@ -414,6 +486,8 @@ def load_imagenet( idx, label = int(idx.strip()), label.strip() label = label.replace("'", "") clsidx_to_labels[idx] = label + print("ImageNet mislabel count: ", np.sum(orig_idx != mturks_idx)) + print("ImageNet total count: ", len(orig_idx)) return img_path, mturks_idx, orig_idx, clsidx_to_labels @@ -700,7 +774,7 @@ def shuffle_by_level( # noqa: PLR0912, C901, ANN201 @hydra.main(version_base=None, config_path="../../config", config_name="main") def main(cfg: DictConfig) -> None: # noqa: D103 - load_msrvtt(cfg.MSRVTT) + load_handwriting(cfg.handwriting) if __name__ == "__main__": diff --git a/mmda/utils/embed_data.py b/mmda/utils/embed_data.py index 7c39833..49ae709 100644 --- a/mmda/utils/embed_data.py +++ b/mmda/utils/embed_data.py @@ -5,6 +5,7 @@ import numpy as np import open_clip import torch +from chronos import ChronosPipeline from PIL import Image, ImageFilter from sentence_transformers import SentenceTransformer from torchvision import transforms @@ -19,6 +20,29 @@ ) +def chronos_ts(ts: np.ndarray) -> np.ndarray: + """Extract time series features using Chronos model.""" + num_data, channels, num_timestamps = ts.shape + pipeline = ChronosPipeline.from_pretrained( + "amazon/chronos-t5-large", + device_map="cuda", # use "cpu" for CPU inference and "mps" for Apple Silicon + torch_dtype=torch.bfloat16, + ) + all_embeddings = [] + print("ts shape:", ts.shape) # (1000, 3, 152) + for channel in range(channels): + if channel > 0: + break + # context must be either a 1D tensor, a list of 1D tensors, + # or a left-padded 2D tensor with batch as the first dimension + context = torch.tensor(ts[:, channel, :]).reshape(num_data, num_timestamps) + embeddings, tokenizer_state = 
pipeline.embed(context) # (1000, 153, 1024) + all_embeddings.append( + embeddings[:, -1, :].detach().cpu().to(torch.float32).numpy() + ) + return np.concatenate(all_embeddings, axis=1) + + def cosplace_img(img_files: list, batch_size: int = 32) -> np.ndarray: """Extract image features using CosPlace model specifically trained for the Pittsburgh dataset. @@ -158,6 +182,46 @@ def clip_imgs( return np.concatenate(img_embeddings, axis=0) +def fair_clip_imgs( + img_files: list[str], + batch_size: int = 32, + model_name: tuple[str, str] = ("ViT-L-14", "datacomp_xl_s13b_b90k"), +) -> np.ndarray: + """Extract image features using CLIP model. + + Args: + img_files: list of image files + batch_size: batch size + model_name: name of the CLIP model. (architecture, pretrained) + + Returns: + image features + """ + model, _, preprocess = open_clip.create_model_and_transforms( + model_name[0], pretrained=model_name[1] + ) + # commonpool_xl_clip_s13b_b90k, commonpool_xl_s13b_b90k, commonpool_xl_laion_s13b_b90k, openai + model = model.cuda() + img_embeddings = [] + with torch.no_grad(), torch.cuda.amp.autocast(): + for i in tqdm(range(0, len(img_files), batch_size)): + batch = [] + for img_file in img_files[i : i + batch_size]: + if isinstance(img_file, str): + image = preprocess(Image.open(img_file)).unsqueeze(0) + elif isinstance(img_file, Path): + image = preprocess(Image.open(str(img_file))).unsqueeze(0) + elif isinstance(img_file, Image.Image): + image = preprocess(img_file).unsqueeze(0) + batch.append(image) + batch = torch.cat(batch, dim=0) + batch = batch.cuda() + image_features = model.encode_image(batch) + image_features /= image_features.norm(dim=-1, keepdim=True) + img_embeddings.append(image_features.detach().cpu().numpy()) + return np.concatenate(img_embeddings, axis=0) + + # clip text in batch with gpu def clip_text( text: list[str], @@ -189,6 +253,39 @@ def clip_text( return np.concatenate(text_features, axis=0) +def fair_clip_text( + text: list[str], + batch_size: int = 32, + model_name: tuple[str, str] = ("ViT-L-14", "openai"), +) -> np.ndarray: + """Extract text features using CLIP model. + + Args: + text: list of text + batch_size: batch size + model_name: name of the CLIP model. (architecture, pretrained) + + Returns: + text features + """ + model, _, _ = open_clip.create_model_and_transforms( + model_name[0], pretrained=model_name[1] + ) + tokenizer = open_clip.get_tokenizer(model_name[0]) + model = model.cuda() + + text_features = [] + with torch.no_grad(), torch.cuda.amp.autocast(): + for i in tqdm(range(0, len(text), batch_size)): + batch = text[i : i + batch_size] + batch = tokenizer(batch) + batch = batch.cuda() + batch = model.encode_text(batch) + batch /= batch.norm(dim=-1, keepdim=True) + text_features.append(batch.detach().cpu().numpy()) + return np.concatenate(text_features, axis=0) + + def gtr_text(text: list[str]) -> np.ndarray: """Extract text features using GTR model. 
diff --git a/pyproject.toml b/pyproject.toml index 696a1a3..878646c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ authors = [ ] description = "Multimodal Feature Extraction" readme = "README.md" -requires-python = ">=3.10" +requires-python = ">=3.9,<3.13" classifiers = [ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", @@ -32,7 +32,8 @@ description = "Multimodal Data Alignment" readme = "README.md" [tool.poetry.dependencies] -python = "^3.10" +matplotlib = "*" +python = ">=3.9,<3.13" ruff = "*" numpy = "^1.19" pandas = "*" @@ -48,9 +49,9 @@ hydra-core = "1.3.2" hydra-joblib-launcher = "1.2.0" protobuf = "3.20.*" torchvision = "*" +torchaudio = "*" ipython = "^8.15.0" -scipy = "^1.11.2" -swarm-visualizer = {git = "https://github.com/UTAustin-SwarmLab/Swarm-Visualization"} +scipy = "*" cca-zoo = "^2.5.0" ipykernel = "^6.29.3" jupyter = "^1.0.0" @@ -70,8 +71,11 @@ timm = "*" albumentations = "*" kaggle = "*" moviepy = "*" -imagebind = {git = "https://github.com/facebookresearch/ImageBind"} +# imagebind = {git = "https://github.com/facebookresearch/ImageBind"} # LLaVA = {git = "https://github.com/haotian-liu/LLaVA.git"} # contradicting with imagebind +# chronos = {git = "https://github.com/amazon-science/chronos-forecasting.git"} +aeon = {git = "https://github.com/aeon-toolkit/aeon.git", branch = "main"} +kagglehub = "*" [project.optional-dependencies] dev = ["black", "ruff", "mypy"]
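Editor's note: the `aeon` dependency added to pyproject.toml backs the new `load_handwriting` helpers introduced earlier in this diff. The sketch below only mirrors the calls those helpers already make; the shapes in the comments are the ones stated in the diff itself.

```python
# Hedged sketch of the aeon usage added by this PR (see load_handwriting in
# mmda/utils/dataset_utils.py): load the Handwriting dataset and merge splits.
import numpy as np
from aeon.datasets import load_classification

train_x, train_y = load_classification("Handwriting", split="train")  # (150, 3, 152)
test_x, test_y = load_classification("Handwriting", split="test")     # (850, 3, 152)
x = np.concatenate([train_x, test_x], axis=0)  # (1000, 3, 152)
y = np.concatenate([train_y, test_y], axis=0)  # string labels such as "1.0" ... "26.0"
print(x.shape, y.shape)
```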