Skip to content

Iclr-rebuttal #17

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -171,4 +171,5 @@ plots/*
# lock files
*.lock
.checkpoints/
.assets/
.assets/
*.keras
7 changes: 7 additions & 0 deletions bash_scripts/handwriting_script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# classification: sweep the similarity/CCA dimension on the handwriting dataset.
# Each run is independent; a failing run does not stop the following ones.
for sim_dim in 10 25 50 100 200 500; do
    CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=handwriting handwriting.sim_dim="${sim_dim}"
done
11 changes: 9 additions & 2 deletions bash_scripts/imagenet_script.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# mislabeled data
# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=10
# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=25
# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet imagenet.sim_dim=50
Expand All @@ -15,6 +16,12 @@
# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet train_test_ratio=0.1 imagenet.sim_dim=500
# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_mislabeled_data.py dataset=imagenet train_test_ratio=0.1 imagenet.sim_dim=700

# # classification
# classification
# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=imagenet
CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=imagenet imagenet.shuffle=True
# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=imagenet imagenet.shuffle=True

# SVM
# poetry run python mmda/linear_svm_clip.py train_test_ratio=0.1
# poetry run python mmda/linear_svm_clip.py train_test_ratio=0.3
# poetry run python mmda/linear_svm_clip.py train_test_ratio=0.5
# poetry run python mmda/linear_svm_clip.py train_test_ratio=0.7
7 changes: 6 additions & 1 deletion bash_scripts/leafy_script.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=leafy_spurge leafy_spurge.sim_dim=10
# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=leafy_spurge leafy_spurge.sim_dim=50
# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=leafy_spurge leafy_spurge.sim_dim=100
CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=leafy_spurge leafy_spurge.sim_dim=250
# CUDA_VISIBLE_DEVICES=1 poetry run python mmda/bimodal_classification.py dataset=leafy_spurge leafy_spurge.sim_dim=250

CUDA_VISIBLE_DEVICES=1 poetry run python mmda/linear_svm_clip.py dataset=leafy_spurge leafy_spurge.sim_dim=250 train_test_ratio=0.4
CUDA_VISIBLE_DEVICES=1 poetry run python mmda/linear_svm_clip.py dataset=leafy_spurge leafy_spurge.sim_dim=250 train_test_ratio=0.6
CUDA_VISIBLE_DEVICES=1 poetry run python mmda/linear_svm_clip.py dataset=leafy_spurge leafy_spurge.sim_dim=250 train_test_ratio=0.7
CUDA_VISIBLE_DEVICES=1 poetry run python mmda/linear_svm_clip.py dataset=leafy_spurge leafy_spurge.sim_dim=250 train_test_ratio=0.888
25 changes: 19 additions & 6 deletions config/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ noisy_train_set: True
repo_root: "/home/pl22767/Project/MMDA/"
# repo_root: "/home/po-han/Desktop/Projects/MMDA/"

dataset: "MSRVTT"
dataset: "handwriting"
dataset_level_datasets: [pitts, imagenet, cosmos, sop, tiil, musiccaps, flickr]
class_level_datasets: [sop]
object_level_datasets: [pitts, sop]
Expand All @@ -16,7 +16,7 @@ retrieval_datasets: [flickr]
any_retrieval_datasets: [KITTI, MSRVTT, BTC]
shuffle_llava_datasets: [pitts, sop] # datasets whose plots contains llava
mislabel_llava_datasets: [imagenet]
classification_datasets: [imagenet, leafy_spurge]
classification_datasets: [imagenet, leafy_spurge, handwriting]
dataset_size: {
sop: 56222,
musiccaps: 5397,
Expand All @@ -39,7 +39,6 @@ BTC:
save_path: ${BTC.paths.dataset_path}/any2any/
plots_path: ${repo_root}plots/BTC/


MSRVTT:
img_encoder: "clip"
audio_encoder: "clap"
Expand Down Expand Up @@ -92,9 +91,10 @@ musiccaps:
imagenet:
sim_dim: 700 # dimension of the similarity score and the CCA transformation
equal_weights: False
img_encoder: "dino"
text_encoder: "gtr"
train_test_ratios: [0.7] #, 0.3, 0.5, 0.7]
img_encoder: "clipopenai"
text_encoder: "clipdatacomp_xl_s13b_b90k"
model_name: "openai"
train_test_ratios: [0.3, 0.5, 0.7]
shuffle_ratios: [0.1, 0.3, 0.5, 0.7, 1.0]
shuffle: False
paths:
Expand All @@ -103,6 +103,19 @@ imagenet:
plots_path: ${repo_root}plots/ImageNet/
label_embeddings: ${imagenet.paths.dataset_path}_${text_encoder}_label_embeddings.npy

# Settings for the handwriting dataset (listed in classification_datasets).
handwriting:
  sim_dim: 50 # dimension of the similarity score and the CCA transformation
  equal_weights: True
  # Encoder names; the classification script also uses them to name its result folder.
  img_encoder: "tsfresh"
  text_encoder: "gtr"
  # Fractions of the data used for training, one experiment per entry.
  train_test_ratios: [0.85]
  shuffle: False
  paths:
    dataset_path: "/nas/pohan/datasets/Handwriting/"
    save_path: ${handwriting.paths.dataset_path}embeddings/
    plots_path: ${repo_root}plots/handwriting/
    # NOTE(review): ${text_encoder} is an absolute interpolation, so it is looked up
    # at the config root rather than in this section. Confirm a top-level
    # `text_encoder` key exists; otherwise ${handwriting.text_encoder} is presumably intended.
    label_embeddings: ${handwriting.paths.dataset_path}_${text_encoder}_label_embeddings.npy

leafy_spurge:
sim_dim: 700 # dimension of the similarity score and the CCA transformation
equal_weights: False
Expand Down
22 changes: 16 additions & 6 deletions mmda/bimodal_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@


@hydra.main(version_base=None, config_path="../config", config_name="main")
def main(cfg: DictConfig) -> None:
def main(cfg: DictConfig) -> None: # noqa: C901, PLR0915, PLR0912
"""Main function to generate the classification results of the bimodal datasets.

Args:
Expand All @@ -27,7 +27,12 @@ def main(cfg: DictConfig) -> None:
), f"{cfg.dataset} is not for classification."
cfg_dataset = cfg[cfg.dataset]
shuffle_tag = "shuffled" if cfg_dataset.shuffle else ""
ds_size = 50_000 if cfg.dataset == "imagenet" else 900
if cfg.dataset == "imagenet":
ds_size = 50_000
elif cfg.dataset == "leafy_spurge":
ds_size = 900
elif cfg.dataset == "handwriting":
ds_size = 1000
csv_save_path = (
Path(cfg_dataset.paths.plots_path)
/ f"classify_{cfg_dataset.text_encoder}_{cfg_dataset.img_encoder}/"
Expand All @@ -47,9 +52,9 @@ def main(cfg: DictConfig) -> None:
f.write(f"{shuffle_ratio},{cca_accs},{asif_accs}\n")
else:
for train_test_ratio in cfg_dataset.train_test_ratios:
asif_accs = asif_classification(cfg, train_test_ratio)
cca_accs = cca_classification(cfg, train_test_ratio)
clip_accs = clip_like_classification(cfg, train_test_ratio)
asif_accs = 0 if True else asif_classification(cfg, train_test_ratio)
clip_accs = 0 if True else clip_like_classification(cfg, train_test_ratio)
# write accuracy to file
if not csv_save_path.exists():
# create the file and write the header
Expand Down Expand Up @@ -77,7 +82,7 @@ def main(cfg: DictConfig) -> None:
label="CSA (ours)",
color="blue",
)
if not cfg_dataset.shuffle:
if not cfg_dataset.shuffle and cfg.dataset != "handwriting":
clip_accs = df["clip_accs"]
ax.plot(
ratios,
Expand All @@ -99,7 +104,12 @@ def main(cfg: DictConfig) -> None:
ax.set_ylabel("Classification accuracy", fontsize=20)
ax.xaxis.set_tick_params(labelsize=15)
ax.yaxis.set_tick_params(labelsize=15)
ax.set_ylim(0, 1.03) if cfg.dataset == "imagenet" else ax.set_ylim(0.4, 0.65)
if cfg.dataset == "imagenet":
ax.set_ylim(0, 1.03)
elif cfg.dataset == "leafy_spurge":
ax.set_ylim(0.4, 0.65)
else:
ax.set_ylim(0, 1.03)
(
ax.legend(loc="lower right", fontsize=18)
if not cfg_dataset.shuffle
Expand Down
15 changes: 8 additions & 7 deletions mmda/exps/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
import numpy as np
from omegaconf import DictConfig

from mmda.utils.cca_class import NormalizedCCA
from mmda.utils.cca_class import NormalizedCCA, ReNormalizedCCA
from mmda.utils.classification_dataset_class import load_classification_dataset
from mmda.utils.sim_utils import cosine_sim, weighted_corr_sim


def cca_classification(
cfg: DictConfig, train_test_ratio: float, shuffle_ratio: float = 0.0
) -> tuple[dict[float:float], dict[float : dict[float:float]]]:
) -> float:
"""Retrieve data using the proposed CCA method.

Args:
Expand All @@ -21,9 +21,10 @@ def cca_classification(
data_size2accuracy: {data_size: accuracy}
"""
cfg_dataset = cfg[cfg.dataset]
print(f"CCA {cfg_dataset.sim_dim}")
ds = load_classification_dataset(cfg)
ds.load_data(train_test_ratio, clip_bool=False, shuffle_ratio=shuffle_ratio)
cca = NormalizedCCA()
cca = ReNormalizedCCA() if True else NormalizedCCA()
ds.train_img, ds.train_text, corr = cca.fit_transform_train_data(
cfg_dataset, ds.train_img, ds.train_text
)
Expand All @@ -39,9 +40,7 @@ def sim_fn(x: np.array, y: np.array, corr: np.array = corr) -> np.array:
return ds.classification(sim_fn=sim_fn)


def clip_like_classification(
cfg: DictConfig, train_test_ratio: float
) -> tuple[dict[float:float], dict[float:float]]:
def clip_like_classification(cfg: DictConfig, train_test_ratio: float) -> float:
"""Retrieve data using the CLIP-like method.

Args:
Expand All @@ -50,6 +49,7 @@ def clip_like_classification(
Returns:
data_size2accuracy: {data_size: accuracy}
"""
print("CLIP-like")
ds = load_classification_dataset(cfg)
ds.load_data(train_test_ratio, clip_bool=True)
ds.get_labels_emb()
Expand All @@ -58,7 +58,7 @@ def clip_like_classification(

def asif_classification(
cfg: DictConfig, train_test_ratio: float, shuffle_ratio: float = 0.0
) -> tuple[dict[float:float], dict[float:float]]:
) -> float:
"""Retrieve data using the CLIP-like method.

Args:
Expand All @@ -68,6 +68,7 @@ def asif_classification(
Returns:
data_size2accuracy: {data_size: accuracy}
"""
print("ASIF")
ds = load_classification_dataset(cfg)
ds.load_data(train_test_ratio, clip_bool=False, shuffle_ratio=shuffle_ratio)
ds.get_labels_emb()
Expand Down
2 changes: 2 additions & 0 deletions mmda/exps/mislabel_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,8 @@ def __init__(self, *args, **kwargs): # noqa: ANN204, ANN002, ANN003
"valdata2align": valdata2align,
"valdata1unalign": valdata1unalign,
"valdata2unalign": valdata2unalign,
"train_idx": train_idx,
"train_wrong_labels_bool": train_wrong_labels_bool,
}
)
return alldata
Expand Down
68 changes: 66 additions & 2 deletions mmda/get_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from mmda.utils.dataset_utils import (
load_cosmos,
load_flickr,
load_handwriting,
load_imagenet,
load_kitti,
load_leafy_spurge,
Expand All @@ -25,22 +26,24 @@
load_tiil,
)
from mmda.utils.embed_data import (
chronos_ts,
clap_audio,
clap_text,
clip_imgs,
clip_text,
cosplace_img,
dinov2,
fair_clip_imgs,
fair_clip_text,
gtr_text,
)
from mmda.utils.imagebind_utils import ImageBindInference
from mmda.utils.video_audio_utils import (
get_video_emb,
prepare_audio_for_imagebind,
process_audio,
)

BATCH_SIZE = 256
BATCH_SIZE = 758


@hydra.main(version_base=None, config_path="../config", config_name="main")
Expand Down Expand Up @@ -92,6 +95,8 @@ def main(cfg: DictConfig) -> None: # noqa: PLR0915, C901, PLR0912
pickle.dump(clap_audio_features, f)

elif dataset == "MSRVTT":
from mmda.utils.imagebind_utils import ImageBindInference

_, captions, video_info_sen_order, video_dict = load_msrvtt(cfg_dataset)
id_order, img_paths, audio_start_secs, audio_num_secs = get_video_emb(
cfg_dataset, video_dict
Expand Down Expand Up @@ -372,6 +377,24 @@ def main(cfg: DictConfig) -> None: # noqa: PLR0915, C901, PLR0912
text_descriptions = ["An image of " + label + "." for label in orig_labels]

# get text embeddings
model = "openai"

img_emb = fair_clip_imgs(img_path, BATCH_SIZE, model_name=("ViT-L-14", model))
with Path(
cfg_dataset.paths.save_path, f"ImageNet_img_emb_clip{model}.pkl"
).open("wb") as f:
pickle.dump(img_emb, f)
print("FairCLIP embeddings saved")

text_emb = fair_clip_text(
text_descriptions, BATCH_SIZE, model_name=("ViT-L-14", model)
)
with Path(
cfg_dataset.paths.save_path, f"ImageNet_text_emb_clip{model}.pkl"
).open("wb") as f:
pickle.dump(text_emb, f)
print("FairCLIP embeddings saved")

text_emb = clip_text(text_descriptions, BATCH_SIZE)
with Path(cfg_dataset.paths.save_path, "ImageNet_text_emb_clip.pkl").open(
"wb"
Expand Down Expand Up @@ -536,6 +559,47 @@ def main(cfg: DictConfig) -> None: # noqa: PLR0915, C901, PLR0912
pickle.dump(img_emb, f)
print("CLIP embeddings saved")

elif dataset == "handwriting":
data, labels, num2alphabet, alphabets_hand = load_handwriting(cfg_dataset)
# save data
with Path(cfg_dataset.paths.save_path, "Handwriting_data.pkl").open("wb") as f:
pickle.dump(data, f)
print("Handwriting data saved")
return

embeddings = clip_imgs(alphabets_hand, 256)
print("text shape:", embeddings.shape)
with Path(cfg_dataset.paths.save_path, "Handwriting_emb_clip.pkl").open(
"wb"
) as f:
pickle.dump(embeddings, f)
print("CLIP embeddings saved")

sentences = [f"Alphabet {num2alphabet[label]}." for label in labels]
print(sentences[15:21])
embeddings = gtr_text(sentences)
assert np.allclose(
embeddings[15], embeddings[20], atol=1e-3, rtol=1e-4
), f"{embeddings[15].shape}!={embeddings[20].shape}"
with Path(cfg_dataset.paths.save_path, "Handwriting_emb_gtr.pkl").open(
"wb"
) as f:
pickle.dump(embeddings, f)
print("GTR shape:", embeddings.shape)
print("GTR embeddings saved")

embeddings = chronos_ts(data)
# check if embeddings has unique rows
assert embeddings.shape[0] == len(
np.unique(embeddings, axis=0)
), f"Embeddings has repeated entries. {embeddings.shape[0]}!={len(np.unique(embeddings, axis=0))}"
print("Chronos shape:", embeddings.shape)
with Path(cfg_dataset.paths.save_path, "Handwriting_emb_chronos.pkl").open(
"wb"
) as f:
pickle.dump(embeddings, f)
print("Chronos embeddings saved")

# TODO: add more datasets
else:
msg = f"Dataset {dataset} not supported."
Expand Down
29 changes: 29 additions & 0 deletions mmda/handwriting_baseline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""This script is for the handwriting baseline."""

import numpy as np
from aeon.classification.deep_learning import InceptionTimeClassifier
from omegaconf import DictConfig
from sklearn.metrics import accuracy_score

import hydra
from mmda.utils.dataset_utils import load_handwriting


@hydra.main(version_base=None, config_path="../config", config_name="main")
def main(cfg: DictConfig) -> None:
    """Train and evaluate the InceptionTime baseline on the handwriting dataset.

    For every ratio in ``cfg.handwriting.train_test_ratios`` the classifier is
    fitted on the first ``int(ratio * num_samples)`` samples and evaluated on
    the remaining ones. The resulting accuracy is printed to stdout.

    Args:
        cfg: Hydra configuration; only the ``handwriting`` section is read.
    """
    # Only the samples and labels are needed here. Star-unpacking keeps this
    # working regardless of how many extra values the loader returns
    # (mmda/get_embeddings.py unpacks four values from the same function).
    x, labels, *_ = load_handwriting(cfg_dataset=cfg.handwriting)
    inception = InceptionTimeClassifier()
    for train_test_ratio in cfg.handwriting.train_test_ratios:
        # Re-seed NumPy's global RNG before each run.
        np.random.seed(42)
        # The split is positional: leading samples train, trailing samples test.
        train_size = int(train_test_ratio * x.shape[0])
        print(x.shape, labels.shape)
        inception.fit(x[:train_size], labels[:train_size])
        y_pred = inception.predict(x[train_size:])
        accuracy = accuracy_score(labels[train_size:], y_pred)
        print(f"train_test_ratio: {train_test_ratio}, accuracy: {accuracy}")


# Script entry point: Hydra loads config "main" from ../config and passes it to main().
if __name__ == "__main__":
    main()
# Example invocation (CUDA_VISIBLE_DEVICES is set to an empty string):
# CUDA_VISIBLE_DEVICES="" poetry run python mmda/handwriting_baseline.py
Loading
Loading