Commit 469de70

feat: optional test set logprobs during inference
1 parent 76b0516 commit 469de70

File tree: 3 files changed (+210 additions, -12 deletions)


distance_synth_data_genjaxmix.py

Lines changed: 161 additions & 0 deletions
@@ -0,0 +1,161 @@
# # %%
# %load_ext autoreload
# %autoreload 2

# %%
import jax
import jax.numpy as jnp
import numpy as np
import polars as pl

# %%
dataset_paths = [
    "data/CTGAN/covertype",
    "data/CTGAN/kddcup",
    "data/CTGAN/sydt",
    "data/lpm/CES",
    "data/lpm/PUMS",
    "data/lpm/PUMD",
]

# times = {
#     "covertype": 304.31628918647766,
#     "kddcup": 5809.921473503113,
#     "sydt": 4147.667294740677,
#     "CES": 49.699519634246826,
#     "PUMS": 557.475483417511,
#     "PUMD": 127.56089353561401,
# }
times_single_rejuvenation_100 = {
    "covertype": 5.269169092178345,
    "kddcup": 61.410168170928955,
    "sydt": 46.48610043525696,
    "CES": 2.556819438934326,
    "PUMS": 7.585843563079834,
    "PUMD": 5.1315598487854,
}
times_single_rejuvenation_300 = {
    "covertype": 35.914592266082764,
    "kddcup": 511.4724328517914,
    "sydt": 376.814204454422,
    "CES": 13.747230291366577,
    "PUMS": 57.392295598983765,
    "PUMD": 38.446903228759766,
}
times_single_rejuvenation_500 = {
    "covertype": 95.82071185112,
    "kddcup": 1392.8732736110687,
    "sydt": 1017.3137822151184,
    "CES": 33.89802026748657,
    "PUMS": 159.05469465255737,
    "PUMD": 109.2985634803772,
}



# %%
from minijaxmix.io import load_huggingface, discretize_dataframe, to_dummies
from minijaxmix.infer import sample_categorical
from minijaxmix.distances import js
from functools import partial

# partial_js = partial(js, batch_size=10)
# jit_js = jax.jit(partial_js)
jit_js = jax.jit(js)

dfs = []
for dataset_path in dataset_paths:
    print(dataset_path)
    train_df, test_df = load_huggingface(dataset_path)
    df = pl.concat((train_df, test_df))

    schema, discretized_df, categorical_idxs = discretize_dataframe(df)
    dummies_df = to_dummies(discretized_df)
    data = dummies_df.to_numpy().astype(np.bool_)

    train_data = data[:len(train_df)]
    test_data = data[len(train_df):][:10000]

    files = jnp.load(f"{dataset_path.split('/')[-1]}_single_rejuvenation.npz")

    p_ys = files["p_ys"]
    ws = files["ws"]

    n_sample = 10000

    cs = jax.random.categorical(jax.random.PRNGKey(0), jnp.log(p_ys), shape=(n_sample,))
    sample_ws = ws.take(cs, axis=0)
    n_categories = categorical_idxs.max() + 1

    samples = jax.vmap(sample_categorical, in_axes=(0, 0, None, None))(jax.random.split(jax.random.PRNGKey(0), n_sample), jnp.log(sample_ws), categorical_idxs, n_categories)

    distances = jit_js(jnp.array(test_data), jnp.array(samples))

    dfs.append(pl.DataFrame({
        "distance": np.array(distances),
        "dataset": dataset_path,
        "model": "GenJaxMix",
        "time": times_single_rejuvenation_300[dataset_path.split("/")[-1]]
    }))

# %%
result_df = pl.concat(dfs)

# %%
# times_no_rejuvenation = {
#     "covertype": 35.12278389930725,
#     "kddcup": 495.74342131614685,
#     "sydt": 365.2887644767761,
#     "CES": 13.5840482711792,
#     "PUMS": 55.90764021873474,
#     "PUMD": 38.18173289299011,
# }

# %%
dfs = []
for dataset_path in dataset_paths:
    print(dataset_path)
    train_df, test_df = load_huggingface(dataset_path)
    df = pl.concat((train_df, test_df))

    schema, discretized_df, categorical_idxs = discretize_dataframe(df)
    dummies_df = to_dummies(discretized_df)
    data = dummies_df.to_numpy().astype(np.bool_)

    train_data = data[:len(train_df)]
    test_data = data[len(train_df):][:10000]

    files = jnp.load(f"{dataset_path.split('/')[-1]}_single_rejuvenation_100.npz")

    p_ys = files["p_ys"]
    ws = files["ws"]

    n_sample = 10000

    cs = jax.random.categorical(jax.random.PRNGKey(0), jnp.log(p_ys), shape=(n_sample,))
    sample_ws = ws.take(cs, axis=0)
    n_categories = categorical_idxs.max() + 1

    samples = jax.vmap(sample_categorical, in_axes=(0, 0, None, None))(jax.random.split(jax.random.PRNGKey(0), n_sample), jnp.log(sample_ws), categorical_idxs, n_categories)

    distances = jit_js(jnp.array(test_data), jnp.array(samples))

    dfs.append(pl.DataFrame({
        "distance": np.array(distances),
        "dataset": dataset_path,
        "model": "GenJaxMix",
        "time": times_single_rejuvenation_100[dataset_path.split("/")[-1]]
    }))

# %%
no_rejuvenation_result_df = pl.concat(dfs)

# %%
prev_result_df = pl.read_parquet("distance_synth_data.parquet")

# %%
new_result_df = pl.concat((prev_result_df, result_df, no_rejuvenation_result_df), how="diagonal")

# %%
new_result_df.write_parquet("new_distance_synth_data.parquet")
# %%
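
(Editor's aside, not part of the committed file: the held-out rows prepared above could also be scored under the saved mixture with the same logprob/logsumexp pattern this commit adds to minijaxmix/infer.py below. A minimal sketch, reusing test_data, p_ys, and ws from the last loop iteration:)

from minijaxmix.query import logprob

# Per-row, per-cluster log densities: each test row scored against each cluster's weights in ws.
cluster_logprobs = jax.vmap(jax.vmap(logprob, in_axes=(None, 0)), in_axes=(0, None))(jnp.array(test_data), ws)
# Mixture log density per row (clusters weighted by p_ys), summed over the held-out rows.
heldout_logprob = jnp.sum(jax.nn.logsumexp(cluster_logprobs, b=p_ys, axis=1))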

minijaxmix/infer.py

Lines changed: 12 additions & 12 deletions
@@ -1,14 +1,8 @@
 import jax
-jax.config.update("jax_compilation_cache_dir", "jax_cache")
-jax.config.update("jax_persistent_cache_min_entry_size_bytes", -1)
-jax.config.update("jax_persistent_cache_min_compile_time_secs", 0)
-
 import jax.numpy as jnp
 from functools import partial
-import numpy as np
 from jaxtyping import Array, Float, Bool, Integer
-import time
-from minijaxmix.query import sample_dirichlet, sample_categorical
+from minijaxmix.query import sample_dirichlet, logprob

 ALPHA = 1e-5

@@ -22,8 +16,8 @@ def conditional_entropy(data, c):
     res = - jnp.sum(jnp.where(c, c * p_x_y, 0), axis=0) / jnp.sum(c, axis=0)
     return res

-@partial(jax.jit, static_argnames=("n_clusters", "n_gibbs", "n_categories", "n_branch", "rejuvenation", "minibatch_size"))
-def infer(key, data, categorical_idxs, n_clusters, n_gibbs, n_categories, n_branch=2, rejuvenation=True, minibatch_size=1000):
+@partial(jax.jit, static_argnames=("n_clusters", "n_gibbs", "n_categories", "n_branch", "rejuvenation", "minibatch_size", "test"))
+def infer(key, data, categorical_idxs, n_clusters, n_gibbs, n_categories, n_branch=2, rejuvenation=True, minibatch_size=1000, test=False, test_data=None):
     N, k = data.shape
     p_ys = jnp.zeros(n_clusters)
     p_ys = p_ys.at[0].set(1.)
@@ -77,7 +71,13 @@ def infer_step(carry, key_i):

         total_H_split = jnp.nansum(conditional_H * p_y) - jnp.nansum(p_y * jnp.log(p_y))

-        return (p_y, w, conditional_H), (total_H_split, total_H_hard_clustering)
+        if test:
+            logprobs = jax.vmap(jax.vmap(logprob, in_axes=(None, 0)), in_axes=(0, None))(test_data, w)
+            logprobs = jax.nn.logsumexp(logprobs, b=p_y, axis=1)
+            logprobs = jnp.sum(logprobs)
+            return (p_y, w, conditional_H), (total_H_split, total_H_hard_clustering, logprobs)
+        else:
+            return (p_y, w, conditional_H), (total_H_split, total_H_hard_clustering, None)

     def rejuvenation(carry, key):
         p_y, w, conditional_H = carry
@@ -100,13 +100,13 @@ def rejuvenation_step(p_y_w, key):
     keys = jax.random.split(subkey, n_clusters - 1)
     # we could use lax.scan here, but at the cost of padding each step to the max number of clusters

-    (p_ys, ws, conditional_H), (total_H_split, total_H_hard_clustering) = jax.lax.scan(
+    (p_ys, ws, conditional_H), (total_H_split, total_H_hard_clustering, logprobs) = jax.lax.scan(
         infer_step, (p_ys, ws, conditional_H), (keys, jnp.arange(n_clusters - 1)))

     if rejuvenation:
         (p_ys, ws, conditional_H), total_H_rejuvenation = rejuvenation((p_ys, ws, conditional_H), key)

-    return p_ys, ws, conditional_H, total_H_split, total_H_rejuvenation, total_H_hard_clustering
+    return p_ys, ws, conditional_H, total_H_split, total_H_rejuvenation, total_H_hard_clustering, logprobs

 def make_minibatches(key, data, c, num_clusters, minibatch_size):
     keys = jax.random.split(key, num_clusters)
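
(Editor's aside, not part of the commit: a minimal sketch of how the new test/test_data arguments might be called, reusing the preprocessing from distance_synth_data_genjaxmix.py above. The n_clusters and n_gibbs values are placeholders, not taken from this repository.)

import jax
import numpy as np
import polars as pl

from minijaxmix.io import load_huggingface, discretize_dataframe, to_dummies
from minijaxmix.infer import infer

# Build the boolean dummy encoding exactly as in the evaluation script above.
train_df, test_df = load_huggingface("data/lpm/CES")
df = pl.concat((train_df, test_df))
schema, discretized_df, categorical_idxs = discretize_dataframe(df)
data = to_dummies(discretized_df).to_numpy().astype(np.bool_)
train_data = data[:len(train_df)]
test_data = data[len(train_df):]

# With test=True, the final element of the returned tuple is the summed
# test-set log probability recorded at each split step of the scan.
(p_ys, ws, conditional_H,
 total_H_split, total_H_rejuvenation, total_H_hard_clustering,
 logprobs) = infer(
    jax.random.PRNGKey(0),
    train_data,
    categorical_idxs,
    n_clusters=100,                                # placeholder hyperparameters
    n_gibbs=100,
    n_categories=int(categorical_idxs.max()) + 1,
    test=True,                                     # new in this commit
    test_data=test_data,
)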

plot_time.py

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
# %%
import plotnine as pn
import polars as pl

# %%
df = pl.read_parquet("new_distance_synth_data.parquet")
df

# %%
median_df = df.group_by(["dataset", "model", "time"]).agg(pl.median("distance").alias("median_distance"))
median_df


# %%
dataset_map = {
    "data/CTGAN/covertype": "Covertype",
    "data/CTGAN/kddcup": "KDDCup",
    "data/CTGAN/sydt": "SYDT",
    "data/lpm/CES": "CES",
    "data/lpm/PUMS": "PUMS",
    "data/lpm/PUMD": "PUMD",
}
median_df = median_df.with_columns(pl.col("dataset").replace(dataset_map))

# %%
(
    # pn.ggplot(median_df.filter(pl.col("dataset") == "data/CTGAN/covertype"))
    pn.ggplot(median_df)
    + pn.geom_line(pn.aes(x="time", y="median_distance", color="model", fill="model"))
    + pn.geom_point(pn.aes(x="time", y="median_distance", color="model", fill="model"))
    + pn.labs(y="2D Jensen-Shannon distance between\nreal and synthetic data (median)", x="Training time (seconds)")
    + pn.scale_x_log10()
    + pn.scale_y_log10()
    + pn.facet_wrap("~dataset", scales="free")
)

# %%
