diff --git a/demo/APO Sample Existing Demo.ipynb b/demo/APO Sample Existing Demo.ipynb
new file mode 100644
index 0000000..8e9df98
--- /dev/null
+++ b/demo/APO Sample Existing Demo.ipynb
@@ -0,0 +1,197 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "75f106cc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "sys.path.insert(0, '../')\n",
+    "\n",
+    "print(sys.path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "117a382f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import obsidian\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "print(f'obsidian version: {obsidian.__version__}')\n",
+    "\n",
+    "from obsidian.experiment import AdvExpDesigner\n",
+    "from obsidian.experiment.sampling import sample_with_bias, best_sample"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1cd0a6b0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Generate random data for this demo\n",
+    "np.random.seed(42)\n",
+    "\n",
+    "n = 1000\n",
+    "demo_data = pd.DataFrame({\n",
+    "    'reagent_conc': np.round(np.random.uniform(0.1, 1.0, n), 2),\n",
+    "    'ionic_strength': np.round(np.random.uniform(10, 100, n), 2),\n",
+    "    'surfactant_conc': np.round(np.random.uniform(0.01, 0.2, n), 3),\n",
+    "    'compound_A': np.round(np.random.uniform(0, 50, n), 2),\n",
+    "    'compound_B': np.round(np.random.uniform(0, 50, n), 2),\n",
+    "    'sugar': np.random.choice(['glucose', 'fructose', 'sucrose'], n),\n",
+    "    'surfactant': np.random.choice(['SDS', 'Tween20', 'TritonX'], n),\n",
+    "    'buffer': np.random.choice(['PBS', 'Tris', 'HEPES'], n),\n",
+    "    'pH': np.round(np.random.uniform(5.5, 8.5, n), 2)\n",
+    "})\n",
+    "\n",
+    "demo_data.index.name = 'FormulationID'\n",
+    "demo_data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "57ed0226",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Initialize the existing experimental data as an AdvExpDesigner object\n",
+    "designer = AdvExpDesigner(design_df=demo_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9feae24b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\"\"\n",
+    "You can sample an existing dataset with or without bias.\n",
+    "Bias dictionary format: {\"column\": [lower_bound, upper_bound, relative_weight]}\n",
+    "\n",
+    "- Weight >1 increases sampling probability for in-range rows.\n",
+    "- Weight <1 decreases it.\n",
+    "- Weight = 0 excludes those rows entirely.\n",
+    "\"\"\"\n",
+    "\n",
+    "bias = {\n",
+    "    \"ionic_strength\": [50, 60, 3.0],\n",
+    "}\n",
+    "\n",
+    "seed = np.random.randint(0, 1000)\n",
+    "print(f\"Random seed for reproducibility: {seed}\")\n",
+    "\n",
+    "# Draw a weighted random sample of n rows using built-in Pandas sampling.\n",
+    "# enforce=True restricts the sample to rows inside the bias bounds; the result may not be space-filling.\n",
+    "sample = sample_with_bias(designer.design, n=1000, replace=False, seed=seed, bias=bias, plot_weights=True, enforce=False)\n",
+    "\n",
+    "sample"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ab457ed8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# One-hot encode the categorical columns so Euclidean distances can be computed\n",
+    "df_encoded = pd.get_dummies(designer.design, columns=[\"sugar\", \"surfactant\", \"buffer\"], dtype=int)"
+   ]
+  },
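+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b3a5c7e9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Quick sanity check (illustrative addition, not required): the encoding\n",
+    "# should add one indicator column per category level and keep the row count.\n",
+    "print(f'original: {designer.design.shape}, encoded: {df_encoded.shape}')\n",
+    "df_encoded.head()"
+   ]
+  },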
"perform random sampling n_trial times, select the best one via criteria metric:\n", + "metric:\n", + " - \"maximin\": maximize the minimum pairwise Euclidean distance\n", + " - \"mean_nn\": maximize the mean nearest-neighbor Euclidean distance\n", + " - \"hybrid\": 0.6*maximin + 0.4*mean_nn \n", + "\"\"\"\n", + "seed = np.random.randint(0,1000)\n", + "print(f\"Random seed for reproducibility: {seed}\")\n", + "\n", + "optimal_sample, info = best_sample(\n", + " df_encoded, 10, feature_cols=df_encoded.columns, n_trials=1000,\n", + " bias=bias, plot_weights=True, enforce=False, random_state=seed, metric=\"hybrid\"\n", + ")\n", + "\n", + "print(info)\n", + "optimal_sample\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ea1bcd8", + "metadata": {}, + "outputs": [], + "source": [ + "#decode from one-hot encoding\n", + "normal_cols = list(optimal_sample.columns)[0:6]\n", + "encoded_cols = list(optimal_sample.columns)[6:]\n", + "decoded = pd.from_dummies(optimal_sample[encoded_cols],sep=\"_\")\n", + "optimal_design_decoded = pd.concat([optimal_sample[normal_cols], decoded], axis=1)\n", + "optimal_design_decoded" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2137e315", + "metadata": {}, + "outputs": [], + "source": [ + "print(designer.plot_histograms(optimal_design_decoded))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv (3.13.5)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/demo/Advanced Experimental Design.ipynb b/demo/Advanced Experimental Design.ipynb index b635be5..45d208b 100644 --- a/demo/Advanced Experimental Design.ipynb +++ b/demo/Advanced Experimental Design.ipynb @@ -33,10 +33,16 @@ "# Define continuous parameters: key -> (low, high, step)\n", "\n", "continuous_params = {\n", - " 'temperature': (20, 80, 5), # Linear steps of 5 between 20 and 80\n", - " 'concentration': (0.1, 1.0, 0.1), # Linear steps of 0.1 between 0.1 and 1.0\n", - " 'pressure': (1, 16, 'geometric'), # Geometric steps doubling from 1 to 16 (1, 2, 4, 8, 16)\n", - " 'time': (10, 1000, 'logarithmic') # Logarithmic steps (powers of 10) between 10 and 1000\n", + " 'temperature': (20, 80, 5), # Linear steps of 5 between 20 and 80\n", + " 'concentration': (0.1, 1.0, 0.1), # Linear steps of 0.1 between 0.1 and 1.0\n", + " 'pressure': (1, 16, 'geometric'), # Geometric steps doubling from 1 to 16 (1, 2, 4, 8, 16)\n", + " 'time': (10, 1000, 'logarithmic'), # Logarithmic steps (powers of 10) between 10 and 1000\n", + " 'flow_rate': [0.5, 1.0, 2.0, 5.0, 10.0], # Custom discrete levels, equal biases\n", + " 'Reagent Concentration': {\n", + " 'levels': [1.0, 2.0, 3.0, 5.0, 10.0], # Custom levels with biased sampling\n", + " 'biases': [0.1, 0.2, 0.4, 0.2, 0.1] # Higher probability for middle values\n", + "}\n", + "\n", "}\n", "\n", "# Define conditional categorical parameters with subparameters and frequencies: key -> {subkey: {'freq': frequency, 'subparams': ([values], [frequencies])}}\n", @@ -170,7 +176,7 @@ ], "metadata": { "kernelspec": { - "display_name": "obsidian", + "display_name": ".venv (3.13.5)", "language": "python", "name": "python3" }, @@ -184,7 +190,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": 
"ipython3", - "version": "3.10.14" + "version": "3.13.5" } }, "nbformat": 4, diff --git a/obsidian/experiment/advanced_design.py b/obsidian/experiment/advanced_design.py index e7594af..11581cf 100644 --- a/obsidian/experiment/advanced_design.py +++ b/obsidian/experiment/advanced_design.py @@ -22,25 +22,41 @@ class AdvExpDesigner: """ def __init__( - self, continuous_params, conditional_subparameters, subparam_mapping=None + self, continuous_params=None, conditional_subparameters=None, subparam_mapping=None, design_df=None ): """ Initializes the AdvExpDesigner with experimental parameters and optional subparameter mappings. :param continuous_params: A dictionary containing the continuous parameters for the design. + Each parameter can be specified as: + - (low, high, step): Linear spacing with a fixed step size + - (low, high, "geometric"): Geometric spacing (doubling) + - (low, high, "logarithmic"): Logarithmic spacing (powers of 10) + - [value1, value2, ...]: Custom list of specific levels to sample from + - {'levels': [value1, value2, ...], 'biases': [weight1, weight2, ...]}: + Custom levels with optional biases/weights for non-uniform sampling. + Biases will be normalized to sum to 1.0 if they don't already. :param conditional_subparameters: A dictionary containing the conditional subparameters for the design. :param subparam_mapping: A dictionary for mapping, will be inferred if not provided. + :param design_df: A Pandas DataFrame of an existing experimental design, default None """ - self.continuous_params = continuous_params - self.conditional_subparameters = conditional_subparameters - self.subparam_mapping = subparam_mapping or infer_subparam_mapping( - self.conditional_subparameters - ) - self.continuous_keys = list(self.continuous_params.keys()) - self.categorical_keys = list(self.conditional_subparameters.keys()) - self.subparam_key = ( - list(self.subparam_mapping.values())[0] if self.subparam_mapping else None - ) + self.continuous_params = continuous_params if continuous_params else {} + self.conditional_subparameters = conditional_subparameters if conditional_subparameters else {} + + if design_df is not None and not design_df.empty: + self.design = design_df + self.categorical_keys = design_df.select_dtypes(exclude=['number']).columns.tolist() + if continuous_params: + self.continuous_keys = list(self.continuous_params.keys()) + else: + self.continuous_keys = design_df.select_dtypes(include=['number']).columns.tolist() + else: + self.continuous_keys = list(self.continuous_params.keys()) if continuous_params else [] + self.categorical_keys = list(self.conditional_subparameters.keys()) if conditional_subparameters else [] + + self.subparam_mapping = subparam_mapping or infer_subparam_mapping(self.conditional_subparameters) + self.subparam_keys = list(self.subparam_mapping.values()) if self.subparam_mapping else [] + self.subparam_key = self.subparam_keys[0] if self.subparam_keys else None def generate_design(self, seed, n_samples, optimize_categories=True): """ @@ -76,6 +92,7 @@ def evaluate_design(self, design, metrics_to_optimize=None): "Pairwise Distance CV", "Max Continuous Corr", "Max Categorical Corr", + "Max Mixed Corr", ] return evaluate_design( @@ -114,6 +131,7 @@ def optimize_design( "Pairwise Distance CV", "Max Continuous Corr", "Max Categorical Corr", + "Max Mixed Corr", ] if maximize_metrics is None: maximize_metrics = "D-optimality" @@ -269,34 +287,74 @@ def sample_continuous_lhs(continuous_params, n_samples, seed): cont_samples = {} keys = 
     for idx, key in enumerate(keys):
-        low, high, step = continuous_params[key]
-        if step == "geometric":
-            possible = []
-            value = low
-            while value <= high:
-                possible.append(value)
-                value *= 2
-            possible = np.array(possible)
-            indices = np.floor(sample_cont[:, idx] * len(possible)).astype(int)
-            indices = np.clip(indices, 0, len(possible) - 1)
-            cont_samples[key] = possible[indices]
-        elif step == "logarithmic":
-            if low <= 0 or high <= 0:
-                raise ValueError(
-                    f"Logarithmic step requires positive low and high for parameter '{key}'"
-                )
-            exp_low = int(np.floor(np.log10(low)))
-            exp_high = int(np.floor(np.log10(high)))
-            possible = 10.0 ** np.arange(exp_low, exp_high + 1)
+        param_spec = continuous_params[key]
+
+        # Check if parameter specification is a dictionary with 'levels' and optional 'biases'
+        if isinstance(param_spec, dict):
+            levels = param_spec.get('levels')
+            biases = param_spec.get('biases', None)
+
+            if levels is None:
+                raise ValueError(f"Dictionary specification for parameter '{key}' must include 'levels' key")
+
+            possible = np.array(levels)
+
+            if biases is not None:
+                # Use biased sampling with the inverse transform method
+                biases = np.array(biases)
+                if len(biases) != len(possible):
+                    raise ValueError(f"Bias length ({len(biases)}) must match levels length ({len(possible)}) for parameter '{key}'")
+                if not np.isclose(np.sum(biases), 1.0):
+                    biases = biases / np.sum(biases)
+
+                # Create CDF for inverse transform sampling
+                cdf = np.cumsum(biases)
+                uniform_samples = sample_cont[:, idx]
+                indices = np.searchsorted(cdf, uniform_samples)
+                indices = np.clip(indices, 0, len(possible) - 1)
+                cont_samples[key] = possible[indices]
+            else:
+                # Uniform sampling across custom levels
+                indices = np.floor(sample_cont[:, idx] * len(possible)).astype(int)
+                indices = np.clip(indices, 0, len(possible) - 1)
+                cont_samples[key] = possible[indices]
+
+        # Custom list of levels without biases. Note: a 3-element list of scalars falls through to the (low, high, step) branch below; use {'levels': [...]} to force exactly three custom levels.
+        elif isinstance(param_spec, (list, tuple)) and not (len(param_spec) == 3 and not isinstance(param_spec[0], (list, tuple))):
+            possible = np.array(param_spec)
             indices = np.floor(sample_cont[:, idx] * len(possible)).astype(int)
             indices = np.clip(indices, 0, len(possible) - 1)
             cont_samples[key] = possible[indices]
         else:
-            num_steps = int(round((high - low) / step)) + 1
-            possible = np.linspace(low, high, num_steps)
-            indices = np.floor(sample_cont[:, idx] * num_steps).astype(int)
-            indices = np.clip(indices, 0, num_steps - 1)
-            cont_samples[key] = possible[indices]
+            # Traditional (low, high, step) format
+            low, high, step = param_spec
+            if step == "geometric":
+                possible = []
+                value = low
+                while value <= high:
+                    possible.append(value)
+                    value *= 2
+                possible = np.array(possible)
+                indices = np.floor(sample_cont[:, idx] * len(possible)).astype(int)
+                indices = np.clip(indices, 0, len(possible) - 1)
+                cont_samples[key] = possible[indices]
+            elif step == "logarithmic":
+                if low <= 0 or high <= 0:
+                    raise ValueError(
+                        f"Logarithmic step requires positive low and high for parameter '{key}'"
+                    )
+                exp_low = int(np.floor(np.log10(low)))
+                exp_high = int(np.floor(np.log10(high)))
+                possible = 10.0 ** np.arange(exp_low, exp_high + 1)
+                indices = np.floor(sample_cont[:, idx] * len(possible)).astype(int)
+                indices = np.clip(indices, 0, len(possible) - 1)
+                cont_samples[key] = possible[indices]
+            else:
+                num_steps = int(round((high - low) / step)) + 1
+                possible = np.linspace(low, high, num_steps)
+                indices = np.floor(sample_cont[:, idx] * num_steps).astype(int)
+                indices = np.clip(indices, 0, num_steps - 1)
+                cont_samples[key] = possible[indices]

     return cont_samples
@@ -426,13 +484,16 @@ def assign_conditional_subparameter(


 def infer_subparam_mapping(conditional_subparameters):
     mapping = {}
-    for cat_param, levels in conditional_subparameters.items():
-        subparam_candidates = set()
-        for level_info in levels.values():
-            subparams = [k for k in level_info if k != "freq"]
-            subparam_candidates.update(subparams)
-        if len(subparam_candidates) == 1:
-            mapping[cat_param] = subparam_candidates.pop()
+    if len(conditional_subparameters) == 0:
+        return mapping
+    else:
+        for cat_param, levels in conditional_subparameters.items():
+            subparam_candidates = set()
+            for level_info in levels.values():
+                subparams = [k for k in level_info if k != "freq"]
+                subparam_candidates.update(subparams)
+            if len(subparam_candidates) == 1:
+                mapping[cat_param] = subparam_candidates.pop()
     return mapping

@@ -495,18 +556,57 @@ def sample_design(

     # Round and format continuous variables
     for key in continuous_params.keys():
-        step = continuous_params[key][2]
-        if step == "geometric":
-            design[key] = design[key].round(3)
-        elif step == "logarithmic":
-            design[key] = design[key].round(5)
-        elif isinstance(step, (int, float)):
-            decimals = max(0, -int(np.floor(np.log10(step)))) if step < 1 else 0
-            design[key] = design[key].round(decimals)
-            if isinstance(step, int) or (isinstance(step, float) and step.is_integer()):
+        param_spec = continuous_params[key]
+
+        # Check if parameter specification is a dictionary with 'levels' and optional 'biases'
+        if isinstance(param_spec, dict):
+            levels = param_spec.get('levels')
+            possible = np.array(levels)
+            # Check if all values are integers
+            if np.all(np.equal(np.mod(possible, 1), 0)):
                 design[key] = design[key].astype(int)
+            else:
+                # Find the maximum number of decimal places in the custom levels
+                max_decimals = 0
+                for val in possible:
+                    if not np.isnan(val):
+                        val_str = str(float(val))
+                        if '.' in val_str:
+                            decimals = len(val_str.split('.')[1].rstrip('0'))
+                            max_decimals = max(max_decimals, decimals)
+                design[key] = design[key].round(max_decimals)
+
+        # Check if the parameter specification is a list (custom levels)
+        elif isinstance(param_spec, (list, tuple)) and not (len(param_spec) == 3 and not isinstance(param_spec[0], (list, tuple))):
+            # Custom levels - determine appropriate rounding from the values
+            possible = np.array(param_spec)
+            # Check if all values are integers
+            if np.all(np.equal(np.mod(possible, 1), 0)):
+                design[key] = design[key].astype(int)
+            else:
+                # Find the maximum number of decimal places in the custom levels
+                max_decimals = 0
+                for val in possible:
+                    if not np.isnan(val):
+                        val_str = str(float(val))
+                        if '.' in val_str:
+                            decimals = len(val_str.split('.')[1].rstrip('0'))
+                            max_decimals = max(max_decimals, decimals)
+                design[key] = design[key].round(max_decimals)
         else:
-            design[key] = design[key].round(5)
+            # Traditional (low, high, step) format
+            step = param_spec[2]
+            if step == "geometric":
+                design[key] = design[key].round(3)
+            elif step == "logarithmic":
+                design[key] = design[key].round(5)
+            elif isinstance(step, (int, float)):
+                decimals = max(0, -int(np.floor(np.log10(step)))) if step < 1 else 0
+                design[key] = design[key].round(decimals)
+                if isinstance(step, int) or (isinstance(step, float) and step.is_integer()):
+                    design[key] = design[key].astype(int)
+            else:
+                design[key] = design[key].round(5)

     # Round subparameters like pH
     for subparam_key in subparam_mapping.values():
@@ -600,8 +700,14 @@ def calculate_mixed_correlation_matrix(df, categorical_vars=None):

 # --- Design Quality Metrics ---

-def calculate_d_optimality(design, continuous_params_keys, pH_key):
-    continuous_keys = continuous_params_keys + [pH_key]
+def calculate_d_optimality(design, continuous_params_keys, subparam_keys):
+    """Calculate D-optimality including all continuous parameters and subparameters."""
+    if subparam_keys is None:
+        subparam_keys = []
+    elif isinstance(subparam_keys, str):
+        subparam_keys = [subparam_keys]
+
+    continuous_keys = continuous_params_keys + subparam_keys
     X = design[continuous_keys].values
     X_std = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
     X_model = np.column_stack((np.ones(X_std.shape[0]), X_std))
@@ -609,8 +715,14 @@
     return linalg.det(XtX)


-def calculate_a_optimality(design, continuous_params_keys, pH_key):
-    continuous_keys = continuous_params_keys + [pH_key]
+def calculate_a_optimality(design, continuous_params_keys, subparam_keys):
+    """Calculate A-optimality including all continuous parameters and subparameters."""
+    if subparam_keys is None:
+        subparam_keys = []
+    elif isinstance(subparam_keys, str):
+        subparam_keys = [subparam_keys]
+
+    continuous_keys = continuous_params_keys + subparam_keys
     X = design[continuous_keys].values
     X_std = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
     X_model = np.column_stack((np.ones(X_std.shape[0]), X_std))
@@ -623,8 +735,14 @@
     return np.inf


-def calculate_condition_number(design, continuous_params_keys, pH_key):
-    continuous_keys = continuous_params_keys + [pH_key]
+def calculate_condition_number(design, continuous_params_keys, subparam_keys):
+    """Calculate condition number including all continuous parameters and subparameters."""
+    if subparam_keys is None:
+        subparam_keys = []
+    elif isinstance(subparam_keys, str):
+        subparam_keys = [subparam_keys]
+
+    continuous_keys = continuous_params_keys + subparam_keys
     X = design[continuous_keys].values
     X_std = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
     X_model = np.column_stack((np.ones(X_std.shape[0]), X_std))
@@ -632,8 +750,14 @@
     return np.linalg.cond(XtX)


-def calculate_pairwise_distance_uniformity(design, continuous_params_keys, pH_key):
-    continuous_keys = continuous_params_keys + [pH_key]
+def calculate_pairwise_distance_uniformity(design, continuous_params_keys, subparam_keys):
+    """Calculate pairwise distance uniformity including all continuous parameters and subparameters."""
+    if subparam_keys is None:
+        subparam_keys = []
+    elif isinstance(subparam_keys, str):
+        subparam_keys = [subparam_keys]
+
+    continuous_keys = continuous_params_keys + subparam_keys
     X = design[continuous_keys].values
     X_std = (X - np.mean(X, axis=0)) / np.std(X, axis=0)

@@ -644,8 +768,14 @@
     return std_dist / mean_dist if mean_dist != 0 else np.inf


-def calculate_max_continuous_correlation(design, continuous_params_keys, pH_key):
-    continuous_keys = continuous_params_keys + [pH_key]
+def calculate_max_continuous_correlation(design, continuous_params_keys, subparam_keys):
+    """Calculate max continuous correlation including all continuous parameters and subparameters."""
+    if subparam_keys is None:
+        subparam_keys = []
+    elif isinstance(subparam_keys, str):
+        subparam_keys = [subparam_keys]
+
+    continuous_keys = continuous_params_keys + subparam_keys
     corr_matrix = design[continuous_keys].corr().abs()
     np.fill_diagonal(corr_matrix.values, 0)
     return corr_matrix.values.max()
@@ -659,12 +789,50 @@ def calculate_max_categorical_correlation(design, categorical_keys):
     return corr_matrix.abs().values.max()


+def calculate_max_mixed_correlation(design, continuous_keys, categorical_keys, subparam_mapping):
+    """
+    Calculate the maximum correlation between categorical and continuous/subparameter variables.
+    This captures mixed correlations that are neither purely continuous nor purely categorical.
+    Excludes parent-subparameter correlations (e.g., buffer ↔ pH) which are structural.
+
+    :param design: The design DataFrame
+    :param continuous_keys: List of continuous parameter names
+    :param categorical_keys: List of categorical parameter names
+    :param subparam_mapping: Dictionary mapping categorical variables to their subparameters
+    :return: Maximum absolute mixed correlation value
+    """
+    # Get all subparameter columns
+    subparam_keys = list(subparam_mapping.values()) if subparam_mapping else []
+
+    # Get all continuous columns (parameters + subparameters)
+    all_continuous = continuous_keys + subparam_keys
+
+    # Calculate full correlation matrix
+    full_corr = calculate_mixed_correlation_matrix(design, categorical_vars=categorical_keys)
+
+    # Extract mixed correlations: categorical ↔ continuous/subparameter
+    # Exclude parent-subparameter relationships (those are structural by design)
+    max_mixed = 0.0
+    for cat in categorical_keys:
+        for cont in all_continuous:
+            # Skip if this is a parent-subparameter relationship
+            if subparam_mapping and cat in subparam_mapping and subparam_mapping[cat] == cont:
+                continue  # Skip structural parent-subparameter correlation
+
+            if cat in full_corr.index and cont in full_corr.columns:
+                corr_val = abs(full_corr.loc[cat, cont])
+                max_mixed = max(max_mixed, corr_val)
+
+    return max_mixed
+
+
 # --- Dimensionality Reduction Plots ---


 def plot_pca(design, continuous_params_keys, subparam_mapping, hue=None):
-    pH_key = list(subparam_mapping.values())[0]
-    continuous_keys = continuous_params_keys + [pH_key]
+    """Plot PCA including all continuous parameters and subparameters."""
+    subparam_keys = list(subparam_mapping.values()) if subparam_mapping else []
+    continuous_keys = continuous_params_keys + subparam_keys
     X = design[continuous_keys].values
     X_std = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
     pca = PCA(n_components=2)
@@ -689,8 +857,9 @@
 def plot_mds(
     design, continuous_params_keys, subparam_mapping, hue=None, metric="euclidean"
 ):
-    pH_key = list(subparam_mapping.values())[0]
-    continuous_keys = continuous_params_keys + [pH_key]
+    """Plot MDS including all continuous parameters and subparameters."""
+    subparam_keys = list(subparam_mapping.values()) if subparam_mapping else []
+    continuous_keys = continuous_params_keys + subparam_keys
     X = design[continuous_keys].values
     X_std = (X - np.mean(X, axis=0)) / np.std(X, axis=0)

@@ -722,9 +891,9 @@ def plot_umap(
     min_dist=0.1,
     metric="euclidean",
 ):
-
-    categorical_keys = list(subparam_mapping.values())[0]
-    continuous_keys = continuous_params_keys + [categorical_keys]
+    """Plot UMAP including all continuous parameters and subparameters."""
+    subparam_keys = list(subparam_mapping.values()) if subparam_mapping else []
+    continuous_keys = continuous_params_keys + subparam_keys
     X = design[continuous_keys].values
     X_std = (X - np.mean(X, axis=0)) / np.std(X, axis=0)

@@ -780,27 +949,31 @@ def evaluate_design(
     :param design: The design DataFrame to evaluate.
     :param continuous_keys: List of continuous parameter names.
     :param categorical_keys: List of categorical parameter names.
-    :param subparam_mapping: Dictionary mapping categorical variable to its subparameter (e.g., {'buffer_type': 'pH'}).
+    :param subparam_mapping: Dictionary mapping each categorical variable to its subparameter (e.g., {'buffer': 'pH', 'sugar': '[Sugar] (%)'}).
     :param metrics_to_optimize: List of metric names to compute.
     :return: Dictionary of computed metrics.
     """
-    subparam_key = list(subparam_mapping.values())[0] if subparam_mapping else None
+    # Get all subparameter keys from the mapping
+    subparam_keys = list(subparam_mapping.values()) if subparam_mapping else []

     metrics = {
-        "D-optimality": calculate_d_optimality(design, continuous_keys, subparam_key),
-        "A-optimality": calculate_a_optimality(design, continuous_keys, subparam_key),
+        "D-optimality": calculate_d_optimality(design, continuous_keys, subparam_keys),
+        "A-optimality": calculate_a_optimality(design, continuous_keys, subparam_keys),
         "Condition Number": calculate_condition_number(
-            design, continuous_keys, subparam_key
+            design, continuous_keys, subparam_keys
         ),
         "Pairwise Distance CV": calculate_pairwise_distance_uniformity(
-            design, continuous_keys, subparam_key
+            design, continuous_keys, subparam_keys
         ),
         "Max Continuous Corr": calculate_max_continuous_correlation(
-            design, continuous_keys, subparam_key
+            design, continuous_keys, subparam_keys
         ),
         "Max Categorical Corr": calculate_max_categorical_correlation(
             design, categorical_keys
         ),
+        "Max Mixed Corr": calculate_max_mixed_correlation(
+            design, continuous_keys, categorical_keys, subparam_mapping
+        ),
     }

     return {k: v for k, v in metrics.items() if k in metrics_to_optimize}
@@ -859,6 +1032,7 @@ def find_best_design_parallel(
         "Pairwise Distance CV",
         "Max Continuous Corr",
         "Max Categorical Corr",
+        "Max Mixed Corr",
     ]
     if maximize_metrics is None:
         maximize_metrics = [True] + [False] * (len(metrics_to_optimize) - 1)
@@ -914,24 +1088,28 @@
 def plot_design_quality_evolution(metrics_df):
     metrics_df = metrics_df.sort_values("seed")

-    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
-    metrics = [
-        "D-optimality",
-        "A-optimality",
-        "Pairwise Distance CV",
-        "Max Continuous Corr",
-        "Max Categorical Corr",
-        "score",
-    ]
+    metrics = [c for c in metrics_df.columns if c != "seed"]  # plot every metric column; seed is the x-axis
+    n_metrics = len(metrics)
+
+    # Calculate grid dimensions
+    n_cols = 3
+    n_rows = (n_metrics + n_cols - 1) // n_cols
+
+    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows))
+    axes = axes.flatten()  # n_cols > 1, so subplots always returns an array of Axes

     for i, metric in enumerate(metrics):
-        ax = axes[i // 3, i % 3]
+        ax = axes[i]
         ax.bar(metrics_df["seed"].astype(str), metrics_df[metric])
         ax.set_title(f"{metric} vs Seed")
         ax.set_xlabel("Seed")
         ax.set_ylabel(metric)
         ax.tick_params(axis='x', rotation=45)
         ax.grid(axis="y")
+
+    # Hide unused subplots
+    for i in range(n_metrics, len(axes)):
+        axes[i].set_visible(False)

     plt.tight_layout()
     plt.show()
@@ -1055,6 +1233,7 @@ def evaluate_candidate(
        "Pairwise Distance CV": calculate_pairwise_distance_uniformity(combined_design, continuous_keys, subparam_key),
        "Max Continuous Corr": calculate_max_continuous_correlation(combined_design, continuous_keys, subparam_key),
        "Max Categorical Corr": calculate_max_categorical_correlation(combined_design, categorical_keys),
+       "Max Mixed Corr": calculate_max_mixed_correlation(combined_design, continuous_keys, categorical_keys, subparam_mapping),
     }

     return {
@@ -1087,6 +1266,7 @@ def extend_design(
         "Pairwise Distance CV",
         "Max Continuous Corr",
         "Max Categorical Corr",
+        "Max Mixed Corr",
     ]
     if maximize_metrics is None:
         maximize_metrics = [True] + [False] * (len(metrics_to_optimize) - 1)
diff --git a/obsidian/experiment/sampling.py b/obsidian/experiment/sampling.py
new file mode 100644
index 0000000..be4a48a
--- /dev/null
+++ b/obsidian/experiment/sampling.py
@@ -0,0 +1,144 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+
+
+def generate_weights(df, n, bias, plot_weights=False, enforce=False):
+    """
+    Generates a Pandas Series of weights for each datum given a particular bias.
+
+    df: DataFrame of candidates
+    n: size of the design to pick
+    bias: dictionary of biases in the format: {"column": [lower_bound, upper_bound, relative_weight]}
+        - Weight >1 increases sampling probability for in-range rows.
+        - Weight <1 decreases it.
+        - Weight = 0 excludes those rows entirely.
+    plot_weights: boolean, whether to plot the distribution of weights, default False
+    enforce: boolean, whether to restrict sampling to in-range rows, default False
+
+    Returns: Pandas Series of normalized row weights.
+    """
+    weights = pd.Series(1.0, index=df.index)
+    for col, params in bias.items():
+        lower, upper = params[0], params[1]
+        weight = params[2] if len(params) > 2 else 1.0
+        mask = df[col].between(lower, upper, inclusive="both")
+        if enforce:
+            weights *= mask.astype(float) * weight
+        else:
+            weights *= mask.astype(float) * weight + (~mask).astype(float) * 1.0
+    if enforce:
+        if (weights > 0).sum() < n:
+            raise ValueError(f"Not enough rows ({(weights > 0).sum()}) satisfy all enforce conditions for n={n}.")
+
+    weights = weights / weights.sum()
+
+    print("Weights min:", weights.min(), "max:", weights.max())
+
+    if plot_weights:
+        plt.figure(figsize=(8, 4))
+        plt.hist(weights, bins=50)
+        plt.title("Distribution of Sampling Weights")
+        plt.xlabel("Weight")
+        plt.ylabel("Count")
+        plt.show()
+
+    return weights
+
+
+def sample_with_bias(df, n, replace=False, seed=None, bias=None, enforce=False, plot_weights=False):
+    """
+    Returns a random Pandas DataFrame sample of data points from a population, with or without bias.
+
+    df: DataFrame of candidates
+    n: int, size of the design to pick
+    replace: boolean, allow or disallow sampling the same row more than once, default False
+    bias: dictionary of biases in the format: {"column": [lower_bound, upper_bound, relative_weight]}, default None
+        - Weight >1 increases sampling probability for in-range rows.
+        - Weight <1 decreases it.
+        - Weight = 0 excludes those rows entirely.
+    enforce: boolean, whether to restrict sampling to in-range rows, default False
+    plot_weights: boolean, whether to plot the distribution of weights, default False
+
+    Returns: Pandas DataFrame of sampled data points.
+    """
+    if bias:
+        w = generate_weights(df, n, bias, plot_weights, enforce)
+        return df.sample(n=n, replace=replace, random_state=seed, weights=w)
+    else:
+        return df.sample(n=n, replace=replace, random_state=seed)
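+
+
+# Illustrative example (a sketch, not part of the module API): with enforce=True,
+# out-of-range rows get weight 0 and can never be drawn.
+#
+#     toy = pd.DataFrame({"x": [0.5, 1.5, 2.5]})
+#     w = generate_weights(toy, n=1, bias={"x": [0, 1, 2.0]}, enforce=True)
+#     # raw weights [2.0, 0.0, 0.0] are normalized to [1.0, 0.0, 0.0]
+#     picked = sample_with_bias(toy, n=1, seed=0, bias={"x": [0, 1, 2.0]}, enforce=True)
+#     # always returns the x = 0.5 row; with enforce=False the other rows
+#     # would keep weight 1.0 and remain eligible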
+
+
+def _space_filling_score(Z, metric="hybrid"):
+    """
+    Z: (k, d) standardized features of the candidate sample
+    metric:
+        - "maximin": maximize the minimum pairwise distance
+        - "mean_nn": maximize the mean nearest-neighbor distance
+        - "hybrid": 0.6*maximin + 0.4*mean_nn (more stable in practice)
+    """
+    k = Z.shape[0]
+    D = np.sqrt(((Z[:, None, :] - Z[None, :, :])**2).sum(-1))
+    np.fill_diagonal(D, np.inf)
+    d_min = D[np.triu_indices(k, 1)].min()
+    d_mnn = D.min(axis=1).mean()
+    if metric == "maximin":
+        return d_min
+    if metric == "mean_nn":
+        return d_mnn
+    if metric == "hybrid":
+        return 0.6 * d_min + 0.4 * d_mnn
+    raise ValueError("Unknown metric")
+
+
+def best_sample(df, k, feature_cols, *, n_trials=500, bias=None, plot_weights=False, enforce=False,
+                random_state=None, standardize=True, dropna=True, metric="hybrid"):
+    """
+    Repeats random sampling n_trials times and returns the most space-filling sample.
+
+    df: DataFrame of candidates
+    k: size of the design to pick
+    feature_cols: columns that define “space” (numeric; one-hot encode categoricals if needed)
+    bias: optional dictionary in the format {"column": [lower_bound, upper_bound, relative_weight]},
+          used to weight the row draws (e.g., make pH in [6, 6.5] more likely)
+    """
+    base = df[feature_cols]
+    idx = base.dropna().index if dropna else base.index
+    dfv = df.loc[idx]
+    Xfull = base.loc[idx].to_numpy(dtype=float)
+
+    if bias:
+        weights = generate_weights(df, k, bias, plot_weights, enforce)
+    else:
+        weights = None
+
+    # standardize once using the FULL population (not per-trial) for fair geometry
+    if standardize:
+        mu = Xfull.mean(axis=0)
+        sig = Xfull.std(axis=0)
+        sig[sig == 0] = 1.0
+        def toZ(X): return (X - mu) / sig
+    else:
+        def toZ(X): return X
+
+    # align the bias weights (a Series on df.index) to the filtered candidates
+    w = None
+    if weights is not None:
+        w = weights.reindex(dfv.index).fillna(0.0)
+
+    rng = np.random.default_rng(random_state)  # reproducible stream
+    best_df = None
+    best_score = -np.inf
+
+    for _ in range(n_trials):
+        cand = dfv.sample(n=k, replace=False, weights=w, random_state=rng)
+        Z = toZ(cand[feature_cols].to_numpy(dtype=float))
+        s = _space_filling_score(Z, metric=metric)
+        if s > best_score:
+            best_score = s
+            best_df = cand
+
+    return best_df, {"score": best_score, "metric": metric, "n_trials": n_trials}
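+
+
+if __name__ == "__main__":
+    # Minimal usage sketch (illustrative only, not part of the library API):
+    # build a toy candidate pool with hypothetical column names, bias the
+    # draw toward ionic_strength in [50, 60], and keep the most space-filling
+    # subset of 8 rows over 100 trials.
+    gen = np.random.default_rng(0)
+    pool = pd.DataFrame({
+        "reagent_conc": gen.uniform(0.1, 1.0, 200),
+        "ionic_strength": gen.uniform(10, 100, 200),
+    })
+    toy_bias = {"ionic_strength": [50, 60, 3.0]}
+    subset, info = best_sample(
+        pool, 8, feature_cols=["reagent_conc", "ionic_strength"],
+        n_trials=100, bias=toy_bias, random_state=0,
+    )
+    print(info)
+    print(subset.sort_index())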