From f5415af979f81dccd922a89391e3c03aee744c66 Mon Sep 17 00:00:00 2001 From: Olivier Winter Date: Tue, 30 Sep 2025 16:00:05 +0100 Subject: [PATCH 1/5] WIP on lf_features models --- ...rain_region_predictor_gradient_boosting.py | 39 ++++++++++++++----- src/ephysatlas/data.py | 9 +++++ 2 files changed, 38 insertions(+), 10 deletions(-) diff --git a/examples/train_region_predictor_gradient_boosting.py b/examples/train_region_predictor_gradient_boosting.py index 6cc7749..ac31995 100644 --- a/examples/train_region_predictor_gradient_boosting.py +++ b/examples/train_region_predictor_gradient_boosting.py @@ -7,7 +7,7 @@ import sklearn.metrics from xgboost import XGBClassifier # pip install xgboost # https://xgboost.readthedocs.io/en/stable/prediction.html - +import xgboost import iblutil.numerical import ephysatlas.anatomy import ephysatlas.data @@ -15,10 +15,8 @@ import ephysatlas.regionclassifier VINTAGE = '2024_W50' -VINTAGE = '2025_W28' -path_features = Path(f'/Users/olivier/Documents/datadisk/ephys-atlas-decoding/features/{VINTAGE}') # mac -path_features = Path(f'/mnt/s0/ephys-atlas-decoding/features/{VINTAGE}') # parede -path_features = Path(f'/datadisk/Data/paper-ephys-atlas/ephys-atlas-decoding/features/{VINTAGE}') # ferret +VINTAGE = '2025_W39' +path_features = Path(f'/datadisk/Data/paper-ephys-atlas/ephys-atlas-decoding/features/ea_active/{VINTAGE}/agg_full') # ferret if not path_features.exists(): from one.api import ONE @@ -42,6 +40,26 @@ # "micro-manipulator", ] x_list = ephysatlas.features.voltage_features_set(FEATURE_SET) +# x_list = list(set(x_list) - set( +# ['aperiodic_exponent', +# 'aperiodic_offset', +# 'decay_fit_error', +# 'decay_fit_r_squared', +# 'decay_n_peaks', +# 'psd_residual_alpha', +# 'psd_residual_beta', +# 'psd_residual_delta', +# 'psd_residual_gamma', +# 'psd_residual_lfp', +# 'psd_residual_theta'])) + + +x_list = list(set(x_list) - set( + ['decay_fit_error', + 'decay_fit_r_squared' + ])) + + x_list.append("outside") TRAIN_LABEL = "Cosmos_id" # ['Beryl_id', 'Cosmos_id'] @@ -56,10 +74,10 @@ def train(test_idx, fold_label=''): train_idx = ~test_idx print(f"{fold_label}: {df_features.shape[0]} channels", f'training set {np.sum(test_idx) / test_idx.size}') df_features.loc[train_idx, :].groupby(TRAIN_LABEL).count() - x_train = df_features.loc[train_idx, x_list].values - x_test = df_features.loc[test_idx, x_list].values - y_train = df_features.loc[train_idx, TRAIN_LABEL].values - y_test = df_features.loc[test_idx, TRAIN_LABEL].values + x_train = df_features.loc[train_idx, x_list].values.astype(float) + x_test = df_features.loc[test_idx, x_list].values.astype(float) + y_train = df_features.loc[train_idx, TRAIN_LABEL].values.astype(float) + y_test = df_features.loc[test_idx, TRAIN_LABEL].values.astype(float) df_test = df_features.loc[test_idx, :].copy() classes = np.unique(df_features.loc[train_idx, TRAIN_LABEL]) @@ -82,6 +100,7 @@ def train(test_idx, fold_label=''): print(f"{fold_label} Accuracy: {accuracy}") np.testing.assert_array_equal(classes, rids) + return classifier.predict_proba(x_test), classifier, accuracy, confusion_matrix @@ -124,7 +143,7 @@ def train(test_idx, fold_label=''): accuracy = sklearn.metrics.accuracy_score(df_features[TRAIN_LABEL].values, df_predictions['prediction'].values.astype(int)) -sklearn.metrics.ConfusionMatrixDisplay.from_predictions(df_features[TRAIN_LABEL].values, df_predictions['prediction'].values.astype(int), normalize='true', cmap='Blues') +sklearn.metrics.ConfusionMatrixDisplay.from_predictions(df_features[TRAIN_LABEL].values, df_predictions['prediction'].values.astype(int), normalize='true', cmap='Blues', im_kw=dict(vmax=.75)) _, classifier, _, _ = train(test_idx=np.zeros(df_features.shape[0], dtype=bool)) meta = dict( RANDOM_SEED=rs, diff --git a/src/ephysatlas/data.py b/src/ephysatlas/data.py index ac9d23f..2d413d3 100644 --- a/src/ephysatlas/data.py +++ b/src/ephysatlas/data.py @@ -410,6 +410,15 @@ def read_features_from_disk( if strict: df_features = pd.DataFrame(ephysatlas.features.ModelRawFeatures(df_features)) + ialpha_bad = np.logical_or( + df_features['alpha_mean'].values >= (1e3 * np.nanmedian(df_features['alpha_mean'])), + df_features['alpha_std'].values >= 1e3 * np.nanmedian(df_features['alpha_std']) + ) + # then we join with the channel information to get coordinates and anatomical information + print(f"Number of channels with bad alpha: {np.sum(ialpha_bad)}") + df_features.loc[ialpha_bad, 'alpha_mean'] = np.nanmedian(df_features['alpha_mean']) + df_features.loc[ialpha_bad, 'alpha_std'] = np.nanmedian(df_features['alpha_std']) + return df_features From 185ad2be570f1e9e7c151fe884583c03ba9f3666 Mon Sep 17 00:00:00 2001 From: Olivier Date: Wed, 8 Oct 2025 22:04:49 +0100 Subject: [PATCH 2/5] features models --- ...rain_region_predictor_gradient_boosting.py | 18 ++++--- src/ephysatlas/features.py | 50 +++++-------------- 2 files changed, 22 insertions(+), 46 deletions(-) diff --git a/examples/train_region_predictor_gradient_boosting.py b/examples/train_region_predictor_gradient_boosting.py index ac31995..15200e5 100644 --- a/examples/train_region_predictor_gradient_boosting.py +++ b/examples/train_region_predictor_gradient_boosting.py @@ -14,15 +14,17 @@ import ephysatlas.fixtures import ephysatlas.regionclassifier -VINTAGE = '2024_W50' +PROJECT = 'ea_active' VINTAGE = '2025_W39' -path_features = Path(f'/datadisk/Data/paper-ephys-atlas/ephys-atlas-decoding/features/ea_active/{VINTAGE}/agg_full') # ferret +LOWQ = ephysatlas.fixtures.misaligned_pids + +root_path_features = Path(f'/datadisk/Data/paper-ephys-atlas/ephys-atlas-decoding/features') # ferret +path_features = root_path_features.joinpath(PROJECT, VINTAGE, 'agg_full') if not path_features.exists(): from one.api import ONE one = ONE() - ephysatlas.data.download_tables(path_features.parent, label=VINTAGE, one=one) -LOWQ = ephysatlas.fixtures.misaligned_pids + ephysatlas.data.download_tables(path_features, label=VINTAGE, one=one) brain_atlas = ephysatlas.anatomy.ClassifierAtlas() @@ -54,10 +56,10 @@ # 'psd_residual_theta'])) -x_list = list(set(x_list) - set( - ['decay_fit_error', - 'decay_fit_r_squared' - ])) +# x_list = list(set(x_list) - set( +# ['decay_fit_error', +# 'decay_fit_r_squared' +# ])) x_list.append("outside") diff --git a/src/ephysatlas/features.py b/src/ephysatlas/features.py index 0884c01..aeb51b4 100644 --- a/src/ephysatlas/features.py +++ b/src/ephysatlas/features.py @@ -284,24 +284,23 @@ class ModelLfFeatures(BaseChannelFeatures): rms_lf: Series[float] = pa.Field( coerce=True, metadata={"transform": lambda x: 20 * np.log10(x)} ) + psd_lfp: Series[float] = pa.Field(coerce=True) psd_delta: Series[float] = pa.Field(coerce=True) psd_theta: Series[float] = pa.Field(coerce=True) psd_alpha: Series[float] = pa.Field(coerce=True) psd_beta: Series[float] = pa.Field(coerce=True) psd_gamma: Series[float] = pa.Field(coerce=True) - psd_lfp: Series[float] = pa.Field(coerce=True) - aperiodic_offset: Optional[Series[float]] = pa.Field(coerce=True) - aperiodic_exponent: Optional[Series[float]] = pa.Field(coerce=True) - decay_fit_error: Optional[Series[float]] = pa.Field(coerce=True) - decay_fit_r_squared: Optional[Series[float]] = pa.Field(coerce=True) - decay_n_peaks: Optional[Series[int]] = pa.Field(coerce=True) + psd_residual_lfp: Optional[Series[float]] = pa.Field(coerce=True, nullable=True) psd_residual_delta: Optional[Series[float]] = pa.Field(coerce=True, nullable=True) psd_residual_theta: Optional[Series[float]] = pa.Field(coerce=True, nullable=True) psd_residual_alpha: Optional[Series[float]] = pa.Field(coerce=True, nullable=True) psd_residual_beta: Optional[Series[float]] = pa.Field(coerce=True, nullable=True) psd_residual_gamma: Optional[Series[float]] = pa.Field(coerce=True, nullable=True) - psd_residual_lfp: Optional[Series[float]] = pa.Field(coerce=True, nullable=True) - + aperiodic_offset: Optional[Series[float]] = pa.Field(coerce=True) + aperiodic_exponent: Optional[Series[float]] = pa.Field(coerce=True) + decay_fit_error: Optional[Series[float]] = pa.Field(coerce=True) + decay_fit_r_squared: Optional[Series[float]] = pa.Field(coerce=True) + decay_n_peaks: Optional[Series[int]] = pa.Field(coerce=True) class ModelCsdFeatures(BaseChannelFeatures): """Schema for current source density features. @@ -491,40 +490,15 @@ def voltage_features_set(features_list=FEATURES_LIST): for feature_group in features_list: match feature_group: case "raw_ap": - x_list += sorted( - list( - set(ModelApFeatures.to_schema().columns.keys()) - - set(["channel"]) - ) - ) + x_list += list(ModelApFeatures.to_schema().columns.keys()) case "raw_lf": - x_list += sorted( - list( - set(ModelLfFeatures.to_schema().columns.keys()) - - set(["channel"]) - ) - ) + x_list += list(ModelLfFeatures.to_schema().columns.keys()) case "raw_lf_csd": - x_list += sorted( - list( - set(ModelCsdFeatures.to_schema().columns.keys()) - - set(["channel"]) - ) - ) + x_list += list(ModelCsdFeatures.to_schema().columns.keys()) case "waveforms": - x_list += sorted( - list( - set(ModelSpikeFeatures.to_schema().columns.keys()) - - set(["channel"]) - ) - ) + x_list += list(ModelSpikeFeatures.to_schema().columns.keys()) case "micro-manipulator": - x_list += sorted( - list( - set(ModelHistologyPlanned.to_schema().columns.keys()) - - set(["channel"]) - ) - ) + x_list += list(ModelHistologyPlanned.to_schema().columns.keys()) return x_list From f387f7ac3a5b59788fbc34c33fc337563b7165e3 Mon Sep 17 00:00:00 2001 From: Olivier Winter Date: Fri, 10 Oct 2025 11:33:08 +0100 Subject: [PATCH 3/5] remove requirements.txt --- requirements.txt | 25 ------------------------- 1 file changed, 25 deletions(-) delete mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index c13e426..0000000 --- a/requirements.txt +++ /dev/null @@ -1,25 +0,0 @@ ---extra-index-url https://download.pytorch.org/whl/cpu -dartsort @ git+https://github.com/cwindolf/dartsort@iblsorter -dredge @ git+https://github.com/evarol/dredge@v0.2.2 -h5py -hdbscan -iblatlas >= 0.3.0 -ibllib -ibl_style -linear_operator -matplotlib -mock -numba -numpy >= 2.0 -opt-einsum -pandas -pandera -pydantic -pytest -scikit-learn -scipy >= 1.13 -seaborn -spikeinterface -torch >= 2.0 -tqdm -xgboost \ No newline at end of file From 401a22b3150ba3da175bb01eee83693a62a412d2 Mon Sep 17 00:00:00 2001 From: Olivier Winter Date: Fri, 10 Oct 2025 11:36:46 +0100 Subject: [PATCH 4/5] update changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 485b063..2eb3845 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,11 @@ This file documents the changes to the features for supported feature versions. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). +## [Unreleased] + +### Fixed +- `features.voltage_features_set` returns features by categories, sorted as the pydantic model definitions + ## [0.2.2] - 2025-09-25 ### Added From 701c00c60a4d84e086c2ba1cf7d09f2e60a2048b Mon Sep 17 00:00:00 2001 From: Pranav Rai Date: Wed, 22 Oct 2025 11:29:48 +0100 Subject: [PATCH 5/5] Moved the bad alpha filtering when processing per pid. --- src/ephysatlas/aggregation.py | 11 +++++++++++ src/ephysatlas/data.py | 8 -------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/ephysatlas/aggregation.py b/src/ephysatlas/aggregation.py index adb6c97..933a9bf 100644 --- a/src/ephysatlas/aggregation.py +++ b/src/ephysatlas/aggregation.py @@ -300,6 +300,7 @@ def get_aggregated_features_per_pid(snippet_df_per_pid: pd.DataFrame): Note: - The function requires that all rows have the same PID value. - Channel metadata is loaded from channels.pqt in the snippet directory's parent. + - Channels with bad alpha are filtered out. And those values are set to NaN. - The result includes both aggregated features and channel position information. """ # Ensure only one PID is present in the DataFrame @@ -319,6 +320,16 @@ def get_aggregated_features_per_pid(snippet_df_per_pid: pd.DataFrame): # Reset the index to make pid and channel regular columns agg_df_per_pid = agg_df_per_pid.reset_index() + #Filter out the channels with bad alpha + ialpha_bad = np.logical_or( + agg_df_per_pid['alpha_mean'].values >= (1e3 * np.nanmedian(agg_df_per_pid['alpha_mean'])), + agg_df_per_pid['alpha_std'].values >= 1e3 * np.nanmedian(agg_df_per_pid['alpha_std']) + ) + # then we join with the channel information to get coordinates and anatomical information + logger.info(f"Number of channels with bad alpha: {np.sum(ialpha_bad)}") + agg_df_per_pid.loc[ialpha_bad, 'alpha_mean'] = np.nan + agg_df_per_pid.loc[ialpha_bad, 'alpha_std'] = np.nan + # Load channel metadata (axial and lateral positions) from channels.pqt # Construct the path to the snippet directory to find its parent snippet_level_dir = Path(snippet_df_per_pid["base_level_dir"].iloc[0]) / Path( diff --git a/src/ephysatlas/data.py b/src/ephysatlas/data.py index 2d413d3..68f4ce7 100644 --- a/src/ephysatlas/data.py +++ b/src/ephysatlas/data.py @@ -410,14 +410,6 @@ def read_features_from_disk( if strict: df_features = pd.DataFrame(ephysatlas.features.ModelRawFeatures(df_features)) - ialpha_bad = np.logical_or( - df_features['alpha_mean'].values >= (1e3 * np.nanmedian(df_features['alpha_mean'])), - df_features['alpha_std'].values >= 1e3 * np.nanmedian(df_features['alpha_std']) - ) - # then we join with the channel information to get coordinates and anatomical information - print(f"Number of channels with bad alpha: {np.sum(ialpha_bad)}") - df_features.loc[ialpha_bad, 'alpha_mean'] = np.nanmedian(df_features['alpha_mean']) - df_features.loc[ialpha_bad, 'alpha_std'] = np.nanmedian(df_features['alpha_std']) return df_features