From f5415af979f81dccd922a89391e3c03aee744c66 Mon Sep 17 00:00:00 2001
From: Olivier Winter <olivier.winter@hotmail.fr>
Date: Tue, 30 Sep 2025 16:00:05 +0100
Subject: [PATCH 1/5] WIP on lf_features models

---
 ...rain_region_predictor_gradient_boosting.py | 39 ++++++++++++++-----
 src/ephysatlas/data.py                        |  9 +++++
 2 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/examples/train_region_predictor_gradient_boosting.py b/examples/train_region_predictor_gradient_boosting.py
index 6cc7749..ac31995 100644
--- a/examples/train_region_predictor_gradient_boosting.py
+++ b/examples/train_region_predictor_gradient_boosting.py
@@ -7,7 +7,7 @@
 
 import sklearn.metrics
 from xgboost import XGBClassifier  # pip install xgboost  # https://xgboost.readthedocs.io/en/stable/prediction.html
-
+import xgboost
 import iblutil.numerical
 import ephysatlas.anatomy
 import ephysatlas.data
@@ -15,10 +15,8 @@
 import ephysatlas.regionclassifier
 
 VINTAGE = '2024_W50'
-VINTAGE = '2025_W28'
-path_features = Path(f'/Users/olivier/Documents/datadisk/ephys-atlas-decoding/features/{VINTAGE}')  # mac
-path_features = Path(f'/mnt/s0/ephys-atlas-decoding/features/{VINTAGE}')  # parede
-path_features = Path(f'/datadisk/Data/paper-ephys-atlas/ephys-atlas-decoding/features/{VINTAGE}')  # ferret
+VINTAGE = '2025_W39'
+path_features = Path(f'/datadisk/Data/paper-ephys-atlas/ephys-atlas-decoding/features/ea_active/{VINTAGE}/agg_full')  # ferret
 
 if not path_features.exists():
     from one.api import ONE
@@ -42,6 +40,26 @@
     # "micro-manipulator",
 ]
 x_list = ephysatlas.features.voltage_features_set(FEATURE_SET)
+# x_list = list(set(x_list) - set(
+#     ['aperiodic_exponent',
+#  'aperiodic_offset',
+#  'decay_fit_error',
+#  'decay_fit_r_squared',
+#  'decay_n_peaks',
+#  'psd_residual_alpha',
+#  'psd_residual_beta',
+#  'psd_residual_delta',
+#  'psd_residual_gamma',
+#  'psd_residual_lfp',
+#  'psd_residual_theta']))
+
+
+x_list = list(set(x_list) - set(
+    ['decay_fit_error',
+ 'decay_fit_r_squared'
+     ]))
+
+
 x_list.append("outside")
 
 TRAIN_LABEL = "Cosmos_id"  # ['Beryl_id', 'Cosmos_id']
@@ -56,10 +74,10 @@ def train(test_idx, fold_label=''):
     train_idx = ~test_idx
     print(f"{fold_label}: {df_features.shape[0]} channels", f'training set {np.sum(test_idx) / test_idx.size}')
     df_features.loc[train_idx, :].groupby(TRAIN_LABEL).count()
-    x_train = df_features.loc[train_idx, x_list].values
-    x_test = df_features.loc[test_idx, x_list].values
-    y_train = df_features.loc[train_idx, TRAIN_LABEL].values
-    y_test = df_features.loc[test_idx, TRAIN_LABEL].values
+    x_train = df_features.loc[train_idx, x_list].values.astype(float)
+    x_test = df_features.loc[test_idx, x_list].values.astype(float)
+    y_train = df_features.loc[train_idx, TRAIN_LABEL].values.astype(float)
+    y_test = df_features.loc[test_idx, TRAIN_LABEL].values.astype(float)
     df_test = df_features.loc[test_idx, :].copy()
     classes = np.unique(df_features.loc[train_idx, TRAIN_LABEL])
 
@@ -82,6 +100,7 @@ def train(test_idx, fold_label=''):
     print(f"{fold_label} Accuracy: {accuracy}")
 
     np.testing.assert_array_equal(classes, rids)
+
     return classifier.predict_proba(x_test), classifier, accuracy, confusion_matrix
 
 
@@ -124,7 +143,7 @@ def train(test_idx, fold_label=''):
 
 
 accuracy = sklearn.metrics.accuracy_score(df_features[TRAIN_LABEL].values, df_predictions['prediction'].values.astype(int))
-sklearn.metrics.ConfusionMatrixDisplay.from_predictions(df_features[TRAIN_LABEL].values, df_predictions['prediction'].values.astype(int), normalize='true', cmap='Blues')
+sklearn.metrics.ConfusionMatrixDisplay.from_predictions(df_features[TRAIN_LABEL].values, df_predictions['prediction'].values.astype(int), normalize='true', cmap='Blues', im_kw=dict(vmax=.75))
 _, classifier, _, _ = train(test_idx=np.zeros(df_features.shape[0], dtype=bool))
 meta = dict(
     RANDOM_SEED=rs,
diff --git a/src/ephysatlas/data.py b/src/ephysatlas/data.py
index ac9d23f..2d413d3 100644
--- a/src/ephysatlas/data.py
+++ b/src/ephysatlas/data.py
@@ -410,6 +410,15 @@ def read_features_from_disk(
     if strict:
         df_features = pd.DataFrame(ephysatlas.features.ModelRawFeatures(df_features))
 
+    ialpha_bad = np.logical_or(
+        df_features['alpha_mean'].values >= (1e3 * np.nanmedian(df_features['alpha_mean'])),
+        df_features['alpha_std'].values >= 1e3 * np.nanmedian(df_features['alpha_std'])
+    )
+    # then we join with the channel information to get coordinates and anatomical information
+    print(f"Number of channels with bad alpha: {np.sum(ialpha_bad)}")
+    df_features.loc[ialpha_bad, 'alpha_mean'] = np.nanmedian(df_features['alpha_mean'])
+    df_features.loc[ialpha_bad, 'alpha_std'] = np.nanmedian(df_features['alpha_std'])
+
     return df_features
 
 

From 185ad2be570f1e9e7c151fe884583c03ba9f3666 Mon Sep 17 00:00:00 2001
From: Olivier <olivier.winter@hotmail.fr>
Date: Wed, 8 Oct 2025 22:04:49 +0100
Subject: [PATCH 2/5] features models

---
 ...rain_region_predictor_gradient_boosting.py | 18 ++++---
 src/ephysatlas/features.py                    | 50 +++++--------------
 2 files changed, 22 insertions(+), 46 deletions(-)

diff --git a/examples/train_region_predictor_gradient_boosting.py b/examples/train_region_predictor_gradient_boosting.py
index ac31995..15200e5 100644
--- a/examples/train_region_predictor_gradient_boosting.py
+++ b/examples/train_region_predictor_gradient_boosting.py
@@ -14,15 +14,17 @@
 import ephysatlas.fixtures
 import ephysatlas.regionclassifier
 
-VINTAGE = '2024_W50'
+PROJECT = 'ea_active'
 VINTAGE = '2025_W39'
-path_features = Path(f'/datadisk/Data/paper-ephys-atlas/ephys-atlas-decoding/features/ea_active/{VINTAGE}/agg_full')  # ferret
+LOWQ = ephysatlas.fixtures.misaligned_pids
+
+root_path_features = Path(f'/datadisk/Data/paper-ephys-atlas/ephys-atlas-decoding/features')  # ferret
+path_features = root_path_features.joinpath(PROJECT, VINTAGE, 'agg_full')
 
 if not path_features.exists():
     from one.api import ONE
     one = ONE()
-    ephysatlas.data.download_tables(path_features.parent, label=VINTAGE, one=one)
-LOWQ = ephysatlas.fixtures.misaligned_pids
+    ephysatlas.data.download_tables(path_features, label=VINTAGE, one=one)
 
 brain_atlas = ephysatlas.anatomy.ClassifierAtlas()
 
@@ -54,10 +56,10 @@
 #  'psd_residual_theta']))
 
 
-x_list = list(set(x_list) - set(
-    ['decay_fit_error',
- 'decay_fit_r_squared'
-     ]))
+# x_list = list(set(x_list) - set(
+#     ['decay_fit_error',
+#  'decay_fit_r_squared'
+#      ]))
 
 
 x_list.append("outside")
diff --git a/src/ephysatlas/features.py b/src/ephysatlas/features.py
index 0884c01..aeb51b4 100644
--- a/src/ephysatlas/features.py
+++ b/src/ephysatlas/features.py
@@ -284,24 +284,23 @@ class ModelLfFeatures(BaseChannelFeatures):
     rms_lf: Series[float] = pa.Field(
         coerce=True, metadata={"transform": lambda x: 20 * np.log10(x)}
     )
+    psd_lfp: Series[float] = pa.Field(coerce=True)
     psd_delta: Series[float] = pa.Field(coerce=True)
     psd_theta: Series[float] = pa.Field(coerce=True)
     psd_alpha: Series[float] = pa.Field(coerce=True)
     psd_beta: Series[float] = pa.Field(coerce=True)
     psd_gamma: Series[float] = pa.Field(coerce=True)
-    psd_lfp: Series[float] = pa.Field(coerce=True)
-    aperiodic_offset: Optional[Series[float]] = pa.Field(coerce=True)
-    aperiodic_exponent: Optional[Series[float]] = pa.Field(coerce=True)
-    decay_fit_error: Optional[Series[float]] = pa.Field(coerce=True)
-    decay_fit_r_squared: Optional[Series[float]] = pa.Field(coerce=True)
-    decay_n_peaks: Optional[Series[int]] = pa.Field(coerce=True)
+    psd_residual_lfp: Optional[Series[float]] = pa.Field(coerce=True, nullable=True)
     psd_residual_delta: Optional[Series[float]] = pa.Field(coerce=True, nullable=True)
     psd_residual_theta: Optional[Series[float]] = pa.Field(coerce=True, nullable=True)
     psd_residual_alpha: Optional[Series[float]] = pa.Field(coerce=True, nullable=True)
     psd_residual_beta: Optional[Series[float]] = pa.Field(coerce=True, nullable=True)
     psd_residual_gamma: Optional[Series[float]] = pa.Field(coerce=True, nullable=True)
-    psd_residual_lfp: Optional[Series[float]] = pa.Field(coerce=True, nullable=True)
-
+    aperiodic_offset: Optional[Series[float]] = pa.Field(coerce=True)
+    aperiodic_exponent: Optional[Series[float]] = pa.Field(coerce=True)
+    decay_fit_error: Optional[Series[float]] = pa.Field(coerce=True)
+    decay_fit_r_squared: Optional[Series[float]] = pa.Field(coerce=True)
+    decay_n_peaks: Optional[Series[int]] = pa.Field(coerce=True)
 
 class ModelCsdFeatures(BaseChannelFeatures):
     """Schema for current source density features.
@@ -491,40 +490,15 @@ def voltage_features_set(features_list=FEATURES_LIST):
     for feature_group in features_list:
         match feature_group:
             case "raw_ap":
-                x_list += sorted(
-                    list(
-                        set(ModelApFeatures.to_schema().columns.keys())
-                        - set(["channel"])
-                    )
-                )
+                x_list += list(ModelApFeatures.to_schema().columns.keys())
             case "raw_lf":
-                x_list += sorted(
-                    list(
-                        set(ModelLfFeatures.to_schema().columns.keys())
-                        - set(["channel"])
-                    )
-                )
+                x_list += list(ModelLfFeatures.to_schema().columns.keys())
             case "raw_lf_csd":
-                x_list += sorted(
-                    list(
-                        set(ModelCsdFeatures.to_schema().columns.keys())
-                        - set(["channel"])
-                    )
-                )
+                x_list += list(ModelCsdFeatures.to_schema().columns.keys())
             case "waveforms":
-                x_list += sorted(
-                    list(
-                        set(ModelSpikeFeatures.to_schema().columns.keys())
-                        - set(["channel"])
-                    )
-                )
+                x_list += list(ModelSpikeFeatures.to_schema().columns.keys())
             case "micro-manipulator":
-                x_list += sorted(
-                    list(
-                        set(ModelHistologyPlanned.to_schema().columns.keys())
-                        - set(["channel"])
-                    )
-                )
+                x_list += list(ModelHistologyPlanned.to_schema().columns.keys())
     return x_list
 
 

From f387f7ac3a5b59788fbc34c33fc337563b7165e3 Mon Sep 17 00:00:00 2001
From: Olivier Winter <olivier.winter@hotmail.fr>
Date: Fri, 10 Oct 2025 11:33:08 +0100
Subject: [PATCH 3/5] remove requirements.txt

---
 requirements.txt | 25 -------------------------
 1 file changed, 25 deletions(-)
 delete mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index c13e426..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,25 +0,0 @@
---extra-index-url https://download.pytorch.org/whl/cpu
-dartsort @ git+https://github.com/cwindolf/dartsort@iblsorter
-dredge @ git+https://github.com/evarol/dredge@v0.2.2
-h5py
-hdbscan
-iblatlas >= 0.3.0
-ibllib
-ibl_style
-linear_operator
-matplotlib
-mock
-numba
-numpy >= 2.0
-opt-einsum
-pandas
-pandera
-pydantic
-pytest
-scikit-learn
-scipy >= 1.13
-seaborn
-spikeinterface
-torch >= 2.0
-tqdm
-xgboost
\ No newline at end of file

From 401a22b3150ba3da175bb01eee83693a62a412d2 Mon Sep 17 00:00:00 2001
From: Olivier Winter <olivier.winter@hotmail.fr>
Date: Fri, 10 Oct 2025 11:36:46 +0100
Subject: [PATCH 4/5] update changelog

---
 CHANGELOG.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 485b063..2eb3845 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,11 @@
 This file documents the changes to the features for supported feature versions.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 
+## [Unreleased]
+
+### Fixed
+-   `features.voltage_features_set` returns features by categories, sorted as the pydantic model definitions
+
 ## [0.2.2] - 2025-09-25
 
 ### Added

From 701c00c60a4d84e086c2ba1cf7d09f2e60a2048b Mon Sep 17 00:00:00 2001
From: Pranav Rai <rpranaviitk@gmail.com>
Date: Wed, 22 Oct 2025 11:29:48 +0100
Subject: [PATCH 5/5] Moved the bad alpha filtering when processing per pid.

---
 src/ephysatlas/aggregation.py | 11 +++++++++++
 src/ephysatlas/data.py        |  8 --------
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/ephysatlas/aggregation.py b/src/ephysatlas/aggregation.py
index adb6c97..933a9bf 100644
--- a/src/ephysatlas/aggregation.py
+++ b/src/ephysatlas/aggregation.py
@@ -300,6 +300,7 @@ def get_aggregated_features_per_pid(snippet_df_per_pid: pd.DataFrame):
     Note:
         - The function requires that all rows have the same PID value.
         - Channel metadata is loaded from channels.pqt in the snippet directory's parent.
+        - Channels with bad alpha are filtered out. And those values are set to NaN.
         - The result includes both aggregated features and channel position information.
     """
     # Ensure only one PID is present in the DataFrame
@@ -319,6 +320,16 @@ def get_aggregated_features_per_pid(snippet_df_per_pid: pd.DataFrame):
     # Reset the index to make pid and channel regular columns
     agg_df_per_pid = agg_df_per_pid.reset_index()
 
+    #Filter out the channels with bad alpha
+    ialpha_bad = np.logical_or(
+        agg_df_per_pid['alpha_mean'].values >= (1e3 * np.nanmedian(agg_df_per_pid['alpha_mean'])),
+        agg_df_per_pid['alpha_std'].values >= 1e3 * np.nanmedian(agg_df_per_pid['alpha_std'])
+    )
+    # then we join with the channel information to get coordinates and anatomical information
+    logger.info(f"Number of channels with bad alpha: {np.sum(ialpha_bad)}")
+    agg_df_per_pid.loc[ialpha_bad, 'alpha_mean'] = np.nan
+    agg_df_per_pid.loc[ialpha_bad, 'alpha_std'] = np.nan
+
     # Load channel metadata (axial and lateral positions) from channels.pqt
     # Construct the path to the snippet directory to find its parent
     snippet_level_dir = Path(snippet_df_per_pid["base_level_dir"].iloc[0]) / Path(
diff --git a/src/ephysatlas/data.py b/src/ephysatlas/data.py
index 2d413d3..68f4ce7 100644
--- a/src/ephysatlas/data.py
+++ b/src/ephysatlas/data.py
@@ -410,14 +410,6 @@ def read_features_from_disk(
     if strict:
         df_features = pd.DataFrame(ephysatlas.features.ModelRawFeatures(df_features))
 
-    ialpha_bad = np.logical_or(
-        df_features['alpha_mean'].values >= (1e3 * np.nanmedian(df_features['alpha_mean'])),
-        df_features['alpha_std'].values >= 1e3 * np.nanmedian(df_features['alpha_std'])
-    )
-    # then we join with the channel information to get coordinates and anatomical information
-    print(f"Number of channels with bad alpha: {np.sum(ialpha_bad)}")
-    df_features.loc[ialpha_bad, 'alpha_mean'] = np.nanmedian(df_features['alpha_mean'])
-    df_features.loc[ialpha_bad, 'alpha_std'] = np.nanmedian(df_features['alpha_std'])
 
     return df_features