From 1f0f57bee1aff5cea22f5fc6eb7ef7ee33c48529 Mon Sep 17 00:00:00 2001
From: simon-p-2000 <simon.pusterhofer@gmx.at>
Date: Wed, 12 Mar 2025 10:10:00 +0100
Subject: [PATCH] Adding new classifiers, extending feature extraction (#262)

* adding new sklearn classifiers, extending feature extraction to accomodate activity counts

* adding functionality to allow saving sklearn classifiers

* formatting

* fixing typo

* changing from np.trapz to scipy.integrate trapezoid

* saving and loading sklearn classifier via pickle

* removing type annotations for activity counts

* fixing bug that caused test cases to fail

* style changes

* reworking actigraphy test case to work with arbitrary durations

* refactoring activity count extraction

* removing dummy feature for actigraphy extraction

* style check, removing unrelated files

* adding documentation

* Revert example

* Fix typo and insert newlines

* Simplify padding

* Simplify dummy file creation

* Move changelog entry
---
 CHANGELOG.md                       |  2 ++
 docs/feature_extraction.md         | 15 ++++++++++
 src/sleepecg/feature_extraction.py |  4 +++
 src/sleepecg/io/sleep_readers.py   | 16 +++++++++--
 tests/test_sleep_readers.py        | 45 +++++++++++++++++-------------
 5 files changed, 60 insertions(+), 22 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ae169f8a..b92b107c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,6 @@
 ## [UNRELEASED] - YYYY-MM-DD
+## Added
+- Add support for activity counts feature ([#262](https://github.com/cbrnr/sleepecg/pull/262) by [Simon Pusterhofer](https://github.com/simon-p-2000))
 
 ## [0.5.9] - 2025-02-01
 ### Added
diff --git a/docs/feature_extraction.md b/docs/feature_extraction.md
index 9f117343..f0f6c0b4 100644
--- a/docs/feature_extraction.md
+++ b/docs/feature_extraction.md
@@ -1,9 +1,12 @@
 # Feature extraction
 
 ## Heart rate variability features
+
 Features are based on standards of heart rate variability (HRV) measurement and interpretation described in [Task Force of the European Society of Cardiology (1996)](https://doi.org/10.1161/01.CIR.93.5.1043) and [Shaffer & Ginsberg (2017)](https://doi.org/10.3389/fpubh.2017.00258).
 
+
 ### Time domain
+
 Group identifier: `hrv-time`
 
 All time domain HRV features are either derived from normal-to-normal (NN) intervals, from successive differences between NN intervals (SD), or from the [Poincaré plot (PP)](https://en.wikipedia.org/wiki/Poincar%C3%A9_plot).
@@ -41,7 +44,9 @@ All time domain HRV features are either derived from normal-to-normal (NN) inter
 |`CSI`|cardiac sympathetic index|PP|
 |`CVI`|cardiac vagal index|PP|
 
+
 ### Frequency domain
+
 Group identifier: `hrv-frequency`
 
 For calculating frequency domain HRV features, the RR time series is resampled at regular intervals, after which the power spectral density (PSD) is estimated using [Welch's method](https://en.wikipedia.org/wiki/Welch%27s_method).
@@ -58,6 +63,7 @@ For calculating frequency domain HRV features, the RR time series is resampled a
 
 
 ## Metadata features
+
 Group identifier: `metadata`
 
 |Feature|Description|
@@ -66,3 +72,12 @@ Group identifier: `metadata`
 |`age`|age of the subject in years|
 |`gender`|`0` (female) or `1` (male)|
 |`weight`|weight of the subject in kg|
+
+
+## Actigraphy features
+
+Group identifier: `actigraphy`
+
+| Feature           | Description                                                                                            |
+|-------------------|--------------------------------------------------------------------------------------------------------|
+| `activity_counts` | Philips Actiwatch proprietary metric to quantify amount of patient movement measured via accelerometry |
diff --git a/src/sleepecg/feature_extraction.py b/src/sleepecg/feature_extraction.py
index 0654bfee..2a780585 100644
--- a/src/sleepecg/feature_extraction.py
+++ b/src/sleepecg/feature_extraction.py
@@ -57,6 +57,7 @@
         "LF_HF_ratio",
     ),
     "metadata": ("recording_start_time", "age", "gender", "weight"),
+    "actigraphy": ("activity_counts",),
 }
 _FEATURE_ID_TO_GROUP = {id: group for group, ids in _FEATURE_GROUPS.items() for id in ids}
 
@@ -657,6 +658,9 @@ def _extract_features_single(
             )
         elif feature_group == "metadata":
             X.append(_metadata_features(record, num_stages))
+        elif feature_group == "actigraphy":
+            if record.activity_counts is not None:
+                X.append(record.activity_counts.reshape(-1, 1))
     features = np.hstack(X)[:, col_indices]
 
     if record.sleep_stages is None or sleep_stage_duration == record.sleep_stage_duration:
diff --git a/src/sleepecg/io/sleep_readers.py b/src/sleepecg/io/sleep_readers.py
index 714eb5c6..a87666a1 100644
--- a/src/sleepecg/io/sleep_readers.py
+++ b/src/sleepecg/io/sleep_readers.py
@@ -477,7 +477,7 @@ def read_mesa(
 
                 for item in activity_data:
                     if item.get("linetime") == recording_end_time_str:
-                        end_line = int(item["line"]) - 1
+                        end_line = int(item["line"])
                         break
                 else:
                     print(
@@ -487,10 +487,22 @@ def read_mesa(
                     continue
 
                 activity_counts = [
-                    item["activity"] for item in activity_data[start_line - 1 : end_line]
+                    item["activity"] for item in activity_data[start_line:end_line]
                 ]
 
                 activity_counts = np.array(activity_counts)
+
+                diff = len(activity_counts) - len(parsed_xml.sleep_stages)
+                if abs(diff) > 2:
+                    print(f"Skipping {record_id} due to invalid activity counts.")
+                    continue
+                elif diff > 0:
+                    activity_counts = activity_counts[:-diff]
+                elif diff < 0:
+                    activity_counts = np.append(activity_counts, activity_counts[diff:])
+
+                activity_counts[activity_counts == ""] = "0"
+                activity_counts = activity_counts.astype(float)
                 np.save(activity_counts_file, activity_counts)
 
         yield SleepRecord(
diff --git a/tests/test_sleep_readers.py b/tests/test_sleep_readers.py
index 8901c82e..fd63204e 100644
--- a/tests/test_sleep_readers.py
+++ b/tests/test_sleep_readers.py
@@ -24,24 +24,26 @@ def _dummy_nsrr_overlap(filename: str, mesa_ids: list[int]):
             csv.write(f"{mesa_ids[i][-1]},1,20:30:00,20:29:59\n")
 
 
-def _dummy_nsrr_actigraphy(filename: str, mesa_id: str):
+def _dummy_nsrr_actigraphy(filename: str, mesa_id: str, hours: float):
     """Create dummy actigraphy file with four usable activity counts."""
     base_time = datetime.datetime(2024, 1, 1, 20, 30, 0)
-
+    # hours * 3600 / 30 second epoch, additional 20 counts for safety
+    number_activity_counts = int(hours * 120) + 20
     linetimes = [
         (base_time + datetime.timedelta(seconds=30 * i)).strftime("%H:%M:%S")
-        for i in range(10)
+        for i in range(number_activity_counts)
     ]
 
     with open(filename, "w") as csv:
         csv.write("mesaid,line,linetime,activity\n")
-        for i in range(10):
+        for i in range(number_activity_counts):
             csv.write(f"{mesa_id[-1]},{1 + i},{linetimes[i]},10\n")
 
 
-def _dummy_nsrr_actigraphy_cached(filename: str):
+def _dummy_nsrr_actigraphy_cached(filename: str, hours: float):
     """Create dummy npy file that resembles cached activity counts."""
-    activity_counts = np.array([10, 10, 10, 10, 10, 10])
+    number_activity_counts = int(hours * 120)
+    activity_counts = np.array([10 for i in range(number_activity_counts)])
     np.save(filename, activity_counts)
 
 
@@ -54,7 +56,6 @@ def _dummy_nsrr_edf(filename: str, hours: float, ecg_channel: str):
 
 def _dummy_nsrr_xml(filename: str, hours: float, random_state: int):
     EPOCH_LENGTH = 30
-    RECORDING_DURATION = 154.0
     STAGES = [
         "Wake|0",
         "Stage 1 sleep|1",
@@ -66,7 +67,7 @@ def _dummy_nsrr_xml(filename: str, hours: float, random_state: int):
     ]
 
     rng = np.random.default_rng(random_state)
-
+    record_duration = hours * 60 * 60
     with open(filename, "w") as xml_file:
         xml_file.write(
             '<?xml version="1.0" encoding="UTF-8" standalone="no"?>\n'
@@ -76,16 +77,16 @@ def _dummy_nsrr_xml(filename: str, hours: float, random_state: int):
             "<ScoredEvent>\n"
             "<EventType/>\n"
             "<EventConcept>Recording Start Time</EventConcept>\n"
-            f"<Duration>{RECORDING_DURATION}</Duration>\n"
+            f"<Duration>{record_duration}</Duration>\n"
             "<ClockTime>01.01.85 20.29.59</ClockTime>\n"
             "</ScoredEvent>\n",
         )
-        record_duration = hours * 60 * 60
         start = 0
-        while True:
-            if start > record_duration:
-                break
-            epoch_duration = rng.choice(np.arange(4, 21)) * EPOCH_LENGTH
+        while start < record_duration:
+            # choose a candidate epoch duration in seconds.
+            epoch_duration_candidate = rng.choice(np.arange(4, 21)) * EPOCH_LENGTH
+            # use the remaining time if the candidate overshoots the record duration
+            epoch_duration = min(epoch_duration_candidate, record_duration - start)
             stage = rng.choice(STAGES)
             xml_file.write(
                 "<ScoredEvent>\n"
@@ -134,9 +135,11 @@ def _create_dummy_mesa(
         _dummy_nsrr_edf(f"{edf_dir}/{record_id}.edf", hours, ecg_channel="EKG")
         _dummy_nsrr_xml(f"{annotations_dir}/{record_id}-nsrr.xml", hours, random_state)
         if actigraphy:
-            _dummy_nsrr_actigraphy(f"{activity_dir}/{record_id}.csv", mesa_id=record_id)
+            _dummy_nsrr_actigraphy(
+                f"{activity_dir}/{record_id}.csv", mesa_id=record_id, hours=hours
+            )
             _dummy_nsrr_actigraphy_cached(
-                f"{activity_counts_dir}/{record_id}-activity-counts.npy"
+                f"{activity_counts_dir}/{record_id}-activity-counts.npy", hours
             )
             record_ids.append(record_id)
 
@@ -213,10 +216,12 @@ def test_read_mesa_actigraphy(tmp_path):
 
     assert len(records) == 2
 
-    for rec in records:
+    for i, rec in enumerate(records):
         assert rec.sleep_stage_duration == 30
         assert set(rec.sleep_stages) - valid_stages == set()
-        assert len(rec.activity_counts) == 4
+        # multiply with 3600 to convert duration (hours) to seconds, divide by 30 (epoch
+        # length for this test)
+        assert len(rec.activity_counts) == int(durations[i] * 120)
         assert Path(
             f"{tmp_path}/mesa/preprocessed/activity_counts/{rec.id}-activity-counts.npy"
         ).exists()
@@ -239,10 +244,10 @@ def test_read_mesa_actigraphy_cached(tmp_path):
 
     assert len(records) == 2
 
-    for rec in records:
+    for i, rec in enumerate(records):
         assert rec.sleep_stage_duration == 30
         assert set(rec.sleep_stages) - valid_stages == set()
-        assert len(rec.activity_counts) == 6
+        assert len(rec.activity_counts) == int(durations[i] * 120)
 
 
 def test_read_shhs(tmp_path):