From 1f0f57bee1aff5cea22f5fc6eb7ef7ee33c48529 Mon Sep 17 00:00:00 2001 From: simon-p-2000 Date: Wed, 12 Mar 2025 10:10:00 +0100 Subject: [PATCH] Adding new classifiers, extending feature extraction (#262) * adding new sklearn classifiers, extending feature extraction to accomodate activity counts * adding functionality to allow saving sklearn classifiers * formatting * fixing typo * changing from np.trapz to scipy.integrate trapezoid * saving and loading sklearn classifier via pickle * removing type annotations for activity counts * fixing bug that caused test cases to fail * style changes * reworking actigraphy test case to work with arbitrary durations * refactoring activity count extraction * removing dummy feature for actigraphy extraction * style check, removing unrelated files * adding documentation * Revert example * Fix typo and insert newlines * Simplify padding * Simplify dummy file creation * Move changelog entry --- CHANGELOG.md | 2 ++ docs/feature_extraction.md | 15 ++++++++++ src/sleepecg/feature_extraction.py | 4 +++ src/sleepecg/io/sleep_readers.py | 16 +++++++++-- tests/test_sleep_readers.py | 45 +++++++++++++++++------------- 5 files changed, 60 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ae169f8a..b92b107c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,6 @@ ## [UNRELEASED] - YYYY-MM-DD +## Added +- Add support for activity counts feature ([#262](https://github.com/cbrnr/sleepecg/pull/262) by [Simon Pusterhofer](https://github.com/simon-p-2000)) ## [0.5.9] - 2025-02-01 ### Added diff --git a/docs/feature_extraction.md b/docs/feature_extraction.md index 9f117343..f0f6c0b4 100644 --- a/docs/feature_extraction.md +++ b/docs/feature_extraction.md @@ -1,9 +1,12 @@ # Feature extraction ## Heart rate variability features + Features are based on standards of heart rate variability (HRV) measurement and interpretation described in [Task Force of the European Society of Cardiology (1996)](https://doi.org/10.1161/01.CIR.93.5.1043) and [Shaffer & Ginsberg (2017)](https://doi.org/10.3389/fpubh.2017.00258). + ### Time domain + Group identifier: `hrv-time` All time domain HRV features are either derived from normal-to-normal (NN) intervals, from successive differences between NN intervals (SD), or from the [Poincaré plot (PP)](https://en.wikipedia.org/wiki/Poincar%C3%A9_plot). @@ -41,7 +44,9 @@ All time domain HRV features are either derived from normal-to-normal (NN) inter |`CSI`|cardiac sympathetic index|PP| |`CVI`|cardiac vagal index|PP| + ### Frequency domain + Group identifier: `hrv-frequency` For calculating frequency domain HRV features, the RR time series is resampled at regular intervals, after which the power spectral density (PSD) is estimated using [Welch's method](https://en.wikipedia.org/wiki/Welch%27s_method). @@ -58,6 +63,7 @@ For calculating frequency domain HRV features, the RR time series is resampled a ## Metadata features + Group identifier: `metadata` |Feature|Description| @@ -66,3 +72,12 @@ Group identifier: `metadata` |`age`|age of the subject in years| |`gender`|`0` (female) or `1` (male)| |`weight`|weight of the subject in kg| + + +## Actigraphy features + +Group identifier: `actigraphy` + +| Feature | Description | +|-------------------|--------------------------------------------------------------------------------------------------------| +| `activity_counts` | Philips Actiwatch proprietary metric to quantify amount of patient movement measured via accelerometry | diff --git a/src/sleepecg/feature_extraction.py b/src/sleepecg/feature_extraction.py index 0654bfee..2a780585 100644 --- a/src/sleepecg/feature_extraction.py +++ b/src/sleepecg/feature_extraction.py @@ -57,6 +57,7 @@ "LF_HF_ratio", ), "metadata": ("recording_start_time", "age", "gender", "weight"), + "actigraphy": ("activity_counts",), } _FEATURE_ID_TO_GROUP = {id: group for group, ids in _FEATURE_GROUPS.items() for id in ids} @@ -657,6 +658,9 @@ def _extract_features_single( ) elif feature_group == "metadata": X.append(_metadata_features(record, num_stages)) + elif feature_group == "actigraphy": + if record.activity_counts is not None: + X.append(record.activity_counts.reshape(-1, 1)) features = np.hstack(X)[:, col_indices] if record.sleep_stages is None or sleep_stage_duration == record.sleep_stage_duration: diff --git a/src/sleepecg/io/sleep_readers.py b/src/sleepecg/io/sleep_readers.py index 714eb5c6..a87666a1 100644 --- a/src/sleepecg/io/sleep_readers.py +++ b/src/sleepecg/io/sleep_readers.py @@ -477,7 +477,7 @@ def read_mesa( for item in activity_data: if item.get("linetime") == recording_end_time_str: - end_line = int(item["line"]) - 1 + end_line = int(item["line"]) break else: print( @@ -487,10 +487,22 @@ def read_mesa( continue activity_counts = [ - item["activity"] for item in activity_data[start_line - 1 : end_line] + item["activity"] for item in activity_data[start_line:end_line] ] activity_counts = np.array(activity_counts) + + diff = len(activity_counts) - len(parsed_xml.sleep_stages) + if abs(diff) > 2: + print(f"Skipping {record_id} due to invalid activity counts.") + continue + elif diff > 0: + activity_counts = activity_counts[:-diff] + elif diff < 0: + activity_counts = np.append(activity_counts, activity_counts[diff:]) + + activity_counts[activity_counts == ""] = "0" + activity_counts = activity_counts.astype(float) np.save(activity_counts_file, activity_counts) yield SleepRecord( diff --git a/tests/test_sleep_readers.py b/tests/test_sleep_readers.py index 8901c82e..fd63204e 100644 --- a/tests/test_sleep_readers.py +++ b/tests/test_sleep_readers.py @@ -24,24 +24,26 @@ def _dummy_nsrr_overlap(filename: str, mesa_ids: list[int]): csv.write(f"{mesa_ids[i][-1]},1,20:30:00,20:29:59\n") -def _dummy_nsrr_actigraphy(filename: str, mesa_id: str): +def _dummy_nsrr_actigraphy(filename: str, mesa_id: str, hours: float): """Create dummy actigraphy file with four usable activity counts.""" base_time = datetime.datetime(2024, 1, 1, 20, 30, 0) - + # hours * 3600 / 30 second epoch, additional 20 counts for safety + number_activity_counts = int(hours * 120) + 20 linetimes = [ (base_time + datetime.timedelta(seconds=30 * i)).strftime("%H:%M:%S") - for i in range(10) + for i in range(number_activity_counts) ] with open(filename, "w") as csv: csv.write("mesaid,line,linetime,activity\n") - for i in range(10): + for i in range(number_activity_counts): csv.write(f"{mesa_id[-1]},{1 + i},{linetimes[i]},10\n") -def _dummy_nsrr_actigraphy_cached(filename: str): +def _dummy_nsrr_actigraphy_cached(filename: str, hours: float): """Create dummy npy file that resembles cached activity counts.""" - activity_counts = np.array([10, 10, 10, 10, 10, 10]) + number_activity_counts = int(hours * 120) + activity_counts = np.array([10 for i in range(number_activity_counts)]) np.save(filename, activity_counts) @@ -54,7 +56,6 @@ def _dummy_nsrr_edf(filename: str, hours: float, ecg_channel: str): def _dummy_nsrr_xml(filename: str, hours: float, random_state: int): EPOCH_LENGTH = 30 - RECORDING_DURATION = 154.0 STAGES = [ "Wake|0", "Stage 1 sleep|1", @@ -66,7 +67,7 @@ def _dummy_nsrr_xml(filename: str, hours: float, random_state: int): ] rng = np.random.default_rng(random_state) - + record_duration = hours * 60 * 60 with open(filename, "w") as xml_file: xml_file.write( '\n' @@ -76,16 +77,16 @@ def _dummy_nsrr_xml(filename: str, hours: float, random_state: int): "\n" "\n" "Recording Start Time\n" - f"{RECORDING_DURATION}\n" + f"{record_duration}\n" "01.01.85 20.29.59\n" "\n", ) - record_duration = hours * 60 * 60 start = 0 - while True: - if start > record_duration: - break - epoch_duration = rng.choice(np.arange(4, 21)) * EPOCH_LENGTH + while start < record_duration: + # choose a candidate epoch duration in seconds. + epoch_duration_candidate = rng.choice(np.arange(4, 21)) * EPOCH_LENGTH + # use the remaining time if the candidate overshoots the record duration + epoch_duration = min(epoch_duration_candidate, record_duration - start) stage = rng.choice(STAGES) xml_file.write( "\n" @@ -134,9 +135,11 @@ def _create_dummy_mesa( _dummy_nsrr_edf(f"{edf_dir}/{record_id}.edf", hours, ecg_channel="EKG") _dummy_nsrr_xml(f"{annotations_dir}/{record_id}-nsrr.xml", hours, random_state) if actigraphy: - _dummy_nsrr_actigraphy(f"{activity_dir}/{record_id}.csv", mesa_id=record_id) + _dummy_nsrr_actigraphy( + f"{activity_dir}/{record_id}.csv", mesa_id=record_id, hours=hours + ) _dummy_nsrr_actigraphy_cached( - f"{activity_counts_dir}/{record_id}-activity-counts.npy" + f"{activity_counts_dir}/{record_id}-activity-counts.npy", hours ) record_ids.append(record_id) @@ -213,10 +216,12 @@ def test_read_mesa_actigraphy(tmp_path): assert len(records) == 2 - for rec in records: + for i, rec in enumerate(records): assert rec.sleep_stage_duration == 30 assert set(rec.sleep_stages) - valid_stages == set() - assert len(rec.activity_counts) == 4 + # multiply with 3600 to convert duration (hours) to seconds, divide by 30 (epoch + # length for this test) + assert len(rec.activity_counts) == int(durations[i] * 120) assert Path( f"{tmp_path}/mesa/preprocessed/activity_counts/{rec.id}-activity-counts.npy" ).exists() @@ -239,10 +244,10 @@ def test_read_mesa_actigraphy_cached(tmp_path): assert len(records) == 2 - for rec in records: + for i, rec in enumerate(records): assert rec.sleep_stage_duration == 30 assert set(rec.sleep_stages) - valid_stages == set() - assert len(rec.activity_counts) == 6 + assert len(rec.activity_counts) == int(durations[i] * 120) def test_read_shhs(tmp_path):