Skip to content

Commit a57796e

Browse files
committed
Add observation matrix form transformer
1 parent c400300 commit a57796e

File tree

3 files changed

+62
-3
lines changed

3 files changed

+62
-3
lines changed

decipher/processing/transformers.py

+41-1
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
import logging
2+
from datetime import timedelta
23
from pathlib import Path
3-
from typing import Any
4+
from typing import Any, Callable
45

56
import numpy as np
7+
import numpy.typing as npt
68
import pandas as pd
79
from sklearn.base import BaseEstimator, TransformerMixin
810

@@ -244,3 +246,41 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
244246
)
245247
.dropna(subset="value")
246248
).astype({"variable": "category", "value": "category"})
249+
250+
251+
class ObservationMatrix(BaseEstimator, TransformerMixin):
    """Convert an exams DataFrame into a (row, time-bin, risk) observation table.

    Each person (``PID``) is mapped to a matrix row index, ages are
    discretized into fixed-width time bins, and all risks falling in the
    same (row, bin) cell are aggregated with ``risk_agg_method``.
    """

    def __init__(
        self, risk_agg_method: str | Callable = "max", months_per_bin: float = 3
    ):
        # risk_agg_method: aggregation handed to DataFrameGroupBy.agg,
        #   e.g. "max", "mean", or a callable.
        # months_per_bin: width of each age bin, in (30-day) months.
        self.risk_agg_method = risk_agg_method
        self.months_per_bin = months_per_bin
        super().__init__()

    def fit(self, X: pd.DataFrame, y=None):
        """Learn the PID -> row mapping and the age-bin edges from ``X``.

        ``X`` is expected to have columns ``PID`` (int64), ``age``
        (timedelta64[ns]) and ``risk`` (Int64).
        """
        # Run the dtype validation. The original code constructed the
        # validator but discarded it, so no check was actually applied.
        # NOTE(review): assumes CleanData validates in fit() like other
        # transformers in this module — confirm its interface.
        CleanData(
            dtypes={"PID": "int64", "age": "timedelta64[ns]", "risk": "Int64"}
        ).fit(X)

        # Create a mapping between row in matrix and PID
        pids = X["PID"].unique()
        self.pid_to_row = {pid: i for i, pid in enumerate(pids)}

        # Make the time bins: equally spaced edges covering the age range.
        days_per_month = 30  # fixed-length months keep bin widths uniform
        bin_width = timedelta(days=self.months_per_bin * days_per_month)
        self.bins: npt.NDArray = np.arange(
            X["age"].min(),
            X["age"].max() + bin_width,  # Add to ensure endpoint is included
            bin_width,
        )
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a DataFrame with columns ``row``, ``bin`` and aggregated ``risk``."""
        # .copy() avoids pandas' SettingWithCopyWarning: the original code
        # assigned new columns onto a slice of X (chained assignment).
        out = X[["risk"]].copy()
        # Deliberately keeps the KeyError on a PID unseen during fit
        # (a .map() would silently yield NaN instead).
        out["row"] = X["PID"].apply(lambda pid: self.pid_to_row[pid])
        out["bin"] = pd.cut(
            X["age"], self.bins, right=False
        )  # type: ignore[call-overload] # right=False indicates closed left side
        # NOTE(review): "bin" is categorical, so groupby's default
        # observed=False may emit a row per (row, bin) combination incl.
        # empty bins on newer pandas — confirm this is intended.
        return out.groupby(["row", "bin"], as_index=False)["risk"].agg(
            self.risk_agg_method
        )  # type: ignore[return-value] # as_index=False makes this a DataFrame

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "decipher"
3-
version = "0.1.9"
3+
version = "0.1.10"
44
description = "Utilities for Decipher"
55
authors = ["Thorvald Molthe Ballestad <[email protected]>"]
66
readme = "README.md"

tests/test_processing.py

+20-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
read_raw_df,
1414
write_to_csv,
1515
)
16-
from decipher.processing.transformers import HPVResults, PersonStats
16+
from decipher.processing.transformers import HPVResults, ObservationMatrix, PersonStats
1717

1818
logger = logging.getLogger(__name__)
1919

@@ -52,6 +52,25 @@ def test_read_and_hpv_pipeline():
5252
logger.debug(hpv_df)
5353

5454

55+
def test_observation_out():
    """ObservationMatrix must emit exactly one aggregated risk per (row, bin)."""
    exam_pipeline = get_exam_pipeline(
        birthday_file=test_data_dob, drop_missing_birthday=True
    )
    exam_df = exam_pipeline.fit_transform(read_raw_df(test_data_screening))

    observations = ObservationMatrix().fit_transform(exam_df)
    logger.info(observations)

    assert set(observations) == {"bin", "row", "risk"}
    # Each (row, bin) cell may only hold a single aggregated risk value
    cell_counts = observations.value_counts(subset=["row", "bin"])
    assert cell_counts.unique() == [1]

    # The bin edges must span the full observed age range
    intervals = observations["bin"].cat.categories
    assert intervals[0].left <= exam_df["age"].min()
    assert intervals[-1].right > exam_df["age"].max()
73+
5574
def test_person_stats():
5675
raw = read_raw_df(test_data_screening)
5776

0 commit comments

Comments
 (0)