Skip to content

Commit cd2eb0b

Browse files
authored
Optional detailed HPV info attribute (#9)
* Adds hpv test type to name mapping * Improve testing and dtypes * Rename HPVReults column names * Adds optional HPV details to DataManager * Adds docstring to DataManager * Include recipe for adding detailed information on test type and result * Bump version to 0.1.33
1 parent 3789694 commit cd2eb0b

File tree

7 files changed

+221
-11
lines changed

7 files changed

+221
-11
lines changed

README.md

+67
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,73 @@ feature_matrix = data_manager.feature_data_as_coo_array(
9898
)
9999
```
100100

101+
### Recipes
102+
103+
**Adding detailed HPV test type and result information to the `exams_df`**
104+
The `exams_df` of the `DataManager` only contains whether the HPV result was positive
105+
or not, and no specific information about the test type.
106+
This information is stored in `hpv_df` (which is only populated if `read_hpv` is set
107+
to `True` in `DataManager.read_from_csv`).
108+
109+
In some cases, it is desirable to have this information in the `exams_df`, as new columns.
110+
We here do it in two 'steps' to more clearly show what is going on.
111+
112+
```python
113+
def hpv_details_per_exam(hpv_df: pd.DataFrame) -> pd.DataFrame:
    """Summarise the HPV test details of each exam into one row.

    The returned DataFrame is indexed by exam_index (the index into the raw
    data — note that this is *not* the same as the index of the exams_df!)
    and has two columns: ``exam_detailed_type`` (the test type name) and
    ``exam_detailed_results`` (the comma-joined genotypes).

    Raises:
        ValueError: if any single exam has more than one HPV test type.
    """
    grouped = hpv_df.groupby("exam_index")

    # Each exam must map to exactly one test type, otherwise the first()
    # aggregation below would silently discard information.
    if grouped["hpvTesttype"].nunique().max() != 1:
        raise ValueError("Not all exams have the same HPV test type!")

    detailed_type = grouped["test_type_name"].first()
    detailed_results = grouped["genotype"].apply(",".join)

    return pd.DataFrame(
        {
            "exam_detailed_type": detailed_type,
            "exam_detailed_results": detailed_results,
        }
    )
131+
132+
def add_hpv_detailed_information(
    exams_df: pd.DataFrame, hpv_df: pd.DataFrame
) -> pd.DataFrame:
    """Add detailed exam type name and results to exams_df.

    Joins the per-exam HPV details from ``hpv_df`` onto ``exams_df`` as two
    new columns, ``exam_detailed_type`` and ``exam_detailed_results``.  Rows
    without an HPV test (cytology/histology) fall back to the generic
    ``exam_type`` and ``exam_diagnosis`` values.

    Args:
        exams_df: The DataManager exams frame; must have ``exam_type``,
            ``exam_diagnosis``, and an ``index`` column holding the raw index.
        hpv_df: The DataManager HPV results frame.

    Returns:
        ``exams_df`` with the two detail columns added (string dtype).
    """
    # Find the exam_index -> exams_df.index map.
    # exam_index is not unique in exams_df, because one exam may give
    # cyt, hist, and HPV results.
    # Therefore, we find the indices where there is an HPV test.
    hpv_indices = exams_df.query("exam_type == 'HPV'")["index"]
    mapping = pd.Series(data=hpv_indices.index, index=hpv_indices.values)

    hpv_details = hpv_details_per_exam(hpv_df)
    hpv_details.index = hpv_details.index.map(mapping)

    # The join leaves NaN in the new columns on the hist and cyt rows;
    # those are filled from the generic columns below.
    exams_df = exams_df.join(hpv_details)

    def _fill(base_series: pd.Series, fill_series: pd.Series) -> pd.Series:
        """Fill base series with fill series where base series is nan. Handles category data."""
        return base_series.astype("string").fillna(fill_series.astype("string"))

    # Set the Cytology and Histology results.
    exams_df["exam_detailed_type"] = _fill(
        exams_df["exam_detailed_type"], exams_df["exam_type"]
    )
    exams_df["exam_detailed_results"] = _fill(
        exams_df["exam_detailed_results"], exams_df["exam_diagnosis"]
    )

    return exams_df
162+
163+
164+
# Assuming the DataManager has hpv_df
165+
data_manager.exams_df = add_hpv_detailed_information(data_manager.exams_df, data_manager.hpv_df)
166+
```
167+
101168
## Install
102169

103170
## Parquet support

decipher/data/data_manager.py

+79-5
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from decipher.processing.pipeline import get_base_pipeline, read_raw_df
1515
from decipher.processing.transformers import (
1616
AgeAdder,
17+
HPVResults,
1718
ObservationMatrix,
1819
PersonStats,
1920
RiskAdder,
@@ -213,20 +214,77 @@ def metadata(self) -> dict:
213214

214215

215216
class DataManager:
217+
"""DataManager is a class for managing and organizing the datasets.
218+
219+
DataManager provides methods to read data from CSV files, save and load DataFrames
220+
as parquet files for improved performance, filter data, and get feature data.
221+
222+
Attributes:
223+
person_df: DataFrame containing personal data.
224+
exams_df: DataFrame containing data about exams.
225+
hpv_df: DataFrame containing HPV results data. See `decipher.processing.transformers.HPVResults` for details. Default is None.
226+
screening_data: DataFrame containing screening data. Default is None.
227+
metadata: Dictionary containing metadata.
228+
229+
Examples:
230+
**Reading data from CSV files**
231+
```python
232+
from pathlib import Path
233+
from decipher.data import DataManager
234+
235+
screening_data = Path(<screening data>)
236+
dob_data = Path(<dob data>)
237+
238+
# Read in from CSV
239+
data_manager = DataManager.read_from_csv(screening_data, dob_data)
240+
```
241+
242+
**Read and Write with Parquet**
243+
```
244+
from pathlib import Path
245+
from decipher.data import DataManager
246+
247+
screening_data = Path(<screening data>)
248+
dob_data = Path(<dob data>)
249+
parquet_dir = Path(<parquet dir>)
250+
251+
# Read in from CSV
252+
data_manager = DataManager.read_from_csv(screening_data, dob_data)
253+
254+
# Store to Parquet
255+
data_manager.save_to_parquet(parquet_dir, engine="pyarrow")
256+
257+
# Read from Parquet
258+
# Will fail if `decipher` version does not match that of stored data
259+
data_manager = DataManager.from_parquet(parquet_dir, engine="pyarrow")
260+
261+
# See metadata
262+
data_manager.metadata
263+
```
264+
265+
Note:
266+
It is strongly advised to read the CSV files once, and then store the DataManager
267+
to parquet. This gives much faster read times.
268+
"""
269+
216270
def __init__(
    self,
    person_df: pd.DataFrame,
    exams_df: pd.DataFrame,
    hpv_df: pd.DataFrame | None = None,
    screening_data: pd.DataFrame | None = None,
    metadata: dict | None = None,
):
    """Store the supplied DataFrames on the instance.

    Args:
        person_df: DataFrame with personal data.
        exams_df: DataFrame with exam data.
        hpv_df: Optional DataFrame with detailed HPV results.
        screening_data: Optional DataFrame with screening data.
        metadata: Optional metadata dict; when None, a dict holding the
            installed decipher version is used instead.
    """
    self.screening_data = screening_data
    self.person_df = person_df
    self.hpv_df = hpv_df
    self.exams_df = exams_df
    # Always record the decipher version when no metadata is supplied.
    self.metadata = metadata or {"decipher_version": version("decipher")}

228284
@classmethod
229-
def read_from_csv(cls, screening_path: Path, dob_path: Path):
285+
def read_from_csv(
286+
cls, screening_path: Path, dob_path: Path, read_hpv: bool = False
287+
):
230288
base_df = _get_base_df(screening_path, dob_path)
231289
logger.debug("Got base DF")
232290
exams = Pipeline(
@@ -238,11 +296,17 @@ def read_from_csv(cls, screening_path: Path, dob_path: Path):
238296
verbose=True,
239297
).fit_transform(base_df)
240298
logger.debug("Got exams DF")
299+
241300
person_df: pd.DataFrame = PersonStats(base_df=base_df).fit_transform(exams)
242301
logger.debug("Got person DF")
302+
303+
hpv_df = HPVResults().fit_transform(base_df) if read_hpv else None
304+
logger.debug("Got HPV DF")
305+
243306
return DataManager(
244307
person_df=person_df,
245308
exams_df=exams,
309+
hpv_df=hpv_df,
246310
)
247311

248312
def save_to_parquet(
@@ -254,13 +318,22 @@ def save_to_parquet(
254318
self.screening_data.to_parquet(
255319
directory / "screening_data.parquet", engine=engine
256320
)
321+
if self.hpv_df is not None:
322+
self.hpv_df.to_parquet(directory / "hpv_df.parquet", engine=engine)
257323
self.person_df.to_parquet(directory / "person_df.parquet", engine=engine)
258324
self.exams_df.to_parquet(directory / "exams_df.parquet", engine=engine)
259325
with open(directory / "metadata.json", "w") as file:
260326
# We always want to store the decipher version, so if it is not
261327
# in the metadata, add it.
262328
json.dump({"decipher_version": version("decipher")} | self.metadata, file)
263329

330+
@staticmethod
def _read_if_exists(
    path: Path, engine: _parquet_engine_types
) -> pd.DataFrame | None:
    """Load a parquet file, returning None when the file is absent.

    Args:
        path: Location of the parquet file.
        engine: Parquet engine forwarded to ``pd.read_parquet``.

    Returns:
        The loaded DataFrame, or None if ``path`` does not exist.
    """
    if not path.exists():
        return None
    return pd.read_parquet(path, engine=engine)
336+
264337
@classmethod
265338
def from_parquet(
266339
cls,
@@ -282,15 +355,16 @@ def from_parquet(
282355
)
283356
else:
284357
logger.warning(message)
285-
if (screening_file := directory / "screening_data.parquet").exists():
286-
screening_data = pd.read_parquet(screening_file, engine=engine)
287-
else:
288-
screening_data = None
358+
screening_data = cls._read_if_exists(
359+
directory / "screening_data.parquet", engine
360+
)
361+
hpv_df = cls._read_if_exists(directory / "hpv_df.parquet", engine)
289362
person_df = pd.read_parquet(directory / "person_df.parquet", engine=engine)
290363
exams_df = pd.read_parquet(directory / "exams_df.parquet", engine=engine)
291364
return DataManager(
292365
person_df=person_df,
293366
exams_df=exams_df,
367+
hpv_df=hpv_df,
294368
screening_data=screening_data,
295369
metadata=metadata,
296370
)

decipher/exam_data.py

+21
Original file line numberDiff line numberDiff line change
@@ -118,3 +118,24 @@ class ExamTypes(str, Enum):
118118
assert set(risk_mapping.keys()) == set(Diagnosis), set(risk_mapping.keys()) ^ set(
119119
Diagnosis
120120
)
121+
122+
# Codes as they appear in the raw screening data's hpvTesttype column.
HPV_TEST_TYPE_NAMES = {
    1: "HCII",
    2: "HCIII",
    3: "PreTect HPV-Proofer",
    4: "Amplicor",
    5: "PCR-primer",
    6: "Real time PCR",
    7: "Ventana Inform HPV (ISH)",
    8: "ISH andre",
    9: "PAP 13 Tele-lab",
    10: "Paptype13 realtime",
    11: "Cobas 4800 System",
    12: "Abbott RealTime High Risk HPV",
    13: "BD Onclarity HPV Assay",
    14: "Inno Lipa",
    15: "(Ukjent)",
    16: "Abbot Alinity",
    17: "Cobas 6800",
}
"""Mapping from HPV test type code to long form name."""

decipher/processing/transformers.py

+21-4
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from loguru import logger
99
from sklearn.base import BaseEstimator, TransformerMixin
1010

11-
from decipher.exam_data import Diagnosis, ExamTypes, risk_mapping
11+
from decipher.exam_data import HPV_TEST_TYPE_NAMES, Diagnosis, ExamTypes, risk_mapping
1212

1313

1414
class PandasTransformerMixin(TransformerMixin):
@@ -349,6 +349,15 @@ def count_in_time_window(
349349
class HPVResults(BaseEstimator, PandasTransformerMixin):
350350
"""Take a raw DF, and generate HPV results
351351
352+
The resulting DF will have the following columns:
353+
- PID
354+
- exam_index: the index of the exam in the raw data
355+
- hpvTesttype
356+
- hpvDate
357+
- genotype_field: the genotype column of the raw data, i.e. hpv1Genotype, hpv2Genotype, etc.
358+
- genotype: the genotype, i.e. 16, 18, HR, etc
359+
- hpv_test_type_name: the name of the test type, i.e. "Cobas 4800 System".
360+
352361
Warning:
353362
HPV negative and hpv non-conclusive are _not_ included!!!
354363
"""
@@ -365,15 +374,23 @@ def fit(self, X: pd.DataFrame, y=None):
365374
return self
366375

367376
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """Reshape raw rows into one row per (exam, genotype) HPV result."""
    # Rows without an HPV date carry no HPV result at all.
    with_hpv = X.dropna(subset="hpvDate").reset_index(names="exam_index")

    # One row per genotype column, then drop empty genotype cells.
    long_form = with_hpv.melt(
        id_vars=["PID", "exam_index", "hpvTesttype", "hpvDate"],
        value_vars=self.hpv_genotype_columns,
        var_name="genotype_field",
        value_name="genotype",
    ).dropna(subset="genotype")

    long_form = long_form.astype(
        {"genotype_field": "category", "genotype": "category", "hpvTesttype": "int"}
    )

    # Human-readable test type, e.g. "Cobas 4800 System".
    long_form["test_type_name"] = (
        long_form["hpvTesttype"].map(HPV_TEST_TYPE_NAMES).astype("category")
    )
    return long_form
377394

378395

379396
class ObservationMatrix(BaseEstimator, PandasTransformerMixin):

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "decipher"
3-
version = "0.1.32"
3+
version = "0.1.33"
44
description = "Utilities for Decipher"
55
authors = ["Thorvald Molthe Ballestad <[email protected]>"]
66
readme = "README.md"

tests/test_data.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131

3232
@pytest.fixture()
def data_manager() -> DataManager:
    # read_hpv=True so the tests also exercise the optional hpv_df attribute.
    return DataManager.read_from_csv(test_data_screening, test_data_dob, read_hpv=True)
3535

3636

3737
@pytest.mark.parametrize("min_non_hpv_exams", [0, 2, 3])
@@ -126,6 +126,11 @@ def test_parquet(
126126
)
127127
else:
128128
assert new_data_manager.screening_data is None
129+
if data_manager.hpv_df is not None:
130+
assert new_data_manager.hpv_df is not None
131+
assert new_data_manager.hpv_df.equals(data_manager.hpv_df)
132+
else:
133+
assert new_data_manager.hpv_df is None
129134
assert data_manager.person_df.equals(new_data_manager.person_df)
130135
assert data_manager.exams_df.equals(new_data_manager.exams_df)
131136

tests/test_processing.py

+26
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from hypothesis import assume, given
1111
from hypothesis.extra.pandas import column, data_frames, range_indexes
1212

13+
from decipher.exam_data import HPV_TEST_TYPE_NAMES
1314
from decipher.processing.pipeline import (
1415
get_base_pipeline,
1516
get_exam_pipeline,
@@ -228,6 +229,31 @@ def test_hpv_results():
228229
hpv_df = HPVResults().fit_transform(raw)
229230
logger.debug(f"HPV DF:\n{hpv_df.head()}")
230231

232+
assert not hpv_df.isna().any().any()
233+
234+
genotype_columns = ["hpv1Genotype", "hpv2Genotype"]
235+
236+
# Check that the data corresponds with the raw data
237+
for exam_index, results in hpv_df.groupby("exam_index"):
238+
raw_row = raw.loc[exam_index]
239+
240+
assert set(results["genotype"]) == set(raw_row[genotype_columns].dropna())
241+
242+
def _matches(field: str) -> bool:
243+
"""Assert the field is unique within the group and matches the raw data"""
244+
return (
245+
results[field].nunique() == 1
246+
and results[field].iloc[0] == raw_row[field]
247+
)
248+
249+
for field in ("PID", "hpvTesttype", "hpvDate"):
250+
assert _matches(field)
251+
252+
# Check that the test type names are correct
253+
assert hpv_df["test_type_name"].equals(
254+
hpv_df["hpvTesttype"].map(HPV_TEST_TYPE_NAMES).astype("category")
255+
)
256+
231257

232258
def test_read_from_csv(tmp_path: Path):
233259
data_file = tmp_path / "data.csv"

0 commit comments

Comments
 (0)