Detailed HPV exam type info in exams_df (#11)

Northo · web-flow · commit b99ed48286e2 · 2023-06-23T15:00:50.000+02:00
* Extend docstring on hpv_df

* Update test dataset to include all HPV test types

* Add detailed exam type to exams_df

* Update recipe in README

* Bump version to 0.1.35
diff --git a/README.md b/README.md
@@ -110,57 +110,34 @@ In some cases, it is desirable to have this information in the `exams_df`, as ne
 We here do it in two 'steps' to more clearly show what is going on.
 
 ```python
-def hpv_details_per_exam(hpv_df: pd.DataFrame) -> pd.DataFrame:
-    """Return a DataFrame with the HPV details per exam.
-
-    The index of the returned DataFrame is the exam_index. Note that this is
-    not the same as the index of the exams_df!"""
-
-    per_exam = hpv_df.groupby("exam_index")
-    if per_exam["hpvTesttype"].nunique().max() != 1:
-        raise ValueError("Not all exams have the same HPV test type!")
-
-    return pd.DataFrame(
-        {
-            "exam_detailed_type": per_exam["test_type_name"].first(),
-            "exam_detailed_results": per_exam["genotype"].apply(
-                lambda genotypes: ",".join(genotypes)
-            ),
-        }
-    )
-
 def add_hpv_detailed_information(
     exams_df: pd.DataFrame, hpv_df: pd.DataFrame
 ) -> pd.DataFrame:
-    """ "Add detailed exam type name and results to exams_df"""
+    """Add detailed results to exams_df, under the key "exam_detailed_results"."""
+
     # Find the exam_index -> exams_df.index map
     # exam_index is not unique in exams_df, because one exam may give
     # cyt, hist, and HPV results
     # Therefore, we find the indices where there is an HPV test
     hpv_indices = exams_df.query("exam_type == 'HPV'")["index"]
     mapping = pd.Series(data=hpv_indices.index, index=hpv_indices.values)
 
-    hpv_details = hpv_details_per_exam(hpv_df)
+    hpv_details = hpv_df.groupby("exam_index")["genotype"].apply(",".join)
     hpv_details.index = hpv_details.index.map(mapping)
 
-    # TODO: this will give nan on the hist and cyt rows
-    exams_df = exams_df.join(hpv_details)
+    exams_df["exam_detailed_results"] = hpv_details
 
     # Set the Cytology and Histology results
     def _fill(base_series: pd.Series, fill_series: pd.Series) -> pd.Series:
         """Fill base series with fill series where base series is nan. Handles category data."""
         return base_series.astype("string").fillna(fill_series.astype("string"))
 
-    exams_df["exam_detailed_type"] = _fill(
-        exams_df["exam_detailed_type"], exams_df["exam_type"]
-    )
     exams_df["exam_detailed_results"] = _fill(
         exams_df["exam_detailed_results"], exams_df["exam_diagnosis"]
     )
 
     return exams_df
 
-
 # Assuming the DataManager has hpv_df
 data_manager.exams_df = add_hpv_detailed_information(data_manager.exams_df, data_manager.hpv_df)
 ```
diff --git a/decipher/data/data_manager.py b/decipher/data/data_manager.py
@@ -226,6 +226,11 @@ class DataManager:
         screening_data: DataFrame containing screening data. Default is None.
         metadata: Dictionary containing metadata.
 
+    Warning:
+        hpv_df contains details about the HPV genotype results.
+        Tests without genotype results have no entry in hpv_df, _however_ such
+        a test may still be positive: a missing entry is not a negative result.
+
     Examples:
         *** Reading data from CSV files ***
         ```python
diff --git a/decipher/processing/transformers.py b/decipher/processing/transformers.py
@@ -121,13 +121,17 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
         exams = (
             X.reset_index()
             .melt(
-                id_vars="index",
+                id_vars=[
+                    "index",
+                    "hpvTesttype",
+                ],  # Keep hpvTesttype; for cyt and hist rows it is overwritten with the exam type later.
                 value_vars=mapper.keys(),  # type: ignore[arg-type]
                 var_name="exam_type",
                 value_name="exam_date",
             )
-            .dropna()
+            .dropna(subset=["exam_date"])
             .astype({"exam_type": "category"})
+            .rename(columns={"hpvTesttype": "detailed_exam_type"})
         )
 
         # Join on result columns
@@ -150,6 +154,17 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
             .astype("category")
         )
 
+        # Set detailed_exam_type for cyt and hist exams
+        not_hpv_result = exams["exam_type"] != ExamTypes.HPV
+        exams.loc[not_hpv_result, "detailed_exam_type"] = exams.loc[
+            not_hpv_result, "exam_type"
+        ].apply(lambda exam_type: exam_type.value)
+        # Map hpv type codes to names
+        exams.loc[~not_hpv_result, "detailed_exam_type"] = exams.loc[
+            ~not_hpv_result, "detailed_exam_type"
+        ].map(HPV_TEST_TYPE_NAMES)
+        exams["detailed_exam_type"] = exams["detailed_exam_type"].astype("category")
+
         return exams.join(X[self.fields_to_keep], on="index")
 
     @staticmethod
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "decipher"
-version = "0.1.34"
+version = "0.1.35"
 description = "Utilities for Decipher"
 authors = ["Thorvald Molthe Ballestad <thorvald@simula.no>"]
 readme = "README.md"
diff --git a/tests/test_processing.py b/tests/test_processing.py
@@ -19,7 +19,12 @@
     read_raw_df,
     write_to_csv,
 )
-from decipher.processing.transformers import HPVResults, ObservationMatrix, PersonStats
+from decipher.processing.transformers import (
+    HPVResults,
+    ObservationMatrix,
+    PersonStats,
+    ToExam,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -43,6 +48,21 @@ def test_base_pipeline():
     )
 
 
+def test_to_exam():
+    base_df = get_base_pipeline(
+        test_data_dob, drop_missing_birthday=True
+    ).fit_transform(read_raw_df(test_data_screening))
+    exams = ToExam().fit_transform(base_df)
+    assert exams["detailed_exam_type"].notna().all()
+    assert exams["detailed_exam_type"].dtype == "category"
+
+    assert set(exams["exam_type"]) == {"HPV", "cytology", "histology"}
+    assert set(exams["detailed_exam_type"]) == set(HPV_TEST_TYPE_NAMES.values()) | {
+        "cytology",
+        "histology",
+    }
+
+
 def test_read_and_exam_pipeline():
     """Simply try reading and running pipeline"""
     raw = read_raw_df(test_data_screening)
diff --git a/tests/test_processing_datasets/test_screening_data.csv b/tests/test_processing_datasets/test_screening_data.csv
@@ -10,38 +10,38 @@
 8,3,HSIL,2001-04-06,,,,,,,
 9,3,ADC,2016-04-01,,,,,,,
 10,3,SCC,2013-06-10,,,,,,,
-11,4,,,80703.0,2004-03-09,negativ,2004-03-09,88,,11.0
-12,4,,,80703.0,2004-03-09,negativ,2004-03-09,88,,12.0
-13,4,,,80703.0,2004-03-09,positiv,2004-03-09,HR,HR,12.0
-14,4,,,80703.0,2004-03-09,positiv,2004-03-09,HR,HX,12.0
+11,4,,,80703.0,2004-03-09,negativ,2004-03-09,88,,1.0
+12,4,,,80703.0,2004-03-09,negativ,2004-03-09,88,,2.0
+13,4,,,80703.0,2004-03-09,positiv,2004-03-09,HR,HR,3.0
+14,4,,,80703.0,2004-03-09,positiv,2004-03-09,HR,HX,4.0
 15,4,ACIS,2003-05-02,,,,,,,
 16,4,,,80402.0,2019-12-05,,,,,
 17,5,ACIS,2016-12-28,,,,,,,
 18,5,,,80703.0,2018-02-27,,,,,
-19,5,,,74007.0,2004-12-05,positiv,2004-12-05,,,11.0
+19,5,,,74007.0,2004-12-05,positiv,2004-12-05,,,5.0
 20,6,LSIL,2002-02-06,,,,,,,
 21,6,ASC-US,2002-10-03,,,,,,,
 22,6,SCC,2008-11-25,,,,,,,
 23,6,ASC-H,2016-04-27,,,,,,,
 24,6,Metastase,2000-03-05,,,,,,,
-25,8,Normal,2001-04-09,,,negativ,2001-04-09,32,88,12.0
+25,8,Normal,2001-04-09,,,negativ,2001-04-09,32,88,6.0
 26,8,Metastase,2001-12-15,,,,,,,
-27,9,,,74007.0,2018-01-21,positiv,2018-01-21,,,1.0
+27,9,,,74007.0,2018-01-21,positiv,2018-01-21,,,7.0
 28,9,Normal,2007-08-15,,,,,,,
 29,9,ACIS,2018-05-20,,,,,,,
 30,10,Cancer Cervix cancer andre/usp,2020-02-29,,,,,,,
-31,10,,,74007.0,2010-12-04,uegnet,2010-12-04,,,11.0
+31,10,,,74007.0,2010-12-04,uegnet,2010-12-04,,,8.0
 32,10,,,10.0,2009-11-17,,,,,
 33,10,ASC-US,2019-01-09,,,,,,,
-34,11,SCC,2019-06-15,,,positiv,2019-06-15,,,12.0
+34,11,SCC,2019-06-15,,,positiv,2019-06-15,,,9.0
 35,11,SCC,2000-11-09,,,,,,,
 36,11,ACIS,2008-11-15,,,,,,,
 37,12,,,82103.0,2006-03-01,,,,,
 38,12,Uegnet,2016-11-25,,,,,,,
 39,13,,,80402.0,2016-01-27,,,,,
 40,13,HSIL,2003-11-16,,,,,,,
 41,14,ACIS,2008-11-01,,,,,,,
-42,14,ADC,2011-06-15,,,uegnet,2011-06-15,16,,11.0
+42,14,ADC,2011-06-15,,,uegnet,2011-06-15,16,,10.0
 43,14,,,100.0,2001-07-14,,,,,
 44,16,SCC,2016-06-30,,,,,,,
 45,16,Normal,2002-07-28,,,,,,,
@@ -50,11 +50,11 @@
 48,17,,,1000.0,2016-05-22,,,,,
 49,18,Normal m betennelse eller blod,2014-08-24,,,uegnet,2014-08-24,,,11.0
 50,18,SCC,2019-06-25,,,,,,,
-51,18,Uegnet,2008-01-15,,,uegnet,2008-01-15,HR,,12.0
-52,18,,,100.0,2007-07-04,uegnet,2007-07-04,88,18,1.0
-53,18,,,100.0,2007-07-04,uegnet,2007-07-04,,,11.0
-54,18,,,100.0,2007-07-04,uegnet,2007-07-04,,,11.0
-55,18,,,100.0,2007-07-04,uegnet,2007-07-04,16,,11.0
+51,18,Uegnet,2008-01-15,,,uegnet,2008-01-15,HR,,13.0
+52,18,,,100.0,2007-07-04,uegnet,2007-07-04,88,18,14.0
+53,18,,,100.0,2007-07-04,uegnet,2007-07-04,,,15.0
+54,18,,,100.0,2007-07-04,uegnet,2007-07-04,,,16.0
+55,18,,,100.0,2007-07-04,uegnet,2007-07-04,16,,17.0
 56,18,LSIL,2018-11-14,,,,,,,
 57,19,SCC,2011-02-26,,,,,,,
 58,19,Cancer Cervix cancer andre/usp,2016-01-14,,,,,,,