Skip to content

Commit cd2eb0b

Browse files
authored
Optional detailed HPV info attribute (#9)
* Adds hpv test type to name mapping * Improve testing and dtypes * Rename HPVReults column names * Adds optional HPV details to DataManager * Adds docstring to DataManager * Include recipe for adding detailed information on test type and result * Bump version to 0.1.33
1 parent 3789694 commit cd2eb0b

File tree

7 files changed

+221
-11
lines changed

7 files changed

+221
-11
lines changed

README.md

+67
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,73 @@ feature_matrix = data_manager.feature_data_as_coo_array(
9898
)
9999
```
100100

101+
### Recipes
102+
103+
**Adding detailed HPV test type and result information to the `exams_df`**
104+
The `exams_df` of the `DataManager` only contains whether the HPV result was positive
105+
or not, and no specific information about the test type.
106+
This information is stored in `hpv_df` (which is only populated if `read_hpv` is set
107+
to `True` in `DataManager.read_from_csv`).
108+
109+
In some cases, it is desirable to have this information in the `exams_df`, as new columns.
110+
We here do it in two 'steps' to more clearly show what is going on.
111+
112+
```python
113+
def hpv_details_per_exam(hpv_df: pd.DataFrame) -> pd.DataFrame:
    """Summarise the HPV test details of each exam into one row.

    The returned DataFrame is indexed by exam_index (the index into the raw
    data — note that this is *not* the same as the index of the exams_df!)
    and has two columns: ``exam_detailed_type`` (the test type name) and
    ``exam_detailed_results`` (the comma-joined genotypes).

    Raises:
        ValueError: if any single exam has more than one HPV test type.
    """
    grouped = hpv_df.groupby("exam_index")

    # Each exam must map to exactly one test type, otherwise the first()
    # aggregation below would silently discard information.
    if grouped["hpvTesttype"].nunique().max() != 1:
        raise ValueError("Not all exams have the same HPV test type!")

    detailed_type = grouped["test_type_name"].first()
    detailed_results = grouped["genotype"].apply(",".join)

    return pd.DataFrame(
        {
            "exam_detailed_type": detailed_type,
            "exam_detailed_results": detailed_results,
        }
    )
131+
132+
def add_hpv_detailed_information(
    exams_df: pd.DataFrame, hpv_df: pd.DataFrame
) -> pd.DataFrame:
    """Add detailed exam type name and results to exams_df.

    Joins the per-exam HPV details from ``hpv_df`` onto ``exams_df`` as two
    new columns, ``exam_detailed_type`` and ``exam_detailed_results``.  Rows
    without an HPV test (cytology/histology) fall back to the generic
    ``exam_type`` and ``exam_diagnosis`` values.

    Args:
        exams_df: The DataManager exams frame; must have ``exam_type``,
            ``exam_diagnosis``, and an ``index`` column holding the raw index.
        hpv_df: The DataManager HPV results frame.

    Returns:
        ``exams_df`` with the two detail columns added (string dtype).
    """
    # Find the exam_index -> exams_df.index map.
    # exam_index is not unique in exams_df, because one exam may give
    # cyt, hist, and HPV results.
    # Therefore, we find the indices where there is an HPV test.
    hpv_indices = exams_df.query("exam_type == 'HPV'")["index"]
    mapping = pd.Series(data=hpv_indices.index, index=hpv_indices.values)

    hpv_details = hpv_details_per_exam(hpv_df)
    hpv_details.index = hpv_details.index.map(mapping)

    # The join leaves NaN in the new columns on the hist and cyt rows;
    # those are filled from the generic columns below.
    exams_df = exams_df.join(hpv_details)

    def _fill(base_series: pd.Series, fill_series: pd.Series) -> pd.Series:
        """Fill base series with fill series where base series is nan. Handles category data."""
        return base_series.astype("string").fillna(fill_series.astype("string"))

    # Set the Cytology and Histology results.
    exams_df["exam_detailed_type"] = _fill(
        exams_df["exam_detailed_type"], exams_df["exam_type"]
    )
    exams_df["exam_detailed_results"] = _fill(
        exams_df["exam_detailed_results"], exams_df["exam_diagnosis"]
    )

    return exams_df
162+
163+
164+
# Assuming the DataManager has hpv_df
165+
data_manager.exams_df = add_hpv_detailed_information(data_manager.exams_df, data_manager.hpv_df)
166+
```
167+
101168
## Install
102169

103170
## Parquet support

decipher/data/data_manager.py

+79-5
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from decipher.processing.pipeline import get_base_pipeline, read_raw_df
1515
from decipher.processing.transformers import (
1616
AgeAdder,
17+
HPVResults,
1718
ObservationMatrix,
1819
PersonStats,
1920
RiskAdder,
@@ -213,20 +214,77 @@ def metadata(self) -> dict:
213214

214215

215216
class DataManager:
217+
"""DataManager is a class for managing and organizing the datasets.
218+
219+
DataManager provides methods to read data from CSV files, save and load DataFrames
220+
as parquet files for improved performance, filter data, and get feature data.
221+
222+
Attributes:
223+
person_df: DataFrame containing personal data.
224+
exams_df: DataFrame containing data about exams.
225+
hpv_df: DataFrame containing HPV results data. See `decipher.processing.transformers.HPVResults` for details. Default is None.
226+
screening_data: DataFrame containing screening data. Default is None.
227+
metadata: Dictionary containing metadata.
228+
229+
Examples:
230+
**Reading data from CSV files**
231+
```python
232+
from pathlib import Path
233+
from decipher.data import DataManager
234+
235+
screening_data = Path(<screening data>)
236+
dob_data = Path(<dob data>)
237+
238+
# Read in from CSV
239+
data_manager = DataManager.read_from_csv(screening_data, dob_data)
240+
```
241+
242+
**Read and Write with Parquet**
243+
```
244+
from pathlib import Path
245+
from decipher.data import DataManager
246+
247+
screening_data = Path(<screening data>)
248+
dob_data = Path(<dob data>)
249+
parquet_dir = Path(<parquet dir>)
250+
251+
# Read in from CSV
252+
data_manager = DataManager.read_from_csv(screening_data, dob_data)
253+
254+
# Store to Parquet
255+
data_manager.save_to_parquet(parquet_dir, engine="pyarrow")
256+
257+
# Read from Parquet
258+
# Will fail if `decipher` version does not match that of stored data
259+
data_manager = DataManager.from_parquet(parquet_dir, engine="pyarrow")
260+
261+
# See metadata
262+
data_manager.metadata
263+
```
264+
265+
Note:
266+
It is strongly advised to read the CSV files once, and then store the DataManager
267+
to parquet. This gives much faster read times.
268+
"""
269+
216270
def __init__(
    self,
    person_df: pd.DataFrame,
    exams_df: pd.DataFrame,
    hpv_df: pd.DataFrame | None = None,
    screening_data: pd.DataFrame | None = None,
    metadata: dict | None = None,
):
    """Store the supplied DataFrames on the instance.

    Args:
        person_df: DataFrame with personal data.
        exams_df: DataFrame with exam data.
        hpv_df: Optional DataFrame with detailed HPV results.
        screening_data: Optional DataFrame with screening data.
        metadata: Optional metadata dict; when None, a dict holding the
            installed decipher version is used instead.
    """
    self.screening_data = screening_data
    self.person_df = person_df
    self.hpv_df = hpv_df
    self.exams_df = exams_df
    # Always record the decipher version when no metadata is supplied.
    self.metadata = metadata or {"decipher_version": version("decipher")}

228284
@classmethod
229-
def read_from_csv(cls, screening_path: Path, dob_path: Path):
285+
def read_from_csv(
286+
cls, screening_path: Path, dob_path: Path, read_hpv: bool = False
287+
):
230288
base_df = _get_base_df(screening_path, dob_path)
231289
logger.debug("Got base DF")
232290
exams = Pipeline(
@@ -238,11 +296,17 @@ def read_from_csv(cls, screening_path: Path, dob_path: Path):
238296
verbose=True,
239297
).fit_transform(base_df)
240298
logger.debug("Got exams DF")
299+
241300
person_df: pd.DataFrame = PersonStats(base_df=base_df).fit_transform(exams)
242301
logger.debug("Got person DF")
302+
303+
hpv_df = HPVResults().fit_transform(base_df) if read_hpv else None
304+
logger.debug("Got HPV DF")
305+
243306
return DataManager(
244307
person_df=person_df,
245308
exams_df=exams,
309+
hpv_df=hpv_df,
246310
)
247311

248312
def save_to_parquet(
@@ -254,13 +318,22 @@ def save_to_parquet(
254318
self.screening_data.to_parquet(
255319
directory / "screening_data.parquet", engine=engine
256320
)
321+
if self.hpv_df is not None:
322+
self.hpv_df.to_parquet(directory / "hpv_df.parquet", engine=engine)
257323
self.person_df.to_parquet(directory / "person_df.parquet", engine=engine)
258324
self.exams_df.to_parquet(directory / "exams_df.parquet", engine=engine)
259325
with open(directory / "metadata.json", "w") as file:
260326
# We always want to store the decipher version, so if it is not
261327
# in the metadata, add it.
262328
json.dump({"decipher_version": version("decipher")} | self.metadata, file)
263329

330+
@staticmethod
def _read_if_exists(
    path: Path, engine: _parquet_engine_types
) -> pd.DataFrame | None:
    """Load a parquet file, returning None when the file is absent.

    Args:
        path: Location of the parquet file.
        engine: Parquet engine forwarded to ``pd.read_parquet``.

    Returns:
        The loaded DataFrame, or None if ``path`` does not exist.
    """
    if not path.exists():
        return None
    return pd.read_parquet(path, engine=engine)
336+
264337
@classmethod
265338
def from_parquet(
266339
cls,
@@ -282,15 +355,16 @@ def from_parquet(
282355
)
283356
else:
284357
logger.warning(message)
285-
if (screening_file := directory / "screening_data.parquet").exists():
286-
screening_data = pd.read_parquet(screening_file, engine=engine)
287-
else:
288-
screening_data = None
358+
screening_data = cls._read_if_exists(
359+
directory / "screening_data.parquet", engine
360+
)
361+
hpv_df = cls._read_if_exists(directory / "hpv_df.parquet", engine)
289362
person_df = pd.read_parquet(directory / "person_df.parquet", engine=engine)
290363
exams_df = pd.read_parquet(directory / "exams_df.parquet", engine=engine)
291364
return DataManager(
292365
person_df=person_df,
293366
exams_df=exams_df,
367+
hpv_df=hpv_df,
294368
screening_data=screening_data,
295369
metadata=metadata,
296370
)

decipher/exam_data.py

+21
Original file line numberDiff line numberDiff line change
@@ -118,3 +118,24 @@ class ExamTypes(str, Enum):
118118
assert set(risk_mapping.keys()) == set(Diagnosis), set(risk_mapping.keys()) ^ set(
119119
Diagnosis
120120
)
121+
122+
# Codes as they appear in the raw screening data's hpvTesttype column.
HPV_TEST_TYPE_NAMES = {
    1: "HCII",
    2: "HCIII",
    3: "PreTect HPV-Proofer",
    4: "Amplicor",
    5: "PCR-primer",
    6: "Real time PCR",
    7: "Ventana Inform HPV (ISH)",
    8: "ISH andre",
    9: "PAP 13 Tele-lab",
    10: "Paptype13 realtime",
    11: "Cobas 4800 System",
    12: "Abbott RealTime High Risk HPV",
    13: "BD Onclarity HPV Assay",
    14: "Inno Lipa",
    15: "(Ukjent)",
    16: "Abbot Alinity",
    17: "Cobas 6800",
}
"""Mapping from HPV test type code to long form name."""

decipher/processing/transformers.py

+21-4
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from loguru import logger
99
from sklearn.base import BaseEstimator, TransformerMixin
1010

11-
from decipher.exam_data import Diagnosis, ExamTypes, risk_mapping
11+
from decipher.exam_data import HPV_TEST_TYPE_NAMES, Diagnosis, ExamTypes, risk_mapping
1212

1313

1414
class PandasTransformerMixin(TransformerMixin):
@@ -349,6 +349,15 @@ def count_in_time_window(
349349
class HPVResults(BaseEstimator, PandasTransformerMixin):
350350
"""Take a raw DF, and generate HPV results
351351
352+
The resulting DF will have the following columns:
353+
- PID
354+
- exam_index: the index of the exam in the raw data
355+
- hpvTesttype
356+
- hpvDate
357+
- genotype_field: the genotype column of the raw data, i.e. hpv1Genotype, hpv2Genotype, etc.
358+
- genotype: the genotype, i.e. 16, 18, HR, etc
359+
- hpv_test_type_name: the name of the test type, i.e. "Cobas 4800 System".
360+
352361
Warning:
353362
HPV negative and hpv non-conclusive are _not_ included!!!
354363
"""
@@ -365,15 +374,23 @@ def fit(self, X: pd.DataFrame, y=None):
365374
return self
366375

367376
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """Reshape raw rows into one row per (exam, genotype) HPV result."""
    # Rows without an HPV date carry no HPV result at all.
    with_hpv = X.dropna(subset="hpvDate").reset_index(names="exam_index")

    # One row per genotype column, then drop empty genotype cells.
    long_form = with_hpv.melt(
        id_vars=["PID", "exam_index", "hpvTesttype", "hpvDate"],
        value_vars=self.hpv_genotype_columns,
        var_name="genotype_field",
        value_name="genotype",
    ).dropna(subset="genotype")

    long_form = long_form.astype(
        {"genotype_field": "category", "genotype": "category", "hpvTesttype": "int"}
    )

    # Human-readable test type, e.g. "Cobas 4800 System".
    long_form["test_type_name"] = (
        long_form["hpvTesttype"].map(HPV_TEST_TYPE_NAMES).astype("category")
    )
    return long_form
377394

378395

379396
class ObservationMatrix(BaseEstimator, PandasTransformerMixin):

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "decipher"
3-
version = "0.1.32"
3+
version = "0.1.33"
44
description = "Utilities for Decipher"
55
authors = ["Thorvald Molthe Ballestad <[email protected]>"]
66
readme = "README.md"

tests/test_data.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131

3232
@pytest.fixture()
def data_manager() -> DataManager:
    # read_hpv=True so the tests also exercise the optional hpv_df attribute.
    return DataManager.read_from_csv(test_data_screening, test_data_dob, read_hpv=True)
3535

3636

3737
@pytest.mark.parametrize("min_non_hpv_exams", [0, 2, 3])
@@ -126,6 +126,11 @@ def test_parquet(
126126
)
127127
else:
128128
assert new_data_manager.screening_data is None
129+
if data_manager.hpv_df is not None:
130+
assert new_data_manager.hpv_df is not None
131+
assert new_data_manager.hpv_df.equals(data_manager.hpv_df)
132+
else:
133+
assert new_data_manager.hpv_df is None
129134
assert data_manager.person_df.equals(new_data_manager.person_df)
130135
assert data_manager.exams_df.equals(new_data_manager.exams_df)
131136

tests/test_processing.py

+26
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from hypothesis import assume, given
1111
from hypothesis.extra.pandas import column, data_frames, range_indexes
1212

13+
from decipher.exam_data import HPV_TEST_TYPE_NAMES
1314
from decipher.processing.pipeline import (
1415
get_base_pipeline,
1516
get_exam_pipeline,
@@ -228,6 +229,31 @@ def test_hpv_results():
228229
hpv_df = HPVResults().fit_transform(raw)
229230
logger.debug(f"HPV DF:\n{hpv_df.head()}")
230231

232+
assert not hpv_df.isna().any().any()
233+
234+
genotype_columns = ["hpv1Genotype", "hpv2Genotype"]
235+
236+
# Check that the data corresponds with the raw data
237+
for exam_index, results in hpv_df.groupby("exam_index"):
238+
raw_row = raw.loc[exam_index]
239+
240+
assert set(results["genotype"]) == set(raw_row[genotype_columns].dropna())
241+
242+
def _matches(field: str) -> bool:
243+
"""Assert the field is unique within the group and matches the raw data"""
244+
return (
245+
results[field].nunique() == 1
246+
and results[field].iloc[0] == raw_row[field]
247+
)
248+
249+
for field in ("PID", "hpvTesttype", "hpvDate"):
250+
assert _matches(field)
251+
252+
# Check that the test type names are correct
253+
assert hpv_df["test_type_name"].equals(
254+
hpv_df["hpvTesttype"].map(HPV_TEST_TYPE_NAMES).astype("category")
255+
)
256+
231257

232258
def test_read_from_csv(tmp_path: Path):
233259
data_file = tmp_path / "data.csv"

0 commit comments

Comments
 (0)