Skip to content

Commit b12ca8c

Browse files
authored
Merge pull request #5 from Simula-Consulting/CAN-418/more-hpv-features
Adds more features to `person_df`
2 parents ac539d9 + e03ab74 commit b12ca8c

File tree

3 files changed

+112
-77
lines changed

3 files changed

+112
-77
lines changed

decipher/data/data_manager.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -390,8 +390,6 @@ def get_feature_data(
390390
... dtype="int8",
391391
... )
392392
"""
393-
if self.screening_data is None:
394-
raise ValueError("Screening data is None!")
395393
columns = (
396394
list(columns)
397395
if columns is not None
@@ -402,6 +400,10 @@ def get_feature_data(
402400
"has_hr_2",
403401
]
404402
)
403+
if not set(columns).issubset(self.person_df.columns):
404+
raise ValueError(
405+
f"{set(columns) - set(self.person_df.columns)} are not in the person_df"
406+
)
405407

406408
people_in_data = (
407409
self.person_df[self.person_df.index.isin(pids)]

decipher/processing/transformers.py

+107-74
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,16 @@
1-
import itertools
2-
import logging
31
from dataclasses import dataclass
42
from datetime import timedelta
5-
from functools import partial
63
from pathlib import Path
7-
from typing import Any, Callable, Iterable
4+
from typing import Any, Callable, cast
85

96
import numpy as np
107
import numpy.typing as npt
118
import pandas as pd
9+
from loguru import logger
1210
from sklearn.base import BaseEstimator, TransformerMixin
1311

1412
from decipher.exam_data import Diagnosis, ExamTypes, risk_mapping
1513

16-
logger = logging.getLogger(__name__)
17-
1814

1915
class PandasTransformerMixin(TransformerMixin):
2016
"""Transformer mixin with type hint set to Pandas."""
@@ -52,6 +48,9 @@ def fit(self, X: pd.DataFrame, y=None):
5248
raise ValueError(
5349
f"Column {column} must have dtype {expected_type}, but it is {actual_type}"
5450
)
51+
logger.debug(
52+
f"Column {column} has dtype {actual_type}, expected {expected_type}. {actual_type == expected_type}"
53+
)
5554

5655
return self
5756

@@ -211,11 +210,31 @@ class PersonStats(BaseEstimator, PandasTransformerMixin):
211210

212211
def __init__(self, base_df: pd.DataFrame | None = None) -> None:
213212
self.base_df = base_df
213+
self.high_risk_hpv_types: list[str | int] = [
214+
16,
215+
8,
216+
45,
217+
31,
218+
33,
219+
35,
220+
52,
221+
58,
222+
39,
223+
51,
224+
56,
225+
59,
226+
68,
227+
"HR",
228+
"HF",
229+
"HD",
230+
"HE",
231+
]
232+
self.low_risk_hpv_types: list[str | int] = [11, 6]
214233

215234
def fit(self, X: pd.DataFrame, y=None):
216235
CleanData(
217236
dtypes={
218-
"PID": "int",
237+
"PID": "int64",
219238
"exam_type": None,
220239
"exam_date": "datetime64[ns]",
221240
"age": "timedelta64[ns]",
@@ -231,85 +250,99 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
231250
person_df = person_df.join(X.groupby("PID")["FOEDT"].agg("first"), on="PID")
232251

233252
if self.base_df is not None:
234-
person_df = person_df.join(self._get_hpv_features())
253+
person_df = person_df.join(self._get_hpv_features(X, person_df))
235254
return person_df
236255

237-
def _get_hpv_features(self) -> pd.DataFrame:
238-
"""Construct a DataFrame with relevant HPV features.
239-
240-
Warning:
241-
As it is currently implemented, there might be some confusing behavior when
242-
for example there are multiple HPV exams at the same time point.
243-
For example, if a person has two HPV exams at the same time, one giving
244-
positive and one giving negative, they will both be counted in the
245-
'has_positive' and 'has_negative' features.
256+
def _get_hpv_features(
    self, exams_df: pd.DataFrame, person_df: pd.DataFrame
) -> pd.DataFrame:
    """Construct and return a DataFrame with a variety of HPV related features.

    The returned frame is indexed by the unique PIDs of `self.base_df` and
    has the columns:
    - 'count_positive': Count of positive HPV results per PID (0 if none).
    - 'count_negative': Count of negative HPV results per PID (0 if none).
    - 'number_of_screenings': Count of exams with a non-null 'risk' per PID
      (0 if none).
    - 'age_last_exam': Age at last exam per PID.
    - 'hr_count': Count of high-risk HPV types per PID (0 if none).
    - 'lr_count': Count of low-risk HPV types per PID (0 if none).
    - 'age_first_hr': Age at first detection of high-risk HPV types per PID.
    - 'age_first_lr': Age at first detection of low-risk HPV types per PID.
    - 'age_first_positive': Age at first positive result per PID.
    - 'age_first_negative': Age at first negative result per PID.

    All ages are in years, computed as whole days divided by 365; they are
    NaN for PIDs without a matching record.

    Arguments:
    - exams_df - Should have 'PID', 'risk', 'age' and 'exam_diagnosis' fields.
    - person_df - Should have a 'FOEDT' field. It is subtracted from per-PID
      dates by index alignment, so it is assumed to be indexed by PID
      (TODO confirm against the caller).

    Raises:
    - ValueError - If `self.base_df` is None.
    """
    if self.base_df is None:
        raise ValueError("base_df must be set to compute HPV features")
    hpv_details_df: pd.DataFrame = HPVResults().fit_transform(self.base_df)

    feature_df = pd.DataFrame(index=self.base_df["PID"].unique())

    def _to_years(deltas: pd.Series) -> pd.Series:
        """Convert a Series of time deltas to (fractional) years."""
        return deltas.apply(lambda delta: delta.days / 365)

    def _count_where_result_match(match: str, field_to_query: str = "hpvResultat"):
        """Get pid-counts of base_df rows where `field_to_query` equals `match`."""
        # MyPy does not infer the correct type of self.base_df inside the closure
        # even though we have a guard above.
        base_df = cast(pd.DataFrame, self.base_df)
        return base_df.query(f"{field_to_query} == '{match}'")["PID"].value_counts()

    def _age_first_field_match(match: str, field_to_query: str = "exam_diagnosis"):
        """Get pid-age of the earliest exam where `field_to_query` equals `match`."""
        # NOTE(review): the counts above match on base_df's 'hpvResultat',
        # while this matches on exams_df's 'exam_diagnosis' - confirm that the
        # latter really holds the strings 'positiv' / 'negativ'.
        matching_exams = exams_df.query(f"{field_to_query} == '{match}'")
        return _to_years(matching_exams.groupby("PID")["age"].agg("min"))

    def _rows_with_types(hpv_types: list[str | int]) -> pd.DataFrame:
        """Select the HPV detail rows whose 'value' is one of `hpv_types`."""
        return hpv_details_df[hpv_details_df["value"].isin(hpv_types)]

    def _hpv_type_counts(hr_types: list[str | int]):
        """Get pid-counts for hr_types"""
        return _rows_with_types(hr_types)["PID"].value_counts()

    def _age_first_hr(hr_types: list[str | int]):
        """Get pid-age for hr_types"""
        first_dates = _rows_with_types(hr_types).groupby("PID")["hpvDate"].agg("min")
        # Index-aligned subtraction: PIDs without a matching HPV row become NaN.
        return _to_years(first_dates - person_df["FOEDT"])

    # Count features: PIDs without any matching row get an explicit 0.
    feature_df["count_positive"] = _count_where_result_match(
        match="positiv"
    ).reindex(feature_df.index, fill_value=0)
    feature_df["count_negative"] = _count_where_result_match(
        match="negativ"
    ).reindex(feature_df.index, fill_value=0)
    # Previously assigned without reindexing, which left NaN (not 0) for
    # people without any screening, unlike every other count column.
    feature_df["number_of_screenings"] = (
        exams_df.dropna(subset=["risk"])["PID"]
        .value_counts()
        .reindex(feature_df.index, fill_value=0)
    )
    feature_df["hr_count"] = _hpv_type_counts(
        hr_types=self.high_risk_hpv_types
    ).reindex(feature_df.index, fill_value=0)
    feature_df["lr_count"] = _hpv_type_counts(
        hr_types=self.low_risk_hpv_types
    ).reindex(feature_df.index, fill_value=0)

    # Age features: left as NaN where there is no matching record.
    feature_df["age_last_exam"] = _to_years(
        exams_df.groupby("PID")["age"].agg("max")
    )
    feature_df["age_first_hr"] = _age_first_hr(hr_types=self.high_risk_hpv_types)
    feature_df["age_first_lr"] = _age_first_hr(hr_types=self.low_risk_hpv_types)
    feature_df["age_first_positive"] = _age_first_field_match(match="positiv")
    feature_df["age_first_negative"] = _age_first_field_match(match="negativ")

    # Keep the column order of the original implementation.
    return feature_df[
        [
            "count_positive",
            "count_negative",
            "number_of_screenings",
            "age_last_exam",
            "hr_count",
            "lr_count",
            "age_first_hr",
            "age_first_lr",
            "age_first_positive",
            "age_first_negative",
        ]
    ]
294345

295-
def _get_people_with_hr_hpv(
296-
self, hpv_details_df: pd.DataFrame, hr_subgroups: list[int] | None = None
297-
):
298-
# Risky HPV types, grouped from most risky group
299-
risky_hpv_types: list[list[int | str]] = [
300-
[16, 18, 45],
301-
["HR"],
302-
[31, 33, 35, 52, 58],
303-
]
304-
if hr_subgroups is None:
305-
hr_subgroups = list(range(len(risky_hpv_types)))
306-
hr_types: Iterable[int | str] = itertools.chain.from_iterable(
307-
risky_hpv_types[subgroup] for subgroup in hr_subgroups
308-
)
309-
pids = hpv_details_df[hpv_details_df["value"].isin(hr_types)]["PID"].unique()
310-
values = True
311-
return pids, values
312-
313346

314347
class HPVResults(BaseEstimator, PandasTransformerMixin):
315348
"""Take a raw DF, and generate HPV results

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "decipher"
3-
version = "0.1.27"
3+
version = "0.1.28"
44
description = "Utilities for Decipher"
55
authors = ["Thorvald Molthe Ballestad <[email protected]>"]
66
readme = "README.md"

0 commit comments

Comments
 (0)