1
- import itertools
2
- import logging
3
1
from dataclasses import dataclass
4
2
from datetime import timedelta
5
- from functools import partial
6
3
from pathlib import Path
7
- from typing import Any , Callable , Iterable
4
+ from typing import Any , Callable , cast
8
5
9
6
import numpy as np
10
7
import numpy .typing as npt
11
8
import pandas as pd
9
+ from loguru import logger
12
10
from sklearn .base import BaseEstimator , TransformerMixin
13
11
14
12
from decipher .exam_data import Diagnosis , ExamTypes , risk_mapping
15
13
16
- logger = logging .getLogger (__name__ )
17
-
18
14
19
15
class PandasTransformerMixin (TransformerMixin ):
20
16
"""Transformer mixin with type hint set to Pandas."""
@@ -52,6 +48,9 @@ def fit(self, X: pd.DataFrame, y=None):
52
48
raise ValueError (
53
49
f"Column { column } must have dtype { expected_type } , but it is { actual_type } "
54
50
)
51
+ logger .debug (
52
+ f"Column { column } has dtype { actual_type } , expected { expected_type } . { actual_type == expected_type } "
53
+ )
55
54
56
55
return self
57
56
@@ -211,11 +210,31 @@ class PersonStats(BaseEstimator, PandasTransformerMixin):
211
210
212
211
def __init__ (self , base_df : pd .DataFrame | None = None ) -> None :
213
212
self .base_df = base_df
213
+ self .high_risk_hpv_types : list [str | int ] = [
214
+ 16 ,
215
+ 8 ,
216
+ 45 ,
217
+ 31 ,
218
+ 33 ,
219
+ 35 ,
220
+ 52 ,
221
+ 58 ,
222
+ 39 ,
223
+ 51 ,
224
+ 56 ,
225
+ 59 ,
226
+ 68 ,
227
+ "HR" ,
228
+ "HF" ,
229
+ "HD" ,
230
+ "HE" ,
231
+ ]
232
+ self .low_risk_hpv_types : list [str | int ] = [11 , 6 ]
214
233
215
234
def fit (self , X : pd .DataFrame , y = None ):
216
235
CleanData (
217
236
dtypes = {
218
- "PID" : "int " ,
237
+ "PID" : "int64 " ,
219
238
"exam_type" : None ,
220
239
"exam_date" : "datetime64[ns]" ,
221
240
"age" : "timedelta64[ns]" ,
@@ -231,85 +250,99 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
231
250
person_df = person_df .join (X .groupby ("PID" )["FOEDT" ].agg ("first" ), on = "PID" )
232
251
233
252
if self .base_df is not None :
234
- person_df = person_df .join (self ._get_hpv_features ())
253
+ person_df = person_df .join (self ._get_hpv_features (X , person_df ))
235
254
return person_df
236
255
237
def _get_hpv_features(
    self, exams_df: pd.DataFrame, person_df: pd.DataFrame
) -> pd.DataFrame:
    """Construct and return a DataFrame with a variety of HPV related features.

    The result is indexed by the unique PIDs of `self.base_df` and has the
    following columns:
        - 'count_positive': Number of `self.base_df` rows per PID where
          'hpvResultat' is 'positiv' (0 when there are none).
        - 'count_negative': Number of `self.base_df` rows per PID where
          'hpvResultat' is 'negativ' (0 when there are none).
        - 'number_of_screenings': Number of `exams_df` rows per PID with a
          non-null 'risk' (NaN when there are none).
        - 'age_last_exam': Largest 'age' per PID in `exams_df`, in years.
        - 'hr_count': Number of HPV detail rows per PID whose 'value' is in
          `self.high_risk_hpv_types` (0 when there are none).
        - 'lr_count': Same as 'hr_count' for `self.low_risk_hpv_types`.
        - 'age_first_hr': Earliest 'hpvDate' of a high-risk type per PID,
          minus the person's 'FOEDT', in years.
        - 'age_first_lr': Same as 'age_first_hr' for low-risk types.
        - 'age_first_positive': Smallest 'age' per PID among `exams_df` rows
          where 'exam_diagnosis' is 'positiv', in years.
        - 'age_first_negative': Same as 'age_first_positive' for 'negativ'.

    All ages are computed as whole days divided by 365.

    Arguments:
        exams_df: Should have 'PID', 'risk', 'age' and 'exam_diagnosis' fields.
        person_df: Should have a 'FOEDT' field.
            NOTE(review): the date subtraction aligns on the index, so this
            presumably needs to be indexed by PID - confirm against caller.

    Raises:
        ValueError: If `self.base_df` is None.
    """
    # Bind to a local so the None-guard also narrows the type for everything
    # below (no cast needed).
    base_df = self.base_df
    if base_df is None:
        raise ValueError("base_df must be set to compute HPV features")
    hpv_details_df: pd.DataFrame = HPVResults().fit_transform(base_df)

    feature_df = pd.DataFrame(index=base_df["PID"].unique())

    def _years(durations: pd.Series) -> pd.Series:
        """Convert timedeltas to fractional years (whole days / 365)."""
        return durations.apply(lambda delta: delta.days / 365)

    def _rows_equal(frame: pd.DataFrame, column: str, match: str) -> pd.DataFrame:
        """Return the rows of `frame` where `column` equals `match`."""
        return frame[frame[column] == match]

    def _zero_filled_counts(pids: pd.Series) -> pd.Series:
        """Count occurrences per PID, with 0 for PIDs that never occur."""
        return pids.value_counts().reindex(feature_df.index, fill_value=0)

    def _rows_of_types(hpv_types: list[str | int]) -> pd.DataFrame:
        """Return the HPV detail rows whose 'value' is one of `hpv_types`."""
        return hpv_details_df[hpv_details_df["value"].isin(hpv_types)]

    def _age_first_type(hpv_types: list[str | int]) -> pd.Series:
        """Age in years at the earliest detection of any of `hpv_types`."""
        first_dates = _rows_of_types(hpv_types).groupby("PID")["hpvDate"].agg("min")
        return _years(first_dates - person_df["FOEDT"])

    def _age_first_diagnosis(match: str) -> pd.Series:
        """Age in years at the first exam whose diagnosis equals `match`."""
        matching = _rows_equal(exams_df, "exam_diagnosis", match)
        return _years(matching.groupby("PID")["age"].agg("min"))

    feature_df["count_positive"] = _zero_filled_counts(
        _rows_equal(base_df, "hpvResultat", "positiv")["PID"]
    )
    feature_df["count_negative"] = _zero_filled_counts(
        _rows_equal(base_df, "hpvResultat", "negativ")["PID"]
    )
    # Unlike the other counts this one is not zero-filled: PIDs without any
    # non-null 'risk' row end up as NaN after index alignment.
    feature_df["number_of_screenings"] = exams_df.dropna(subset=["risk"])[
        "PID"
    ].value_counts()
    feature_df["age_last_exam"] = _years(exams_df.groupby("PID")["age"].agg("max"))
    feature_df["hr_count"] = _zero_filled_counts(
        _rows_of_types(self.high_risk_hpv_types)["PID"]
    )
    feature_df["lr_count"] = _zero_filled_counts(
        _rows_of_types(self.low_risk_hpv_types)["PID"]
    )
    feature_df["age_first_hr"] = _age_first_type(self.high_risk_hpv_types)
    feature_df["age_first_lr"] = _age_first_type(self.low_risk_hpv_types)
    feature_df["age_first_positive"] = _age_first_diagnosis("positiv")
    feature_df["age_first_negative"] = _age_first_diagnosis("negativ")

    return feature_df
294
345
295
- def _get_people_with_hr_hpv (
296
- self , hpv_details_df : pd .DataFrame , hr_subgroups : list [int ] | None = None
297
- ):
298
- # Risky HPV types, grouped from most risky group
299
- risky_hpv_types : list [list [int | str ]] = [
300
- [16 , 18 , 45 ],
301
- ["HR" ],
302
- [31 , 33 , 35 , 52 , 58 ],
303
- ]
304
- if hr_subgroups is None :
305
- hr_subgroups = list (range (len (risky_hpv_types )))
306
- hr_types : Iterable [int | str ] = itertools .chain .from_iterable (
307
- risky_hpv_types [subgroup ] for subgroup in hr_subgroups
308
- )
309
- pids = hpv_details_df [hpv_details_df ["value" ].isin (hr_types )]["PID" ].unique ()
310
- values = True
311
- return pids , values
312
-
313
346
314
347
class HPVResults (BaseEstimator , PandasTransformerMixin ):
315
348
"""Take a raw DF, and generate HPV results
0 commit comments