Skip to content

Commit b99ed48

Browse files
authored
Detailed HPV exam type info in exams_df (#11)
* Extend docstring on hpv_df
* Update test dataset to include all HPV test types
* Add detailed exam type to exams_df
* Update recipe in README
* Bump version to 0.1.35
1 parent d48cdda commit b99ed48

File tree

6 files changed

+63
-46
lines changed

6 files changed

+63
-46
lines changed

README.md

+4-27
Original file line numberDiff line numberDiff line change
@@ -110,57 +110,34 @@ In some cases, it is desirable to have this information in the `exams_df`, as ne
110110
Here we do it in two 'steps' to show more clearly what is going on.
111111

112112
```python
113-
def hpv_details_per_exam(hpv_df: pd.DataFrame) -> pd.DataFrame:
114-
"""Return a DataFrame with the HPV details per exam.
115-
116-
The index of the returned DataFrame is the exam_index. Note that this is
117-
not the same as the index of the exams_df!"""
118-
119-
per_exam = hpv_df.groupby("exam_index")
120-
if per_exam["hpvTesttype"].nunique().max() != 1:
121-
raise ValueError("Not all exams have the same HPV test type!")
122-
123-
return pd.DataFrame(
124-
{
125-
"exam_detailed_type": per_exam["test_type_name"].first(),
126-
"exam_detailed_results": per_exam["genotype"].apply(
127-
lambda genotypes: ",".join(genotypes)
128-
),
129-
}
130-
)
131-
132113
def add_hpv_detailed_information(
133114
exams_df: pd.DataFrame, hpv_df: pd.DataFrame
134115
) -> pd.DataFrame:
135-
""" "Add detailed exam type name and results to exams_df"""
116+
"""Add detailed results to exams_df, under the key "exam_detailed_results"."""
117+
136118
# Find the exam_index -> exams_df.index map
137119
# exam_index is not unique in exams_df, because one exam may give
138120
# cyt, hist, and HPV results
139121
# Therefore, we find the indices where there is an HPV test
140122
hpv_indices = exams_df.query("exam_type == 'HPV'")["index"]
141123
mapping = pd.Series(data=hpv_indices.index, index=hpv_indices.values)
142124

143-
hpv_details = hpv_details_per_exam(hpv_df)
125+
hpv_details = hpv_df.groupby("exam_index")["genotype"].apply(",".join)
144126
hpv_details.index = hpv_details.index.map(mapping)
145127

146-
# TODO: this will give nan on the hist and cyt rows
147-
exams_df = exams_df.join(hpv_details)
128+
exams_df["exam_detailed_results"] = hpv_details
148129

149130
# Set the Cytology and Histology results
150131
def _fill(base_series: pd.Series, fill_series: pd.Series) -> pd.Series:
151132
"""Fill base series with fill series where base series is nan. Handles category data."""
152133
return base_series.astype("string").fillna(fill_series.astype("string"))
153134

154-
exams_df["exam_detailed_type"] = _fill(
155-
exams_df["exam_detailed_type"], exams_df["exam_type"]
156-
)
157135
exams_df["exam_detailed_results"] = _fill(
158136
exams_df["exam_detailed_results"], exams_df["exam_diagnosis"]
159137
)
160138

161139
return exams_df
162140

163-
164141
# Assuming the DataManager has hpv_df
165142
data_manager.exams_df = add_hpv_detailed_information(data_manager.exams_df, data_manager.hpv_df)
166143
```

decipher/data/data_manager.py

+5
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,11 @@ class DataManager:
226226
screening_data: DataFrame containing screening data. Default is None.
227227
metadata: Dictionary containing metadata.
228228
229+
Warning:
230+
hpv_df contains details about the HPV genotype results.
231+
For tests without genotype results, there will be no entry in hpv_df.
232+
_However_, the test may still be positive.
233+
229234
Examples:
230235
*** Reading data from CSV files ***
231236
```python

decipher/processing/transformers.py

+17-2
Original file line numberDiff line numberDiff line change
@@ -121,13 +121,17 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
121121
exams = (
122122
X.reset_index()
123123
.melt(
124-
id_vars="index",
124+
id_vars=[
125+
"index",
126+
"hpvTesttype",
127+
], # Keep hpvTesttype. We will overwrite it for cyt and hist results later.
125128
value_vars=mapper.keys(), # type: ignore[arg-type]
126129
var_name="exam_type",
127130
value_name="exam_date",
128131
)
129-
.dropna()
132+
.dropna(subset=["exam_date"])
130133
.astype({"exam_type": "category"})
134+
.rename(columns={"hpvTesttype": "detailed_exam_type"})
131135
)
132136

133137
# Join on result columns
@@ -150,6 +154,17 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
150154
.astype("category")
151155
)
152156

157+
# Set detailed_exam_type for cyt and hist exams
158+
not_hpv_result = exams["exam_type"] != ExamTypes.HPV
159+
exams.loc[not_hpv_result, "detailed_exam_type"] = exams.loc[
160+
not_hpv_result, "exam_type"
161+
].apply(lambda exam_type: exam_type.value)
162+
# Map hpv type codes to names
163+
exams.loc[~not_hpv_result, "detailed_exam_type"] = exams.loc[
164+
~not_hpv_result, "detailed_exam_type"
165+
].map(HPV_TEST_TYPE_NAMES)
166+
exams["detailed_exam_type"] = exams["detailed_exam_type"].astype("category")
167+
153168
return exams.join(X[self.fields_to_keep], on="index")
154169

155170
@staticmethod

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "decipher"
3-
version = "0.1.34"
3+
version = "0.1.35"
44
description = "Utilities for Decipher"
55
authors = ["Thorvald Molthe Ballestad <[email protected]>"]
66
readme = "README.md"

tests/test_processing.py

+21-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,12 @@
1919
read_raw_df,
2020
write_to_csv,
2121
)
22-
from decipher.processing.transformers import HPVResults, ObservationMatrix, PersonStats
22+
from decipher.processing.transformers import (
23+
HPVResults,
24+
ObservationMatrix,
25+
PersonStats,
26+
ToExam,
27+
)
2328

2429
logger = logging.getLogger(__name__)
2530

@@ -43,6 +48,21 @@ def test_base_pipeline():
4348
)
4449

4550

51+
def test_to_exam():
52+
base_df = get_base_pipeline(
53+
test_data_dob, drop_missing_birthday=True
54+
).fit_transform(read_raw_df(test_data_screening))
55+
exams = ToExam().fit_transform(base_df)
56+
assert exams["detailed_exam_type"].notna().all()
57+
assert exams["detailed_exam_type"].dtype == "category"
58+
59+
assert set(exams["exam_type"]) == {"HPV", "cytology", "histology"}
60+
assert set(exams["detailed_exam_type"]) == set(HPV_TEST_TYPE_NAMES.values()) | {
61+
"cytology",
62+
"histology",
63+
}
64+
65+
4666
def test_read_and_exam_pipeline():
4767
"""Simply try reading and running pipeline"""
4868
raw = read_raw_df(test_data_screening)

tests/test_processing_datasets/test_screening_data.csv

+15-15
Original file line numberDiff line numberDiff line change
@@ -10,38 +10,38 @@
1010
8,3,HSIL,2001-04-06,,,,,,,
1111
9,3,ADC,2016-04-01,,,,,,,
1212
10,3,SCC,2013-06-10,,,,,,,
13-
11,4,,,80703.0,2004-03-09,negativ,2004-03-09,88,,11.0
14-
12,4,,,80703.0,2004-03-09,negativ,2004-03-09,88,,12.0
15-
13,4,,,80703.0,2004-03-09,positiv,2004-03-09,HR,HR,12.0
16-
14,4,,,80703.0,2004-03-09,positiv,2004-03-09,HR,HX,12.0
13+
11,4,,,80703.0,2004-03-09,negativ,2004-03-09,88,,1.0
14+
12,4,,,80703.0,2004-03-09,negativ,2004-03-09,88,,2.0
15+
13,4,,,80703.0,2004-03-09,positiv,2004-03-09,HR,HR,3.0
16+
14,4,,,80703.0,2004-03-09,positiv,2004-03-09,HR,HX,4.0
1717
15,4,ACIS,2003-05-02,,,,,,,
1818
16,4,,,80402.0,2019-12-05,,,,,
1919
17,5,ACIS,2016-12-28,,,,,,,
2020
18,5,,,80703.0,2018-02-27,,,,,
21-
19,5,,,74007.0,2004-12-05,positiv,2004-12-05,,,11.0
21+
19,5,,,74007.0,2004-12-05,positiv,2004-12-05,,,5.0
2222
20,6,LSIL,2002-02-06,,,,,,,
2323
21,6,ASC-US,2002-10-03,,,,,,,
2424
22,6,SCC,2008-11-25,,,,,,,
2525
23,6,ASC-H,2016-04-27,,,,,,,
2626
24,6,Metastase,2000-03-05,,,,,,,
27-
25,8,Normal,2001-04-09,,,negativ,2001-04-09,32,88,12.0
27+
25,8,Normal,2001-04-09,,,negativ,2001-04-09,32,88,6.0
2828
26,8,Metastase,2001-12-15,,,,,,,
29-
27,9,,,74007.0,2018-01-21,positiv,2018-01-21,,,1.0
29+
27,9,,,74007.0,2018-01-21,positiv,2018-01-21,,,7.0
3030
28,9,Normal,2007-08-15,,,,,,,
3131
29,9,ACIS,2018-05-20,,,,,,,
3232
30,10,Cancer Cervix cancer andre/usp,2020-02-29,,,,,,,
33-
31,10,,,74007.0,2010-12-04,uegnet,2010-12-04,,,11.0
33+
31,10,,,74007.0,2010-12-04,uegnet,2010-12-04,,,8.0
3434
32,10,,,10.0,2009-11-17,,,,,
3535
33,10,ASC-US,2019-01-09,,,,,,,
36-
34,11,SCC,2019-06-15,,,positiv,2019-06-15,,,12.0
36+
34,11,SCC,2019-06-15,,,positiv,2019-06-15,,,9.0
3737
35,11,SCC,2000-11-09,,,,,,,
3838
36,11,ACIS,2008-11-15,,,,,,,
3939
37,12,,,82103.0,2006-03-01,,,,,
4040
38,12,Uegnet,2016-11-25,,,,,,,
4141
39,13,,,80402.0,2016-01-27,,,,,
4242
40,13,HSIL,2003-11-16,,,,,,,
4343
41,14,ACIS,2008-11-01,,,,,,,
44-
42,14,ADC,2011-06-15,,,uegnet,2011-06-15,16,,11.0
44+
42,14,ADC,2011-06-15,,,uegnet,2011-06-15,16,,10.0
4545
43,14,,,100.0,2001-07-14,,,,,
4646
44,16,SCC,2016-06-30,,,,,,,
4747
45,16,Normal,2002-07-28,,,,,,,
@@ -50,11 +50,11 @@
5050
48,17,,,1000.0,2016-05-22,,,,,
5151
49,18,Normal m betennelse eller blod,2014-08-24,,,uegnet,2014-08-24,,,11.0
5252
50,18,SCC,2019-06-25,,,,,,,
53-
51,18,Uegnet,2008-01-15,,,uegnet,2008-01-15,HR,,12.0
54-
52,18,,,100.0,2007-07-04,uegnet,2007-07-04,88,18,1.0
55-
53,18,,,100.0,2007-07-04,uegnet,2007-07-04,,,11.0
56-
54,18,,,100.0,2007-07-04,uegnet,2007-07-04,,,11.0
57-
55,18,,,100.0,2007-07-04,uegnet,2007-07-04,16,,11.0
53+
51,18,Uegnet,2008-01-15,,,uegnet,2008-01-15,HR,,13.0
54+
52,18,,,100.0,2007-07-04,uegnet,2007-07-04,88,18,14.0
55+
53,18,,,100.0,2007-07-04,uegnet,2007-07-04,,,15.0
56+
54,18,,,100.0,2007-07-04,uegnet,2007-07-04,,,16.0
57+
55,18,,,100.0,2007-07-04,uegnet,2007-07-04,16,,17.0
5858
56,18,LSIL,2018-11-14,,,,,,,
5959
57,19,SCC,2011-02-26,,,,,,,
6060
58,19,Cancer Cervix cancer andre/usp,2016-01-14,,,,,,,

0 commit comments

Comments
 (0)