Skip to content

Commit cc1154d

Browse files
committed
#30 adding marketyears to the generate_module function
Taking Eugene's restructure branch (from PR #59), this updates the following: 1. Added a cleaned version of Rob's code to database.py --> creates an ingredient_rxcui_year table & a product_rxcui_year table, for their respective distributions in generate_module. 2. Added a 'year' column to the generate_module dataframes/CSV files. 3. Fixed the default_probability typo in utils.py (if idx == 1 --> changed to if idx == 0).
1 parent 5f72506 commit cc1154d

File tree

2 files changed

+79
-21
lines changed

2 files changed

+79
-21
lines changed

src/mdt/database.py

+53-7
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
from . import rxnorm, meps, fda
1+
from mdt import rxnorm, meps, fda
22
from pathlib import Path
33
import zipfile
44
import io
55
import sqlite3
66
import pandas as pd
7+
from datetime import datetime
78

89

910
def to_data():
@@ -152,28 +153,73 @@ def load_fda():
152153
z = zipfile.ZipFile(
153154
fda.utils.get_dataset(handler=io.BytesIO)
154155
)
156+
157+
#moves FDA files to sqlite database by reading as dataframes
155158
product = pd.read_csv(z.open('product.txt'),sep='\t',dtype=object,header=0,encoding='cp1252')
156159
package = pd.read_csv(z.open('package.txt'),sep='\t',dtype=object,header=0,encoding='cp1252')
157160
sql_create_table('product',product)
158161
sql_create_table('package',package)
159-
del product
160-
del package
162+
161163

162164
#deletes FDA ZIP
163165
del z
164166

165-
#NOTE: Rob's python code to join one of these tables with the rxcui_ndc table goes here
166-
"""
167+
168+
169+
#join product table with the rxcui_ndc table
167170
rxcui_ndc_string = read_sql_string('rxcui_ndc.sql')
168171
rxcui_ndc = db_query(rxcui_ndc_string)
169172
sql_create_table('rxcui_ndc', rxcui_ndc)
170-
del rxcui_ndc
171-
"""
172173

173174

175+
product['PRODUCTNDC'] = product['PRODUCTNDC'].str.replace('-', '').str.zfill(9)
176+
rxcui_ndc['medication_ndc'] = rxcui_ndc['medication_ndc'].astype(str).str.zfill(9)
177+
product_rxcui = product.merge(rxcui_ndc, left_on = 'PRODUCTNDC', right_on = rxcui_ndc['medication_ndc'].str.slice(start=0,stop=9), how = 'left')
178+
179+
180+
#extract year from startmarketingdate & endmarketingdate
181+
#fill NULL endmarketingyear with current year
182+
product_rxcui['STARTMARKETINGYEAR'] = product_rxcui['STARTMARKETINGDATE'].str.slice(start=0, stop=4).astype(int)
183+
product_rxcui['ENDMARKETINGYEAR'] = product_rxcui['ENDMARKETINGDATE'].str.slice(start=0, stop=4)
184+
product_rxcui['ENDMARKETINGYEAR'] = product_rxcui['ENDMARKETINGYEAR'].fillna(datetime.now().year)
185+
product_rxcui['ENDMARKETINGYEAR'] = product_rxcui['ENDMARKETINGYEAR'].astype(int)
186+
product_rxcui = product_rxcui[['medication_ingredient_rxcui', 'medication_ingredient_name', 'medication_product_rxcui',
187+
'medication_product_name', 'STARTMARKETINGYEAR', 'ENDMARKETINGYEAR']]
188+
189+
med_marketing_year_dict = {}
190+
med_state_level_list = ['medication_ingredient', 'medication_product']
191+
192+
#create a dictionary of dataframes (one for ingredient, the other for product) that contains the range of years that each rxcui was available on the market
193+
def med_marketing_year(med_state_level_list):
    """Build and store one rxcui-by-year table per medication level.

    For each level name in ``med_state_level_list`` (e.g.
    'medication_ingredient'), this reads the enclosing ``product_rxcui``
    dataframe, records two dataframes in ``med_marketing_year_dict`` and
    writes the per-year dataframe to the database via ``sql_create_table``.

    Entries written to ``med_marketing_year_dict`` for a level ``<level>``:
      * ``<level>_max_marketingyear_range``: MIN start year and MAX end
        year for each (rxcui, name) pair.
      * ``<level>_rxcui_years``: one row per rxcui per year in the
        inclusive range between those two years.
    """
    for level in med_state_level_list:
        rxcui_col = level + '_rxcui'
        name_col = level + '_name'
        range_key = level + '_max_marketingyear_range'
        years_key = level + '_rxcui_years'

        # Earliest start year and latest end year for each rxcui/name pair.
        year_range = (
            product_rxcui
            .groupby([rxcui_col, name_col])
            .agg({'STARTMARKETINGYEAR': 'min', 'ENDMARKETINGYEAR': 'max'})
            .reset_index()
        )
        med_marketing_year_dict[range_key] = year_range

        # Expand every rxcui into one (rxcui, year) row per year of its
        # inclusive [start, end] marketing range.
        rows = []
        spans = zip(
            year_range[rxcui_col],
            year_range['STARTMARKETINGYEAR'],
            year_range['ENDMARKETINGYEAR'],
        )
        for rxcui, first_year, last_year in spans:
            for year in range(first_year, last_year + 1):
                rows.append((rxcui, year))

        years_df = pd.DataFrame(rows, columns=[rxcui_col, 'year'])
        med_marketing_year_dict[years_key] = years_df

        # Persist the per-year table under the same name as its dict key.
        sql_create_table(years_key, years_df)
        print(years_key)
204+
205+
med_marketing_year(med_state_level_list)
206+
207+
#deletes other dataframes
208+
del product
209+
del package
210+
del rxcui_ndc
211+
del medication_ingredient_rxcui_years
212+
del medication_product_rxcui_years
213+
174214
#TEST!!!!!!!!!!!!!!!! reads record count from created database
175215
product = db_query("Select count(*) AS records from product limit 1")
176216
print('DB table product has {0} records'.format(product['records'].iloc[0]))
177217

178218
package = db_query("Select count(*) AS records from package limit 1")
179219
print('DB table package has {0} records'.format(package['records'].iloc[0]))
220+
221+
medication_product_rxcui_years = db_query("Select count(*) AS records from medication_product_rxcui_years limit 1")
222+
print('DB table medication_product_rxcui_years has {0} records'.format(medication_product_rxcui_years['records'].iloc[0]))
223+
224+
medication_ingredient_rxcui_years = db_query("Select count(*) AS records from medication_ingredient_rxcui_years limit 1")
225+
print('DB table medication_ingredient_rxcui_years has {0} records'.format(medication_ingredient_rxcui_years['records'].iloc[0]))

src/mdt/utils.py

+26-14
Original file line numberDiff line numberDiff line change
@@ -117,9 +117,16 @@ def generate_module(rxcui_ndc_df, rxclass_name):
117117
#Read in MEPS Reference table
118118
meps_reference = db_query(meps.utils.get_sql('meps_reference.sql'))
119119

120+
#Read in FDA Ingredient-RxCUI-Years Reference table (for years that a given ingredient was available on the market)
121+
ingredient_rxcui_years = db_query('SELECT * FROM medication_ingredient_rxcui_years')
122+
123+
#Read in FDA Product-RxCUI-Years Reference table (for years that a given product was available on the market)
124+
product_rxcui_years = db_query('SELECT * FROM medication_product_rxcui_years')
125+
120126
#Join MEPS to filtered rxcui_ndc dataframe (rxcui_list)
121127
meps_rxcui = meps_reference.astype(str).merge(rxcui_ndc_df.astype(str)[['medication_ingredient_name', 'medication_ingredient_rxcui','medication_product_name', 'medication_product_rxcui', 'medication_ndc']], how = 'inner', left_on = 'RXNDC', right_on = 'medication_ndc')
122128

129+
123130
#Optional: Age range join - can be customized in the mdt_config.json file
124131
#groupby_demographic_variable: must be either an empty list [] or list of patient demographics (e.g., age, gender, state) - based on user inputs in the mdt_config.json file
125132

@@ -162,20 +169,23 @@ def generate_module(rxcui_ndc_df, rxclass_name):
162169

163170
filename = rxclass_name + '_ingredient_distrib'
164171
#1
165-
dcp_dict['patient_count_ingredient'] = meps_rxcui[['medication_ingredient_name', 'medication_ingredient_rxcui', 'person_weight', 'DUPERSID']+groupby_demographic_variables].groupby(['medication_ingredient_name', 'medication_ingredient_rxcui', 'person_weight']+groupby_demographic_variables)['DUPERSID'].nunique()
172+
#Join MEPS to ingredient_rxcui_years dataframe (rxcuis_by_fda_marketingdates)
173+
meps_rxcui_ingred_years = meps_rxcui.astype(str).merge(ingredient_rxcui_years.astype(str)[['medication_ingredient_rxcui', 'year']], how = 'inner', on = 'medication_ingredient_rxcui')
174+
dcp_dict['patient_count_ingredient'] = meps_rxcui_ingred_years[['medication_ingredient_name', 'medication_ingredient_rxcui', 'year', 'person_weight', 'DUPERSID']+groupby_demographic_variables].groupby(['medication_ingredient_name', 'medication_ingredient_rxcui', 'year', 'person_weight']+groupby_demographic_variables)['DUPERSID'].nunique()
166175
dcp_df = pd.DataFrame(dcp_dict['patient_count_ingredient']).reset_index()
167176
#2
168177
dcp_df['weighted_patient_count_ingredient'] = dcp_df['person_weight'].astype(float)*dcp_df['DUPERSID']
169178
#3
170-
dcp_dict['patients_by_demographics_ingredient'] = dcp_df.groupby(['medication_ingredient_name']+groupby_demographic_variables)['weighted_patient_count_ingredient'].sum()
179+
dcp_dict['patients_by_demographics_ingredient'] = dcp_df.groupby(['medication_ingredient_name', 'year']+groupby_demographic_variables)['weighted_patient_count_ingredient'].sum()
171180
dcp_demographic_df = pd.DataFrame(dcp_dict['patients_by_demographics_ingredient']).reset_index()
172181
#4
173182
if len(groupby_demographic_variables) > 0:
174-
dcp_demographictotal_df = pd.merge(dcp_demographic_df, dcp_demographic_df.groupby(groupby_demographic_variables)['weighted_patient_count_ingredient'].sum(), how = 'inner', left_on = groupby_demographic_variables, right_index=True, suffixes = ('_demographic', '_total'))
183+
dcp_demographictotal_df = pd.merge(dcp_demographic_df, dcp_demographic_df.groupby(groupby_demographic_variables+['year'])['weighted_patient_count_ingredient'].sum(), how = 'inner', left_on = groupby_demographic_variables+['year'], right_index=True, suffixes = ('_demographic', '_total'))
175184
else:
176-
dcp_demographictotal_df = dcp_demographic_df
177-
dcp_demographictotal_df['weighted_patient_count_ingredient_demographic'] = dcp_demographic_df['weighted_patient_count_ingredient']
178-
dcp_demographictotal_df['weighted_patient_count_ingredient_total'] = dcp_demographic_df['weighted_patient_count_ingredient'].sum()
185+
# dcp_demographictotal_df = dcp_demographic_df
186+
# dcp_demographictotal_df['weighted_patient_count_ingredient_demographic'] = dcp_demographic_df['weighted_patient_count_ingredient']
187+
# dcp_demographictotal_df['weighted_patient_count_ingredient_total'] = dcp_demographic_df['weighted_patient_count_ingredient'].sum()
188+
dcp_demographictotal_df = pd.merge(dcp_demographic_df, dcp_demographic_df.groupby('year')['weighted_patient_count_ingredient'].sum(), how = 'inner', left_on = 'year', right_index=True, suffixes = ('_demographic', '_total'))
179189
#5
180190
dcp_demographictotal_df['percent_ingredient_patients'] = round(dcp_demographictotal_df['weighted_patient_count_ingredient_demographic']/dcp_demographictotal_df['weighted_patient_count_ingredient_total'], 3)
181191
#6 TODO: change this column to medication_product_state_name(?)
@@ -199,9 +209,9 @@ def generate_module(rxcui_ndc_df, rxclass_name):
199209
#7
200210
dcp_dict['percent_ingredient_patients'] = dcp_demographictotal_df
201211
if len(groupby_demographic_variables) > 0:
202-
dcp_dict['percent_ingredient_patients'] = dcp_dict['percent_ingredient_patients'].reset_index().pivot(index= groupby_demographic_variables, columns = 'medication_ingredient_name', values='percent_ingredient_patients').reset_index()
212+
dcp_dict['percent_ingredient_patients'] = dcp_dict['percent_ingredient_patients'].reset_index().pivot(index= groupby_demographic_variables+['year'], columns = 'medication_ingredient_name', values='percent_ingredient_patients').reset_index()
203213
else:
204-
dcp_dict['percent_ingredient_patients'] = dcp_dict['percent_ingredient_patients'][['medication_ingredient_name', 'percent_ingredient_patients']].set_index('medication_ingredient_name').T
214+
dcp_dict['percent_ingredient_patients'] = dcp_dict['percent_ingredient_patients'][['medication_ingredient_name', 'percent_ingredient_patients', 'year']].set_index('medication_ingredient_name').T
205215

206216
#Fill NULLs and save as CSV
207217
dcp_dict['percent_ingredient_patients'].fillna(0, inplace=True)
@@ -216,17 +226,19 @@ def generate_module(rxcui_ndc_df, rxclass_name):
216226
for ingred_name in medication_ingredient_list:
217227
filename = rxclass_name + '_product_' + ingred_name + '_distrib'
218228
#0
219-
meps_rxcui_ingred = meps_rxcui[meps_rxcui['medication_ingredient_name']==ingred_name][['medication_product_name', 'medication_product_rxcui', 'medication_ingredient_name', 'medication_ingredient_rxcui', 'person_weight', 'DUPERSID']+groupby_demographic_variables]
229+
#Join MEPS to product_rxcui_years dataframe (rxcuis_by_fda_marketingdates)
230+
meps_rxcui_prod_years = meps_rxcui.astype(str).merge(product_rxcui_years.astype(str)[['medication_product_rxcui', 'year']], how = 'inner', on = 'medication_product_rxcui')
231+
meps_rxcui_ingred = meps_rxcui_prod_years[meps_rxcui_prod_years['medication_ingredient_name']==ingred_name][['medication_product_name', 'medication_product_rxcui', 'medication_ingredient_name', 'medication_ingredient_rxcui', 'year', 'person_weight', 'DUPERSID']+groupby_demographic_variables]
220232
#1
221-
dcp_dict['patient_count_product'] = meps_rxcui_ingred.groupby(['medication_product_name', 'medication_product_rxcui', 'medication_ingredient_name', 'medication_ingredient_rxcui', 'person_weight']+groupby_demographic_variables)['DUPERSID'].nunique()
233+
dcp_dict['patient_count_product'] = meps_rxcui_ingred.groupby(['medication_product_name', 'medication_product_rxcui', 'medication_ingredient_name', 'medication_ingredient_rxcui', 'year', 'person_weight']+groupby_demographic_variables)['DUPERSID'].nunique()
222234
dcp_df = pd.DataFrame(dcp_dict['patient_count_product']).reset_index()
223235
#2
224236
dcp_df['weighted_patient_count_product'] = dcp_df['person_weight'].astype(float)*dcp_df['DUPERSID']
225237
#3
226-
dcp_dict['patients_by_demographics_product'] = dcp_df.groupby(['medication_product_name', 'medication_ingredient_name']+groupby_demographic_variables)['weighted_patient_count_product'].sum()
238+
dcp_dict['patients_by_demographics_product'] = dcp_df.groupby(['medication_product_name', 'medication_ingredient_name', 'year']+groupby_demographic_variables)['weighted_patient_count_product'].sum()
227239
dcp_demographic_df = pd.DataFrame(dcp_dict['patients_by_demographics_product']).reset_index()
228240
#4
229-
dcp_demographictotal_df = pd.merge(dcp_demographic_df, dcp_demographic_df.groupby(['medication_ingredient_name']+groupby_demographic_variables)['weighted_patient_count_product'].sum(), how = 'inner', left_on = ['medication_ingredient_name']+groupby_demographic_variables, right_index=True, suffixes = ('_demographic', '_total'))
241+
dcp_demographictotal_df = pd.merge(dcp_demographic_df, dcp_demographic_df.groupby(['medication_ingredient_name', 'year']+groupby_demographic_variables)['weighted_patient_count_product'].sum(), how = 'inner', left_on = ['medication_ingredient_name', 'year']+groupby_demographic_variables, right_index=True, suffixes = ('_demographic', '_total'))
230242
#5
231243
dcp_demographictotal_df['percent_product_patients'] = round(dcp_demographictotal_df['weighted_patient_count_product_demographic']/dcp_demographictotal_df['weighted_patient_count_product_total'], 3)
232244
#6 TODO: change this column to medication_product_state_name or medication_product_transition_name(?)
@@ -250,9 +262,9 @@ def generate_module(rxcui_ndc_df, rxclass_name):
250262
#7
251263
dcp_dict['percent_product_patients'] = dcp_demographictotal_df
252264
if len(groupby_demographic_variables) > 0:
253-
dcp_dict['percent_product_patients'] = dcp_dict['percent_product_patients'].reset_index().pivot(index= groupby_demographic_variables, columns = 'medication_product_name', values='percent_product_patients').reset_index()
265+
dcp_dict['percent_product_patients'] = dcp_dict['percent_product_patients'].reset_index().pivot(index= groupby_demographic_variables+['year'], columns = 'medication_product_name', values='percent_product_patients').reset_index()
254266
else:
255-
dcp_dict['percent_product_patients'] = dcp_dict['percent_product_patients'][['medication_product_name', 'percent_product_patients']].set_index('medication_product_name').T
267+
dcp_dict['percent_product_patients'] = dcp_dict['percent_product_patients'][['medication_product_name', 'percent_product_patients', 'year']].set_index('medication_product_name').T
256268

257269
#Fill NULLs and save as CSV
258270
dcp_dict['percent_product_patients'].fillna(0, inplace=True)

0 commit comments

Comments
 (0)