From 375b1b2db54f48a0c8ad30f32427d7dcdea130ba Mon Sep 17 00:00:00 2001 From: Tristan Looden <46874984+tlooden@users.noreply.github.com> Date: Wed, 7 Aug 2019 14:30:24 +0200 Subject: [PATCH 1/4] included option for matching without replacement. --- pymatch/Matcher.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pymatch/Matcher.py b/pymatch/Matcher.py index c7020bf..2f00b1e 100644 --- a/pymatch/Matcher.py +++ b/pymatch/Matcher.py @@ -141,7 +141,7 @@ def predict_scores(self): scores += m.predict(self.X[m.params.index]) self.data['scores'] = scores/self.nmodels - def match(self, threshold=0.001, nmatches=1, method='min', max_rand=10): + def match(self, threshold=0.001, nmatches=1, method='min', max_rand=10, matchtype='replacement'): """ Finds suitable match(es) for each record in the minority dataset, if one exists. Records are exlcuded from the final @@ -165,11 +165,19 @@ def match(self, threshold=0.001, nmatches=1, method='min', max_rand=10): "min" - choose the profile with the closest score max_rand : int max number of profiles to consider when using random tie-breaks + matchtype : str + "replacement" - matching is performed with replacement, in the + majority group. The same entry from the majority group can be + matched to multiple entries from the minority group + "no_replacement" - matching is performed without replacement, in + the majority group. All matches consist of unique entries. + Matching order is randomized. Returns ------- None """ + if 'scores' not in self.data.columns: print("Propensity Scores have not been calculated. 
Using defaults...") self.fit_scores() @@ -177,6 +185,8 @@ def match(self, threshold=0.001, nmatches=1, method='min', max_rand=10): test_scores = self.data[self.data[self.yvar]==True][['scores']] ctrl_scores = self.data[self.data[self.yvar]==False][['scores']] result, match_ids = [], [] + if matchtype=='no_replacement': + test_scores=test_scores.reindex(np.random.permutation(test_scores.index)) for i in range(len(test_scores)): # uf.progress(i+1, len(test_scores), 'Matching Control to Test...') match_id = i @@ -195,6 +205,8 @@ def match(self, threshold=0.001, nmatches=1, method='min', max_rand=10): chosen = np.random.choice(matches.index, min(select, nmatches), replace=False) result.extend([test_scores.index[i]] + list(chosen)) match_ids.extend([i] * (len(chosen)+1)) + if matchtype=='no_replacement': + ctrl_scores['scores'].iloc[list(chosen-len(test_scores))]=999 self.matched_data = self.data.loc[result] self.matched_data['match_id'] = match_ids self.matched_data['record_id'] = self.matched_data.index From 791921555a9e296d5662312edca1f52450bcdca6 Mon Sep 17 00:00:00 2001 From: Tristan Looden <46874984+tlooden@users.noreply.github.com> Date: Wed, 7 Aug 2019 15:02:31 +0200 Subject: [PATCH 2/4] changed with_replacement argument to a bool --- pymatch/Matcher.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pymatch/Matcher.py b/pymatch/Matcher.py index 2f00b1e..3cfdf2a 100644 --- a/pymatch/Matcher.py +++ b/pymatch/Matcher.py @@ -141,7 +141,7 @@ def predict_scores(self): scores += m.predict(self.X[m.params.index]) self.data['scores'] = scores/self.nmodels - def match(self, threshold=0.001, nmatches=1, method='min', max_rand=10, matchtype='replacement'): + def match(self, threshold=0.001, nmatches=1, method='min', max_rand=10, with_replacement=True): """ Finds suitable match(es) for each record in the minority dataset, if one exists. 
Records are exlcuded from the final @@ -165,11 +165,11 @@ def match(self, threshold=0.001, nmatches=1, method='min', max_rand=10, matchtyp "min" - choose the profile with the closest score max_rand : int max number of profiles to consider when using random tie-breaks - matchtype : str - "replacement" - matching is performed with replacement, in the + with_replacement : bool + True - matching is performed with replacement, in the majority group. The same entry from the majority group can be matched to multiple entries from the minority group - "no_replacement" - matching is performed without replacement, in + False - matching is performed without replacement, in the majority group. All matches consist of unique entries. Matching order is randomized. From 0e13f0edad2bd627ba77da21b7fc3d9002ad674f Mon Sep 17 00:00:00 2001 From: Tristan Looden <46874984+tlooden@users.noreply.github.com> Date: Mon, 21 Oct 2019 20:05:42 +0200 Subject: [PATCH 3/4] Fixed inconsistent variable name between match function parameters and body changed 'matchtype' to 'with_replacement' in matcher function body --- pymatch/Matcher.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pymatch/Matcher.py b/pymatch/Matcher.py index 3cfdf2a..cdb2b9a 100644 --- a/pymatch/Matcher.py +++ b/pymatch/Matcher.py @@ -185,7 +185,7 @@ def match(self, threshold=0.001, nmatches=1, method='min', max_rand=10, with_rep test_scores = self.data[self.data[self.yvar]==True][['scores']] ctrl_scores = self.data[self.data[self.yvar]==False][['scores']] result, match_ids = [], [] - if matchtype=='no_replacement': + if with_replacement==False: test_scores=test_scores.reindex(np.random.permutation(test_scores.index)) for i in range(len(test_scores)): # uf.progress(i+1, len(test_scores), 'Matching Control to Test...') @@ -205,7 +205,7 @@ def match(self, threshold=0.001, nmatches=1, method='min', max_rand=10, with_rep chosen = np.random.choice(matches.index, min(select, nmatches), replace=False) 
result.extend([test_scores.index[i]] + list(chosen)) match_ids.extend([i] * (len(chosen)+1)) - if matchtype=='no_replacement': + if with_replacement==False: ctrl_scores['scores'].iloc[list(chosen-len(test_scores))]=999 self.matched_data = self.data.loc[result] self.matched_data['match_id'] = match_ids From bccbf407cc29acd52d7434f0f575776108182ee8 Mon Sep 17 00:00:00 2001 From: Tristan Looden <46874984+tlooden@users.noreply.github.com> Date: Mon, 21 Oct 2019 22:46:18 +0200 Subject: [PATCH 4/4] 'threshold' parameter used by method='min' Before, the 'threshold' parameter was only referenced for method='random'. --- pymatch/Matcher.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pymatch/Matcher.py b/pymatch/Matcher.py index cdb2b9a..0dbb571 100644 --- a/pymatch/Matcher.py +++ b/pymatch/Matcher.py @@ -191,13 +191,10 @@ def match(self, threshold=0.001, nmatches=1, method='min', max_rand=10, with_rep # uf.progress(i+1, len(test_scores), 'Matching Control to Test...') match_id = i score = test_scores.iloc[i] - if method == 'random': - bool_match = abs(ctrl_scores - score) <= threshold - matches = ctrl_scores.loc[bool_match[bool_match.scores].index] - elif method == 'min': - matches = abs(ctrl_scores - score).sort_values('scores').head(nmatches) - else: - raise(AssertionError, "Invalid method parameter, use ('random', 'min')") + + bool_match = abs(ctrl_scores - score) <= threshold + matches = ctrl_scores.loc[bool_match[bool_match.scores].index] + if len(matches) == 0: continue # randomly choose nmatches indices, if len(matches) > nmatches