From 442874406be7266a1af56898f4dcb1db8ebe1d33 Mon Sep 17 00:00:00 2001 From: RektPunk Date: Sun, 21 Dec 2025 16:45:10 +0900 Subject: [PATCH 1/6] faster get leafs --- xbooster/xgb_constructor.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/xbooster/xgb_constructor.py b/xbooster/xgb_constructor.py index 0376ec9..bd8f087 100644 --- a/xbooster/xgb_constructor.py +++ b/xbooster/xgb_constructor.py @@ -202,15 +202,14 @@ def get_leafs( # Predict leaf index tree_leaf_idx = self.booster_.predict(xgb_features, pred_leaf=True) return pd.DataFrame(tree_leaf_idx, columns=_colnames) - - df_leafs = pd.DataFrame() + tree_results = [] for i in range(n_rounds): - # Predict margin tree_leafs = ( self.booster_.predict(xgb_features, iteration_range=(i, i + 1), output_margin=True) - scores ) - df_leafs[f"tree_{i}"] = tree_leafs.flatten() + tree_results.append(tree_leafs.flatten()) + df_leafs = pd.DataFrame(np.column_stack(tree_results), index=X.index, columns=_colnames) return df_leafs def extract_leaf_weights(self) -> pd.DataFrame: From ce6cef2fc702437ceb4a6660517ea75a61c937cc Mon Sep 17 00:00:00 2001 From: RektPunk Date: Sun, 21 Dec 2025 17:00:40 +0900 Subject: [PATCH 2/6] construct scorecard optimize --- xbooster/xgb_constructor.py | 56 ++++++++++--------------------------- 1 file changed, 15 insertions(+), 41 deletions(-) diff --git a/xbooster/xgb_constructor.py b/xbooster/xgb_constructor.py index bd8f087..f659cbf 100644 --- a/xbooster/xgb_constructor.py +++ b/xbooster/xgb_constructor.py @@ -334,7 +334,6 @@ def construct_scorecard(self) -> pd.DataFrame: # pylint: disable=R0914 n_rounds = self.booster_.num_boosted_rounds() labels = xgb_features_and_labels.get_label() - df_binning_table = pd.DataFrame() # TODO: Refactor this part to re-use the get_leafs method in the future # Summing margins from a booster, adopted from here: # https://xgboost.readthedocs.io/en/latest/python/examples/individual_trees.html @@ -345,52 +344,27 @@ def construct_scorecard(self) -> pd.DataFrame: # pylint: disable=R0914 f"Invalid leaf index shape {tree_leaf_idx.shape}. Expected {(len(labels), n_rounds)}" ) - for i in range(n_rounds): - # Get counts of events and non-events - index_and_label = pd.concat( - [ - pd.Series(tree_leaf_idx[:, i], name="leaf_idx"), - pd.Series(labels, name="label"), - ], - axis=1, - ) - # Create a binning table - binning_table = ( - index_and_label.groupby("leaf_idx").agg(["sum", "count"]).reset_index() - ).astype(float) - binning_table.columns = ["leaf_idx", "Events", "Count"] # type: ignore - binning_table["tree"] = i - binning_table["NonEvents"] = binning_table["Count"] - binning_table["Events"] - binning_table["EventRate"] = binning_table["Events"] / binning_table["Count"] - binning_table = binning_table[ - ["tree", "leaf_idx", "Events", "NonEvents", "Count", "EventRate"] - ] - # Aggregate indices, leafs, and counts of events and non-events - df_binning_table = pd.concat([df_binning_table, binning_table], axis=0) + tree_leaf_idx_long = pd.DataFrame(tree_leaf_idx).melt(var_name="Tree", value_name="Node") + tree_leaf_idx_long["label"] = np.tile(labels, tree_leaf_idx.shape[1]) + binning_table = ( + tree_leaf_idx_long.groupby(["Tree", "Node"])["label"] + .agg(["sum", "count"]) + .reset_index() + ) + binning_table.columns = ["Tree", "Node", "Events", "Count"] + df_binning_table = binning_table.assign( + NonEvents=lambda df: df["Count"] - df["Events"], + EventRate=lambda df: df["Events"] / df["Count"], + )[["Tree", "Node", "Events", "NonEvents", "Count", "EventRate"]] + # Extract leaf weights (XAddEvidence) df_x_add_evidence = self.extract_leaf_weights() self.xgb_scorecard = df_x_add_evidence.merge( df_binning_table, - left_on=["Tree", "Node"], - right_on=["tree", "leaf_idx"], + on=["Tree", "Node"], how="left", - ).drop(["tree", "leaf_idx"], axis=1) - - self.xgb_scorecard = self.xgb_scorecard[ - [ - "Tree", - "Node", - "Feature", - "Sign", - "Split", - "Count", - "NonEvents", - "Events", - "EventRate", - "XAddEvidence", - ] - ] + ) # Sort by Tree and Node self.xgb_scorecard = self.xgb_scorecard.sort_values(by=["Tree", "Node"]).reset_index( From 0e23ed525f771823a7987d36e5bf126696e0697d Mon Sep 17 00:00:00 2001 From: RektPunk Date: Sun, 21 Dec 2025 17:20:11 +0900 Subject: [PATCH 3/6] _convert_tree_to_points optimize --- xbooster/xgb_constructor.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/xbooster/xgb_constructor.py b/xbooster/xgb_constructor.py index f659cbf..5c06178 100644 --- a/xbooster/xgb_constructor.py +++ b/xbooster/xgb_constructor.py @@ -540,22 +540,23 @@ def _convert_tree_to_points(self, X): # pylint: disable=C0103 """ X_leaf_weights = self.get_leafs(X, output_type="leaf_index") # pylint: disable=C0103 - result = pd.DataFrame() - for col in X_leaf_weights.columns: - tree_number = col.split("_")[1] - if self.xgb_scorecard_with_points is not None: - subset_points_df = self.xgb_scorecard_with_points[ - self.xgb_scorecard_with_points["Tree"] == int(tree_number) - ].copy() - merged_df = pd.merge( - X_leaf_weights[[col]].round(4), - subset_points_df[["Node", "Points"]], - left_on=col, - right_on="Node", - how="left", - ) - result[f"Score_{tree_number}"] = merged_df["Points"] - result = pd.concat([result, result.sum(axis=1).rename("Score")], axis=1) + n_samples, n_rounds = X_leaf_weights.shape + points_matrix = np.zeros((n_samples, n_rounds)) + leaf_idx_values = X_leaf_weights.values + for t in range(n_rounds): + # Get points for this tree + tree_points = self.xgb_scorecard_with_points[ + self.xgb_scorecard_with_points["Tree"] == t + ] + # Mapping dictionary instead of merge + mapping_dict = dict(zip(tree_points["Node"], tree_points["Points"])) + points_matrix[:, t] = np.vectorize(mapping_dict.get)(leaf_idx_values[:, t]) + + result = pd.DataFrame( + points_matrix, index=X.index, columns=[f"Score_{i}" for i in range(n_rounds)] + ) + # Add total score + result["Score"] = points_matrix.sum(axis=1) return result def predict_score(self, X: pd.DataFrame) -> pd.Series: # pylint: disable=C0103 From 0faf07ec7e079459ffb679ebfb13b0d664a92041 Mon Sep 17 00:00:00 2001 From: RektPunk Date: Sun, 21 Dec 2025 17:32:36 +0900 Subject: [PATCH 4/6] set count as integer in test --- tests/test_xgb_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_xgb_regression.py b/tests/test_xgb_regression.py index f6c268f..e4652dd 100644 --- a/tests/test_xgb_regression.py +++ b/tests/test_xgb_regression.py @@ -126,7 +126,7 @@ def test_construct_scorecard_output_structure(self, sample_data, trained_model): # Verify data types assert pd.api.types.is_integer_dtype(scorecard["Tree"]) assert pd.api.types.is_integer_dtype(scorecard["Node"]) - assert pd.api.types.is_float_dtype(scorecard["Count"]) + assert pd.api.types.is_integer_dtype(scorecard["Count"]) assert pd.api.types.is_float_dtype(scorecard["EventRate"]) def test_construct_scorecard_statistical_properties(self, sample_data, trained_model): From 7610a0fd57be935dce97f5a1be40cd0d4566418b Mon Sep 17 00:00:00 2001 From: RektPunk Date: Sun, 21 Dec 2025 22:28:19 +0900 Subject: [PATCH 5/6] raise value error when scorecard is None --- xbooster/xgb_constructor.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/xbooster/xgb_constructor.py b/xbooster/xgb_constructor.py index 5c06178..0d592a5 100644 --- a/xbooster/xgb_constructor.py +++ b/xbooster/xgb_constructor.py @@ -539,6 +539,10 @@ def _convert_tree_to_points(self, X): # pylint: disable=C0103 pd.DataFrame: The DataFrame containing scores per tree and the total score. """ + if self.xgb_scorecard_with_points is None: + raise ValueError( + "No scorecard with points has been created yet. Call create_points() first." + ) X_leaf_weights = self.get_leafs(X, output_type="leaf_index") # pylint: disable=C0103 n_samples, n_rounds = X_leaf_weights.shape points_matrix = np.zeros((n_samples, n_rounds)) From 87e80933d9a3d6d73a72e85c41c3250b1cdfb874 Mon Sep 17 00:00:00 2001 From: RektPunk Date: Sun, 21 Dec 2025 22:56:33 +0900 Subject: [PATCH 6/6] use map instead of np vectorize --- xbooster/xgb_constructor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xbooster/xgb_constructor.py b/xbooster/xgb_constructor.py index 0d592a5..3a42c73 100644 --- a/xbooster/xgb_constructor.py +++ b/xbooster/xgb_constructor.py @@ -554,7 +554,7 @@ def _convert_tree_to_points(self, X): # pylint: disable=C0103 ] # Mapping dictionary instead of merge mapping_dict = dict(zip(tree_points["Node"], tree_points["Points"])) - points_matrix[:, t] = np.vectorize(mapping_dict.get)(leaf_idx_values[:, t]) + points_matrix[:, t] = pd.Series(leaf_idx_values[:, t]).map(mapping_dict).to_numpy() result = pd.DataFrame( points_matrix, index=X.index, columns=[f"Score_{i}" for i in range(n_rounds)]