# Source code for tabular_trees.xgboost.xgboost_tabular_trees

"""XGBoost trees in tabular format."""

import json
from dataclasses import dataclass, field

import numpy as np
import pandas as pd
import xgboost as xgb
from numpy.typing import NDArray

from .. import checks
from ..trees import BaseModelTabularTrees, TabularTrees, export_tree_data


def xgboost_get_root_node_given_tree(tree: int) -> str:
    """Build the ID of the root node for the given tree number.

    The ID is the tree number followed by ``-0``, e.g. tree ``3`` gives
    ``"3-0"``.

    """
    root_node_id = "{}-0".format(tree)
    return root_node_id


@dataclass
class XGBoostTabularTrees(BaseModelTabularTrees):
    """Class to hold the XGBoost trees in tabular format.

    The preferred way to create XGBoostTabularTrees objects is with the
    from_booster method.

    """

    data: pd.DataFrame
    """Tree data."""

    Tree: NDArray[np.int_] = field(init=False, repr=False)
    """Tree number."""

    Node: NDArray[np.int_] = field(init=False, repr=False)
    """Node number."""

    ID: NDArray[np.object_] = field(init=False, repr=False)
    """Id for each node combining tree and node numbers."""

    Feature: NDArray[np.object_] = field(init=False, repr=False)
    """The name of the feature split on.

    Null for leaf nodes.

    """

    Split: NDArray[np.float64] = field(init=False, repr=False)
    """The split point for a node.

    Null for leaf nodes.

    """

    Yes: NDArray[np.object_] = field(init=False, repr=False)
    """Left child node.

    Null for leaf nodes.

    """

    No: NDArray[np.object_] = field(init=False, repr=False)
    """Right child node.

    Null for leaf nodes.

    """

    Missing: NDArray[np.object_] = field(init=False, repr=False)
    """Child node for rows with null values in the split feature."""

    Gain: NDArray[np.float64] = field(init=False, repr=False)
    """Gain for a given split."""

    Cover: NDArray[np.float64] = field(init=False, repr=False)
    """Related to the 2nd order derivative of the loss function for the rows
    reaching the node."""

    Category: NDArray[np.float64] = field(init=False, repr=False)
    """Category values for the split (the 'Category' column of the tree data)."""

    G: NDArray[np.float64] = field(init=False, repr=False)
    """Used in the calculation of internal node predictions."""

    H: NDArray[np.float64] = field(init=False, repr=False)
    """Copy of Cover, used in the calculation of node predictions."""

    weight: NDArray[np.float64] = field(init=False, repr=False)
    """Node prediction."""

    @classmethod
    def from_booster(cls, booster: xgb.Booster) -> "XGBoostTabularTrees":
        """Create XGBoostTabularTrees from a xgb.Booster object.

        Parameters
        ----------
        booster : xgb.Booster
            XGBoost model to pull tree data from.

        Returns
        -------
        trees : XGBoostTabularTrees
            Model trees in tabular format.

        Raises
        ------
        ValueError
            If the booster was trained with a non-zero alpha parameter.

        Examples
        --------
        >>> import xgboost as xgb
        >>> from sklearn.datasets import load_diabetes
        >>> from tabular_trees import XGBoostTabularTrees
        >>> # get data in DMatrix
        >>> diabetes = load_diabetes()
        >>> data = xgb.DMatrix(diabetes["data"], label=diabetes["target"])
        >>> # build model
        >>> params = {"max_depth": 3, "verbosity": 0}
        >>> model = xgb.train(params, dtrain=data, num_boost_round=10)
        >>> # export to XGBoostTabularTrees
        >>> xgboost_tabular_trees = XGBoostTabularTrees.from_booster(model)
        >>> type(xgboost_tabular_trees)
        <class 'tabular_trees.xgboost.xgboost_tabular_trees.XGBoostTabularTrees'>

        """
        checks.check_type(booster, xgb.Booster, "booster")

        model_config = json.loads(booster.save_config())
        train_params = model_config["learner"]["gradient_booster"]["tree_train_param"]

        model_alpha = float(train_params["alpha"])
        model_lambda = float(train_params["lambda"])

        # the internal node prediction formula used in derive_predictions only
        # accounts for lambda, so boosters trained with alpha are rejected
        if model_alpha != 0:
            raise ValueError("Only Booster objects with alpha = 0 are supported.")

        tree_data = booster.trees_to_dataframe()

        # go through cls rather than the hard-coded class name so that
        # subclasses get an instance of their own type back
        tree_data_with_predictions = cls.derive_predictions(
            df=tree_data, lambda_=model_lambda
        )

        return cls(data=tree_data_with_predictions)

    def to_tabular_trees(self) -> TabularTrees:
        """Convert the tree data to a TabularTrees object.

        Returns
        -------
        trees : TabularTrees
            Model trees in TabularTrees form.

        """
        trees = self.data.copy()

        # derive leaf node flag
        trees["leaf"] = (trees["Feature"] == "Leaf").astype(int)

        column_mapping = {
            "Tree": "tree",
            "ID": "node",
            "Yes": "left_child",
            "No": "right_child",
            "Missing": "missing",
            "Feature": "feature",
            "Split": "split_condition",
            "weight": "prediction",
            "leaf": "leaf",
            "Cover": "count",
        }

        # select with an explicit list; a dict_keys view is not a documented
        # DataFrame indexer
        tree_data_converted = trees[list(column_mapping)].rename(
            columns=column_mapping
        )

        return TabularTrees(
            trees=tree_data_converted,
            get_root_node_given_tree=xgboost_get_root_node_given_tree,
        )

    @staticmethod
    def derive_predictions(df: pd.DataFrame, lambda_: float) -> pd.DataFrame:
        """Derive predictions for internal nodes in trees.

        Predictions will be available in 'weight' column in the output. The
        input DataFrame is left unmodified.

        Parameters
        ----------
        df : pd.DataFrame
            Tree data; the 'Tree', 'ID', 'Feature', 'Yes', 'No', 'Gain' and
            'Cover' columns are read.

        lambda_ : float
            Value added to H in the node prediction formula. from_booster
            passes the booster's 'lambda' training parameter.

        Returns
        -------
        trees : pd.DataFrame
            Tree data with 'weight', 'H' and 'G' columns added.

        """
        # work on a copy so the caller's frame does not silently gain columns
        df = df.copy()

        # identify leaf and internal nodes
        leaf_nodes = df["Feature"] == "Leaf"
        internal_nodes = ~leaf_nodes

        df["H"] = df["Cover"]
        df["G"] = 0.0

        # column to hold predictions; for leaf nodes the prediction is stored
        # in the Gain column
        df["weight"] = 0.0
        df.loc[leaf_nodes, "weight"] = df.loc[leaf_nodes, "Gain"]

        # invert weight = -G / (H + lambda) to recover G for the leaf nodes
        df.loc[leaf_nodes, "G"] = -df.loc[leaf_nodes, "weight"] * (
            df.loc[leaf_nodes, "H"] + lambda_
        )

        # propagate G up from the leaf nodes to internal nodes, for each tree;
        # groupby splits the frame in one pass and yields trees in ascending
        # tree number order
        df_g_list = [
            XGBoostTabularTrees._derive_internal_node_g(tree_df)
            for _, tree_df in df.groupby("Tree", sort=True)
        ]

        # nothing to combine when there are no trees (pd.concat rejects an
        # empty list)
        if not df_g_list:
            return df

        # append all updated trees
        df_g = pd.concat(df_g_list, axis=0)

        # update weight values for internal nodes
        df_g.loc[internal_nodes, "weight"] = -df_g.loc[internal_nodes, "G"] / (
            df_g.loc[internal_nodes, "H"] + lambda_
        )

        return df_g

    @staticmethod
    def _derive_internal_node_g(tree_df: pd.DataFrame) -> pd.DataFrame:
        """Derive G values for internal nodes in a single tree.

        This involves starting at each leaf node in the tree and propagating
        the G value back up through the tree, adding this leaf node G to each
        node that is travelled to.

        Parameters
        ----------
        tree_df : pd.DataFrame
            Rows corresponding to a single tree, from derive_predictions.

        Returns
        -------
        pd.DataFrame
            Updated copy of tree_df with G propagated up the tree s.t. each
            internal node's G value is the sum of G for its child nodes.

        """
        tree_df = tree_df.copy()

        is_leaf = tree_df["Feature"] == "Leaf"
        internal_df = tree_df.loc[~is_leaf]
        leaf_df = tree_df.loc[is_leaf]

        # map each child ID to its parent ID once, so that every upward step
        # is a dict lookup instead of a boolean scan over the whole tree
        parent_of = {}
        for parent_id, yes_id, no_id in zip(
            internal_df["ID"], internal_df["Yes"], internal_df["No"]
        ):
            parent_of[yes_id] = parent_id
            parent_of[no_id] = parent_id

        # a tree without splits has nothing to propagate
        if not parent_of:
            return tree_df

        g_by_id = dict(zip(tree_df["ID"], tree_df["G"]))

        # loop through each leaf node, in row order, and add its G to every
        # ancestor; the walk stops at the node with no recorded parent, i.e.
        # the root of the tree
        for leaf_id, leaf_g in zip(leaf_df["ID"], leaf_df["G"]):
            node_id = parent_of.get(leaf_id)
            while node_id is not None:
                g_by_id[node_id] += leaf_g
                node_id = parent_of.get(node_id)

        tree_df["G"] = tree_df["ID"].map(g_by_id)

        return tree_df
@export_tree_data.register(xgb.Booster)
def _export_tree_data__xgb_booster(model: xgb.Booster) -> XGBoostTabularTrees:
    """Export tree data from Booster object.

    Registered as the export_tree_data implementation for xgb.Booster.

    Parameters
    ----------
    model : Booster
        XGBoost booster to export tree data from.

    Returns
    -------
    XGBoostTabularTrees
        Result of XGBoostTabularTrees.from_booster for the given model.

    """
    checks.check_type(model, xgb.Booster, "model")

    exported_trees = XGBoostTabularTrees.from_booster(model)
    return exported_trees