# Source code for tabular_trees.xgboost.xgboost_tabular_trees

"""XGBoost trees in tabular format."""

import json
from dataclasses import dataclass, field

import numpy as np
import pandas as pd
import xgboost as xgb
from numpy.typing import NDArray

from .. import checks
from ..trees import BaseModelTabularTrees, TabularTrees, export_tree_data


def xgboost_get_root_node_given_tree(tree: int) -> str:
    """Build the ID of the root node for the given tree.

    Parameters
    ----------
    tree : int
        Tree number.

    Returns
    -------
    str
        Root node ID in the form '<tree>-0'.

    """
    root_node_number = 0
    return "{}-{}".format(tree, root_node_number)


@dataclass
class XGBoostTabularTrees(BaseModelTabularTrees):
    """Class to hold the XGBoost trees in tabular format.

    The preferred way to create XGBoostTabularTrees objects is with the from_booster
    method.

    """

    data: pd.DataFrame
    """Tree data."""

    Tree: NDArray[np.int_] = field(init=False, repr=False)
    """Tree number."""

    Node: NDArray[np.int_] = field(init=False, repr=False)
    """Node number."""

    ID: NDArray[np.object_] = field(init=False, repr=False)
    """Id for each node combining tree and node numbers."""

    Feature: NDArray[np.object_] = field(init=False, repr=False)
    """The name of the feature split on.

    Null for leaf nodes.

    """

    Split: NDArray[np.float64] = field(init=False, repr=False)
    """The split point for a node.

    Null for leaf nodes.

    """

    Yes: NDArray[np.object_] = field(init=False, repr=False)
    """Left child node.

    Null for leaf nodes.

    """

    No: NDArray[np.object_] = field(init=False, repr=False)
    """Right child node.

    Null for leaf nodes.

    """

    Missing: NDArray[np.object_] = field(init=False, repr=False)
    """Child node for rows with null values in the split feature."""

    Gain: NDArray[np.float64] = field(init=False, repr=False)
    """Gain for a given split."""

    Cover: NDArray[np.float64] = field(init=False, repr=False)
    """Cover of the node; related to the 2nd order derivative of the loss function."""

    Category: NDArray[np.float64] = field(init=False, repr=False)
    """Category column of the tree data.

    Presumably the categories used by categorical splits - confirm against the
    xgboost version in use.

    """

    G: NDArray[np.float64] = field(init=False, repr=False)
    """Used in the calculation of internal node predictions."""

    H: NDArray[np.float64] = field(init=False, repr=False)
    """Cover."""

    weight: NDArray[np.float64] = field(init=False, repr=False)
    """Node prediction."""

    @classmethod
    def from_booster(cls, booster: xgb.Booster) -> "XGBoostTabularTrees":
        """Create XGBoostTabularTrees from a xgb.Booster object.

        Parameters
        ----------
        booster : xgb.Booster
            XGBoost model to pull tree data from.

        Returns
        -------
        trees : XGBoostTabularTrees
            Model trees in tabular format. When called on a subclass, an instance
            of that subclass is returned.

        Raises
        ------
        ValueError
            If the alpha parameter of the booster is not 0.

        Examples
        --------
        >>> import xgboost as xgb
        >>> from sklearn.datasets import load_diabetes
        >>> from tabular_trees import XGBoostTabularTrees
        >>> # get data in DMatrix
        >>> diabetes = load_diabetes()
        >>> data = xgb.DMatrix(diabetes["data"], label=diabetes["target"])
        >>> # build model
        >>> params = {"max_depth": 3, "verbosity": 0}
        >>> model = xgb.train(params, dtrain=data, num_boost_round=10)
        >>> # export to XGBoostTabularTrees
        >>> xgboost_tabular_trees = XGBoostTabularTrees.from_booster(model)
        >>> type(xgboost_tabular_trees)
        <class 'tabular_trees.xgboost.xgboost_tabular_trees.XGBoostTabularTrees'>

        """
        checks.check_type(booster, xgb.Booster, "booster")

        model_config = json.loads(booster.save_config())
        train_params = model_config["learner"]["gradient_booster"]["tree_train_param"]

        model_alpha = float(train_params["alpha"])
        model_lambda = float(train_params["lambda"])

        # the internal node prediction formula used in derive_predictions only
        # accounts for lambda, so models with a non-zero alpha are rejected
        if model_alpha != 0:
            raise ValueError("Only Booster objects with alpha = 0 are supported.")

        tree_data = booster.trees_to_dataframe()

        tree_data_with_predictions = cls.derive_predictions(
            df=tree_data, lambda_=model_lambda
        )

        # use cls rather than the hard-coded class name so subclasses get
        # instances of their own type
        return cls(data=tree_data_with_predictions)

    def to_tabular_trees(self) -> TabularTrees:
        """Convert the tree data to a TabularTrees object.

        Returns
        -------
        trees : TabularTrees
            Model trees in TabularTrees form.

        """
        trees = self.data.copy()

        # derive leaf node flag
        trees["leaf"] = (trees["Feature"] == "Leaf").astype(int)

        column_mapping = {
            "Tree": "tree",
            "ID": "node",
            "Yes": "left_child",
            "No": "right_child",
            "Missing": "missing",
            "Feature": "feature",
            "Split": "split_condition",
            "weight": "prediction",
            "leaf": "leaf",
            "Cover": "count",
        }

        # select with an explicit list of column names rather than a dict view
        tree_data_converted = trees[list(column_mapping)].rename(
            columns=column_mapping
        )

        return TabularTrees(
            trees=tree_data_converted,
            get_root_node_given_tree=xgboost_get_root_node_given_tree,
        )

    @staticmethod
    def derive_predictions(df: pd.DataFrame, lambda_: float) -> pd.DataFrame:
        """Derive predictions for internal nodes in trees.

        Predictions will be available in 'weight' column in the output.

        Parameters
        ----------
        df : pd.DataFrame
            Tree data. The 'Tree', 'Node', 'ID', 'Feature', 'Yes', 'No', 'Gain'
            and 'Cover' columns are used. The input is not modified.

        lambda_ : float
            Value added to H when converting between G and weight.

        Returns
        -------
        trees : pd.DataFrame
            Copy of the tree data with 'weight', 'H' and 'G' columns added, with
            rows grouped by tree in ascending tree order.

        """
        # work on a copy so the caller's DataFrame is left untouched
        df = df.copy()

        # identify leaf nodes
        leaf_nodes = df["Feature"] == "Leaf"

        df["H"] = df["Cover"]
        df["G"] = 0.0

        # column to hold predictions, for leaf nodes these are in the Gain column
        df["weight"] = 0.0
        df.loc[leaf_nodes, "weight"] = df.loc[leaf_nodes, "Gain"]

        df.loc[leaf_nodes, "G"] = -df.loc[leaf_nodes, "weight"] * (
            df.loc[leaf_nodes, "H"] + lambda_
        )

        # propagate G up from the leaf nodes to internal nodes, for each tree
        # grouping on the tree number avoids assuming the numbers are contiguous
        df_g_list = [
            XGBoostTabularTrees._derive_internal_node_g(tree_df)
            for _, tree_df in df.groupby("Tree", sort=True)
        ]

        # nothing to combine if there are no trees in the data
        if not df_g_list:
            return df

        # append all updated trees
        df_g = pd.concat(df_g_list, axis=0)

        # recompute the mask on the combined data so the update below does not
        # depend on index alignment with the input
        internal_nodes = df_g["Feature"] != "Leaf"

        # update weight values for internal nodes
        df_g.loc[internal_nodes, "weight"] = -df_g.loc[internal_nodes, "G"] / (
            df_g.loc[internal_nodes, "H"] + lambda_
        )

        return df_g

    @staticmethod
    def _derive_internal_node_g(tree_df: pd.DataFrame) -> pd.DataFrame:
        """Derive G values for internal nodes in a single tree.

        This involves starting at each leaf node in the tree and propagating
        the G value back up through the tree, adding this leaf node G to each
        node that is travelled to.

        Parameters
        ----------
        tree_df : pd.DataFrame
            Rows corresponding to a single tree, from derive_predictions.

        Returns
        -------
        pd.DataFrame
            Updated copy of tree_df with G propagated up the tree s.t. each
            internal node's G value is the sum of G for its child nodes.

        Raises
        ------
        ValueError
            If a node has more than one parent, or a node with a node number
            greater than 0 has no parent.

        """
        tree_df = tree_df.copy()

        node_ids = tree_df["ID"].to_numpy()
        node_numbers = tree_df["Node"].to_numpy()
        g_values = tree_df["G"].to_numpy(dtype=float, copy=True)

        # map each child node ID to the row position of its parent once, rather
        # than scanning the whole tree at every step of every upwards walk
        parent_position = {}
        for position, children in enumerate(zip(tree_df["Yes"], tree_df["No"])):
            for child_id in children:
                # leaf nodes have null children
                if pd.isna(child_id):
                    continue
                if parent_position.setdefault(child_id, position) != position:
                    raise ValueError(
                        f"Node {child_id!r} has more than one parent node."
                    )

        leaf_positions = np.flatnonzero((tree_df["Feature"] == "Leaf").to_numpy())

        # loop through each leaf node, in row order
        # indexing with an array of positions takes a snapshot of the leaf G values
        for leaf_position, leaf_g in zip(leaf_positions, g_values[leaf_positions]):
            position = leaf_position

            # if the current node is not also the first node in the tree
            # traverse the tree from bottom to top and propagate the G value
            # upwards
            while node_numbers[position] > 0:
                node_id = node_ids[position]
                if node_id not in parent_position:
                    raise ValueError(f"No parent node found for node {node_id!r}.")

                # move to the parent node and add the leaf G to it
                position = parent_position[node_id]
                g_values[position] += leaf_g

        tree_df["G"] = g_values

        return tree_df


@export_tree_data.register(xgb.Booster)
def _export_tree_data__xgb_booster(model: xgb.Booster) -> XGBoostTabularTrees:
    """Export tree data from a Booster object.

    Registered as the export_tree_data implementation for xgb.Booster.

    Parameters
    ----------
    model : Booster
        XGBoost booster to export tree data from.

    Returns
    -------
    XGBoostTabularTrees
        Tree data created with XGBoostTabularTrees.from_booster.

    """
    # validate the input type before handing over to the class constructor
    checks.check_type(model, xgb.Booster, "model")

    tabular_trees = XGBoostTabularTrees.from_booster(model)
    return tabular_trees