# Source code for tabular_trees.explain.prediction_decomposition

"""Module implementing prediction decomposition method."""

from dataclasses import dataclass, field
from typing import Callable

import numpy as np
import pandas as pd

from ..checks import check_condition, check_type
from ..trees import TabularTrees


@dataclass
class PredictionDecomposition:
    """Prediction decomposition results.

    Holds the per-feature summary of contributions alongside the node level
    detail the summary was built from. The ``nodes`` DataFrame is excluded from
    the dataclass ``repr``.
    """

    summary: pd.DataFrame
    """Prediction contribution for each feature."""

    nodes: pd.DataFrame = field(repr=False)
    """Node level prediction contributions for all trees."""

    # An explicit __init__ is kept (the dataclass decorator does not replace an
    # __init__ defined on the class) so the constructor carries its own docstring.
    def __init__(self, summary: pd.DataFrame, nodes: pd.DataFrame):
        """Initialise the PredictionDecomposition object.

        Parameters
        ----------
        summary : pd.DataFrame
            Prediction contribution for each feature.

        nodes : pd.DataFrame
            Node level prediction contributions for all trees.

        """
        self.summary = summary
        self.nodes = nodes


def decompose_prediction(
    tabular_trees: TabularTrees, row: pd.DataFrame
) -> PredictionDecomposition:
    """Decompose prediction from tree based model with Saabas method[1].

    This method attributes the change in prediction from moving to a lower node to the
    variable that was split on. This can then be summed over all splits in a tree and
    all trees in a model.

    The inputs are validated with ``check_type`` and ``check_condition`` before the
    decomposition is run: ``tabular_trees`` must be a ``TabularTrees`` object and
    ``row`` must be a ``pd.DataFrame`` containing exactly one row.

    Parameters
    ----------
    tabular_trees : TabularTrees
        Tree based model to explain prediction for.

    row : pd.DataFrame
        Single row of data to explain prediction from tabular_trees object.

    Returns
    -------
    results : PredictionDecomposition
        Prediction decomposed into change attributed to each feature.

    Notes
    -----
    [1] Saabas, Ando (2014) 'Interpreting random forests', Diving into data blog, 19
    October. Available at http://blog.datadive.net/interpreting-random-forests/
    (Accessed 26 February 2023).

    Examples
    --------
    >>> import xgboost as xgb
    >>> import pandas as pd
    >>> from sklearn.datasets import load_diabetes
    >>> from tabular_trees import export_tree_data
    >>> from tabular_trees import decompose_prediction
    >>> # get data in DMatrix
    >>> diabetes = load_diabetes()
    >>> data = xgb.DMatrix(
    ...     diabetes["data"],
    ...     label=diabetes["target"],
    ...     feature_names=diabetes["feature_names"]
    ... )
    >>> # build model
    >>> params = {"max_depth": 3, "verbosity": 0}
    >>> model = xgb.train(params, dtrain=data, num_boost_round=10)
    >>> # export to TabularTrees
    >>> xgboost_tabular_trees = export_tree_data(model)
    >>> tabular_trees = xgboost_tabular_trees.to_tabular_trees()
    >>> # get data to score
    >>> scoring_data = pd.DataFrame(diabetes["data"], columns=diabetes["feature_names"])
    >>> row_to_score = scoring_data.iloc[[0]]
    >>> # decompose prediction
    >>> results = decompose_prediction(tabular_trees, row=row_to_score)
    >>> type(results)
    <class 'tabular_trees.explain.prediction_decomposition.PredictionDecomposition'>

    """
    check_type(tabular_trees, TabularTrees, "tabular_trees")
    check_type(row, pd.DataFrame, "row")
    check_condition(row.shape[0] == 1, "row is a single pd.DataFrame row")

    # the root node lookup is passed as a callable so the traversal helpers only
    # need the tree data itself, not the whole TabularTrees object
    return _decompose_prediction(
        trees_df=tabular_trees.trees,
        row=row,
        calculate_root_node=tabular_trees.get_root_node_given_tree,
    )


def _decompose_prediction(
    trees_df: pd.DataFrame, row: pd.DataFrame, calculate_root_node: Callable
) -> PredictionDecomposition:
    """Decompose prediction from tree based model with Saabas method.

    Every tree index from 0 up to the largest value in the ``tree`` column is
    processed in turn: the row is sent down that tree, the change in prediction
    at each step of the path is calculated, and the per-tree results are then
    combined into a single PredictionDecomposition.

    Parameters
    ----------
    trees_df : pd.DataFrame
        Tree data from TabularTrees object.

    row : pd.DataFrame
        Single row of data to explain prediction.

    calculate_root_node : callable
        Function that can return the root node id when passed tree index.

    Returns
    -------
    PredictionDecomposition
        Combined decomposition results over all trees.

    """
    largest_tree_index = trees_df.tree.max()

    per_tree_decompositions = [
        _calculate_change_in_node_predictions(
            path=_find_path_to_leaf_node(
                tree_df=trees_df.loc[trees_df.tree == tree_index],
                row=row,
                calculate_root_node=calculate_root_node,
            )
        )
        for tree_index in range(largest_tree_index + 1)
    ]

    return _format_prediction_decomposition_results(per_tree_decompositions)


def _find_path_to_leaf_node(
    tree_df: pd.DataFrame, row: pd.DataFrame, calculate_root_node: Callable
) -> pd.DataFrame:
    """Traverse tree down to leaf for given row of data.

    Parameters
    ----------
    tree_df : pd.DataFrame
        Subset of tree data for a single tree.

    row : pd.DataFrame
        Single row of data (observation) to send through tree to leaf node.

    calculate_root_node : callable
        Function that can return the root node id when passed tree index.

    Returns
    -------
    pd.DataFrame
        DataFrame where each successive row shows the path of row through the tree.

    """
    # get column headers no rows
    path = tree_df.loc[tree_df["node"] == -1]

    root_node_index_for_tree = calculate_root_node(tree_df["tree"].values[0])

    # get the first node in the tree
    current_node = tree_df.loc[tree_df["node"] == root_node_index_for_tree].copy()

    # for internal nodes record the value of the variable that will be used to split
    if current_node["leaf"].item() != 1:
        current_node["value"] = row[current_node["feature"]].values[0]

    else:
        current_node["value"] = np.nan

    path = pd.concat([path, current_node], axis=0)

    # as long as we are not at a leaf node already
    if current_node["leaf"].item() != 1:
        # determine if the value of the split variable sends the row left
        # (yes) or right (no)
        if (
            row[current_node["feature"]].values[0]
            < current_node["split_condition"].values[0]
        ):
            next_node = current_node["left_child"].item()

        else:
            next_node = current_node["right_child"].item()

        # (loop) traverse the tree until a leaf node is reached
        while True:
            current_node = tree_df.loc[tree_df["node"] == next_node].copy()

            # for internal nodes record the value of the variable that will be
            # used to split
            if current_node["leaf"].item() != 1:
                current_node["value"] = row[current_node["feature"]].values[0]

            path = pd.concat([path, current_node], axis=0)

            if current_node["leaf"].item() != 1:
                # determine if the value of the split variable sends the row left
                # (yes) or right (no)
                if (
                    row[current_node["feature"]].values[0]
                    < current_node["split_condition"].values[0]
                ):
                    next_node = current_node["left_child"].item()

                else:
                    next_node = current_node["right_child"].item()

            else:
                break

    return path


def _calculate_change_in_node_predictions(path: pd.DataFrame) -> pd.DataFrame:
    """Calcualte change in node prediction through a particular path through the tree.

    Parameters
    ----------
    path : pd.DataFrame
        DataFrame where each successive row shows the next node visited though a tree.
        Must have feautre and prediction columns.

    """
    # shift features down by 1 to get the variable which is contributing to the change
    # in prediction
    path["contributing_feature"] = path["feature"].shift(1)

    # calculate the change in prediction
    path["contribution"] = path["prediction"] - path["prediction"].shift(1).fillna(0)

    path.loc[path["contributing_feature"].isnull(), "contributing_feature"] = "base"

    return path


def _format_prediction_decomposition_results(
    list_decomposition_results: list[pd.DataFrame],
) -> PredictionDecomposition:
    """Combine results for each tree into PredictionDecomposition object.

    The list of individual prediction decomposition results is combined into a
    single DataFrame and contributions are summed over trees.

    Parameters
    ----------
    list_decomposition_results : list[pd.DataFrame]
        One DataFrame per tree, each with tree, node, contributing_feature and
        contribution columns.

    Returns
    -------
    PredictionDecomposition
        ``summary`` holds the total contribution per contributing feature and
        ``nodes`` holds the node level contributions for all trees, with the
        node column named ``node_path``.

    """
    prediction_decompositions_df = pd.concat(list_decomposition_results, axis=0)

    keep_columns = ["tree", "node", "contributing_feature", "contribution"]

    # columns= is required here: a bare mapping passed to rename is applied to the
    # index labels, which would leave the "node" column name untouched
    prediction_decompositions_df_subset = prediction_decompositions_df[
        keep_columns
    ].rename(columns={"node": "node_path"})

    decomposition_summary = (
        prediction_decompositions_df_subset.groupby("contributing_feature")[
            "contribution"
        ]
        .sum()
        .reset_index()
    )

    return PredictionDecomposition(
        summary=decomposition_summary, nodes=prediction_decompositions_df_subset
    )