Source code for tabular_trees.explain.shapley_values

"""Module implementing shapley values calculation."""

import itertools
import warnings
from dataclasses import dataclass, field
from math import factorial

import numpy as np
import pandas as pd
from tqdm import tqdm

from ..checks import check_type
from ..trees import TabularTrees


[docs]@dataclass
class ShapleyValues:
    """Shapley values results."""

    summary: pd.DataFrame
    """Shapley values at feature level.

    Average of permutations over features.

    """

    permutations: pd.DataFrame = field(repr=False)
    """Shapley values for each feature permutation and tree."""

[docs]    def __init__(self, summary: pd.DataFrame, permutations: pd.DataFrame):
        """Initialise the ShapleyValues object.

        Parameters
        ----------
        summary : pd.DataFrame
            Shapley values at feature level. Average of permutations over features.

        permutations : pd.DataFrame
            Shapley values for each feature permutation and tree.

        """
        self.summary = summary
        self.permutations = permutations


[docs]def calculate_shapley_values(
    tabular_trees: TabularTrees, row: pd.Series
) -> ShapleyValues:
    """Calculate shapley values from TabularTrees model for row of data.

    This is Algorithm 1 as presented in https://arxiv.org/pdf/1802.03888.pdf.

    Parameters
    ----------
    tree_df : pd.DataFrame
        Model (multiple trees) in tabular structure. Should be the output of
        pygbm.expl.xgb.extract_model_predictions.

    row : pd.Series
        Single row of data to explain prediction for. It is the users responsibility to
        pass the relevant columns in row (i.e. the columns used by the model). If extra
        columns are added this will exponentially increase the number of runs - even if
        they are not relevant to the model.

    Returns
    -------
    results : ShapleyValues
        Shapley values for prediction from model on the input row.

    Notes
    -----
    This algorithm has O(TL2^M) complexity (where M is the number of features) and this
    implementation is not intended to be efficient - rather it is intended to
    illustrate the algorithm. Beware of using this on models or datasets (specifically
    columns) of any significant size.

    Examples
    --------
    >>> import xgboost as xgb
    >>> import pandas as pd
    >>> from sklearn.datasets import load_diabetes
    >>> from tabular_trees import export_tree_data
    >>> from tabular_trees import calculate_shapley_values
    >>> # get data in DMatrix
    >>> diabetes = load_diabetes()
    >>> data = xgb.DMatrix(
    ...     diabetes["data"][:,:3],
    ...     label=diabetes["target"],
    ...     feature_names=diabetes["feature_names"][:3]
    ... )
    >>> # build model
    >>> params = {"max_depth": 3, "verbosity": 0}
    >>> model = xgb.train(params, dtrain=data, num_boost_round=10)
    >>> # export to TabularTrees
    >>> xgboost_tabular_trees = export_tree_data(model)
    >>> tabular_trees = xgboost_tabular_trees.to_tabular_trees()
    >>> # get data to score
    >>> scoring_data = pd.DataFrame(
    ...     diabetes["data"][:,:3],
    ...     columns=diabetes["feature_names"][:3]
    ... )
    >>> row_to_score = scoring_data.iloc[0]
    >>> # calculate shapley values
    >>> results = calculate_shapley_values(tabular_trees, row=row_to_score)
    >>> type(results)
    <class 'tabular_trees.explain.shapley_values.ShapleyValues'>

    """
    warnings.warn(
        "This algorithm has very long runtime. It will produce the same results as "
        "treeSHAP but will take much longer to run.",
        stacklevel=2,
    )

    check_type(tabular_trees, TabularTrees, "tabular_trees")
    check_type(row, pd.Series, "row")

    return _calculate_shapley_values(tree_df=tabular_trees.trees, row=row)


def _calculate_shapley_values(tree_df: pd.DataFrame, row: pd.Series) -> ShapleyValues:
    """Calculate shapley values for each tree and combine.

    Parameters
    ----------
    tree_df : pd.DataFrame
        Tree data.

    row : pd.Series
        Row of data to explain prediction for.

    """
    # TODO: do not modify TabularTrees data
    tree_df.reset_index(drop=True, inplace=True)

    n_trees = tree_df["tree"].max()

    results_list = []

    for tree_no in range(n_trees + 1):
        tree_rows = tree_df.loc[tree_df["tree"] == tree_no]

        tree_shapley_values = _shapley_values_tree(tree_rows, row)

        results_list.append(tree_shapley_values)

    return _combine_shapley_values_across_trees(results_list)


def convert_node_columns_to_integer(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Convert node, left_child and right_child columns to integer."""
    node_mapping = {node: i for i, node in enumerate(tree_df["node"].tolist())}

    tree_df["node"] = tree_df["node"].map(node_mapping).astype(int)
    tree_df["left_child"] = tree_df["left_child"].map(node_mapping)
    tree_df["right_child"] = tree_df["right_child"].map(node_mapping)

    return tree_df


def _shapley_values_tree(tree_df: pd.DataFrame, row: pd.Series) -> ShapleyValues:
    """Calculate shapley values for single tree.

    Parameters
    ----------
    tree_df : pd.DataFrame
        Single tree in tabular structure. Should be subset of output of
        pygbm.expl.xgb.extract_model_predictions.

    row : pd.Series
        Single row of data to explain prediction.

    """
    tree_number = tree_df["tree"].values[0]

    internal_nodes = tree_df["leaf"] == 0

    mean_prediction = (
        tree_df.loc[~internal_nodes, "count"]
        * tree_df.loc[~internal_nodes, "prediction"]
    ).sum() / tree_df.loc[~internal_nodes, "count"].sum()

    keep_cols = [
        "node",
        "left_child",
        "right_child",
        "split_condition",
        "feature",
        "count",
        "leaf",
        "prediction",
    ]

    tree_df_cols_subset = tree_df[keep_cols].copy()

    tree_df_cols_subset = convert_node_columns_to_integer(tree_df_cols_subset)

    tree_df_cols_subset.loc[internal_nodes, "prediction"] = np.nan

    cols_rename = {
        "left_child": "a",
        "right_child": "b",
        "split_condition": "t",
        "feature": "d",
        "count": "r",
        "prediction": "v",
    }

    tree_df_cols_subset.rename(columns=cols_rename, inplace=True)

    tree_df_cols_subset.drop(columns=["leaf", "node"], inplace=True)

    # convert child node index column to pandas int type with nulls
    # this is to prevent error in G when trying to select items from list by
    # index (with float)
    tree_df_cols_subset["a"] = pd.Series(tree_df_cols_subset["a"], dtype="Int64")
    tree_df_cols_subset["b"] = pd.Series(tree_df_cols_subset["b"], dtype="Int64")

    features = row.index.values.tolist()

    results_list = []

    for i, feature_permutation in enumerate(
        tqdm(itertools.permutations(features), total=factorial(len(features)))
    ):
        selected_features = []

        current_prediction = mean_prediction

        contributions = {
            "bias": mean_prediction,
            "permutation": str(list(feature_permutation)),
        }

        for feature in feature_permutation:
            selected_features.append(feature)

            expected_value_given_features = _expvalue(
                row, selected_features, tree_df_cols_subset
            )

            contributions[feature] = expected_value_given_features - current_prediction

            current_prediction = expected_value_given_features

        contributions_df = pd.DataFrame(contributions, index=[i])

        results_list.append(contributions_df)

    return _format_shapley_value_for_tree(results_list, tree_number)


def _expvalue(x: pd.Series, s: list, tree: pd.DataFrame) -> float:
    """Estimate E[f(x)|x_S].

    Algorithm 1 from Consistent Individualized Feature Attribution for Tree Ensembles.

    Note, the node indices start at 0 for this implementation whereas in the paper they
    start at 1.

    Note, the arguments [v, a, b, t, r, d] define the tree under consideration.

    Parameters
    ----------
    x : pd.Series
        Single row of data to estimate E[f(x)|x_S] for.

    s : list
        subset of features

    tree : pd.DataFrame
        tree structure in tabular form with the following columns;
        v - node values which for internal nodes take the value 'internal' otherwise
        the predicted value for the leaf node
        a - left child node indices
        b - right child node indices
        t - thresholds for split in internal nodes
        r - cover for each node (# samples)
        d - split variable for each node

    """
    return _g(0, 1, x, s, tree)


def _g(j: int, w: float, x: pd.Series, s: list, tree: pd.DataFrame) -> float:
    """Recusively traverse down tree and return prediction for x.

    This algorithm follows the route allowed by the features in s, if a node is
    encountered that is made on a feature not in s then an average of predictions
    for both child nodes is used.

    Parameters
    ----------
    j : int
        Node index

    w : float
        Proportion of training samples that meet node considtion

    x : pd.Series
        Row of data to explain prediction for

    s : list
        Subset of features being considered

    tree : pd.DataFrame
        Tree structure in tabular form

    """
    v: list[float] = tree["v"].tolist()
    a = tree["a"].tolist()
    b = tree["b"].tolist()
    t = tree["t"].tolist()
    r = tree["r"].tolist()
    d = tree["d"].tolist()

    if not np.isnan(v[j]):
        return w * v[j]

    else:
        if d[j] in s:
            if x[d[j]] <= t[j]:
                return _g(a[j], w, x, s, tree)

            else:
                return _g(b[j], w, x, s, tree)

        else:
            return _g(a[j], w * r[a[j]] / r[j], x, s, tree) + _g(
                b[j], w * r[b[j]] / r[j], x, s, tree
            )


def _format_shapley_value_for_tree(
    results_list: list[pd.DataFrame], tree_no: int
) -> ShapleyValues:
    """Format shapley values for a single tree.

    The summary results give the average contribution across all permutations.

    Parameters
    ----------
    results_list : list[pd.DataFrame]
        List of contributions for different feature permutations.

    tree_no: int
        Tree number.

    """
    permutations_df = pd.concat(results_list, axis=0, sort=True)

    permutations_df.insert(0, "tree", tree_no)

    results_feature_level = pd.DataFrame(
        permutations_df.drop(columns="permutation").mean(axis=0)
    ).T

    return ShapleyValues(summary=results_feature_level, permutations=permutations_df)


def _combine_shapley_values_across_trees(
    tree_shapley_values: list[ShapleyValues],
) -> ShapleyValues:
    """Combine shapley values across all trees in the model.

    Permutation results from each ShapleyValues object are simply appended. For the
    summary results, these are appended and then summed over trees.

    Parameters
    ----------
    tree_shapley_values : list[ShapleyValues]
        Shapley values for each tree.

    Notes
    -----
    The summary attribute on the returned ShapleyValues object does not have the 'tree'
    column - as results are summed over all trees.

    """
    summary_appended = pd.concat(
        [shapley_values.summary for shapley_values in tree_shapley_values],
        axis=0,
        sort=True,
    )

    summary_summed_over_trees = pd.DataFrame(summary_appended.sum(axis=0)).T.drop(
        columns="tree"
    )

    permutations_appended = pd.concat(
        [shapley_values.permutations for shapley_values in tree_shapley_values],
        axis=0,
        sort=True,
    )

    return ShapleyValues(
        summary=summary_summed_over_trees, permutations=permutations_appended
    )