Source code for mlpaper.classification

# Ryan Turner (turnerry@iro.umontreal.ca)
from __future__ import absolute_import, division, print_function

from builtins import range

import numpy as np
import pandas as pd
from joblib import Memory
from scipy.special import logsumexp

import mlpaper.boot_util as bu
import mlpaper.perf_curves as pc
from mlpaper.constants import CURVE_STATS, ERR_COL, METHOD, METRIC, PAIRWISE_DEFAULT, PVAL_COL, STAT, STD_STATS
from mlpaper.mlpaper import loss_summary_table
from mlpaper.util import area, interp1d, normalize, one_hot

DEFAULT_NGRID = 100
LABEL = "label"  # Don't put in constants since only needed for classification


def shape_and_validate(y, log_pred_prob):
    """Validate shapes and types of predictive distribution against data and
    return the shape information.

    Parameters
    ----------
    y : ndarray of type int or bool, shape (n_samples,)
        True labels for each classification data point.
    log_pred_prob : ndarray, shape (n_samples, n_labels)
        Array of shape ``(len(y), n_labels)``. Each row corresponds to a
        categorical distribution with *normalized* probabilities in log scale.
        Therefore, the number of columns must be at least 1.

    Returns
    -------
    n_samples : int
        Number of data points (length of `y`).
    n_labels : int
        The number of possible labels in `y`. Inferred from the size of
        `log_pred_prob` and *not* from `y`.

    Notes
    -----
    This does *not* check normalization.
    """
    n_samples, n_labels = log_pred_prob.shape
    assert n_samples >= 1  # Otherwise min and max confused
    assert n_labels >= 1  # Otherwise makes no sense
    assert y.shape == (n_samples,) and y.dtype.kind in ("b", "i")
    assert 0 <= y.min() and y.max() < n_labels
    return n_samples, n_labels
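
# Illustrative sketch (not part of the original module): how the shape
# contract of `shape_and_validate` looks in practice. The toy arrays below are
# made up for demonstration only.
def _demo_shape_and_validate():
    y = np.array([0, 1, 1, 0])  # int labels in [0, n_labels)
    log_pred_prob = np.log(np.array([[0.7, 0.3],
                                     [0.2, 0.8],
                                     [0.5, 0.5],
                                     [0.9, 0.1]]))
    n_samples, n_labels = shape_and_validate(y, log_pred_prob)
    assert (n_samples, n_labels) == (4, 2)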
# ============================================================================
# Loss functions
# ============================================================================
def hard_loss_decision(log_pred_prob, loss_mat):
    """Make the Bayes' optimal action according to the predictive probability
    distribution and loss matrix.

    Parameters
    ----------
    log_pred_prob : ndarray, shape (n_samples, n_labels)
        Array of shape ``(len(y), n_labels)``. Each row corresponds to a
        categorical distribution with *normalized* probabilities in log scale.
        Therefore, the number of columns must be at least 1.
    loss_mat : ndarray, shape (n_labels, n_actions)
        Loss matrix to use for making decisions of size
        ``(n_labels, n_actions)``. The loss of taking action a when the true
        outcome (label) is y is found in ``loss_mat[y, a]``.

    Returns
    -------
    action : ndarray of type int, shape (n_samples,)
        Array of the resulting Bayes' optimal action for each data point.
    """
    pred_prob = np.exp(log_pred_prob)
    E_loss = np.dot(pred_prob, loss_mat)
    action = np.argmin(E_loss, axis=1)
    return action
def hard_loss(y, log_pred_prob, loss_mat=None):
    """Loss function for making classification decisions from a loss matrix.

    This function both computes the optimal action under the predictive
    distribution and the loss matrix, and then scores that decision using the
    loss matrix.

    Parameters
    ----------
    y : ndarray of type int or bool, shape (n_samples,)
        True labels for each classification data point.
    log_pred_prob : ndarray, shape (n_samples, n_labels)
        Array of shape ``(len(y), n_labels)``. Each row corresponds to a
        categorical distribution with *normalized* probabilities in log scale.
        Therefore, the number of columns must be at least 1.
    loss_mat : None or ndarray of shape (n_labels, n_actions)
        Loss matrix to use for making decisions of size
        ``(n_labels, n_actions)``. The loss of taking action a when the true
        outcome (label) is y is found in ``loss_mat[y, a]``. If None,
        1 - identity matrix is used to obtain the 0-1 loss function.

    Returns
    -------
    loss : ndarray, shape (n_samples,)
        Array of the resulting loss for the predictions on each point in `y`.
    """
    n_samples, n_labels = shape_and_validate(y, log_pred_prob)
    loss_mat = (1.0 - np.eye(n_labels)) if loss_mat is None else loss_mat
    assert np.ndim(loss_mat) == 2 and loss_mat.shape[0] == n_labels
    assert loss_mat.shape[1] >= 1  # Must be at least one action

    action = hard_loss_decision(log_pred_prob, loss_mat)
    assert action.shape == y.shape and action.dtype.kind == "i"

    loss = loss_mat[y.astype(int), action]
    assert loss.shape == (n_samples,)
    return loss
def log_loss(y, log_pred_prob):
    """Compute log loss (e.g., negative log likelihood or cross-entropy).

    Parameters
    ----------
    y : ndarray of type int or bool, shape (n_samples,)
        True labels for each classification data point.
    log_pred_prob : ndarray, shape (n_samples, n_labels)
        Array of shape ``(len(y), n_labels)``. Each row corresponds to a
        categorical distribution with *normalized* probabilities in log scale.
        Therefore, the number of columns must be at least 1.

    Returns
    -------
    loss : ndarray, shape (n_samples,)
        Array of the log loss for the predictions on each data point in `y`.
    """
    n_samples, n_labels = shape_and_validate(y, log_pred_prob)
    nll = -log_pred_prob[np.arange(n_samples), y.astype(int)]
    return nll
def brier_loss(y, log_pred_prob, rescale=True):
    """Compute (rescaled) Brier loss.

    Parameters
    ----------
    y : ndarray of type int or bool, shape (n_samples,)
        True labels for each classification data point.
    log_pred_prob : ndarray, shape (n_samples, n_labels)
        Array of shape ``(len(y), n_labels)``. Each row corresponds to a
        categorical distribution with *normalized* probabilities in log scale.
        Therefore, the number of columns must be at least 1.
    rescale : bool
        If True, linearly rescale the loss so perfect (P=1) predictions give
        0.0 loss and a uniform prediction gives a loss of 1.0. False gives the
        standard Brier loss.

    Returns
    -------
    loss : ndarray, shape (n_samples,)
        Array of the Brier loss for the predictions on each data point in `y`.
    """
    n_samples, n_labels = shape_and_validate(y, log_pred_prob)
    y_bin = one_hot(y.astype(int), n_labels)
    loss = np.sum((np.exp(log_pred_prob) - y_bin) ** 2, axis=1)
    if rescale and n_labels > 1:
        # Linearly rescale so perfect is 0.0 and uniform gives 1.0
        loss = np.true_divide(n_labels, n_labels - 1) * loss
    return loss
def spherical_loss(y, log_pred_prob, rescale=True):
    """Compute (rescaled) spherical loss.

    Parameters
    ----------
    y : ndarray of type int or bool, shape (n_samples,)
        True labels for each classification data point.
    log_pred_prob : ndarray, shape (n_samples, n_labels)
        Array of shape ``(len(y), n_labels)``. Each row corresponds to a
        categorical distribution with *normalized* probabilities in log scale.
        Therefore, the number of columns must be at least 1.
    rescale : bool
        If True, linearly rescale the loss so perfect (P=1) predictions give
        0.0 loss and a uniform prediction gives a loss of 1.0. False gives the
        standard spherical loss, which is the negative spherical *score*.

    Returns
    -------
    loss : ndarray, shape (n_samples,)
        Array of the spherical loss for the predictions on each point in `y`.
    """
    N, n_labels = shape_and_validate(y, log_pred_prob)
    log_normalizer = 0.5 * logsumexp(2.0 * log_pred_prob, axis=1)
    # Need to do negative of spherical score to make a loss function
    loss = -np.exp(log_pred_prob[np.arange(N), y.astype(int)] - log_normalizer)
    if rescale:
        # Linearly rescale so perfect is 0.0 and uniform gives 1.0; when
        # n_labels = 1 everything is perfect so loss = 0.
        c = 1.0 - 1.0 / np.sqrt(n_labels) if n_labels > 1 else 1.0
        loss = (1.0 + loss) / c
    return loss
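
# Illustrative sketch (not part of the original module): exercising the loss
# functions above on a tiny, made-up binary problem. The asymmetric loss
# matrix is hypothetical and only serves to show `hard_loss_decision`.
def _demo_loss_functions():
    y = np.array([0, 1, 1])
    log_pred_prob = np.log(np.array([[0.9, 0.1],
                                     [0.4, 0.6],
                                     [0.2, 0.8]]))

    print(log_loss(y, log_pred_prob))        # per-point negative log likelihood
    print(brier_loss(y, log_pred_prob))      # rescaled Brier loss
    print(spherical_loss(y, log_pred_prob))  # rescaled spherical loss
    print(hard_loss(y, log_pred_prob))       # 0-1 loss of the Bayes optimal action

    # With an asymmetric loss matrix: predicting 0 when the truth is 1 costs
    # 5x more than the reverse, so borderline cases get pushed to action 1.
    loss_mat = np.array([[0.0, 1.0],
                         [5.0, 0.0]])
    print(hard_loss_decision(log_pred_prob, loss_mat))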
# ============================================================================
# Loss summary: the main purpose of this file.
# ============================================================================
def loss_table(log_pred_prob_table, y, metrics_dict, assume_normalized=False):
    """Compute a loss table from a table of probabilistic predictions.

    Parameters
    ----------
    log_pred_prob_table : DataFrame, shape (n_samples, n_methods * n_labels)
        DataFrame with predictive distributions. Each row is a data point. The
        columns should be a hierarchical index that is the cartesian product
        of methods x labels. For example,
        ``log_pred_prob_table.loc[5, 'foo']`` is the categorical distribution
        (in log scale) prediction that method foo places on ``y[5]``.
    y : ndarray of type int or bool, shape (n_samples,)
        True labels for each classification data point. Must be of the same
        length as the DataFrame `log_pred_prob_table`.
    metrics_dict : dict of str to callable
        Dictionary mapping loss function name to a function that computes the
        loss, e.g., `log_loss`, `brier_loss`, ...
    assume_normalized : bool
        If False, renormalize the predictive distributions to ensure there is
        no cheating. If True, skip this step for speed.

    Returns
    -------
    loss_tbl : DataFrame, shape (n_samples, n_metrics * n_methods)
        DataFrame with the loss of each method according to each loss function
        on each data point. The rows are the data points in `y` (that is, the
        index matches `log_pred_prob_table`). The columns are a hierarchical
        index that is the cartesian product of loss x method. That is, the
        loss of method foo's prediction of ``y[5]`` according to loss function
        bar is stored in ``loss_tbl.loc[5, ('bar', 'foo')]``.
    """
    methods, labels = log_pred_prob_table.columns.levels
    n_samples, n_labels = len(log_pred_prob_table), len(labels)
    assert y.shape == (n_samples,)
    assert n_samples >= 1 and n_labels >= 1 and len(methods) >= 1

    col_names = pd.MultiIndex.from_product([metrics_dict.keys(), methods], names=[METRIC, METHOD])
    loss_tbl = pd.DataFrame(index=log_pred_prob_table.index, columns=col_names, dtype=float)
    for method in methods:
        # Make sure the columns are in the right order and we aren't mixing things
        assert list(log_pred_prob_table[method].columns) == list(range(n_labels))
        log_pred_prob = log_pred_prob_table[method].values
        assert log_pred_prob.shape == (n_samples, n_labels)
        assert not np.any(np.isnan(log_pred_prob))  # Would let method cheat

        if not assume_normalized:
            log_pred_prob = normalize(log_pred_prob)

        for metric, metric_f in metrics_dict.items():
            loss_tbl.loc[:, (metric, method)] = metric_f(y, log_pred_prob)
    return loss_tbl
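
# Illustrative sketch (not part of the original module): building a small
# prediction table by hand and scoring it with `loss_table`. The method names
# "foo" and "iid" and the data are made up for demonstration.
def _demo_loss_table():
    y = np.array([0, 1, 1, 0])
    col_names = pd.MultiIndex.from_product([["foo", "iid"], range(2)], names=[METHOD, LABEL])
    log_pred_prob_table = pd.DataFrame(index=range(len(y)), columns=col_names, dtype=float)
    log_pred_prob_table.loc[:, "foo"] = np.log([[0.8, 0.2], [0.3, 0.7], [0.1, 0.9], [0.6, 0.4]])
    log_pred_prob_table.loc[:, "iid"] = np.log(0.5) * np.ones((len(y), 2))

    loss_tbl = loss_table(log_pred_prob_table, y, metrics_dict={"NLL": log_loss, "Brier": brier_loss})
    print(loss_tbl)  # columns: (metric, method), rows: data points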
# ============================================================================
# Use and summarize performance curves
# ============================================================================
def check_curve(result, x_grid=None):
    """Check that performance curve output matches the expected format and
    return the curve after validation.

    Parameters
    ----------
    result : result of curve function, e.g., `perf_curves.roc_curve`
        Curves defined by a ROC or other curve estimation.
    x_grid : None or ndarray of shape (n_grid,)
        If provided, check that all the curves are defined over a wider range
        than `x_grid`. So, when the functions are interpolated onto the range
        of `x_grid` no extrapolation is needed.

    Returns
    -------
    curve : tuple of (ndarray, ndarray, str)
        Returns the same object passed in after some input checks. Each of the
        ndarrays has shape (n_boot, n_thresholds).
    """
    curve, _ = result  # Skipping thresholds (2nd arg) since not used here
    x_curve, y_curve, kind = curve

    # Check shape
    assert x_curve.ndim == 2 and y_curve.ndim == 2
    assert x_curve.shape == y_curve.shape
    assert x_curve.shape[1] >= 2  # Otherwise not a curve

    # Check values
    assert np.all(np.isfinite(x_curve))
    assert np.all(y_curve < np.inf)  # PRG can be -inf, but all curves < inf
    assert np.all(np.diff(x_curve, axis=1) >= 0.0)  # also checks it is sorted

    if x_grid is not None:
        # Make sure we won't need to extrapolate for grid
        assert np.all(x_curve[:, 0] <= x_grid[0])
        assert np.all(x_grid[-1] <= x_curve[:, -1])
    return curve
def curve_boot(
    y, log_pred_prob, ref, curve_f=pc.roc_curve, x_grid=None, n_boot=1000, pairwise_CI=PAIRWISE_DEFAULT, confidence=0.95
):
    """Perform a bootstrap analysis of a performance curve, e.g., ROC or
    precision-recall. For binary classification only.

    Parameters
    ----------
    y : ndarray of type int or bool, shape (n_samples,)
        Array containing true labels, must be `bool` or {0,1}.
    log_pred_prob : ndarray, shape (n_samples, 2)
        Array of shape ``(len(y), 2)``. Each row corresponds to a categorical
        distribution with *normalized* probabilities in log scale. However,
        many curves (e.g., ROC) are invariant to monotonic transformation and
        hence linear scale could also be used.
    ref : float or ndarray of shape (n_samples, 2)
        If `ref` is an array of shape ``(len(y), 2)``: same as `log_pred_prob`
        except for the reference (baseline) method if a paired statistical
        test is desired on the area under the curve. If `ref` is a scalar
        float: `curve_boot` tests the statistical significance that the area
        under the curve differs from `ref` in a non-paired test. For ROC
        analysis, `ref` is typically 0.5.
    curve_f : callable
        Function to compute the performance curve. Standard choices are:
        `perf_curves.roc_curve` or `perf_curves.recall_precision_curve`.
    x_grid : None or ndarray of shape (n_grid,)
        Grid of points to evaluate the curve in the results. If `None`,
        defaults to a linear grid on [0,1].
    n_boot : int
        Number of bootstrap iterations to perform.
    pairwise_CI : bool
        If True, compute error bars on ``summary - summary_ref`` instead of
        just the summary. This typically results in smaller error bars.
    confidence : float
        Confidence probability (in (0, 1)) to construct error bar.

    Returns
    -------
    summary : tuple of floats, shape (3,)
        Tuple containing (mu, EB, pval), where mu is the best estimate of the
        summary statistic of the curve, EB is the error bar, and pval is the
        p-value from the two-sided bootstrap significance test that its value
        is the same as the reference summary value (derived from `ref`).
    curve : DataFrame, shape (n_grid, 4)
        DataFrame containing four columns: `x_grid`, the curve value, the
        lower end of the confidence envelope, and the upper end of the
        confidence envelope.
    """
    N, n_labels = shape_and_validate(y, log_pred_prob)
    assert n_labels == 2
    assert np.ndim(ref) == 0 or ref.shape == log_pred_prob.shape
    assert not np.any(np.isnan(ref))
    assert n_boot >= 1
    assert not np.any(np.isnan(log_pred_prob))  # Would let method cheat

    # Setup constants
    epsilon = 1e-10  # Min bootstrap weight since 0 weight can cause problems
    pos_label = 1  # Label=1 of [0,1] is considered a positive case
    x_grid = np.linspace(0.0, 1.0, DEFAULT_NGRID) if x_grid is None else x_grid
    assert np.ndim(x_grid) == 1

    # Put everything into a vector of right type for binary classification
    y = y.astype(bool)
    log_pred_prob = log_pred_prob[:, pos_label]

    # Get estimator on original data. Could use interp1d directly since only 1
    # curve, but this is more consistent with the bootstrap version below.
    curve = check_curve(curve_f(y, log_pred_prob), x_grid)
    auc, = area(*curve)
    assert auc.ndim == 0
    y_grid, = interp1d(x_grid, *curve)
    assert y_grid.shape == x_grid.shape

    # Setup bootstrap weights
    weight = bu.boot_weights(N, n_boot, epsilon=epsilon)

    # Get bootstrapped scores
    curve_boot_ = check_curve(curve_f(y, log_pred_prob, weight), x_grid)
    auc_boot = area(*curve_boot_)
    assert auc_boot.shape == (n_boot,)
    y_grid_boot = interp1d(x_grid, *curve_boot_)
    assert y_grid_boot.shape == (n_boot, x_grid.size)

    # Repeat area bootstrap with reference predictor (if provided)
    ref_boot = ref
    if np.ndim(ref) == 2:  # Note dim must be 0 or 2
        ref_boot = area(*check_curve(curve_f(y, ref[:, pos_label], weight)))
        assert ref_boot.shape == (n_boot,)
        ref, = area(*check_curve(curve_f(y, ref[:, pos_label])))
        assert np.ndim(ref) == 0

    # Pack up standard numeric summary triple
    EB = (
        bu.error_bar(auc_boot - ref_boot, auc - ref, confidence=confidence)
        if pairwise_CI
        else bu.error_bar(auc_boot, auc, confidence=confidence)
    )
    pval = bu.significance(auc_boot, ref_boot)
    summary = (auc, EB, pval)

    # Pack up data frame with graphical summaries (performance curves)
    # Could also try bu.basic and see which works better
    y_LB, y_UB = bu.percentile(y_grid_boot, confidence)
    curve = pd.DataFrame(
        data=np.stack((x_grid, y_grid, y_LB, y_UB), axis=1), index=range(x_grid.size), columns=CURVE_STATS, dtype=float
    )
    return summary, curve
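
# Illustrative sketch (not part of the original module): bootstrap ROC
# analysis against the chance reference of AUC = 0.5. The random data below
# are made up for demonstration, and n_boot is kept small so it runs quickly.
def _demo_curve_boot():
    rng = np.random.RandomState(0)
    N = 200
    y = rng.rand(N) < 0.5
    p = rng.uniform(0.05, 0.95, size=N)  # keep probabilities away from 0 and 1
    log_pred_prob = np.log(np.stack((1.0 - p, p), axis=1))

    (auc, EB, pval), curve = curve_boot(y, log_pred_prob, ref=0.5, curve_f=pc.roc_curve, n_boot=100)
    print(auc, EB, pval)  # AUC estimate, error bar, and p-value vs. chance
    print(curve.head())   # x_grid, curve value, lower/upper confidence envelope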
def curve_summary_table(
    log_pred_prob_table,
    y,
    curve_dict,
    ref_method,
    x_grid=None,
    n_boot=1000,
    pairwise_CI=PAIRWISE_DEFAULT,
    confidence=0.95,
):
    """Build a table with the mean and error bars of curve summaries from a
    table of probabilistic predictions.

    Parameters
    ----------
    log_pred_prob_table : DataFrame, shape (n_samples, n_methods * n_labels)
        DataFrame with predictive distributions. Each row is a data point. The
        columns should be a hierarchical index that is the cartesian product
        of methods x labels. For example,
        ``log_pred_prob_table.loc[5, 'foo']`` is the categorical distribution
        (in log scale) prediction that method foo places on ``y[5]``.
    y : ndarray of type int or bool, shape (n_samples,)
        True labels for each classification data point. Must be of the same
        length as the DataFrame `log_pred_prob_table`.
    curve_dict : dict of str to callable
        Dictionary mapping curve name to performance curve. Standard choices:
        `perf_curves.roc_curve` or `perf_curves.recall_precision_curve`.
    ref_method : str
        Name of the method that is used as the reference point in paired
        statistical tests. This is usually some sort of baseline method.
        `ref_method` must be found in the 1st level of the columns of
        `log_pred_prob_table`.
    x_grid : None or ndarray of shape (n_grid,)
        Grid of points to evaluate the curve in the results. If `None`,
        defaults to a linear grid on [0,1].
    n_boot : int
        Number of bootstrap iterations to perform.
    pairwise_CI : bool
        If True, compute error bars on ``summary - summary_ref`` instead of
        just the summary. This typically results in smaller error bars.
    confidence : float
        Confidence probability (in (0, 1)) to construct error bar.

    Returns
    -------
    curve_tbl : DataFrame, shape (n_methods, n_metrics * 3)
        DataFrame with the curve summary of each method according to each
        curve. The rows are the methods. The columns are a hierarchical index
        that is the cartesian product of curve x (summary, error bar,
        p-value). That is, ``curve_tbl.loc['foo', 'bar']`` is a pandas series
        with (summary of bar curve on foo, corresponding error bar,
        statistical sig). The statistical significance is a p-value from a
        two-sided hypothesis test on the hypothesis H0 that foo has the same
        curve summary as the reference method `ref_method`.
    curve_dump : dict of (str, str) to DataFrame of shape (n_grid, 4)
        Each key is a pair of (method name, curve name) with the value being a
        pandas dataframe with the performance curve, which has four columns:
        `x_grid`, the curve value, the lower end of the confidence envelope,
        and the upper end of the confidence envelope.
    """
    methods, labels = log_pred_prob_table.columns.levels
    N, n_labels = len(log_pred_prob_table), len(labels)
    assert y.shape == (N,)
    assert ref_method in methods  # ==> len(methods) >= 1
    assert N >= 1 and n_labels >= 1 and len(curve_dict) >= 1

    assert list(log_pred_prob_table[ref_method].columns) == list(range(n_labels))
    log_pred_prob_ref = log_pred_prob_table[ref_method].values
    assert log_pred_prob_ref.shape == (N, n_labels)
    # Note: Most curve methods are rank based and so normalization is not
    # needed to prevent cheating. However, if we expect non-normalized methods
    # they should be normalized before to keep consistency with loss metrics.

    col_names = pd.MultiIndex.from_product([curve_dict.keys(), STD_STATS], names=[METRIC, STAT])
    curve_tbl = pd.DataFrame(index=methods, columns=col_names, dtype=float)
    curve_tbl.index.set_names(METHOD, inplace=True)

    curve_dump = {}
    for method in methods:
        assert list(log_pred_prob_table[method].columns) == list(range(n_labels))
        log_pred_prob = log_pred_prob_table[method].values
        assert log_pred_prob.shape == (N, n_labels)

        for curve_name, curve_f in curve_dict.items():
            R = curve_boot(
                y,
                log_pred_prob,
                ref=log_pred_prob_ref,
                curve_f=curve_f,
                x_grid=x_grid,
                n_boot=n_boot,
                pairwise_CI=pairwise_CI,
                confidence=confidence,
            )
            curve_summary, curr_curve = R
            curve_tbl.loc[method, curve_name] = curve_summary
            if pairwise_CI and method == ref_method:
                curve_tbl.loc[method, (curve_name, ERR_COL)] = np.nan
            if method == ref_method:
                # NaN probably makes more sense than 1
                curve_tbl.loc[method, (curve_name, PVAL_COL)] = np.nan
            curve_dump[(method, curve_name)] = curr_curve
    return curve_tbl, curve_dump
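
# Illustrative sketch (not part of the original module): summarizing ROC and
# precision-recall curves for two hypothetical methods, with "iid" as the
# paired-test reference. Method names and data are made up; n_boot is small.
def _demo_curve_summary_table():
    rng = np.random.RandomState(0)
    N = 200
    y = rng.randint(0, 2, size=N)

    col_names = pd.MultiIndex.from_product([["foo", "iid"], range(2)], names=[METHOD, LABEL])
    tbl = pd.DataFrame(index=range(N), columns=col_names, dtype=float)
    p = np.clip(0.75 * y + 0.25 * rng.rand(N), 0.05, 0.95)  # "foo" is informative
    tbl.loc[:, "foo"] = np.log(np.stack((1.0 - p, p), axis=1))
    tbl.loc[:, "iid"] = np.log(0.5) * np.ones((N, 2))

    curve_dict = {"AUC": pc.roc_curve, "AP": pc.recall_precision_curve}
    curve_tbl, curve_dump = curve_summary_table(tbl, y, curve_dict, ref_method="iid", n_boot=100)
    print(curve_tbl)  # method x (curve, [summary, error bar, p-value])
    print(curve_dump[("foo", "AUC")].head())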
def summary_table(
    log_pred_prob_table,
    y,
    loss_dict,
    curve_dict,
    ref_method,
    x_grid=None,
    n_boot=1000,
    pairwise_CI=PAIRWISE_DEFAULT,
    confidence=0.95,
    method_EB="t",
    limits={},
):
    """Build a table with the mean and error bars of both loss and curve
    summaries from a table of probabilistic predictions.

    Parameters
    ----------
    log_pred_prob_table : DataFrame, shape (n_samples, n_methods * n_labels)
        DataFrame with predictive distributions. Each row is a data point. The
        columns should be a hierarchical index that is the cartesian product
        of methods x labels. For example,
        ``log_pred_prob_table.loc[5, 'foo']`` is the categorical distribution
        (in log scale) prediction that method foo places on ``y[5]``.
    y : ndarray of type int or bool, shape (n_samples,)
        True labels for each classification data point. Must be of the same
        length as the DataFrame `log_pred_prob_table`.
    loss_dict : dict of str to callable
        Dictionary mapping loss function name to a function that computes the
        loss, e.g., `log_loss`, `brier_loss`, ...
    curve_dict : dict of str to callable
        Dictionary mapping curve name to performance curve. Standard choices:
        `perf_curves.roc_curve` or `perf_curves.recall_precision_curve`.
    ref_method : str
        Name of the method that is used as the reference point in paired
        statistical tests. This is usually some sort of baseline method.
        `ref_method` must be found in the 1st level of the columns of
        `log_pred_prob_table`.
    x_grid : None or ndarray of shape (n_grid,)
        Grid of points to evaluate the curve in the results. If `None`,
        defaults to a linear grid on [0,1].
    n_boot : int
        Number of bootstrap iterations to perform for performance curves.
    pairwise_CI : bool
        If True, compute error bars on ``summary - summary_ref`` instead of
        just the summary. This typically results in smaller error bars.
    confidence : float
        Confidence probability (in (0, 1)) to construct error bar.
    method_EB : {'t', 'bernstein', 'boot'}
        Method to use for building error bar.
    limits : dict of str to (float, float)
        Dictionary mapping metric name to a tuple with (lower, upper), which
        are the theoretical limits on the mean loss. For instance, zero-one
        loss should be ``(0.0, 1.0)``. If an entry is missing, (-inf, inf) is
        used.

    Returns
    -------
    full_tbl : DataFrame, shape (n_methods, (n_loss + n_curve) * 3)
        DataFrame with the curve/loss summary of each method according to each
        curve or loss function. The rows are the methods. The columns are a
        hierarchical index that is the cartesian product of
        metric x (summary, error bar, p-value), where metric can be a loss or
        a curve summary: ``full_tbl.loc['foo', 'bar']`` is a pandas series
        with (metric bar on foo, corresponding error bar, statistical sig).
        The statistical significance is a p-value from a two-sided hypothesis
        test on the hypothesis H0 that foo has the same metric as the
        reference method `ref_method`.
    curve_dump : dict of (str, str) to DataFrame of shape (n_grid, 4)
        Each key is a pair of (method name, curve name) with the value being a
        pandas dataframe with the performance curve, which has four columns:
        `x_grid`, the curve value, the lower end of the confidence envelope,
        and the upper end of the confidence envelope. Only metrics from
        `curve_dict` and *not* from `loss_dict` are found here.
    """
    # Do the curve metrics
    curve_summary, dump_tbl = curve_summary_table(
        log_pred_prob_table,
        y,
        curve_dict,
        ref_method,
        x_grid=x_grid,
        n_boot=n_boot,
        pairwise_CI=pairwise_CI,
        confidence=confidence,
    )
    # Do loss based metrics
    loss_tbl = loss_table(log_pred_prob_table, y, loss_dict)
    loss_summary = loss_summary_table(
        loss_tbl, ref_method, pairwise_CI=pairwise_CI, confidence=confidence, method_EB=method_EB, limits=limits
    )
    # Return the combo
    full_tbl = pd.concat((loss_summary, curve_summary), axis=1)
    return full_tbl, dump_tbl
# ============================================================================
# Variables and functions to make getting results from sklearn objects easy
# ============================================================================

# Pre-build some standard metric dicts for the user
STD_CLASS_LOSS = {"NLL": log_loss, "Brier": brier_loss, "sphere": spherical_loss, "zero_one": hard_loss}

STD_BINARY_CURVES = {"AUC": pc.roc_curve, "AP": pc.recall_precision_curve, "AUPRG": pc.prg_curve}
class JustNoise:
    """Class version of the iid predictor compatible with the sklearn
    interface. Same as ``sklearn.dummy.DummyClassifier(strategy='prior')``."""

    def __init__(self, n_labels=2, pseudo_count=0.0):
        self.pred = np.nan + np.zeros(n_labels)
        self.pseudo_count = pseudo_count

    def fit(self, X_train, y_train):
        n_labels = len(self.pred)
        n_points, = np.shape(y_train)
        counts = self.pseudo_count + np.sum(one_hot(y_train, n_labels), axis=0)
        n_total = n_points + n_labels * self.pseudo_count
        self.pred = np.log(counts / n_total)
        assert self.pred.shape == (n_labels,)

    def predict_log_proba(self, X_test):
        n_samples = X_test.shape[0]
        pred_log_prob = np.repeat([self.pred], n_samples, axis=0)
        return pred_log_prob
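
# Illustrative sketch (not part of the original module): JustNoise ignores the
# features and predicts the empirical class prior. The toy data are made up.
def _demo_just_noise():
    X_train = np.zeros((4, 3))  # features are ignored
    y_train = np.array([0, 0, 0, 1])
    X_test = np.zeros((2, 3))

    clf = JustNoise(n_labels=2)
    clf.fit(X_train, y_train)
    print(np.exp(clf.predict_log_proba(X_test)))  # every row is [0.75, 0.25]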
def get_pred_log_prob(
    X_train, y_train, X_test, n_labels, methods, min_log_prob=-np.inf, verbose=False, checkpointdir=None
):
    """Get the predictive probability tables for each test point on a
    collection of classification methods.

    Parameters
    ----------
    X_train : ndarray, shape (n_train, n_features)
        Training set 2d feature array for classifiers. Each row is an
        independent data point and each column is a feature.
    y_train : ndarray of type int or bool, shape (n_train,)
        Training set 1d array of truth labels for classifiers. Must be of the
        same length as `X_train`. Values must be in range [0, `n_labels`) or
        `bool`.
    X_test : ndarray, shape (n_test, n_features)
        Test set 2d feature array for classifiers. Each row is an independent
        data point and each column is a feature.
    n_labels : int
        Number of labels, must be >= 1. This is not inferred from `y` because
        some labels may not be found in small data chunks.
    methods : dict of str to sklearn estimator
        Dictionary mapping method name (`str`) to an object that performs
        training and test. The object must follow the interface of sklearn
        estimators, that is, it has a ``fit()`` method and either a
        ``predict_log_proba()`` or ``predict_proba()`` method.
    min_log_prob : float
        Minimum value to floor the predictive log probabilities (while still
        normalizing). Must be < 0. Useful to prevent inf log loss penalties.
    verbose : bool
        If True, display which method is being trained.
    checkpointdir : str (directory)
        If provided, stores checkpoint results using joblib for the train/test
        in case the process is interrupted. If None, no checkpointing is done.

    Returns
    -------
    log_pred_prob_table : DataFrame, shape (n_samples, n_methods * n_labels)
        DataFrame with predictive distributions. Each row is a data point. The
        columns should be a hierarchical index that is the cartesian product
        of methods x labels. For example,
        ``log_pred_prob_table.loc[5, 'foo']`` is the categorical distribution
        (in log scale) prediction that method foo places on ``y[5]``.

    Notes
    -----
    If a train/test operation is loaded from a checkpoint file, the estimator
    object in `methods` will not be in a fit state.
    """
    n_test = X_test.shape[0]
    assert n_test > 0
    # Allow ndim >= 2 since some input data (e.g. images) come that way
    assert X_train.ndim >= 2
    assert y_train.shape == (X_train.shape[0],)
    assert y_train.dtype.kind in ("b", "i")
    assert 0 <= y_train.min() and y_train.max() < n_labels
    assert X_test.ndim == X_train.ndim and X_test.shape[1] == X_train.shape[1]
    assert X_train.dtype.kind == X_test.dtype.kind  # Would be weird otherwise
    assert min_log_prob < 0.0  # Ensure is a log-prob

    memory = Memory(cachedir=checkpointdir, verbose=0)

    @memory.cache
    def train_predict(method_obj, X_train, y_train, X_test):
        method_obj.fit(X_train, y_train)
        try:
            pred_log_prob = method_obj.predict_log_proba(X_test)
        except:  # noqa: E722 If there is no log proba available
            # TODO add exception type
            with np.errstate(divide="ignore"):  # Not unusual to have some p=0 cases
                pred_log_prob = np.log(method_obj.predict_proba(X_test))
        return pred_log_prob

    col_names = pd.MultiIndex.from_product([methods.keys(), range(n_labels)], names=[METHOD, LABEL])
    log_pred_prob_table = pd.DataFrame(index=range(n_test), columns=col_names, dtype=float)
    for method_name, method_obj in methods.items():
        if verbose:
            print("Running fit/predict for {}".format(method_name))

        pred_log_prob = train_predict(method_obj, X_train, y_train, X_test)
        assert pred_log_prob.shape == (n_test, n_labels)
        pred_log_prob = normalize(np.maximum(min_log_prob, pred_log_prob))
        log_pred_prob_table.loc[:, method_name] = pred_log_prob
    return log_pred_prob_table
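
# Illustrative sketch (not part of the original module): building a prediction
# table from sklearn-style estimators. Assumes scikit-learn is installed; the
# toy data and train/test split are made up for demonstration.
def _demo_get_pred_log_prob():
    from sklearn.linear_model import LogisticRegression

    rng = np.random.RandomState(0)
    X = rng.randn(100, 3)
    y = (X[:, 0] + 0.5 * rng.randn(100) > 0).astype(int)
    X_train, y_train, X_test = X[:80], y[:80], X[80:]

    methods = {"logistic": LogisticRegression(), "iid": JustNoise()}
    tbl = get_pred_log_prob(X_train, y_train, X_test, n_labels=2, methods=methods, verbose=True)
    print(tbl.head())  # columns: (method, label), values: normalized log probabilities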
def just_benchmark(
    X_train,
    y_train,
    X_test,
    y_test,
    n_labels,
    methods,
    loss_dict,
    curve_dict,
    ref_method,
    min_pred_log_prob=-np.inf,
    pairwise_CI=PAIRWISE_DEFAULT,
    method_EB="t",
    limits={},
):
    """Simplest one-call interface to this package. Just pass it data and
    method objects and a performance summary DataFrame is returned.

    Parameters
    ----------
    X_train : ndarray, shape (n_train, n_features)
        Training set 2d feature array for classifiers. Each row is an
        independent data point and each column is a feature.
    y_train : ndarray of type int or bool, shape (n_train,)
        Training set 1d array of truth labels for classifiers. Must be of the
        same length as `X_train`. Values must be in range [0, `n_labels`) or
        `bool`.
    X_test : ndarray, shape (n_test, n_features)
        Test set 2d feature array for classifiers. Each row is an independent
        data point and each column is a feature.
    y_test : ndarray of type int or bool, shape (n_test,)
        Test set 1d array of truth labels for classifiers. Must be of the same
        length as `X_test`. Values must be in range [0, `n_labels`) or `bool`.
    n_labels : int
        Number of labels, must be >= 1. This is not inferred from `y` because
        some labels may not be found in small data chunks.
    methods : dict of str to sklearn estimator
        Dictionary mapping method name (`str`) to an object that performs
        training and test. The object must follow the interface of sklearn
        estimators, that is, it has a ``fit()`` method and either a
        ``predict_log_proba()`` or ``predict_proba()`` method.
    loss_dict : dict of str to callable
        Dictionary mapping loss function name to a function that computes the
        loss, e.g., `log_loss`, `brier_loss`, ...
    curve_dict : dict of str to callable
        Dictionary mapping curve name to performance curve. Standard choices:
        `perf_curves.roc_curve` or `perf_curves.recall_precision_curve`.
    ref_method : str
        Name of the method that is used as the reference point in paired
        statistical tests. This is usually some sort of baseline method.
        `ref_method` must be found in the `methods` dictionary.
    min_pred_log_prob : float
        Minimum value to floor the predictive log probabilities (while still
        normalizing). Must be < 0. Useful to prevent inf log loss penalties.
    pairwise_CI : bool
        If True, compute error bars on the mean of ``loss - loss_ref`` instead
        of just the mean of `loss`. This typically gives smaller error bars.
    method_EB : {'t', 'bernstein', 'boot'}
        Method to use for building error bar.
    limits : dict of str to (float, float)
        Dictionary mapping metric name to a tuple with (lower, upper), which
        are the theoretical limits on the mean loss. For instance, zero-one
        loss should be ``(0.0, 1.0)``. If an entry is missing, (-inf, inf) is
        used.

    Returns
    -------
    full_tbl : DataFrame, shape (n_methods, (n_loss + n_curve) * 3)
        DataFrame with the curve/loss summary of each method according to each
        curve or loss function. The rows are the methods. The columns are a
        hierarchical index that is the cartesian product of
        metric x (summary, error bar, p-value), where metric can be a loss or
        a curve summary: ``full_tbl.loc['foo', 'bar']`` is a pandas series
        with (metric bar on foo, corresponding error bar, statistical sig).
        The statistical significance is a p-value from a two-sided hypothesis
        test on the hypothesis H0 that foo has the same metric as the
        reference method `ref_method`.
    curve_dump : dict of (str, str) to DataFrame of shape (n_grid, 4)
        Each key is a pair of (method name, curve name) with the value being a
        pandas dataframe with the performance curve, which has four columns:
        `x_grid`, the curve value, the lower end of the confidence envelope,
        and the upper end of the confidence envelope. Only metrics from
        `curve_dict` and *not* from `loss_dict` are found here.
    """
    assert y_train.dtype == y_test.dtype  # Would be weird otherwise

    pred_tbl = get_pred_log_prob(X_train, y_train, X_test, n_labels, methods, min_log_prob=min_pred_log_prob)
    full_tbl, dump = summary_table(
        pred_tbl, y_test, loss_dict, curve_dict, ref_method, pairwise_CI=pairwise_CI, method_EB=method_EB, limits=limits
    )
    return full_tbl, dump
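
# Illustrative sketch (not part of the original module): the one-call workflow
# end to end, using the pre-built metric dicts and JustNoise as the reference
# baseline. Assumes scikit-learn is installed; the toy data are made up.
def _demo_just_benchmark():
    from sklearn.linear_model import LogisticRegression

    rng = np.random.RandomState(0)
    X = rng.randn(300, 3)
    y = (X[:, 0] + 0.5 * rng.randn(300) > 0).astype(int)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    methods = {"logistic": LogisticRegression(), "iid": JustNoise()}
    full_tbl, curve_dump = just_benchmark(
        X_train, y_train, X_test, y_test, 2, methods, STD_CLASS_LOSS, STD_BINARY_CURVES, ref_method="iid"
    )
    print(full_tbl)  # loss and curve summaries with error bars and p-values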