# Ryan Turner (turnerry@iro.umontreal.ca)
from __future__ import absolute_import, division, print_function
from builtins import range
import numpy as np
import pandas as pd
from joblib import Memory
from scipy.special import logsumexp
import mlpaper.boot_util as bu
import mlpaper.perf_curves as pc
from mlpaper.constants import CURVE_STATS, ERR_COL, METHOD, METRIC, PAIRWISE_DEFAULT, PVAL_COL, STAT, STD_STATS
from mlpaper.mlpaper import loss_summary_table
from mlpaper.util import area, interp1d, normalize, one_hot
DEFAULT_NGRID = 100
LABEL = "label" # Don't put in constants since only needed for classification
[docs]def shape_and_validate(y, log_pred_prob):
"""Validate shapes and types of predictive distribution against data and
return the shape information.
Parameters
----------
y : ndarray of type int or bool, shape (n_samples,)
True labels for each classication data point.
log_pred_prob : ndarray, shape (n_samples, n_labels)
Array of shape ``(len(y), n_labels)``. Each row corresponds to a
categorical distribution with *normalized* probabilities in log scale.
Therefore, the number of columns must be at least 1.
Returns
-------
n_samples : int
Number of data points (length of `y`)
n_labels : int
The number of possible labels in `y`. Inferred from size of
`log_pred_prob` and *not* from `y`.
Notes
-----
This does *not* check normalization.
"""
n_samples, n_labels = log_pred_prob.shape
assert n_samples >= 1 # Otherwise min and max confused
assert n_labels >= 1 # Otherwise makes no sense
assert y.shape == (n_samples,) and y.dtype.kind in ("b", "i")
assert 0 <= y.min() and y.max() < n_labels
return n_samples, n_labels
# ============================================================================
# Loss functions
# ============================================================================
[docs]def hard_loss_decision(log_pred_prob, loss_mat):
"""Make Bayes' optimal action according to predictive probability
distribution and loss matrix.
Parameters
----------
log_pred_prob : ndarray, shape (n_samples, n_labels)
Array of shape ``(len(y), n_labels)``. Each row corresponds to a
categorical distribution with *normalized* probabilities in log scale.
Therefore, the number of columns must be at least 1.
loss_mat : ndarray, shape (n_labels, n_actions)
Loss matrix to use for making decisions of size
``(n_labels, n_actions)``. The loss of taking action a when the true
outcome (label) is y is found in ``loss_mat[y, a]``.
Returns
-------
action : ndarray of type int, shape (n_samples,)
Array of resulting Bayes' optimal action for each data point.
"""
pred_prob = np.exp(log_pred_prob)
E_loss = np.dot(pred_prob, loss_mat)
action = np.argmin(E_loss, axis=1)
return action
[docs]def hard_loss(y, log_pred_prob, loss_mat=None):
"""Loss function for making classification decisions from a loss matrix.
This function both computes the optimal action under the predictive
distribution and the loss matrix, and then scores that decision using the
loss matrix.
Parameters
----------
y : ndarray of type int or bool, shape (n_samples,)
True labels for each classication data point.
log_pred_prob : ndarray, shape (n_samples, n_labels)
Array of shape ``(len(y), n_labels)``. Each row corresponds to a
categorical distribution with *normalized* probabilities in log scale.
Therefore, the number of columns must be at least 1.
loss_mat : None or ndarray of shape (n_labels, n_actions)
Loss matrix to use for making decisions of size
``(n_labels, n_actions)``. The loss of taking action a when the true
outcome (label) is y is found in ``loss_mat[y, a]``. If None,
1 - identity matrix is used to obtain the 0-1 loss function.
Returns
-------
loss : ndarray, shape (n_samples,)
Array of the resulting loss for the predictions on each point in `y`.
"""
n_samples, n_labels = shape_and_validate(y, log_pred_prob)
loss_mat = (1.0 - np.eye(n_labels)) if loss_mat is None else loss_mat
assert np.ndim(loss_mat) == 2 and loss_mat.shape[0] == n_labels
assert loss_mat.shape[1] >= 1 # Must be least one action
action = hard_loss_decision(log_pred_prob, loss_mat)
assert action.shape == y.shape and action.dtype.kind == "i"
loss = loss_mat[y.astype(int), action]
assert loss.shape == (n_samples,)
return loss
[docs]def log_loss(y, log_pred_prob):
"""Compute log loss (e.g, negative log likelihood or cross-entropy).
Parameters
----------
y : ndarray of type int or bool, shape (n_samples,)
True labels for each classication data point.
log_pred_prob : ndarray, shape (n_samples, n_labels)
Array of shape ``(len(y), n_labels)``. Each row corresponds to a
categorical distribution with *normalized* probabilities in log scale.
Therefore, the number of columns must be at least 1.
Returns
-------
loss : ndarray, shape (n_samples,)
Array of the log loss for the predictions on each data point in `y`.
"""
n_samples, n_labels = shape_and_validate(y, log_pred_prob)
nll = -log_pred_prob[np.arange(n_samples), y.astype(int)]
return nll
[docs]def brier_loss(y, log_pred_prob, rescale=True):
"""Compute (rescaled) Brier loss.
Parameters
----------
y : ndarray of type int or bool, shape (n_samples,)
True labels for each classication data point.
log_pred_prob : ndarray, shape (n_samples, n_labels)
Array of shape ``(len(y), n_labels)``. Each row corresponds to a
categorical distribution with *normalized* probabilities in log scale.
Therefore, the number of columns must be at least 1.
rescale : bool
If True, linearly rescales lost so perfect (P=1) predictions give 0.0
loss and a uniform prediction gives loss of 1.0. False gives the
standard Brier loss.
Returns
-------
loss : ndarray, shape (n_samples,)
Array of the Brier loss for the predictions on each data point in `y`.
"""
n_samples, n_labels = shape_and_validate(y, log_pred_prob)
y_bin = one_hot(y.astype(int), n_labels)
loss = np.sum((np.exp(log_pred_prob) - y_bin) ** 2, axis=1)
if rescale and n_labels > 1:
# Linearly rescale so perfect is 0.0 and uniform gives 1.0
loss = np.true_divide(n_labels, n_labels - 1) * loss
return loss
[docs]def spherical_loss(y, log_pred_prob, rescale=True):
"""Compute (rescaled) spherical loss.
Parameters
----------
y : ndarray of type int or bool, shape (n_samples,)
True labels for each classication data point.
log_pred_prob : ndarray, shape (n_samples, n_labels)
Array of shape ``(len(y), n_labels)``. Each row corresponds to a
categorical distribution with *normalized* probabilities in log scale.
Therefore, the number of columns must be at least 1.
rescale : bool
If True, linearly rescales lost so perfect (P=1) predictions give 0.0
loss and a uniform prediction gives loss of 1.0. False gives the
standard spherical loss, which is the negative spherical *score*.
Returns
-------
loss : ndarray, shape (n_samples,)
Array of the spherical loss for the predictions on each point in `y`.
"""
N, n_labels = shape_and_validate(y, log_pred_prob)
log_normalizer = 0.5 * logsumexp(2.0 * log_pred_prob, axis=1)
# Need to do negative of spherical score to make a loss function
loss = -np.exp(log_pred_prob[np.arange(N), y.astype(int)] - log_normalizer)
if rescale:
# Linearly rescale so perfect is 0.0 and uniform gives 1.0, when
# n_labels = 1 everything is perfect so loss = 0.
c = 1.0 - 1.0 / np.sqrt(n_labels) if n_labels > 1 else 1.0
loss = (1.0 + loss) / c
return loss
# ============================================================================
# Loss summary: the main purpose of this file.
# ============================================================================
[docs]def loss_table(log_pred_prob_table, y, metrics_dict, assume_normalized=False):
"""Compute loss table from table of probalistic predictions.
Parameters
----------
log_pred_prob_table : DataFrame, shape (n_samples, n_methods * n_labels)
DataFrame with predictive distributions. Each row is a data point.
The columns should be hierarchical index that is the cartesian product
of methods x labels. For exampe, ``log_pred_prob_table.loc[5, 'foo']``
is the categorical distribution (in log scale) prediction that method
foo places on ``y[5]``.
y : ndarray of type int or bool, shape (n_samples,)
True labels for each classication data point. Must be of same length as
DataFrame `log_pred_prob_table`.
metrics_dict : dict of str to callable
Dictionary mapping loss function name to function that computes loss,
e.g., `log_loss`, `brier_loss`, ...
assume_normalized : bool
If False, renormalize the predictive distributions to ensure there is
no cheating. If True, skips this step for speed.
Returns
-------
loss_tbl : DataFrame, shape (n_samples, n_metrics * n_methods)
DataFrame with loss of each method according to each loss function on
each data point. The rows are the data points in `y` (that is the index
matches `log_pred_prob_table`). The columns are a hierarchical index
that is the cartesian product of loss x method. That is, the loss of
method foo's prediction of ``y[5]`` according to loss function bar is
stored in ``loss_tbl.loc[5, ('bar', 'foo')]``.
"""
methods, labels = log_pred_prob_table.columns.levels
n_samples, n_labels = len(log_pred_prob_table), len(labels)
assert y.shape == (n_samples,)
assert n_samples >= 1 and n_labels >= 1 and len(methods) >= 1
col_names = pd.MultiIndex.from_product([metrics_dict.keys(), methods], names=[METRIC, METHOD])
loss_tbl = pd.DataFrame(index=log_pred_prob_table.index, columns=col_names, dtype=float)
for method in methods:
# Make sure the columns are in right order and we aren't mixing things
assert list(log_pred_prob_table[method].columns) == list(range(n_labels))
log_pred_prob = log_pred_prob_table[method].values
assert log_pred_prob.shape == (n_samples, n_labels)
assert not np.any(np.isnan(log_pred_prob)) # Would let method cheat
if not assume_normalized:
log_pred_prob = normalize(log_pred_prob)
for metric, metric_f in metrics_dict.items():
loss_tbl.loc[:, (metric, method)] = metric_f(y, log_pred_prob)
return loss_tbl
# ============================================================================
# Use and summarize performance curves
# ============================================================================
[docs]def check_curve(result, x_grid=None):
"""Check performance curve output matches expected format and return the
curve after validation.
Parameters
----------
curve : result of curve function, e.g., classification.roc_curve
Curves defined by a ROC or other curve estimation.
x_grid : None or ndarray of shape (n_grid,)
If provided, check that all the curves are defined over a wider range
than the x_grid. So, when the functions are interpolated onto the range
of x_grid no extrapolation is needed.
Returns
-------
curve : tuple of (ndarray, ndarray, str)
Returns same object passed in after some input checks. Each of the
ndarrays have shape (n_boot, n_thresholds).
"""
curve, _ = result # Skipping tholds (2nd arg) since not used here
x_curve, y_curve, kind = curve
# Check shape
assert x_curve.ndim == 2 and y_curve.ndim == 2
assert x_curve.shape == y_curve.shape
assert x_curve.shape[1] >= 2 # Otherwise not curve
# Check values
assert np.all(np.isfinite(x_curve))
assert np.all(y_curve < np.inf) # PRG can be -inf, but all curves < inf
assert np.all(np.diff(x_curve, axis=1) >= 0.0) # also check is sorted
if x_grid is not None: # Make sure we won't need to extrapolate for grid
assert np.all(x_curve[:, 0] <= x_grid[0])
assert np.all(x_grid[-1] <= x_curve[:, -1])
return curve
[docs]def curve_boot(
y, log_pred_prob, ref, curve_f=pc.roc_curve, x_grid=None, n_boot=1000, pairwise_CI=PAIRWISE_DEFAULT, confidence=0.95
):
"""Perform boot strap analysis of performance curve, e.g., ROC or prec-rec.
For binary classification only.
Parameters
----------
y : ndarray of type int or bool, shape (n_samples,)
Array containing true labels, must be `bool` or {0,1}.
log_pred_prob : ndarray, shape (n_samples, 2)
Array of shape ``(len(y), 2)``. Each row corresponds to a categorical
distribution with *normalized* probabilities in log scale. However,
many curves (e.g., ROC) are invariant to monotonic transformation and
hence linear scale could also be used.
ref : float or ndarray of shape (n_samples, 2)
If `ref` is an rray of shape ``(len(y), 2)``: Same as `log_pred_prob`
except for the reference (baseline) method if a paired statistical test
is desired on the area under the curve. If `ref` is a scalar float:
`curve_boot` tests the statistical significance that the area under the
curve differs from `ref` in a non-paired test. For ROC analysis, `ref`
is typically 0.5.
curve_f : callable
Function to compute the performance curve. Standard choices are:
`perf_curves.roc_curve` or `perf_curves.recall_precision_curve`.
x_grid : None or ndarray of shape (n_grid,)
Grid of points to evaluate curve in results. If `None`, defaults to
linear grid on [0,1].
n_boot : int
Number of bootstrap iterations to perform.
pairwise_CI : bool
If True, compute error bars on ``summary - summary_ref`` instead of
just the summary. This typically results in smaller error bars.
confidence : float
Confidence probability (in (0, 1)) to construct error bar.
Returns
-------
summary : tuple of floats, shape (3,)
Tuple containing (mu, EB, pval), where mu is the best estimate on the
summary statistic of the curve, EB is the error bar, and pval is the
p-value from the two-sided boot strap significance test that its value
is the same as the reference summary value (from either
`log_pred_prob_ref` or `default_summary_ref`).
curve : DataFrame, shape (n_grid, 4)
DataFrame containing four columns: `x_grid`, the curve value, the lower
end of confidence envelope, and the upper end of the confidence
envelope.
"""
N, n_labels = shape_and_validate(y, log_pred_prob)
assert n_labels == 2
assert np.ndim(ref) == 0 or ref.shape == log_pred_prob.shape
assert not np.any(np.isnan(ref))
assert n_boot >= 1
assert not np.any(np.isnan(log_pred_prob)) # Would let method cheat
# Setup constants
epsilon = 1e-10 # Min bootstrap weight since 0 weight can cause problems
pos_label = 1 # Label=1 of [0,1] is considered a positive case
x_grid = np.linspace(0.0, 1.0, DEFAULT_NGRID) if x_grid is None else x_grid
assert np.ndim(x_grid) == 1
# Put everything into a vector of right type for binary classification
y = y.astype(bool)
log_pred_prob = log_pred_prob[:, pos_label]
# Get estimator on original data. Could use _interp1d directly since only 1
# curve, but this is more consistent with bootstrap version below.
curve = check_curve(curve_f(y, log_pred_prob), x_grid)
auc, = area(*curve)
assert auc.ndim == 0
y_grid, = interp1d(x_grid, *curve)
assert y_grid.shape == x_grid.shape
# Setup boot strap weights
weight = bu.boot_weights(N, n_boot, epsilon=epsilon)
# Get boot strapped scores
curve_boot_ = check_curve(curve_f(y, log_pred_prob, weight), x_grid)
auc_boot = area(*curve_boot_)
assert auc_boot.shape == (n_boot,)
y_grid_boot = interp1d(x_grid, *curve_boot_)
assert y_grid_boot.shape == (n_boot, x_grid.size)
# Repeat area boot strap with reference predictor (if provided)
ref_boot = ref
if np.ndim(ref) == 2: # Note dim must be 0 or 2
ref_boot = area(*check_curve(curve_f(y, ref[:, pos_label], weight)))
assert ref_boot.shape == (n_boot,)
ref, = area(*check_curve(curve_f(y, ref[:, pos_label])))
assert np.ndim(ref) == 0
# Pack up standard numeric summary triple
EB = (
bu.error_bar(auc_boot - ref_boot, auc - ref, confidence=confidence)
if pairwise_CI
else bu.error_bar(auc_boot, auc, confidence=confidence)
)
pval = bu.significance(auc_boot, ref_boot)
summary = (auc, EB, pval)
# Pack up data frame with graphical summaries (performance curves)
# Could also try bu.basic and see which works better
y_LB, y_UB = bu.percentile(y_grid_boot, confidence)
curve = pd.DataFrame(
data=np.stack((x_grid, y_grid, y_LB, y_UB), axis=1), index=range(x_grid.size), columns=CURVE_STATS, dtype=float
)
return summary, curve
[docs]def curve_summary_table(
log_pred_prob_table,
y,
curve_dict,
ref_method,
x_grid=None,
n_boot=1000,
pairwise_CI=PAIRWISE_DEFAULT,
confidence=0.95,
):
"""Build table with mean and error bars of curve summaries from a table of
probalistic predictions.
Parameters
----------
log_pred_prob_table : DataFrame, shape (n_samples, n_methods * n_labels)
DataFrame with predictive distributions. Each row is a data point.
The columns should be hierarchical index that is the cartesian product
of methods x labels. For exampe, ``log_pred_prob_table.loc[5, 'foo']``
is the categorical distribution (in log scale) prediction that method
foo places on ``y[5]``.
y : ndarray of type int or bool, shape (n_samples,)
True labels for each classication data point. Must be of same length as
DataFrame `log_pred_prob_table`.
curve_dict : dict of str to callable
Dictionary mapping curve name to performance curve. Standard choices:
`perf_curves.roc_curve` or `perf_curves.recall_precision_curve`.
ref_method : str
Name of method that is used as reference point in paired statistical
tests. This is usually some some of baseline method. `ref_method` must
be found in the 1st level of the columns of `log_pred_prob_table`.
x_grid : None or ndarray of shape (n_grid,)
Grid of points to evaluate curve in results. If `None`, defaults to
linear grid on [0,1].
n_boot : int
Number of bootstrap iterations to perform.
pairwise_CI : bool
If True, compute error bars on ``summary - summary_ref`` instead of
just the summary. This typically results in smaller error bars.
confidence : float
Confidence probability (in (0, 1)) to construct error bar.
Returns
-------
curve_tbl : DataFrame, shape (n_methods, n_metrics * 3)
DataFrame with curve summary of each method according to each curve.
The rows are the methods. The columns are a hierarchical index that is
the cartesian product of curve x (summary, error bar, p-value).
That is, ``curve_tbl.loc['foo', 'bar']`` is a pandas series with
(summary of bar curve on foo, corresponding error bar, statistical sig)
The statistical significance is a p-value from a two-sided hypothesis
test on the hypothesis H0 that foo has the same curve summary as the
reference method `ref_method`.
curve_dump : dict of (str, str) to DataFrame of shape (n_grid, 4)
Each key is a pair of (method name, curve name) with the value being
a pandas dataframe with the performance curve, which has four columns:
`x_grid`, the curve value, the lower end of confidence envelope,
and the upper end of the confidence envelope.
"""
methods, labels = log_pred_prob_table.columns.levels
N, n_labels = len(log_pred_prob_table), len(labels)
assert y.shape == (N,)
assert ref_method in methods # ==> len(methods) >= 1
assert N >= 1 and n_labels >= 1 and len(curve_dict) >= 1
assert list(log_pred_prob_table[ref_method].columns) == list(range(n_labels))
log_pred_prob_ref = log_pred_prob_table[ref_method].values
assert log_pred_prob_ref.shape == (N, n_labels)
# Note: Most curve methods are rank based and so normalization is not
# needed to prevent cheating. However, if we expect non-normalized methods
# they should be normalized before to keep consistency with loss metrics.
col_names = pd.MultiIndex.from_product([curve_dict.keys(), STD_STATS], names=[METRIC, STAT])
curve_tbl = pd.DataFrame(index=methods, columns=col_names, dtype=float)
curve_tbl.index.set_names(METHOD, inplace=True)
curve_dump = {}
for method in methods:
assert list(log_pred_prob_table[method].columns) == list(range(n_labels))
log_pred_prob = log_pred_prob_table[method].values
assert log_pred_prob.shape == (N, n_labels)
for curve_name, curve_f in curve_dict.items():
R = curve_boot(
y,
log_pred_prob,
ref=log_pred_prob_ref,
curve_f=curve_f,
x_grid=x_grid,
n_boot=n_boot,
pairwise_CI=pairwise_CI,
confidence=confidence,
)
curve_summary, curr_curve = R
curve_tbl.loc[method, curve_name] = curve_summary
if pairwise_CI and method == ref_method:
curve_tbl.loc[method, (curve_name, ERR_COL)] = np.nan
if method == ref_method: # NaN probably makes more sense than 1
curve_tbl.loc[method, (curve_name, PVAL_COL)] = np.nan
curve_dump[(method, curve_name)] = curr_curve
return curve_tbl, curve_dump
[docs]def summary_table(
log_pred_prob_table,
y,
loss_dict,
curve_dict,
ref_method,
x_grid=None,
n_boot=1000,
pairwise_CI=PAIRWISE_DEFAULT,
confidence=0.95,
method_EB="t",
limits={},
):
"""Build table with mean and error bars of both loss and curve summaries
from a table of probalistic predictions.
Parameters
----------
log_pred_prob_table : DataFrame, shape (n_samples, n_methods * n_labels)
DataFrame with predictive distributions. Each row is a data point.
The columns should be hierarchical index that is the cartesian product
of methods x labels. For exampe, ``log_pred_prob_table.loc[5, 'foo']``
is the categorical distribution (in log scale) prediction that method
foo places on ``y[5]``.
y : ndarray of type int or bool, shape (n_samples,)
True labels for each classication data point. Must be of same length as
DataFrame `log_pred_prob_table`.
loss_dict : dict of str to callable
Dictionary mapping loss function name to function that computes loss,
e.g., `log_loss`, `brier_loss`, ...
curve_dict : dict of str to callable
Dictionary mapping curve name to performance curve. Standard choices:
`perf_curves.roc_curve` or `perf_curves.recall_precision_curve`.
ref_method : str
Name of method that is used as reference point in paired statistical
tests. This is usually some some of baseline method. `ref_method` must
be found in the 1st level of the columns of `log_pred_prob_table`.
x_grid : None or ndarray of shape (n_grid,)
Grid of points to evaluate curve in results. If `None`, defaults to
linear grid on [0,1].
n_boot : int
Number of bootstrap iterations to perform for performance curves.
pairwise_CI : bool
If True, compute error bars on ``summary - summary_ref`` instead of
just the summary. This typically results in smaller error bars.
confidence : float
Confidence probability (in (0, 1)) to construct error bar.
method_EB : {'t', 'bernstein', 'boot'}
Method to use for building error bar.
limits : dict of str to (float, float)
Dictionary mapping metric name to tuple with (lower, upper) which are
the theoretical limits on the mean loss. For instance, zero-one loss
should be ``(0.0, 1.0)``. If entry missing, (-inf, inf) is used.
Returns
-------
full_tbl : DataFrame, shape (n_methods, (n_loss + n_curve) * 3)
DataFrame with curve/loss summary of each method according to each
curve or loss function. The rows are the methods. The columns are a
hierarchical index that is the cartesian product of
metric x (summary, error bar, p-value), where metric can be a loss or
a curve summary: ``full_tbl.loc['foo', 'bar']`` is a pandas series
with (metric bar on foo, corresponding error bar, statistical sig)
The statistical significance is a p-value from a two-sided hypothesis
test on the hypothesis H0 that foo has the same metric as the reference
method `ref_method`.
curve_dump : dict of (str, str) to DataFrame of shape (n_grid, 4)
Each key is a pair of (method name, curve name) with the value being
a pandas dataframe with the performance curve, which has four columns:
`x_grid`, the curve value, the lower end of confidence envelope,
and the upper end of the confidence envelope. Only metrics from
`curve_dict` and *not* from `loss_dict` are found here.
"""
# Do the curve metrics
curve_summary, dump_tbl = curve_summary_table(
log_pred_prob_table,
y,
curve_dict,
ref_method,
x_grid=x_grid,
n_boot=n_boot,
pairwise_CI=pairwise_CI,
confidence=confidence,
)
# Do loss based metrics
loss_tbl = loss_table(log_pred_prob_table, y, loss_dict)
loss_summary = loss_summary_table(
loss_tbl, ref_method, pairwise_CI=pairwise_CI, confidence=confidence, method_EB=method_EB, limits=limits
)
# Return the combo
full_tbl = pd.concat((loss_summary, curve_summary), axis=1)
return full_tbl, dump_tbl
# ============================================================================
# Variables and functions to make getting results from sklearn objects easy
# ============================================================================
# Pre-build some standard metric dicts for the user
STD_CLASS_LOSS = {"NLL": log_loss, "Brier": brier_loss, "sphere": spherical_loss, "zero_one": hard_loss}
STD_BINARY_CURVES = {"AUC": pc.roc_curve, "AP": pc.recall_precision_curve, "AUPRG": pc.prg_curve}
[docs]class JustNoise:
"""Class version of iid predictor compatible with sklearn interface. Same
as ``sklearn.dummy.DummyClassifier(strategy='prior').``"""
def __init__(self, n_labels=2, pseudo_count=0.0):
self.pred = np.nan + np.zeros(n_labels)
self.pseudo_count = pseudo_count
def fit(self, X_train, y_train):
n_labels = len(self.pred)
n_points, = np.shape(y_train)
counts = self.pseudo_count + np.sum(one_hot(y_train, n_labels), axis=0)
n_total = n_points + n_labels * self.pseudo_count
self.pred = np.log(counts / n_total)
assert self.pred.shape == (n_labels,)
def predict_log_proba(self, X_test):
n_samples = X_test.shape[0]
pred_log_prob = np.repeat([self.pred], n_samples, axis=0)
return pred_log_prob
[docs]def get_pred_log_prob(
X_train, y_train, X_test, n_labels, methods, min_log_prob=-np.inf, verbose=False, checkpointdir=None
):
"""Get the predictive probability tables for each test point on a
collection of classification methods.
Parameters
----------
X_train : ndarray, shape (n_train, n_features)
Training set 2d feature array for classifiers. Each row is an
indepedent data point and each column is a feature.
y_train : ndarray of type int or bool, shape (n_train,)
Training set 1d array of truth labels for classifiers. Must be of same
length as `X_train`. Values must be in range [0, `n_labels`) or `bool`.
X_test : ndarray, shape (n_test, n_features)
Test set 2d feature array for classifiers. Each row is an indepedent
data point and each column is a feature.
n_labels : int
Number of labels, must be >= 1. This is not infered from `y` because
some labels may not be found in small data chunks.
methods : dict of str to sklearn estimator
Dictionary mapping method name (`str`) to object that performs training
and test. Object must follow the interface of sklearn estimators, that
is it has a ``fit()`` method and either a ``predict_log_proba()`` or
``predict_proba()`` method.
min_log_prob : float
Minimum value to floor the predictive log probabilities (while still
normalizing). Must be < 0. Useful to prevent inf log loss penalties.
verbose : bool
If True, display which method being trained.
checkpointdir : str (directory)
If provided, stores checkpoint results using joblib for the train/test
in case process interrupted. If None, no checkpointing is done.
Returns
-------
log_pred_prob_table : DataFrame, shape (n_samples, n_methods * n_labels)
DataFrame with predictive distributions. Each row is a data point.
The columns should be hierarchical index that is the cartesian product
of methods x labels. For exampe, ``log_pred_prob_table.loc[5, 'foo']``
is the categorical distribution (in log scale) prediction that method
foo places on ``y[5]``.
Notes
-----
If a train/test operation is loaded from a checkpoint file, the estimator
object in methods will not be in a fit state.
"""
n_test = X_test.shape[0]
assert n_test > 0
# Allow ndim >= 2 since some input data (e.g. images) come that way
assert X_train.ndim >= 2
assert y_train.shape == (X_train.shape[0],)
assert y_train.dtype.kind in ("b", "i")
assert 0 <= y_train.min() and y_train.max() < n_labels
assert X_test.ndim == X_train.ndim and X_test.shape[1] == X_train.shape[1]
assert X_train.dtype.kind == X_test.dtype.kind # Would be weird otherwise
assert min_log_prob < 0.0 # Ensure is a log-prob
memory = Memory(cachedir=checkpointdir, verbose=0)
@memory.cache
def train_predict(method_obj, X_train, y_train, X_test):
method_obj.fit(X_train, y_train)
try:
pred_log_prob = method_obj.predict_log_proba(X_test)
except: # noqa: E722 If there is no log proba available
# TODO add exception type
with np.errstate(divide="ignore"): # Not unusual to have some p=0 cases
pred_log_prob = np.log(method_obj.predict_proba(X_test))
return pred_log_prob
col_names = pd.MultiIndex.from_product([methods.keys(), range(n_labels)], names=[METHOD, LABEL])
log_pred_prob_table = pd.DataFrame(index=range(n_test), columns=col_names, dtype=float)
for method_name, method_obj in methods.items():
if verbose:
print("Running fit/predict for {}".format(method_name))
pred_log_prob = train_predict(method_obj, X_train, y_train, X_test)
assert pred_log_prob.shape == (n_test, n_labels)
pred_log_prob = normalize(np.maximum(min_log_prob, pred_log_prob))
log_pred_prob_table.loc[:, method_name] = pred_log_prob
return log_pred_prob_table
[docs]def just_benchmark(
X_train,
y_train,
X_test,
y_test,
n_labels,
methods,
loss_dict,
curve_dict,
ref_method,
min_pred_log_prob=-np.inf,
pairwise_CI=PAIRWISE_DEFAULT,
method_EB="t",
limits={},
):
"""Simplest one-call interface to this package. Just pass it data and
method objects and a performance summary DataFrame is returned.
Parameters
----------
X_train : ndarray, shape (n_train, n_features)
Training set 2d feature array for classifiers. Each row is an
indepedent data point and each column is a feature.
y_train : ndarray of type int or bool, shape (n_train,)
Training set 1d array of truth labels for classifiers. Must be of same
length as `X_train`. Values must be in range [0, `n_labels`) or `bool`.
X_test : ndarray, shape (n_test, n_features)
Test set 2d feature array for classifiers. Each row is an indepedent
data point and each column is a feature.
y_test : ndarray of type int or bool, shape (n_test,)
Test set 1d array of truth labels for classifiers. Must be of same
length as `X_test`. Values must be in range [0, `n_labels`) or `bool`.
n_labels : int
Number of labels, must be >= 1. This is not infered from `y` because
some labels may not be found in small data chunks.
methods : dict of str to sklearn estimator
Dictionary mapping method name (`str`) to object that performs training
and test. Object must follow the interface of sklearn estimators, that
is it has a ``fit()`` method and either a ``predict_log_proba()`` or
``predict_proba()`` method.
loss_dict : dict of str to callable
Dictionary mapping loss function name to function that computes loss,
e.g., `log_loss`, `brier_loss`, ...
curve_dict : dict of str to callable
Dictionary mapping curve name to performance curve. Standard choices:
`perf_curves.roc_curve` or `perf_curves.recall_precision_curve`.
ref_method : str
Name of method that is used as reference point in paired statistical
tests. This is usually some some of baseline method. `ref_method` must
be found in `methods` dictionary.
min_log_prob : float
Minimum value to floor the predictive log probabilities (while still
normalizing). Must be < 0. Useful to prevent inf log loss penalties.
pairwise_CI : bool
If True, compute error bars on the mean of ``loss - loss_ref`` instead
of just the mean of `loss`. This typically gives smaller error bars.
method_EB : {'t', 'bernstein', 'boot'}
Method to use for building error bar.
limits : dict of str to (float, float)
Dictionary mapping metric name to tuple with (lower, upper) which are
the theoretical limits on the mean loss. For instance, zero-one loss
should be ``(0.0, 1.0)``. If entry missing, (-inf, inf) is used.
Returns
-------
full_tbl : DataFrame, shape (n_methods, (n_loss + n_curve) * 3)
DataFrame with curve/loss summary of each method according to each
curve or loss function. The rows are the methods. The columns are a
hierarchical index that is the cartesian product of
metric x (summary, error bar, p-value), where metric can be a loss or
a curve summary: ``full_tbl.loc['foo', 'bar']`` is a pandas series
with (metric bar on foo, corresponding error bar, statistical sig)
The statistical significance is a p-value from a two-sided hypothesis
test on the hypothesis H0 that foo has the same metric as the reference
method `ref_method`.
curve_dump : dict of (str, str) to DataFrame of shape (n_grid, 4)
Each key is a pair of (method name, curve name) with the value being
a pandas dataframe with the performance curve, which has four columns:
`x_grid`, the curve value, the lower end of confidence envelope,
and the upper end of the confidence envelope. Only metrics from
`curve_dict` and *not* from `loss_dict` are found here.
"""
assert y_train.dtype == y_test.dtype # Would be weird otherwise
pred_tbl = get_pred_log_prob(X_train, y_train, X_test, n_labels, methods, min_log_prob=min_pred_log_prob)
full_tbl, dump = summary_table(
pred_tbl, y_test, loss_dict, curve_dict, ref_method, pairwise_CI=pairwise_CI, method_EB=method_EB, limits=limits
)
return full_tbl, dump