# Ryan Turner (turnerry@iro.umontreal.ca)
from __future__ import absolute_import, division, print_function
import numpy as np
import pandas as pd
import scipy.stats as ss
import mlpaper.boot_util as bu
from mlpaper.constants import METHOD, METRIC, PAIRWISE_DEFAULT, STAT, STD_STATS
from mlpaper.util import clip_chk
N_BOOT = 1000 # Default number of bootstrap replications
# ============================================================================
# Statistical util functions
# ============================================================================
[docs]def clip_EB(mu, EB, lower=-np.inf, upper=np.inf, min_EB=0.0):
"""Clip error bars to both a minimum uncertainty level and a maximum level
determined by trivial error bars from the a prior known limits of the
unknown parameter `theta`. Similar to `np.clip`, but for error bars.
Parameters
----------
mu : float
Point estimate of unknown parameter `theta` around which error bars are
based.
EB : float
Size of error bar around `mu` (``EB > 0``). The confidence interval on
`theta` is ``[mu - EB, mu + EB]``.
lower : float
A priori known theoretical lower limit on unknown parameter `theta`.
For instance, for mean zero-one loss, ``lower=0``.
upper : float
A priori known theoretical upper limit on unknown parameter `theta`.
For instance, for mean zero-one loss, ``upper=1``.
min_EB : float
Minimum size beleivable size of error bar. Typically, leave
``min_EB=0`` for simplicity.
Returns
-------
EB : float
Error bar after possible clipping.
"""
assert np.ndim(mu) == 0 and np.ndim(EB) == 0
assert np.ndim(lower) == 0 and np.ndim(upper) == 0
assert upper - lower >= 0.0 # Also catch (inf, inf) or nans
assert np.ndim(min_EB) == 0
assert 0.0 <= min_EB and min_EB < np.inf
# Note: These conditions are designed to pass when NaNs are supplied.
if lower > mu or mu > upper:
raise ValueError("mu %f outside of given limits (%f, %f)" % (mu, lower, upper))
if 2 * min_EB > upper - lower:
raise ValueError("min error bar %f too small for limits (%f, %f)" % (min_EB, lower, upper))
with np.errstate(invalid="ignore"): # expect non-finite here
EB_trivial = np.fmax(upper - mu, mu - lower)
assert not (min_EB > EB_trivial) # Let NaNs pass
EB = np.clip(EB, min_EB, EB_trivial)
return EB
[docs]def t_test(x):
"""Perform a standard t-test to test if the values in `x` are sampled from
a distribution with a zero mean.
Parameters
----------
x : array-like, shape (n_samples,)
array of data points to test.
Returns
-------
pval : float
p-value (in [0,1]) from t-test on `x`.
"""
assert np.ndim(x) == 1 and (not np.any(np.isnan(x)))
if (len(x) <= 1) or (not np.all(np.isfinite(x))):
return 1.0 # Can't say anything about scale => p=1
_, pval = ss.ttest_1samp(x, 0.0)
if np.isnan(pval):
# Should only be possible if scale underflowed to zero:
assert np.var(x, ddof=1) <= 1e-100
# It is debatable if the condition should be ``np.mean(x) == 0.0`` or
# ``np.all(x == 0.0)``. Should not matter in practice.
pval = np.float(np.mean(x) == 0.0)
assert 0.0 <= pval and pval <= 1.0
return pval
[docs]def t_EB(x, confidence=0.95):
"""Get t statistic based error bars on mean of `x`.
Parameters
----------
x : array-like, shape (n_samples,)
Data points to estimate mean. Must not be empty or contain NaNs.
confidence : float
Confidence probability (in (0, 1)) to construct confidence interval
from t statistic.
Returns
-------
EB : float
Size of error bar on mean (>= 0). The confidence interval is
``[mean(x) - EB, mean(x) + EB]``. `EB` is inf when ``len(x) <= 1``.
"""
assert np.ndim(x) == 1 and (not np.any(np.isnan(x)))
assert np.ndim(confidence) == 0
assert 0.0 < confidence and confidence < 1.0
N = len(x)
if (N <= 1) or (not np.all(np.isfinite(x))):
return np.inf
# loc cancels out when we just want EB anyway
LB, UB = ss.t.interval(confidence, N - 1, loc=0.0, scale=1.0)
assert not (LB > UB)
# Just multiplying scale=ss.sem(x) is better for when scale=0
EB = 0.5 * ss.sem(x) * (UB - LB)
assert np.ndim(EB) == 0 and EB >= 0.0
return EB
[docs]def bernstein_test(x, lower, upper):
"""Perform Bernstein bound-based test to test if the values in `x` are
sampled from a distribution with a zero mean. This test makes no
distributional or central limit theorem assumption on `x`.
As a result the bound may be loose and the p-value will not be sampled from
a uniform distribution under H0 (E[x] = 0), but rather be skewed larger
than uniform.
Parameters
----------
x : array-like, shape (n_samples,)
array of data points to test.
lower : float
A priori known theoretical lower limit on unknown mean. For instance,
for mean zero-one loss, ``lower=0``.
upper : float
A priori known theoretical upper limit on unknown mean. For instance,
for mean zero-one loss, ``upper=1``.
Returns
-------
pval : float
p-value (in [0,1]) from t-test on `x`.
"""
assert np.ndim(x) == 1 and (not np.any(np.isnan(x)))
assert np.ndim(lower) == 0 and np.ndim(upper) == 0
range_ = upper - lower
assert range_ >= 0.0 # Also catch (inf, inf) or nans
assert np.all(lower <= x) and np.all(x <= upper)
if (len(x) <= 1) or (not np.all(np.isfinite(x))):
return 1.0 # Can't say anything about scale => p=1
if (range_ == 0.0) or (range_ == np.inf):
# If range_ = inf, we could use p=0, if 0 is outside of [lower, upper],
# but it is unclear if there is any advantage to the extra hassle.
# If range_ = 0, then roots not invertible and distn on data x is a
# point mass => everything has p=1.
return 1.0
# Get the moments
N = len(x)
mu = np.mean(x)
std = np.std(x, ddof=0)
coef = [(3.0 * range_) / N, std * np.sqrt(2.0 / N), -np.abs(mu)]
assert np.all(np.isfinite(coef)) # Should have caught non-finite cases
coef_roots = np.roots(coef)
assert len(coef_roots) == 2
assert coef_roots.dtype.kind == "f" # Appears roots are always real
# Appears to always be one neg and one pos root, but we looking for square
# root so the positive one is the correct one. The roots can be zero.
assert np.sum(coef_roots <= 0.0) >= 1
assert np.sum(coef_roots >= 0.0) >= 1
B = np.max(coef_roots) ** 2 # Bernstein test statistic
# Sampling CDF is bounded by exponential for any true distn on x.
delta = 3.0 * np.exp(-B)
pval = np.minimum(1.0, delta) # Can cap at 1 to make p-value
assert 0.0 <= pval and pval <= 1.0
return pval
[docs]def bernstein_EB(x, lower, upper, confidence=0.95):
"""Get Bernstein bound based error bars on mean of `x`. This error bar
makes no distributional or central limit theorem assumption on `x`.
Parameters
----------
x : array-like, shape (n_samples,)
Data points to estimate mean. Must not be empty or contain NaNs.
lower : float
A priori known theoretical lower limit on unknown mean. For instance,
for mean zero-one loss, ``lower=0``.
upper : float
A priori known theoretical upper limit on unknown mean. For instance,
for mean zero-one loss, ``upper=1``.
confidence : float
Confidence probability (in (0, 1)) to construct confidence interval
from t statistic.
Returns
-------
EB : float
Size of error bar on mean (>= 0). The confidence interval is
``[mean(x) - EB, mean(x) + EB]``. ``EB = upper - lower`` is inf when
``len(x) = 0``.
Notes
-----
This does not do clipping of to trivial error bars, i.e., `EB` could be
larger than ``upper - lower``. However, `clip_EB` can be called to enforce
trivial error bar limits.
References
----------
Audibert, Jean-Yves, Remi Munos, and Csaba Szepesvari.
"Exploration-exploitation tradeoff using variance estimates in multi-armed
bandits." Theoretical Computer Science 410.19 (2009): 1876-1902.
"""
assert np.ndim(x) == 1 and (not np.any(np.isnan(x)))
assert np.ndim(lower) == 0 and np.ndim(upper) == 0
range_ = upper - lower
assert range_ >= 0.0 # Also catch (inf, inf) or nans
assert np.all(lower <= x) and np.all(x <= upper)
assert np.ndim(confidence) == 0
assert 0.0 < confidence and confidence < 1.0
N = x.size
if (N <= 1) or (not np.all(np.isfinite(x))):
return range_
# From Thm 1 of Audibert et. al. (2009), must use MLE for std ==> ddof=0
delta = 1.0 - confidence
A = np.log(3.0 / delta)
EB = np.std(x, ddof=0) * np.sqrt((2.0 * A) / N) + (3.0 * A * range_) / N
assert np.ndim(EB) == 0 and EB >= 0.0
return EB
def _boot_EB_and_test(x, confidence=0.95, n_boot=N_BOOT, return_EB=True, return_test=True, return_CI=False):
"""Internal helper function to compute both bootstrap EB and significance
using the same random bootstrap weights, which saves computation and
guarantees the results are coherent with each other."""
assert np.ndim(x) == 1 and (not np.any(np.isnan(x)))
# confidence is checked by bu.error_bar
N = x.size
if (N <= 1) or (not np.all(np.isfinite(x))):
return np.inf, 1.0, (-np.inf, np.inf)
weight = bu.boot_weights(N, n_boot)
mu_boot = np.mean(x * weight, axis=1)
pval = bu.significance(mu_boot, ref=0.0) if return_test else 1.0
EB = np.inf
if return_EB:
mu = np.mean(x)
EB = bu.error_bar(mu_boot, mu, confidence=confidence)
# Useful in test:
CI = -np.inf, np.inf
if return_CI:
CI = bu.percentile(mu_boot, confidence=confidence)
return EB, pval, CI
[docs]def boot_test(x, n_boot=N_BOOT):
"""Perform a bootstrap-based test to test if the values in `x` are sampled
from a distribution with a zero mean.
Parameters
----------
x : array-like, shape (n_samples,)
array of data points to test.
n_boot : int
Number of bootstrap iterations to perform.
Returns
-------
pval : float
p-value (in [0,1]) from t-test on `x`.
"""
_, pval, _ = _boot_EB_and_test(x, n_boot=n_boot, return_EB=False, return_test=True)
assert 0.0 <= pval and pval <= 1.0
return pval
[docs]def boot_EB(x, confidence=0.95, n_boot=N_BOOT):
"""Get bootstrap bound based error bars on mean of `x`.
Parameters
----------
x : array-like, shape (n_samples,)
Data points to estimate mean. Must not be empty or contain NaNs.
confidence : float
Confidence probability (in (0, 1)) to construct confidence interval
from t statistic.
n_boot : int
Number of bootstrap iterations to perform.
Returns
-------
EB : float
Size of error bar on mean (>= 0). The confidence interval is
``[mean(x) - EB, mean(x) + EB]``. `EB` is inf when ``len(x) <= 1``.
"""
EB, _, _ = _boot_EB_and_test(x, confidence=confidence, n_boot=n_boot, return_EB=True, return_test=False)
assert np.ndim(EB) == 0 and EB >= 0.0
return EB
[docs]def get_mean_and_EB(x, confidence=0.95, min_EB=0.0, lower=-np.inf, upper=np.inf, method="t"):
"""Get mean loss and estimated error bar.
Parameters
----------
x : ndarray, shape (n_samples,)
Array of independent observations.
confidence : float
Confidence probability (in (0, 1)) to construct error bar.
min_EB : float
Minimum size of resulting error bar regardless of the data in `x`.
lower : float
A priori known theoretical lower limit on unknown mean of `x`. For
instance, for mean zero-one loss, ``lower=0``.
upper : float
A priori known theoretical upper limit on unknown mean of `x`. For
instance, for mean zero-one loss, ``upper=1``.
method : {'t', 'bernstein', 'boot'}
Method to use for building error bar.
Returns
-------
mu : float
Estimated mean of `x`.
EB : float
Size of error bar on mean of `x` (``EB > 0``). The confidence interval
is ``[mu - EB, mu + EB]``.
"""
assert np.all(lower <= x) and np.all(x <= upper)
if method == "t":
EB = t_EB(x, confidence=confidence)
elif method == "bernstein":
EB = bernstein_EB(x, lower, upper, confidence=confidence)
elif method == "boot":
EB = boot_EB(x, confidence=confidence)
else:
assert False
# EB subroutines already validated x for shape and nans
mu = clip_chk(np.mean(x), lower, upper)
EB = clip_EB(mu, EB, lower, upper, min_EB=min_EB)
return mu, EB
[docs]def get_test(x, lower=-np.inf, upper=np.inf, method="t"):
"""Perform a statistical test to determine if the values in `x` are sampled
from a distribution with a zero mean.
Parameters
----------
x : ndarray, shape (n_samples,)
Array of independent observations.
lower : float
A priori known theoretical lower limit on unknown mean of `x`. For
instance, for mean zero-one loss, ``lower=0``.
upper : float
A priori known theoretical upper limit on unknown mean of `x`. For
instance, for mean zero-one loss, ``upper=1``.
method : {'t', 'bernstein', 'boot'}
Method to use statistical test.
Returns
-------
pval : float
p-value (in [0,1]) from statistical test on `x`.
"""
if method == "t":
pval = t_test(x)
elif method == "bernstein":
pval = bernstein_test(x, lower, upper)
elif method == "boot":
pval = boot_test(x)
else:
assert False
return pval
[docs]def get_mean_EB_test(x, confidence=0.95, min_EB=0.0, lower=-np.inf, upper=np.inf, method="t"):
"""Get mean loss and estimated error bar. Also, perform a statistical test
to determine if the values in `x` are sampled from a distribution with a
zero mean.
Parameters
----------
x : ndarray, shape (n_samples,)
Array of independent observations.
confidence : float
Confidence probability (in (0, 1)) to construct error bar.
min_EB : float
Minimum size of resulting error bar regardless of the data in `x`.
lower : float
A priori known theoretical lower limit on unknown mean of `x`. For
instance, for mean zero-one loss, ``lower=0``.
upper : float
A priori known theoretical upper limit on unknown mean of `x`. For
instance, for mean zero-one loss, ``upper=1``.
method : {'t', 'bernstein', 'boot'}
Method to use for building error bar.
Returns
-------
mu : float
Estimated mean of `x`.
EB : float
Size of error bar on mean of `x` (``EB > 0``). The confidence interval
is ``[mu - EB, mu + EB]``.
pval : float
p-value (in [0,1]) from statistical test on `x`.
"""
assert np.all(lower <= x) and np.all(x <= upper)
if method == "t":
EB = t_EB(x, confidence=confidence)
pval = t_test(x)
elif method == "bernstein":
EB = bernstein_EB(x, lower, upper, confidence=confidence)
pval = bernstein_test(x, lower, upper)
elif method == "boot":
EB, pval, _ = _boot_EB_and_test(x, confidence=confidence)
else:
assert False
# EB subroutines already validated x for shape and nans
mu = clip_chk(np.mean(x), lower, upper)
EB = clip_EB(mu, EB, lower, upper, min_EB=min_EB)
return mu, EB, pval
# ============================================================================
# Loss summary: the main purpose of this file.
# ============================================================================
[docs]def loss_summary_table(loss_table, ref_method, pairwise_CI=PAIRWISE_DEFAULT, confidence=0.95, method_EB="t", limits={}):
"""Build table with mean and error bar summaries from a loss table that
contains losses on a per data point basis.
Parameters
----------
loss_tbl : DataFrame, shape (n_samples, n_metrics * n_methods)
DataFrame with loss of each method according to each loss function on
each data point. The rows are the data points in `y` (that is the index
matches `log_pred_prob_table`). The columns are a hierarchical index
that is the cartesian product of loss x method. That is, the loss of
method foo's prediction of ``y[5]`` according to loss function bar is
stored in ``loss_tbl.loc[5, ('bar', 'foo')]``.
ref_method : str
Name of method that is used as reference point in paired statistical
tests. This is usually some some of baseline method. `ref_method` must
be found in the 2nd level of the columns of `loss_tbl`.
pairwise_CI : bool
If True, compute error bars on the mean of ``loss - loss_ref`` instead
of just the mean of `loss`. This typically gives smaller error bars.
confidence : float
Confidence probability (in (0, 1)) to construct error bar.
method_EB : {'t', 'bernstein', 'boot'}
Method to use for building error bar.
limits : dict of str to (float, float)
Dictionary mapping metric name to tuple with (lower, upper) which are
the theoretical limits on the mean loss. For instance, zero-one loss
should be ``(0.0, 1.0)``. If entry missing, (-inf, inf) is used.
Returns
-------
perf_tbl : DataFrame, shape (n_methods, n_metrics * 3)
DataFrame with mean loss of each method according to each loss
function. The rows are the methods. The columns are a hierarchical
index that is the cartesian product of
loss x (mean, error bar, p-value). That is,
``perf_tbl.loc['foo', 'bar']`` is a pandas series with
(mean loss of foo on bar, corresponding error bar, statistical sig)
The statistical significance is a p-value from a two-sided hypothesis
test on the hypothesis H0 that foo has the same mean loss as the
reference method `ref_method`.
"""
assert loss_table.columns.names == (METRIC, METHOD)
metrics, methods = loss_table.columns.levels
assert ref_method in methods # ==> len(methods) >= 1
assert len(loss_table) >= 1 and len(metrics) >= 1
# Could also test these are cartesian product if we wanted to be exhaustive
col_names = pd.MultiIndex.from_product([metrics, STD_STATS], names=[METRIC, STAT])
perf_tbl = pd.DataFrame(index=methods, columns=col_names, dtype=float)
perf_tbl.index.set_names(METHOD, inplace=True)
for metric in metrics:
lower, upper = limits.get(metric, (-np.inf, np.inf))
assert lower <= upper
loss_ref = loss_table.loc[:, (metric, ref_method)].values
assert loss_ref.ndim == 1 # Weird stuff happens if names not unique
assert not np.any(np.isnan(loss_ref)) # Would let method cheat
assert np.all(lower <= loss_ref) and np.all(loss_ref <= upper)
for method in methods:
loss = loss_table.loc[:, (metric, method)].values
assert loss.ndim == 1 # Weird stuff happens if names not unique
assert not np.any(np.isnan(loss)) # Would let method cheat
assert np.all(lower <= loss) and np.all(loss <= upper)
mu = np.mean(loss) # This is the same in all cases
deltas = loss - loss_ref
range_ = upper - lower
self_comparison = method == ref_method
EB, pval = np.nan, np.nan
if pairwise_CI:
if not self_comparison: # Otherwise leave both as nan
_, EB, pval = get_mean_EB_test(deltas, confidence, lower=-range_, upper=range_, method=method_EB)
else:
mu_, EB = get_mean_and_EB(loss, confidence=confidence, lower=lower, upper=upper, method=method_EB)
assert mu_ == mu
if not self_comparison: # Otherwise pval as nan
pval = get_test(deltas, lower=-range_, upper=range_, method=method_EB)
# This is two-sided, could include one-sided option too.
perf_tbl.loc[method, metric] = (mu, EB, pval)
return perf_tbl