Source code for mlpaper.mlpaper

# Ryan Turner (turnerry@iro.umontreal.ca)
from __future__ import absolute_import, division, print_function

import numpy as np
import pandas as pd
import scipy.stats as ss

import mlpaper.boot_util as bu
from mlpaper.constants import METHOD, METRIC, PAIRWISE_DEFAULT, STAT, STD_STATS
from mlpaper.util import clip_chk

N_BOOT = 1000  # Default number of bootstrap replications

# ============================================================================
# Statistical util functions
# ============================================================================


[docs]def clip_EB(mu, EB, lower=-np.inf, upper=np.inf, min_EB=0.0): """Clip error bars to both a minimum uncertainty level and a maximum level determined by trivial error bars from the a prior known limits of the unknown parameter `theta`. Similar to `np.clip`, but for error bars. Parameters ---------- mu : float Point estimate of unknown parameter `theta` around which error bars are based. EB : float Size of error bar around `mu` (``EB > 0``). The confidence interval on `theta` is ``[mu - EB, mu + EB]``. lower : float A priori known theoretical lower limit on unknown parameter `theta`. For instance, for mean zero-one loss, ``lower=0``. upper : float A priori known theoretical upper limit on unknown parameter `theta`. For instance, for mean zero-one loss, ``upper=1``. min_EB : float Minimum size beleivable size of error bar. Typically, leave ``min_EB=0`` for simplicity. Returns ------- EB : float Error bar after possible clipping. """ assert np.ndim(mu) == 0 and np.ndim(EB) == 0 assert np.ndim(lower) == 0 and np.ndim(upper) == 0 assert upper - lower >= 0.0 # Also catch (inf, inf) or nans assert np.ndim(min_EB) == 0 assert 0.0 <= min_EB and min_EB < np.inf # Note: These conditions are designed to pass when NaNs are supplied. if lower > mu or mu > upper: raise ValueError("mu %f outside of given limits (%f, %f)" % (mu, lower, upper)) if 2 * min_EB > upper - lower: raise ValueError("min error bar %f too small for limits (%f, %f)" % (min_EB, lower, upper)) with np.errstate(invalid="ignore"): # expect non-finite here EB_trivial = np.fmax(upper - mu, mu - lower) assert not (min_EB > EB_trivial) # Let NaNs pass EB = np.clip(EB, min_EB, EB_trivial) return EB
[docs]def t_test(x): """Perform a standard t-test to test if the values in `x` are sampled from a distribution with a zero mean. Parameters ---------- x : array-like, shape (n_samples,) array of data points to test. Returns ------- pval : float p-value (in [0,1]) from t-test on `x`. """ assert np.ndim(x) == 1 and (not np.any(np.isnan(x))) if (len(x) <= 1) or (not np.all(np.isfinite(x))): return 1.0 # Can't say anything about scale => p=1 _, pval = ss.ttest_1samp(x, 0.0) if np.isnan(pval): # Should only be possible if scale underflowed to zero: assert np.var(x, ddof=1) <= 1e-100 # It is debatable if the condition should be ``np.mean(x) == 0.0`` or # ``np.all(x == 0.0)``. Should not matter in practice. pval = np.float(np.mean(x) == 0.0) assert 0.0 <= pval and pval <= 1.0 return pval
[docs]def t_EB(x, confidence=0.95): """Get t statistic based error bars on mean of `x`. Parameters ---------- x : array-like, shape (n_samples,) Data points to estimate mean. Must not be empty or contain NaNs. confidence : float Confidence probability (in (0, 1)) to construct confidence interval from t statistic. Returns ------- EB : float Size of error bar on mean (>= 0). The confidence interval is ``[mean(x) - EB, mean(x) + EB]``. `EB` is inf when ``len(x) <= 1``. """ assert np.ndim(x) == 1 and (not np.any(np.isnan(x))) assert np.ndim(confidence) == 0 assert 0.0 < confidence and confidence < 1.0 N = len(x) if (N <= 1) or (not np.all(np.isfinite(x))): return np.inf # loc cancels out when we just want EB anyway LB, UB = ss.t.interval(confidence, N - 1, loc=0.0, scale=1.0) assert not (LB > UB) # Just multiplying scale=ss.sem(x) is better for when scale=0 EB = 0.5 * ss.sem(x) * (UB - LB) assert np.ndim(EB) == 0 and EB >= 0.0 return EB
[docs]def bernstein_test(x, lower, upper): """Perform Bernstein bound-based test to test if the values in `x` are sampled from a distribution with a zero mean. This test makes no distributional or central limit theorem assumption on `x`. As a result the bound may be loose and the p-value will not be sampled from a uniform distribution under H0 (E[x] = 0), but rather be skewed larger than uniform. Parameters ---------- x : array-like, shape (n_samples,) array of data points to test. lower : float A priori known theoretical lower limit on unknown mean. For instance, for mean zero-one loss, ``lower=0``. upper : float A priori known theoretical upper limit on unknown mean. For instance, for mean zero-one loss, ``upper=1``. Returns ------- pval : float p-value (in [0,1]) from t-test on `x`. """ assert np.ndim(x) == 1 and (not np.any(np.isnan(x))) assert np.ndim(lower) == 0 and np.ndim(upper) == 0 range_ = upper - lower assert range_ >= 0.0 # Also catch (inf, inf) or nans assert np.all(lower <= x) and np.all(x <= upper) if (len(x) <= 1) or (not np.all(np.isfinite(x))): return 1.0 # Can't say anything about scale => p=1 if (range_ == 0.0) or (range_ == np.inf): # If range_ = inf, we could use p=0, if 0 is outside of [lower, upper], # but it is unclear if there is any advantage to the extra hassle. # If range_ = 0, then roots not invertible and distn on data x is a # point mass => everything has p=1. return 1.0 # Get the moments N = len(x) mu = np.mean(x) std = np.std(x, ddof=0) coef = [(3.0 * range_) / N, std * np.sqrt(2.0 / N), -np.abs(mu)] assert np.all(np.isfinite(coef)) # Should have caught non-finite cases coef_roots = np.roots(coef) assert len(coef_roots) == 2 assert coef_roots.dtype.kind == "f" # Appears roots are always real # Appears to always be one neg and one pos root, but we looking for square # root so the positive one is the correct one. The roots can be zero. assert np.sum(coef_roots <= 0.0) >= 1 assert np.sum(coef_roots >= 0.0) >= 1 B = np.max(coef_roots) ** 2 # Bernstein test statistic # Sampling CDF is bounded by exponential for any true distn on x. delta = 3.0 * np.exp(-B) pval = np.minimum(1.0, delta) # Can cap at 1 to make p-value assert 0.0 <= pval and pval <= 1.0 return pval
[docs]def bernstein_EB(x, lower, upper, confidence=0.95): """Get Bernstein bound based error bars on mean of `x`. This error bar makes no distributional or central limit theorem assumption on `x`. Parameters ---------- x : array-like, shape (n_samples,) Data points to estimate mean. Must not be empty or contain NaNs. lower : float A priori known theoretical lower limit on unknown mean. For instance, for mean zero-one loss, ``lower=0``. upper : float A priori known theoretical upper limit on unknown mean. For instance, for mean zero-one loss, ``upper=1``. confidence : float Confidence probability (in (0, 1)) to construct confidence interval from t statistic. Returns ------- EB : float Size of error bar on mean (>= 0). The confidence interval is ``[mean(x) - EB, mean(x) + EB]``. ``EB = upper - lower`` is inf when ``len(x) = 0``. Notes ----- This does not do clipping of to trivial error bars, i.e., `EB` could be larger than ``upper - lower``. However, `clip_EB` can be called to enforce trivial error bar limits. References ---------- Audibert, Jean-Yves, Remi Munos, and Csaba Szepesvari. "Exploration-exploitation tradeoff using variance estimates in multi-armed bandits." Theoretical Computer Science 410.19 (2009): 1876-1902. """ assert np.ndim(x) == 1 and (not np.any(np.isnan(x))) assert np.ndim(lower) == 0 and np.ndim(upper) == 0 range_ = upper - lower assert range_ >= 0.0 # Also catch (inf, inf) or nans assert np.all(lower <= x) and np.all(x <= upper) assert np.ndim(confidence) == 0 assert 0.0 < confidence and confidence < 1.0 N = x.size if (N <= 1) or (not np.all(np.isfinite(x))): return range_ # From Thm 1 of Audibert et. al. (2009), must use MLE for std ==> ddof=0 delta = 1.0 - confidence A = np.log(3.0 / delta) EB = np.std(x, ddof=0) * np.sqrt((2.0 * A) / N) + (3.0 * A * range_) / N assert np.ndim(EB) == 0 and EB >= 0.0 return EB
def _boot_EB_and_test(x, confidence=0.95, n_boot=N_BOOT, return_EB=True, return_test=True, return_CI=False): """Internal helper function to compute both bootstrap EB and significance using the same random bootstrap weights, which saves computation and guarantees the results are coherent with each other.""" assert np.ndim(x) == 1 and (not np.any(np.isnan(x))) # confidence is checked by bu.error_bar N = x.size if (N <= 1) or (not np.all(np.isfinite(x))): return np.inf, 1.0, (-np.inf, np.inf) weight = bu.boot_weights(N, n_boot) mu_boot = np.mean(x * weight, axis=1) pval = bu.significance(mu_boot, ref=0.0) if return_test else 1.0 EB = np.inf if return_EB: mu = np.mean(x) EB = bu.error_bar(mu_boot, mu, confidence=confidence) # Useful in test: CI = -np.inf, np.inf if return_CI: CI = bu.percentile(mu_boot, confidence=confidence) return EB, pval, CI
[docs]def boot_test(x, n_boot=N_BOOT): """Perform a bootstrap-based test to test if the values in `x` are sampled from a distribution with a zero mean. Parameters ---------- x : array-like, shape (n_samples,) array of data points to test. n_boot : int Number of bootstrap iterations to perform. Returns ------- pval : float p-value (in [0,1]) from t-test on `x`. """ _, pval, _ = _boot_EB_and_test(x, n_boot=n_boot, return_EB=False, return_test=True) assert 0.0 <= pval and pval <= 1.0 return pval
[docs]def boot_EB(x, confidence=0.95, n_boot=N_BOOT): """Get bootstrap bound based error bars on mean of `x`. Parameters ---------- x : array-like, shape (n_samples,) Data points to estimate mean. Must not be empty or contain NaNs. confidence : float Confidence probability (in (0, 1)) to construct confidence interval from t statistic. n_boot : int Number of bootstrap iterations to perform. Returns ------- EB : float Size of error bar on mean (>= 0). The confidence interval is ``[mean(x) - EB, mean(x) + EB]``. `EB` is inf when ``len(x) <= 1``. """ EB, _, _ = _boot_EB_and_test(x, confidence=confidence, n_boot=n_boot, return_EB=True, return_test=False) assert np.ndim(EB) == 0 and EB >= 0.0 return EB
[docs]def get_mean_and_EB(x, confidence=0.95, min_EB=0.0, lower=-np.inf, upper=np.inf, method="t"): """Get mean loss and estimated error bar. Parameters ---------- x : ndarray, shape (n_samples,) Array of independent observations. confidence : float Confidence probability (in (0, 1)) to construct error bar. min_EB : float Minimum size of resulting error bar regardless of the data in `x`. lower : float A priori known theoretical lower limit on unknown mean of `x`. For instance, for mean zero-one loss, ``lower=0``. upper : float A priori known theoretical upper limit on unknown mean of `x`. For instance, for mean zero-one loss, ``upper=1``. method : {'t', 'bernstein', 'boot'} Method to use for building error bar. Returns ------- mu : float Estimated mean of `x`. EB : float Size of error bar on mean of `x` (``EB > 0``). The confidence interval is ``[mu - EB, mu + EB]``. """ assert np.all(lower <= x) and np.all(x <= upper) if method == "t": EB = t_EB(x, confidence=confidence) elif method == "bernstein": EB = bernstein_EB(x, lower, upper, confidence=confidence) elif method == "boot": EB = boot_EB(x, confidence=confidence) else: assert False # EB subroutines already validated x for shape and nans mu = clip_chk(np.mean(x), lower, upper) EB = clip_EB(mu, EB, lower, upper, min_EB=min_EB) return mu, EB
[docs]def get_test(x, lower=-np.inf, upper=np.inf, method="t"): """Perform a statistical test to determine if the values in `x` are sampled from a distribution with a zero mean. Parameters ---------- x : ndarray, shape (n_samples,) Array of independent observations. lower : float A priori known theoretical lower limit on unknown mean of `x`. For instance, for mean zero-one loss, ``lower=0``. upper : float A priori known theoretical upper limit on unknown mean of `x`. For instance, for mean zero-one loss, ``upper=1``. method : {'t', 'bernstein', 'boot'} Method to use statistical test. Returns ------- pval : float p-value (in [0,1]) from statistical test on `x`. """ if method == "t": pval = t_test(x) elif method == "bernstein": pval = bernstein_test(x, lower, upper) elif method == "boot": pval = boot_test(x) else: assert False return pval
[docs]def get_mean_EB_test(x, confidence=0.95, min_EB=0.0, lower=-np.inf, upper=np.inf, method="t"): """Get mean loss and estimated error bar. Also, perform a statistical test to determine if the values in `x` are sampled from a distribution with a zero mean. Parameters ---------- x : ndarray, shape (n_samples,) Array of independent observations. confidence : float Confidence probability (in (0, 1)) to construct error bar. min_EB : float Minimum size of resulting error bar regardless of the data in `x`. lower : float A priori known theoretical lower limit on unknown mean of `x`. For instance, for mean zero-one loss, ``lower=0``. upper : float A priori known theoretical upper limit on unknown mean of `x`. For instance, for mean zero-one loss, ``upper=1``. method : {'t', 'bernstein', 'boot'} Method to use for building error bar. Returns ------- mu : float Estimated mean of `x`. EB : float Size of error bar on mean of `x` (``EB > 0``). The confidence interval is ``[mu - EB, mu + EB]``. pval : float p-value (in [0,1]) from statistical test on `x`. """ assert np.all(lower <= x) and np.all(x <= upper) if method == "t": EB = t_EB(x, confidence=confidence) pval = t_test(x) elif method == "bernstein": EB = bernstein_EB(x, lower, upper, confidence=confidence) pval = bernstein_test(x, lower, upper) elif method == "boot": EB, pval, _ = _boot_EB_and_test(x, confidence=confidence) else: assert False # EB subroutines already validated x for shape and nans mu = clip_chk(np.mean(x), lower, upper) EB = clip_EB(mu, EB, lower, upper, min_EB=min_EB) return mu, EB, pval
# ============================================================================ # Loss summary: the main purpose of this file. # ============================================================================
[docs]def loss_summary_table(loss_table, ref_method, pairwise_CI=PAIRWISE_DEFAULT, confidence=0.95, method_EB="t", limits={}): """Build table with mean and error bar summaries from a loss table that contains losses on a per data point basis. Parameters ---------- loss_tbl : DataFrame, shape (n_samples, n_metrics * n_methods) DataFrame with loss of each method according to each loss function on each data point. The rows are the data points in `y` (that is the index matches `log_pred_prob_table`). The columns are a hierarchical index that is the cartesian product of loss x method. That is, the loss of method foo's prediction of ``y[5]`` according to loss function bar is stored in ``loss_tbl.loc[5, ('bar', 'foo')]``. ref_method : str Name of method that is used as reference point in paired statistical tests. This is usually some some of baseline method. `ref_method` must be found in the 2nd level of the columns of `loss_tbl`. pairwise_CI : bool If True, compute error bars on the mean of ``loss - loss_ref`` instead of just the mean of `loss`. This typically gives smaller error bars. confidence : float Confidence probability (in (0, 1)) to construct error bar. method_EB : {'t', 'bernstein', 'boot'} Method to use for building error bar. limits : dict of str to (float, float) Dictionary mapping metric name to tuple with (lower, upper) which are the theoretical limits on the mean loss. For instance, zero-one loss should be ``(0.0, 1.0)``. If entry missing, (-inf, inf) is used. Returns ------- perf_tbl : DataFrame, shape (n_methods, n_metrics * 3) DataFrame with mean loss of each method according to each loss function. The rows are the methods. The columns are a hierarchical index that is the cartesian product of loss x (mean, error bar, p-value). That is, ``perf_tbl.loc['foo', 'bar']`` is a pandas series with (mean loss of foo on bar, corresponding error bar, statistical sig) The statistical significance is a p-value from a two-sided hypothesis test on the hypothesis H0 that foo has the same mean loss as the reference method `ref_method`. """ assert loss_table.columns.names == (METRIC, METHOD) metrics, methods = loss_table.columns.levels assert ref_method in methods # ==> len(methods) >= 1 assert len(loss_table) >= 1 and len(metrics) >= 1 # Could also test these are cartesian product if we wanted to be exhaustive col_names = pd.MultiIndex.from_product([metrics, STD_STATS], names=[METRIC, STAT]) perf_tbl = pd.DataFrame(index=methods, columns=col_names, dtype=float) perf_tbl.index.set_names(METHOD, inplace=True) for metric in metrics: lower, upper = limits.get(metric, (-np.inf, np.inf)) assert lower <= upper loss_ref = loss_table.loc[:, (metric, ref_method)].values assert loss_ref.ndim == 1 # Weird stuff happens if names not unique assert not np.any(np.isnan(loss_ref)) # Would let method cheat assert np.all(lower <= loss_ref) and np.all(loss_ref <= upper) for method in methods: loss = loss_table.loc[:, (metric, method)].values assert loss.ndim == 1 # Weird stuff happens if names not unique assert not np.any(np.isnan(loss)) # Would let method cheat assert np.all(lower <= loss) and np.all(loss <= upper) mu = np.mean(loss) # This is the same in all cases deltas = loss - loss_ref range_ = upper - lower self_comparison = method == ref_method EB, pval = np.nan, np.nan if pairwise_CI: if not self_comparison: # Otherwise leave both as nan _, EB, pval = get_mean_EB_test(deltas, confidence, lower=-range_, upper=range_, method=method_EB) else: mu_, EB = get_mean_and_EB(loss, confidence=confidence, lower=lower, upper=upper, method=method_EB) assert mu_ == mu if not self_comparison: # Otherwise pval as nan pval = get_test(deltas, lower=-range_, upper=range_, method=method_EB) # This is two-sided, could include one-sided option too. perf_tbl.loc[method, metric] = (mu, EB, pval) return perf_tbl