# Source code for mlpaper.boot_util

# Ryan Turner (turnerry@iro.umontreal.ca)
import numpy as np


def boot_weights(N, n_boot, epsilon=0):
    """Sample weights for data points that make weighting equivalent to
    bootstrap resampling of the data points.

    Parameters
    ----------
    N : int
        Number of data points, must be >= 1.
    n_boot : int
        Number of bootstrap replicates, must be >= 1.
    epsilon : int or float
        Minimum weight, typically 0 unless this creates numerical problems
        for a downstream algorithm, in which case a value such as 1e-10 is
        used.

    Returns
    -------
    weight : ndarray, shape (n_boot, N)
        Weights equivalent to resampling for bootstrap algorithm.
    """
    assert N >= 1
    assert n_boot >= 1

    # Each replicate draws N points uniformly with replacement; the
    # multinomial counts are the resulting per-point weights.
    p_BS = np.ones(N) / N
    weight = np.maximum(epsilon, np.random.multinomial(N, p_BS, size=n_boot))
    assert weight.shape == (n_boot, N)
    return weight
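
# Illustrative usage sketch (not part of the original module): with
# epsilon=0, each row of the weights is a multinomial count vector summing
# to N, so weighting data point i by weight[b, i] reproduces the b-th
# bootstrap resample without materializing resampled copies of the data.
# >>> np.random.seed(0)
# >>> w = boot_weights(4, n_boot=3)
# >>> w.shape        # (3, 4)
# >>> w.sum(axis=1)  # each row sums to N == 4 when epsilon == 0
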
def confidence_to_percentiles(confidence):
    """Convert a confidence level to the percentiles of the sampling
    distribution used to build the confidence interval.

    Parameters
    ----------
    confidence : float
        Confidence level, use 0.95 for 95% interval. Must be in (0,1).

    Returns
    -------
    LB : float
        Lower end percentile in (0,100).
    UB : float
        Upper end percentile in (0,100).

    Examples
    --------
    >>> confidence_to_percentiles(0.95)
    (2.5, 97.5)
    """
    assert np.ndim(confidence) == 0 and 0.0 < confidence and confidence < 1.0

    # Split the excluded mass 1 - confidence evenly between the two tails.
    alpha = 0.5 * (1.0 - confidence)
    LB, UB = 100.0 * alpha, 100.0 * (1.0 - alpha)
    return LB, UB
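
# Illustrative sketch (not part of the original module): the returned pair
# are percentiles in (0, 100) with equal tail mass alpha on each side.
# >>> confidence_to_percentiles(0.99)
# alpha = 0.005, so (0.5, 99.5) up to float round off
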
def percentile(boot_estimates, confidence=0.95):
    """Build a confidence interval using the percentile bootstrap method.

    Parameters
    ----------
    boot_estimates : ndarray, shape (n_boot, ...)
        Estimated quantity across different bootstrap replications.
    confidence : float
        Confidence level, use 0.95 for 95% interval. Must be in (0,1).

    Returns
    -------
    LB : ndarray, shape (...)
        Lower end of confidence interval.
    UB : ndarray, shape (...)
        Upper end of confidence interval.
    """
    assert boot_estimates.ndim >= 1
    assert not np.any(np.isnan(boot_estimates))  # NaN ordering is arbitrary

    LB_perc, UB_perc = confidence_to_percentiles(confidence)
    # Expand outward to a slightly bigger interval to absorb round off. Also
    # important for keeping the CI coherent with the significance test
    # function.
    # TODO np.percentile does not exactly correspond to the mathematical
    # definition of the quantile function wrt jump locations, and hence may
    # not exactly be the inverse of the ECDF used in significance(). File bug
    # report with numpy.
    LB = np.percentile(boot_estimates, LB_perc, axis=0, interpolation="lower")
    UB = np.percentile(boot_estimates, UB_perc, axis=0, interpolation="higher")
    assert LB.shape == boot_estimates.shape[1:]
    assert LB.shape == UB.shape
    return LB, UB
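
# Illustrative usage sketch (not part of the original module): a 90% interval
# from replicate estimates. The "lower"/"higher" interpolation snaps the two
# endpoints outward onto actual replicate values.
# >>> np.random.seed(0)
# >>> boot = np.random.randn(1000) * 0.1 + 1.0  # stand-in replicate estimates
# >>> LB, UB = percentile(boot, confidence=0.9)  # ~5th and ~95th percentiles
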
def basic(boot_estimates, original_estimate, confidence=0.95):
    """Build a confidence interval using the basic bootstrap method.

    Parameters
    ----------
    boot_estimates : ndarray, shape (n_boot, ...)
        Estimated quantity across different bootstrap replications.
    original_estimate : ndarray, shape (...)
        Quantity estimated using the original (non-bootstrap) data set.
    confidence : float
        Confidence level, use 0.95 for 95% interval. Must be in (0,1).

    Returns
    -------
    LB : ndarray, shape (...)
        Lower end of confidence interval.
    UB : ndarray, shape (...)
        Upper end of confidence interval.
    """
    assert boot_estimates.ndim >= 1
    assert boot_estimates.shape[1:] == np.shape(original_estimate)

    # Reflect the percentile interval about the original estimate.
    LB, UB = percentile(boot_estimates, confidence)
    LB, UB = 2 * original_estimate - UB, 2 * original_estimate - LB
    return LB, UB
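
# Illustrative sketch (not part of the original module): the basic bootstrap
# reflects the percentile interval about the original estimate theta, giving
# [2*theta - UB_perc, 2*theta - LB_perc].
# >>> boot = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
# >>> percentile(boot, confidence=0.5)  # (2.0, 4.0)
# >>> basic(boot, 3.0, confidence=0.5)  # (2*3 - 4, 2*3 - 2) = (2.0, 4.0)
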
def error_bar(boot_estimates, original_estimate, confidence=0.95):
    """Build an error bar using the bootstrap method.

    The result is the same regardless of whether the percentile or basic
    bootstrap method is used for the CIs.

    Parameters
    ----------
    boot_estimates : ndarray, shape (n_boot,)
        Estimated quantity across different bootstrap replications.
    original_estimate : float
        Quantity estimated using the original (non-bootstrap) data set.
    confidence : float
        Confidence level, use 0.95 for 95% interval. Must be in (0,1).

    Returns
    -------
    EB : float
        Error bar around the original estimate.
    """
    assert boot_estimates.ndim == 1
    assert np.ndim(original_estimate) == 0

    LB, UB = percentile(boot_estimates, confidence)
    # This actually ends up the same whether we use basic or percentile
    EB = np.fmax(UB - original_estimate, original_estimate - LB)
    assert not np.any(EB < 0.0)  # Allows nans
    # NaN EB only ever occurs when ref is infinite and so are some samples
    assert np.all(np.isfinite(original_estimate) <= ~np.isnan(EB))
    return EB
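
# Illustrative sketch (not part of the original module): the error bar is the
# larger one-sided distance from the original estimate to the percentile CI
# endpoints, so original_estimate +/- EB covers the whole interval.
# >>> boot = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
# >>> error_bar(boot, 2.5, confidence=0.5)  # max(4.0 - 2.5, 2.5 - 2.0) = 1.5
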
def significance(boot_estimates, ref):
    """Perform a two-sided bootstrap-based hypothesis test of whether the
    unknown quantity equals some reference value.

    Parameters
    ----------
    boot_estimates : ndarray, shape (n_boot,)
        Estimated quantity across different bootstrap replications.
    ref : float or ndarray of shape (n_boot,)
        Reference value used in the hypothesis test. Use a scalar value for a
        known reference value, or an array of n_boot bootstrapped values to
        perform a paired test against another unknown quantity.

    Returns
    -------
    pval : float
        Resulting p-value of the hypothesis test in [0,1].
    """
    assert boot_estimates.ndim == 1
    assert np.ndim(ref) == 0 or ref.shape == boot_estimates.shape
    assert not np.any(np.isnan(boot_estimates))  # NaN ordering is arbitrary
    assert not np.any(np.isnan(ref))  # NaN ordering is arbitrary

    pval = 2.0 * np.minimum(
        np.mean(boot_estimates <= ref), np.mean(ref <= boot_estimates)
    )
    # Only needed when some boot_estimates == ref exactly:
    pval = np.minimum(1.0, pval)
    return pval
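
# Illustrative sketch (not part of the original module): a paired test where
# ref is a scalar. One of ten replicate estimates falls at or below zero, so
# the two-sided p-value is 2 * min(0.1, 0.9) = 0.2.
# >>> boot = np.array([0.5, 1.2, 0.9, 1.1, -0.1, 0.8, 1.0, 0.7, 0.6, 1.3])
# >>> significance(boot, ref=0.0)  # 0.2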