# Source code for mlpaper.boot_util

# Ryan Turner (turnerry@iro.umontreal.ca)
import numpy as np


def boot_weights(N, n_boot, epsilon=0):
    """Sample weights for data points that make weighting equivalent to
    bootstrap resampling of the data points.

    Parameters
    ----------
    N : int
        Number of data points, must be >= 1.
    n_boot : int
        Number of bootstrap replicates, must be >= 1.
    epsilon : int or float
        Minimum weight, typically 0 unless this creates numerical problems
        for a downstream algorithm, in which case a value such as 1e-10 is
        used.

    Returns
    -------
    weight : ndarray, shape (n_boot, N)
        Weights equivalent to resampling for bootstrap algorithm.
    """
    assert N >= 1
    assert n_boot >= 1

    # Each replicate draws N points uniformly with replacement; the
    # multinomial counts are the resulting per-point weights.
    p_BS = np.ones(N) / N
    weight = np.maximum(epsilon, np.random.multinomial(N, p_BS, size=n_boot))
    assert weight.shape == (n_boot, N)
    return weight
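
# Illustrative usage sketch (not part of the original module): with
# epsilon=0, each row of the weights is a multinomial count vector summing
# to N, so weighting data point i by weight[b, i] reproduces the b-th
# bootstrap resample without materializing resampled copies of the data.
# >>> np.random.seed(0)
# >>> w = boot_weights(4, n_boot=3)
# >>> w.shape        # (3, 4)
# >>> w.sum(axis=1)  # each row sums to N == 4 when epsilon == 0
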
def confidence_to_percentiles(confidence):
    """Convert a confidence level to the percentiles of the sampling
    distribution used to build the confidence interval.

    Parameters
    ----------
    confidence : float
        Confidence level, use 0.95 for 95% interval. Must be in (0,1).

    Returns
    -------
    LB : float
        Lower end percentile in (0,100).
    UB : float
        Upper end percentile in (0,100).

    Examples
    --------
    >>> confidence_to_percentiles(0.95)
    (2.5, 97.5)
    """
    assert np.ndim(confidence) == 0 and 0.0 < confidence and confidence < 1.0

    # Split the excluded mass 1 - confidence evenly between the two tails.
    alpha = 0.5 * (1.0 - confidence)
    LB, UB = 100.0 * alpha, 100.0 * (1.0 - alpha)
    return LB, UB
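
# Illustrative sketch (not part of the original module): the returned pair
# are percentiles in (0, 100) with equal tail mass alpha on each side.
# >>> confidence_to_percentiles(0.99)
# alpha = 0.005, so (0.5, 99.5) up to float round off
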
def percentile(boot_estimates, confidence=0.95):
    """Build a confidence interval using the percentile bootstrap method.

    Parameters
    ----------
    boot_estimates : ndarray, shape (n_boot, ...)
        Estimated quantity across different bootstrap replications.
    confidence : float
        Confidence level, use 0.95 for 95% interval. Must be in (0,1).

    Returns
    -------
    LB : ndarray, shape (...)
        Lower end of confidence interval.
    UB : ndarray, shape (...)
        Upper end of confidence interval.
    """
    assert boot_estimates.ndim >= 1
    assert not np.any(np.isnan(boot_estimates))  # NaN ordering is arbitrary

    LB_perc, UB_perc = confidence_to_percentiles(confidence)
    # Expand outward to a slightly bigger interval to absorb round off. Also
    # important for keeping the CI coherent with the significance test
    # function.
    # TODO np.percentile does not exactly correspond to the mathematical
    # definition of the quantile function wrt jump locations, and hence may
    # not exactly be the inverse of the ECDF used in significance(). File bug
    # report with numpy.
    LB = np.percentile(boot_estimates, LB_perc, axis=0, interpolation="lower")
    UB = np.percentile(boot_estimates, UB_perc, axis=0, interpolation="higher")
    assert LB.shape == boot_estimates.shape[1:]
    assert LB.shape == UB.shape
    return LB, UB
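
# Illustrative usage sketch (not part of the original module): a 90% interval
# from replicate estimates. The "lower"/"higher" interpolation snaps the two
# endpoints outward onto actual replicate values.
# >>> np.random.seed(0)
# >>> boot = np.random.randn(1000) * 0.1 + 1.0  # stand-in replicate estimates
# >>> LB, UB = percentile(boot, confidence=0.9)  # ~5th and ~95th percentiles
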
def basic(boot_estimates, original_estimate, confidence=0.95):
    """Build a confidence interval using the basic bootstrap method.

    Parameters
    ----------
    boot_estimates : ndarray, shape (n_boot, ...)
        Estimated quantity across different bootstrap replications.
    original_estimate : ndarray, shape (...)
        Quantity estimated using the original (non-bootstrap) data set.
    confidence : float
        Confidence level, use 0.95 for 95% interval. Must be in (0,1).

    Returns
    -------
    LB : ndarray, shape (...)
        Lower end of confidence interval.
    UB : ndarray, shape (...)
        Upper end of confidence interval.
    """
    assert boot_estimates.ndim >= 1
    assert boot_estimates.shape[1:] == np.shape(original_estimate)

    # Reflect the percentile interval about the original estimate.
    LB, UB = percentile(boot_estimates, confidence)
    LB, UB = 2 * original_estimate - UB, 2 * original_estimate - LB
    return LB, UB
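
# Illustrative sketch (not part of the original module): the basic bootstrap
# reflects the percentile interval about the original estimate theta, giving
# [2*theta - UB_perc, 2*theta - LB_perc].
# >>> boot = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
# >>> percentile(boot, confidence=0.5)  # (2.0, 4.0)
# >>> basic(boot, 3.0, confidence=0.5)  # (2*3 - 4, 2*3 - 2) = (2.0, 4.0)
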
def error_bar(boot_estimates, original_estimate, confidence=0.95):
    """Build an error bar using the bootstrap method.

    The result is the same regardless of whether the percentile or basic
    bootstrap method is used for the CIs.

    Parameters
    ----------
    boot_estimates : ndarray, shape (n_boot,)
        Estimated quantity across different bootstrap replications.
    original_estimate : float
        Quantity estimated using the original (non-bootstrap) data set.
    confidence : float
        Confidence level, use 0.95 for 95% interval. Must be in (0,1).

    Returns
    -------
    EB : float
        Error bar around the original estimate.
    """
    assert boot_estimates.ndim == 1
    assert np.ndim(original_estimate) == 0

    LB, UB = percentile(boot_estimates, confidence)
    # This actually ends up the same whether we use basic or percentile
    EB = np.fmax(UB - original_estimate, original_estimate - LB)
    assert not np.any(EB < 0.0)  # Allows nans
    # NaN EB only ever occurs when ref is infinite and so are some samples
    assert np.all(np.isfinite(original_estimate) <= ~np.isnan(EB))
    return EB
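
# Illustrative sketch (not part of the original module): the error bar is the
# larger one-sided distance from the original estimate to the percentile CI
# endpoints, so original_estimate +/- EB covers the whole interval.
# >>> boot = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
# >>> error_bar(boot, 2.5, confidence=0.5)  # max(4.0 - 2.5, 2.5 - 2.0) = 1.5
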
def significance(boot_estimates, ref):
    """Perform a two-sided bootstrap-based hypothesis test of whether the
    unknown quantity equals some reference value.

    Parameters
    ----------
    boot_estimates : ndarray, shape (n_boot,)
        Estimated quantity across different bootstrap replications.
    ref : float or ndarray of shape (n_boot,)
        Reference value used in the hypothesis test. Use a scalar value for a
        known reference value, or an array of n_boot bootstrapped values to
        perform a paired test against another unknown quantity.

    Returns
    -------
    pval : float
        Resulting p-value of the hypothesis test in [0,1].
    """
    assert boot_estimates.ndim == 1
    assert np.ndim(ref) == 0 or ref.shape == boot_estimates.shape
    assert not np.any(np.isnan(boot_estimates))  # NaN ordering is arbitrary
    assert not np.any(np.isnan(ref))  # NaN ordering is arbitrary

    pval = 2.0 * np.minimum(
        np.mean(boot_estimates <= ref), np.mean(ref <= boot_estimates)
    )
    # Only needed when some boot_estimates == ref exactly:
    pval = np.minimum(1.0, pval)
    return pval
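
# Illustrative sketch (not part of the original module): a paired test where
# ref is a scalar. One of ten replicate estimates falls at or below zero, so
# the two-sided p-value is 2 * min(0.1, 0.9) = 0.2.
# >>> boot = np.array([0.5, 1.2, 0.9, 1.1, -0.1, 0.8, 1.0, 0.7, 0.6, 1.3])
# >>> significance(boot, ref=0.0)  # 0.2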