Source code for mlpaper.regression

# Ryan Turner (turnerry@iro.umontreal.ca)
from __future__ import absolute_import, division, print_function

from builtins import range

import numpy as np
import pandas as pd
import scipy.stats as ss
from joblib import Memory

from mlpaper.constants import METHOD, METRIC
from mlpaper.mlpaper import PAIRWISE_DEFAULT, loss_summary_table

MOMENT = "moment"  # Don't put in constants since only needed for regression


[docs]def shape_and_validate(y, mu, std): """Validate shapes and types of predictive distribution against data and return the shape information. Parameters ---------- y : ndarray, shape (n_samples,) True targets for each regression data point. Typically of type `float`. mu : ndarray, shape (n_samples,) Predictive mean for each regression data point. Typically of type `float`. Must be of same shape as `y`. std : ndarray, shape (n_samples,) Predictive standard deviation for each regression data point. Typically of type `float`. Must be positive and of same shape as `y`. Returns ------- n_samples : int Number of data points (length of `y`) """ n_samples, = y.shape assert n_samples >= 1 assert np.all(np.isfinite(y)) assert mu.shape == (n_samples,) and std.shape == (n_samples,) assert np.all(np.isfinite(mu)) and np.all(np.isfinite(std)) assert np.all(std > 0.0) return n_samples
# ============================================================================ # Loss functions # ============================================================================
[docs]def square_loss(y, mu, std): """Compute MSE of predictions vs true targets. Parameters ---------- y : ndarray, shape (n_samples,) True targets for each regression data point. Typically of type `float`. mu : ndarray, shape (n_samples,) Predictive mean for each regression data point. Typically of type `float`. Must be of same shape as `y`. std : ndarray, shape (n_samples,) Predictive standard deviation for each regression data point. Typically of type `float`. Must be positive and of same shape as `y`. Ignored in this function. Returns ------- loss : ndarray, shape (n_samples,) Square error of target vs prediction. Same shape as `y`. """ shape_and_validate(y, mu, std) loss = (y - mu) ** 2 return loss
[docs]def abs_loss(y, mu, std): """Compute MAE of predictions vs true targets. Parameters ---------- y : ndarray, shape (n_samples,) True targets for each regression data point. Typically of type `float`. mu : ndarray, shape (n_samples,) Predictive mean for each regression data point. Typically of type `float`. Must be of same shape as `y`. std : ndarray, shape (n_samples,) Predictive standard deviation for each regression data point. Typically of type `float`. Must be positive and of same shape as `y`. Ignored in this function. Returns ------- loss : ndarray, shape (n_samples,) Absolute error of target vs prediction. Same shape as `y`. """ shape_and_validate(y, mu, std) loss = np.abs(y - mu) return loss
[docs]def log_loss(y, mu, std): """Compute log loss of Gaussian predictive distribution on target `y`. Parameters ---------- y : ndarray, shape (n_samples,) True targets for each regression data point. Typically of type `float`. mu : ndarray, shape (n_samples,) Predictive mean for each regression data point. Typically of type `float`. Must be of same shape as `y`. std : ndarray, shape (n_samples,) Predictive standard deviation for each regression data point. Typically of type `float`. Must be positive and of same shape as `y`. Returns ------- loss : ndarray, shape (n_samples,) Log loss of Gaussian predictive distribution on target `y`. Same shape as `y`. """ shape_and_validate(y, mu, std) loss = -ss.norm.logpdf(y, loc=mu, scale=std) return loss
# ============================================================================ # Use and summarize loss functions # ============================================================================
[docs]def loss_table(pred_tbl, y, metrics_dict): """Compute loss table from table of Gaussian predictions. Parameters ---------- pred_tbl : DataFrame, shape (n_samples, n_methods * 2) DataFrame with predictive distributions. Each row is a data point. The columns should be hierarchical index that is the cartesian product of methods x moments. For exampe, ``log_pred_prob_table.loc[5, 'foo']`` is a pandas series with (mean, std deviation) prediction that method foo places on ``y[5]``. Cannot be empty. y : ndarray, shape (n_samples,) True targets for each regression data point. Typically of type `float`. metrics_dict : dict of str to callable Dictionary mapping loss function name to function that computes loss, e.g., `log_loss`, `square_loss`, ... Returns ------- loss_tbl : DataFrame, shape (n_samples, n_metrics * n_methods) DataFrame with loss of each method according to each loss function on each data point. The rows are the data points in `y` (that is the index matches `pred_tbl`). The columns are a hierarchical index that is the cartesian product of loss x method. That is, the loss of method foo's prediction of ``y[5]`` according to loss function bar is stored in ``loss_tbl.loc[5, ('bar', 'foo')]``. """ methods, moments = pred_tbl.columns.levels assert "mu" in moments and "std" in moments n_samples = len(pred_tbl) assert y.shape == (n_samples,) assert n_samples >= 1 and len(methods) >= 1 col_names = pd.MultiIndex.from_product([metrics_dict.keys(), methods], names=[METRIC, METHOD]) loss_tbl = pd.DataFrame(index=pred_tbl.index, columns=col_names, dtype=float) for method in methods: # These get validated inside loss function mu = pred_tbl[(method, "mu")].values std = pred_tbl[(method, "std")].values for metric, metric_f in metrics_dict.items(): loss_tbl.loc[:, (metric, method)] = metric_f(y, mu, std) return loss_tbl
# ============================================================================ # Variables and functions to make getting results from sklearn objects easy # ============================================================================ # Pre-build some standard metric dicts for the user STD_REGR_LOSS = {"NLL": log_loss, "MSE": square_loss, "MAE": abs_loss}
[docs]class JustNoise: """Class version of iid predictor compatible with sklearn interface. Same as ``sklearn.dummy.DummyRegressor(strategy='mean')`` but also keeps track of std to be able to accept ``return_std=True``.""" def __init__(self): self.mu = np.nan self.std = np.nan def fit(self, X_train, y_train): assert y_train.ndim == 1 assert len(y_train) >= 2 # Require N >= 2 for std self.mu = np.mean(y_train) self.std = np.std(y_train, ddof=0) def predict(self, X_test, return_std=True): assert return_std N = X_test.shape[0] mu = np.repeat([self.mu], N, axis=0) std = np.repeat([self.std], N, axis=0) return mu, std
[docs]def get_gauss_pred(X_train, y_train, X_test, methods, min_std=0.0, verbose=False, checkpointdir=None): """Get the Gaussian prediction tables for each test point on a collection of regression methods. Parameters ---------- X_train : ndarray, shape (n_train, n_features) Training set 2d feature array for classifiers. Each row is an indepedent data point and each column is a feature. y_train : ndarray, shape (n_train,) True training targets for each regression data point. Typically of type `float`. Must be of same length as `X_train`. X_test : ndarray, shape (n_test, n_features) Test set 2d feature array for classifiers. Each row is an indepedent data point and each column is a feature. methods : dict of str to sklearn estimator Dictionary mapping method name (`str`) to object that performs training and test. Object must follow the interface of sklearn estimators, that is, it has a ``fit()`` method and a ``predict()`` method that accepts the argument ``return_std=True``. min_std : float Minimum value to floor the predictive standard deviation. Must be >= 0. Useful to prevent inf log loss penalties. verbose : bool If True, display which method being trained. checkpointdir : str (directory) If provided, stores checkpoint results using joblib for the train/test in case process interrupted. If None, no checkpointing is done. Returns ------- pred_tbl : DataFrame, shape (n_samples, n_methods * 2) DataFrame with predictive distributions. Each row is a data point. The columns should be hierarchical index that is the cartesian product of methods x moments. For exampe, ``log_pred_prob_table.loc[5, 'foo']`` is a pandas series with (mean, std deviation) prediction that method foo places on ``y[5]``. Notes ----- If a train/test operation is loaded from a checkpoint file, the estimator object in methods will not be in a fit state. """ n_test = X_test.shape[0] assert n_test > 0 assert X_train.ndim == 2 assert y_train.shape == (X_train.shape[0],) assert X_test.ndim == 2 and X_test.shape[1] == X_train.shape[1] assert X_train.dtype.kind == X_test.dtype.kind # Would be weird otherwise assert min_std >= 0.0 memory = Memory(cachedir=checkpointdir, verbose=0) @memory.cache def train_predict(method_obj, X_train, y_train, X_test): method_obj.fit(X_train, y_train) try: mu, std = method_obj.predict(X_test, return_std=True) except TypeError: mu = method_obj.predict(X_test) std = np.ones_like(mu) return mu, std col_names = pd.MultiIndex.from_product([methods.keys(), ("mu", "std")], names=[METHOD, MOMENT]) pred_tbl = pd.DataFrame(index=range(n_test), columns=col_names, dtype=float) for method_name, method_obj in methods.items(): if verbose: print("Running fit/predict for %s" % method_name) mu, std = train_predict(method_obj, X_train, y_train, X_test) assert mu.shape == (n_test,) and std.shape == (n_test,) std = np.maximum(min_std, std) pred_tbl.loc[:, (method_name, "mu")] = mu pred_tbl.loc[:, (method_name, "std")] = std return pred_tbl
[docs]def just_benchmark( X_train, y_train, X_test, y_test, methods, loss_dict, ref_method, min_std=0.0, pairwise_CI=PAIRWISE_DEFAULT, method_EB="t", limits={}, ): """Simplest one-call interface to this package. Just pass it data and method objects and a performance summary DataFrame is returned. Parameters ---------- X_train : ndarray, shape (n_train, n_features) Training set 2d feature array for classifiers. Each row is an indepedent data point and each column is a feature. y_train : ndarray, shape (n_train,) True training targets for each regression data point. Typically of type `float`. Must be of same length as `X_train`. X_test : ndarray, shape (n_test, n_features) Test set 2d feature array for classifiers. Each row is an indepedent data point and each column is a feature. y_test : ndarray, shape (n_test,) True test targets for each regression data point. Typically of type `float`. Cannot be empty. Must be of same length as `X_test`. methods : dict of str to sklearn estimator Dictionary mapping method name (`str`) to object that performs training and test. Object must follow the interface of sklearn estimators, that is, it has a ``fit()`` method and a ``predict()`` method that accepts the argument ``return_std=True``. loss_dict : dict of str to callable Dictionary mapping loss function name to function that computes loss, e.g., `log_loss`, `square_loss`, ... ref_method : str Name of method that is used as reference point in paired statistical tests. This is usually some some of baseline method. `ref_method` must be found in `methods` dictionary. min_std : float Minimum value to floor the predictive standard deviation. Must be >= 0. Useful to prevent inf log loss penalties. pairwise_CI : bool If True, compute error bars on the mean of ``loss - loss_ref`` instead of just the mean of `loss`. This typically gives smaller error bars. method_EB : {'t', 'bernstein', 'boot'} Method to use for building error bar. limits : dict of str to (float, float) Dictionary mapping metric name to tuple with (lower, upper) which are the theoretical limits on the mean loss. For instance, square loss on a bounded y domain of ``(-1.0,1.0)`` would give limits of ``(0.0, 4.0)``. If entry missing, (-inf, inf) is used. Returns ------- loss_summary : DataFrame, shape (n_methods, n_metrics * 3) DataFrame with mean loss of each method according to each loss function. The rows are the methods. The columns are a hierarchical index that is the cartesian product of loss x (mean, error bar, p-value). That is, ``perf_tbl.loc['foo', 'bar']`` is a pandas series with (mean loss of foo on bar, corresponding error bar, statistical sig) The statistical significance is a p-value from a two-sided hypothesis test on the hypothesis H0 that foo has the same mean loss as the reference method `ref_method`. """ assert y_train.dtype == y_test.dtype # Would be weird otherwise pred_tbl = get_gauss_pred(X_train, y_train, X_test, methods, min_std=min_std) loss_tbl = loss_table(pred_tbl, y_test, loss_dict) loss_summary = loss_summary_table(loss_tbl, ref_method, pairwise_CI=pairwise_CI, method_EB=method_EB, limits=limits) return loss_summary