Source code for mlpaper.regression

# Ryan Turner (turnerry@iro.umontreal.ca)
from __future__ import absolute_import, division, print_function

from builtins import range

import numpy as np
import pandas as pd
import scipy.stats as ss
from joblib import Memory

from mlpaper.constants import METHOD, METRIC
from mlpaper.mlpaper import PAIRWISE_DEFAULT, loss_summary_table

MOMENT = "moment"  # Don't put in constants since only needed for regression


[docs]def shape_and_validate(y, mu, std):
    """Validate shapes and types of predictive distribution against data and
    return the shape information.

    Parameters
    ----------
    y : ndarray, shape (n_samples,)
        True targets for each regression data point. Typically of type `float`.
    mu : ndarray, shape (n_samples,)
        Predictive mean for each regression data point. Typically of type
        `float`. Must be of same shape as `y`.
    std : ndarray, shape (n_samples,)
        Predictive standard deviation for each regression data point. Typically
        of type `float`. Must be positive and of same shape as `y`.

    Returns
    -------
    n_samples : int
        Number of data points (length of `y`)
    """
    n_samples, = y.shape
    assert n_samples >= 1
    assert np.all(np.isfinite(y))

    assert mu.shape == (n_samples,) and std.shape == (n_samples,)
    assert np.all(np.isfinite(mu)) and np.all(np.isfinite(std))
    assert np.all(std > 0.0)
    return n_samples


# ============================================================================
# Loss functions
# ============================================================================


[docs]def square_loss(y, mu, std):
    """Compute MSE of predictions vs true targets.

    Parameters
    ----------
    y : ndarray, shape (n_samples,)
        True targets for each regression data point. Typically of type `float`.
    mu : ndarray, shape (n_samples,)
        Predictive mean for each regression data point. Typically of type
        `float`. Must be of same shape as `y`.
    std : ndarray, shape (n_samples,)
        Predictive standard deviation for each regression data point. Typically
        of type `float`. Must be positive and of same shape as `y`. Ignored in
        this function.

    Returns
    -------
    loss : ndarray, shape (n_samples,)
        Square error of target vs prediction. Same shape as `y`.
    """
    shape_and_validate(y, mu, std)
    loss = (y - mu) ** 2
    return loss


[docs]def abs_loss(y, mu, std):
    """Compute MAE of predictions vs true targets.

    Parameters
    ----------
    y : ndarray, shape (n_samples,)
        True targets for each regression data point. Typically of type `float`.
    mu : ndarray, shape (n_samples,)
        Predictive mean for each regression data point. Typically of type
        `float`. Must be of same shape as `y`.
    std : ndarray, shape (n_samples,)
        Predictive standard deviation for each regression data point. Typically
        of type `float`. Must be positive and of same shape as `y`. Ignored in
        this function.

    Returns
    -------
    loss : ndarray, shape (n_samples,)
        Absolute error of target vs prediction. Same shape as `y`.
    """
    shape_and_validate(y, mu, std)
    loss = np.abs(y - mu)
    return loss


[docs]def log_loss(y, mu, std):
    """Compute log loss of Gaussian predictive distribution on target `y`.

    Parameters
    ----------
    y : ndarray, shape (n_samples,)
        True targets for each regression data point. Typically of type `float`.
    mu : ndarray, shape (n_samples,)
        Predictive mean for each regression data point. Typically of type
        `float`. Must be of same shape as `y`.
    std : ndarray, shape (n_samples,)
        Predictive standard deviation for each regression data point. Typically
        of type `float`. Must be positive and of same shape as `y`.

    Returns
    -------
    loss : ndarray, shape (n_samples,)
        Log loss of Gaussian predictive distribution on target `y`. Same shape
        as `y`.
    """
    shape_and_validate(y, mu, std)
    loss = -ss.norm.logpdf(y, loc=mu, scale=std)
    return loss


# ============================================================================
# Use and summarize loss functions
# ============================================================================


[docs]def loss_table(pred_tbl, y, metrics_dict):
    """Compute loss table from table of Gaussian predictions.

    Parameters
    ----------
    pred_tbl : DataFrame, shape (n_samples, n_methods * 2)
        DataFrame with predictive distributions. Each row is a data point.
        The columns should be hierarchical index that is the cartesian product
        of methods x moments. For exampe, ``log_pred_prob_table.loc[5, 'foo']``
        is a pandas series with (mean, std deviation) prediction that method
        foo places on ``y[5]``. Cannot be empty.
    y : ndarray, shape (n_samples,)
        True targets for each regression data point. Typically of type `float`.
    metrics_dict : dict of str to callable
        Dictionary mapping loss function name to function that computes loss,
        e.g., `log_loss`, `square_loss`, ...

    Returns
    -------
    loss_tbl : DataFrame, shape (n_samples, n_metrics * n_methods)
        DataFrame with loss of each method according to each loss function on
        each data point. The rows are the data points in `y` (that is the index
        matches `pred_tbl`). The columns are a hierarchical index that is the
        cartesian product of loss x method. That is, the loss of method foo's
        prediction of ``y[5]`` according to loss function bar is stored in
        ``loss_tbl.loc[5, ('bar', 'foo')]``.
    """
    methods, moments = pred_tbl.columns.levels
    assert "mu" in moments and "std" in moments
    n_samples = len(pred_tbl)
    assert y.shape == (n_samples,)
    assert n_samples >= 1 and len(methods) >= 1

    col_names = pd.MultiIndex.from_product([metrics_dict.keys(), methods], names=[METRIC, METHOD])
    loss_tbl = pd.DataFrame(index=pred_tbl.index, columns=col_names, dtype=float)
    for method in methods:
        # These get validated inside loss function
        mu = pred_tbl[(method, "mu")].values
        std = pred_tbl[(method, "std")].values
        for metric, metric_f in metrics_dict.items():
            loss_tbl.loc[:, (metric, method)] = metric_f(y, mu, std)
    return loss_tbl


# ============================================================================
# Variables and functions to make getting results from sklearn objects easy
# ============================================================================

# Pre-build some standard metric dicts for the user
STD_REGR_LOSS = {"NLL": log_loss, "MSE": square_loss, "MAE": abs_loss}


[docs]class JustNoise:
    """Class version of iid predictor compatible with sklearn interface. Same
    as ``sklearn.dummy.DummyRegressor(strategy='mean')`` but also keeps track
    of std to be able to accept ``return_std=True``."""

    def __init__(self):
        self.mu = np.nan
        self.std = np.nan

    def fit(self, X_train, y_train):
        assert y_train.ndim == 1
        assert len(y_train) >= 2  # Require N >= 2 for std
        self.mu = np.mean(y_train)
        self.std = np.std(y_train, ddof=0)

    def predict(self, X_test, return_std=True):
        assert return_std
        N = X_test.shape[0]
        mu = np.repeat([self.mu], N, axis=0)
        std = np.repeat([self.std], N, axis=0)
        return mu, std


[docs]def get_gauss_pred(X_train, y_train, X_test, methods, min_std=0.0, verbose=False, checkpointdir=None):
    """Get the Gaussian prediction tables for each test point on a collection
    of regression methods.

    Parameters
    ----------
    X_train : ndarray, shape (n_train, n_features)
        Training set 2d feature array for classifiers. Each row is an
        indepedent data point and each column is a feature.
    y_train : ndarray, shape (n_train,)
        True training targets for each regression data point. Typically of type
        `float`. Must be of same length as `X_train`.
    X_test : ndarray, shape (n_test, n_features)
        Test set 2d feature array for classifiers. Each row is an indepedent
        data point and each column is a feature.
    methods : dict of str to sklearn estimator
        Dictionary mapping method name (`str`) to object that performs training
        and test. Object must follow the interface of sklearn estimators, that
        is, it has a ``fit()`` method and a ``predict()`` method that accepts
        the argument ``return_std=True``.
    min_std : float
        Minimum value to floor the predictive standard deviation. Must be >= 0.
        Useful to prevent inf log loss penalties.
    verbose : bool
        If True, display which method being trained.
    checkpointdir : str (directory)
        If provided, stores checkpoint results using joblib for the train/test
        in case process interrupted. If None, no checkpointing is done.

    Returns
    -------
    pred_tbl : DataFrame, shape (n_samples, n_methods * 2)
        DataFrame with predictive distributions. Each row is a data point.
        The columns should be hierarchical index that is the cartesian product
        of methods x moments. For exampe, ``log_pred_prob_table.loc[5, 'foo']``
        is a pandas series with (mean, std deviation) prediction that method
        foo places on ``y[5]``.

    Notes
    -----
    If a train/test operation is loaded from a checkpoint file, the estimator
    object in methods will not be in a fit state.
    """
    n_test = X_test.shape[0]
    assert n_test > 0
    assert X_train.ndim == 2
    assert y_train.shape == (X_train.shape[0],)
    assert X_test.ndim == 2 and X_test.shape[1] == X_train.shape[1]
    assert X_train.dtype.kind == X_test.dtype.kind  # Would be weird otherwise
    assert min_std >= 0.0

    memory = Memory(cachedir=checkpointdir, verbose=0)

    @memory.cache
    def train_predict(method_obj, X_train, y_train, X_test):
        method_obj.fit(X_train, y_train)
        try:
            mu, std = method_obj.predict(X_test, return_std=True)
        except TypeError:
            mu = method_obj.predict(X_test)
            std = np.ones_like(mu)
        return mu, std

    col_names = pd.MultiIndex.from_product([methods.keys(), ("mu", "std")], names=[METHOD, MOMENT])
    pred_tbl = pd.DataFrame(index=range(n_test), columns=col_names, dtype=float)
    for method_name, method_obj in methods.items():
        if verbose:
            print("Running fit/predict for %s" % method_name)
        mu, std = train_predict(method_obj, X_train, y_train, X_test)
        assert mu.shape == (n_test,) and std.shape == (n_test,)

        std = np.maximum(min_std, std)
        pred_tbl.loc[:, (method_name, "mu")] = mu
        pred_tbl.loc[:, (method_name, "std")] = std
    return pred_tbl


[docs]def just_benchmark(
    X_train,
    y_train,
    X_test,
    y_test,
    methods,
    loss_dict,
    ref_method,
    min_std=0.0,
    pairwise_CI=PAIRWISE_DEFAULT,
    method_EB="t",
    limits={},
):
    """Simplest one-call interface to this package. Just pass it data and
    method objects and a performance summary DataFrame is returned.

    Parameters
    ----------
    X_train : ndarray, shape (n_train, n_features)
        Training set 2d feature array for classifiers. Each row is an
        indepedent data point and each column is a feature.
    y_train : ndarray, shape (n_train,)
        True training targets for each regression data point. Typically of type
        `float`. Must be of same length as `X_train`.
    X_test : ndarray, shape (n_test, n_features)
        Test set 2d feature array for classifiers. Each row is an indepedent
        data point and each column is a feature.
    y_test : ndarray, shape (n_test,)
        True test targets for each regression data point. Typically of type
        `float`. Cannot be empty. Must be of same length as `X_test`.
    methods : dict of str to sklearn estimator
        Dictionary mapping method name (`str`) to object that performs training
        and test. Object must follow the interface of sklearn estimators, that
        is, it has a ``fit()`` method and a ``predict()`` method that accepts
        the argument ``return_std=True``.
    loss_dict : dict of str to callable
        Dictionary mapping loss function name to function that computes loss,
        e.g., `log_loss`, `square_loss`, ...
    ref_method : str
        Name of method that is used as reference point in paired statistical
        tests. This is usually some some of baseline method. `ref_method` must
        be found in `methods` dictionary.
    min_std : float
        Minimum value to floor the predictive standard deviation. Must be >= 0.
        Useful to prevent inf log loss penalties.
    pairwise_CI : bool
        If True, compute error bars on the mean of ``loss - loss_ref`` instead
        of just the mean of `loss`. This typically gives smaller error bars.
    method_EB : {'t', 'bernstein', 'boot'}
        Method to use for building error bar.
    limits : dict of str to (float, float)
        Dictionary mapping metric name to tuple with (lower, upper) which are
        the theoretical limits on the mean loss. For instance, square loss on a
        bounded y domain of ``(-1.0,1.0)`` would give limits of ``(0.0, 4.0)``.
        If entry missing, (-inf, inf) is used.

    Returns
    -------
    loss_summary : DataFrame, shape (n_methods, n_metrics * 3)
        DataFrame with mean loss of each method according to each loss
        function. The rows are the methods. The columns are a hierarchical
        index that is the cartesian product of
        loss x (mean, error bar, p-value). That is,
        ``perf_tbl.loc['foo', 'bar']`` is a pandas series with
        (mean loss of foo on bar, corresponding error bar, statistical sig)
        The statistical significance is a p-value from a two-sided hypothesis
        test on the hypothesis H0 that foo has the same mean loss as the
        reference method `ref_method`.
    """
    assert y_train.dtype == y_test.dtype  # Would be weird otherwise
    pred_tbl = get_gauss_pred(X_train, y_train, X_test, methods, min_std=min_std)
    loss_tbl = loss_table(pred_tbl, y_test, loss_dict)
    loss_summary = loss_summary_table(loss_tbl, ref_method, pairwise_CI=pairwise_CI, method_EB=method_EB, limits=limits)
    return loss_summary